From 7269512a28eb42029447d5380c764353278a74e1 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 2 Jun 2025 13:16:04 -0700 Subject: [PATCH 01/18] fix: replace function now can handle bpd.NA value. (#1786) --- bigframes/dtypes.py | 2 +- tests/system/small/test_series.py | 12 ++++++++++++ .../bigframes_vendored/ibis/expr/types/core.py | 6 +++--- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 262fa9dde7..2c5df89665 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -754,7 +754,7 @@ def bf_type_from_type_kind( def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool: """Captures whether a scalar can be losslessly represented by a dtype.""" - if scalar is None: + if pd.isna(scalar): return True if pd.api.types.is_bool_dtype(dtype): return pd.api.types.is_bool(scalar) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 710e1481be..c391370805 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -629,6 +629,18 @@ def test_series_replace_list_scalar(scalars_dfs): ) +def test_series_replace_nans_with_pd_na(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].replace({pd.NA: "UNKNOWN"}).to_pandas() + pd_result = scalars_pandas_df[col_name].replace({pd.NA: "UNKNOWN"}) + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + @pytest.mark.parametrize( ("replacement_dict",), ( diff --git a/third_party/bigframes_vendored/ibis/expr/types/core.py b/third_party/bigframes_vendored/ibis/expr/types/core.py index 9685e4ddca..5704dc993a 100644 --- a/third_party/bigframes_vendored/ibis/expr/types/core.py +++ b/third_party/bigframes_vendored/ibis/expr/types/core.py @@ -19,6 +19,7 @@ import bigframes_vendored.ibis.expr.operations as ops from bigframes_vendored.ibis.expr.types.pretty import to_rich from bigframes_vendored.ibis.util import experimental +import pandas as pd from public import public from rich.console import Console from rich.jupyter import JupyterMixin @@ -34,7 +35,6 @@ EdgeAttributeGetter, NodeAttributeGetter, ) - import pandas as pd import polars as pl import pyarrow as pa import torch @@ -744,9 +744,9 @@ def _binop(op_class: type[ops.Binary], left: ir.Value, right: ir.Value) -> ir.Va def _is_null_literal(value: Any) -> bool: """Detect whether `value` will be treated by ibis as a null literal.""" - if value is None: - return True if isinstance(value, Expr): op = value.op() return isinstance(op, ops.Literal) and op.value is None + if pd.isna(value): + return True return False From 8af26d07cf3e8b22e0c69dd0172352fadc1857d8 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 2 Jun 2025 15:51:31 -0700 Subject: [PATCH 02/18] feat: implement ai.classify() (#1781) * feat: implement ai.classify() * check label duplicity --- bigframes/operations/ai.py | 97 +++++++++++++++++++++++- tests/system/large/operations/test_ai.py | 27 +++++++ tests/system/small/operations/test_ai.py | 59 ++++++++++++++ 3 files changed, 182 insertions(+), 1 deletion(-) diff --git a/bigframes/operations/ai.py b/bigframes/operations/ai.py index c65947f53f..87245d104e 100644 --- a/bigframes/operations/ai.py +++ b/bigframes/operations/ai.py @@ -16,7 +16,7 @@ import re import typing -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Sequence import warnings import numpy as np @@ -258,6 +258,101 @@ def extract_logprob(s: 
bigframes.series.Series) -> bigframes.series.Series:
 
         return concat([self._df, *attach_columns], axis=1)
 
+    def classify(
+        self,
+        instruction: str,
+        model,
+        labels: Sequence[str],
+        output_column: str = "result",
+        ground_with_google_search: bool = False,
+        attach_logprobs=False,
+    ):
+        """
+        Classifies the rows of the dataframe into the provided labels based on the user instruction.
+
+        **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> bpd.options.display.progress_bar = None
+        >>> bpd.options.experiments.ai_operators = True
+        >>> bpd.options.compute.ai_ops_confirmation_threshold = 25
+
+        >>> import bigframes.ml.llm as llm
+        >>> model = llm.GeminiTextGenerator(model_name="gemini-2.0-flash-001")
+
+        >>> df = bpd.DataFrame({
+        ...     "feedback_text": [
+        ...         "The product is amazing, but the shipping was slow.",
+        ...         "I had an issue with my recent bill.",
+        ...         "The user interface is very intuitive."
+        ...     ],
+        ... })
+        >>> df.ai.classify("{feedback_text}", model=model, labels=["Shipping", "Billing", "UI"])
+                                               feedback_text    result
+        0  The product is amazing, but the shipping was s...  Shipping
+        1                I had an issue with my recent bill.   Billing
+        2              The user interface is very intuitive.        UI
+
+        [3 rows x 2 columns]
+
+        Args:
+            instruction (str):
+                An instruction on how to classify the data. This value must contain
+                column references by name, which should be wrapped in a pair of braces.
+                For example, if you have a column "feedback", you can refer to this column
+                with "{feedback}".
+
+            model (bigframes.ml.llm.GeminiTextGenerator):
+                A GeminiTextGenerator provided by the Bigframes ML package.
+
+            labels (Sequence[str]):
+                A collection of labels (categories). It must contain at least two and at most 20 elements.
+                Labels are case sensitive. Duplicate labels are not allowed.
+
+            output_column (str, default "result"):
+                The name of the output column.
+
+            ground_with_google_search (bool, default False):
+                Enables Grounding with Google Search for the GeminiTextGenerator model.
+                When set to True, the model incorporates relevant information from Google
+                Search results into its responses, enhancing their accuracy and factualness.
+                Note: Using this feature may impact billing costs. Refer to the pricing
+                page for details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models
+                The default is `False`.
+
+            attach_logprobs (bool, default False):
+                Controls whether to attach an additional "logprob" column for each result. Logprobs are floating-point values reflecting the confidence level
+                of the LLM in its responses. Higher values indicate more confidence. The value is in the range between negative infinity and 0.
+
+
+        Returns:
+            bigframes.pandas.DataFrame: DataFrame with the classification result.
+
+        Raises:
+            NotImplementedError: when the AI operator experiment is off.
+            ValueError: when the instruction refers to a non-existing column, when no
+                columns are referred to, or when the count of labels does not meet the
+                requirement.
+        """
+
+        if len(labels) < 2 or len(labels) > 20:
+            raise ValueError(
+                f"The number of labels should be between 2 and 20 (inclusive), but {len(labels)} labels are provided."
+ ) + + if len(set(labels)) != len(labels): + raise ValueError("There are duplicate labels.") + + updated_instruction = f"Based on the user instruction {instruction}, you must provide an answer that must exist in the following list of labels: {labels}" + + return self.map( + updated_instruction, + model, + output_schema={output_column: "string"}, + ground_with_google_search=ground_with_google_search, + attach_logprobs=attach_logprobs, + ) + def join( self, other, diff --git a/tests/system/large/operations/test_ai.py b/tests/system/large/operations/test_ai.py index 1b1d3a3376..c0716220b1 100644 --- a/tests/system/large/operations/test_ai.py +++ b/tests/system/large/operations/test_ai.py @@ -398,6 +398,33 @@ def test_map_invalid_model_raise_error(): ) +def test_classify(gemini_flash_model, session): + df = dataframe.DataFrame(data={"creature": ["dog", "rose"]}, session=session) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_result = df.ai.classify( + "{creature}", + gemini_flash_model, + labels=["animal", "plant"], + output_column="result", + ).to_pandas() + + expected_result = pd.DataFrame( + { + "creature": ["dog", "rose"], + "result": ["animal", "plant"], + } + ) + pandas.testing.assert_frame_equal( + actual_result, expected_result, check_index_type=False, check_dtype=False + ) + + @pytest.mark.parametrize( "instruction", [ diff --git a/tests/system/small/operations/test_ai.py b/tests/system/small/operations/test_ai.py index 25d411bef8..83aca8b5b1 100644 --- a/tests/system/small/operations/test_ai.py +++ b/tests/system/small/operations/test_ai.py @@ -108,6 +108,65 @@ def test_map(session): ) +def test_classify(session): + df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + { + "result": ["A", "B"], + "full_response": _create_dummy_full_response(2), + }, + session=session, + ), + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = df.ai.classify( + "classify {col}", model=model, labels=["A", "B"] + ).to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame( + {"col": ["A", "B"], "result": ["A", "B"]}, dtype=dtypes.STRING_DTYPE + ), + check_index_type=False, + ) + + +@pytest.mark.parametrize( + "labels", + [ + pytest.param([], id="empty-label"), + pytest.param(["A", "A", "B"], id="duplicate-labels"), + ], +) +def test_classify_invalid_labels_raise_error(session, labels): + df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + { + "result": ["A", "B"], + "full_response": _create_dummy_full_response(2), + }, + session=session, + ), + ) + + with bigframes.option_context( + AI_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ), pytest.raises(ValueError): + df.ai.classify("classify {col}", model=model, labels=labels) + + def test_join(session): left_df = dataframe.DataFrame({"col_A": ["A"]}, session=session) right_df = dataframe.DataFrame({"col_B": ["B"]}, session=session) From 38d9b7376697f8e19124e5d1f5fccda82d920b92 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 2 Jun 2025 16:12:10 -0700 Subject: [PATCH 03/18] docs: fix docstrings to improve html rendering of code examples (#1788) * docs: fix docstrings to improve html rendering of code examples * fix examples docstring in one more file --- third_party/bigframes_vendored/pandas/core/computation/eval.py | 1 + third_party/bigframes_vendored/pandas/core/frame.py | 1 + 
third_party/bigframes_vendored/pandas/core/indexes/accessor.py | 1 + third_party/bigframes_vendored/pandas/io/gbq.py | 1 + 4 files changed, 4 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index 56d60174a6..d3d11a9c2a 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -171,6 +171,7 @@ def eval( with plain ol' Python evaluation. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index c1b5b5a86b..6c927a5c26 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4253,6 +4253,7 @@ def corrwith( correlations. **Examples:** + >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index 469f35f181..dfb1cf9efc 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -204,6 +204,7 @@ def isocalendar(self): Calculate year, week, and day according to the ISO 8601 standard. **Examples:** + >>> import pandas as pd >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index aa4d862b65..a0d4092571 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -67,6 +67,7 @@ def read_gbq( >>> df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") Read table path with wildcard suffix and filters: + >>> df = bpd.read_gbq_table("bigquery-public-data.noaa_gsod.gsod19*", filters=[("_table_suffix", ">=", "30"), ("_table_suffix", "<=", "39")]) Preserve ordering in a query input. 
From e480d29f03636fa9824404ef90c510701e510195 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 3 Jun 2025 13:19:10 -0700 Subject: [PATCH 04/18] feat: Support isin with bigframes.pandas.Index arg (#1779) --- bigframes/core/indexes/base.py | 4 ++++ bigframes/series.py | 4 +++- tests/system/small/test_index.py | 34 ++++++++++++++++++++++++++++++- tests/system/small/test_series.py | 18 ++++++++++++++++ 4 files changed, 58 insertions(+), 2 deletions(-) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index 44b1d9d4fa..bf5e4b53f2 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -504,6 +504,10 @@ def unique(self, level: Hashable | int | None = None) -> Index: return self.get_level_values(level).drop_duplicates() def isin(self, values) -> Index: + import bigframes.series as series + + if isinstance(values, (series.Series, Index)): + return Index(self.to_series().isin(values)) if not utils.is_list_like(values): raise TypeError( "only list-like objects are allowed to be passed to " diff --git a/bigframes/series.py b/bigframes/series.py index 74e8d03c8d..06b6615080 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -979,8 +979,10 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: ) def isin(self, values) -> "Series" | None: - if isinstance(values, (Series,)): + if isinstance(values, Series): return Series(self._block.isin(values._block)) + if isinstance(values, indexes.Index): + return Series(self._block.isin(values.to_series()._block)) if not _is_list_like(values): raise TypeError( "only list-like objects are allowed to be passed to " diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index 9f45c8465b..6e230974fe 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -375,7 +375,7 @@ def test_index_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep): ) -def test_index_isin(scalars_df_index, scalars_pandas_df_index): +def test_index_isin_list(scalars_df_index, scalars_pandas_df_index): col_name = "int64_col" bf_series = ( scalars_df_index.set_index(col_name).index.isin([2, 55555, 4]).to_pandas() @@ -389,6 +389,38 @@ def test_index_isin(scalars_df_index, scalars_pandas_df_index): ) +def test_index_isin_bf_series(scalars_df_index, scalars_pandas_df_index, session): + col_name = "int64_col" + bf_series = ( + scalars_df_index.set_index(col_name) + .index.isin(bpd.Series([2, 55555, 4], session=session)) + .to_pandas() + ) + pd_result_array = scalars_pandas_df_index.set_index(col_name).index.isin( + [2, 55555, 4] + ) + pd.testing.assert_index_equal( + pd.Index(pd_result_array).set_names(col_name), + bf_series, + ) + + +def test_index_isin_bf_index(scalars_df_index, scalars_pandas_df_index, session): + col_name = "int64_col" + bf_series = ( + scalars_df_index.set_index(col_name) + .index.isin(bpd.Index([2, 55555, 4], session=session)) + .to_pandas() + ) + pd_result_array = scalars_pandas_df_index.set_index(col_name).index.isin( + [2, 55555, 4] + ) + pd.testing.assert_index_equal( + pd.Index(pd_result_array).set_names(col_name), + bf_series, + ) + + def test_multiindex_name_is_none(session): df = pd.DataFrame( { diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index c391370805..d0595afaa3 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1380,6 +1380,24 @@ def test_isin_bigframes_values(scalars_dfs, col_name, test_set, session): ) +def 
test_isin_bigframes_index(scalars_dfs, session): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = ( + scalars_df["string_col"] + .isin(bigframes.pandas.Index(["Hello, World!", "Hi", "こんにちは"], session=session)) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df["string_col"] + .isin(pd.Index(["Hello, World!", "Hi", "こんにちは"])) + .astype("boolean") + ) + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + @pytest.mark.parametrize( ( "col_name", From f495c84e231bafb065857fd19a2a7997a0e92f11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 3 Jun 2025 16:43:10 -0500 Subject: [PATCH 05/18] chore: use faster query_and_wait API in _read_gbq_colab (#1777) * chore: use faster query_and_wait API in _read_gbq_colab * try to fix unit tests * more unit test fixes * more test fixes * fix mypy * fix metrics counter in read_gbq with allow_large_results=False * use managedarrowtable * Update bigframes/session/loader.py * split out a few special case return values for read_gbq_query * support slice node for repr * fix failing system test * move slice into semiexecutor and out of readlocalnode * unit test for local executor * split method instead of using reloads * fix reference to _start_query * use limit rewrite for slice support * do not use numpy for offsets --- bigframes/blob/_functions.py | 4 + bigframes/core/array_value.py | 1 - bigframes/core/compile/compiler.py | 7 +- bigframes/core/compile/sqlglot/compiler.py | 7 +- bigframes/core/local_data.py | 2 +- bigframes/core/nodes.py | 10 ++ bigframes/core/pyarrow_utils.py | 9 + bigframes/core/rewrite/scan_reduction.py | 13 +- bigframes/core/rewrite/slices.py | 3 + bigframes/core/schema.py | 18 +- bigframes/functions/_function_client.py | 8 +- bigframes/session/__init__.py | 19 ++- bigframes/session/_io/bigquery/__init__.py | 60 +++++-- .../session/_io/bigquery/read_gbq_query.py | 90 ++++++++++ bigframes/session/bq_caching_executor.py | 35 +++- bigframes/session/loader.py | 160 ++++++++++++++---- bigframes/session/local_scan_executor.py | 14 +- bigframes/session/metrics.py | 13 +- bigframes/testing/mocks.py | 62 +++++-- .../small/functions/test_remote_function.py | 15 ++ .../small/session/test_read_gbq_colab.py | 29 +++- tests/system/small/test_encryption.py | 2 +- tests/unit/session/test_io_bigquery.py | 10 +- .../unit/session/test_local_scan_executor.py | 105 ++++++++++++ tests/unit/session/test_read_gbq_colab.py | 16 ++ tests/unit/session/test_read_gbq_query.py | 37 ++++ tests/unit/session/test_read_gbq_table.py | 11 +- tests/unit/session/test_session.py | 22 ++- 28 files changed, 665 insertions(+), 117 deletions(-) create mode 100644 bigframes/session/_io/bigquery/read_gbq_query.py create mode 100644 tests/unit/session/test_local_scan_executor.py create mode 100644 tests/unit/session/test_read_gbq_query.py diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index f8fdb21946..51c030a23b 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -95,6 +95,10 @@ def _create_udf(self): sql, job_config=bigquery.QueryJobConfig(), metrics=self._session._metrics, + location=None, + project=None, + timeout=None, + query_with_job=True, ) return udf_name diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py index 20773fd1b4..a6c700a485 100644 --- a/bigframes/core/array_value.py +++ b/bigframes/core/array_value.py @@ -34,7 +34,6 @@ import bigframes.core.ordering as orderings import bigframes.core.schema as schemata import 
bigframes.core.tree_properties -import bigframes.core.utils from bigframes.core.window_spec import WindowSpec import bigframes.dtypes import bigframes.exceptions as bfe diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index fb5399b7cb..451783602d 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -22,10 +22,9 @@ import bigframes_vendored.ibis.expr.api as ibis_api import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.types as ibis_types -import pyarrow as pa from bigframes import dtypes, operations -from bigframes.core import expression +from bigframes.core import expression, pyarrow_utils import bigframes.core.compile.compiled as compiled import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.configs as configs @@ -172,9 +171,7 @@ def compile_readlocal(node: nodes.ReadLocalNode, *args): pa_table = pa_table.rename_columns([item.id.sql for item in node.scan_list.items]) if offsets: - pa_table = pa_table.append_column( - offsets, pa.array(range(pa_table.num_rows), type=pa.int64()) - ) + pa_table = pyarrow_utils.append_offsets(pa_table, offsets) return compiled.UnorderedIR.from_polars(pa_table, bq_schema) diff --git a/bigframes/core/compile/sqlglot/compiler.py b/bigframes/core/compile/sqlglot/compiler.py index 1cb270297c..50169d1a8b 100644 --- a/bigframes/core/compile/sqlglot/compiler.py +++ b/bigframes/core/compile/sqlglot/compiler.py @@ -18,10 +18,9 @@ import typing from google.cloud import bigquery -import pyarrow as pa import sqlglot.expressions as sge -from bigframes.core import expression, guid, identifiers, nodes, rewrite +from bigframes.core import expression, guid, identifiers, nodes, pyarrow_utils, rewrite from bigframes.core.compile import configs import bigframes.core.compile.sqlglot.scalar_compiler as scalar_compiler import bigframes.core.compile.sqlglot.sqlglot_ir as ir @@ -155,9 +154,7 @@ def compile_readlocal(self, node: nodes.ReadLocalNode, *args) -> ir.SQLGlotIR: offsets = node.offsets_col.sql if node.offsets_col else None if offsets: - pa_table = pa_table.append_column( - offsets, pa.array(range(pa_table.num_rows), type=pa.int64()) - ) + pa_table = pyarrow_utils.append_offsets(pa_table, offsets) return ir.SQLGlotIR.from_pyarrow(pa_table, node.schema, uid_gen=self.uid_gen) diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index 2e8c4aff44..cef426aea8 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -295,7 +295,7 @@ def _adapt_chunked_array( def _adapt_arrow_array(array: pa.Array) -> tuple[pa.Array, bigframes.dtypes.Dtype]: - """Normalize the array to managed storage types. Preverse shapes, only transforms values.""" + """Normalize the array to managed storage types. Preserve shapes, only transforms values.""" if array.offset != 0: # Offset arrays don't have all operations implemented return _adapt_arrow_array(pa.concat_arrays([array])) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index cc82c844f7..9e5ed12dfe 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -154,6 +154,16 @@ def is_limit(self) -> bool: and (self.stop > 0) ) + @property + def is_noop(self) -> bool: + """Returns whether this node doesn't actually change the results.""" + # TODO: Handle tail case. 
+ return ( + ((not self.start) or (self.start == 0)) + and (self.step == 1) + and ((self.stop is None) or (self.stop == self.row_count)) + ) + @property def row_count(self) -> typing.Optional[int]: child_length = self.child.row_count diff --git a/bigframes/core/pyarrow_utils.py b/bigframes/core/pyarrow_utils.py index eead30d908..bcbffdc78c 100644 --- a/bigframes/core/pyarrow_utils.py +++ b/bigframes/core/pyarrow_utils.py @@ -85,3 +85,12 @@ def truncate_pyarrow_iterable( else: yield batch total_yielded += batch.num_rows + + +def append_offsets( + pa_table: pa.Table, + offsets_col: str, +) -> pa.Table: + return pa_table.append_column( + offsets_col, pa.array(range(pa_table.num_rows), type=pa.int64()) + ) diff --git a/bigframes/core/rewrite/scan_reduction.py b/bigframes/core/rewrite/scan_reduction.py index b9050c0c34..b0729337e7 100644 --- a/bigframes/core/rewrite/scan_reduction.py +++ b/bigframes/core/rewrite/scan_reduction.py @@ -16,6 +16,7 @@ from typing import Optional from bigframes.core import nodes +import bigframes.core.rewrite.slices def try_reduce_to_table_scan(root: nodes.BigFrameNode) -> Optional[nodes.ReadTableNode]: @@ -28,7 +29,15 @@ def try_reduce_to_table_scan(root: nodes.BigFrameNode) -> Optional[nodes.ReadTab return None -def try_reduce_to_local_scan(node: nodes.BigFrameNode) -> Optional[nodes.ReadLocalNode]: +def try_reduce_to_local_scan( + node: nodes.BigFrameNode, +) -> Optional[tuple[nodes.ReadLocalNode, Optional[int]]]: + """Create a ReadLocalNode with optional limit, if possible. + + Similar to ReadApiSemiExecutor._try_adapt_plan. + """ + node, limit = bigframes.core.rewrite.slices.pull_out_limit(node) + if not all( map( lambda x: isinstance(x, (nodes.ReadLocalNode, nodes.SelectionNode)), @@ -38,7 +47,7 @@ def try_reduce_to_local_scan(node: nodes.BigFrameNode) -> Optional[nodes.ReadLoc return None result = node.bottom_up(merge_scan) if isinstance(result, nodes.ReadLocalNode): - return result + return result, limit return None diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index 92911310da..bed3a8a3f3 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -57,6 +57,9 @@ def pull_out_limit( if (prior_limit is not None) and (prior_limit < limit): limit = prior_limit return new_root, limit + if root.is_noop: + new_root, prior_limit = pull_out_limit(root.child) + return new_root, prior_limit elif ( isinstance(root, (nodes.SelectionNode, nodes.ProjectionNode)) and root.row_preserving diff --git a/bigframes/core/schema.py b/bigframes/core/schema.py index 4f636ab210..b1a77d1259 100644 --- a/bigframes/core/schema.py +++ b/bigframes/core/schema.py @@ -17,7 +17,7 @@ from dataclasses import dataclass import functools import typing -from typing import Sequence +from typing import Dict, List, Sequence import google.cloud.bigquery import pyarrow @@ -47,14 +47,24 @@ def from_bq_table( column_type_overrides: typing.Optional[ typing.Dict[str, bigframes.dtypes.Dtype] ] = None, + ): + return ArraySchema.from_bq_schema( + table.schema, column_type_overrides=column_type_overrides + ) + + @classmethod + def from_bq_schema( + cls, + schema: List[google.cloud.bigquery.SchemaField], + column_type_overrides: typing.Optional[ + Dict[str, bigframes.dtypes.Dtype] + ] = None, ): if column_type_overrides is None: column_type_overrides = {} items = tuple( SchemaItem(name, column_type_overrides.get(name, dtype)) - for name, dtype in bigframes.dtypes.bf_type_from_type_kind( - table.schema - ).items() + for name, dtype in 
bigframes.dtypes.bf_type_from_type_kind(schema).items() ) return ArraySchema(items) diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index 0cc3d52c38..d03021dd23 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -125,11 +125,15 @@ def _ensure_dataset_exists(self) -> None: def _create_bq_function(self, create_function_ddl: str) -> None: # TODO(swast): plumb through the original, user-facing api_name. _, query_job = bigframes.session._io.bigquery.start_query_with_client( - self._session.bqclient, + cast(bigquery.Client, self._session.bqclient), create_function_ddl, job_config=bigquery.QueryJobConfig(), + location=None, + project=None, + timeout=None, + metrics=None, + query_with_job=True, ) - assert query_job is not None logger.info(f"Created bigframes function {query_job.ddl_target_routine}") def _format_function_options(self, function_options: dict) -> str: diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index c24dca554a..92708a7f93 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -537,6 +537,10 @@ def _read_gbq_colab( index_col=bigframes.enums.DefaultIndexKind.NULL, force_total_order=False, dry_run=typing.cast(Union[Literal[False], Literal[True]], dry_run), + # TODO(tswast): we may need to allow allow_large_results to be overwritten + # or possibly a general configuration object for an explicit + # destination table and write disposition. + allow_large_results=False, ) @overload @@ -1917,10 +1921,15 @@ def _start_query_ml_ddl( # https://cloud.google.com/bigquery/docs/customer-managed-encryption#encrypt-model job_config.destination_encryption_configuration = None iterator, query_job = bf_io_bigquery.start_query_with_client( - self.bqclient, sql, job_config=job_config, metrics=self._metrics + self.bqclient, + sql, + job_config=job_config, + metrics=self._metrics, + location=None, + project=None, + timeout=None, + query_with_job=True, ) - - assert query_job is not None return iterator, query_job def _create_object_table(self, path: str, connection: str) -> str: @@ -1943,6 +1952,10 @@ def _create_object_table(self, path: str, connection: str) -> str: sql, job_config=bigquery.QueryJobConfig(), metrics=self._metrics, + location=None, + project=None, + timeout=None, + query_with_job=True, ) return table diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 267111afe0..fdc240fa69 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -22,7 +22,7 @@ import textwrap import types import typing -from typing import Dict, Iterable, Mapping, Optional, Tuple, Union +from typing import Dict, Iterable, Literal, Mapping, Optional, overload, Tuple, Union import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions @@ -38,7 +38,6 @@ IO_ORDERING_ID = "bqdf_row_nums" -MAX_LABELS_COUNT = 64 - 8 _LIST_TABLES_LIMIT = 10000 # calls to bqclient.list_tables # will be limited to this many tables @@ -73,7 +72,12 @@ def create_job_configs_labels( ) ) values = list(itertools.chain(job_configs_labels.values(), api_methods)) - return dict(zip(labels[:MAX_LABELS_COUNT], values[:MAX_LABELS_COUNT])) + return dict( + zip( + labels[: log_adapter.MAX_LABELS_COUNT], + values[: log_adapter.MAX_LABELS_COUNT], + ) + ) def create_export_data_statement( @@ -223,8 +227,7 @@ def format_option(key: str, value: Union[bool, str]) -> str: def 
add_and_trim_labels(job_config): """ Add additional labels to the job configuration and trim the total number of labels - to ensure they do not exceed the maximum limit allowed by BigQuery, which is 64 - labels per job. + to ensure they do not exceed MAX_LABELS_COUNT labels per job. """ api_methods = log_adapter.get_and_reset_api_methods(dry_run=job_config.dry_run) job_config.labels = create_job_configs_labels( @@ -233,23 +236,54 @@ def add_and_trim_labels(job_config): ) +@overload def start_query_with_client( bq_client: bigquery.Client, sql: str, - job_config: bigquery.job.QueryJobConfig, + *, + job_config: bigquery.QueryJobConfig, + location: Optional[str], + project: Optional[str], + timeout: Optional[float], + metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, + query_with_job: Literal[True], +) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + ... + + +@overload +def start_query_with_client( + bq_client: bigquery.Client, + sql: str, + *, + job_config: bigquery.QueryJobConfig, + location: Optional[str], + project: Optional[str], + timeout: Optional[float], + metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, + query_with_job: Literal[False], +) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: + ... + + +def start_query_with_client( + bq_client: bigquery.Client, + sql: str, + *, + job_config: bigquery.QueryJobConfig, location: Optional[str] = None, project: Optional[str] = None, timeout: Optional[float] = None, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, - *, query_with_job: bool = True, ) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: """ Starts query job and waits for results. """ try: - # Note: Ensure no additional labels are added to job_config after this point, - # as `add_and_trim_labels` ensures the label count does not exceed 64. + # Note: Ensure no additional labels are added to job_config after this + # point, as `add_and_trim_labels` ensures the label count does not + # exceed MAX_LABELS_COUNT. add_and_trim_labels(job_config) if not query_with_job: results_iterator = bq_client.query_and_wait( @@ -322,8 +356,8 @@ def delete_tables_matching_session_id( def create_bq_dataset_reference( bq_client: bigquery.Client, - location=None, - project=None, + location: Optional[str] = None, + project: Optional[str] = None, ) -> bigquery.DatasetReference: """Create and identify dataset(s) for temporary BQ resources. @@ -352,6 +386,9 @@ def create_bq_dataset_reference( location=location, job_config=job_config, project=project, + timeout=None, + metrics=None, + query_with_job=True, ) # The anonymous dataset is used by BigQuery to write query results and @@ -359,7 +396,6 @@ def create_bq_dataset_reference( # to the dataset, no BigQuery Session required. Note: there is a # different anonymous dataset per location. See: # https://cloud.google.com/bigquery/docs/cached-results#how_cached_results_are_stored - assert query_job is not None query_destination = query_job.destination return bigquery.DatasetReference( query_destination.project, diff --git a/bigframes/session/_io/bigquery/read_gbq_query.py b/bigframes/session/_io/bigquery/read_gbq_query.py new file mode 100644 index 0000000000..70c83d7875 --- /dev/null +++ b/bigframes/session/_io/bigquery/read_gbq_query.py @@ -0,0 +1,90 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Private helpers for implementing read_gbq_query.""" + +from __future__ import annotations + +from typing import Optional + +from google.cloud import bigquery +import google.cloud.bigquery.table +import pandas + +from bigframes import dataframe +from bigframes.core import local_data, pyarrow_utils +import bigframes.core as core +import bigframes.core.blocks as blocks +import bigframes.core.guid +import bigframes.core.schema as schemata +import bigframes.session + + +def create_dataframe_from_query_job_stats( + query_job: Optional[bigquery.QueryJob], *, session: bigframes.session.Session +) -> dataframe.DataFrame: + """Convert a QueryJob into a DataFrame with key statistics about the query. + + Any changes you make here, please try to keep in sync with pandas-gbq. + """ + return dataframe.DataFrame( + data=pandas.DataFrame( + { + "statement_type": [ + query_job.statement_type if query_job else "unknown" + ], + "job_id": [query_job.job_id if query_job else "unknown"], + "location": [query_job.location if query_job else "unknown"], + } + ), + session=session, + ) + + +def create_dataframe_from_row_iterator( + rows: google.cloud.bigquery.table.RowIterator, *, session: bigframes.session.Session +) -> dataframe.DataFrame: + """Convert a RowIterator into a DataFrame wrapping a LocalNode. + + This allows us to create a DataFrame from query results, even in the + 'jobless' case where there's no destination table. + """ + pa_table = rows.to_arrow() + + # TODO(tswast): Use array_value.promote_offsets() instead once that node is + # supported by the local engine. + offsets_col = bigframes.core.guid.generate_guid() + pa_table = pyarrow_utils.append_offsets(pa_table, offsets_col=offsets_col) + + # We use the ManagedArrowTable constructor directly, because the + # results of to_arrow() should be the source of truth with regards + # to canonical formats since it comes from either the BQ Storage + # Read API or has been transformed by google-cloud-bigquery to look + # like the output of the BQ Storage Read API. 
+ mat = local_data.ManagedArrowTable( + pa_table, + schemata.ArraySchema.from_bq_schema( + list(rows.schema) + [bigquery.SchemaField(offsets_col, "INTEGER")] + ), + ) + mat.validate() + + array_value = core.ArrayValue.from_managed(mat, session) + block = blocks.Block( + array_value, + (offsets_col,), + [field.name for field in rows.schema], + (None,), + ) + return dataframe.DataFrame(block) diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 33d3314a1e..47be6fa768 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -320,6 +320,10 @@ def export_gcs( export_data_statement, job_config=bigquery.QueryJobConfig(), metrics=self.metrics, + project=None, + location=None, + timeout=None, + query_with_job=True, ) return query_job @@ -383,14 +387,29 @@ def _run_execute_query( job_config.labels["bigframes-mode"] = "unordered" try: - iterator, query_job = bq_io.start_query_with_client( - self.bqclient, - sql, - job_config=job_config, - metrics=self.metrics, - query_with_job=query_with_job, - ) - return iterator, query_job + # Trick the type checker into thinking we got a literal. + if query_with_job: + return bq_io.start_query_with_client( + self.bqclient, + sql, + job_config=job_config, + metrics=self.metrics, + project=None, + location=None, + timeout=None, + query_with_job=True, + ) + else: + return bq_io.start_query_with_client( + self.bqclient, + sql, + job_config=job_config, + metrics=self.metrics, + project=None, + location=None, + timeout=None, + query_with_job=False, + ) except google.api_core.exceptions.BadRequest as e: # Unfortunately, this error type does not have a separate error code or exception type diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index ba669a62bb..cf02393fd8 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -22,6 +22,7 @@ import os import typing from typing import ( + cast, Dict, Generator, Hashable, @@ -39,6 +40,7 @@ import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import google.api_core.exceptions from google.cloud import bigquery_storage_v1 +import google.cloud.bigquery import google.cloud.bigquery as bigquery from google.cloud.bigquery_storage_v1 import types as bq_storage_types import pandas @@ -52,6 +54,7 @@ import bigframes.formatting_helpers as formatting_helpers from bigframes.session import dry_runs import bigframes.session._io.bigquery as bf_io_bigquery +import bigframes.session._io.bigquery.read_gbq_query as bf_read_gbq_query import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table import bigframes.session.metrics import bigframes.session.temporary_storage @@ -736,6 +739,7 @@ def read_gbq_query( # type: ignore[overload-overlap] filters: third_party_pandas_gbq.FiltersType = ..., dry_run: Literal[False] = ..., force_total_order: Optional[bool] = ..., + allow_large_results: bool = ..., ) -> dataframe.DataFrame: ... @@ -752,6 +756,7 @@ def read_gbq_query( filters: third_party_pandas_gbq.FiltersType = ..., dry_run: Literal[True] = ..., force_total_order: Optional[bool] = ..., + allow_large_results: bool = ..., ) -> pandas.Series: ... 
@@ -767,9 +772,8 @@ def read_gbq_query( filters: third_party_pandas_gbq.FiltersType = (), dry_run: bool = False, force_total_order: Optional[bool] = None, + allow_large_results: bool = True, ) -> dataframe.DataFrame | pandas.Series: - import bigframes.dataframe as dataframe - configuration = _transform_read_gbq_configuration(configuration) if "query" not in configuration: @@ -824,29 +828,72 @@ def read_gbq_query( query_job, list(columns), index_cols ) - # No cluster candidates as user query might not be clusterable (eg because of ORDER BY clause) - destination, query_job = self._query_to_destination( - query, - cluster_candidates=[], - configuration=configuration, - ) + query_job_for_metrics: Optional[bigquery.QueryJob] = None + destination: Optional[bigquery.TableReference] = None + # TODO(b/421161077): If an explicit destination table is set in + # configuration, should we respect that setting? + if allow_large_results: + destination, query_job = self._query_to_destination( + query, + # No cluster candidates as user query might not be clusterable + # (eg because of ORDER BY clause) + cluster_candidates=[], + configuration=configuration, + ) + query_job_for_metrics = query_job + rows = None + else: + job_config = typing.cast( + bigquery.QueryJobConfig, + bigquery.QueryJobConfig.from_api_repr(configuration), + ) + + # TODO(b/420984164): We may want to set a page_size here to limit + # the number of results in the first jobs.query response. + rows = self._start_query_with_job_optional( + query, + job_config=job_config, + ) + + # If there is a query job, fetch it so that we can get the + # statistics and destination table, if needed. + if rows.job_id and rows.location and rows.project: + query_job = cast( + bigquery.QueryJob, + self._bqclient.get_job( + rows.job_id, project=rows.project, location=rows.location + ), + ) + destination = query_job.destination + query_job_for_metrics = query_job + + # We split query execution from results fetching so that we can log + # metrics from either the query job, row iterator, or both. if self._metrics is not None: - self._metrics.count_job_stats(query_job) + self._metrics.count_job_stats( + query_job=query_job_for_metrics, row_iterator=rows + ) + + # It's possible that there's no job and corresponding destination table. + # In this case, we must create a local node. + # + # TODO(b/420984164): Tune the threshold for which we download to + # local node. Likely there are a wide range of sizes in which it + # makes sense to download the results beyond the first page, even if + # there is a job and destination table available. + if rows is not None and destination is None: + return bf_read_gbq_query.create_dataframe_from_row_iterator( + rows, + session=self._session, + ) - # If there was no destination table, that means the query must have - # been DDL or DML. Return some job metadata, instead. + # If there was no destination table and we've made it this far, that + # means the query must have been DDL or DML. Return some job metadata, + # instead. if not destination: - return dataframe.DataFrame( - data=pandas.DataFrame( - { - "statement_type": [ - query_job.statement_type if query_job else "unknown" - ], - "job_id": [query_job.job_id if query_job else "unknown"], - "location": [query_job.location if query_job else "unknown"], - } - ), + return bf_read_gbq_query.create_dataframe_from_query_job_stats( + query_job_for_metrics, session=self._session, ) @@ -872,9 +919,12 @@ def _query_to_destination( # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... 
statement. dry_run_config = bigquery.QueryJobConfig() dry_run_config.dry_run = True - _, dry_run_job = self._start_query(query, job_config=dry_run_config) + dry_run_job = self._start_query_with_job( + query, + job_config=dry_run_config, + ) if dry_run_job.statement_type != "SELECT": - _, query_job = self._start_query(query) + query_job = self._start_query_with_job(query) return query_job.destination, query_job # Create a table to workaround BigQuery 10 GB query results limit. See: @@ -908,7 +958,7 @@ def _query_to_destination( # Write to temp table to workaround BigQuery 10 GB query results # limit. See: internal issue 303057336. job_config.labels["error_caught"] = "true" - _, query_job = self._start_query( + query_job = self._start_query_with_job( query, job_config=job_config, timeout=timeout, @@ -919,34 +969,72 @@ def _query_to_destination( # tables as the destination. For example, if the query has a # top-level ORDER BY, this conflicts with our ability to cluster # the table by the index column(s). - _, query_job = self._start_query(query, timeout=timeout) + query_job = self._start_query_with_job(query, timeout=timeout) return query_job.destination, query_job - def _start_query( + def _prepare_job_config( + self, + job_config: Optional[google.cloud.bigquery.QueryJobConfig] = None, + ) -> google.cloud.bigquery.QueryJobConfig: + job_config = bigquery.QueryJobConfig() if job_config is None else job_config + + if bigframes.options.compute.maximum_bytes_billed is not None: + # Maybe this should be pushed down into start_query_with_client + job_config.maximum_bytes_billed = ( + bigframes.options.compute.maximum_bytes_billed + ) + + return job_config + + def _start_query_with_job_optional( self, sql: str, + *, job_config: Optional[google.cloud.bigquery.QueryJobConfig] = None, timeout: Optional[float] = None, - ) -> Tuple[google.cloud.bigquery.table.RowIterator, bigquery.QueryJob]: + ) -> google.cloud.bigquery.table.RowIterator: + """ + Starts BigQuery query with job optional and waits for results. + + Do not execute dataframe through this API, instead use the executor. + """ + job_config = self._prepare_job_config(job_config) + rows, _ = bf_io_bigquery.start_query_with_client( + self._bqclient, + sql, + job_config=job_config, + timeout=timeout, + location=None, + project=None, + metrics=None, + query_with_job=False, + ) + return rows + + def _start_query_with_job( + self, + sql: str, + *, + job_config: Optional[google.cloud.bigquery.QueryJobConfig] = None, + timeout: Optional[float] = None, + ) -> bigquery.QueryJob: """ Starts BigQuery query job and waits for results. Do not execute dataframe through this API, instead use the executor. 
""" - job_config = bigquery.QueryJobConfig() if job_config is None else job_config - if bigframes.options.compute.maximum_bytes_billed is not None: - # Maybe this should be pushed down into start_query_with_client - job_config.maximum_bytes_billed = ( - bigframes.options.compute.maximum_bytes_billed - ) - iterator, query_job = bf_io_bigquery.start_query_with_client( + job_config = self._prepare_job_config(job_config) + _, query_job = bf_io_bigquery.start_query_with_client( self._bqclient, sql, job_config=job_config, timeout=timeout, + location=None, + project=None, + metrics=None, + query_with_job=True, ) - assert query_job is not None - return iterator, query_job + return query_job def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict: diff --git a/bigframes/session/local_scan_executor.py b/bigframes/session/local_scan_executor.py index 88304fa181..b4d7b226e2 100644 --- a/bigframes/session/local_scan_executor.py +++ b/bigframes/session/local_scan_executor.py @@ -30,11 +30,17 @@ def execute( ordered: bool, peek: Optional[int] = None, ) -> Optional[executor.ExecuteResult]: - node = rewrite.try_reduce_to_local_scan(plan) - if not node: + reduced_result = rewrite.try_reduce_to_local_scan(plan) + if not reduced_result: return None - # TODO: Can support some slicing, sorting + node, limit = reduced_result + + if limit is not None: + if peek is None or limit < peek: + peek = limit + + # TODO: Can support some sorting offsets_col = node.offsets_col.sql if (node.offsets_col is not None) else None arrow_table = node.local_data_source.to_pyarrow_table(offsets_col=offsets_col) if peek: @@ -46,8 +52,8 @@ def execute( arrow_table = arrow_table.select(needed_cols) arrow_table = arrow_table.rename_columns([id.sql for id in node.ids]) - total_rows = node.row_count + if (peek is not None) and (total_rows is not None): total_rows = min(peek, total_rows) diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py index 6a8038e189..48cb92a8b4 100644 --- a/bigframes/session/metrics.py +++ b/bigframes/session/metrics.py @@ -79,17 +79,24 @@ def get_performance_stats( return None bytes_processed = query_job.total_bytes_processed - if not isinstance(bytes_processed, int): + if bytes_processed and not isinstance(bytes_processed, int): return None # filter out mocks slot_millis = query_job.slot_millis - if not isinstance(slot_millis, int): + if slot_millis and not isinstance(slot_millis, int): return None # filter out mocks execution_secs = (query_job.ended - query_job.created).total_seconds() query_char_count = len(query_job.query) - return query_char_count, bytes_processed, slot_millis, execution_secs + return ( + query_char_count, + # Not every job populates these. For example, slot_millis is missing + # from queries that came from cached results. 
+ bytes_processed if bytes_processed else 0, + slot_millis if slot_millis else 0, + execution_secs, + ) def write_stats_to_disk( diff --git a/bigframes/testing/mocks.py b/bigframes/testing/mocks.py index ca6fa57d0b..7ddc2e2e6e 100644 --- a/bigframes/testing/mocks.py +++ b/bigframes/testing/mocks.py @@ -14,11 +14,14 @@ import copy import datetime -from typing import Any, Dict, Optional, Sequence +from typing import Any, Dict, Literal, Optional, Sequence import unittest.mock as mock +from bigframes_vendored.google_cloud_bigquery import _pandas_helpers import google.auth.credentials import google.cloud.bigquery +import google.cloud.bigquery.table +import pyarrow import pytest import bigframes @@ -40,6 +43,7 @@ def create_bigquery_session( table_schema: Sequence[google.cloud.bigquery.SchemaField] = TEST_SCHEMA, anonymous_dataset: Optional[google.cloud.bigquery.DatasetReference] = None, location: str = "test-region", + ordering_mode: Literal["strict", "partial"] = "partial", ) -> bigframes.Session: """[Experimental] Create a mock BigQuery DataFrames session that avoids making Google Cloud API calls. @@ -79,43 +83,75 @@ def create_bigquery_session( queries = [] job_configs = [] - def query_mock(query, *args, job_config=None, **kwargs): + def query_mock( + query, + *args, + job_config: Optional[google.cloud.bigquery.QueryJobConfig] = None, + **kwargs, + ): queries.append(query) job_configs.append(copy.deepcopy(job_config)) - query_job = mock.create_autospec(google.cloud.bigquery.QueryJob) + query_job = mock.create_autospec(google.cloud.bigquery.QueryJob, instance=True) query_job._properties = {} type(query_job).destination = mock.PropertyMock( return_value=anonymous_dataset.table("test_table"), ) - type(query_job).session_info = google.cloud.bigquery.SessionInfo( - {"sessionInfo": {"sessionId": session_id}}, - ) + type(query_job).statement_type = mock.PropertyMock(return_value="SELECT") + + if job_config is not None and job_config.create_session: + type(query_job).session_info = google.cloud.bigquery.SessionInfo( + {"sessionId": session_id}, + ) if query.startswith("SELECT CURRENT_TIMESTAMP()"): query_job.result = mock.MagicMock(return_value=[[bq_time]]) + elif "CREATE TEMP TABLE".casefold() in query.casefold(): + type(query_job).destination = mock.PropertyMock( + return_value=anonymous_dataset.table("temp_table_from_session"), + ) else: type(query_job).schema = mock.PropertyMock(return_value=table_schema) return query_job - existing_query_and_wait = bqclient.query_and_wait - def query_and_wait_mock(query, *args, job_config=None, **kwargs): queries.append(query) job_configs.append(copy.deepcopy(job_config)) + if query.startswith("SELECT CURRENT_TIMESTAMP()"): return iter([[datetime.datetime.now()]]) - else: - return existing_query_and_wait(query, *args, **kwargs) - bqclient.query = query_mock - bqclient.query_and_wait = query_and_wait_mock + rows = mock.create_autospec( + google.cloud.bigquery.table.RowIterator, instance=True + ) + row = mock.create_autospec(google.cloud.bigquery.table.Row, instance=True) + rows.__iter__.return_value = [row] + type(rows).schema = mock.PropertyMock(return_value=table_schema) + rows.to_arrow.return_value = pyarrow.Table.from_pydict( + {field.name: [None] for field in table_schema}, + schema=pyarrow.schema( + _pandas_helpers.bq_to_arrow_field(field) for field in table_schema + ), + ) + + if job_config is not None and job_config.destination is None: + # Assume that the query finishes fast enough for jobless mode. 
+ type(rows).job_id = mock.PropertyMock(return_value=None) + + return rows + + bqclient.query.side_effect = query_mock + bqclient.query_and_wait.side_effect = query_and_wait_mock clients_provider = mock.create_autospec(bigframes.session.clients.ClientsProvider) type(clients_provider).bqclient = mock.PropertyMock(return_value=bqclient) clients_provider._credentials = credentials - bqoptions = bigframes.BigQueryOptions(credentials=credentials, location=location) + bqoptions = bigframes.BigQueryOptions( + credentials=credentials, + location=location, + ordering_mode=ordering_mode, + ) session = bigframes.Session(context=bqoptions, clients_provider=clients_provider) session._bq_connection_manager = mock.create_autospec( bigframes.clients.BqConnectionManager, instance=True diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 51e0459014..7fc7caf2fc 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -764,6 +764,11 @@ def test_read_gbq_function_runs_existing_udf_array_output(session, routine_id_un """ ), job_config=bigquery.QueryJobConfig(), + location=None, + project=None, + timeout=None, + metrics=None, + query_with_job=True, ) func = session.read_gbq_function(routine_id_unique) @@ -797,6 +802,11 @@ def test_read_gbq_function_runs_existing_udf_2_params_array_output( """ ), job_config=bigquery.QueryJobConfig(), + location=None, + project=None, + timeout=None, + metrics=None, + query_with_job=True, ) func = session.read_gbq_function(routine_id_unique) @@ -832,6 +842,11 @@ def test_read_gbq_function_runs_existing_udf_4_params_array_output( """ ), job_config=bigquery.QueryJobConfig(), + location=None, + project=None, + timeout=None, + metrics=None, + query_with_job=True, ) func = session.read_gbq_function(routine_id_unique) diff --git a/tests/system/small/session/test_read_gbq_colab.py b/tests/system/small/session/test_read_gbq_colab.py index a821901e4c..0992a10055 100644 --- a/tests/system/small/session/test_read_gbq_colab.py +++ b/tests/system/small/session/test_read_gbq_colab.py @@ -19,18 +19,22 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_session): + # This query should return enough results to be too big to fit in a single + # page from jobs.query. executions_before_sql = maybe_ordered_session._metrics.execution_count df = maybe_ordered_session._read_gbq_colab( """ SELECT name, + state, + gender, + year, SUM(number) AS total FROM `bigquery-public-data.usa_names.usa_1910_2013` WHERE state LIKE 'W%' - GROUP BY name + GROUP BY name, state, gender, year ORDER BY total DESC - LIMIT 300 """ ) executions_before_python = maybe_ordered_session._metrics.execution_count @@ -39,12 +43,17 @@ def test_read_gbq_colab_to_pandas_batches_preserves_order_by(maybe_ordered_sessi ) executions_after = maybe_ordered_session._metrics.execution_count - total_rows = 0 + num_batches = 0 for batch in batches: assert batch["total"].is_monotonic_decreasing - total_rows += len(batch.index) + assert len(batch.index) == 100 + num_batches += 1 + + # Only test the first few pages to avoid downloading unnecessary data + # and so we can confirm we have full pages in each batch. 
+ if num_batches >= 3: + break - assert total_rows > 0 assert executions_after == executions_before_python == executions_before_sql + 1 @@ -103,6 +112,9 @@ def test_read_gbq_colab_includes_formatted_scalars(session): # This is not a supported type, but ignored if not referenced. "some_object": object(), } + + # This query should return few enough results to be small enough to fit in a + # single page from jobs.query. df = session._read_gbq_colab( """ SELECT {some_integer} as some_integer, @@ -124,6 +136,7 @@ def test_read_gbq_colab_includes_formatted_scalars(session): "escaped": pandas.Series(["{escaped}"], dtype="string[pyarrow]"), } ), + check_index_type=False, # int64 vs Int64 ) @@ -152,4 +165,8 @@ def test_read_gbq_colab_includes_formatted_bigframes_dataframe( .assign(int64_col=scalars_pandas_df_index["int64_too"]) .reset_index(drop=False)[["int64_col", "rowindex"]] ) - pandas.testing.assert_frame_equal(result, expected) + pandas.testing.assert_frame_equal( + result, + expected, + check_index_type=False, # int64 vs Int64 + ) diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 97f44694b0..1ba8ed7e09 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -70,7 +70,7 @@ def test_session_query_job(bq_cmek, session_with_bq_cmek): if not bq_cmek: # pragma: NO COVER pytest.skip("no cmek set for testing") # pragma: NO COVER - _, query_job = session_with_bq_cmek._loader._start_query( + query_job = session_with_bq_cmek._loader._start_query_with_job( "SELECT 123", job_config=bigquery.QueryJobConfig(use_query_cache=False) ) query_job.result() diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index e5e2c58d59..cfee5ea98d 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -14,7 +14,7 @@ import datetime import re -from typing import Iterable +from typing import Iterable, Optional from unittest import mock import google.cloud.bigquery as bigquery @@ -203,7 +203,7 @@ def test_add_and_trim_labels_length_limit_met(): [(None, None), (30.0, "test_api")], ) def test_start_query_with_client_labels_length_limit_met( - mock_bq_client, timeout, api_name + mock_bq_client: bigquery.Client, timeout: Optional[float], api_name ): sql = "select * from abc" cur_labels = { @@ -229,8 +229,12 @@ def test_start_query_with_client_labels_length_limit_met( io_bq.start_query_with_client( mock_bq_client, sql, - job_config, + job_config=job_config, + location=None, + project=None, timeout=timeout, + metrics=None, + query_with_job=True, ) assert job_config.labels is not None diff --git a/tests/unit/session/test_local_scan_executor.py b/tests/unit/session/test_local_scan_executor.py new file mode 100644 index 0000000000..30b1b5f78d --- /dev/null +++ b/tests/unit/session/test_local_scan_executor.py @@ -0,0 +1,105 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import pyarrow +import pytest + +from bigframes import dtypes +from bigframes.core import identifiers, local_data, nodes +from bigframes.session import local_scan_executor +from bigframes.testing import mocks + + +@pytest.fixture +def object_under_test(): + return local_scan_executor.LocalScanExecutor() + + +def create_read_local_node(arrow_table: pyarrow.Table): + session = mocks.create_bigquery_session() + local_data_source = local_data.ManagedArrowTable.from_pyarrow(arrow_table) + return nodes.ReadLocalNode( + local_data_source=local_data_source, + session=session, + scan_list=nodes.ScanList( + items=tuple( + nodes.ScanItem( + id=identifiers.ColumnId(column_name), + dtype=dtypes.arrow_dtype_to_bigframes_dtype( + arrow_table.field(column_name).type + ), + source_id=column_name, + ) + for column_name in arrow_table.column_names + ), + ), + ) + + +@pytest.mark.parametrize( + ("start", "stop", "expected_rows"), + ( + # No-op slices. + (None, None, 10), + (0, None, 10), + (None, 10, 10), + # Slices equivalent to limits. + (None, 7, 7), + (0, 3, 3), + ), +) +def test_local_scan_executor_with_slice(start, stop, expected_rows, object_under_test): + pyarrow_table = pyarrow.Table.from_pydict( + { + "rowindex": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + "letters": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], + } + ) + assert pyarrow_table.num_rows == 10 + + local_node = create_read_local_node(pyarrow_table) + plan = nodes.SliceNode( + child=local_node, + start=start, + stop=stop, + ) + + result = object_under_test.execute(plan, ordered=True) + result_table = pyarrow.Table.from_batches(result.arrow_batches) + assert result_table.num_rows == expected_rows + + +@pytest.mark.parametrize( + ("start", "stop", "step"), + ( + (-1, None, 1), + (None, -1, 1), + (None, None, 2), + (None, None, -1), + (4, None, 6), + (1, 9, 8), + ), +) +def test_local_scan_executor_with_slice_unsupported_inputs( + start, stop, step, object_under_test +): + local_node = create_read_local_node(pyarrow.Table.from_pydict({"col": [1, 2, 3]})) + plan = nodes.SliceNode( + child=local_node, + start=start, + stop=stop, + step=step, + ) + assert object_under_test.execute(plan, ordered=True) is None diff --git a/tests/unit/session/test_read_gbq_colab.py b/tests/unit/session/test_read_gbq_colab.py index cffc6b3af7..c4635f85a9 100644 --- a/tests/unit/session/test_read_gbq_colab.py +++ b/tests/unit/session/test_read_gbq_colab.py @@ -80,3 +80,19 @@ def test_read_gbq_colab_includes_formatted_values_in_dry_run(monkeypatch): assert config.dry_run assert query.strip() == expected.strip() + + +def test_read_gbq_colab_doesnt_set_destination_table(): + """For best performance, we don't try to workaround the 10 GB query results limitation.""" + session = mocks.create_bigquery_session() + + _ = session._read_gbq_colab("SELECT 'my-test-query';") + queries = session._queries # type: ignore + configs = session._job_configs # type: ignore + + for query, config in zip(queries, configs): + if query == "SELECT 'my-test-query';" and not config.dry_run: + break + + assert query == "SELECT 'my-test-query';" + assert config.destination is None diff --git a/tests/unit/session/test_read_gbq_query.py b/tests/unit/session/test_read_gbq_query.py new file mode 100644 index 0000000000..afd9922426 --- /dev/null +++ b/tests/unit/session/test_read_gbq_query.py @@ -0,0 +1,37 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for read_gbq_query functions.""" + +from bigframes.testing import mocks + + +def test_read_gbq_query_sets_destination_table(): + """Workaround the 10 GB query results limitation by setting a destination table. + + See internal issue b/303057336. + """ + # Use partial ordering mode to skip column uniqueness checks. + session = mocks.create_bigquery_session(ordering_mode="partial") + + _ = session.read_gbq_query("SELECT 'my-test-query';") + queries = session._queries # type: ignore + configs = session._job_configs # type: ignore + + for query, config in zip(queries, configs): + if query == "SELECT 'my-test-query';" and not config.dry_run: + break + + assert query == "SELECT 'my-test-query';" + assert config.destination is not None diff --git a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py index 6a4ae7cb60..0c67e05813 100644 --- a/tests/unit/session/test_read_gbq_table.py +++ b/tests/unit/session/test_read_gbq_table.py @@ -81,14 +81,17 @@ def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expecte } bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" - bqclient.get_table.return_value = table + session = mocks.create_bigquery_session( + bqclient=bqclient, table_schema=table.schema + ) + # Mock bqclient _after_ creating session to override its mocks. + bqclient.get_table.return_value = table + bqclient.query_and_wait.side_effect = None bqclient.query_and_wait.return_value = ( {"total_count": 3, "distinct_count": 3 if values_distinct else 2}, ) - session = mocks.create_bigquery_session( - bqclient=bqclient, table_schema=table.schema - ) + table._properties["location"] = session._location result = bf_read_gbq_table.infer_unique_columns(bqclient, table, index_cols) diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index cbd31f588a..26b74a3f8a 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -273,7 +273,11 @@ def test_default_index_warning_raised_by_read_gbq(table): bqclient.project = "test-project" bqclient.get_table.return_value = table bqclient.query_and_wait.return_value = ({"total_count": 3, "distinct_count": 2},) - session = mocks.create_bigquery_session(bqclient=bqclient) + session = mocks.create_bigquery_session( + bqclient=bqclient, + # DefaultIndexWarning is only relevant for strict mode. + ordering_mode="strict", + ) table._properties["location"] = session._location with pytest.warns(bigframes.exceptions.DefaultIndexWarning): @@ -296,7 +300,11 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_sequential_int64 bqclient.project = "test-project" bqclient.get_table.return_value = table bqclient.query_and_wait.return_value = ({"total_count": 4, "distinct_count": 3},) - session = mocks.create_bigquery_session(bqclient=bqclient) + session = mocks.create_bigquery_session( + bqclient=bqclient, + # DefaultIndexWarning is only relevant for strict mode. 
+ ordering_mode="strict", + ) table._properties["location"] = session._location # No warnings raised because we set the option allowing the default indexes. @@ -344,7 +352,10 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_columns( {"total_count": total_count, "distinct_count": distinct_count}, ) session = mocks.create_bigquery_session( - bqclient=bqclient, table_schema=table.schema + bqclient=bqclient, + table_schema=table.schema, + # DefaultIndexWarning is only relevant for strict mode. + ordering_mode="strict", ) table._properties["location"] = session._location @@ -386,7 +397,10 @@ def test_default_index_warning_not_raised_by_read_gbq_primary_key(table): bqclient.project = "test-project" bqclient.get_table.return_value = table session = mocks.create_bigquery_session( - bqclient=bqclient, table_schema=table.schema + bqclient=bqclient, + table_schema=table.schema, + # DefaultIndexWarning is only relevant for strict mode. + ordering_mode="strict", ) table._properties["location"] = session._location From 0b59cf1008613770fa1433c6da395e755c86fe22 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 4 Jun 2025 10:15:15 -0700 Subject: [PATCH 06/18] fix: allow KMeans model init parameter as k-means++ alias (#1790) --- bigframes/ml/cluster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index a03dc937dc..cd27357680 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -59,7 +59,8 @@ def __init__( warm_start: bool = False, ): self.n_clusters = n_clusters - self.init = init + # allow the alias to be compatible with sklean + self.init = "kmeans++" if init == "k-means++" else init self.init_col = init_col self.distance_type = distance_type self.max_iter = max_iter From c31f67bc697a3858bc26489dc3808ba545a96291 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 4 Jun 2025 13:54:55 -0700 Subject: [PATCH 07/18] test: skip str.isdigit test on pyarrow dev versions (#1793) --- tests/system/small/operations/test_strings.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 032d93c19d..8801faf657 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -325,6 +325,11 @@ def test_isalpha(weird_strings, weird_strings_pd): ) +@pytest.mark.skipif( + "dev" in pa.__version__, + # b/333484335 pyarrow is inconsistent on the behavior + reason="pyarrow dev version is inconsistent on isdigit behavior.", +) def test_isdigit(weird_strings, weird_strings_pd): pd_result = weird_strings_pd.str.isdigit() bf_result = weird_strings.str.isdigit().to_pandas() From 1d4564604baff612c3455fb088e442198084bf26 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 4 Jun 2025 13:57:31 -0700 Subject: [PATCH 08/18] test: Add dataframe unit test suite (#1751) --- bigframes/core/compile/polars/compiler.py | 422 +- bigframes/core/global_session.py | 20 + bigframes/core/rewrite/__init__.py | 3 +- bigframes/core/rewrite/windows.py | 33 +- bigframes/core/window_spec.py | 12 +- bigframes/operations/aggregations.py | 2 - bigframes/testing/polars_session.py | 36 +- noxfile.py | 15 +- tests/unit/test_dataframe_polars.py | 4422 +++++++++++++++++++++ 9 files changed, 4855 insertions(+), 110 deletions(-) create mode 100644 tests/unit/test_dataframe_polars.py diff --git 
a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 14d8e8501c..a0e85d8c69 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -16,14 +16,17 @@ import dataclasses import functools import itertools -from typing import cast, Optional, Sequence, Tuple, TYPE_CHECKING +import operator +from typing import cast, Literal, Optional, Sequence, Tuple, TYPE_CHECKING + +import pandas as pd import bigframes.core -from bigframes.core import window_spec +from bigframes.core import identifiers, nodes, ordering, window_spec import bigframes.core.expression as ex import bigframes.core.guid as guid -import bigframes.core.nodes as nodes import bigframes.core.rewrite +import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -37,6 +40,45 @@ polars_installed = False if polars_installed: + _DTYPE_MAPPING = { + # Direct mappings + bigframes.dtypes.INT_DTYPE: pl.Int64(), + bigframes.dtypes.FLOAT_DTYPE: pl.Float64(), + bigframes.dtypes.BOOL_DTYPE: pl.Boolean(), + bigframes.dtypes.STRING_DTYPE: pl.String(), + bigframes.dtypes.NUMERIC_DTYPE: pl.Decimal(38, 9), + bigframes.dtypes.BIGNUMERIC_DTYPE: pl.Decimal(76, 38), + bigframes.dtypes.BYTES_DTYPE: pl.Binary(), + bigframes.dtypes.DATE_DTYPE: pl.Date(), + bigframes.dtypes.DATETIME_DTYPE: pl.Datetime(time_zone=None), + bigframes.dtypes.TIMESTAMP_DTYPE: pl.Datetime(time_zone="UTC"), + bigframes.dtypes.TIME_DTYPE: pl.Time(), + bigframes.dtypes.TIMEDELTA_DTYPE: pl.Duration(), + # Indirect mappings + bigframes.dtypes.GEO_DTYPE: pl.String(), + bigframes.dtypes.JSON_DTYPE: pl.String(), + } + + def _bigframes_dtype_to_polars_dtype( + dtype: bigframes.dtypes.ExpressionType, + ) -> pl.DataType: + if dtype is None: + return pl.Null() + if bigframes.dtypes.is_struct_like(dtype): + return pl.Struct( + [ + pl.Field(name, _bigframes_dtype_to_polars_dtype(type)) + for name, type in bigframes.dtypes.get_struct_fields(dtype).items() + ] + ) + if bigframes.dtypes.is_array_like(dtype): + return pl.Array( + inner=_bigframes_dtype_to_polars_dtype( + bigframes.dtypes.get_array_inner_type(dtype) + ) + ) + else: + return _DTYPE_MAPPING[dtype] @dataclasses.dataclass(frozen=True) class PolarsExpressionCompiler: @@ -47,33 +89,45 @@ class PolarsExpressionCompiler: """ @functools.singledispatchmethod - def compile_expression(self, expression: ex.Expression): + def compile_expression(self, expression: ex.Expression) -> pl.Expr: raise NotImplementedError(f"Cannot compile expression: {expression}") @compile_expression.register def _( self, expression: ex.ScalarConstantExpression, - ): - return pl.lit(expression.value) + ) -> pl.Expr: + value = expression.value + if not isinstance(value, float) and pd.isna(value): # type: ignore + value = None + if expression.dtype is None: + return pl.lit(None) + return pl.lit(value, _bigframes_dtype_to_polars_dtype(expression.dtype)) @compile_expression.register def _( self, expression: ex.DerefOp, - ): + ) -> pl.Expr: return pl.col(expression.id.sql) + @compile_expression.register + def _( + self, + expression: ex.SchemaFieldRefExpression, + ) -> pl.Expr: + return pl.col(expression.field.id.sql) + @compile_expression.register def _( self, expression: ex.OpExpression, - ): + ) -> pl.Expr: # TODO: Complete the implementation, convert to hash dispatch op = expression.op args = tuple(map(self.compile_expression, expression.inputs)) if isinstance(op, ops.invert_op.__class__): - return args[0].neg() + return ~args[0] if 
isinstance(op, ops.and_op.__class__): return args[0] & args[1] if isinstance(op, ops.or_op.__class__): @@ -82,6 +136,21 @@ def _( return args[0] + args[1] if isinstance(op, ops.sub_op.__class__): return args[0] - args[1] + if isinstance(op, ops.mul_op.__class__): + return args[0] * args[1] + if isinstance(op, ops.div_op.__class__): + return args[0] / args[1] + if isinstance(op, ops.floordiv_op.__class__): + # TODO: Handle int // 0 + return args[0] // args[1] + if isinstance(op, (ops.pow_op.__class__, ops.unsafe_pow_op.__class__)): + return args[0] ** args[1] + if isinstance(op, ops.abs_op.__class__): + return args[0].abs() + if isinstance(op, ops.neg_op.__class__): + return args[0].neg() + if isinstance(op, ops.pos_op.__class__): + return args[0] if isinstance(op, ops.ge_op.__class__): return args[0] >= args[1] if isinstance(op, ops.gt_op.__class__): @@ -91,23 +160,48 @@ def _( if isinstance(op, ops.lt_op.__class__): return args[0] < args[1] if isinstance(op, ops.eq_op.__class__): - return args[0] == args[1] + return args[0].eq(args[1]) + if isinstance(op, ops.eq_null_match_op.__class__): + return args[0].eq_missing(args[1]) if isinstance(op, ops.ne_op.__class__): - return args[0] != args[1] + return args[0].ne(args[1]) + if isinstance(op, ops.IsInOp): + # TODO: Filter out types that can't be coerced to right type + if op.match_nulls or not any(map(pd.isna, op.values)): + # newer polars version have nulls_equal arg + return args[0].is_in(op.values) + else: + return args[0].is_in(op.values) or args[0].is_null() if isinstance(op, ops.mod_op.__class__): return args[0] % args[1] if isinstance(op, ops.coalesce_op.__class__): return pl.coalesce(*args) + if isinstance(op, ops.fillna_op.__class__): + return pl.coalesce(*args) + if isinstance(op, ops.isnull_op.__class__): + return args[0].is_null() + if isinstance(op, ops.notnull_op.__class__): + return args[0].is_not_null() if isinstance(op, ops.CaseWhenOp): expr = pl.when(args[0]).then(args[1]) for pred, result in zip(args[2::2], args[3::2]): - return expr.when(pred).then(result) + expr = expr.when(pred).then(result) # type: ignore return expr if isinstance(op, ops.where_op.__class__): original, condition, otherwise = args return pl.when(condition).then(original).otherwise(otherwise) + if isinstance(op, ops.AsTypeOp): + return self.astype(args[0], op.to_type, safe=op.safe) + raise NotImplementedError(f"Polars compiler hasn't implemented {op}") + def astype( + self, col: pl.Expr, dtype: bigframes.dtypes.Dtype, safe: bool + ) -> pl.Expr: + # TODO: Polars casting works differently, need to lower instead to specific conversion ops. + # eg. We want "True" instead of "true" for bool to string. 
+ return col.cast(_DTYPE_MAPPING[dtype], strict=not safe) + @dataclasses.dataclass(frozen=True) class PolarsAggregateCompiler: scalar_compiler = PolarsExpressionCompiler() @@ -149,12 +243,26 @@ def compile_agg_expr(self, expr: ex.Aggregation): return self.compile_agg_op(expr.op, inputs) - def compile_agg_op(self, op: agg_ops.WindowOp, inputs: Sequence[str] = []): + def compile_agg_op( + self, op: agg_ops.WindowOp, inputs: Sequence[str] = [] + ) -> pl.Expr: if isinstance(op, agg_ops.ProductOp): - # TODO: Need schema to cast back to original type if posisble (eg float back to int) - return pl.col(*inputs).log().sum().exp() + # TODO: Fix datatype inconsistency with float/int + return pl.col(*inputs).product() if isinstance(op, agg_ops.SumOp): return pl.sum(*inputs) + if isinstance(op, (agg_ops.SizeOp, agg_ops.SizeUnaryOp)): + return pl.len() + if isinstance(op, agg_ops.MeanOp): + return pl.mean(*inputs) + if isinstance(op, agg_ops.MedianOp): + return pl.median(*inputs) + if isinstance(op, agg_ops.AllOp): + return pl.all(*inputs) + if isinstance(op, agg_ops.AnyOp): + return pl.any(*inputs) # type: ignore + if isinstance(op, agg_ops.NuniqueOp): + return pl.col(*inputs).drop_nulls().n_unique() if isinstance(op, agg_ops.MinOp): return pl.min(*inputs) if isinstance(op, agg_ops.MaxOp): @@ -162,7 +270,35 @@ def compile_agg_op(self, op: agg_ops.WindowOp, inputs: Sequence[str] = []): if isinstance(op, agg_ops.CountOp): return pl.count(*inputs) if isinstance(op, agg_ops.CorrOp): - return pl.corr(*inputs) + return pl.corr( + pl.col(inputs[0]).fill_nan(None), pl.col(inputs[1]).fill_nan(None) + ) + if isinstance(op, agg_ops.CovOp): + return pl.cov( + pl.col(inputs[0]).fill_nan(None), pl.col(inputs[1]).fill_nan(None) + ) + if isinstance(op, agg_ops.StdOp): + return pl.std(inputs[0]) + if isinstance(op, agg_ops.VarOp): + return pl.var(inputs[0]) + if isinstance(op, agg_ops.PopVarOp): + return pl.var(inputs[0], ddof=0) + if isinstance(op, agg_ops.FirstNonNullOp): + return pl.col(*inputs).drop_nulls().first() + if isinstance(op, agg_ops.LastNonNullOp): + return pl.col(*inputs).drop_nulls().last() + if isinstance(op, agg_ops.FirstOp): + return pl.col(*inputs).first() + if isinstance(op, agg_ops.LastOp): + return pl.col(*inputs).last() + if isinstance(op, agg_ops.ShiftOp): + return pl.col(*inputs).shift(op.periods) + if isinstance(op, agg_ops.DiffOp): + return pl.col(*inputs) - pl.col(*inputs).shift(op.periods) + if isinstance(op, agg_ops.AnyValueOp): + return pl.max( + *inputs + ) # probably something faster? maybe just get first item? raise NotImplementedError( f"Aggregate op {op} not yet supported in polars engine." 
) @@ -197,11 +333,14 @@ def compile(self, array_value: bigframes.core.ArrayValue) -> pl.LazyFrame: # TODO: Create standard way to configure BFET -> BFET rewrites # Polars has incomplete slice support in lazy mode - node = nodes.bottom_up(array_value.node, bigframes.core.rewrite.rewrite_slice) + node = array_value.node + node = bigframes.core.rewrite.column_pruning(node) + node = nodes.bottom_up(node, bigframes.core.rewrite.rewrite_slice) + node = bigframes.core.rewrite.pull_out_window_order(node) return self.compile_node(node) @functools.singledispatchmethod - def compile_node(self, node: nodes.BigFrameNode): + def compile_node(self, node: nodes.BigFrameNode) -> pl.LazyFrame: """Defines transformation but isn't cached, always use compile_node instead""" raise ValueError(f"Can't compile unrecognized node: {node}") @@ -213,7 +352,12 @@ def compile_readlocal(self, node: nodes.ReadLocalNode): lazy_frame = cast( pl.DataFrame, pl.from_arrow(node.local_data_source.data) ).lazy() - return lazy_frame.select(cols_to_read.keys()).rename(cols_to_read) + lazy_frame = lazy_frame.select(cols_to_read.keys()).rename(cols_to_read) + if node.offsets_col: + lazy_frame = lazy_frame.with_columns( + [pl.int_range(pl.len(), dtype=pl.Int64).alias(node.offsets_col.sql)] + ) + return lazy_frame @compile_node.register def compile_filter(self, node: nodes.FilterNode): @@ -227,17 +371,18 @@ def compile_orderby(self, node: nodes.OrderByNode): if len(node.by) == 0: # pragma: no cover return frame - - frame = frame.sort( - [ - self.expr_compiler.compile_expression(by.scalar_expression) - for by in node.by - ], - descending=[not by.direction.is_ascending for by in node.by], - nulls_last=[by.na_last for by in node.by], + return self._sort(frame, node.by) + + def _sort( + self, frame: pl.LazyFrame, by: Sequence[ordering.OrderingExpression] + ) -> pl.LazyFrame: + sorted = frame.sort( + [self.expr_compiler.compile_expression(by.scalar_expression) for by in by], + descending=[not by.direction.is_ascending for by in by], + nulls_last=[by.na_last for by in by], maintain_order=True, ) - return frame + return sorted @compile_node.register def compile_reversed(self, node: nodes.ReversedNode): @@ -251,10 +396,15 @@ def compile_selection(self, node: nodes.SelectionNode): @compile_node.register def compile_projection(self, node: nodes.ProjectionNode): - new_cols = [ - self.expr_compiler.compile_expression(ex).alias(name.sql) - for ex, name in node.assignments - ] + new_cols = [] + for proj_expr, name in node.assignments: + bound_expr = ex.bind_schema_fields(proj_expr, node.child.field_by_id) + new_col = self.expr_compiler.compile_expression(bound_expr).alias(name.sql) + if bound_expr.output_type is None: + new_col = new_col.cast( + _bigframes_dtype_to_polars_dtype(bigframes.dtypes.DEFAULT_DTYPE) + ) + new_cols.append(new_col) return self.compile_node(node.child).with_columns(new_cols) @compile_node.register @@ -265,37 +415,91 @@ def compile_offsets(self, node: nodes.PromoteOffsetsNode): @compile_node.register def compile_join(self, node: nodes.JoinNode): - # Always totally order this, as adding offsets is relatively cheap for in-memory columnar data - left = self.compile_node(node.left_child).with_columns( + left = self.compile_node(node.left_child) + right = self.compile_node(node.right_child) + left_on = [l_name.id.sql for l_name, _ in node.conditions] + right_on = [r_name.id.sql for _, r_name in node.conditions] + if node.type == "right": + return self._ordered_join( + right, left, "left", right_on, left_on, node.joins_nulls + 
).select([id.sql for id in node.ids]) + return self._ordered_join( + left, right, node.type, left_on, right_on, node.joins_nulls + ) + + def _ordered_join( + self, + left_frame: pl.LazyFrame, + right_frame: pl.LazyFrame, + how: Literal["inner", "outer", "left", "cross"], + left_on: Sequence[str], + right_on: Sequence[str], + join_nulls: bool, + ): + if how == "right": + # seems to cause seg faults as of v1.30 for no apparent reason + raise ValueError("right join not supported") + left = left_frame.with_columns( [ pl.int_range(pl.len()).alias("_bf_join_l"), ] ) - right = self.compile_node(node.right_child).with_columns( + right = right_frame.with_columns( [ pl.int_range(pl.len()).alias("_bf_join_r"), ] ) - if node.type != "cross": - left_on = [l_name.id.sql for l_name, _ in node.conditions] - right_on = [r_name.id.sql for _, r_name in node.conditions] + if how != "cross": joined = left.join( - right, how=node.type, left_on=left_on, right_on=right_on, coalesce=False + right, + how=how, + left_on=left_on, + right_on=right_on, + # Note: join_nulls renamed to nulls_equal for polars 1.24 + join_nulls=join_nulls, # type: ignore + coalesce=False, ) else: - joined = left.join(right, how=node.type) - return joined.sort(["_bf_join_l", "_bf_join_r"]).drop( + joined = left.join(right, how=how, coalesce=False) + + join_order = ( + ["_bf_join_l", "_bf_join_r"] + if how != "right" + else ["_bf_join_r", "_bf_join_l"] + ) + return joined.sort(join_order, nulls_last=True).drop( ["_bf_join_l", "_bf_join_r"] ) @compile_node.register def compile_concat(self, node: nodes.ConcatNode): - return pl.concat(self.compile_node(child) for child in node.child_nodes) + child_frames = [self.compile_node(child) for child in node.child_nodes] + child_frames = [ + frame.rename( + {col: id.sql for col, id in zip(frame.columns, node.output_ids)} + ) + for frame in child_frames + ] + df = pl.concat(child_frames) + return df @compile_node.register def compile_agg(self, node: nodes.AggregateNode): df = self.compile_node(node.child) - + if node.dropna and len(node.by_column_ids) > 0: + df = df.filter( + [pl.col(ref.id.sql).is_not_null() for ref in node.by_column_ids] + ) + if node.order_by: + df = self._sort(df, node.order_by) + return self._aggregate(df, node.aggregations, node.by_column_ids) + + def _aggregate( + self, + df: pl.LazyFrame, + aggregations: Sequence[Tuple[ex.Aggregation, identifiers.ColumnId]], + grouping_keys: Tuple[ex.DerefOp, ...], + ) -> pl.LazyFrame: # Need to materialize columns to broadcast constants agg_inputs = [ list( @@ -304,7 +508,7 @@ def compile_agg(self, node: nodes.AggregateNode): self.agg_compiler.get_args(agg), ) ) - for agg, _ in node.aggregations + for agg, _ in aggregations ] df_agg_inputs = df.with_columns(itertools.chain(*agg_inputs)) @@ -313,18 +517,19 @@ def compile_agg(self, node: nodes.AggregateNode): self.agg_compiler.compile_agg_op( agg.op, list(map(lambda x: x.meta.output_name(), inputs)) ).alias(id.sql) - for (agg, id), inputs in zip(node.aggregations, agg_inputs) + for (agg, id), inputs in zip(aggregations, agg_inputs) ] - if len(node.by_column_ids) > 0: - group_exprs = [pl.col(ref.id.sql) for ref in node.by_column_ids] + if len(grouping_keys) > 0: + group_exprs = [pl.col(ref.id.sql) for ref in grouping_keys] grouped_df = df_agg_inputs.group_by(group_exprs) - return grouped_df.agg(agg_exprs).sort(group_exprs) + return grouped_df.agg(agg_exprs).sort(group_exprs, nulls_last=True) else: return df_agg_inputs.select(agg_exprs) @compile_node.register def compile_explode(self, node: 
nodes.ExplodeNode): + assert node.offsets_col is None df = self.compile_node(node.child) cols = [pl.col(col.id.sql) for col in node.column_ids] return df.explode(cols) @@ -338,55 +543,92 @@ def compile_sample(self, node: nodes.RandomSampleNode): @compile_node.register def compile_window(self, node: nodes.WindowOpNode): df = self.compile_node(node.child) - agg_expr = self.agg_compiler.compile_agg_expr(node.expression).alias( - node.output_name.sql - ) - # Three window types: completely unbound, grouped and row bounded window = node.window_spec - + # Should have been handled by reweriter + assert len(window.ordering) == 0 if window.min_periods > 0: raise NotImplementedError("min_period not yet supported for polars engine") - if window.bounds is None: + if (window.bounds is None) or (window.is_unbounded): # polars will automatically broadcast the aggregate to the matching input rows - if len(window.grouping_keys) == 0: # unbound window - pass - else: # partition-only window - agg_expr = agg_expr.over( - partition_by=[ref.id.sql for ref in window.grouping_keys] - ) - return df.with_columns([agg_expr]) - + agg_pl = self.agg_compiler.compile_agg_expr(node.expression) + if window.grouping_keys: + agg_pl = agg_pl.over(id.id.sql for id in window.grouping_keys) + result = df.with_columns(agg_pl.alias(node.output_name.sql)) else: # row-bounded window - assert isinstance(window.bounds, window_spec.RowsWindowBounds) - # Polars API semi-bounded, and any grouped rolling window challenging - # https://github.com/pola-rs/polars/issues/4799 - # https://github.com/pola-rs/polars/issues/8976 - index_col_name = "_bf_pl_engine_offsets" - indexed_df = df.with_row_index(index_col_name) - if len(window.grouping_keys) == 0: # rolling-only window - # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html - offset_n = window.bounds.start - period_n = _get_period(window.bounds) or df.collect().height - results = indexed_df.rolling( - index_column=index_col_name, - period=f"{period_n}i", - offset=f"{offset_n}i" if offset_n else None, - ).agg(agg_expr) - else: # groupby-rolling window - raise NotImplementedError( - "Groupby rolling windows not yet implemented in polars engine" - ) - # polars is columnar, so this is efficient - # TODO: why can't just add columns? 
- return pl.concat([df, results], how="horizontal") + window_result = self._calc_row_analytic_func( + df, node.expression, node.window_spec, node.output_name.sql + ) + result = pl.concat([df, window_result], how="horizontal") + + # Probably easier just to pull this out as a rewriter + if ( + node.expression.op.skips_nulls + and not node.never_skip_nulls + and node.expression.column_references + ): + nullity_expr = functools.reduce( + operator.or_, + ( + pl.col(column.sql).is_null() + for column in node.expression.column_references + ), + ) + result = result.with_columns( + pl.when(nullity_expr) + .then(None) + .otherwise(pl.col(node.output_name.sql)) + .alias(node.output_name.sql) + ) + return result + + def _calc_row_analytic_func( + self, + frame: pl.LazyFrame, + agg_expr: ex.Aggregation, + window: window_spec.WindowSpec, + name: str, + ) -> pl.LazyFrame: + if not isinstance(window.bounds, window_spec.RowsWindowBounds): + raise NotImplementedError("Only row bounds supported by polars engine") + groupby = None + if len(window.grouping_keys) > 0: + groupby = [ + self.expr_compiler.compile_expression(ref) + for ref in window.grouping_keys + ] + + # Polars API semi-bounded, and any grouped rolling window challenging + # https://github.com/pola-rs/polars/issues/4799 + # https://github.com/pola-rs/polars/issues/8976 + pl_agg_expr = self.agg_compiler.compile_agg_expr(agg_expr).alias(name) + index_col_name = "_bf_pl_engine_offsets" + indexed_df = frame.with_row_index(index_col_name) + # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.rolling.html + period_n, offset_n = _get_period_and_offset(window.bounds) + return ( + indexed_df.rolling( + index_column=index_col_name, + period=f"{period_n}i", + offset=f"{offset_n}i" if (offset_n is not None) else None, + group_by=groupby, + ) + .agg(pl_agg_expr) + .select(name) + ) -def _get_period(bounds: window_spec.RowsWindowBounds) -> Optional[int]: - """Returns None if the boundary is infinite.""" - if bounds.start is None or bounds.end is None: - return None +def _get_period_and_offset( + bounds: window_spec.RowsWindowBounds, +) -> tuple[int, Optional[int]]: + # fixed size window + if (bounds.start is not None) and (bounds.end is not None): + return ((bounds.end - bounds.start + 1), bounds.start - 1) - # collecting height is a massive kludge - return bounds.end - bounds.start + 1 + LARGE_N = 1000000000 + if bounds.start is not None: + return (LARGE_N, bounds.start - 1) + if bounds.end is not None: + return (LARGE_N, None) + raise ValueError("Not a bounded window") diff --git a/bigframes/core/global_session.py b/bigframes/core/global_session.py index d4d70f5a06..8732b55990 100644 --- a/bigframes/core/global_session.py +++ b/bigframes/core/global_session.py @@ -112,3 +112,23 @@ def get_global_session(): def with_default_session(func: Callable[..., _T], *args, **kwargs) -> _T: return func(get_global_session(), *args, **kwargs) + + +class _GlobalSessionContext: + """ + Context manager for testing that sets global session. 
+ """ + + def __init__(self, session: bigframes.session.Session): + self._session = session + + def __enter__(self): + global _global_session, _global_session_lock + with _global_session_lock: + self._previous_session = _global_session + _global_session = self._session + + def __exit__(self, *exc_details): + global _global_session, _global_session_lock + with _global_session_lock: + _global_session = self._previous_session diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index b8f1d26db8..5d554d45d7 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -24,7 +24,7 @@ ) from bigframes.core.rewrite.slices import pull_out_limit, pull_up_limits, rewrite_slice from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions -from bigframes.core.rewrite.windows import rewrite_range_rolling +from bigframes.core.rewrite.windows import pull_out_window_order, rewrite_range_rolling __all__ = [ "legacy_join_as_projection", @@ -41,4 +41,5 @@ "bake_order", "try_reduce_to_local_scan", "fold_row_counts", + "pull_out_window_order", ] diff --git a/bigframes/core/rewrite/windows.py b/bigframes/core/rewrite/windows.py index 9f55db23af..6e9ba0dd3d 100644 --- a/bigframes/core/rewrite/windows.py +++ b/bigframes/core/rewrite/windows.py @@ -17,7 +17,7 @@ import dataclasses from bigframes import operations as ops -from bigframes.core import nodes +from bigframes.core import guid, identifiers, nodes, ordering def rewrite_range_rolling(node: nodes.BigFrameNode) -> nodes.BigFrameNode: @@ -43,3 +43,34 @@ def rewrite_range_rolling(node: nodes.BigFrameNode) -> nodes.BigFrameNode: node, window_spec=dataclasses.replace(node.window_spec, ordering=(new_ordering,)), ) + + +def pull_out_window_order(root: nodes.BigFrameNode) -> nodes.BigFrameNode: + return root.bottom_up(rewrite_window_node) + + +def rewrite_window_node(node: nodes.BigFrameNode) -> nodes.BigFrameNode: + if not isinstance(node, nodes.WindowOpNode): + return node + if len(node.window_spec.ordering) == 0: + return node + else: + offsets_id = guid.generate_guid() + w_offsets = nodes.PromoteOffsetsNode( + node.child, identifiers.ColumnId(offsets_id) + ) + sorted_child = nodes.OrderByNode(w_offsets, node.window_spec.ordering) + new_window_node = dataclasses.replace( + node, + child=sorted_child, + window_spec=node.window_spec.without_order(force=True), + ) + w_resetted_order = nodes.OrderByNode( + new_window_node, + by=(ordering.ascending_over(identifiers.ColumnId(offsets_id)),), + is_total_order=True, + ) + w_offsets_dropped = nodes.SelectionNode( + w_resetted_order, tuple(nodes.AliasedRef.identity(id) for id in node.ids) + ) + return w_offsets_dropped diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py index d08ba3d12a..2be30135ee 100644 --- a/bigframes/core/window_spec.py +++ b/bigframes/core/window_spec.py @@ -234,7 +234,9 @@ def is_row_bounded(self): This is relevant for determining whether the window requires a total order to calculate deterministically. """ - return isinstance(self.bounds, RowsWindowBounds) + return isinstance(self.bounds, RowsWindowBounds) and ( + (self.bounds.start is not None) or (self.bounds.end is not None) + ) @property def is_range_bounded(self): @@ -254,7 +256,9 @@ def is_unbounded(self): This is relevant for determining whether the window requires a total order to calculate deterministically. 
""" - return self.bounds is None + return self.bounds is None or ( + self.bounds.start is None and self.bounds.end is None + ) @property def all_referenced_columns(self) -> Set[ids.ColumnId]: @@ -266,9 +270,9 @@ def all_referenced_columns(self) -> Set[ids.ColumnId]: ) return set(itertools.chain((i.id for i in self.grouping_keys), ordering_vars)) - def without_order(self) -> WindowSpec: + def without_order(self, force: bool = False) -> WindowSpec: """Removes ordering clause if ordering isn't required to define bounds.""" - if self.is_row_bounded: + if self.is_row_bounded and not force: raise ValueError("Cannot remove order from row-bounded window") return replace(self, ordering=()) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index e3f15e67a1..1c321c0bf8 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -439,7 +439,6 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return dtypes.INT_DTYPE -# TODO: Convert to NullaryWindowOp @dataclasses.dataclass(frozen=True) class RankOp(UnaryWindowOp): name: ClassVar[str] = "rank" @@ -456,7 +455,6 @@ def implicitly_inherits_order(self): return False -# TODO: Convert to NullaryWindowOp @dataclasses.dataclass(frozen=True) class DenseRankOp(UnaryWindowOp): @property diff --git a/bigframes/testing/polars_session.py b/bigframes/testing/polars_session.py index f8dda8da55..5e5de2d0b2 100644 --- a/bigframes/testing/polars_session.py +++ b/bigframes/testing/polars_session.py @@ -20,12 +20,9 @@ import polars import bigframes -import bigframes.clients import bigframes.core.blocks import bigframes.core.compile.polars -import bigframes.core.ordering import bigframes.dataframe -import bigframes.session.clients import bigframes.session.executor import bigframes.session.metrics @@ -35,6 +32,26 @@ class TestExecutor(bigframes.session.executor.Executor): compiler = bigframes.core.compile.polars.PolarsCompiler() + def peek( + self, + array_value: bigframes.core.ArrayValue, + n_rows: int, + use_explicit_destination: Optional[bool] = False, + ): + """ + A 'peek' efficiently accesses a small number of rows in the dataframe. + """ + lazy_frame: polars.LazyFrame = self.compiler.compile(array_value) + pa_table = lazy_frame.collect().limit(n_rows).to_arrow() + # Currently, pyarrow types might not quite be exactly the ones in the bigframes schema. + # Nullability may be different, and might use large versions of list, string datatypes. 
+ return bigframes.session.executor.ExecuteResult( + arrow_batches=pa_table.to_batches(), + schema=array_value.schema, + total_bytes=pa_table.nbytes, + total_rows=pa_table.num_rows, + ) + def execute( self, array_value: bigframes.core.ArrayValue, @@ -58,6 +75,14 @@ def execute( total_rows=pa_table.num_rows, ) + def cached( + self, + array_value: bigframes.core.ArrayValue, + *, + config, + ) -> None: + return + class TestSession(bigframes.session.Session): def __init__(self): @@ -92,3 +117,8 @@ def read_pandas(self, pandas_dataframe, write_engine="default"): pandas_dataframe = pandas_dataframe.to_frame() local_block = bigframes.core.blocks.Block.from_local(pandas_dataframe, self) return bigframes.dataframe.DataFrame(local_block) + + @property + def bqclient(self): + # prevents logger from trying to call bq upon any errors + return None diff --git a/noxfile.py b/noxfile.py index 297e8f9d6f..e3cfbf83a4 100644 --- a/noxfile.py +++ b/noxfile.py @@ -79,7 +79,7 @@ UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = ["tests"] UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = { - "3.12": ["polars", "scikit-learn"], + "3.12": ["tests", "polars", "scikit-learn"], } # 3.10 is needed for Windows tests as it is the only version installed in the @@ -202,14 +202,11 @@ def install_unittest_dependencies(session, install_test_extra, *constraints): if UNIT_TEST_LOCAL_DEPENDENCIES: session.install(*UNIT_TEST_LOCAL_DEPENDENCIES, *constraints) - if install_test_extra and UNIT_TEST_EXTRAS_BY_PYTHON: - extras = UNIT_TEST_EXTRAS_BY_PYTHON.get(session.python, []) - if install_test_extra and UNIT_TEST_EXTRAS: - extras = UNIT_TEST_EXTRAS - else: - extras = [] - - if extras: + if install_test_extra: + if session.python in UNIT_TEST_EXTRAS_BY_PYTHON: + extras = UNIT_TEST_EXTRAS_BY_PYTHON[session.python] + else: + extras = UNIT_TEST_EXTRAS session.install("-e", f".[{','.join(extras)}]", *constraints) else: session.install("-e", ".", *constraints) diff --git a/tests/unit/test_dataframe_polars.py b/tests/unit/test_dataframe_polars.py new file mode 100644 index 0000000000..2bda563418 --- /dev/null +++ b/tests/unit/test_dataframe_polars.py @@ -0,0 +1,4422 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
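+
+"""Unit tests for DataFrame operations, run against the in-memory polars-backed test session rather than BigQuery."""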
+ +import io +import operator +import pathlib +import tempfile +import typing +from typing import Generator, List, Tuple + +import numpy as np +import pandas as pd +import pandas.testing +import pytest + +import bigframes +import bigframes._config.display_options as display_options +import bigframes.core.indexes as bf_indexes +import bigframes.dataframe as dataframe +import bigframes.pandas as bpd +import bigframes.series as series +from tests.system.utils import ( + assert_dfs_equivalent, + assert_pandas_df_equal, + assert_series_equal, + assert_series_equivalent, + convert_pandas_dtypes, +) + +pytest.importorskip("polars") +pytest.importorskip("pandas", minversion="2.0.0") + +CURRENT_DIR = pathlib.Path(__file__).parent +DATA_DIR = CURRENT_DIR.parent / "data" + + +@pytest.fixture(scope="module", autouse=True) +def session() -> Generator[bigframes.Session, None, None]: + import bigframes.core.global_session + from bigframes.testing import polars_session + + session = polars_session.TestSession() + with bigframes.core.global_session._GlobalSessionContext(session): + yield session + + +@pytest.fixture(scope="module") +def scalars_pandas_df_index() -> pd.DataFrame: + """pd.DataFrame pointing at test data.""" + + df = pd.read_json( + DATA_DIR / "scalars.jsonl", + lines=True, + ) + convert_pandas_dtypes(df, bytes_col=True) + + df = df.set_index("rowindex", drop=False) + df.index.name = None + return df.set_index("rowindex").sort_index() + + +@pytest.fixture(scope="module") +def scalars_df_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index) + + +@pytest.fixture(scope="module") +def scalars_df_2_index( + session: bigframes.Session, scalars_pandas_df_index +) -> bpd.DataFrame: + return session.read_pandas(scalars_pandas_df_index) + + +@pytest.fixture(scope="module") +def scalars_dfs( + scalars_df_index, + scalars_pandas_df_index, +): + return scalars_df_index, scalars_pandas_df_index + + +def test_df_construct_copy(scalars_dfs): + columns = ["int64_col", "string_col", "float64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + # Make the mapping from label to col_id non-trivial + bf_df = scalars_df.copy() + bf_df["int64_col"] = bf_df["int64_col"] / 2 + pd_df = scalars_pandas_df.copy() + pd_df["int64_col"] = pd_df["int64_col"] / 2 + + bf_result = dataframe.DataFrame(bf_df, columns=columns).to_pandas() + + pd_result = pd.DataFrame(pd_df, columns=columns) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_construct_pandas_default(scalars_dfs): + # This should trigger the inlined codepath + columns = [ + "int64_too", + "int64_col", + "float64_col", + "bool_col", + "string_col", + "date_col", + "datetime_col", + "numeric_col", + "float64_col", + "time_col", + "timestamp_col", + ] + _, scalars_pandas_df = scalars_dfs + bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns).to_pandas() + pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_construct_structs(session): + pd_frame = pd.Series( + [ + {"version": 1, "project": "pandas"}, + {"version": 2, "project": "pandas"}, + {"version": 1, "project": "numpy"}, + ] + ).to_frame() + bf_series = session.read_pandas(pd_frame) + pd.testing.assert_frame_equal( + bf_series.to_pandas(), pd_frame, check_index_type=False, check_dtype=False + ) + + +def test_df_construct_pandas_set_dtype(scalars_dfs): + columns = [ + "int64_too", + "int64_col", + "float64_col", + 
"bool_col", + ] + _, scalars_pandas_df = scalars_dfs + bf_result = dataframe.DataFrame( + scalars_pandas_df, columns=columns, dtype="Float64" + ).to_pandas() + pd_result = pd.DataFrame(scalars_pandas_df, columns=columns, dtype="Float64") + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_construct_from_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = dataframe.DataFrame( + {"a": scalars_df["int64_col"], "b": scalars_df["string_col"]}, + dtype="string[pyarrow]", + ) + pd_result = pd.DataFrame( + {"a": scalars_pandas_df["int64_col"], "b": scalars_pandas_df["string_col"]}, + dtype="string[pyarrow]", + ) + assert_dfs_equivalent(pd_result, bf_result) + + +def test_df_construct_from_dict(): + input_dict = { + "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"], + # With a space in column name. We use standardized SQL schema ids to solve the problem that BQ schema doesn't support column names with spaces. b/296751058 + "Max Speed": [380.0, 370.0, 24.0, 26.0], + } + bf_result = dataframe.DataFrame(input_dict).to_pandas() + pd_result = pd.DataFrame(input_dict) + + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_df_construct_dtype(): + data = { + "int_col": [1, 2, 3], + "string_col": ["1.1", "2.0", "3.5"], + "float_col": [1.0, 2.0, 3.0], + } + dtype = pd.StringDtype(storage="pyarrow") + bf_result = dataframe.DataFrame(data, dtype=dtype) + pd_result = pd.DataFrame(data, dtype=dtype) + pd_result.index = pd_result.index.astype("Int64") + pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +def test_get_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + series = scalars_df[col_name] + bf_result = series.to_pandas() + pd_result = scalars_pandas_df[col_name] + assert_series_equal(bf_result, pd_result) + + +def test_get_column_nonstring(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df.rename(columns={"int64_col": 123.1})[123.1] + bf_result = series.to_pandas() + pd_result = scalars_pandas_df.rename(columns={"int64_col": 123.1})[123.1] + assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + "row_slice", + [ + (slice(1, 7, 2)), + (slice(1, 7, None)), + (slice(None, -3, None)), + ], +) +def test_get_rows_with_slice(scalars_dfs, row_slice): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[row_slice].to_pandas() + pd_result = scalars_pandas_df[row_slice] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_hasattr(scalars_dfs): + scalars_df, _ = scalars_dfs + assert hasattr(scalars_df, "int64_col") + assert hasattr(scalars_df, "head") + assert not hasattr(scalars_df, "not_exist") + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_head_with_custom_column_labels( + scalars_df_index, scalars_pandas_df_index, ordered +): + rename_mapping = { + "int64_col": "Integer Column", + "string_col": "言語列", + } + bf_df = scalars_df_index.rename(columns=rename_mapping).head(3) + bf_result = bf_df.to_pandas(ordered=ordered) + pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).head(3) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): + rename_mapping = { + "int64_col": "Integer Column", + "string_col": "言語列", + } + bf_df = scalars_df_index.rename(columns=rename_mapping).tail(3) + bf_result = 
bf_df.to_pandas() + pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).tail(3) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_get_column_by_attr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df.int64_col + bf_result = series.to_pandas() + pd_result = scalars_pandas_df.int64_col + assert_series_equal(bf_result, pd_result) + + +def test_get_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = ["bool_col", "float64_col", "int64_col"] + df_subset = scalars_df.get(col_names) + df_pandas = df_subset.to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df[col_names].columns + ) + + +def test_get_columns_default(scalars_dfs): + scalars_df, _ = scalars_dfs + col_names = ["not", "column", "names"] + result = scalars_df.get(col_names, "default_val") + assert result == "default_val" + + +@pytest.mark.parametrize( + ("loc", "column", "value", "allow_duplicates"), + [ + (0, 666, 2, False), + (5, "float64_col", 2.2, True), + (13, "rowindex_2", [8, 7, 6, 5, 4, 3, 2, 1, 0], True), + pytest.param( + 14, + "test", + 2, + False, + marks=pytest.mark.xfail( + raises=IndexError, + ), + ), + pytest.param( + 12, + "int64_col", + 2, + False, + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], +) +def test_insert(scalars_dfs, loc, column, value, allow_duplicates): + scalars_df, scalars_pandas_df = scalars_dfs + # insert works inplace, so will influence other tests. + # make a copy to avoid inplace changes. + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.insert(loc, column, value, allow_duplicates) + pd_df.insert(loc, column, value, allow_duplicates) + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df, check_dtype=False) + + +def test_where_series_cond(scalars_df_index, scalars_pandas_df_index): + # Condition is dataframe, other is None (as default). + cond_bf = scalars_df_index["int64_col"] > 0 + cond_pd = scalars_pandas_df_index["int64_col"] > 0 + bf_result = scalars_df_index.where(cond_bf).to_pandas() + pd_result = scalars_pandas_df_index.where(cond_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_mask_series_cond(scalars_df_index, scalars_pandas_df_index): + cond_bf = scalars_df_index["int64_col"] > 0 + cond_pd = scalars_pandas_df_index["int64_col"] > 0 + + bf_df = scalars_df_index[["int64_too", "int64_col", "float64_col"]] + pd_df = scalars_pandas_df_index[["int64_too", "int64_col", "float64_col"]] + bf_result = bf_df.mask(cond_bf, bf_df + 1).to_pandas() + pd_result = pd_df.mask(cond_pd, pd_df + 1) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_series_multi_index(scalars_df_index, scalars_pandas_df_index): + # Test when a dataframe has multi-index or multi-columns. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + + dataframe_bf.columns = pd.MultiIndex.from_tuples( + [("str1", 1), ("str2", 2)], names=["STR", "INT"] + ) + cond_bf = dataframe_bf["str1"] > 0 + + with pytest.raises(NotImplementedError) as context: + dataframe_bf.where(cond_bf).to_pandas() + assert ( + str(context.value) + == "The dataframe.where() method does not support multi-index and/or multi-column." + ) + + +def test_where_series_cond_const_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a series, other is a constant. 
+ columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + dataframe_bf.columns.name = "test_name" + dataframe_pd.columns.name = "test_name" + + cond_bf = dataframe_bf["int64_col"] > 0 + cond_pd = dataframe_pd["int64_col"] > 0 + other = 0 + + bf_result = dataframe_bf.where(cond_bf, other).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_series_cond_dataframe_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a series, other is a dataframe. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf["int64_col"] > 0 + cond_pd = dataframe_pd["int64_col"] > 0 + other_bf = -dataframe_bf + other_pd = -dataframe_pd + + bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond(scalars_df_index, scalars_pandas_df_index): + # Condition is a dataframe, other is None. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + + bf_result = dataframe_bf.where(cond_bf, None).to_pandas() + pd_result = dataframe_pd.where(cond_pd, None) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond_const_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a dataframe, other is a constant. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + other_bf = 10 + other_pd = 10 + + bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond_dataframe_other( + scalars_df_index, scalars_pandas_df_index +): + # Condition is a dataframe, other is a dataframe. 
+ columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + other_bf = dataframe_bf * 2 + other_pd = dataframe_pd * 2 + + bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_drop_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + df_pandas = scalars_df.drop(columns=col_name).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.drop(columns=col_name).columns + ) + + +def test_drop_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = ["int64_col", "geography_col", "time_col"] + df_pandas = scalars_df.drop(columns=col_names).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.drop(columns=col_names).columns + ) + + +def test_drop_labels_axis_1(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + labels = ["int64_col", "geography_col", "time_col"] + + pd_result = scalars_pandas_df.drop(labels=labels, axis=1) + bf_result = scalars_df.drop(labels=labels, axis=1).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_with_custom_column_labels(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + rename_mapping = { + "int64_col": "Integer Column", + "string_col": "言語列", + } + dropped_columns = [ + "言語列", + "timestamp_col", + ] + bf_df = scalars_df.rename(columns=rename_mapping).drop(columns=dropped_columns) + bf_result = bf_df.to_pandas() + pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop( + columns=dropped_columns + ) + assert_pandas_df_equal(bf_result, pd_result) + + +def test_df_memory_usage(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.memory_usage() + bf_result = scalars_df.memory_usage() + + pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5) + + +def test_df_info(scalars_dfs): + expected = ( + "\n" + "Index: 9 entries, 0 to 8\n" + "Data columns (total 13 columns):\n" + " # Column Non-Null Count Dtype\n" + "--- ------------- ---------------- ------------------------------\n" + " 0 bool_col 8 non-null boolean\n" + " 1 bytes_col 6 non-null binary[pyarrow]\n" + " 2 date_col 7 non-null date32[day][pyarrow]\n" + " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" + " 4 geography_col 4 non-null geometry\n" + " 5 int64_col 8 non-null Int64\n" + " 6 int64_too 9 non-null Int64\n" + " 7 numeric_col 6 non-null decimal128(38, 9)[pyarrow]\n" + " 8 float64_col 7 non-null Float64\n" + " 9 rowindex_2 9 non-null Int64\n" + " 10 string_col 8 non-null string\n" + " 11 time_col 6 non-null time64[us][pyarrow]\n" + " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" + "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "memory usage: 1269 bytes\n" + ) + + scalars_df, _ = scalars_dfs + bf_result = io.StringIO() + + scalars_df.info(buf=bf_result) + + assert expected == bf_result.getvalue() + + +@pytest.mark.parametrize( + ("include", "exclude"), + [ + ("Int64", None), + (["int"], None), + ("number", None), + ([pd.Int64Dtype(), pd.BooleanDtype()], None), + (None, [pd.Int64Dtype(), pd.BooleanDtype()]), + 
("Int64", ["boolean"]), + ], +) +def test_select_dtypes(scalars_dfs, include, exclude): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.select_dtypes(include=include, exclude=exclude) + bf_result = scalars_df.select_dtypes(include=include, exclude=exclude).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(index=[4, 1, 2]) + bf_result = scalars_df.drop(index=[4, 1, 2]).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_pandas_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + drop_index = scalars_pandas_df.iloc[[4, 1, 2]].index + + pd_result = scalars_pandas_df.drop(index=drop_index) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_bigframes_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + drop_index = scalars_df.loc[[4, 1, 2]].index + drop_pandas_index = scalars_pandas_df.loc[[4, 1, 2]].index + + pd_result = scalars_pandas_df.drop(index=drop_pandas_index) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_bigframes_index_with_na(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df = scalars_df.set_index("bytes_col") + scalars_pandas_df = scalars_pandas_df.set_index("bytes_col") + drop_index = scalars_df.iloc[[3, 5]].index + drop_pandas_index = scalars_pandas_df.iloc[[3, 5]].index + + pd_result = scalars_pandas_df.drop(index=drop_pandas_index) # drop_pandas_index) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_bigframes_multiindex(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + sub_df = scalars_df.iloc[[4, 1, 2]] + sub_pandas_df = scalars_pandas_df.iloc[[4, 1, 2]] + sub_df = sub_df.set_index(["bytes_col", "numeric_col"]) + sub_pandas_df = sub_pandas_df.set_index(["bytes_col", "numeric_col"]) + drop_index = sub_df.index + drop_pandas_index = sub_pandas_df.index + + scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) + scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + pd_result = scalars_pandas_df.drop(index=drop_pandas_index) + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_labels_axis_0(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(labels=[4, 1, 2], axis=0) + bf_result = scalars_df.drop(labels=[4, 1, 2], axis=0).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_index_and_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(index=[4, 1, 2], columns="int64_col") + bf_result = scalars_df.drop(index=[4, 1, 2], columns="int64_col").to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_rename(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"bool_col": 1.2345} + 
df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns + ) + + +def test_df_peek(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + peek_result = scalars_df.peek(n=3, force=False, allow_large_results=True) + + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_with_large_results_not_allowed(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + peek_result = scalars_df.peek(n=3, force=False, allow_large_results=False) + + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_filtered(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[scalars_df.int64_col != 0].peek(n=3, force=False) + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_force_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 + + +def test_df_peek_reset_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = ( + scalars_df[["int64_col", "int64_too"]].reset_index(drop=True).peek(n=3) + ) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 + + +def test_repr_w_all_rows(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + # Remove columns with flaky formatting, like NUMERIC columns (which use the + # object dtype). Also makes a copy so that mutating the index name doesn't + # break other tests. + scalars_df = scalars_df.drop(columns=["numeric_col"]) + scalars_pandas_df = scalars_pandas_df.drop(columns=["numeric_col"]) + + # When there are 10 or fewer rows, the outputs should be identical. + actual = repr(scalars_df.head(10)) + + with display_options.pandas_repr(bigframes.options.display): + expected = repr(scalars_pandas_df.head(10)) + + assert actual == expected + + +def test_join_repr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + scalars_df = ( + scalars_df[["int64_col"]] + .join(scalars_df.set_index("int64_col")[["int64_too"]]) + .sort_index() + ) + scalars_pandas_df = ( + scalars_pandas_df[["int64_col"]] + .join(scalars_pandas_df.set_index("int64_col")[["int64_too"]]) + .sort_index() + ) + # Pandas join result index name seems to depend on the index values in a way that bigframes can't match exactly + scalars_pandas_df.index.name = None + + actual = repr(scalars_df) + + with display_options.pandas_repr(bigframes.options.display): + expected = repr(scalars_pandas_df) + + assert actual == expected + + +def test_repr_html_w_all_rows(scalars_dfs, session): + scalars_df, _ = scalars_dfs + # get a pandas df of the expected format + df, _ = scalars_df._block.to_pandas() + pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) + pandas_df.index.name = scalars_df.index.name + + # When there are 10 or fewer rows, the outputs should be identical except for the extra note. 
+ actual = scalars_df.head(10)._repr_html_() + + with display_options.pandas_repr(bigframes.options.display): + pandas_repr = pandas_df.head(10)._repr_html_() + + expected = ( + pandas_repr + + f"[{len(pandas_df.index)} rows x {len(pandas_df.columns)} columns in total]" + ) + assert actual == expected + + +def test_df_column_name_with_space(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"bool_col": "bool col"} + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns + ) + + +def test_df_column_name_duplicate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"int64_too": "int64_col"} + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns + ) + + +def test_get_df_column_name_duplicate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"int64_too": "int64_col"} + + bf_result = scalars_df.rename(columns=col_name_dict)["int64_col"].to_pandas() + pd_result = scalars_pandas_df.rename(columns=col_name_dict)["int64_col"] + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + + +@pytest.mark.parametrize( + ("indices", "axis"), + [ + ([1, 3, 5], 0), + ([2, 4, 6], 1), + ([1, -3, -5, -6], "index"), + ([-2, -4, -6], "columns"), + ], +) +def test_take_df(scalars_dfs, indices, axis): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.take(indices, axis=axis).to_pandas() + pd_result = scalars_pandas_df.take(indices, axis=axis) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_filter_df(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_bool_series = scalars_df["bool_col"] + bf_result = scalars_df[bf_bool_series].to_pandas() + + pd_bool_series = scalars_pandas_df["bool_col"] + pd_result = scalars_pandas_df[pd_bool_series] + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_new_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"new_col": 2} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_new_column_w_loc(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[:, "new_col"] = 2 + pd_df.loc[:, "new_col"] = 2 + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("scalar",), + [ + (2.1,), + (None,), + ], +) +def test_assign_new_column_w_setitem(scalars_dfs, scalar): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = scalar + pd_df["new_col"] = scalar + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `float64` to match BigQuery DataFrames dtypes. 
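+    # (BigQuery DataFrames uses pandas' nullable "Float64" dtype for float
+    # columns, which plain pandas does not infer here, hence the cast below
+    # before comparing the two frames.)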
+ pd_result["new_col"] = pd_result["new_col"].astype("Float64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_dataframe(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["int64_col"] = bf_df["int64_too"].to_frame() + pd_df["int64_col"] = pd_df["int64_too"].to_frame() + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_df["int64_col"] = pd_df["int64_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df) + + +def test_assign_new_column_w_setitem_dataframe_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(ValueError): + bf_df["impossible_col"] = bf_df[["int64_too", "string_col"]] + with pytest.raises(ValueError): + pd_df["impossible_col"] = pd_df[["int64_too", "string_col"]] + + +def test_assign_new_column_w_setitem_list(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_repeated(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + pd_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result["new_col_2"] = pd_result["new_col_2"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + # set the custom index + pd_df = pd_df.set_index(["string_col", "int64_col"]) + bf_df = bf_df.set_index(["string_col", "int64_col"]) + + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(ValueError): + pd_df["new_col"] = [1, 2, 3] # should be len 9, is 3 + with pytest.raises(ValueError): + bf_df["new_col"] = [1, 2, 3] + + +def test_assign_existing_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"int64_col": 2} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
+ pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_listlike_to_empty_df(session): + empty_df = dataframe.DataFrame(session=session) + empty_pandas_df = pd.DataFrame() + + bf_result = empty_df.assign(new_col=[1, 2, 3]) + pd_result = empty_pandas_df.assign(new_col=[1, 2, 3]) + + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result.index = pd_result.index.astype("Int64") + assert_pandas_df_equal(bf_result.to_pandas(), pd_result) + + +def test_assign_to_empty_df_multiindex_error(session): + empty_df = dataframe.DataFrame(session=session) + empty_pandas_df = pd.DataFrame() + + empty_df["empty_col_1"] = typing.cast(series.Series, []) + empty_df["empty_col_2"] = typing.cast(series.Series, []) + empty_pandas_df["empty_col_1"] = [] + empty_pandas_df["empty_col_2"] = [] + empty_df = empty_df.set_index(["empty_col_1", "empty_col_2"]) + empty_pandas_df = empty_pandas_df.set_index(["empty_col_1", "empty_col_2"]) + + with pytest.raises(ValueError): + empty_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + with pytest.raises(ValueError): + empty_pandas_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_assign_series(scalars_dfs, ordered): + scalars_df, scalars_pandas_df = scalars_dfs + column_name = "int64_col" + df = scalars_df.assign(new_col=scalars_df[column_name]) + bf_result = df.to_pandas(ordered=ordered) + pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +def test_assign_series_overwrite(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + column_name = "int64_col" + df = scalars_df.assign(**{column_name: scalars_df[column_name] + 3}) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign( + **{column_name: scalars_pandas_df[column_name] + 3} + ) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_sequential(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"int64_col": 2, "new_col": 3, "new_col2": 4} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result["new_col2"] = pd_result["new_col2"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +# Require an index so that the self-join is consistent each time. 
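+# A rough sketch (plain pandas, hypothetical values) of the label alignment the
+# test below relies on: assigning a Series whose index differs from the frame's
+# joins on index labels, so unmatched labels surface as NA.
+#
+#   base = pd.DataFrame({"a": [10, 20]}, index=[0, 1])
+#   other = pd.Series([1, 2], index=[1, 2])
+#   base.assign(new_col=other)
+#   #     a  new_col
+#   # 0  10      NaN
+#   # 1  20      1.0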
+def test_assign_same_table_different_index_performs_self_join( + scalars_df_index, scalars_pandas_df_index +): + column_name = "int64_col" + bf_df = scalars_df_index.assign( + alternative_index=scalars_df_index["rowindex_2"] + 2 + ) + pd_df = scalars_pandas_df_index.assign( + alternative_index=scalars_pandas_df_index["rowindex_2"] + 2 + ) + bf_df_2 = bf_df.set_index("alternative_index") + pd_df_2 = pd_df.set_index("alternative_index") + bf_result = bf_df.assign(new_col=bf_df_2[column_name] * 10).to_pandas() + pd_result = pd_df.assign(new_col=pd_df_2[column_name] * 10) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +# Different table expression must have Index +def test_assign_different_df( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + column_name = "int64_col" + df = scalars_df_index.assign(new_col=scalars_df_2_index[column_name]) + bf_result = df.to_pandas() + # Doesn't matter to pandas if it comes from the same DF or a different DF. + pd_result = scalars_pandas_df_index.assign( + new_col=scalars_pandas_df_index[column_name] + ) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_different_df_w_loc( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + bf_df = scalars_df_index.copy() + bf_df2 = scalars_df_2_index.copy() + pd_df = scalars_pandas_df_index.copy() + assert "int64_col" in bf_df.columns + assert "int64_col" in pd_df.columns + bf_df.loc[:, "int64_col"] = bf_df2.loc[:, "int64_col"] + 1 + pd_df.loc[:, "int64_col"] = pd_df.loc[:, "int64_col"] + 1 + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_different_df_w_setitem( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + bf_df = scalars_df_index.copy() + bf_df2 = scalars_df_2_index.copy() + pd_df = scalars_pandas_df_index.copy() + assert "int64_col" in bf_df.columns + assert "int64_col" in pd_df.columns + bf_df["int64_col"] = bf_df2["int64_col"] + 1 + pd_df["int64_col"] = pd_df["int64_col"] + 1 + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_callable_lambda(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"new_col": lambda x: x["int64_col"] + x["int64_too"]} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
+ pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("axis", "how", "ignore_index", "subset"), + [ + (0, "any", False, None), + (0, "any", True, None), + (0, "all", False, ["bool_col", "time_col"]), + (0, "any", False, ["bool_col", "time_col"]), + (0, "all", False, "time_col"), + (1, "any", False, None), + (1, "all", False, None), + ], +) +def test_df_dropna(scalars_dfs, axis, how, ignore_index, subset): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.dropna( + axis=axis, how=how, ignore_index=ignore_index, subset=subset + ) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_dropna_range_columns(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df.columns = pandas.RangeIndex(0, len(scalars_df.columns)) + scalars_pandas_df.columns = pandas.RangeIndex(0, len(scalars_pandas_df.columns)) + + df = scalars_df.dropna() + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.dropna() + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_interpolate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "int64_too", "float64_col"] + bf_result = scalars_df[columns].interpolate().to_pandas() + # Pandas can only interpolate on "float64" columns + # https://github.com/pandas-dev/pandas/issues/40252 + pd_result = scalars_pandas_df[columns].astype("float64").interpolate() + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "col, fill_value", + [ + (["int64_col", "float64_col"], 3), + (["string_col"], "A"), + (["datetime_col"], pd.Timestamp("2023-01-01")), + ], +) +def test_df_fillna(scalars_dfs, col, fill_value): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col].fillna(fill_value).to_pandas() + pd_result = scalars_pandas_df[col].fillna(fill_value) + + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_df_ffill(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas() + pd_result = scalars_pandas_df[["int64_col", "float64_col"]].ffill(limit=1) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_bfill(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[["int64_col", "float64_col"]].bfill().to_pandas() + pd_result = scalars_pandas_df[["int64_col", "float64_col"]].bfill() + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_series_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + + def foo(series, arg1, arg2, *, kwarg1=0, kwarg2=0): + return series**2 + (arg1 * arg2 % 4) + (kwarg1 * kwarg2 % 7) + + bf_result = ( + scalars_df_index[columns] + .apply(foo, args=(33, 
61), kwarg1=52, kwarg2=21) + .to_pandas() + ) + + pd_result = scalars_pandas_df_index[columns].apply( + foo, args=(33, 61), kwarg1=52, kwarg2=21 + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_listlike_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + bf_result = ( + scalars_df_index[columns].apply(lambda x: [len(x), x.min(), 24]).to_pandas() + ) + + pd_result = scalars_pandas_df_index[columns].apply(lambda x: [len(x), x.min(), 24]) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result.index = pd_result.index.astype("Int64") + pd_result = pd_result.astype("Int64") + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_scalar_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + bf_result = scalars_df_index[columns].apply(lambda x: x.sum()) + + pd_result = scalars_pandas_df_index[columns].apply(lambda x: x.sum()) + + pandas.testing.assert_series_equal(bf_result, pd_result) + + +def test_df_pipe( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + + def foo(x: int, y: int, df): + return (df + x) % y + + bf_result = ( + scalars_df_index[columns] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + .to_pandas() + ) + + pd_result = ( + scalars_pandas_df_index[columns] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_keys( + scalars_df_index, + scalars_pandas_df_index, +): + pandas.testing.assert_index_equal( + scalars_df_index.keys(), scalars_pandas_df_index.keys() + ) + + +def test_df_iter( + scalars_df_index, + scalars_pandas_df_index, +): + for bf_i, df_i in zip(scalars_df_index, scalars_pandas_df_index): + assert bf_i == df_i + + +def test_iterrows( + scalars_df_index, + scalars_pandas_df_index, +): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df_index = scalars_df_index.add_suffix("_suffix", axis=1) + scalars_pandas_df_index = scalars_pandas_df_index.add_suffix("_suffix", axis=1) + for (bf_index, bf_series), (pd_index, pd_series) in zip( + scalars_df_index.iterrows(), scalars_pandas_df_index.iterrows() + ): + assert bf_index == pd_index + pandas.testing.assert_series_equal(bf_series, pd_series) + + +@pytest.mark.parametrize( + ( + "index", + "name", + ), + [ + ( + True, + "my_df", + ), + (False, None), + ], +) +def test_itertuples(scalars_df_index, index, name): + # Numeric has slightly different representation as a result of conversions. + bf_tuples = scalars_df_index.itertuples(index, name) + pd_tuples = scalars_df_index.to_pandas().itertuples(index, name) + for bf_tuple, pd_tuple in zip(bf_tuples, pd_tuples): + assert bf_tuple == pd_tuple + + +def test_df_cross_merge(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. 
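+    # (A cross join pairs every left row with every right row regardless of key
+    # values, so for this test the offset only changes the rowindex_2 values;
+    # it matters for the keyed merges below that reuse the same setup.)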
+ right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + bf_result = left.merge(right, "cross").to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + "cross", + ) + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_df_merge(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + on = "rowindex_2" + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. + right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + df = left.merge(right, merge_how, on, sort=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + merge_how, + on, + sort=True, + ) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("left_on", "right_on"), + [ + (["int64_col", "rowindex_2"], ["int64_col", "rowindex_2"]), + (["rowindex_2", "int64_col"], ["int64_col", "rowindex_2"]), + # Polars engine is currently strict on join key types + # (["rowindex_2", "float64_col"], ["int64_col", "rowindex_2"]), + ], +) +def test_df_merge_multi_key(scalars_dfs, left_on, right_on): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. 
+ right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + df = left.merge(right, "outer", left_on=left_on, right_on=right_on, sort=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + "outer", + left_on=left_on, + right_on=right_on, + sort=True, + ) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_custom_col_name(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col"] + right_columns = ["int64_col", "bool_col", "string_col"] + on = "int64_col" + rename_columns = {"float64_col": "f64_col"} + + left = scalars_df[left_columns] + left = left.rename(columns=rename_columns) + right = scalars_df[right_columns] + df = left.merge(right, merge_how, on, sort=True) + bf_result = df.to_pandas() + + pandas_left_df = scalars_pandas_df[left_columns] + pandas_left_df = pandas_left_df.rename(columns=rename_columns) + pandas_right_df = scalars_pandas_df[right_columns] + pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_left_on_right_on(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "int64_too"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + right = scalars_df[right_columns] + + df = left.merge( + right, merge_how, left_on="int64_too", right_on="rowindex_2", sort=True + ) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns], + merge_how, + left_on="int64_too", + right_on="rowindex_2", + sort=True, + ) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +def test_shape(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.shape + pd_result = scalars_pandas_df.shape + + assert bf_result == pd_result + + +def test_len(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = len(scalars_df) + pd_result = len(scalars_pandas_df) + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("n_rows",), + [ + (50,), + (10000,), + ], +) +def test_df_len_local(session, n_rows): + assert ( + len( + session.read_pandas( + pd.DataFrame(np.random.randint(1, 7, n_rows), columns=["one"]), + ) + ) + == n_rows + ) + + +def test_size(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.size + pd_result = scalars_pandas_df.size + + assert bf_result == pd_result + + +def test_ndim(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.ndim + pd_result = scalars_pandas_df.ndim + + assert bf_result == pd_result + + +def test_empty_false(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.empty + pd_result = scalars_pandas_df.empty + + assert bf_result == pd_result + + +def test_empty_true_column_filter(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = 
scalars_df[[]].empty + pd_result = scalars_pandas_df[[]].empty + + assert bf_result == pd_result + + +def test_empty_true_row_filter(scalars_dfs: Tuple[dataframe.DataFrame, pd.DataFrame]): + scalars_df, scalars_pandas_df = scalars_dfs + bf_bool: series.Series = typing.cast(series.Series, scalars_df["bool_col"]) + pd_bool: pd.Series = scalars_pandas_df["bool_col"] + bf_false = bf_bool.notna() & (bf_bool != bf_bool) + pd_false = pd_bool.notna() & (pd_bool != pd_bool) + + bf_result = scalars_df[bf_false].empty + pd_result = scalars_pandas_df[pd_false].empty + + assert pd_result + assert bf_result == pd_result + + +def test_empty_true_memtable(session: bigframes.Session): + bf_df = dataframe.DataFrame(session=session) + pd_df = pd.DataFrame() + + bf_result = bf_df.empty + pd_result = pd_df.empty + + assert pd_result + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("drop",), + ((True,), (False,)), +) +def test_reset_index(scalars_df_index, scalars_pandas_df_index, drop): + df = scalars_df_index.reset_index(drop=drop) + assert df.index.name is None + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.reset_index(drop=drop) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_then_filter( + scalars_df_index, + scalars_pandas_df_index, +): + bf_filter = scalars_df_index["bool_col"].fillna(True) + bf_df = scalars_df_index.reset_index()[bf_filter] + bf_result = bf_df.to_pandas() + pd_filter = scalars_pandas_df_index["bool_col"].fillna(True) + pd_result = scalars_pandas_df_index.reset_index()[pd_filter] + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering and index keys + # post-filter will have gaps. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_with_unnamed_index( + scalars_df_index, + scalars_pandas_df_index, +): + scalars_df_index = scalars_df_index.copy() + scalars_pandas_df_index = scalars_pandas_df_index.copy() + + scalars_df_index.index.name = None + scalars_pandas_df_index.index.name = None + df = scalars_df_index.reset_index(drop=False) + assert df.index.name is None + + # reset_index(drop=False) creates a new column "index". + assert df.columns[0] == "index" + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.reset_index(drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. 
+ pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_with_unnamed_multiindex( + scalars_df_index, + scalars_pandas_df_index, +): + bf_df = dataframe.DataFrame( + ([1, 2, 3], [2, 5, 7]), + index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), + ) + pd_df = pd.DataFrame( + ([1, 2, 3], [2, 5, 7]), + index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), + ) + + bf_df = bf_df.reset_index() + pd_df = pd_df.reset_index() + + assert pd_df.columns[0] == "level_0" + assert bf_df.columns[0] == "level_0" + assert pd_df.columns[1] == "level_1" + assert bf_df.columns[1] == "level_1" + + +def test_reset_index_with_unnamed_index_and_index_column( + scalars_df_index, + scalars_pandas_df_index, +): + scalars_df_index = scalars_df_index.copy() + scalars_pandas_df_index = scalars_pandas_df_index.copy() + + scalars_df_index.index.name = None + scalars_pandas_df_index.index.name = None + df = scalars_df_index.assign(index=scalars_df_index["int64_col"]).reset_index( + drop=False + ) + assert df.index.name is None + + # reset_index(drop=False) creates a new column "level_0" if the "index" column already exists. + assert df.columns[0] == "level_0" + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.assign( + index=scalars_pandas_df_index["int64_col"] + ).reset_index(drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("drop",), + ( + (True,), + (False,), + ), +) +@pytest.mark.parametrize( + ("append",), + ( + (True,), + (False,), + ), +) +@pytest.mark.parametrize( + ("index_column",), + (("int64_too",), ("string_col",), ("timestamp_col",)), +) +def test_set_index(scalars_dfs, index_column, drop, append): + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.set_index(index_column, append=append, drop=drop) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.set_index(index_column, append=append, drop=drop) + + # Sort to disambiguate when there are duplicate index labels. + # Note: Doesn't use assert_pandas_df_equal_ignore_ordering because we get + # "ValueError: 'timestamp_col' is both an index level and a column label, + # which is ambiguous" when trying to sort by a column with the same name as + # the index. 
+ bf_result = bf_result.sort_values("rowindex_2") + pd_result = pd_result.sort_values("rowindex_2") + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_set_index_key_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + with pytest.raises(KeyError): + scalars_pandas_df.set_index(["not_a_col"]) + with pytest.raises(KeyError): + scalars_df.set_index(["not_a_col"]) + + +@pytest.mark.parametrize( + ("ascending",), + ((True,), (False,)), +) +@pytest.mark.parametrize( + ("na_position",), + (("first",), ("last",)), +) +def test_sort_index(scalars_dfs, ascending, na_position): + index_column = "int64_col" + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.set_index(index_column) + bf_result = df.sort_index(ascending=ascending, na_position=na_position).to_pandas() + pd_result = scalars_pandas_df.set_index(index_column).sort_index( + ascending=ascending, na_position=na_position + ) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_dataframe_sort_index_inplace(scalars_dfs): + index_column = "int64_col" + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.copy().set_index(index_column) + df.sort_index(ascending=False, inplace=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df.set_index(index_column).sort_index(ascending=False) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_abs(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "int64_too", "float64_col"] + + bf_result = scalars_df[columns].abs() + pd_result = scalars_pandas_df[columns].abs() + + assert_dfs_equivalent(pd_result, bf_result) + + +def test_df_pos(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (+scalars_df[["int64_col", "numeric_col"]]).to_pandas() + pd_result = +scalars_pandas_df[["int64_col", "numeric_col"]] + + assert_pandas_df_equal(pd_result, bf_result) + + +def test_df_neg(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = (-scalars_df[["int64_col", "numeric_col"]]).to_pandas() + pd_result = -scalars_pandas_df[["int64_col", "numeric_col"]] + + assert_pandas_df_equal(pd_result, bf_result) + + +def test_df_invert(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "bool_col"] + + bf_result = (~scalars_df[columns]).to_pandas() + pd_result = ~scalars_pandas_df[columns] + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_df_isnull(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + columns = ["int64_col", "int64_too", "string_col", "bool_col"] + bf_result = scalars_df[columns].isnull().to_pandas() + pd_result = scalars_pandas_df[columns].isnull() + + # One of dtype mismatches to be documented. Here, the `bf_result.dtype` is + # `BooleanDtype` but the `pd_result.dtype` is `bool`. + pd_result["int64_col"] = pd_result["int64_col"].astype(pd.BooleanDtype()) + pd_result["int64_too"] = pd_result["int64_too"].astype(pd.BooleanDtype()) + pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) + pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_df_notnull(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + columns = ["int64_col", "int64_too", "string_col", "bool_col"] + bf_result = scalars_df[columns].notnull().to_pandas() + pd_result = scalars_pandas_df[columns].notnull() + + # One of dtype mismatches to be documented. 
Here, the `bf_result.dtype` is + # `BooleanDtype` but the `pd_result.dtype` is `bool`. + pd_result["int64_col"] = pd_result["int64_col"].astype(pd.BooleanDtype()) + pd_result["int64_too"] = pd_result["int64_too"].astype(pd.BooleanDtype()) + pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) + pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("left_labels", "right_labels", "overwrite", "fill_value"), + [ + (["a", "b", "c"], ["c", "a", "b"], True, None), + (["a", "b", "c"], ["c", "a", "b"], False, None), + (["a", "b", "c"], ["a", "b", "c"], False, 2), + ], + ids=[ + "one_one_match_overwrite", + "one_one_match_no_overwrite", + "exact_match", + ], +) +def test_combine( + scalars_df_index, + scalars_df_2_index, + scalars_pandas_df_index, + left_labels, + right_labels, + overwrite, + fill_value, +): + if pd.__version__.startswith("1."): + pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") + columns = ["int64_too", "int64_col", "float64_col"] + + bf_df_a = scalars_df_index[columns] + bf_df_a.columns = left_labels + bf_df_b = scalars_df_2_index[columns] + bf_df_b.columns = right_labels + bf_result = bf_df_a.combine( + bf_df_b, + lambda x, y: x**2 + 2 * x * y + y**2, + overwrite=overwrite, + fill_value=fill_value, + ).to_pandas() + + pd_df_a = scalars_pandas_df_index[columns] + pd_df_a.columns = left_labels + pd_df_b = scalars_pandas_df_index[columns] + pd_df_b.columns = right_labels + pd_result = pd_df_a.combine( + pd_df_b, + lambda x, y: x**2 + 2 * x * y + y**2, + overwrite=overwrite, + fill_value=fill_value, + ) + + # Some dtype inconsistency for all-NULL columns + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("overwrite", "filter_func"), + [ + (True, None), + (False, None), + (True, lambda x: x.isna() | (x % 2 == 0)), + ], + ids=[ + "default", + "overwritefalse", + "customfilter", + ], +) +def test_df_update(overwrite, filter_func): + if pd.__version__.startswith("1."): + pytest.skip("dtype handled differently in pandas 1.x.") + + index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") + + index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") + pd_df1 = pandas.DataFrame( + {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 + ) + pd_df2 = pandas.DataFrame( + {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]}, + dtype="Int64", + index=index2, + ) + + bf_df1 = dataframe.DataFrame(pd_df1) + bf_df2 = dataframe.DataFrame(pd_df2) + + bf_df1.update(bf_df2, overwrite=overwrite, filter_func=filter_func) + pd_df1.update(pd_df2, overwrite=overwrite, filter_func=filter_func) + + pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1) + + +def test_df_idxmin(): + pd_df = pd.DataFrame( + {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"] + ) + bf_df = dataframe.DataFrame(pd_df) + + bf_result = bf_df.idxmin().to_pandas() + pd_result = pd_df.idxmin() + + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +def test_df_idxmax(): + pd_df = pd.DataFrame( + {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"] + ) + bf_df = dataframe.DataFrame(pd_df) + + bf_result = bf_df.idxmax().to_pandas() + pd_result = pd_df.idxmax() + + pd.testing.assert_series_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + ("join", "axis"), + 
[ + ("outer", None), + ("outer", 0), + ("outer", 1), + ("left", 0), + ("right", 1), + ("inner", None), + ("inner", 1), + ], +) +def test_df_align(join, axis): + + index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") + + index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") + pd_df1 = pandas.DataFrame( + {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 + ) + pd_df2 = pandas.DataFrame( + {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]}, + dtype="Int64", + index=index2, + ) + + bf_df1 = dataframe.DataFrame(pd_df1) + bf_df2 = dataframe.DataFrame(pd_df2) + + bf_result1, bf_result2 = bf_df1.align(bf_df2, join=join, axis=axis) + pd_result1, pd_result2 = pd_df1.align(pd_df2, join=join, axis=axis) + + # Don't check dtype as pandas does unnecessary float conversion + assert isinstance(bf_result1, dataframe.DataFrame) and isinstance( + bf_result2, dataframe.DataFrame + ) + pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False) + pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False) + + +def test_combine_first( + scalars_df_index, + scalars_df_2_index, + scalars_pandas_df_index, +): + if pd.__version__.startswith("1."): + pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") + columns = ["int64_too", "int64_col", "float64_col"] + + bf_df_a = scalars_df_index[columns].iloc[0:6] + bf_df_a.columns = ["a", "b", "c"] + bf_df_b = scalars_df_2_index[columns].iloc[2:8] + bf_df_b.columns = ["b", "a", "d"] + bf_result = bf_df_a.combine_first(bf_df_b).to_pandas() + + pd_df_a = scalars_pandas_df_index[columns].iloc[0:6] + pd_df_a.columns = ["a", "b", "c"] + pd_df_b = scalars_pandas_df_index[columns].iloc[2:8] + pd_df_b.columns = ["b", "a", "d"] + pd_result = pd_df_a.combine_first(pd_df_b) + + # Some dtype inconsistency for all-NULL columns + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_df_corr_w_invalid_parameters(scalars_dfs): + columns = ["int64_too", "int64_col", "float64_col"] + scalars_df, _ = scalars_dfs + + with pytest.raises(NotImplementedError): + scalars_df[columns].corr(method="kendall") + + with pytest.raises(NotImplementedError): + scalars_df[columns].corr(min_periods=1) + + +@pytest.mark.parametrize( + ("columns", "numeric_only"), + [ + (["bool_col", "int64_col", "float64_col"], True), + (["bool_col", "int64_col", "float64_col"], False), + (["bool_col", "int64_col", "float64_col", "string_col"], True), + pytest.param( + ["bool_col", "int64_col", "float64_col", "string_col"], + False, + marks=pytest.mark.xfail( + raises=NotImplementedError, + ), + ), + ], +) +def test_cov_w_numeric_only(scalars_dfs, columns, numeric_only): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].cov(numeric_only=numeric_only).to_pandas() + pd_result = scalars_pandas_df[columns].cov(numeric_only=numeric_only) + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + # Only check row order in ordered mode. 
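+    # (check_like=True makes assert_frame_equal ignore the order of rows and
+    # columns and compare by label instead.)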
+ pd.testing.assert_frame_equal( + bf_result, + pd_result, + check_dtype=False, + check_index_type=False, + check_like=~scalars_df._block.session._strictly_ordered, + ) + + +def test_df_corrwith_df(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + l_cols = ["int64_col", "float64_col", "int64_too"] + r_cols = ["int64_too", "float64_col"] + + bf_result = scalars_df[l_cols].corrwith(scalars_df[r_cols]).to_pandas() + pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_cols]) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_df_corrwith_df_numeric_only(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + l_cols = ["int64_col", "float64_col", "int64_too", "string_col"] + r_cols = ["int64_too", "float64_col", "bool_col"] + + bf_result = ( + scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=True).to_pandas() + ) + pd_result = scalars_pandas_df[l_cols].corrwith( + scalars_pandas_df[r_cols], numeric_only=True + ) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_df_corrwith_df_non_numeric_error(scalars_dfs): + scalars_df, _ = scalars_dfs + + l_cols = ["int64_col", "float64_col", "int64_too", "string_col"] + r_cols = ["int64_too", "float64_col", "bool_col"] + + with pytest.raises(NotImplementedError): + scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=False) + + +def test_df_corrwith_series(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + l_cols = ["int64_col", "float64_col", "int64_too"] + r_col = "float64_col" + + bf_result = scalars_df[l_cols].corrwith(scalars_df[r_col]).to_pandas() + pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_col]) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. 
+ pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("op"), + [ + operator.add, + operator.sub, + operator.mul, + operator.truediv, + # operator.floordiv, + operator.eq, + operator.ne, + operator.gt, + operator.ge, + operator.lt, + operator.le, + ], + ids=[ + "add", + "subtract", + "multiply", + "true_divide", + # "floor_divide", + "eq", + "ne", + "gt", + "ge", + "lt", + "le", + ], +) +# TODO(garrettwu): deal with NA values +@pytest.mark.parametrize(("other_scalar"), [1, 2.5, 0, 0.0]) +@pytest.mark.parametrize(("reverse_operands"), [True, False]) +def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "float64_col"] + + maybe_reversed_op = (lambda x, y: op(y, x)) if reverse_operands else op + + bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas() + pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar) + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize(("other_scalar"), [1, -2]) +def test_mod(scalars_dfs, other_scalar): + # Zero case excluded as pandas produces 0 result for Int64 inputs rather than NA/NaN. + # This is likely a pandas bug as mod 0 is undefined in other dtypes, and most programming languages. + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas() + pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_scalar_binop_str_exception(scalars_dfs): + scalars_df, _ = scalars_dfs + columns = ["string_col"] + with pytest.raises(TypeError, match="Cannot add dtypes"): + (scalars_df[columns] + 1).to_pandas() + + +@pytest.mark.parametrize( + ("op"), + [ + (lambda x, y: x.add(y, axis="index")), + (lambda x, y: x.radd(y, axis="index")), + (lambda x, y: x.sub(y, axis="index")), + (lambda x, y: x.rsub(y, axis="index")), + (lambda x, y: x.mul(y, axis="index")), + (lambda x, y: x.rmul(y, axis="index")), + (lambda x, y: x.truediv(y, axis="index")), + (lambda x, y: x.rtruediv(y, axis="index")), + # (lambda x, y: x.floordiv(y, axis="index")), + # (lambda x, y: x.floordiv(y, axis="index")), + (lambda x, y: x.gt(y, axis="index")), + (lambda x, y: x.ge(y, axis="index")), + (lambda x, y: x.lt(y, axis="index")), + (lambda x, y: x.le(y, axis="index")), + ], + ids=[ + "add", + "radd", + "sub", + "rsub", + "mul", + "rmul", + "truediv", + "rtruediv", + # "floordiv", + # "rfloordiv", + "gt", + "ge", + "lt", + "le", + ], +) +def test_series_binop_axis_index( + scalars_dfs, + op, +): + scalars_df, scalars_pandas_df = scalars_dfs + df_columns = ["int64_col", "float64_col"] + series_column = "int64_too" + + bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas() + pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column]) + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("input"), + [ + ((1000, 2000, 3000)), + (pd.Index([1000, 2000, 3000])), + (pd.Series((1000, 2000), index=["int64_too", "float64_col"])), + ], + ids=[ + "tuple", + "pd_index", + "pd_series", + ], +) +def test_listlike_binop_axis_1_in_memory_data(scalars_dfs, input): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + df_columns = ["int64_col", "float64_col", 
"int64_too"] + + bf_result = scalars_df[df_columns].add(input, axis=1).to_pandas() + if hasattr(input, "to_pandas"): + input = input.to_pandas() + pd_result = scalars_pandas_df[df_columns].add(input, axis=1) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +def test_df_reverse_binop_pandas(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + pd_series = pd.Series([100, 200, 300]) + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_result = pd_series + scalars_df[df_columns].to_pandas() + pd_result = pd_series + scalars_pandas_df[df_columns] + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +def test_listlike_binop_axis_1_bf_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_result = ( + scalars_df[df_columns] + .add(bf_indexes.Index([1000, 2000, 3000]), axis=1) + .to_pandas() + ) + pd_result = scalars_pandas_df[df_columns].add(pd.Index([1000, 2000, 3000]), axis=1) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +def test_binop_with_self_aggregate(session, scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + df_columns = ["int64_col", "float64_col", "int64_too"] + + bf_df = scalars_df[df_columns] + bf_result = (bf_df - bf_df.mean()).to_pandas() + + pd_df = scalars_pandas_df[df_columns] + pd_result = pd_df - pd_df.mean() + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("left_labels", "right_labels"), + [ + (["a", "a", "b"], ["c", "c", "d"]), + (["a", "b", "c"], ["c", "a", "b"]), + (["a", "c", "c"], ["c", "a", "c"]), + (["a", "b", "c"], ["a", "b", "c"]), + ], + ids=[ + "no_overlap", + "one_one_match", + "multi_match", + "exact_match", + ], +) +def test_binop_df_df_binary_op( + scalars_df_index, + scalars_df_2_index, + scalars_pandas_df_index, + left_labels, + right_labels, +): + if pd.__version__.startswith("1."): + pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") + columns = ["int64_too", "int64_col", "float64_col"] + + bf_df_a = scalars_df_index[columns] + bf_df_a.columns = left_labels + bf_df_b = scalars_df_2_index[columns] + bf_df_b.columns = right_labels + bf_result = (bf_df_a - bf_df_b).to_pandas() + + pd_df_a = scalars_pandas_df_index[columns] + pd_df_a.columns = left_labels + pd_df_b = scalars_pandas_df_index[columns] + pd_df_b.columns = right_labels + pd_result = pd_df_a - pd_df_b + + # Some dtype inconsistency for all-NULL columns + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +# Differnt table will only work for explicit index, since default index orders are arbitrary. 
+@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_series_binop_add_different_table( + scalars_df_index, scalars_pandas_df_index, scalars_df_2_index, ordered +): + df_columns = ["int64_col", "float64_col"] + series_column = "int64_too" + + bf_result = ( + scalars_df_index[df_columns] + .add(scalars_df_2_index[series_column], axis="index") + .to_pandas(ordered=ordered) + ) + pd_result = scalars_pandas_df_index[df_columns].add( + scalars_pandas_df_index[series_column], axis="index" + ) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +# TODO(garrettwu): Test series binop with different index + +all_joins = pytest.mark.parametrize( + ("how",), + (("outer",), ("left",), ("right",), ("inner",), ("cross",)), +) + + +@all_joins +def test_join_same_table(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + if not bf_df._session._strictly_ordered and how == "cross": + pytest.skip("Cross join not supported in partial ordering mode.") + + bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] + bf_df_a = bf_df_a.sort_index() + + bf_df_b = bf_df.set_index("int64_too")[["float64_col"]] + bf_df_b = bf_df_b[bf_df_b.float64_col > 0] + bf_df_b = bf_df_b.sort_values("float64_col") + + bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() + + pd_df_a = pd_df.set_index("int64_too")[["string_col", "int64_col"]].sort_index() + pd_df_a = pd_df_a.sort_index() + + pd_df_b = pd_df.set_index("int64_too")[["float64_col"]] + pd_df_b = pd_df_b[pd_df_b.float64_col > 0] + pd_df_b = pd_df_b.sort_values("float64_col") + + pd_result = pd_df_a.join(pd_df_b, how=how) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@all_joins +def test_join_different_table( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index, how +): + bf_df_a = scalars_df_index[["string_col", "int64_col"]] + bf_df_b = scalars_df_2_index.dropna()[["float64_col"]] + bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() + pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]] + pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +def test_join_duplicate_columns_raises_not_implemented(scalars_dfs): + scalars_df, _ = scalars_dfs + df_a = scalars_df[["string_col", "float64_col"]] + df_b = scalars_df[["float64_col"]] + with pytest.raises(NotImplementedError): + df_a.join(df_b, how="outer").to_pandas() + + +@all_joins +def test_join_param_on(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + + bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] + bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) + bf_df_b = bf_df[["float64_col"]] + + if how == "cross": + with pytest.raises(ValueError): + bf_df_a.join(bf_df_b, on="rowindex_2", how=how) + else: + bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() + + pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] + pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) + pd_df_b = pd_df[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@all_joins +def test_df_join_series(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + + bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] + bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) + bf_series_b = bf_df["float64_col"] + + if how == "cross": + with pytest.raises(ValueError): + 
bf_df_a.join(bf_series_b, on="rowindex_2", how=how) + else: + bf_result = bf_df_a.join(bf_series_b, on="rowindex_2", how=how).to_pandas() + + pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] + pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) + pd_series_b = pd_df["float64_col"] + pd_result = pd_df_a.join(pd_series_b, on="rowindex_2", how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@pytest.mark.parametrize( + ("by", "ascending", "na_position"), + [ + ("int64_col", True, "first"), + (["bool_col", "int64_col"], True, "last"), + ("int64_col", False, "first"), + (["bool_col", "int64_col"], [False, True], "last"), + (["bool_col", "int64_col"], [True, False], "first"), + ], +) +def test_dataframe_sort_values( + scalars_df_index, scalars_pandas_df_index, by, ascending, na_position +): + # Test needs values to be unique + bf_result = scalars_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ).to_pandas() + pd_result = scalars_pandas_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ) + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("by", "ascending", "na_position"), + [ + ("int64_col", True, "first"), + (["bool_col", "int64_col"], True, "last"), + ], +) +def test_dataframe_sort_values_inplace( + scalars_df_index, scalars_pandas_df_index, by, ascending, na_position +): + # Test needs values to be unique + bf_sorted = scalars_df_index.copy() + bf_sorted.sort_values( + by, ascending=ascending, na_position=na_position, inplace=True + ) + bf_result = bf_sorted.to_pandas() + pd_result = scalars_pandas_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ) + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_dataframe_sort_values_invalid_input(scalars_df_index): + with pytest.raises(KeyError): + scalars_df_index.sort_values(by=scalars_df_index["int64_col"]) + + +def test_dataframe_sort_values_stable(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index.sort_values("int64_col", kind="stable") + .sort_values("bool_col", kind="stable") + .to_pandas() + ) + pd_result = scalars_pandas_df_index.sort_values( + "int64_col", kind="stable" + ).sort_values("bool_col", kind="stable") + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("operator", "columns"), + [ + pytest.param(lambda x: x.cumsum(), ["float64_col", "int64_too"]), + # pytest.param(lambda x: x.cumprod(), ["float64_col", "int64_too"]), + pytest.param( + lambda x: x.cumprod(), + ["string_col"], + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], + ids=[ + "cumsum", + # "cumprod", + "non-numeric", + ], +) +def test_dataframe_numeric_analytic_op( + scalars_df_index, scalars_pandas_df_index, operator, columns +): + # TODO: Add nullable ints (pandas 1.x has poor behavior on these) + bf_series = operator(scalars_df_index[columns]) + pd_series = operator(scalars_pandas_df_index[columns]) + bf_result = bf_series.to_pandas() + pd.testing.assert_frame_equal(pd_series, bf_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("operator"), + [ + (lambda x: x.cummin()), + (lambda x: x.cummax()), + (lambda x: x.shift(2)), + (lambda x: x.shift(-2)), + ], + ids=[ + "cummin", + "cummax", + "shiftpostive", + "shiftnegative", + ], +) +def test_dataframe_general_analytic_op( + scalars_df_index, scalars_pandas_df_index, operator +): + col_names = ["int64_too", "float64_col", 
"int64_col", "bool_col"] + bf_series = operator(scalars_df_index[col_names]) + pd_series = operator(scalars_pandas_df_index[col_names]) + bf_result = bf_series.to_pandas() + pd.testing.assert_frame_equal( + pd_series, + bf_result, + ) + + +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods): + col_names = ["int64_too", "float64_col", "int64_col"] + bf_result = scalars_df_index[col_names].diff(periods=periods).to_pandas() + pd_result = scalars_pandas_df_index[col_names].diff(periods=periods) + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + ("periods",), + [ + (1,), + (2,), + (-1,), + ], +) +def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods): + col_names = ["int64_too", "float64_col", "int64_col"] + bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas() + pd_result = scalars_pandas_df_index[col_names].pct_change(periods=periods) + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_dataframe_agg_single_string(scalars_dfs): + numeric_cols = ["int64_col", "int64_too", "float64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df[numeric_cols].agg("sum").to_pandas() + pd_result = scalars_pandas_df[numeric_cols].agg("sum") + + assert bf_result.dtype == "Float64" + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("agg",), + ( + ("sum",), + ("size",), + ), +) +def test_dataframe_agg_int_single_string(scalars_dfs, agg): + numeric_cols = ["int64_col", "int64_too", "bool_col"] + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df[numeric_cols].agg(agg).to_pandas() + pd_result = scalars_pandas_df[numeric_cols].agg(agg) + + assert bf_result.dtype == "Int64" + pd.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_dataframe_agg_multi_string(scalars_dfs): + numeric_cols = ["int64_col", "int64_too", "float64_col"] + aggregations = [ + "sum", + "mean", + "median", + "std", + "var", + "min", + "max", + "nunique", + "count", + ] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[numeric_cols].agg(aggregations) + pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + # Drop median, as it's an approximation. + bf_median = bf_result.loc["median", :] + bf_result = bf_result.drop(labels=["median"]) + pd_result = pd_result.drop(labels=["median"]) + + assert_dfs_equivalent(pd_result, bf_result, check_index_type=False) + + # Double-check that median is at least plausible. 
+ assert ( + (bf_result.loc["min", :] <= bf_median) & (bf_median <= bf_result.loc["max", :]) + ).all() + + +def test_dataframe_agg_int_multi_string(scalars_dfs): + numeric_cols = ["int64_col", "int64_too", "bool_col"] + aggregations = [ + "sum", + "nunique", + "count", + "size", + ] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[numeric_cols].agg(aggregations).to_pandas() + pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) + + for dtype in bf_result.dtypes: + assert dtype == "Int64" + + # Pandas may produce narrower numeric types + # Pandas has object index type + pd.testing.assert_frame_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_transpose(): + # Include some floats to ensure type coercion + values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]] + # Test complex case of both axes being multi-indices with non-unique elements + + columns: pandas.Index = pd.Index( + ["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow") + ) + columns_multi = pd.MultiIndex.from_arrays([columns, columns], names=["c1", "c2"]) + + index: pandas.Index = pd.Index( + ["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow") + ) + rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) + + pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) + bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi) + + pd_result = pd_df.T + bf_result = bf_df.T.to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + +def test_df_transpose_error(): + with pytest.raises(TypeError, match="Cannot coerce.*to a common type."): + dataframe.DataFrame([[1, "hello"], [2, "world"]]).transpose() + + +def test_df_transpose_repeated_uses_cache(): + bf_df = dataframe.DataFrame([[1, 2.5], [2, 3.5]]) + pd_df = pandas.DataFrame([[1, 2.5], [2, 3.5]]) + # Transposing many times so that operation will fail from complexity if not using cache + for i in range(10): + # Cache still works even with simple scalar binop + bf_df = bf_df.transpose() + i + pd_df = pd_df.transpose() + i + + pd.testing.assert_frame_equal( + pd_df, bf_df.to_pandas(), check_dtype=False, check_index_type=False + ) + + +def test_df_stack(scalars_dfs): + if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): + pytest.skip("pandas <2.1 uses different stack implementation") + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].stack().to_pandas() + pd_result = scalars_pandas_df[columns].stack(future_stack=True) + + # Pandas produces NaN, where bq dataframes produces pd.NA + assert_series_equal(bf_result, pd_result, check_dtype=False) + + +def test_df_melt_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].melt().to_pandas() + pd_result = scalars_pandas_df[columns].melt() + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, + 
pd_result, + check_index_type=False, + check_dtype=False, + ) + + +def test_df_melt_parameterized(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + + bf_result = scalars_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ).to_pandas() + pd_result = scalars_pandas_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ) + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_unstack(scalars_dfs, ordered): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = [ + "rowindex_2", + "int64_col", + "int64_too", + ] + + # unstack on mono-index produces series + bf_result = scalars_df[columns].unstack().to_pandas(ordered=ordered) + pd_result = scalars_pandas_df[columns].unstack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + assert_series_equal( + bf_result, pd_result, check_dtype=False, ignore_order=not ordered + ) + + +def test_ipython_key_completions_with_drop(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = "string_col" + bf_dataframe = scalars_df.drop(columns=col_names) + pd_dataframe = scalars_pandas_df.drop(columns=col_names) + expected = pd_dataframe.columns.tolist() + + results = bf_dataframe._ipython_key_completions_() + + assert col_names not in results + assert results == expected + # _ipython_key_completions_ is called with square brackets + # so only column names are relevant with tab completion + assert "to_gbq" not in results + assert "merge" not in results + assert "drop" not in results + + +def test_ipython_key_completions_with_rename(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"string_col": "a_renamed_column"} + bf_dataframe = scalars_df.rename(columns=col_name_dict) + pd_dataframe = scalars_pandas_df.rename(columns=col_name_dict) + expected = pd_dataframe.columns.tolist() + + results = bf_dataframe._ipython_key_completions_() + + assert "string_col" not in results + assert "a_renamed_column" in results + assert results == expected + # _ipython_key_completions_ is called with square brackets + # so only column names are relevant with tab completion + assert "to_gbq" not in results + assert "merge" not in results + assert "drop" not in results + + +def test__dir__with_drop(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = "string_col" + bf_dataframe = scalars_df.drop(columns=col_names) + pd_dataframe = scalars_pandas_df.drop(columns=col_names) + expected = pd_dataframe.columns.tolist() + + results = dir(bf_dataframe) + + assert col_names not in results + assert frozenset(expected) <= frozenset(results) + # __dir__ is called with a '.' and displays all methods, columns names, etc. 
+ assert "to_gbq" in results + assert "merge" in results + assert "drop" in results + + +def test__dir__with_rename(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"string_col": "a_renamed_column"} + bf_dataframe = scalars_df.rename(columns=col_name_dict) + pd_dataframe = scalars_pandas_df.rename(columns=col_name_dict) + expected = pd_dataframe.columns.tolist() + + results = dir(bf_dataframe) + + assert "string_col" not in results + assert "a_renamed_column" in results + assert frozenset(expected) <= frozenset(results) + # __dir__ is called with a '.' and displays all methods, columns names, etc. + assert "to_gbq" in results + assert "merge" in results + assert "drop" in results + + +@pytest.mark.parametrize( + ("start", "stop", "step"), + [ + (0, 0, None), + (None, None, None), + (1, None, None), + (None, 4, None), + (None, None, 2), + (None, 50000000000, 1), + (5, 4, None), + (3, None, 2), + (1, 7, 2), + (1, 7, 50000000000), + ], +) +def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): + bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() + pd_result = scalars_pandas_df_index.iloc[start:stop:step] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_iloc_slice_zero_step(scalars_df_index): + with pytest.raises(ValueError): + scalars_df_index.iloc[0:0:0] + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index, ordered): + bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas(ordered=ordered) + pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:] + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +@pytest.mark.parametrize( + "index", + [0, 5, -2, (2,)], +) +def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + "index", + [(2, 5), (5, 0), (0, 0)], +) +def test_iloc_tuple(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + "index", + [(slice(None), [1, 2, 3]), (slice(1, 7, 2), [2, 5, 3])], +) +def test_iloc_tuple_multi_columns(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index].to_pandas() + pd_result = scalars_pandas_df_index.iloc[index] + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_iloc_tuple_multi_columns_single_row(scalars_df_index, scalars_pandas_df_index): + index = (2, [2, 1, 3, -4]) + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("index", "error"), + [ + ((1, 1, 1), pd.errors.IndexingError), + (("asd", "asd", "asd"), pd.errors.IndexingError), + (("asd"), TypeError), + ], +) +def test_iloc_tuple_errors(scalars_df_index, scalars_pandas_df_index, index, error): + with pytest.raises(error): + scalars_df_index.iloc[index] + with pytest.raises(error): + scalars_pandas_df_index.iloc[index] + + +@pytest.mark.parametrize( + "index", + [(2, 5), (5, 0), (0, 0)], +) +def test_iat(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iat[index] + pd_result = 
scalars_pandas_df_index.iat[index] + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("index", "error"), + [ + (0, TypeError), + ("asd", ValueError), + ((1, 2, 3), TypeError), + (("asd", "asd"), ValueError), + ], +) +def test_iat_errors(scalars_df_index, scalars_pandas_df_index, index, error): + with pytest.raises(error): + scalars_pandas_df_index.iat[index] + with pytest.raises(error): + scalars_df_index.iat[index] + + +def test_iloc_single_integer_out_of_bound_error( + scalars_df_index, scalars_pandas_df_index +): + with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): + scalars_df_index.iloc[99] + + +def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[scalars_df_index.bool_col].to_pandas() + pd_result = scalars_pandas_df_index.loc[scalars_pandas_df_index.bool_col] + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[:, "int64_col"].to_pandas() + pd_result = scalars_pandas_df_index.loc[:, "int64_col"] + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[:, scalars_df_index.dtypes == "Int64"].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + :, scalars_pandas_df_index.dtypes == "Int64" + ] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_select_with_column_condition_bf_series( + scalars_df_index, scalars_pandas_df_index +): + # (b/347072677) GEOGRAPH type doesn't support DISTINCT op + columns = [ + item for item in scalars_pandas_df_index.columns if item != "geography_col" + ] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + size_half = len(scalars_pandas_df_index) / 2 + bf_result = scalars_df_index.loc[ + :, scalars_df_index.nunique() > size_half + ].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + :, scalars_pandas_df_index.nunique() > size_half + ] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + index = "Hello, World!" + bf_result = scalars_df_index.loc[index] + pd_result = scalars_pandas_df_index.loc[index] + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index.loc[index] + pd_result = scalars_pandas_df_index.loc[index] + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_at_with_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + index = "Hello, World!" 
+ bf_result = scalars_df_index.at[index, "int64_too"] + pd_result = scalars_pandas_df_index.at[index, "int64_too"] + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_at_no_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index.at[index, "string_col"] + pd_result = scalars_pandas_df_index.at[index, "string_col"] + assert bf_result == pd_result + + +def test_loc_setitem_bool_series_scalar_new_col(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[bf_df["int64_too"] == 0, "new_col"] = 99 + pd_df.loc[pd_df["int64_too"] == 0, "new_col"] = 99 + + # pandas uses float64 instead + pd_df["new_col"] = pd_df["new_col"].astype("Float64") + + pd.testing.assert_frame_equal( + bf_df.to_pandas(), + pd_df, + ) + + +@pytest.mark.parametrize( + ("col", "value"), + [ + ("string_col", "hello"), + ("int64_col", 3), + ("float64_col", 3.5), + ], +) +def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs, col, value): + if pd.__version__.startswith("1."): + pytest.skip("this loc overload not supported in pandas 1.x.") + + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[bf_df["int64_too"] == 1, col] = value + pd_df.loc[pd_df["int64_too"] == 1, col] = value + + pd.testing.assert_frame_equal( + bf_df.to_pandas(), + pd_df, + ) + + +def test_loc_setitem_bool_series_scalar_error(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("this loc overload not supported in pandas 1.x.") + + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(Exception): + bf_df.loc[bf_df["int64_too"] == 1, "string_col"] = 99 + with pytest.raises(Exception): + pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99 + + +@pytest.mark.parametrize( + ("col", "op"), + [ + # Int aggregates + pytest.param("int64_col", lambda x: x.sum(), id="int-sum"), + pytest.param("int64_col", lambda x: x.min(), id="int-min"), + pytest.param("int64_col", lambda x: x.max(), id="int-max"), + pytest.param("int64_col", lambda x: x.count(), id="int-count"), + pytest.param("int64_col", lambda x: x.nunique(), id="int-nunique"), + # Float aggregates + pytest.param("float64_col", lambda x: x.count(), id="float-count"), + pytest.param("float64_col", lambda x: x.nunique(), id="float-nunique"), + # Bool aggregates + pytest.param("bool_col", lambda x: x.sum(), id="bool-sum"), + pytest.param("bool_col", lambda x: x.count(), id="bool-count"), + pytest.param("bool_col", lambda x: x.nunique(), id="bool-nunique"), + # String aggregates + pytest.param("string_col", lambda x: x.count(), id="string-count"), + pytest.param("string_col", lambda x: x.nunique(), id="string-nunique"), + ], +) +def test_dataframe_aggregate_int(scalars_df_index, scalars_pandas_df_index, col, op): + bf_result = op(scalars_df_index[[col]]).to_pandas() + pd_result = op(scalars_pandas_df_index[[col]]) + + # Check dtype separately + assert bf_result.dtype == "Int64" + # Is otherwise "object" dtype + pd_result.index = pd_result.index.astype("string[pyarrow]") + # Pandas may produce narrower numeric types + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + + +@pytest.mark.parametrize( + 
("col", "op"), + [ + pytest.param("bool_col", lambda x: x.min(), id="bool-min"), + pytest.param("bool_col", lambda x: x.max(), id="bool-max"), + ], +) +def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col, op): + bf_result = op(scalars_df_index[[col]]).to_pandas() + pd_result = op(scalars_pandas_df_index[[col]]) + + # Check dtype separately + assert bf_result.dtype == "boolean" + + # Pandas may produce narrower numeric types + # Pandas has object index type + pd_result.index = pd_result.index.astype("string[pyarrow]") + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + + +@pytest.mark.parametrize( + ("op", "bf_dtype"), + [ + (lambda x: x.sum(numeric_only=True), "Float64"), + (lambda x: x.mean(numeric_only=True), "Float64"), + (lambda x: x.min(numeric_only=True), "Float64"), + (lambda x: x.max(numeric_only=True), "Float64"), + (lambda x: x.std(numeric_only=True), "Float64"), + (lambda x: x.var(numeric_only=True), "Float64"), + (lambda x: x.count(numeric_only=False), "Int64"), + (lambda x: x.nunique(), "Int64"), + ], + ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"], +) +def test_dataframe_aggregates(scalars_dfs, op, bf_dtype): + scalars_df_index, scalars_pandas_df_index = scalars_dfs + col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"] + bf_series = op(scalars_df_index[col_names]) + bf_result = bf_series + pd_result = op(scalars_pandas_df_index[col_names]) + + # Check dtype separately + assert bf_result.dtype == bf_dtype + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + # Pandas has object index type + pd_result.index = pd_result.index.astype("string[pyarrow]") + assert_series_equivalent( + pd_result, + bf_result, + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("op"), + [ + (lambda x: x.sum(axis=1, numeric_only=True)), + (lambda x: x.mean(axis=1, numeric_only=True)), + (lambda x: x.min(axis=1, numeric_only=True)), + (lambda x: x.max(axis=1, numeric_only=True)), + (lambda x: x.std(axis=1, numeric_only=True)), + (lambda x: x.var(axis=1, numeric_only=True)), + ], + ids=["sum", "mean", "min", "max", "std", "var"], +) +def test_dataframe_aggregates_axis_1(scalars_df_index, scalars_pandas_df_index, op): + col_names = ["int64_too", "int64_col", "float64_col", "bool_col", "string_col"] + bf_result = op(scalars_df_index[col_names]).to_pandas() + pd_result = op(scalars_pandas_df_index[col_names]) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + # Pandas has object index type + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("op"), + [ + (lambda x: x.all(bool_only=True)), + (lambda x: x.any(bool_only=True)), + (lambda x: x.all(axis=1, bool_only=True)), + (lambda x: x.any(axis=1, bool_only=True)), + ], + ids=["all_axis0", "any_axis0", "all_axis1", "any_axis1"], +) +def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op): + # Pandas will drop nullable 'boolean' dtype so we convert first to bool, then cast back later + scalars_df_index = scalars_df_index.assign( + bool_col=scalars_df_index.bool_col.fillna(False) + ) + scalars_pandas_df_index = scalars_pandas_df_index.assign( + bool_col=scalars_pandas_df_index.bool_col.fillna(False).astype("bool") + ) + bf_series = op(scalars_df_index) + pd_series = op(scalars_pandas_df_index).astype("boolean") + 
bf_result = bf_series.to_pandas() + + pd_series.index = pd_series.index.astype(bf_result.index.dtype) + pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + + +def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col"] + bf_series = scalars_df_index[col_names].prod() + pd_series = scalars_pandas_df_index[col_names].prod() + bf_result = bf_series.to_pandas() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_series = pd_series.astype("Float64") + # Pandas has object index type + pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + + +def test_df_skew_too_few_values(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].head(2).skew().to_pandas() + pd_result = scalars_pandas_df[columns].head(2).skew() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_skew(scalars_dfs, ordered): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].skew().to_pandas(ordered=ordered) + pd_result = scalars_pandas_df[columns].skew() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + assert_series_equal( + pd_result, bf_result, check_index_type=False, ignore_order=not ordered + ) + + +def test_df_kurt_too_few_values(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].head(2).kurt().to_pandas() + pd_result = scalars_pandas_df[columns].head(2).kurt() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +def test_df_kurt(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].kurt().to_pandas() + pd_result = scalars_pandas_df[columns].kurt() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +def test_sample_raises_value_error(scalars_dfs): + scalars_df, _ = scalars_dfs + with pytest.raises( + ValueError, match="Only one of 'n' or 'frac' parameter can be specified." 
+ ): + scalars_df.sample(frac=0.5, n=4) + + +@pytest.mark.parametrize( + ("axis",), + [ + (None,), + (0,), + (1,), + ], +) +def test_df_add_prefix(scalars_df_index, scalars_pandas_df_index, axis): + if pd.__version__.startswith("1."): + pytest.skip("add_prefix axis parameter not supported in pandas 1.x.") + bf_result = scalars_df_index.add_prefix("prefix_", axis).to_pandas() + + pd_result = scalars_pandas_df_index.add_prefix("prefix_", axis) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("axis",), + [ + (0,), + (1,), + ], +) +def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis): + if pd.__version__.startswith("1."): + pytest.skip("add_prefix axis parameter not supported in pandas 1.x.") + bf_result = scalars_df_index.add_suffix("_suffix", axis).to_pandas() + + pd_result = scalars_pandas_df_index.add_suffix("_suffix", axis) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + check_index_type=False, + ) + + +def test_df_astype_error_error(session): + input = pd.DataFrame(["hello", "world", "3.11", "4000"]) + with pytest.raises(ValueError): + session.read_pandas(input).astype("Float64", errors="bad_value") + + +def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index): + if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."): + pytest.skip("pandas filter items behavior different pre-2.1") + bf_result = scalars_df_index.filter(items=["string_col", "int64_col"]).to_pandas() + + pd_result = scalars_pandas_df_index.filter(items=["string_col", "int64_col"]) + # Ignore column ordering as pandas order differently depending on version + pd.testing.assert_frame_equal( + bf_result.sort_index(axis=1), + pd_result.sort_index(axis=1), + ) + + +def test_df_columns_filter_like(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.filter(like="64_col").to_pandas() + + pd_result = scalars_pandas_df_index.filter(like="64_col") + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_columns_filter_regex(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.filter(regex="^[^_]+$").to_pandas() + + pd_result = scalars_pandas_df_index.filter(regex="^[^_]+$") + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_rows_list(scalars_dfs): + scalars_df_index, scalars_pandas_df_index = scalars_dfs + bf_result = scalars_df_index.reindex(index=[5, 1, 3, 99, 1]) + + pd_result = scalars_pandas_df_index.reindex(index=[5, 1, 3, 99, 1]) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + assert_dfs_equivalent( + pd_result, + bf_result, + ) + + +def test_df_reindex_rows_index(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.reindex( + index=pd.Index([5, 1, 3, 99, 1], name="newname") + ).to_pandas() + + pd_result = scalars_pandas_df_index.reindex( + index=pd.Index([5, 1, 3, 99, 1], name="newname") + ) + + # Pandas uses int64 instead of Int64 (nullable) dtype. 
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_nonunique(scalars_df_index): + with pytest.raises(ValueError): + # int64_too is non-unique + scalars_df_index.set_index("int64_too").reindex( + index=[5, 1, 3, 99, 1], validate=True + ) + + +def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"] + ).to_pandas() + + pd_result = scalars_pandas_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"] + ) + + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_columns_with_same_order(scalars_df_index, scalars_pandas_df_index): + # First, make sure the two dataframes have the same columns in order. + columns = ["int64_col", "int64_too"] + bf = scalars_df_index[columns] + pd_df = scalars_pandas_df_index[columns] + + bf_result = bf.reindex(columns=columns).to_pandas() + pd_result = pd_df.reindex(columns=columns) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_equals_identical(scalars_df_index, scalars_pandas_df_index): + unsupported = [ + "geography_col", + ] + scalars_df_index = scalars_df_index.drop(columns=unsupported) + scalars_pandas_df_index = scalars_pandas_df_index.drop(columns=unsupported) + + bf_result = scalars_df_index.equals(scalars_df_index) + pd_result = scalars_pandas_df_index.equals(scalars_pandas_df_index) + + assert pd_result == bf_result + + +def test_df_equals_series(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index[["int64_col"]].equals(scalars_df_index["int64_col"]) + pd_result = scalars_pandas_df_index[["int64_col"]].equals( + scalars_pandas_df_index["int64_col"] + ) + + assert pd_result == bf_result + + +def test_df_equals_different_dtype(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_col", "int64_too"] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + bf_modified = scalars_df_index.copy() + bf_modified = bf_modified.astype("Float64") + + pd_modified = scalars_pandas_df_index.copy() + pd_modified = pd_modified.astype("Float64") + + bf_result = scalars_df_index.equals(bf_modified) + pd_result = scalars_pandas_df_index.equals(pd_modified) + + assert pd_result == bf_result + + +def test_df_equals_different_values(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_col", "int64_too"] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + bf_modified = scalars_df_index.copy() + bf_modified["int64_col"] = bf_modified.int64_col + 1 + + pd_modified = scalars_pandas_df_index.copy() + pd_modified["int64_col"] = pd_modified.int64_col + 1 + + bf_result = scalars_df_index.equals(bf_modified) + pd_result = scalars_pandas_df_index.equals(pd_modified) + + assert pd_result == bf_result + + +def test_df_equals_extra_column(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_col", "int64_too"] + more_columns = ["int64_col", "int64_too", "float64_col"] + + bf_result = scalars_df_index[columns].equals(scalars_df_index[more_columns]) + pd_result = scalars_pandas_df_index[columns].equals( + scalars_pandas_df_index[more_columns] + ) + + assert pd_result == bf_result + + +def 
test_df_reindex_like(scalars_df_index, scalars_pandas_df_index): + reindex_target_bf = scalars_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1] + ) + bf_result = scalars_df_index.reindex_like(reindex_target_bf).to_pandas() + + reindex_target_pd = scalars_pandas_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1] + ) + pd_result = scalars_pandas_df_index.reindex_like(reindex_target_pd) + + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_values(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.values + + pd_result = scalars_pandas_df_index.values + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + pd.testing.assert_frame_equal( + pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False + ) + + +def test_df_to_numpy(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.to_numpy() + + pd_result = scalars_pandas_df_index.to_numpy() + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + pd.testing.assert_frame_equal( + pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False + ) + + +def test_df___array__(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.__array__() + + pd_result = scalars_pandas_df_index.__array__() + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + pd.testing.assert_frame_equal( + pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False + ) + + +def test_df_getattr_attribute_error_when_pandas_has(scalars_df_index): + # swapaxes is implemented in pandas but not in bigframes + with pytest.raises(AttributeError): + scalars_df_index.swapaxes() + + +def test_df_getattr_attribute_error(scalars_df_index): + with pytest.raises(AttributeError): + scalars_df_index.not_a_method() + + +def test_df_getattr_axes(): + df = dataframe.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + assert isinstance(df.index, bigframes.core.indexes.Index) + assert isinstance(df.columns, pandas.Index) + assert isinstance(df.my_column, series.Series) + + +def test_df_setattr_index(): + pd_df = pandas.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + bf_df = dataframe.DataFrame(pd_df) + + pd_df.index = pandas.Index([4, 5]) + bf_df.index = [4, 5] + + assert_pandas_df_equal( + pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + +def test_df_setattr_columns(): + pd_df = pandas.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + bf_df = dataframe.DataFrame(pd_df) + + pd_df.columns = typing.cast(pandas.Index, pandas.Index([4, 5, 6])) + + bf_df.columns = pandas.Index([4, 5, 6]) + + assert_pandas_df_equal( + pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + +def test_df_setattr_modify_column(): + pd_df = pandas.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + bf_df = dataframe.DataFrame(pd_df) + pd_df.my_column = [4, 5] + bf_df.my_column = [4, 5] + + 
assert_pandas_df_equal( + pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + +def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): + index_list = scalars_pandas_df_index.string_col.iloc[[0, 1, 1, 5]].values + + scalars_df_index = scalars_df_index.set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") + + bf_result = scalars_df_index.loc[index_list].to_pandas() + pd_result = scalars_pandas_df_index.loc[index_list] + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): + index_list = [3, 2, 1, 3, 2, 1] + + bf_result = scalars_df_index.loc[index_list] + pd_result = scalars_pandas_df_index.loc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_list_multiindex(scalars_dfs): + scalars_df_index, scalars_pandas_df_index = scalars_dfs + scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) + scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( + ["string_col", "int64_col"] + ) + index_list = [("Hello, World!", -234892), ("Hello, World!", 123456789)] + + bf_result = scalars_df_multiindex.loc[index_list] + pd_result = scalars_pandas_df_multiindex.loc[index_list] + + assert_dfs_equivalent( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + "index_list", + [ + [0, 1, 2, 3, 4, 4], + [0, 0, 0, 5, 4, 7, -2, -5, 3], + [-1, -2, -3, -4, -5, -5], + ], +) +def test_iloc_list(scalars_df_index, scalars_pandas_df_index, index_list): + bf_result = scalars_df_index.iloc[index_list] + pd_result = scalars_pandas_df_index.iloc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_iloc_list_multiindex(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) + scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) + + index_list = [0, 0, 0, 5, 4, 7] + + bf_result = scalars_df.iloc[index_list] + pd_result = scalars_pandas_df.iloc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index): + + index_list: List[int] = [] + + bf_result = scalars_df_index.iloc[index_list] + pd_result = scalars_pandas_df_index.iloc[index_list] + + bf_result = bf_result.to_pandas() + assert bf_result.shape == pd_result.shape # types are known to be different + + +def test_rename_axis(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.rename_axis("newindexname") + pd_result = scalars_pandas_df_index.rename_axis("newindexname") + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_rename_axis_nonstring(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.rename_axis((4,)) + pd_result = scalars_pandas_df_index.rename_axis((4,)) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): + pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + + scalars_df_index = scalars_df_index.set_index("string_col") + scalars_pandas_df_index = 
scalars_pandas_df_index.set_index("string_col") + + bf_result = scalars_df_index.loc[bf_string_series] + pd_result = scalars_pandas_df_index.loc[pd_string_series] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): + pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + + scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) + scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( + ["string_col", "int64_col"] + ) + + bf_result = scalars_df_multiindex.loc[bf_string_series] + pd_result = scalars_pandas_df_multiindex.loc[pd_string_series] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): + pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index + bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index + + bf_result = scalars_df_index.loc[bf_index] + pd_result = scalars_pandas_df_index.loc[pd_index] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_index_integer_index_renamed_col( + scalars_df_index, scalars_pandas_df_index +): + scalars_df_index = scalars_df_index.rename(columns={"int64_col": "rename"}) + scalars_pandas_df_index = scalars_pandas_df_index.rename( + columns={"int64_col": "rename"} + ) + + pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index + bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index + + bf_result = scalars_df_index.loc[bf_index] + pd_result = scalars_pandas_df_index.loc[pd_index] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +@pytest.mark.parametrize( + ("subset"), + [ + None, + "bool_col", + ["bool_col", "int64_too"], + ], +) +@pytest.mark.parametrize( + ("keep",), + [ + (False,), + ], +) +def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, subset): + columns = ["bool_col", "int64_too", "int64_col"] + bf_df = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas() + pd_df = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep) + pd.testing.assert_frame_equal( + pd_df, + bf_df, + ) + + +@pytest.mark.parametrize( + ("subset"), + [ + None, + ["bool_col"], + ], +) +@pytest.mark.parametrize( + ("keep",), + [ + (False,), + ], +) +def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset): + columns = ["bool_col", "int64_too", "int64_col"] + bf_series = scalars_df_index[columns].duplicated(subset, keep=keep).to_pandas() + pd_series = scalars_pandas_df_index[columns].duplicated(subset, keep=keep) + pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False) + + +def test_df_from_dict_columns_orient(): + data = {"a": [1, 2], "b": [3.3, 2.4]} + bf_result = dataframe.DataFrame.from_dict(data, orient="columns").to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="columns") + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_from_dict_index_orient(): + data = {"a": [1, 2], "b": [3.3, 2.4]} + bf_result = dataframe.DataFrame.from_dict( + data, orient="index", columns=["col1", "col2"] + ).to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="index", columns=["col1", "col2"]) + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + 
) + + +def test_df_from_dict_tight_orient(): + data = { + "index": [("i1", "i2"), ("i3", "i4")], + "columns": ["col1", "col2"], + "data": [[1, 2.6], [3, 4.5]], + "index_names": ["in1", "in2"], + "column_names": ["column_axis"], + } + + bf_result = dataframe.DataFrame.from_dict(data, orient="tight").to_pandas() + pd_result = pd.DataFrame.from_dict(data, orient="tight") + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_from_records(): + records = ((1, "a"), (2.5, "b"), (3.3, "c"), (4.9, "d")) + + bf_result = dataframe.DataFrame.from_records( + records, columns=["c1", "c2"] + ).to_pandas() + pd_result = pd.DataFrame.from_records(records, columns=["c1", "c2"]) + assert_pandas_df_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_to_dict(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] # formatted differently + bf_result = scalars_df_index.drop(columns=unsupported).to_dict() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_dict() + + assert bf_result == pd_result + + +def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.to_json() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.to_json(default_handler=str) + + assert bf_result == pd_result + + +def test_df_to_json_local_file(scalars_df_index, scalars_pandas_df_index): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.to_json(bf_result_file, orient="table") + # default_handler for arrow types that have no default conversion + scalars_pandas_df_index.to_json( + pd_result_file, orient="table", default_handler=str + ) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_csv_local_str(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.to_csv() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.to_csv() + + assert bf_result == pd_result + + +def test_df_to_csv_local_file(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.to_csv(bf_result_file) + scalars_pandas_df_index.to_csv(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_parquet_local_bytes(scalars_df_index, scalars_pandas_df_index): + # GEOGRAPHY not supported in parquet export. + unsupported = ["geography_col"] + + bf_result = scalars_df_index.drop(columns=unsupported).to_parquet() + # default_handler for arrow types that have no default conversion + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_parquet() + + assert bf_result == pd_result + + +def test_df_to_parquet_local_file(scalars_df_index, scalars_pandas_df_index): + # GEOGRAPHY not supported in parquet export. 
+ unsupported = ["geography_col"] + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.drop(columns=unsupported).to_parquet(bf_result_file) + scalars_pandas_df_index.drop(columns=unsupported).to_parquet(pd_result_file) + + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_records(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] + bf_result = scalars_df_index.drop(columns=unsupported).to_records() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_records() + + for bfi, pdi in zip(bf_result, pd_result): + for bfj, pdj in zip(bfi, pdi): + assert pd.isna(bfj) and pd.isna(pdj) or bfj == pdj + + +def test_df_to_string(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] # formatted differently + + bf_result = scalars_df_index.drop(columns=unsupported).to_string() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_string() + + assert bf_result == pd_result + + +def test_df_to_html(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] # formatted differently + + bf_result = scalars_df_index.drop(columns=unsupported).to_html() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_html() + + assert bf_result == pd_result + + +def test_df_to_markdown(scalars_df_index, scalars_pandas_df_index): + # Nulls have bug from tabulate https://github.com/astanin/python-tabulate/issues/231 + bf_result = scalars_df_index.dropna().to_markdown() + pd_result = scalars_pandas_df_index.dropna().to_markdown() + + assert bf_result == pd_result + + +def test_df_to_pickle(scalars_df_index, scalars_pandas_df_index): + with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: + scalars_df_index.to_pickle(bf_result_file) + scalars_pandas_df_index.to_pickle(pd_result_file) + bf_result = bf_result_file.read() + pd_result = pd_result_file.read() + + assert bf_result == pd_result + + +def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): + unsupported = [ + "numeric_col", + "bytes_col", + "date_col", + "datetime_col", + "time_col", + "timestamp_col", + "geography_col", + ] + + bf_result_file = tempfile.TemporaryFile() + pd_result_file = tempfile.TemporaryFile() + scalars_df_index.drop(columns=unsupported).to_orc(bf_result_file) + scalars_pandas_df_index.drop(columns=unsupported).reset_index().to_orc( + pd_result_file + ) + bf_result = bf_result_file.read() + pd_result = bf_result_file.read() + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("expr",), + [ + ("new_col = int64_col + int64_too",), + ("new_col = (rowindex > 3) | bool_col",), + ("int64_too = bool_col\nnew_col2 = rowindex",), + ], +) +def test_df_eval(scalars_dfs, expr): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.eval(expr).to_pandas() + pd_result = scalars_pandas_df.eval(expr) + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("expr",), + [ + ("int64_col > int64_too",), + ("bool_col",), + ("((int64_col - int64_too) % @local_var) == 0",), + ], +) +def test_df_query(scalars_dfs, expr): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + # local_var is referenced in expressions + local_var = 3 # NOQA + 
scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.query(expr).to_pandas() + pd_result = scalars_pandas_df.query(expr) + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("subset", "normalize", "ascending", "dropna"), + [ + (None, False, False, False), + (None, True, True, True), + ("bool_col", True, False, True), + ], +) +def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = ( + scalars_df[["string_col", "bool_col"]] + .value_counts(subset, normalize=normalize, ascending=ascending, dropna=dropna) + .to_pandas() + ) + pd_result = scalars_pandas_df[["string_col", "bool_col"]].value_counts( + subset, normalize=normalize, ascending=ascending, dropna=dropna + ) + + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_df_bool_interpretation_error(scalars_df_index): + with pytest.raises(ValueError): + True if scalars_df_index else False + + +def test_assign_after_binop_row_joins(): + pd_df = pd.DataFrame( + { + "idx1": [1, 1, 1, 1, 2, 2, 2, 2], + "idx2": [10, 10, 20, 20, 10, 10, 20, 20], + "metric1": [10, 14, 2, 13, 6, 2, 9, 5], + "metric2": [25, -3, 8, 2, -1, 0, 0, -4], + }, + dtype=pd.Int64Dtype(), + ).set_index(["idx1", "idx2"]) + bf_df = dataframe.DataFrame(pd_df) + + # Expect implicit joiner to be used, preserving input cardinality rather than getting relational join + bf_df["metric_diff"] = bf_df.metric1 - bf_df.metric2 + pd_df["metric_diff"] = pd_df.metric1 - pd_df.metric2 + + assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + + +def test_df_dot_inline(session): + df1 = pd.DataFrame([[1, 2, 3], [2, 5, 7]]) + df2 = pd.DataFrame([[2, 4, 8], [1, 5, 10], [3, 6, 9]]) + + bf1 = session.read_pandas(df1) + bf2 = session.read_pandas(df2) + bf_result = bf1.dot(bf2).to_pandas() + pd_result = df1.dot(df2) + + # Patch pandas dtypes for testing parity + # Pandas uses int64 instead of Int64 (nullable) dtype. + for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_dot_series_inline(): + left = [[1, 2, 3], [2, 5, 7]] + right = [2, 1, 3] + + bf1 = dataframe.DataFrame(left) + bf2 = series.Series(right) + bf_result = bf1.dot(bf2).to_pandas() + + df1 = pd.DataFrame(left) + df2 = pd.Series(right) + pd_result = df1.dot(df2) + + # Patch pandas dtypes for testing parity + # Pandas result is int64 instead of Int64 (nullable) dtype. 
+ pd_result = pd_result.astype(pd.Int64Dtype()) + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("col_names", "ignore_index"), + [ + pytest.param(["A"], False, id="one_array_false"), + pytest.param(["A"], True, id="one_array_true"), + pytest.param(["B"], False, id="one_float_false"), + pytest.param(["B"], True, id="one_float_true"), + pytest.param(["A", "C"], False, id="two_arrays_false"), + pytest.param(["A", "C"], True, id="two_arrays_true"), + ], +) +def test_dataframe_explode(col_names, ignore_index, session): + data = { + "A": [[0, 1, 2], [], [3, 4]], + "B": 3, + "C": [["a", "b", "c"], np.nan, ["d", "e"]], + } + + df = bpd.DataFrame(data, session=session) + pd_df = df.to_pandas() + pd_result = pd_df.explode(col_names, ignore_index=ignore_index) + bf_result = df.explode(col_names, ignore_index=ignore_index) + + # Check that to_pandas() results in at most a single query execution + bf_materialized = bf_result.to_pandas() + + pd.testing.assert_frame_equal( + bf_materialized, + pd_result, + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + ("ignore_index", "ordered"), + [ + pytest.param(True, True, id="include_index_ordered"), + pytest.param(True, False, id="include_index_unordered"), + pytest.param(False, True, id="ignore_index_ordered"), + ], +) +def test_dataframe_explode_reserve_order(session, ignore_index, ordered): + data = { + "a": [np.random.randint(0, 10, 10) for _ in range(10)], + "b": [np.random.randint(0, 10, 10) for _ in range(10)], + } + df = bpd.DataFrame(data) + pd_df = pd.DataFrame(data) + + res = df.explode(["a", "b"], ignore_index=ignore_index).to_pandas(ordered=ordered) + pd_res = pd_df.explode(["a", "b"], ignore_index=ignore_index).astype( + pd.Int64Dtype() + ) + pd.testing.assert_frame_equal( + res if ordered else res.sort_index(), + pd_res, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("col_names"), + [ + pytest.param([], id="empty", marks=pytest.mark.xfail(raises=ValueError)), + pytest.param( + ["A", "A"], id="duplicate", marks=pytest.mark.xfail(raises=ValueError) + ), + pytest.param("unknown", id="unknown", marks=pytest.mark.xfail(raises=KeyError)), + ], +) +def test_dataframe_explode_xfail(col_names): + df = bpd.DataFrame({"A": [[0, 1, 2], [], [3, 4]]}) + df.explode(col_names) From a600b235529e3e42fd5e7e803f5718b2d1d3804d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 5 Jun 2025 17:30:14 -0500 Subject: [PATCH 09/18] test: avoid exact float comparison in `test_apply_lambda` (#1795) * test: avoid exact float comparison in `test_apply_lambda` * use by_row=False in apply_simple_udf too --- tests/system/small/test_series.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d0595afaa3..f7e013e2a4 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -4285,13 +4285,16 @@ def test_apply_lambda(scalars_dfs, col, lambda_): bf_result = bf_col.apply(lambda_, by_row=False).to_pandas() pd_col = scalars_pandas_df[col] - if pd.__version__.startswith("2.2"): + if pd.__version__[:3] in ("2.2", "2.3"): pd_result = pd_col.apply(lambda_, by_row=False) else: pd_result = pd_col.apply(lambda_) # ignore dtype check, which are Int64 and object respectively - assert_series_equal(bf_result, pd_result, check_dtype=False) + # Some columns implicitly 
+    # Some columns implicitly convert to floating point. Use check_exact=False to ensure we're "close enough"
+    assert_series_equal(
+        bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001
+    )
@@ -4375,13 +4378,16 @@ def foo(x):

     pd_col = scalars_pandas_df["int64_col"]

-    if pd.__version__.startswith("2.2"):
+    if pd.__version__[:3] in ("2.2", "2.3"):
         pd_result = pd_col.apply(foo, by_row=False)
     else:
         pd_result = pd_col.apply(foo)

     # ignore dtype check, which are Int64 and object respectively
-    assert_series_equal(bf_result, pd_result, check_dtype=False)
+    # Some columns implicitly convert to floating point. Use check_exact=False to ensure we're "close enough"
+    assert_series_equal(
+        bf_result, pd_result, check_dtype=False, check_exact=False, rtol=0.001
+    )

 @pytest.mark.parametrize(

From 86159a7d24102574c26764a056478757844e2eca Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Thu, 5 Jun 2025 20:53:52 -0700
Subject: [PATCH 10/18] feat: add blob.transcribe function (#1773)

* add transcribe function

* add verbose

* add some debugging message

* transcribe function is completed. test case is done

* move the place to capture col name

* remove a few features, update testcase

* change the testcase, add data

* introduce user-specified instructions

* tweak prompt

* rebase conftest

* change the way to read in input audio

* update variable names

* change variable names

* change the way input is passed in

* remove additional instruction for now

* change the column name

* add a name for result
---
 bigframes/operations/blob.py                | 76 +++++++++++++++++++-
 scripts/data/audio/audio_LJ001-0010.wav     | Bin 0 -> 388966 bytes
 scripts/data/pdfs/pdfs_sample-local-pdf.pdf | Bin 0 -> 18321 bytes
 scripts/data/pdfs/test-protected.pdf        | Bin 0 -> 12721 bytes
 tests/system/conftest.py                    | 14 ++++
 tests/system/large/blob/test_function.py    | 51 +++++++++++++
 6 files changed, 140 insertions(+), 1 deletion(-)
 create mode 100644 scripts/data/audio/audio_LJ001-0010.wav
 create mode 100644 scripts/data/pdfs/pdfs_sample-local-pdf.pdf
 create mode 100644 scripts/data/pdfs/test-protected.pdf

diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py
index 8da88d1ff8..e143cfc519 100644
--- a/bigframes/operations/blob.py
+++ b/bigframes/operations/blob.py
@@ -15,7 +15,7 @@
 from __future__ import annotations

 import os
-from typing import cast, Optional, Union
+from typing import cast, Literal, Optional, Union
 import warnings

 import IPython.display as ipy_display
@@ -736,3 +736,77 @@ def pdf_chunk(
             return struct_series
         else:
             return content_series
+
+    def audio_transcribe(
+        self,
+        *,
+        connection: Optional[str] = None,
+        model_name: Optional[
+            Literal[
+                "gemini-2.0-flash-001",
+                "gemini-2.0-flash-lite-001",
+            ]
+        ] = None,
+        verbose: bool = False,
+    ) -> bigframes.series.Series:
+        """
+        Transcribe audio content using a Gemini multimodal model.
+
+        Args:
+            connection (str or None, default None): BQ connection used for
+                function internet transactions. If None, uses default
+                connection of the session.
+            model_name (str): The model for natural language tasks. Accepted
+                values are "gemini-2.0-flash-lite-001" and "gemini-2.0-flash-001".
+                See "https://ai.google.dev/gemini-api/docs/models" for model choices.
+            verbose (bool, default False): controls the verbosity of the output.
+                When set to True, both error messages and the transcribed content
+                are displayed. Conversely, when set to False, only the transcribed
+                content is presented, suppressing error messages.
+
+        Returns:
+            bigframes.series.Series: str or struct[str, str],
+                depending on the "verbose" parameter.
+                Contains the transcribed text from the audio file.
+                Includes error messages if verbosity is enabled.
+        """
+        import bigframes.bigquery as bbq
+        import bigframes.ml.llm as llm
+        import bigframes.pandas as bpd
+
+        # col name doesn't matter here. Rename to avoid column name conflicts
+        audio_series = bigframes.series.Series(self._block)
+
+        prompt_text = "**Task:** Transcribe the provided audio. **Instructions:** - Your response must contain only the verbatim transcription of the audio. - Do not include any introductory text, summaries, or conversational filler in your response. The output should begin directly with the first word of the audio."
+
+        llm_model = llm.GeminiTextGenerator(
+            model_name=model_name,
+            session=self._block.session,
+            connection_name=connection,
+        )
+
+        # transcribe audio using ML.GENERATE_TEXT
+        transcribed_results = llm_model.predict(
+            X=audio_series,
+            prompt=[prompt_text, audio_series],
+            temperature=0.0,
+        )
+
+        transcribed_content_series = cast(
+            bpd.Series, transcribed_results["ml_generate_text_llm_result"]
+        ).rename("transcribed_content")
+
+        if verbose:
+            transcribed_status_series = cast(
+                bpd.Series, transcribed_results["ml_generate_text_status"]
+            )
+            results_df = bpd.DataFrame(
+                {
+                    "status": transcribed_status_series,
+                    "content": transcribed_content_series,
+                }
+            )
+            results_struct = bbq.struct(results_df).rename("transcription_results")
+            return results_struct
+        else:
+            return transcribed_content_series
diff --git a/scripts/data/audio/audio_LJ001-0010.wav b/scripts/data/audio/audio_LJ001-0010.wav
new file mode 100644
index 0000000000000000000000000000000000000000..01a2e68829a506063f8ed8b090a4516a02107a62
GIT binary patch
literal 388966
[base85-encoded binary payload of the new audio fixture omitted]
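A minimal usage sketch for the new accessor follows. It is illustrative only: the bucket path and connection name are placeholders, and it assumes the existing bigframes.pandas.from_glob_path helper for building a blob Series; none of these names are introduced by this patch.

    import bigframes.pandas as bpd

    # Build a blob Series over audio files in Cloud Storage (placeholder URI and connection).
    df = bpd.from_glob_path(
        "gs://your-bucket/audio/*.wav", name="audio", connection="us.your-connection"
    )

    # Transcribe each audio blob with a Gemini flash model; by default this returns a
    # string Series named "transcribed_content".
    transcripts = df["audio"].blob.audio_transcribe(model_name="gemini-2.0-flash-001")
    result = transcripts.to_pandas()

With verbose=True the method instead returns a struct Series named "transcription_results" carrying "status" and "content" fields, so callers can inspect per-row error messages alongside the transcribed text.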