From 77d2c8857f2cac710c93d8c4d9e297e71e1c70b4 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 3 Oct 2024 19:34:28 +0000 Subject: [PATCH] perf: repr generates fewer queries --- bigframes/core/blocks.py | 3 ++- bigframes/dataframe.py | 1 - tests/system/small/test_dataframe.py | 7 ++++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 9e245399cd..2b3734edd5 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1557,10 +1557,11 @@ def retrieve_repr_request_results( Returns a tuple of the dataframe and the overall number of rows of the query. """ + # head caches full underlying expression, so row_count will be free after head_result = self.session._executor.head(self.expr, max_results) count = self.session._executor.get_row_count(self.expr) - arrow = self.session._executor.execute(self.expr).to_arrow_table() + arrow = head_result.to_arrow_table() df = io_pandas.arrow_to_pandas(arrow, schema=self.expr.schema) self._copy_index_to_pandas(df) return df, count, head_result.query_job diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7fa584bcc0..efd0e65adb 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -690,7 +690,6 @@ def _repr_html_(self) -> str: if opts.repr_mode == "deferred": return formatter.repr_query_job(self._compute_dry_run()) - self._cached() # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the # ... for us? diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 8c2912edd4..cfd6efe9bd 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -591,15 +591,19 @@ def test_join_repr(scalars_dfs_maybe_ordered): assert actual == expected -def test_repr_html_w_all_rows(scalars_dfs): +def test_repr_html_w_all_rows(scalars_dfs, session): + metrics = session._metrics scalars_df, _ = scalars_dfs # get a pandas df of the expected format df, _ = scalars_df._block.to_pandas() pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) pandas_df.index.name = scalars_df.index.name + executions_pre = metrics.execution_count # When there are 10 or fewer rows, the outputs should be identical except for the extra note. actual = scalars_df.head(10)._repr_html_() + executions_post = metrics.execution_count + with display_options.pandas_repr(bigframes.options.display): pandas_repr = pandas_df.head(10)._repr_html_() @@ -608,6 +612,7 @@ def test_repr_html_w_all_rows(scalars_dfs): + f"[{len(pandas_df.index)} rows x {len(pandas_df.columns)} columns in total]" ) assert actual == expected + assert (executions_post - executions_pre) <= 2 def test_df_column_name_with_space(scalars_dfs):