From 912b8742963cf6f188cb15b17afbd87131704be3 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Fri, 16 Feb 2024 01:49:22 +0000 Subject: [PATCH 01/10] perf: inline read_pandas for small data --- bigframes/dataframe.py | 16 +--------------- bigframes/operations/base.py | 12 +----------- bigframes/session/__init__.py | 30 +++++++++++++++++++++++++++--- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 9db567a497..8f84979b21 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -69,10 +69,6 @@ import bigframes.session -# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. -# TODO(tbergeron): Convert to bytes-based limit -MAX_INLINE_DF_SIZE = 5000 - LevelType = typing.Hashable LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] SingleItemValue = Union[bigframes.series.Series, int, float, Callable] @@ -170,17 +166,7 @@ def __init__( columns=columns, # type:ignore dtype=dtype, # type:ignore ) - if ( - pd_dataframe.size < MAX_INLINE_DF_SIZE - # TODO(swast): Workaround data types limitation in inline data. - and not any( - dt.pyarrow_dtype - for dt in pd_dataframe.dtypes - if isinstance(dt, pandas.ArrowDtype) - ) - ): - self._block = blocks.Block.from_local(pd_dataframe) - elif session: + if session: self._block = session.read_pandas(pd_dataframe)._get_block() else: self._block = bigframes.pandas.read_pandas(pd_dataframe)._get_block() diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 04114b43cb..96134bc4b4 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -104,17 +104,7 @@ def __init__( if pd_series.name is None: # to_frame will set default numeric column label if unnamed, but we do not support int column label, so must rename pd_dataframe = pd_dataframe.set_axis(["unnamed_col"], axis=1) - if ( - pd_dataframe.size < MAX_INLINE_SERIES_SIZE - # TODO(swast): Workaround data types limitation in inline data. - and not any( - dt.pyarrow_dtype - for dt in pd_dataframe.dtypes - if isinstance(dt, pd.ArrowDtype) - ) - ): - block = blocks.Block.from_local(pd_dataframe) - elif session: + if session: block = session.read_pandas(pd_dataframe)._get_block() else: # Uses default global session diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index df0cd6e947..b8aad20f5b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -108,6 +108,10 @@ "UTF-32LE", } +# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. +# TODO(tbergeron): Convert to bytes-based limit +MAX_INLINE_DF_SIZE = 5000 + logger = logging.getLogger(__name__) @@ -882,6 +886,26 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame def _read_pandas( self, pandas_dataframe: pandas.DataFrame, api_name: str + ) -> dataframe.DataFrame: + if ( + pandas_dataframe.size < MAX_INLINE_DF_SIZE + # TODO(swast): Workaround data types limitation in inline data. + and not any( + dt.pyarrow_dtype + for dt in pandas_dataframe.dtypes + if isinstance(dt, pandas.ArrowDtype) + ) + ): + return self._read_pandas_inline(pandas_dataframe) + return self._read_pandas_load_job(pandas_dataframe, api_name) + + def _read_pandas_inline( + self, pandas_dataframe: pandas.DataFrame + ) -> dataframe.DataFrame: + return dataframe.DataFrame(blocks.Block.from_local(pandas_dataframe)) + + def _read_pandas_load_job( + self, pandas_dataframe: pandas.DataFrame, api_name: str ) -> dataframe.DataFrame: col_labels, idx_labels = ( pandas_dataframe.columns.to_list(), @@ -1079,7 +1103,7 @@ def read_csv( encoding=encoding, **kwargs, ) - return self.read_pandas(pandas_df) # type: ignore + return self._read_pandas(pandas_df, "read_csv") # type: ignore def read_pickle( self, @@ -1096,7 +1120,7 @@ def read_pickle( if isinstance(pandas_obj, pandas.Series): if pandas_obj.name is None: pandas_obj.name = "0" - bigframes_df = self.read_pandas(pandas_obj.to_frame()) + bigframes_df = self._read_pandas(pandas_obj.to_frame(), "read_pickle") return bigframes_df[bigframes_df.columns[0]] return self._read_pandas(pandas_obj, "read_pickle") @@ -1196,7 +1220,7 @@ def read_json( engine=engine, **kwargs, ) - return self.read_pandas(pandas_df) + return self._read_pandas(pandas_df, "read_json") def _check_file_size(self, filepath: str): max_size = 1024 * 1024 * 1024 # 1 GB in bytes From 22dc85c047160beca727258084434bae269a5b64 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Fri, 16 Feb 2024 18:48:49 +0000 Subject: [PATCH 02/10] fix tests --- .../getting_started_bq_dataframes.ipynb | 214 +++++++++++++++++- tests/unit/session/test_io_bigquery.py | 3 +- 2 files changed, 214 insertions(+), 3 deletions(-) diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index a9b6aefe30..403e871135 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -346,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "id": "PyQmSRbKA8r-" }, @@ -365,6 +365,15 @@ "### Set BigQuery DataFrames options" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, { "cell_type": "code", "execution_count": null, @@ -377,6 +386,81 @@ "bf.options.bigquery.location = REGION" ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 4dc6c41c-3e96-47db-98e4-ae6c2f6bf50e is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
12
013
124
\n", + "

2 rows × 2 columns

\n", + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + " 1 2\n", + "0 1 3\n", + "1 2 4\n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bf.DataFrame({\"1\": [1, 2], \"2\": [3, 4]})" + ] + }, { "cell_type": "markdown", "metadata": { @@ -386,6 +470,122 @@ "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.close_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location." ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "pd_df = pd.DataFrame({\"1\": [1, 2], \"2\": [3, 4]})" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Load job c9eedc60-a8ca-4b5c-8043-fd38525b5126 is DONE. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/garrettwu/.pyenv/versions/3.10.9/lib/python3.10/site-packages/ibis/backends/bigquery/__init__.py:499: FutureWarning: `database` is deprecated as of v7.1, removed in v8.0; The bigquery backend cannot return a table expression using only a `database` specifier. Include a `schema` argument.\n", + " util.warn_deprecated(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job e9c219ee-880f-464a-9464-cb2dab20d498 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d852b9ab-3814-4fd0-91af-53c57152c027 is DONE. 64 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
12
013
124
\n", + "

2 rows × 2 columns

\n", + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + " 1 2\n", + "0 1 3\n", + "1 2 4\n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bf.read_pandas(pd_df)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -973,6 +1173,18 @@ "kernelspec": { "display_name": "Python 3", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" } }, "nbformat": 4, diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index 96bb7bf67f..d610574efc 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -81,9 +81,8 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): "recent-bigframes-api-2": "dataframe-__init__", "recent-bigframes-api-3": "dataframe-head", "recent-bigframes-api-4": "dataframe-__init__", + "recent-bigframes-api-5": "dataframe-__init__", } - assert labels is not None - assert len(labels) == 7 assert labels == expected_dict From e113de7a0cc7717b0426d39c54d02e20f763d346 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Fri, 16 Feb 2024 18:53:45 +0000 Subject: [PATCH 03/10] revert unrelated file --- .../getting_started_bq_dataframes.ipynb | 214 +----------------- 1 file changed, 1 insertion(+), 213 deletions(-) diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index 403e871135..a9b6aefe30 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -346,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "id": "PyQmSRbKA8r-" }, @@ -365,15 +365,6 @@ "### Set BigQuery DataFrames options" ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, { "cell_type": "code", "execution_count": null, @@ -386,81 +377,6 @@ "bf.options.bigquery.location = REGION" ] }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 4dc6c41c-3e96-47db-98e4-ae6c2f6bf50e is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
12
013
124
\n", - "

2 rows × 2 columns

\n", - "
[2 rows x 2 columns in total]" - ], - "text/plain": [ - " 1 2\n", - "0 1 3\n", - "1 2 4\n", - "\n", - "[2 rows x 2 columns]" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bf.DataFrame({\"1\": [1, 2], \"2\": [3, 4]})" - ] - }, { "cell_type": "markdown", "metadata": { @@ -470,122 +386,6 @@ "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.close_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location." ] }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "pd_df = pd.DataFrame({\"1\": [1, 2], \"2\": [3, 4]})" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Load job c9eedc60-a8ca-4b5c-8043-fd38525b5126 is DONE. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/garrettwu/.pyenv/versions/3.10.9/lib/python3.10/site-packages/ibis/backends/bigquery/__init__.py:499: FutureWarning: `database` is deprecated as of v7.1, removed in v8.0; The bigquery backend cannot return a table expression using only a `database` specifier. Include a `schema` argument.\n", - " util.warn_deprecated(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job e9c219ee-880f-464a-9464-cb2dab20d498 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job d852b9ab-3814-4fd0-91af-53c57152c027 is DONE. 64 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
12
013
124
\n", - "

2 rows × 2 columns

\n", - "
[2 rows x 2 columns in total]" - ], - "text/plain": [ - " 1 2\n", - "0 1 3\n", - "1 2 4\n", - "\n", - "[2 rows x 2 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bf.read_pandas(pd_df)" - ] - }, { "cell_type": "markdown", "metadata": { @@ -1173,18 +973,6 @@ "kernelspec": { "display_name": "Python 3", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" } }, "nbformat": 4, From 3a8520a6167a2981495f5fadd4c324cf4687fbde Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Fri, 16 Feb 2024 21:15:56 +0000 Subject: [PATCH 04/10] fix tests --- tests/unit/session/test_io_bigquery.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index d610574efc..406de2b88e 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -23,6 +23,7 @@ from bigframes.core import log_adapter import bigframes.pandas as bpd import bigframes.session._io.bigquery as io_bq +from tests.unit import resources def test_create_job_configs_labels_is_none(): @@ -64,7 +65,9 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): "bigframes-api": "read_pandas", "source": "bigquery-dataframes-temp", } - df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + df = bpd.DataFrame( + {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() + ) # Test running two methods df.head() df.max() @@ -88,7 +91,9 @@ def test_create_job_configs_labels_log_adaptor_call_method_under_length_limit(): def test_create_job_configs_labels_length_limit_met_and_labels_is_none(): log_adapter.get_and_reset_api_methods() - df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + df = bpd.DataFrame( + {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() + ) # Test running methods more than the labels' length limit for i in range(66): df.head() @@ -113,7 +118,9 @@ def test_create_job_configs_labels_length_limit_met(): value = f"test{i}" cur_labels[key] = value # If cur_labels length is 62, we can only add one label from api_methods - df = bpd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + df = bpd.DataFrame( + {"col1": [1, 2], "col2": [3, 4]}, session=resources.create_bigquery_session() + ) # Test running two methods df.head() df.max() From b2e003b383929042d7368bfc0f1dad6e1805107c Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Fri, 16 Feb 2024 21:25:15 +0000 Subject: [PATCH 05/10] fix tests --- bigframes/operations/base.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 96134bc4b4..154247c033 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -30,10 +30,6 @@ import bigframes.session import third_party.bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing -# BigQuery has 1 MB query size limit, 5000 items shouldn't take more than 10% of this depending on data type. -# TODO(tbergeron): Convert to bytes-based limit -MAX_INLINE_SERIES_SIZE = 5000 - class SeriesMethods: def __init__( From 912af6e5ecb94a03de5fc4da75b0f0c4dc95d2e7 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Tue, 20 Feb 2024 19:24:14 +0000 Subject: [PATCH 06/10] fix tests --- bigframes/session/__init__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index b8aad20f5b..79f5323d6d 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -891,9 +891,12 @@ def _read_pandas( pandas_dataframe.size < MAX_INLINE_DF_SIZE # TODO(swast): Workaround data types limitation in inline data. and not any( - dt.pyarrow_dtype - for dt in pandas_dataframe.dtypes - if isinstance(dt, pandas.ArrowDtype) + ( + isinstance(s.dtype, pandas.ArrowDtype) + or pandas.api.types.is_list_like(s) + or pandas.api.types.is_datetime64_dtype(s) + ) + for _, s in pandas_dataframe.items() ) ): return self._read_pandas_inline(pandas_dataframe) From b533641767d4045b78e2538fc5000884b5a74a1b Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Tue, 20 Feb 2024 21:24:21 +0000 Subject: [PATCH 07/10] fix tests --- bigframes/session/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 79f5323d6d..32f9702d3d 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -893,7 +893,7 @@ def _read_pandas( and not any( ( isinstance(s.dtype, pandas.ArrowDtype) - or pandas.api.types.is_list_like(s) + or (len(s) > 0 and pandas.api.types.is_list_like(s[0])) or pandas.api.types.is_datetime64_dtype(s) ) for _, s in pandas_dataframe.items() From 86f7eeba337d80e677c7a4402d2bc33fef09617a Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Tue, 20 Feb 2024 21:53:58 +0000 Subject: [PATCH 08/10] fix tests --- bigframes/session/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 32f9702d3d..e5706ab0a6 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -893,7 +893,7 @@ def _read_pandas( and not any( ( isinstance(s.dtype, pandas.ArrowDtype) - or (len(s) > 0 and pandas.api.types.is_list_like(s[0])) + or (len(s) > 0 and pandas.api.types.is_list_like(s.iloc[0])) or pandas.api.types.is_datetime64_dtype(s) ) for _, s in pandas_dataframe.items() From d373770ae027582b2b145a6ac485e0607b698257 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Tue, 20 Feb 2024 23:08:30 +0000 Subject: [PATCH 09/10] fix tests --- bigframes/session/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index e5706ab0a6..20dd39c0fa 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -894,7 +894,7 @@ def _read_pandas( ( isinstance(s.dtype, pandas.ArrowDtype) or (len(s) > 0 and pandas.api.types.is_list_like(s.iloc[0])) - or pandas.api.types.is_datetime64_dtype(s) + or pandas.api.types.is_datetime64_any_dtype(s) ) for _, s in pandas_dataframe.items() ) From abd65b97bf3f007b24acfc7dc8f8eba3e13b0a42 Mon Sep 17 00:00:00 2001 From: Garrett Wu Date: Tue, 20 Feb 2024 23:38:44 +0000 Subject: [PATCH 10/10] fix tests --- tests/system/small/test_progress_bar.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index bd13ac2240..1c04b580fc 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -15,10 +15,12 @@ import re import tempfile +import numpy as np import pandas as pd import bigframes as bf import bigframes.formatting_helpers as formatting_helpers +from bigframes.session import MAX_INLINE_DF_SIZE job_load_message_regex = r"\w+ job [\w-]+ is \w+\." @@ -66,10 +68,15 @@ def test_progress_bar_extract_jobs( def test_progress_bar_load_jobs( session: bf.Session, penguins_pandas_df_default_index: pd.DataFrame, capsys ): + # repeat the DF to be big enough to trigger the load job. + df = penguins_pandas_df_default_index + while len(df) < MAX_INLINE_DF_SIZE: + df = pd.DataFrame(np.repeat(df.values, 2, axis=0)) + bf.options.display.progress_bar = "terminal" with tempfile.TemporaryDirectory() as dir: path = dir + "/test_read_csv_progress_bar*.csv" - penguins_pandas_df_default_index.to_csv(path, index=False) + df.to_csv(path, index=False) capsys.readouterr() # clear output session.read_csv(path)