From a436194957f4fa41c979aef8bd19164d4a3ea8dc Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 10 Nov 2023 17:13:56 +0000 Subject: [PATCH 1/4] fix: use random table for read_pandas --- bigframes/core/__init__.py | 2 +- bigframes/session/__init__.py | 14 +++----------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 63f36d4ddd..b640692bc8 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -165,7 +165,7 @@ def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: ibis_expr = compiled_value._to_ibis_expr( ordering_mode="unordered", expose_hidden_cols=True ) - tmp_table = self.session._ibis_to_session_table( + tmp_table = self.session._ibis_to_temp_table( ibis_expr, cluster_cols=cluster_cols, api_name="cached" ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index bd5845631b..a1bc75c3ed 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -36,7 +36,6 @@ Tuple, Union, ) -import uuid import warnings import google.api_core.client_info @@ -977,7 +976,7 @@ def _read_pandas( job_config.clustering_fields = cluster_cols job_config.labels = {"bigframes-api": api_name} - load_table_destination = self._create_session_table() + load_table_destination = bigframes_io.random_table(self._anonymous_dataset) load_job = self.bqclient.load_table_from_dataframe( pandas_dataframe_copy, load_table_destination, @@ -1269,13 +1268,6 @@ def _check_file_size(self, filepath: str): "for large files to avoid loading the file into local memory." ) - def _create_session_table(self) -> bigquery.TableReference: - table_name = f"{uuid.uuid4().hex}" - dataset = bigquery.Dataset( - bigquery.DatasetReference(self.bqclient.project, "_SESSION") - ) - return dataset.table(table_name) - def _create_empty_temp_table( self, schema: Iterable[bigquery.SchemaField], @@ -1310,7 +1302,7 @@ def _create_sequential_ordering( ibis.row_number().cast(ibis_dtypes.int64).name(default_ordering_name) ) table = table.mutate(**{default_ordering_name: default_ordering_col}) - table_ref = self._ibis_to_session_table( + table_ref = self._ibis_to_temp_table( table, cluster_cols=list(index_cols) + [default_ordering_name], api_name=api_name, @@ -1326,7 +1318,7 @@ def _create_sequential_ordering( ) return table, ordering - def _ibis_to_session_table( + def _ibis_to_temp_table( self, table: ibis_types.Table, cluster_cols: Iterable[str], From 03606cda30eb7645bfd4534460112dcca56b0ab0 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 10 Nov 2023 12:50:14 -0600 Subject: [PATCH 2/4] fix: default to 7 days expiration for `read_csv`, `read_json`, `read_parquet` (#193) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Follow-up to https://togithub.com/googleapis/python-bigquery-dataframes/pull/175/files#r1389686556 🦕 --- bigframes/session/__init__.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index bd5845631b..27616de08d 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -836,11 +836,20 @@ def _read_bigquery_load_job( ) self._start_generic_job(load_job) + table_id = f"{table.project}.{table.dataset_id}.{table.table_id}" + + # Update the table expiration so we aren't limited to the default 24 + # hours of the anonymous dataset. + table_expiration = bigquery.Table(table_id) + table_expiration.expires = ( + datetime.datetime.now(datetime.timezone.utc) + constants.DEFAULT_EXPIRATION + ) + self.bqclient.update_table(table_expiration, ["expires"]) # The BigQuery REST API for tables.get doesn't take a session ID, so we # can't get the schema for a temp table that way. return self.read_gbq_table( - f"{table.project}.{table.dataset_id}.{table.table_id}", + table_id, index_col=index_col, col_order=col_order, ) From 96b8b1da9895971e4d5f2ee6744faf14848c8e76 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 10 Nov 2023 17:13:56 +0000 Subject: [PATCH 3/4] fix: use random table for read_pandas --- bigframes/core/__init__.py | 2 +- bigframes/session/__init__.py | 14 +++----------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 63f36d4ddd..b640692bc8 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -165,7 +165,7 @@ def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: ibis_expr = compiled_value._to_ibis_expr( ordering_mode="unordered", expose_hidden_cols=True ) - tmp_table = self.session._ibis_to_session_table( + tmp_table = self.session._ibis_to_temp_table( ibis_expr, cluster_cols=cluster_cols, api_name="cached" ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 27616de08d..005865d9b7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -36,7 +36,6 @@ Tuple, Union, ) -import uuid import warnings import google.api_core.client_info @@ -986,7 +985,7 @@ def _read_pandas( job_config.clustering_fields = cluster_cols job_config.labels = {"bigframes-api": api_name} - load_table_destination = self._create_session_table() + load_table_destination = bigframes_io.random_table(self._anonymous_dataset) load_job = self.bqclient.load_table_from_dataframe( pandas_dataframe_copy, load_table_destination, @@ -1278,13 +1277,6 @@ def _check_file_size(self, filepath: str): "for large files to avoid loading the file into local memory." ) - def _create_session_table(self) -> bigquery.TableReference: - table_name = f"{uuid.uuid4().hex}" - dataset = bigquery.Dataset( - bigquery.DatasetReference(self.bqclient.project, "_SESSION") - ) - return dataset.table(table_name) - def _create_empty_temp_table( self, schema: Iterable[bigquery.SchemaField], @@ -1319,7 +1311,7 @@ def _create_sequential_ordering( ibis.row_number().cast(ibis_dtypes.int64).name(default_ordering_name) ) table = table.mutate(**{default_ordering_name: default_ordering_col}) - table_ref = self._ibis_to_session_table( + table_ref = self._ibis_to_temp_table( table, cluster_cols=list(index_cols) + [default_ordering_name], api_name=api_name, @@ -1335,7 +1327,7 @@ def _create_sequential_ordering( ) return table, ordering - def _ibis_to_session_table( + def _ibis_to_temp_table( self, table: ibis_types.Table, cluster_cols: Iterable[str], From 2d725968c9c9e88a4fe97151a3582821e995548a Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 10 Nov 2023 19:26:37 +0000 Subject: [PATCH 4/4] fix: use the real table expression --- bigframes/session/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 005865d9b7..9b881de9a0 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -998,8 +998,9 @@ def _read_pandas( total_ordering_columns=frozenset([ordering_col]), integer_encoding=IntegerEncoding(True, is_sequential=True), ) - table_expression = self.ibis_client.sql( - f"SELECT * FROM `{load_table_destination.table_id}`" + table_expression = self.ibis_client.table( + load_table_destination.table_id, + database=f"{load_table_destination.project}.{load_table_destination.dataset_id}", ) # b/297590178 Potentially a bug in bqclient.load_table_from_dataframe(), that only when the DF is empty, the index columns disappear in table_expression.