Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit fdd3e0d

Browse filesBrowse files
fix: avoid views when querying BigLake tables from SQL cells (#16562)
Workaround for internal bug 493608478. Migrates googleapis/python-bigquery-dataframes#2545 🦕 --------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 56ccbd8 commit fdd3e0d
Copy full SHA for fdd3e0d

3 files changed

+88-5Lines changed: 88 additions & 5 deletions

File tree

Expand file treeCollapse file tree
Open diff view settings
Filter options
Expand file treeCollapse file tree
Open diff view settings
Collapse file

‎packages/bigframes/bigframes/core/pyformat.py‎

Copy file name to clipboardExpand all lines: packages/bigframes/bigframes/core/pyformat.py
+24-2Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
import string
2323
import typing
24-
from typing import Any, Optional, Union
24+
from typing import Any, Optional, Tuple, Union
2525

2626
import google.cloud.bigquery
2727
import pandas
@@ -39,7 +39,11 @@
3939

4040

4141
def _table_to_sql(table: _BQ_TABLE_TYPES) -> str:
42-
return f"`{table.project}`.`{table.dataset_id}`.`{table.table_id}`"
42+
# BiglakeIcebergTable IDs have 4 parts. BigFrames packs catalog.namespace
43+
# into the dataset_id.
44+
dataset_parts = table.dataset_id.split(".")
45+
dataset_sql = ".".join(f"`{part}`" for part in dataset_parts)
46+
return f"`{table.project}`.{dataset_sql}.`{table.table_id}`"
4347

4448

4549
def _pandas_df_to_sql_dry_run(pd_df: pandas.DataFrame) -> str:
@@ -102,6 +106,24 @@ def _field_to_template_value(
102106
return _pandas_df_to_sql(value, session=session, dry_run=dry_run, name=name)
103107

104108
if isinstance(value, bigframes.dataframe.DataFrame):
109+
import bigframes.core.bq_data as bq_data
110+
import bigframes.core.nodes as nodes
111+
112+
# TODO(b/493608478): Remove this workaround for BigLake/Iceberg tables,
113+
# which cannot currently be used in views, once a fix rolls out.
114+
def is_biglake(
115+
node: nodes.BigFrameNode, child_results: Tuple[bool, ...]
116+
) -> bool:
117+
if isinstance(node, nodes.ReadTableNode):
118+
return isinstance(node.source.table, bq_data.BiglakeIcebergTable)
119+
return any(child_results)
120+
121+
contains_biglake = value._block.expr.node.reduce_up(is_biglake)
122+
123+
if contains_biglake:
124+
sql_query, _, _ = value._to_sql_query(include_index=True)
125+
return f"({sql_query})"
126+
105127
return _table_to_sql(value._to_placeholder_table(dry_run=dry_run))
106128

107129
if isinstance(value, str):
Collapse file

‎packages/bigframes/bigframes/session/_io/bigquery/__init__.py‎

Copy file name to clipboardExpand all lines: packages/bigframes/bigframes/session/_io/bigquery/__init__.py
+7-3Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -515,9 +515,13 @@ def to_query(
515515
time_travel_timestamp: Optional[datetime.datetime] = None,
516516
) -> str:
517517
"""Compile query_or_table with conditions(filters, wildcards) to query."""
518-
sub_query = (
519-
f"({query_or_table})" if is_query(query_or_table) else f"`{query_or_table}`"
520-
)
518+
if is_query(query_or_table):
519+
sub_query = f"({query_or_table})"
520+
else:
521+
# Table ID can have 1, 2, 3, or 4 parts. Quoting all parts to be safe.
522+
# See: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers
523+
parts = query_or_table.split(".")
524+
sub_query = ".".join(f"`{part}`" for part in parts)
521525

522526
# TODO(b/338111344): Generate an index based on DefaultIndexKind if we
523527
# don't have index columns specified.
Collapse file

‎packages/bigframes/tests/unit/core/test_pyformat.py‎

Copy file name to clipboardExpand all lines: packages/bigframes/tests/unit/core/test_pyformat.py
+57Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,15 @@ def test_pyformat_with_query_string_replaces_variables(session):
500500
),
501501
"SELECT * FROM `ListedProject`.`ListedDataset`.`ListedTable`",
502502
),
503+
(
504+
google.cloud.bigquery.TableReference(
505+
google.cloud.bigquery.DatasetReference(
506+
"my-project", "my-catalog.my-namespace"
507+
),
508+
"my-table",
509+
),
510+
"SELECT * FROM `my-project`.`my-catalog`.`my-namespace`.`my-table`",
511+
),
503512
),
504513
)
505514
def test_pyformat_with_table_replaces_variables(table, expected_sql, session=session):
@@ -511,3 +520,51 @@ def test_pyformat_with_table_replaces_variables(table, expected_sql, session=ses
511520
sql = "SELECT * FROM {table}"
512521
got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session)
513522
assert got_sql == expected_sql
523+
524+
525+
def test_pyformat_with_bigframes_dataframe_biglake_table(session):
526+
# Create a real BigFrames DataFrame that points to a BigLake table.
527+
import bigframes.core.array_value as array_value
528+
import bigframes.core.blocks as blocks
529+
import bigframes.core.bq_data as bq_data
530+
import bigframes.dataframe
531+
532+
# Define the BigLake table
533+
project_id = "my-project"
534+
catalog_id = "my-catalog"
535+
namespace_id = "my-namespace"
536+
table_id = "my-table"
537+
schema = (google.cloud.bigquery.SchemaField("col", "INTEGER"),)
538+
539+
biglake_table = bq_data.BiglakeIcebergTable(
540+
project_id=project_id,
541+
catalog_id=catalog_id,
542+
namespace_id=namespace_id,
543+
table_id=table_id,
544+
physical_schema=schema,
545+
cluster_cols=(),
546+
metadata=bq_data.TableMetadata(
547+
location=bq_data.BigQueryRegion("us-central1"),
548+
type="TABLE",
549+
),
550+
)
551+
552+
# ArrayValue.from_table is what read_gbq uses.
553+
av = array_value.ArrayValue.from_table(biglake_table, session)
554+
block = blocks.Block(av, index_columns=[], column_labels=["col"])
555+
df = bigframes.dataframe.DataFrame(block)
556+
557+
pyformat_args = {"df": df}
558+
sql = "SELECT * FROM {df}"
559+
560+
got_sql = pyformat.pyformat(sql, pyformat_args=pyformat_args, session=session)
561+
562+
# For BigLake, we now expect a SUBQUERY, not a view reference.
563+
# The subquery should have correctly quoted 4-part ID.
564+
assert "SELECT" in got_sql
565+
assert project_id in got_sql
566+
assert catalog_id in got_sql
567+
assert namespace_id in got_sql
568+
assert table_id in got_sql
569+
assert got_sql.startswith("SELECT * FROM (SELECT")
570+
assert got_sql.endswith(")")

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.