Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit f93911c

Browse files
authored
fix: allow multi-part dataset IDs to support BigLake tables (#17137)
Relaxes DatasetReference.from_string and TableReference.from_string validation. Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/google-cloud-python/issues) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/512823729 🦕
1 parent 3624f3b commit f93911c
Copy full SHA for f93911c

8 files changed

+405 -145 — Lines changed: 405 additions & 145 deletions

File tree

Expand file treeCollapse file tree
Open diff view settings
Filter options
Expand file treeCollapse file tree
Open diff view settings
Collapse file
+166Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Helper to turn string references into REST resources."""
16+
17+
# TODO(b/513204277): Consolidate these transformations with pandas-gbq and bigframes.
18+
19+
from __future__ import annotations
20+
21+
import re
22+
from typing import TypedDict, Union
23+
24+
25+
class ParsedDatasetReference(TypedDict):
    """Dataset reference as a plain dictionary with REST-style keys."""

    projectId: str
    datasetId: str


class ParsedTableReference(TypedDict):
    """Table reference as a plain dictionary with REST-style keys."""

    projectId: str
    datasetId: str
    tableId: str
42+
43+
44+
# In the past, organizations could prefix their project IDs with a domain
# name (for example "google.com:my-project"). Such projects still exist,
# especially at Google. The captured group keeps its trailing ":".
_LEGACY_PROJECT_DOMAIN_REGEX = r"^(?P<legacy_project_domain>[^:]+:)?"

# The project ID is everything up to the first "." separator.
_PROJECT_REGEX = r"(?P<project>[^.]+)\."

# Match dataset or catalog + namespace.
#
# Namespace could be arbitrarily deeply nested in Iceberg/BigLake, so this
# group is allowed to contain "." itself. A single greedy group is used
# rather than a repeated "part." group to avoid catastrophic backtracking;
# in the table patterns the separating "." before the table name belongs to
# the table fragment below instead of to this group.
_INNER_PARTS_REGEX = r"(?P<inner_parts>.*)"

# Table names can't contain ".", as that's used as the separator. This
# fragment also consumes the "." that separates the table from what precedes
# it and anchors the match at the end of the string.
_TABLE_REGEX = r"\.(?P<table>[^.]+)$"


# [domain:]project.dataset, where the dataset part may have several parts.
_FULLY_QUALIFIED_DATASET_REFERENCE_PATTERN = re.compile(
    _LEGACY_PROJECT_DOMAIN_REGEX + _PROJECT_REGEX + _INNER_PARTS_REGEX
)


# [domain:]project.dataset.table, where the dataset part may have several
# parts.
_FULLY_QUALIFIED_TABLE_REFERENCE_PATTERN = re.compile(
    _LEGACY_PROJECT_DOMAIN_REGEX
    + _PROJECT_REGEX
    + _INNER_PARTS_REGEX
    + _TABLE_REGEX
)


# dataset.table with no project, where the dataset part may have several
# parts.
_RELATIVE_TABLE_REFERENCE_PATTERN = re.compile(_INNER_PARTS_REGEX + _TABLE_REGEX)
84+
85+
86+
def parse_dataset_reference(
    dataset_id: str, *, default_project: Union[str, None]
) -> ParsedDatasetReference:
    """Split a dataset ID string into its project and dataset parts.

    Args:
        dataset_id: The dataset ID to parse. When it matches the
            fully-qualified dataset pattern, the text before the first "."
            (plus any legacy ``domain:`` prefix) becomes the project and
            everything after that "." becomes the dataset ID.
        default_project: Project to use when ``dataset_id`` does not match
            the fully-qualified pattern. In that case ``dataset_id`` is used
            verbatim as the dataset ID.

    Returns:
        ParsedDatasetReference: A typed dictionary (to avoid circular dependencies).

    Raises:
        ValueError: When a fully-qualified dataset ID can't be determined,
            i.e. the pattern does not match and ``default_project`` is
            ``None`` or empty.
    """
    fully_qualified = _FULLY_QUALIFIED_DATASET_REFERENCE_PATTERN.match(dataset_id)
    if fully_qualified is not None:
        # The optional domain group is None when absent; when present it
        # already ends with ":", so plain concatenation rebuilds the
        # project ID.
        domain_prefix = fully_qualified.group("legacy_project_domain") or ""
        return {
            "projectId": domain_prefix + fully_qualified.group("project"),
            "datasetId": fully_qualified.group("inner_parts"),
        }

    if default_project:
        return {"datasetId": dataset_id, "projectId": default_project}

    raise ValueError(
        "When default_project is not set, dataset_id must be a "
        "fully-qualified dataset ID in standard SQL format, "
        'e.g., "project.dataset_id" got {}'.format(dataset_id)
    )
121+
122+
def parse_table_reference(
    table_id: str, *, default_project: Union[str, None]
) -> ParsedTableReference:
    """Parse a table ID string into its project, dataset, and table parts.

    The fully-qualified form (``project.dataset.table``, optionally with a
    legacy ``domain:`` project prefix, and with a dataset part that may
    itself contain ".") is tried first. If that does not match, the ID is
    treated as relative (``dataset.table``) and ``default_project`` supplies
    the project.

    Args:
        table_id: The table ID to parse, with parts separated by ".".
        default_project: Project to use when ``table_id`` does not match the
            fully-qualified pattern. It is ignored when ``table_id`` is
            fully qualified.

    Returns:
        ParsedTableReference: A typed dictionary (to avoid circular dependencies).

    Raises:
        ValueError: When a fully-qualified table ID can't be determined,
            either because no project is available (``default_project`` is
            ``None`` or empty) or because ``table_id`` does not match the
            relative pattern either.
    """
    regex_match = _FULLY_QUALIFIED_TABLE_REFERENCE_PATTERN.match(table_id)
    if regex_match:
        # The optional domain group is None when absent; when present it
        # already ends with ":", so concatenation rebuilds the project ID.
        legacy_project_domain = regex_match.group("legacy_project_domain") or ""
        return {
            "projectId": f"{legacy_project_domain}{regex_match.group('project')}",
            "datasetId": regex_match.group("inner_parts"),
            "tableId": regex_match.group("table"),
        }

    if not default_project:
        raise ValueError(
            "Could not determine project ID. Supply a default project or a fully-qualified table ID, "
            f"such as 'project.dataset.table'. Got {table_id}."
        )

    regex_match = _RELATIVE_TABLE_REFERENCE_PATTERN.match(table_id)
    if not regex_match:
        # Adjacent string literals are concatenated with no separator, so the
        # first literal must end with a space to keep the message readable.
        raise ValueError(
            "Could not parse table_id. Expected a table ID "
            f"such as 'project.dataset.table', but got {table_id}."
        )

    return {
        "projectId": default_project,
        "datasetId": regex_match.group("inner_parts"),
        "tableId": regex_match.group("table"),
    }
Collapse file

‎packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py‎

Copy file name to clipboardExpand all lines: packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py
+9-22Lines changed: 9 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from google.cloud.bigquery.table import Table, TableReference
3131
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
3232
from google.cloud.bigquery import external_config
33+
from google.cloud.bigquery import _string_references
3334

3435

3536
def _get_table_reference(self, table_id: str) -> TableReference:
@@ -123,7 +124,9 @@ def path(self):
123124
routine = _get_routine_reference
124125

125126
@classmethod
126-
def from_api_repr(cls, resource: dict) -> "DatasetReference":
127+
def from_api_repr(
128+
cls, resource: Union[dict, _string_references.ParsedDatasetReference]
129+
) -> "DatasetReference":
127130
"""Factory: construct a dataset reference given its API representation
128131
129132
Args:
@@ -166,28 +169,12 @@ def from_string(
166169
If ``dataset_id`` is not a fully-qualified dataset ID in
167170
standard SQL format.
168171
"""
169-
output_dataset_id = dataset_id
170-
parts = _helpers._split_id(dataset_id)
171-
172-
if len(parts) == 1:
173-
if default_project is not None:
174-
output_project_id = default_project
175-
else:
176-
raise ValueError(
177-
"When default_project is not set, dataset_id must be a "
178-
"fully-qualified dataset ID in standard SQL format, "
179-
'e.g., "project.dataset_id" got {}'.format(dataset_id)
180-
)
181-
elif len(parts) == 2:
182-
output_project_id, output_dataset_id = parts
183-
else:
184-
raise ValueError(
185-
"Too many parts in dataset_id. Expected a fully-qualified "
186-
"dataset ID in standard SQL format, "
187-
'e.g. "project.dataset_id", got {}'.format(dataset_id)
172+
return cls.from_api_repr(
173+
_string_references.parse_dataset_reference(
174+
dataset_id=dataset_id,
175+
default_project=default_project,
188176
)
189-
190-
return cls(output_project_id, output_dataset_id)
177+
)
191178

192179
def to_api_repr(self) -> dict:
193180
"""Construct the API resource representation of this dataset reference
Collapse file

‎packages/google-cloud-bigquery/google/cloud/bigquery/table.py‎

Copy file name to clipboardExpand all lines: packages/google-cloud-bigquery/google/cloud/bigquery/table.py
+9-13Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
from google.cloud.bigquery.schema import _parse_schema_resource
7373
from google.cloud.bigquery.schema import _to_schema_fields
7474
from google.cloud.bigquery import external_config
75+
from google.cloud.bigquery import _string_references
7576

7677
if typing.TYPE_CHECKING: # pragma: NO COVER
7778
# Unconditionally import optional dependencies again to tell pytype that
@@ -281,22 +282,17 @@ def from_string(
281282
If ``table_id`` is not a fully-qualified table ID in
282283
standard SQL format.
283284
"""
284-
from google.cloud.bigquery.dataset import DatasetReference
285-
286-
(
287-
output_project_id,
288-
output_dataset_id,
289-
output_table_id,
290-
) = _helpers._parse_3_part_id(
291-
table_id, default_project=default_project, property_name="table_id"
292-
)
293-
294-
return cls(
295-
DatasetReference(output_project_id, output_dataset_id), output_table_id
285+
return cls.from_api_repr(
286+
_string_references.parse_table_reference(
287+
table_id=table_id,
288+
default_project=default_project,
289+
)
296290
)
297291

298292
@classmethod
299-
def from_api_repr(cls, resource: dict) -> "TableReference":
293+
def from_api_repr(
294+
cls, resource: Union[dict, _string_references.ParsedTableReference]
295+
) -> "TableReference":
300296
"""Factory: construct a table reference given its API representation
301297
302298
Args:
Collapse file

‎packages/google-cloud-bigquery/tests/system/test_client.py‎

Copy file name to clipboardExpand all lines: packages/google-cloud-bigquery/tests/system/test_client.py
+24Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,18 @@ def test_get_dataset(self):
304304
self.assertEqual(got.friendly_name, "Friendly")
305305
self.assertEqual(got.description, "Description")
306306

307+
def test_get_dataset_w_public_biglake(self):
308+
dataset_id = "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data"
309+
310+
dataset = Config.CLIENT.get_dataset(dataset_id)
311+
self.assertEqual(
312+
dataset.dataset_id, "biglake-public-nyc-taxi-iceberg.public_data"
313+
)
314+
self.assertEqual(dataset.project, "bigquery-public-data")
315+
self.assertGreater(
316+
dataset.created, datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc)
317+
)
318+
307319
def test_create_dataset_with_default_rounding_mode(self):
308320
DATASET_ID = _make_dataset_id("create_dataset_rounding_mode")
309321
dataset = self.temp_dataset(DATASET_ID, default_rounding_mode="ROUND_HALF_EVEN")
@@ -693,6 +705,18 @@ def test_delete_dataset_delete_contents_false(self):
693705
with self.assertRaises(exceptions.BadRequest):
694706
Config.CLIENT.delete_dataset(dataset)
695707

708+
def test_get_table_w_public_biglake(self):
709+
table_id = "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data.nyc_taxicab"
710+
711+
table = Config.CLIENT.get_table(table_id)
712+
self.assertEqual(table.table_id, "nyc_taxicab")
713+
self.assertEqual(
714+
table.dataset_id, "biglake-public-nyc-taxi-iceberg.public_data"
715+
)
716+
self.assertEqual(table.project, "bigquery-public-data")
717+
schema_names = [field.name for field in table.schema]
718+
self.assertGreater(len(schema_names), 0)
719+
696720
def test_get_table_w_public_dataset(self):
697721
public = "bigquery-public-data"
698722
dataset_id = "samples"
Collapse file

‎packages/google-cloud-bigquery/tests/unit/test_dataset.py‎

Copy file name to clipboardExpand all lines: packages/google-cloud-bigquery/tests/unit/test_dataset.py
-48Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -820,54 +820,6 @@ def test_from_api_repr(self):
820820

821821
self.assertEqual(expected, got)
822822

823-
def test_from_string(self):
824-
cls = self._get_target_class()
825-
got = cls.from_string("string-project.string_dataset")
826-
self.assertEqual(got.project, "string-project")
827-
self.assertEqual(got.dataset_id, "string_dataset")
828-
829-
def test_from_string_w_prefix(self):
830-
cls = self._get_target_class()
831-
got = cls.from_string("google.com:string-project.string_dataset")
832-
self.assertEqual(got.project, "google.com:string-project")
833-
self.assertEqual(got.dataset_id, "string_dataset")
834-
835-
def test_from_string_legacy_string(self):
836-
cls = self._get_target_class()
837-
with self.assertRaises(ValueError):
838-
cls.from_string("string-project:string_dataset")
839-
840-
def test_from_string_w_incorrect_prefix(self):
841-
cls = self._get_target_class()
842-
with self.assertRaises(ValueError):
843-
cls.from_string("google.com.string-project.dataset_id")
844-
845-
def test_from_string_w_prefix_and_too_many_parts(self):
846-
cls = self._get_target_class()
847-
with self.assertRaises(ValueError):
848-
cls.from_string("google.com:string-project.dataset_id.table_id")
849-
850-
def test_from_string_not_fully_qualified(self):
851-
cls = self._get_target_class()
852-
with self.assertRaises(ValueError):
853-
cls.from_string("string_dataset")
854-
with self.assertRaises(ValueError):
855-
cls.from_string("a.b.c")
856-
857-
def test_from_string_with_default_project(self):
858-
cls = self._get_target_class()
859-
got = cls.from_string("string_dataset", default_project="default-project")
860-
self.assertEqual(got.project, "default-project")
861-
self.assertEqual(got.dataset_id, "string_dataset")
862-
863-
def test_from_string_ignores_default_project(self):
864-
cls = self._get_target_class()
865-
got = cls.from_string(
866-
"string-project.string_dataset", default_project="default-project"
867-
)
868-
self.assertEqual(got.project, "string-project")
869-
self.assertEqual(got.dataset_id, "string_dataset")
870-
871823
def test___eq___wrong_type(self):
872824
dataset = self._make_one("project_1", "dataset_1")
873825
other = object()
Collapse file

‎packages/google-cloud-bigquery/tests/unit/test_magics.py‎

Copy file name to clipboardExpand all lines: packages/google-cloud-bigquery/tests/unit/test_magics.py
+14-8Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1337,12 +1337,13 @@ def test_context_with_no_query_cache_from_context(monkeypatch):
13371337
ip = IPython.get_ipython()
13381338
monkeypatch.setattr(bigquery, "bigquery_magics", None)
13391339
bigquery.load_ipython_extension(ip)
1340+
context = magics.Context()
13401341
conn = make_connection()
1341-
monkeypatch.setattr(magics.context, "_connection", conn)
1342-
monkeypatch.setattr(magics.context, "project", "project-from-context")
1343-
monkeypatch.setattr(
1344-
magics.context.default_query_job_config, "use_query_cache", False
1345-
)
1342+
context._connection = conn
1343+
context.credentials = mock.create_autospec(google.auth.credentials.Credentials)
1344+
context.default_query_job_config = bigquery.QueryJobConfig(use_query_cache=False)
1345+
context.project = "project-from-context"
1346+
monkeypatch.setattr(magics, "context", context)
13461347

13471348
ip.run_cell_magic("bigquery", "", QUERY_STRING)
13481349

@@ -1415,12 +1416,17 @@ def test_bigquery_magic_with_progress_bar_type(monkeypatch):
14151416
ip = IPython.get_ipython()
14161417
monkeypatch.setattr(bigquery, "bigquery_magics", None)
14171418
bigquery.load_ipython_extension(ip)
1418-
magics.context.progress_bar_type = None
1419+
context = magics.Context()
1420+
conn = make_connection()
1421+
context._connection = conn
1422+
context.credentials = mock.create_autospec(google.auth.credentials.Credentials)
1423+
context.progress_bar_type = None
1424+
context.project = "unit-test-project"
1425+
monkeypatch.setattr(magics, "context", context)
14191426

14201427
run_query_patch = mock.patch(
14211428
"google.cloud.bigquery.magics.magics._run_query", autospec=True
14221429
)
1423-
magics.context.project = "unit-test-project"
14241430

14251431
with run_query_patch as run_query_mock:
14261432
ip.run_cell_magic(
@@ -2045,7 +2051,7 @@ def test_bigquery_magic_query_variable_not_identifier(monkeypatch):
20452051
# considered a table name, thus we expect an error that the table ID is not valid.
20462052
output = captured_io.stderr
20472053
assert "ERROR:" in output
2048-
assert "must be a fully-qualified ID" in output
2054+
assert "Could not parse table_id." in output
20492055

20502056

20512057
@pytest.mark.usefixtures("ipython_interactive")

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.