Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings
This repository was archived by the owner on May 7, 2026. It is now read-only.

Commit e91536c

Browse files
google-labs-jules[bot], tswast, sycai
authored
feat: add bigframes.bigquery.ai.generate_embedding (#2343)
Implement AI.GENERATE_EMBEDDING function in bigframes.bigquery.ai. --- *PR created automatically by Jules for task [11924477578091076513](https://jules.google.com/task/11924477578091076513) started by @tswast* --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: Tim Sweña <swast@google.com> Co-authored-by: Shenyang Cai <sycai@users.noreply.github.com>
1 parent 4b0f13b commit e91536c
Copy full SHA for e91536c

5 files changed

+305-33Lines changed: 305 additions & 33 deletions

File tree

Expand file treeCollapse file tree
Open diff view settings
Filter options
Expand file treeCollapse file tree
Open diff view settings
Collapse file

‎bigframes/bigquery/_operations/ai.py‎

Copy file name to clipboardExpand all lines: bigframes/bigquery/_operations/ai.py
+109-1Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from __future__ import annotations
2020

2121
import json
22-
from typing import Any, Iterable, List, Literal, Mapping, Tuple, Union
22+
from typing import Any, Dict, Iterable, List, Literal, Mapping, Optional, Tuple, Union
2323

2424
import pandas as pd
2525

@@ -28,6 +28,7 @@
2828
from bigframes import series, session
2929
from bigframes.core import convert
3030
from bigframes.core.logging import log_adapter
31+
import bigframes.core.sql.literals
3132
from bigframes.ml import core as ml_core
3233
from bigframes.operations import ai_ops, output_schemas
3334

@@ -388,6 +389,113 @@ def generate_double(
388389
return series_list[0]._apply_nary_op(operator, series_list[1:])
389390

390391

392+
@log_adapter.method_logger(custom_base_name="bigquery_ai")
def generate_embedding(
    model_name: str,
    data: Union[dataframe.DataFrame, series.Series, pd.DataFrame, pd.Series],
    *,
    output_dimensionality: Optional[int] = None,
    task_type: Optional[str] = None,
    start_second: Optional[float] = None,
    end_second: Optional[float] = None,
    interval_seconds: Optional[float] = None,
    trial_id: Optional[int] = None,
) -> dataframe.DataFrame:
    """
    Creates embeddings that describe an entity—for example, a piece of text or an image.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]})
        >>> bbq.ai.generate_embedding(
        ...     "project.dataset.model_name",
        ...     df
        ... ) # doctest: +SKIP

    Args:
        model_name (str):
            The name of a remote model from Vertex AI, such as the
            multimodalembedding@001 model.
        data (bigframes.pandas.DataFrame, bigframes.pandas.Series, pandas.DataFrame, or pandas.Series):
            The data to generate embeddings for. If a Series is provided, it is
            treated as the 'content' column. If a DataFrame is provided, it
            must contain a 'content' column, or you must rename the column you
            wish to embed to 'content'. pandas objects are first loaded with
            ``bigframes.pandas.read_pandas``.
        output_dimensionality (int, optional):
            An INT64 value that specifies the number of dimensions to use when
            generating embeddings. For example, if you specify 256 AS
            output_dimensionality, then the embedding output column contains a
            256-dimensional embedding for each input value. To find the
            supported range of output dimensions, read about the available
            `Google text embedding models <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-text-embeddings#google-models>`_.
        task_type (str, optional):
            A STRING literal that specifies the intended downstream application to
            help the model produce better quality embeddings. For a list of
            supported task types and how to choose which one to use, see `Choose an
            embeddings task type <https://docs.cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types>`_.
        start_second (float, optional):
            The second in the video at which to start the embedding. The default value is 0.
        end_second (float, optional):
            The second in the video at which to end the embedding. The default value is 120.
        interval_seconds (float, optional):
            The interval to use when creating embeddings. The default value is 16.
        trial_id (int, optional):
            An INT64 value that identifies the hyperparameter tuning trial that
            you want the function to evaluate. The function uses the optimal
            trial by default. Only specify this argument if you ran
            hyperparameter tuning when creating the model.

    Returns:
        bigframes.pandas.DataFrame:
            A new DataFrame with the generated embeddings. See the `SQL
            reference for AI.GENERATE_EMBEDDING
            <https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-ai-generate-embedding#output>`_
            for details.

    Raises:
        ValueError:
            If ``data`` is not a BigFrames or pandas DataFrame or Series.
    """
    if isinstance(data, (pd.DataFrame, pd.Series)):
        data = bpd.read_pandas(data)

    if isinstance(data, series.Series):
        # Rename a copy so the caller's Series keeps its original name.
        data = data.copy()
        data.name = "content"
        data_df = data.to_frame()
    elif isinstance(data, dataframe.DataFrame):
        data_df = data
    else:
        raise ValueError(f"Unsupported data type: {type(data)}")

    # We need to get the SQL for the input data to pass as a subquery to the TVF
    source_sql = data_df.sql

    # Only options the caller set explicitly are forwarded, so the function's
    # own defaults apply to everything else.
    struct_fields: Dict[str, bigframes.core.sql.literals.STRUCT_VALUES] = {}
    if output_dimensionality is not None:
        struct_fields["OUTPUT_DIMENSIONALITY"] = output_dimensionality
    if task_type is not None:
        struct_fields["TASK_TYPE"] = task_type
    if start_second is not None:
        struct_fields["START_SECOND"] = start_second
    if end_second is not None:
        struct_fields["END_SECOND"] = end_second
    if interval_seconds is not None:
        struct_fields["INTERVAL_SECONDS"] = interval_seconds
    if trial_id is not None:
        struct_fields["TRIAL_ID"] = trial_id

    # Construct the TVF query. The single closing parenthesis after the STRUCT
    # argument ends the AI.GENERATE_EMBEDDING(...) call; a second one would
    # leave the statement unbalanced.
    query = f"""
    SELECT *
    FROM AI.GENERATE_EMBEDDING(
        MODEL `{model_name}`,
        ({source_sql}),
        {bigframes.core.sql.literals.struct_literal(struct_fields)}
    )
    """

    return data_df._session.read_gbq(query)
498+
391499
@log_adapter.method_logger(custom_base_name="bigquery_ai")
392500
def if_(
393501
prompt: PROMPT_TYPE,
Collapse file

‎bigframes/bigquery/ai.py‎

Copy file name to clipboardExpand all lines: bigframes/bigquery/ai.py
+2Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
generate,
2323
generate_bool,
2424
generate_double,
25+
generate_embedding,
2526
generate_int,
2627
if_,
2728
score,
@@ -33,6 +34,7 @@
3334
"generate",
3435
"generate_bool",
3536
"generate_double",
37+
"generate_embedding",
3638
"generate_int",
3739
"if_",
3840
"score",
Collapse file

‎bigframes/core/sql/literals.py‎

Copy file name to clipboard
+58Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import collections.abc
18+
import json
19+
from typing import Any, List, Mapping, Union
20+
21+
import bigframes.core.sql
22+
23+
# Value types accepted for a single option of a rendered STRUCT literal.
STRUCT_VALUES = Union[
    str, int, float, bool, Mapping[str, str], List[str], Mapping[str, Any]
]
# Mapping from option name to option value, rendered by ``struct_literal``.
STRUCT_TYPE = Mapping[str, STRUCT_VALUES]


def _render_struct_value(name: str, value: Any) -> str:
    """Render one option value as SQL text, without its ``AS <name>`` alias."""
    if name == "model_params":
        # This option is always emitted as a JSON literal; single quotes are
        # doubled so they survive inside the SQL string literal.
        escaped_json = json.dumps(value).replace("'", "''")
        return f"JSON'{escaped_json}'"

    if isinstance(value, collections.abc.Mapping):
        nested_fields = ", ".join(
            f"{bigframes.core.sql.simple_literal(field_value)} AS {field_name}"
            for field_name, field_value in value.items()
        )
        return f"STRUCT({nested_fields})"

    if isinstance(value, list):
        elements = ", ".join(
            bigframes.core.sql.simple_literal(element) for element in value
        )
        return "[" + elements + "]"

    if isinstance(value, bool):
        return str(value).lower()

    return bigframes.core.sql.simple_literal(value)


def struct_literal(struct_options: STRUCT_TYPE) -> str:
    """Render a mapping of options as a SQL ``STRUCT(<value> AS <name>, ...)``.

    Args:
        struct_options:
            Option names mapped to their values. The ``model_params`` option is
            rendered as a ``JSON'...'`` literal, other mappings as a nested
            ``STRUCT(...)``, lists as ``[...]``, booleans as ``true``/``false``,
            and everything else through ``bigframes.core.sql.simple_literal``.

    Returns:
        The STRUCT expression as a string; ``STRUCT()`` when there are no
        options.
    """
    aliased_fields = ", ".join(
        f"{_render_struct_value(option_name, option_value)} AS {option_name}"
        for option_name, option_value in struct_options.items()
    )
    return f"STRUCT({aliased_fields})"
Collapse file

‎bigframes/core/sql/ml.py‎

Copy file name to clipboardExpand all lines: bigframes/core/sql/ml.py
+2-32Lines changed: 2 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,11 @@
1414

1515
from __future__ import annotations
1616

17-
import collections.abc
18-
import json
1917
from typing import Any, Dict, List, Mapping, Optional, Union
2018

2119
import bigframes.core.compile.googlesql as googlesql
2220
import bigframes.core.sql
21+
import bigframes.core.sql.literals
2322

2423

2524
def create_model_ddl(
@@ -109,36 +108,7 @@ def _build_struct_sql(
109108
) -> str:
110109
if not struct_options:
111110
return ""
112-
113-
rendered_options = []
114-
for option_name, option_value in struct_options.items():
115-
if option_name == "model_params":
116-
json_str = json.dumps(option_value)
117-
# Escape single quotes for SQL string literal
118-
sql_json_str = json_str.replace("'", "''")
119-
rendered_val = f"JSON'{sql_json_str}'"
120-
elif isinstance(option_value, collections.abc.Mapping):
121-
struct_body = ", ".join(
122-
[
123-
f"{bigframes.core.sql.simple_literal(v)} AS {k}"
124-
for k, v in option_value.items()
125-
]
126-
)
127-
rendered_val = f"STRUCT({struct_body})"
128-
elif isinstance(option_value, list):
129-
rendered_val = (
130-
"["
131-
+ ", ".join(
132-
[bigframes.core.sql.simple_literal(v) for v in option_value]
133-
)
134-
+ "]"
135-
)
136-
elif isinstance(option_value, bool):
137-
rendered_val = str(option_value).lower()
138-
else:
139-
rendered_val = bigframes.core.sql.simple_literal(option_value)
140-
rendered_options.append(f"{rendered_val} AS {option_name}")
141-
return f", STRUCT({', '.join(rendered_options)})"
111+
return f", {bigframes.core.sql.literals.struct_literal(struct_options)}"
142112

143113

144114
def evaluate(
Collapse file

‎tests/unit/bigquery/test_ai.py‎

Copy file name to clipboard
+134Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from unittest import mock
16+
17+
import pandas as pd
18+
import pytest
19+
20+
import bigframes.bigquery as bbq
21+
import bigframes.dataframe
22+
import bigframes.series
23+
import bigframes.session
24+
25+
26+
@pytest.fixture
def mock_session():
    """Provide an autospecced stand-in for ``bigframes.session.Session``."""
    session_double = mock.create_autospec(spec=bigframes.session.Session)
    return session_double
29+
30+
31+
@pytest.fixture
def mock_dataframe(mock_session):
    """Provide an autospecced DataFrame bound to ``mock_session`` with fixed SQL."""
    frame_double = mock.create_autospec(spec=bigframes.dataframe.DataFrame)
    # The function under test reads these two attributes from its input.
    frame_double.sql = "SELECT * FROM my_table"
    frame_double._session = mock_session
    return frame_double
37+
38+
39+
@pytest.fixture
def mock_series(mock_session):
    """Provide an autospecced Series whose ``to_frame()`` yields a mock DataFrame."""
    # Frame that the series converts into; it carries the SQL used in the query.
    frame_double = mock.create_autospec(spec=bigframes.dataframe.DataFrame)
    frame_double._session = mock_session
    frame_double.sql = "SELECT my_col AS content FROM my_table"

    series_double = mock.create_autospec(spec=bigframes.series.Series)
    series_double._session = mock_session
    # copy() hands back the same mock so the configured to_frame() is the one
    # that ends up being called.
    series_double.copy.return_value = series_double
    series_double.to_frame.return_value = frame_double
    return series_double
50+
51+
52+
def test_generate_embedding_with_dataframe(mock_dataframe, mock_session):
    """A DataFrame's SQL is used as the subquery and options become a STRUCT."""
    model_name = "project.dataset.model"

    bbq.ai.generate_embedding(model_name, mock_dataframe, output_dimensionality=256)

    mock_session.read_gbq.assert_called_once()
    positional_args, _ = mock_session.read_gbq.call_args

    # Collapse whitespace runs so the checks do not depend on SQL formatting.
    normalized_sql = " ".join(positional_args[0].split())

    expected_fragments = (
        "SELECT * FROM AI.GENERATE_EMBEDDING(",
        f"MODEL `{model_name}`,",
        "(SELECT * FROM my_table),",
        "STRUCT(256 AS OUTPUT_DIMENSIONALITY)",
    )
    for fragment in expected_fragments:
        assert fragment in normalized_sql
76+
77+
78+
def test_generate_embedding_with_series(mock_series, mock_session):
    """A Series input is converted to a frame and video options are forwarded."""
    model_name = "project.dataset.model"

    bbq.ai.generate_embedding(
        model_name,
        mock_series,
        start_second=0.0,
        end_second=10.0,
        interval_seconds=5.0,
    )

    mock_session.read_gbq.assert_called_once()
    positional_args, _ = mock_session.read_gbq.call_args
    # Collapse whitespace runs so the checks do not depend on SQL formatting.
    normalized_sql = " ".join(positional_args[0].split())

    expected_struct = (
        "STRUCT(0.0 AS START_SECOND, 10.0 AS END_SECOND, 5.0 AS INTERVAL_SECONDS)"
    )
    assert f"MODEL `{model_name}`" in normalized_sql
    assert "(SELECT my_col AS content FROM my_table)" in normalized_sql
    assert expected_struct in normalized_sql
95+
96+
97+
def test_generate_embedding_defaults(mock_dataframe, mock_session):
    """With no optional arguments the query carries an empty STRUCT."""
    model_name = "project.dataset.model"

    bbq.ai.generate_embedding(model_name, mock_dataframe)

    mock_session.read_gbq.assert_called_once()
    positional_args, _ = mock_session.read_gbq.call_args
    # Collapse whitespace runs so the checks do not depend on SQL formatting.
    normalized_sql = " ".join(positional_args[0].split())

    for fragment in (f"MODEL `{model_name}`", "STRUCT()"):
        assert fragment in normalized_sql
111+
112+
113+
@mock.patch("bigframes.pandas.read_pandas")
def test_generate_embedding_with_pandas_dataframe(
    read_pandas_mock, mock_dataframe, mock_session
):
    """A pandas DataFrame is passed through ``read_pandas`` before querying."""
    model_name = "project.dataset.model"
    pandas_df = pd.DataFrame({"content": ["test"]})

    # The patched read_pandas hands back a BigFrames DataFrame double.
    read_pandas_mock.return_value = mock_dataframe

    bbq.ai.generate_embedding(model_name, pandas_df)

    read_pandas_mock.assert_called_once()
    # The very same pandas object must have been forwarded, not a copy.
    uploaded_args, _ = read_pandas_mock.call_args
    assert uploaded_args[0] is pandas_df

    mock_session.read_gbq.assert_called_once()

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.