Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

feat: implement confirmation threshold for semantic operators #1251

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions 10 bigframes/_config/compute_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,23 @@ class ComputeOptions:
engine to handle. However, this comes at the cost of increased cost and latency.
extra_query_labels (Dict[str, Any], optional):
Stores additional custom labels for query configuration.
semantic_ops_confirmation_threshold (int, optional):
Guards against unexpected processing of a large number of rows by semantic operators.
If the number of rows exceeds the threshold, the user will be asked to confirm
the operation before it proceeds. The default value is 0. Set the value to None
to turn off the guard.
semantic_ops_threshold_autofail (bool):
Guards against unexpected processing of a large number of rows by semantic operators.
When set to True, the operation automatically fails without asking for user input.
"""

maximum_bytes_billed: Optional[int] = None
enable_multi_query_execution: bool = False
extra_query_labels: Dict[str, Any] = dataclasses.field(
default_factory=dict, init=False
)
semantic_ops_confirmation_threshold: Optional[int] = 0
semantic_ops_threshold_autofail = False

def assign_extra_query_labels(self, **kwargs: Any) -> None:
"""
Expand Down
4 changes: 4 additions & 0 deletions 4 bigframes/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ class QueryComplexityError(RuntimeError):
"""Query plan is too complex to execute."""


class OperationAbortedError(RuntimeError):
    """Raised when an operation is aborted before it runs to completion.

    For example, the semantic-operator confirmation check raises this error
    when the estimated work exceeds the configured threshold and the
    operation is either auto-failed or declined by the user.
    """


class TimeTravelDisabledWarning(Warning):
"""A query was reattempted without time travel."""

Expand Down
80 changes: 62 additions & 18 deletions 80 bigframes/operations/semantics.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@

import numpy as np

import bigframes.core.guid as guid
import bigframes.dtypes as dtypes
from bigframes import dtypes, exceptions
from bigframes.core import guid


class Semantics:
Expand Down Expand Up @@ -53,6 +53,7 @@ def agg(
>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> bpd.options.experiments.semantic_operators = True
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

>>> import bigframes.ml.llm as llm
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
Expand Down Expand Up @@ -115,6 +116,15 @@ def agg(
self._validate_model(model)
columns = self._parse_columns(instruction)

if max_agg_rows <= 1:
raise ValueError(
f"Invalid value for `max_agg_rows`: {max_agg_rows}."
"It must be greater than 1."
)

work_estimate = len(self._df) * int(max_agg_rows / (max_agg_rows - 1))
self._confirm_operation(work_estimate)

df: bigframes.dataframe.DataFrame = self._df.copy()
for column in columns:
if column not in self._df.columns:
Expand All @@ -135,12 +145,6 @@ def agg(
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

if max_agg_rows <= 1:
raise ValueError(
f"Invalid value for `max_agg_rows`: {max_agg_rows}."
"It must be greater than 1."
)

user_instruction = self._format_instruction(instruction, columns)

num_cluster = 1
Expand Down Expand Up @@ -243,6 +247,7 @@ def cluster_by(
>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> bpd.options.experiments.semantic_operators = True
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

>>> import bigframes.ml.llm as llm
>>> model = llm.TextEmbeddingGenerator()
Expand Down Expand Up @@ -296,6 +301,8 @@ def cluster_by(
"It must be greater than 1."
)

self._confirm_operation(len(self._df))

df: bigframes.dataframe.DataFrame = self._df.copy()
embeddings_df = model.predict(df[column])

Expand All @@ -314,6 +321,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> bpd.options.experiments.semantic_operators = True
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

>>> import bigframes.ml.llm as llm
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
Expand Down Expand Up @@ -367,6 +375,8 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

self._confirm_operation(len(self._df))

df: bigframes.dataframe.DataFrame = self._df[columns].copy()
for column in columns:
if df[column].dtype != dtypes.STRING_DTYPE:
Expand Down Expand Up @@ -403,6 +413,7 @@ def map(
>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> bpd.options.experiments.semantic_operators = True
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

>>> import bigframes.ml.llm as llm
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
Expand Down Expand Up @@ -462,6 +473,8 @@ def map(
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

self._confirm_operation(len(self._df))

df: bigframes.dataframe.DataFrame = self._df[columns].copy()
for column in columns:
if df[column].dtype != dtypes.STRING_DTYPE:
Expand Down Expand Up @@ -490,7 +503,6 @@ def join(
other,
instruction: str,
model,
max_rows: int = 1000,
ground_with_google_search: bool = False,
):
"""
Expand All @@ -502,6 +514,7 @@ def join(
>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> bpd.options.experiments.semantic_operators = True
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

>>> import bigframes.ml.llm as llm
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
Expand Down Expand Up @@ -561,12 +574,8 @@ def join(
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

joined_table_rows = len(self._df) * len(other)

if joined_table_rows > max_rows:
raise ValueError(
f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}."
)
work_estimate = len(self._df) * len(other)
self._confirm_operation(work_estimate)

left_columns = []
right_columns = []
Expand Down Expand Up @@ -645,6 +654,7 @@ def search(

>>> import bigframes
>>> bigframes.options.experiments.semantic_operators = True
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

>>> import bigframes.ml.llm as llm
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
Expand Down Expand Up @@ -680,6 +690,8 @@ def search(
if search_column not in self._df.columns:
raise ValueError(f"Column `{search_column}` not found")

self._confirm_operation(len(self._df))

import bigframes.ml.llm as llm

if not isinstance(model, llm.TextEmbeddingGenerator):
Expand Down Expand Up @@ -743,6 +755,7 @@ def top_k(
>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> bpd.options.experiments.semantic_operators = True
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

>>> import bigframes.ml.llm as llm
>>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
Expand Down Expand Up @@ -803,6 +816,9 @@ def top_k(
"details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
)

work_estimate = int(len(self._df) * (len(self._df) - 1) / 2)
self._confirm_operation(work_estimate)

df: bigframes.dataframe.DataFrame = self._df[columns].copy()
column = columns[0]
if df[column].dtype != dtypes.STRING_DTYPE:
Expand Down Expand Up @@ -940,9 +956,8 @@ def sim_join(

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None

>>> import bigframes
>>> bigframes.options.experiments.semantic_operators = True
>>> bpd.options.experiments.semantic_operators = True
>>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

>>> import bigframes.ml.llm as llm
>>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
Expand Down Expand Up @@ -1001,6 +1016,9 @@ def sim_join(
if top_k < 1:
raise ValueError("top_k must be an integer greater than or equal to 1.")

work_estimate = len(self._df) * len(other)
self._confirm_operation(work_estimate)

base_table_embedding_column = guid.generate_guid()
base_table = self._attach_embedding(
other, right_on, base_table_embedding_column, model
Expand Down Expand Up @@ -1072,3 +1090,29 @@ def _validate_model(model):

if not isinstance(model, GeminiTextGenerator):
raise TypeError("Model is not GeminiText Generator")

@staticmethod
def _confirm_operation(row_count: int) -> None:
    """Confirm with the user before running a potentially large operation.

    The estimated amount of work is compared against
    ``bigframes.options.compute.semantic_ops_confirmation_threshold``. When
    the threshold is ``None`` or the estimate does not exceed it, the method
    returns immediately. Otherwise the operation either fails right away
    (when ``semantic_ops_threshold_autofail`` is truthy) or the user is
    prompted on stdin to confirm.

    Args:
        row_count (int):
            Estimated number of rows the operation will process.

    Raises:
        bigframes.exceptions.OperationAbortedError:
            If auto-fail is enabled and the estimate exceeds the threshold,
            or if the user's reply is anything other than "y", "yes" or an
            empty line (case-insensitive, surrounding whitespace ignored).
    """
    import bigframes

    threshold = bigframes.options.compute.semantic_ops_confirmation_threshold

    # A threshold of None disables the guard entirely.
    if threshold is None or row_count <= threshold:
        return

    if bigframes.options.compute.semantic_ops_threshold_autofail:
        raise exceptions.OperationAbortedError(
            f"Operation was cancelled because your work estimate is {row_count} rows, "
            f"which exceeds the threshold {threshold} rows."
        )

    # Separate the prompt out. In IDEs such as VS Code, leaving the prompt in
    # the input function makes it less visible to the end user.
    print(f"This operation will process about {row_count} rows.")
    print(
        "You can raise the confirmation threshold by setting "
        "`bigframes.options.compute.semantic_ops_confirmation_threshold` "
        "to a higher value. To completely turn off the confirmation check, "
        "set the threshold to `None`."
    )
    print("Proceed? [Y/n]")

    # Strip surrounding whitespace so replies such as "y " or " yes" are not
    # mistaken for a refusal; an empty reply accepts the default (yes).
    reply = input().strip().casefold()
    if reply not in {"y", "yes", ""}:
        raise exceptions.OperationAbortedError("Operation was cancelled.")
Loading
Loading
Morty Proxy This is a proxified and sanitized view of the page, visit original site.