diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py
index c8a54fe0b3..21b41eb185 100644
--- a/bigframes/_config/compute_options.py
+++ b/bigframes/_config/compute_options.py
@@ -66,6 +66,14 @@ class ComputeOptions:
             engine to handle. However this comes at the cost of increase cost and latency.
         extra_query_labels (Dict[str, Any], Options):
             Stores additional custom labels for query configuration.
+        semantic_ops_confirmation_threshold (int, optional):
+            Guards against unexpected processing of a large number of rows by semantic operators.
+            If the number of rows exceeds the threshold, the user will be asked to confirm
+            before the operation proceeds. The default value is 0. Set the value to None
+            to turn off the guard.
+        semantic_ops_threshold_autofail (bool):
+            Guards against unexpected processing of a large number of rows by semantic operators.
+            When set to True, the operation automatically fails without asking for user input.
     """

     maximum_bytes_billed: Optional[int] = None
@@ -73,6 +81,8 @@ class ComputeOptions:
     extra_query_labels: Dict[str, Any] = dataclasses.field(
         default_factory=dict, init=False
     )
+    semantic_ops_confirmation_threshold: Optional[int] = 0
+    semantic_ops_threshold_autofail: bool = False

     def assign_extra_query_labels(self, **kwargs: Any) -> None:
         """
diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py
index 27f3508ff4..3cb5f3665d 100644
--- a/bigframes/exceptions.py
+++ b/bigframes/exceptions.py
@@ -59,6 +59,10 @@ class QueryComplexityError(RuntimeError):
     """Query plan is too complex to execute."""


+class OperationAbortedError(RuntimeError):
+    """Operation is aborted."""
+
+
 class TimeTravelDisabledWarning(Warning):
     """A query was reattempted without time travel."""

diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py
index 79b92afe4f..6a537db4f3 100644
--- a/bigframes/operations/semantics.py
+++ b/bigframes/operations/semantics.py
@@ -20,8 +20,8 @@

 import numpy as np

-import bigframes.core.guid as guid
-import bigframes.dtypes as dtypes
+from bigframes import dtypes, exceptions
+from bigframes.core import guid


 class Semantics:
@@ -53,6 +53,7 @@ def agg(
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
             >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

             >>> import bigframes.ml.llm as llm
             >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -115,6 +116,15 @@ def agg(
         self._validate_model(model)
         columns = self._parse_columns(instruction)

+        if max_agg_rows <= 1:
+            raise ValueError(
+                f"Invalid value for `max_agg_rows`: {max_agg_rows}."
+                " It must be greater than 1."
+            )
+
+        work_estimate = len(self._df) * int(max_agg_rows / (max_agg_rows - 1))
+        self._confirm_operation(work_estimate)
+
         df: bigframes.dataframe.DataFrame = self._df.copy()
         for column in columns:
             if column not in self._df.columns:
@@ -135,12 +145,6 @@
                 "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
             )

-        if max_agg_rows <= 1:
-            raise ValueError(
-                f"Invalid value for `max_agg_rows`: {max_agg_rows}."
-                "It must be greater than 1."
-            )
-
         user_instruction = self._format_instruction(instruction, columns)

         num_cluster = 1
@@ -243,6 +247,7 @@ def cluster_by(
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
             >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

             >>> import bigframes.ml.llm as llm
             >>> model = llm.TextEmbeddingGenerator()
@@ -296,6 +301,8 @@
                 "It must be greater than 1."
             )

+        self._confirm_operation(len(self._df))
+
         df: bigframes.dataframe.DataFrame = self._df.copy()
         embeddings_df = model.predict(df[column])

@@ -314,6 +321,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
             >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

             >>> import bigframes.ml.llm as llm
             >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -367,6 +375,8 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals
                 "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
             )

+        self._confirm_operation(len(self._df))
+
         df: bigframes.dataframe.DataFrame = self._df[columns].copy()
         for column in columns:
             if df[column].dtype != dtypes.STRING_DTYPE:
@@ -403,6 +413,7 @@ def map(
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
             >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

             >>> import bigframes.ml.llm as llm
             >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -462,6 +473,8 @@ def map(
                 "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
             )

+        self._confirm_operation(len(self._df))
+
         df: bigframes.dataframe.DataFrame = self._df[columns].copy()
         for column in columns:
             if df[column].dtype != dtypes.STRING_DTYPE:
@@ -490,7 +503,6 @@ def join(
         other,
         instruction: str,
         model,
-        max_rows: int = 1000,
         ground_with_google_search: bool = False,
     ):
         """
@@ -502,6 +514,7 @@
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
             >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

             >>> import bigframes.ml.llm as llm
             >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -561,12 +574,8 @@
                 "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
             )

-        joined_table_rows = len(self._df) * len(other)
-
-        if joined_table_rows > max_rows:
-            raise ValueError(
-                f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}."
-            )
+        work_estimate = len(self._df) * len(other)
+        self._confirm_operation(work_estimate)

         left_columns = []
         right_columns = []
@@ -645,6 +654,7 @@ def search(
             >>> import bigframes
             >>> bigframes.options.experiments.semantic_operators = True
+            >>> bigframes.options.compute.semantic_ops_confirmation_threshold = 25

             >>> import bigframes.ml.llm as llm
             >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
@@ -680,6 +690,8 @@
         if search_column not in self._df.columns:
             raise ValueError(f"Column `{search_column}` not found")

+        self._confirm_operation(len(self._df))
+
         import bigframes.ml.llm as llm

         if not isinstance(model, llm.TextEmbeddingGenerator):
@@ -743,6 +755,7 @@ def top_k(
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
             >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

             >>> import bigframes.ml.llm as llm
             >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")
@@ -803,6 +816,9 @@
                 "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models"
             )

+        work_estimate = int(len(self._df) * (len(self._df) - 1) / 2)
+        self._confirm_operation(work_estimate)
+
         df: bigframes.dataframe.DataFrame = self._df[columns].copy()
         column = columns[0]
         if df[column].dtype != dtypes.STRING_DTYPE:
@@ -940,9 +956,8 @@ def sim_join(
             >>> import bigframes.pandas as bpd
             >>> bpd.options.display.progress_bar = None
-
-            >>> import bigframes
-            >>> bigframes.options.experiments.semantic_operators = True
+            >>> bpd.options.experiments.semantic_operators = True
+            >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25

             >>> import bigframes.ml.llm as llm
             >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
@@ -1001,6 +1016,9 @@
         if top_k < 1:
             raise ValueError("top_k must be an integer greater than or equal to 1.")

+        work_estimate = len(self._df) * len(other)
+        self._confirm_operation(work_estimate)
+
         base_table_embedding_column = guid.generate_guid()
         base_table = self._attach_embedding(
             other, right_on, base_table_embedding_column, model
@@ -1072,3 +1090,29 @@

         if not isinstance(model, GeminiTextGenerator):
             raise TypeError("Model is not GeminiText Generator")
+
+    @staticmethod
+    def _confirm_operation(row_count: int):
+        """Raises OperationAbortedError when the confirmation fails."""
+        import bigframes
+
+        threshold = bigframes.options.compute.semantic_ops_confirmation_threshold
+
+        if threshold is None or row_count <= threshold:
+            return
+
+        if bigframes.options.compute.semantic_ops_threshold_autofail:
+            raise exceptions.OperationAbortedError(
+                f"Operation was cancelled because the work estimate of {row_count} rows exceeds the threshold of {threshold} rows."
+            )
+
+        # Print the prompt separately: in IDEs such as VS Code, a prompt passed
+        # directly to the input function is less visible to the end user.
+        print(f"This operation will process about {row_count} rows.")
+        print(
+            "You can raise the confirmation threshold by setting `bigframes.options.compute.semantic_ops_confirmation_threshold` to a higher value. To completely turn off the confirmation check, set the threshold to `None`."
+        )
+        print("Proceed? [Y/n]")
+        reply = input().casefold()
+        if reply not in {"y", "yes", ""}:
+            raise exceptions.OperationAbortedError("Operation was cancelled.")
diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb
index 374236e152..8a2f083419 100644
--- a/notebooks/experimental/semantic_operators.ipynb
+++ b/notebooks/experimental/semantic_operators.ipynb
@@ -153,7 +153,43 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# API Syntax"
+    "*Note*: Semantic operators can be expensive over large sets of data. As a result, our team added the option `bigframes.options.compute.semantic_ops_confirmation_threshold` in version 1.31.0 so that BigQuery DataFrames will ask for your confirmation when the amount of data to be processed is too large. If the number of rows exceeds your threshold, you will see a prompt for your keyboard input -- 'y' to proceed and 'n' to abort. If you abort the operation, no LLM processing will be done.\n",
+    "\n",
+    "The default threshold is 0, which means the operators will always ask for confirmation. You are free to adjust the value as needed. You can also set the threshold to `None` to disable this feature."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n",
+    "    bigframes.options.compute.semantic_ops_confirmation_threshold = 100"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you would like your operations to fail automatically when the data is too large, set `bigframes.options.compute.semantic_ops_threshold_autofail` to `True`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n",
+    "#     bigframes.options.compute.semantic_ops_threshold_autofail = True"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# The API"
    ]
   },
   {
@@ -181,7 +217,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -239,7 +275,7 @@
        "[3 rows x 2 columns]"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -263,7 +299,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -317,7 +353,7 @@
        "[1 rows x 2 columns]"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -351,7 +387,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -409,7 +445,7 @@
        "[3 rows x 2 columns]"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -431,7 +467,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -504,7 +540,7 @@
        "[3 rows x 3 columns]"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -531,7 +567,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -548,7 +584,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -620,7 +656,7 @@
        "[4 rows x 2 columns]"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -633,7 +669,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "!! **Important:** Semantic join can trigger probihitively expensitve operations! This operation first cross joins two dataframes, then invokes semantic filter on each row. That means if you have two dataframes of sizes `M` and `N`, the total amount of queries sent to the LLM is on the scale of `M * N`. Therefore, our team has added a parameter `max_rows`, a threshold that guards against unexpected expensive calls. With this parameter, the operator first calculates the size of your cross-joined data, and compares it with the threshold. If the size exceeds your threshold, the fuction will abort early with a `ValueError`. You can manually set the value of `max_rows` to raise or lower the threshold."
+    "!! **Important:** Semantic join can trigger prohibitively expensive operations! This operation first cross joins two dataframes, then invokes semantic filter on each row. That means if you have two dataframes of sizes `M` and `N`, the total number of queries sent to the LLM is on the scale of `M * N`. "
    ]
   },
   {
@@ -654,7 +690,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -670,7 +706,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -754,7 +790,7 @@
        "[6 rows x 2 columns]"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -781,7 +817,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
@@ -855,7 +891,7 @@
        "[7 rows x 1 columns]"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -884,7 +920,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -903,7 +939,7 @@
        "Name: Movies, dtype: string"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -936,7 +972,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -952,7 +988,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
@@ -1009,7 +1045,7 @@
        "[2 rows x 1 columns]"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1041,7 +1077,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -1105,7 +1141,7 @@
        "[5 rows x 1 columns]"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1124,7 +1160,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -1188,7 +1224,7 @@
        "[2 rows x 2 columns]"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1222,7 +1258,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1239,7 +1275,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
@@ -1325,7 +1361,7 @@
        "[5 rows x 3 columns]"
       ]
      },
-     "execution_count": 22,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "!! **Important** Like semantic join, this operator can also be very expensive. To guard against unexpected processing of large dataset, use the `max_rows` parameter to specify a threshold. "
+    "!! **Important** Like semantic join, this operator can also be very expensive. To guard against unexpected processing of large datasets, use the `bigframes.options.compute.semantic_ops_confirmation_threshold` option to specify a threshold. "
    ]
   },
   {
@@ -1357,7 +1393,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1373,7 +1409,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
@@ -1415,17 +1451,17 @@
       "
I'm... banashark \n", - "321 Default search scope is an option in the Finde... kitsunesoba \n", - "390 Orthogonality and biology aren't friends. agumonkey \n", - "395 I chose some random physics book that was good... prawn \n", - "423 Seeing this get huge on Twitter. It's the... shenanigoat \n", - "427 Looking through the comments there are a numbe... moomin \n", - "428 Legacy media is a tough business. GBTC is payi... arcticbull \n", - "435 Same thing if you sell unsafe food, yet we hav... jabradoodle \n", - "437 There was briefly a thing called HSCSD ("... LeoPanthera \n", - "445 > This article is a bit comical to read and... lapcat \n", - "452 Large positions are most likely sold off in sm... meowkit \n", - "506 A US-based VPN (or really any VPN) is only goi... RandomBacon \n", - "542 rationale for the tabo... mechanical_fish \n", + "312 Do you have any reference for this?
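
For readers of this change, here is a minimal usage sketch of the confirmation guard wired up above. It is not part of the diff: the `df.semantics.filter` accessor call and the sample data are assumptions for illustration, while the option names, the exception, and the autofail behavior come from `bigframes/_config/compute_options.py` and `bigframes/operations/semantics.py` in this change.

```python
# Minimal sketch, assuming a configured BigQuery DataFrames session and that
# semantic operators are reached through the (assumed) `df.semantics` accessor.
import bigframes.pandas as bpd
import bigframes.ml.llm as llm
from bigframes.exceptions import OperationAbortedError

bpd.options.experiments.semantic_operators = True

# Guard any semantic operator whose work estimate exceeds one row...
bpd.options.compute.semantic_ops_confirmation_threshold = 1
# ...and fail immediately rather than blocking on input(), which suits
# non-interactive jobs where a hidden prompt would hang forever.
bpd.options.compute.semantic_ops_threshold_autofail = True

df = bpd.DataFrame({"city": ["Seattle", "Berlin"]})  # work estimate: 2 rows
model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")

try:
    # filter() calls _confirm_operation(len(df)); 2 rows exceed the threshold
    # of 1, so with autofail set this raises instead of prompting.
    df.semantics.filter("The {city} is in Europe", model)
except OperationAbortedError as exc:
    print(exc)
```

Interactive users would instead leave `semantic_ops_threshold_autofail` unset and answer the `Proceed? [Y/n]` prompt; an empty reply counts as yes.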