diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index c8a54fe0b3..21b41eb185 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -66,6 +66,14 @@ class ComputeOptions: engine to handle. However this comes at the cost of increase cost and latency. extra_query_labels (Dict[str, Any], Options): Stores additional custom labels for query configuration. + semantic_ops_confirmation_threshold (int, optional): + Guards against unexpected processing of a large number of rows by semantic operators. + If the number of rows exceeds the threshold, the user will be asked to confirm + the operation before it proceeds. The default value is 0. Set the value to None + to turn off the guard. + semantic_ops_threshold_autofail (bool): + Guards against unexpected processing of a large number of rows by semantic operators. + When set to True, the operation automatically fails without asking for user input. """ maximum_bytes_billed: Optional[int] = None @@ -73,6 +81,8 @@ class ComputeOptions: extra_query_labels: Dict[str, Any] = dataclasses.field( default_factory=dict, init=False ) + semantic_ops_confirmation_threshold: Optional[int] = 0 + semantic_ops_threshold_autofail: bool = False def assign_extra_query_labels(self, **kwargs: Any) -> None: """ diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 27f3508ff4..3cb5f3665d 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -59,6 +59,10 @@ class QueryComplexityError(RuntimeError): """Query plan is too complex to execute.""" +class OperationAbortedError(RuntimeError): + """Operation is aborted.""" + + class TimeTravelDisabledWarning(Warning): """A query was reattempted without time travel.""" diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 79b92afe4f..6a537db4f3 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -20,8 +20,8 @@ import numpy as np -import bigframes.core.guid as guid -import bigframes.dtypes as dtypes +from bigframes import dtypes, exceptions +from bigframes.core import guid class Semantics: @@ -53,6 +53,7 @@ def agg( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -115,6 +116,15 @@ def agg( self._validate_model(model) columns = self._parse_columns(instruction) + if max_agg_rows <= 1: + raise ValueError( + f"Invalid value for `max_agg_rows`: {max_agg_rows}. " + "It must be greater than 1." + ) + + work_estimate = len(self._df) * int(max_agg_rows / (max_agg_rows - 1)) + self._confirm_operation(work_estimate) + df: bigframes.dataframe.DataFrame = self._df.copy() for column in columns: if column not in self._df.columns: @@ -135,12 +145,6 @@ def agg( "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) - if max_agg_rows <= 1: - raise ValueError( - f"Invalid value for `max_agg_rows`: {max_agg_rows}." - "It must be greater than 1." 
- ) - user_instruction = self._format_instruction(instruction, columns) num_cluster = 1 @@ -243,6 +247,7 @@ def cluster_by( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.TextEmbeddingGenerator() @@ -296,6 +301,8 @@ def cluster_by( "It must be greater than 1." ) + self._confirm_operation(len(self._df)) + df: bigframes.dataframe.DataFrame = self._df.copy() embeddings_df = model.predict(df[column]) @@ -314,6 +321,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -367,6 +375,8 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) + self._confirm_operation(len(self._df)) + df: bigframes.dataframe.DataFrame = self._df[columns].copy() for column in columns: if df[column].dtype != dtypes.STRING_DTYPE: @@ -403,6 +413,7 @@ def map( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -462,6 +473,8 @@ def map( "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) + self._confirm_operation(len(self._df)) + df: bigframes.dataframe.DataFrame = self._df[columns].copy() for column in columns: if df[column].dtype != dtypes.STRING_DTYPE: @@ -490,7 +503,6 @@ def join( other, instruction: str, model, - max_rows: int = 1000, ground_with_google_search: bool = False, ): """ @@ -502,6 +514,7 @@ def join( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -561,12 +574,8 @@ def join( "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) - joined_table_rows = len(self._df) * len(other) - - if joined_table_rows > max_rows: - raise ValueError( - f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}." 
- ) + work_estimate = len(self._df) * len(other) + self._confirm_operation(work_estimate) left_columns = [] right_columns = [] @@ -645,6 +654,7 @@ def search( >>> import bigframes >>> bigframes.options.experiments.semantic_operators = True + >>> bigframes.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") @@ -680,6 +690,8 @@ def search( if search_column not in self._df.columns: raise ValueError(f"Column `{search_column}` not found") + self._confirm_operation(len(self._df)) + import bigframes.ml.llm as llm if not isinstance(model, llm.TextEmbeddingGenerator): @@ -743,6 +755,7 @@ def top_k( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -803,6 +816,9 @@ def top_k( "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) + work_estimate = int(len(self._df) * (len(self._df) - 1) / 2) + self._confirm_operation(work_estimate) + df: bigframes.dataframe.DataFrame = self._df[columns].copy() column = columns[0] if df[column].dtype != dtypes.STRING_DTYPE: @@ -940,9 +956,8 @@ def sim_join( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - - >>> import bigframes - >>> bigframes.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") @@ -1001,6 +1016,9 @@ def sim_join( if top_k < 1: raise ValueError("top_k must be an integer greater than or equal to 1.") + work_estimate = len(self._df) * len(other) + self._confirm_operation(work_estimate) + base_table_embedding_column = guid.generate_guid() base_table = self._attach_embedding( other, right_on, base_table_embedding_column, model ) @@ -1072,3 +1090,29 @@ def _validate_model(model): if not isinstance(model, GeminiTextGenerator): raise TypeError("Model is not GeminiText Generator") + + @staticmethod + def _confirm_operation(row_count: int): + """Raises OperationAbortedError when the confirmation fails.""" + import bigframes + + threshold = bigframes.options.compute.semantic_ops_confirmation_threshold + + if threshold is None or row_count <= threshold: + return + + if bigframes.options.compute.semantic_ops_threshold_autofail: + raise exceptions.OperationAbortedError( + f"Operation was cancelled because your work estimate is {row_count} rows, which exceeds the threshold of {threshold} rows." + ) + + # Separate the prompt out. In IDEs such as VS Code, leaving the prompt in the + # input function makes it less visible to the end user. + print(f"This operation will process about {row_count} rows.") + print( + "You can raise the confirmation threshold by setting `bigframes.options.compute.semantic_ops_confirmation_threshold` to a higher value. To completely turn off the confirmation check, set the threshold to `None`." + ) + print("Proceed? 
[Y/n]") + reply = input().casefold() + if reply not in {"y", "yes", ""}: + raise exceptions.OperationAbortedError("Operation was cancelled.") diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb index 374236e152..8a2f083419 100644 --- a/notebooks/experimental/semantic_operators.ipynb +++ b/notebooks/experimental/semantic_operators.ipynb @@ -153,7 +153,43 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# API Syntax" + "*Note*: semantic operators can be expensive over large datasets. As a result, our team added the option `bigframes.options.compute.semantic_ops_confirmation_threshold` in version 1.31.0 so that BigQuery DataFrames will ask for your confirmation if the amount of data to be processed is too large. If the number of rows exceeds your threshold, you will see a prompt for your keyboard input -- 'y' to proceed and 'n' to abort. If you abort the operation, no LLM processing will be done.\n", + "\n", + "The default threshold is 0, which means the operators will always ask for confirmation. You are free to adjust the value as needed. You can also set the threshold to `None` to disable this feature." ] }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n", + " bigframes.options.compute.semantic_ops_confirmation_threshold = 100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you would like your operations to fail automatically when the data is too large, set `bigframes.options.compute.semantic_ops_threshold_autofail` to `True`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n", + "# bigframes.options.compute.semantic_ops_threshold_autofail = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The API" ] }, { @@ -181,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -239,7 +275,7 @@ "[3 rows x 2 columns]" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -263,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -317,7 +353,7 @@ "[1 rows x 2 columns]" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -351,7 +387,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -409,7 +445,7 @@ "[3 rows x 2 columns]" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -431,7 +467,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -504,7 +540,7 @@ "[3 rows x 3 columns]" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -531,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -548,7 +584,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -620,7 +656,7 @@ "[4 rows x 2 columns]" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -633,7 +669,7 @@ 
"cell_type": "markdown", "metadata": {}, "source": [ - "!! **Important:** Semantic join can trigger probihitively expensitve operations! This operation first cross joins two dataframes, then invokes semantic filter on each row. That means if you have two dataframes of sizes `M` and `N`, the total amount of queries sent to the LLM is on the scale of `M * N`. Therefore, our team has added a parameter `max_rows`, a threshold that guards against unexpected expensive calls. With this parameter, the operator first calculates the size of your cross-joined data, and compares it with the threshold. If the size exceeds your threshold, the fuction will abort early with a `ValueError`. You can manually set the value of `max_rows` to raise or lower the threshold." + "!! **Important:** Semantic join can trigger probihitively expensitve operations! This operation first cross joins two dataframes, then invokes semantic filter on each row. That means if you have two dataframes of sizes `M` and `N`, the total amount of queries sent to the LLM is on the scale of `M * N`. " ] }, { @@ -654,7 +690,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -670,7 +706,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -754,7 +790,7 @@ "[6 rows x 2 columns]" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -781,7 +817,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -855,7 +891,7 @@ "[7 rows x 1 columns]" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -884,7 +920,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -903,7 +939,7 @@ "Name: Movies, dtype: string" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -936,7 +972,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -952,7 +988,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -1009,7 +1045,7 @@ "[2 rows x 1 columns]" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1041,7 +1077,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1105,7 +1141,7 @@ "[5 rows x 1 columns]" ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1124,7 +1160,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1188,7 +1224,7 @@ "[2 rows x 2 columns]" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1222,7 +1258,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -1239,7 +1275,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1325,7 +1361,7 @@ "[5 rows x 3 columns]" ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1338,7 +1374,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "!! 
**Important** Like semantic join, this operator can also be very expensive. To guard against unexpected processing of large dataset, use the `max_rows` parameter to specify a threshold. " + "!! **Important** Like semantic join, this operator can also be very expensive. To guard against unexpected processing of a large dataset, use the `bigframes.options.compute.semantic_ops_confirmation_threshold` option to specify a threshold. " ] }, { @@ -1357,7 +1393,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -1373,7 +1409,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1415,17 +1451,17 @@ " \n", " 0\n", " Smartphone\n", - " 2\n", + " 3\n", " \n", " \n", " 1\n", " Laptop\n", - " 2\n", + " 3\n", " \n", " \n", " 2\n", " Coffee Maker\n", - " 2\n", + " 1\n", " \n", " \n", " 3\n", @@ -1444,16 +1480,16 @@ ], "text/plain": [ " Product Cluster ID\n", - "0 Smartphone 2\n", - "1 Laptop 2\n", - "2 Coffee Maker 2\n", + "0 Smartphone 3\n", + "1 Laptop 3\n", + "2 Coffee Maker 1\n", "3 T-shirt 2\n", "4 Jeans 2\n", "\n", "[5 rows x 2 columns]" ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1487,7 +1523,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1836,7 +1872,7 @@ "[3000 rows x 6 columns]" ] }, - "execution_count": 25, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1855,16 +1891,16 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "2555" + "2556" ] }, - "execution_count": 26, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1883,16 +1919,16 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "390.61878669276047" + "390.29068857589976" ] }, - "execution_count": 27, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1901,6 +1937,23 @@ "hacker_news_with_texts['text'].str.len().mean()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Optional] You can raise the confirmation threshold for a smoother experience." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n", + " bigframes.options.compute.semantic_ops_confirmation_threshold = 5000" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1910,9 +1963,16 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 30, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This operation will process about 2556 rows. Proceed? [Y/n]\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ @@ -1961,7 +2021,7 @@ " comment\n", " \n", " \n", - " 419\n", + " 420\n", " <NA>\n", " Well last time I got angry down votes for sayi...\n", " drieddust\n", @@ -1970,7 +2030,7 @@ " comment\n", " \n", " \n", - " 812\n", + " 814\n", " <NA>\n", " New iPhone should be announced on September. L...\n", " meerita\n", @@ -1979,7 +2039,7 @@ " comment\n", " \n", " \n", - " 1512\n", + " 1515\n", " <NA>\n", " Why would this take a week? 
i(phone)OS was ori...\n", " TheOtherHobbes\n", @@ -1988,7 +2048,7 @@ " comment\n", " \n", " \n", - " 1559\n", + " 1562\n", " <NA>\n", " &gt;or because Apple drama brings many clicks?...\n", " weberer\n", @@ -2004,22 +2064,22 @@ "text/plain": [ " title text by \\\n", "9 It doesn’t work on Safari, and WebKit based br... archiewood \n", - "419 Well last time I got angry down votes for sayi... drieddust \n", - "812 New iPhone should be announced on September. L... meerita \n", - "1512 Why would this take a week? i(phone)OS was ori... TheOtherHobbes \n", - "1559 >or because Apple drama brings many clicks?... weberer \n", + "420 Well last time I got angry down votes for sayi... drieddust \n", + "814 New iPhone should be announced on September. L... meerita \n", + "1515 Why would this take a week? i(phone)OS was ori... TheOtherHobbes \n", + "1562 >or because Apple drama brings many clicks?... weberer \n", "\n", " score timestamp type \n", "9 2023-04-21 16:45:13+00:00 comment \n", - "419 2021-01-11 19:27:27+00:00 comment \n", - "812 2019-07-30 20:54:42+00:00 comment \n", - "1512 2021-06-08 09:25:24+00:00 comment \n", - "1559 2022-09-05 13:16:02+00:00 comment \n", + "420 2021-01-11 19:27:27+00:00 comment \n", + "814 2019-07-30 20:54:42+00:00 comment \n", + "1515 2021-06-08 09:25:24+00:00 comment \n", + "1562 2022-09-05 13:16:02+00:00 comment \n", "\n", "[5 rows x 6 columns]" ] }, - "execution_count": 28, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -2051,7 +2111,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -2104,7 +2164,7 @@ " Frustrated, but hopeful.\n", " \n", " \n", - " 419\n", + " 420\n", " <NA>\n", " Well last time I got angry down votes for sayi...\n", " drieddust\n", @@ -2114,7 +2174,7 @@ " Frustrated and angry.\n", " \n", " \n", - " 812\n", + " 814\n", " <NA>\n", " New iPhone should be announced on September. L...\n", " meerita\n", @@ -2124,7 +2184,7 @@ " Excited anticipation.\n", " \n", " \n", - " 1512\n", + " 1515\n", " <NA>\n", " Why would this take a week? i(phone)OS was ori...\n", " TheOtherHobbes\n", @@ -2134,7 +2194,7 @@ " Frustrated, critical, obvious.\n", " \n", " \n", - " 1559\n", + " 1562\n", " <NA>\n", " &gt;or because Apple drama brings many clicks?...\n", " weberer\n", @@ -2151,34 +2211,34 @@ "text/plain": [ " title text by \\\n", "9 It doesn’t work on Safari, and WebKit based br... archiewood \n", - "419 Well last time I got angry down votes for sayi... drieddust \n", - "812 New iPhone should be announced on September. L... meerita \n", - "1512 Why would this take a week? i(phone)OS was ori... TheOtherHobbes \n", - "1559 >or because Apple drama brings many clicks?... weberer \n", + "420 Well last time I got angry down votes for sayi... drieddust \n", + "814 New iPhone should be announced on September. L... meerita \n", + "1515 Why would this take a week? i(phone)OS was ori... TheOtherHobbes \n", + "1562 >or because Apple drama brings many clicks?... weberer \n", "\n", " score timestamp type \\\n", "9 2023-04-21 16:45:13+00:00 comment \n", - "419 2021-01-11 19:27:27+00:00 comment \n", - "812 2019-07-30 20:54:42+00:00 comment \n", - "1512 2021-06-08 09:25:24+00:00 comment \n", - "1559 2022-09-05 13:16:02+00:00 comment \n", + "420 2021-01-11 19:27:27+00:00 comment \n", + "814 2019-07-30 20:54:42+00:00 comment \n", + "1515 2021-06-08 09:25:24+00:00 comment \n", + "1562 2022-09-05 13:16:02+00:00 comment \n", "\n", " sentiment \n", "9 Frustrated, but hopeful. 
\n", " \n", - "419 Frustrated and angry. \n", + "420 Frustrated and angry. \n", " \n", - "812 Excited anticipation. \n", + "814 Excited anticipation. \n", " \n", - "1512 Frustrated, critical, obvious. \n", + "1515 Frustrated, critical, obvious. \n", " \n", - "1559 Negative, clickbait, Apple. \n", + "1562 Negative, clickbait, Apple. \n", " \n", "\n", "[5 rows x 7 columns]" ] }, - "execution_count": 29, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -2196,14 +2256,14 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3577: UserWarning: Reading cached table from 2024-12-27 21:39:10.129973+00:00 to avoid incompatibilies with previous reads of this table. To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3577: UserWarning: Reading cached table from 2025-01-03 01:18:29.080474+00:00 to avoid incompatibilies with previous reads of this table. To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n", " exec(code_obj, self.user_global_ns, self.user_ns)\n" ] }, @@ -2553,7 +2613,7 @@ "[3000 rows x 6 columns]" ] }, - "execution_count": 30, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -2565,9 +2625,16 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 33, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This operation will process about 3000 rows. Proceed? [Y/n]\n" + ] + }, { "name": "stderr", "output_type": "stream", @@ -2643,7 +2710,7 @@ " comment\n", " \n", " \n", - " 208\n", + " 209\n", " <NA>\n", " I like the idea of moving that arrow the way h...\n", " rattray\n", @@ -2652,7 +2719,7 @@ " comment\n", " \n", " \n", - " 227\n", + " 228\n", " <NA>\n", " I don&#x27;t understand why a beginner would s...\n", " wolco\n", @@ -2661,7 +2728,7 @@ " comment\n", " \n", " \n", - " 289\n", + " 290\n", " <NA>\n", " I leaerned more with one minute of this than a...\n", " agumonkey\n", @@ -2670,7 +2737,7 @@ " comment\n", " \n", " \n", - " 302\n", + " 303\n", " <NA>\n", " I've suggested a <i>rationale</i> for the tabo...\n", " mechanical_fish\n", @@ -2679,7 +2746,7 @@ " comment\n", " \n", " \n", - " 311\n", + " 312\n", " <NA>\n", " Do you have any reference for this?<p>I&#x27;m...\n", " banashark\n", @@ -2688,7 +2755,7 @@ " comment\n", " \n", " \n", - " 321\n", + " 322\n", " <NA>\n", " Default search scope is an option in the Finde...\n", " kitsunesoba\n", @@ -2697,7 +2764,7 @@ " comment\n", " \n", " \n", - " 390\n", + " 391\n", " <NA>\n", " Orthogonality and biology aren&#x27;t friends.\n", " agumonkey\n", @@ -2706,7 +2773,7 @@ " comment\n", " \n", " \n", - " 395\n", + " 396\n", " <NA>\n", " I chose some random physics book that was good...\n", " prawn\n", @@ -2715,7 +2782,7 @@ " comment\n", " \n", " \n", - " 423\n", + " 424\n", " <NA>\n", " Seeing this get huge on Twitter. 
It&#x27;s the...\n", " shenanigoat\n", @@ -2724,7 +2791,7 @@ " comment\n", " \n", " \n", - " 427\n", + " 428\n", " <NA>\n", " Looking through the comments there are a numbe...\n", " moomin\n", @@ -2733,7 +2800,7 @@ " comment\n", " \n", " \n", - " 428\n", + " 429\n", " <NA>\n", " Legacy media is a tough business. GBTC is payi...\n", " arcticbull\n", @@ -2742,7 +2809,7 @@ " comment\n", " \n", " \n", - " 435\n", + " 436\n", " <NA>\n", " Same thing if you sell unsafe food, yet we hav...\n", " jabradoodle\n", @@ -2751,7 +2818,7 @@ " comment\n", " \n", " \n", - " 437\n", + " 438\n", " <NA>\n", " There was briefly a thing called HSCSD (&quot;...\n", " LeoPanthera\n", @@ -2760,7 +2827,7 @@ " comment\n", " \n", " \n", - " 445\n", + " 446\n", " <NA>\n", " &gt; This article is a bit comical to read and...\n", " lapcat\n", @@ -2769,7 +2836,7 @@ " comment\n", " \n", " \n", - " 452\n", + " 453\n", " <NA>\n", " Large positions are most likely sold off in sm...\n", " meowkit\n", @@ -2778,7 +2845,7 @@ " comment\n", " \n", " \n", - " 506\n", + " 507\n", " <NA>\n", " A US-based VPN (or really any VPN) is only goi...\n", " RandomBacon\n", @@ -2787,7 +2854,7 @@ " comment\n", " \n", " \n", - " 542\n", + " 543\n", " <NA>\n", " <a href=\"https:&#x2F;&#x2F;codeberg.org&#x2F;A...\n", " ElectronBadger\n", @@ -2796,7 +2863,7 @@ " comment\n", " \n", " \n", - " 564\n", + " 565\n", " <NA>\n", " It’s much harder for people without hands to w...\n", " Aeolun\n", @@ -2805,7 +2872,7 @@ " comment\n", " \n", " \n", - " 611\n", + " 612\n", " <NA>\n", " So by using ADMIN_SL0T instead was it just set...\n", " minitoar\n", @@ -2814,7 +2881,7 @@ " comment\n", " \n", " \n", - " 658\n", + " 660\n", " <NA>\n", " Outstanding!\n", " cafard\n", @@ -2823,7 +2890,7 @@ " comment\n", " \n", " \n", - " 671\n", + " 673\n", " <NA>\n", " On the other hand, something can be said for &...\n", " babby\n", @@ -2842,87 +2909,87 @@ "98 \n", "137 FDA reverses marketing ban on Juul e-cigarettes \n", "188 \n", - "208 \n", - "227 \n", - "289 \n", - "302 \n", - "311 \n", - "321 \n", - "390 \n", - "395 \n", - "423 \n", - "427 \n", + "209 \n", + "228 \n", + "290 \n", + "303 \n", + "312 \n", + "322 \n", + "391 \n", + "396 \n", + "424 \n", "428 \n", - "435 \n", - "437 \n", - "445 \n", - "452 \n", - "506 \n", - "542 \n", - "564 \n", - "611 \n", - "658 \n", - "671 \n", + "429 \n", + "436 \n", + "438 \n", + "446 \n", + "453 \n", + "507 \n", + "543 \n", + "565 \n", + "612 \n", + "660 \n", + "673 \n", "\n", " text by \\\n", "24 GiraffeNecktie \n", "98 i resisted switching to chrome for months beca... catshirt \n", "137 anigbrowl \n", "188 I think it's more than hazing. It may be ... bayesianhorse \n", - "208 I like the idea of moving that arrow the way h... rattray \n", - "227 I don't understand why a beginner would s... wolco \n", - "289 I leaerned more with one minute of this than a... agumonkey \n", - "302 I've suggested a rationale for the tabo... mechanical_fish \n", - "311 Do you have any reference for this?

I'm... banashark \n", - "321 Default search scope is an option in the Finde... kitsunesoba \n", - "390 Orthogonality and biology aren't friends. agumonkey \n", - "395 I chose some random physics book that was good... prawn \n", - "423 Seeing this get huge on Twitter. It's the... shenanigoat \n", - "427 Looking through the comments there are a numbe... moomin \n", - "428 Legacy media is a tough business. GBTC is payi... arcticbull \n", - "435 Same thing if you sell unsafe food, yet we hav... jabradoodle \n", - "437 There was briefly a thing called HSCSD ("... LeoPanthera \n", - "445 > This article is a bit comical to read and... lapcat \n", - "452 Large positions are most likely sold off in sm... meowkit \n", - "506 A US-based VPN (or really any VPN) is only goi... RandomBacon \n", - "542 rationale for the tabo... mechanical_fish \n", + "312 Do you have any reference for this?

I'm... banashark \n", + "322 Default search scope is an option in the Finde... kitsunesoba \n", + "391 Orthogonality and biology aren't friends. agumonkey \n", + "396 I chose some random physics book that was good... prawn \n", + "424 Seeing this get huge on Twitter. It's the... shenanigoat \n", + "428 Looking through the comments there are a numbe... moomin \n", + "429 Legacy media is a tough business. GBTC is payi... arcticbull \n", + "436 Same thing if you sell unsafe food, yet we hav... jabradoodle \n", + "438 There was briefly a thing called HSCSD ("... LeoPanthera \n", + "446 > This article is a bit comical to read and... lapcat \n", + "453 Large positions are most likely sold off in sm... meowkit \n", + "507 A US-based VPN (or really any VPN) is only goi... RandomBacon \n", + "543 2011-04-06 08:02:24+00:00 comment \n", "137 2 2024-06-06 16:42:40+00:00 story \n", "188 2015-06-18 16:42:53+00:00 comment \n", - "208 2015-06-08 02:15:30+00:00 comment \n", - "227 2019-02-03 14:35:43+00:00 comment \n", - "289 2016-07-16 06:19:39+00:00 comment \n", - "302 2008-12-17 04:42:02+00:00 comment \n", - "311 2023-11-13 19:57:00+00:00 comment \n", - "321 2017-08-13 17:15:19+00:00 comment \n", - "390 2016-04-24 16:33:41+00:00 comment \n", - "395 2011-03-27 22:29:51+00:00 comment \n", - "423 2016-01-09 03:04:22+00:00 comment \n", - "427 2024-10-01 14:37:04+00:00 comment \n", - "428 2021-04-16 16:30:33+00:00 comment \n", - "435 2023-08-03 20:47:52+00:00 comment \n", - "437 2019-02-11 19:49:29+00:00 comment \n", - "445 2023-01-02 16:00:49+00:00 comment \n", - "452 2021-01-27 23:22:48+00:00 comment \n", - "506 2019-04-05 00:58:58+00:00 comment \n", - "542 2023-12-13 08:13:15+00:00 comment \n", - "564 2024-05-03 11:58:13+00:00 comment \n", - "611 2021-03-05 16:07:56+00:00 comment \n", - "658 2022-06-09 09:51:54+00:00 comment \n", - "671 2013-08-12 00:31:02+00:00 comment \n", + "209 2015-06-08 02:15:30+00:00 comment \n", + "228 2019-02-03 14:35:43+00:00 comment \n", + "290 2016-07-16 06:19:39+00:00 comment \n", + "303 2008-12-17 04:42:02+00:00 comment \n", + "312 2023-11-13 19:57:00+00:00 comment \n", + "322 2017-08-13 17:15:19+00:00 comment \n", + "391 2016-04-24 16:33:41+00:00 comment \n", + "396 2011-03-27 22:29:51+00:00 comment \n", + "424 2016-01-09 03:04:22+00:00 comment \n", + "428 2024-10-01 14:37:04+00:00 comment \n", + "429 2021-04-16 16:30:33+00:00 comment \n", + "436 2023-08-03 20:47:52+00:00 comment \n", + "438 2019-02-11 19:49:29+00:00 comment \n", + "446 2023-01-02 16:00:49+00:00 comment \n", + "453 2021-01-27 23:22:48+00:00 comment \n", + "507 2019-04-05 00:58:58+00:00 comment \n", + "543 2023-12-13 08:13:15+00:00 comment \n", + "565 2024-05-03 11:58:13+00:00 comment \n", + "612 2021-03-05 16:07:56+00:00 comment \n", + "660 2022-06-09 09:51:54+00:00 comment \n", + "673 2013-08-12 00:31:02+00:00 comment \n", "...\n", "\n", "[123 rows x 6 columns]" ] }, - "execution_count": 31, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py index 7602be2fca..20219ef46e 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -12,22 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from contextlib import nullcontext +from unittest.mock import patch + import pandas as pd import pandas.testing import pytest import bigframes -import bigframes.dataframe as dataframe -import bigframes.dtypes as dtypes +from bigframes import dataframe, dtypes, exceptions + +EXPERIMENT_OPTION = "experiments.semantic_operators" +THRESHOLD_OPTION = "compute.semantic_ops_confirmation_threshold" def test_semantics_experiment_off_raise_error(): - bigframes.options.experiments.semantic_operators = False df = dataframe.DataFrame( {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} ) - with pytest.raises(NotImplementedError): + with bigframes.option_context(EXPERIMENT_OPTION, False), pytest.raises( + NotImplementedError + ): df.semantics @@ -44,7 +50,6 @@ def test_semantics_experiment_off_raise_error(): ], ) def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={ "Movies": [ @@ -61,20 +66,66 @@ def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column): session=session, ) instruction = "Find the shared first name of actors in {Movies}. One word answer." - actual_s = df.semantics.agg( - instruction, - model=gemini_flash_model, - max_agg_rows=max_agg_rows, - cluster_column=cluster_column, - ).to_pandas() + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + actual_s = df.semantics.agg( + instruction, + model=gemini_flash_model, + max_agg_rows=max_agg_rows, + cluster_column=cluster_column, + ).to_pandas() expected_s = pd.Series(["Leonardo \n"], dtype=dtypes.STRING_DTYPE) expected_s.name = "Movies" pandas.testing.assert_series_equal(actual_s, expected_s, check_index_type=False) +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_agg_with_confirmation(session, gemini_flash_model, reply, monkeypatch): + df = dataframe.DataFrame( + data={ + "Movies": [ + "Titanic", + "The Wolf of Wall Street", + "Killers of the Flower Moon", + "The Revenant", + "Inception", + "Shuttle Island", + "The Great Gatsby", + ], + "Years": [1997, 2013, 2023, 2015, 2010, 2010, 2013], + }, + session=session, + ) + instruction = "Find the shared first name of actors in {Movies}. One word answer." + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.semantics.agg( + instruction, + model=gemini_flash_model, + ) + + def test_agg_w_int_column(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={ "Movies": [ @@ -86,10 +137,17 @@ def test_agg_w_int_column(session, gemini_flash_model): session=session, ) instruction = "Find the {Years} Leonardo DiCaprio acted in the most movies. Answer with the year only." 
- actual_s = df.semantics.agg( - instruction, - model=gemini_flash_model, - ).to_pandas() + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_s = df.semantics.agg( + instruction, + model=gemini_flash_model, + ).to_pandas() expected_s = pd.Series(["2013 \n"], dtype=dtypes.STRING_DTYPE) expected_s.name = "Years" @@ -117,7 +175,6 @@ def test_agg_w_int_column(session, gemini_flash_model): ], ) def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={ "Movies": [ @@ -128,7 +185,14 @@ def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): "Year": [1997, 2013, 2023], }, ) - df.semantics.agg(instruction, gemini_flash_model) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df.semantics.agg(instruction, gemini_flash_model) @pytest.mark.parametrize( @@ -145,7 +209,6 @@ def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): ], ) def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_column): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={ "Movies": [ @@ -157,7 +220,14 @@ def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_colu }, ) instruction = "Find the shared first name of actors in {Movies}. One word answer." - df.semantics.agg(instruction, gemini_flash_model, cluster_column=cluster_column) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df.semantics.agg(instruction, gemini_flash_model, cluster_column=cluster_column) @pytest.mark.parametrize( @@ -168,7 +238,6 @@ def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_colu ], ) def test_cluster_by(session, text_embedding_generator, n_clusters): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( ( { @@ -186,28 +255,82 @@ def test_cluster_by(session, text_embedding_generator, n_clusters): session=session, ) output_column = "cluster id" - result = df.semantics.cluster_by( - "Item", - output_column, - text_embedding_generator, - n_clusters=n_clusters, - ) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + result = df.semantics.cluster_by( + "Item", + output_column, + text_embedding_generator, + n_clusters=n_clusters, + ) assert output_column in result # In rare cases, it's possible to have fewer than K clusters due to randomness. 
assert len(result[output_column].unique()) <= n_clusters -def test_cluster_by_invalid_column(session, text_embedding_generator): - bigframes.options.experiments.semantic_operators = True +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_cluster_by_with_confirmation( + session, text_embedding_generator, reply, monkeypatch +): + df = dataframe.DataFrame( + ( + { + "Item": [ + "Orange", + "Cantaloupe", + "Watermelon", + "Chicken", + "Duck", + "Hen", + "Rooster", + ] + } + ), + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.semantics.cluster_by( + "Item", + "cluster id", + text_embedding_generator, + n_clusters=2, + ) + +def test_cluster_by_invalid_column(session, text_embedding_generator): df = dataframe.DataFrame( ({"Product": ["Smartphone", "Laptop", "Coffee Maker", "T-shirt", "Jeans"]}), session=session, ) - output_column = "cluster id" - with pytest.raises(ValueError): + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df.semantics.cluster_by( "unknown_column", output_column, @@ -217,15 +340,18 @@ def test_cluster_by_invalid_column(session, text_embedding_generator): def test_cluster_by_invalid_model(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - df = dataframe.DataFrame( ({"Product": ["Smartphone", "Laptop", "Coffee Maker", "T-shirt", "Jeans"]}), session=session, ) - output_column = "cluster id" - with pytest.raises(TypeError): + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): df.semantics.cluster_by( "Product", output_column, @@ -235,7 +361,6 @@ def test_cluster_by_invalid_model(session, gemini_flash_model): def test_filter(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={ "country": ["USA", "Germany"], @@ -245,9 +370,15 @@ def test_filter(session, gemini_flash_model): session=session, ) - actual_df = df.semantics.filter( - "{city} is the capital of {country} in {year}", gemini_flash_model - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.semantics.filter( + "{city} is the capital of {country} in {year}", gemini_flash_model + ).to_pandas() expected_df = pd.DataFrame( {"country": ["Germany"], "city": ["Berlin"], "year": [2024]}, index=[1] @@ -257,16 +388,52 @@ def test_filter(session, gemini_flash_model): ) +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_filter_with_confirmation(session, gemini_flash_model, reply, monkeypatch): + df = dataframe.DataFrame( + data={ + "country": ["USA", "Germany"], + "city": ["Seattle", "Berlin"], + "year": [2023, 2024], + }, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.semantics.filter( + "{city} is the capital of {country} in {year}", gemini_flash_model + ) + + def test_filter_single_column_reference(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( 
data={"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, session=session, ) - actual_df = df.semantics.filter( - "{country} is in Europe", gemini_flash_model - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.semantics.filter( + "{country} is in Europe", gemini_flash_model + ).to_pandas() expected_df = pd.DataFrame({"country": ["Germany"], "city": ["Berlin"]}, index=[1]) pandas.testing.assert_frame_equal( @@ -295,25 +462,32 @@ def test_filter_single_column_reference(session, gemini_flash_model): ], ) def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame({"id": [1, 2], "city": ["Seattle", "Berlin"]}) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df.semantics.filter(instruction, gemini_flash_model) def test_filter_invalid_model_raise_error(): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} ) - with pytest.raises(TypeError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): df.semantics.filter("{city} is the capital of {country}", None) def test_map(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={ "ingredient_1": ["Burger Bun", "Soy Bean"], @@ -323,11 +497,17 @@ def test_map(session, gemini_flash_model): session=session, ) - actual_df = df.semantics.map( - "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", - "food", - gemini_flash_model, - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.semantics.map( + "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", + "food", + gemini_flash_model, + ).to_pandas() # Result sanitation actual_df["food"] = actual_df["food"].str.strip().str.lower() @@ -348,6 +528,39 @@ def test_map(session, gemini_flash_model): ) +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_map_with_confirmation(session, gemini_flash_model, reply, monkeypatch): + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + "gluten-free": [True, True], + }, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.semantics.map( + "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? 
One word only.", + "food", + gemini_flash_model, + ) + + @pytest.mark.parametrize( "instruction", [ @@ -369,7 +582,6 @@ def test_map(session, gemini_flash_model): ], ) def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={ "id": [1, 2], @@ -378,12 +590,16 @@ def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): } ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df.semantics.map(instruction, "food", gemini_flash_model) def test_map_invalid_model_raise_error(): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={ "ingredient_1": ["Burger Bun", "Soy Bean"], @@ -391,7 +607,12 @@ def test_map_invalid_model_raise_error(): }, ) - with pytest.raises(TypeError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): df.semantics.map( "What is the food made from {ingredient_1} and {ingredient_2}? One word only.", "food", @@ -414,7 +635,6 @@ def test_map_invalid_model_raise_error(): ], ) def test_join(instruction, session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True cities = dataframe.DataFrame( data={ "city": ["Seattle", "Berlin"], @@ -426,11 +646,17 @@ def test_join(instruction, session, gemini_flash_model): session=session, ) - actual_df = cities.semantics.join( - countries, - instruction, - gemini_flash_model, - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = cities.semantics.join( + countries, + instruction, + gemini_flash_model, + ).to_pandas() expected_df = pd.DataFrame( { @@ -447,8 +673,42 @@ def test_join(instruction, session, gemini_flash_model): ) +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_join_with_confirmation(session, gemini_flash_model, reply, monkeypatch): + cities = dataframe.DataFrame( + data={ + "city": ["Seattle", "Berlin"], + }, + session=session, + ) + countries = dataframe.DataFrame( + data={"country": ["USA", "UK", "Germany"]}, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + cities.semantics.join( + countries, + "{city} is in {country}", + gemini_flash_model, + ) + + def test_self_join(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True animals = dataframe.DataFrame( data={ "animal": ["spider", "capybara"], @@ -456,11 +716,17 @@ def test_self_join(session, gemini_flash_model): session=session, ) - actual_df = animals.semantics.join( - animals, - "{left.animal} is heavier than {right.animal}", - gemini_flash_model, - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = animals.semantics.join( + animals, + "{left.animal} is heavier than {right.animal}", + gemini_flash_model, + ).to_pandas() expected_df = pd.DataFrame( { @@ -477,25 +743,6 @@ def test_self_join(session, gemini_flash_model): ) -def test_join_data_too_large_raise_error(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - cities = dataframe.DataFrame( - data={ - "city": 
["Seattle", "Berlin"], - }, - session=session, - ) - countries = dataframe.DataFrame( - data={"country": ["USA", "UK", "Germany"]}, - session=session, - ) - - with pytest.raises(ValueError): - cities.semantics.join( - countries, "{city} belongs to {country}", gemini_flash_model, max_rows=1 - ) - - @pytest.mark.parametrize( ("instruction", "error_pattern"), [ @@ -521,7 +768,6 @@ def test_join_data_too_large_raise_error(session, gemini_flash_model): def test_join_invalid_instruction_raise_error( instruction, error_pattern, gemini_flash_model ): - bigframes.options.experiments.semantic_operators = True df1 = dataframe.DataFrame( {"city": ["Seattle", "Berlin"], "country": ["USA", "Germany"]} ) @@ -532,16 +778,25 @@ def test_join_invalid_instruction_raise_error( } ) - with pytest.raises(ValueError, match=error_pattern): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError, match=error_pattern): df1.semantics.join(df2, instruction, gemini_flash_model) def test_join_invalid_model_raise_error(): - bigframes.options.experiments.semantic_operators = True cities = dataframe.DataFrame({"city": ["Seattle", "Berlin"]}) countries = dataframe.DataFrame({"country": ["USA", "UK", "Germany"]}) - with pytest.raises(TypeError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): cities.semantics.join(countries, "{city} is in {country}", None) @@ -553,19 +808,24 @@ def test_join_invalid_model_raise_error(): ], ) def test_search(session, text_embedding_generator, score_column): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, ) - actual_result = df.semantics.search( - "creatures", - "monkey", - top_k=2, - model=text_embedding_generator, - score_column=score_column, - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_result = df.semantics.search( + "creatures", + "monkey", + top_k=2, + model=text_embedding_generator, + score_column=score_column, + ).to_pandas() expected_result = pd.Series( ["baboons", "chimpanzee"], index=[2, 4], name="creatures" @@ -583,38 +843,82 @@ def test_search(session, text_embedding_generator, score_column): assert score_column in actual_result.columns +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_search_with_confirmation( + session, text_embedding_generator, reply, monkeypatch +): + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.semantics.search( + "creatures", + "monkey", + top_k=2, + model=text_embedding_generator, + ) + + def test_search_invalid_column_raises_error(session, text_embedding_generator): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df.semantics.search( "whatever", "monkey", top_k=2, 
model=text_embedding_generator ) def test_search_invalid_model_raises_error(session): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, ) - with pytest.raises(TypeError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): df.semantics.search("creatures", "monkey", top_k=2, model=None) def test_search_invalid_top_k_raises_error(session, text_embedding_generator): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df.semantics.search( "creatures", "monkey", top_k=0, model=text_embedding_generator ) @@ -628,7 +932,6 @@ def test_search_invalid_top_k_raises_error(session, text_embedding_generator): ], ) def test_sim_join(session, text_embedding_generator, score_column): - bigframes.options.experiments.semantic_operators = True df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -638,14 +941,20 @@ def test_sim_join(session, text_embedding_generator, score_column): session=session, ) - actual_result = df1.semantics.sim_join( - df2, - left_on="creatures", - right_on="creatures", - model=text_embedding_generator, - top_k=1, - score_column=score_column, - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_result = df1.semantics.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + top_k=1, + score_column=score_column, + ).to_pandas() expected_result = pd.DataFrame( {"creatures": ["salmon", "cat"], "creatures_1": ["tuna", "dog"]} @@ -663,6 +972,43 @@ def test_sim_join(session, text_embedding_generator, score_column): assert score_column in actual_result.columns +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_sim_join_with_confirmation( + session, text_embedding_generator, reply, monkeypatch +): + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda: reply) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df1.semantics.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + top_k=1, + ) + + @pytest.mark.parametrize( ("left_on", "right_on"), [ @@ -673,7 +1019,6 @@ def test_sim_join(session, text_embedding_generator, score_column): def test_sim_join_invalid_column_raises_error( session, text_embedding_generator, left_on, right_on ): - bigframes.options.experiments.semantic_operators = True df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -683,14 +1028,18 @@ def test_sim_join_invalid_column_raises_error( session=session, ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df1.semantics.sim_join( df2, left_on=left_on, right_on=right_on, 
model=text_embedding_generator ) def test_sim_join_invalid_model_raises_error(session): - bigframes.options.experiments.semantic_operators = True df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -700,14 +1049,18 @@ def test_sim_join_invalid_model_raises_error(session): session=session, ) - with pytest.raises(TypeError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): df1.semantics.sim_join( df2, left_on="creatures", right_on="creatures", model=None ) def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): - bigframes.options.experiments.semantic_operators = True df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -717,7 +1070,12 @@ def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): session=session, ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df1.semantics.sim_join( df2, left_on="creatures", @@ -728,7 +1086,6 @@ def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): def test_sim_join_data_too_large_raises_error(session, text_embedding_generator): - bigframes.options.experiments.semantic_operators = True df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -738,7 +1095,12 @@ def test_sim_join_data_too_large_raises_error(session, text_embedding_generator) session=session, ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df1.semantics.sim_join( df2, left_on="creatures", @@ -774,7 +1136,6 @@ def test_sim_join_data_too_large_raises_error(session, text_embedding_generator) ], ) def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( { "Animals": ["Dog", "Cat", "Bird", "Horse"], @@ -782,15 +1143,97 @@ def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): "index": ["a", "b", "c", "d"], } ) - df.semantics.top_k(instruction, model=gemini_flash_model, k=2) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df.semantics.top_k(instruction, model=gemini_flash_model, k=2) def test_top_k_invalid_k_raise_error(gemini_flash_model): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame({"Animals": ["Dog", "Cat", "Bird", "Horse"]}) - with pytest.raises(ValueError): + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df.semantics.top_k( "{Animals} are more popular as pets", gemini_flash_model, k=0, ) + + +@patch("builtins.input", return_value="") +def test_confirm_operation__below_threshold_do_not_confirm(mock_input): + df = dataframe.DataFrame({}) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 3, + ): + df.semantics._confirm_operation(1) + + mock_input.assert_not_called() + + +@patch("builtins.input", return_value="") +def test_confirm_operation__threshold_is_none_do_not_confirm(mock_input): + df = dataframe.DataFrame({}) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + None, + ): + df.semantics._confirm_operation(100) + + mock_input.assert_not_called() + + +@patch("builtins.input", 
return_value="") +def test_confirm_operation__threshold_autofail_do_not_confirm(mock_input): + df = dataframe.DataFrame({}) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 1, + "compute.semantic_ops_threshold_autofail", + True, + ), pytest.raises(exceptions.OperationAbortedError): + df.semantics._confirm_operation(100) + + mock_input.assert_not_called() + + +@pytest.mark.parametrize( + ("reply", "expectation"), + [ + ("y", nullcontext()), + ("yes", nullcontext()), + ("", nullcontext()), + ("n", pytest.raises(exceptions.OperationAbortedError)), + ("something", pytest.raises(exceptions.OperationAbortedError)), + ], +) +def test_confirm_operation__above_threshold_confirm(reply, expectation, monkeypatch): + monkeypatch.setattr("builtins.input", lambda: reply) + df = dataframe.DataFrame({}) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 3, + ), expectation: + df.semantics._confirm_operation(4)
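Usage sketch (reviewer note, not part of the patch): the snippet below walks the new guard end to end, using only pieces introduced or exercised in this change (`semantic_ops_confirmation_threshold`, `semantic_ops_threshold_autofail`, `OperationAbortedError`, `bigframes.option_context`, and `DataFrame.semantics.filter`). The DataFrame contents and the Gemini model name are borrowed from the docstring examples; an active BigQuery session is assumed.

```python
# Illustrative only -- a sketch of the intended behavior of the confirmation
# guard added in this change, assuming a configured BigQuery session.
import bigframes
import bigframes.pandas as bpd
import bigframes.ml.llm as llm
from bigframes.exceptions import OperationAbortedError

bpd.options.experiments.semantic_operators = True
model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001")

df = bpd.DataFrame({"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]})

# The work estimate (2 rows) is at or below the threshold: no prompt, the LLM runs.
bigframes.options.compute.semantic_ops_confirmation_threshold = 10
df.semantics.filter("{city} is the capital of {country}", model)

# Above the threshold with autofail enabled: the operator raises instead of prompting.
with bigframes.option_context(
    "compute.semantic_ops_confirmation_threshold",
    1,
    "compute.semantic_ops_threshold_autofail",
    True,
):
    try:
        df.semantics.filter("{city} is the capital of {country}", model)
    except OperationAbortedError as exc:
        print(exc)  # the 2-row work estimate exceeds the 1-row threshold
```

With the default threshold of 0, every semantic operator prompts before running, which is why both the docstring examples and the notebook raise the threshold up front.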