From a32886a51f53cd0c8a20d6663091da24132e98c6 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 3 Jan 2025 01:33:15 +0000 Subject: [PATCH 01/12] feat: implement confirmation threshold for semantic operators --- bigframes/_config/compute_options.py | 6 + bigframes/operations/semantics.py | 64 ++- .../experimental/semantic_operators.ipynb | 386 ++++++++++-------- .../system/large/operations/test_semantics.py | 280 +++++++++++++ 4 files changed, 555 insertions(+), 181 deletions(-) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index c8a54fe0b3..33e978e51f 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -66,6 +66,11 @@ class ComputeOptions: engine to handle. However this comes at the cost of increase cost and latency. extra_query_labels (Dict[str, Any], Options): Stores additional custom labels for query configuration. + sem_ops_confirmation_threshold (int, Options): + Guards against unexpected processing of a large number of rows by semantic operators. + If the number of rows exceeds the threshold, the user will be asked to confirm + their operations to resume. The default value is 25. Set the value to None + to turn off the guard. """ maximum_bytes_billed: Optional[int] = None @@ -73,6 +78,7 @@ class ComputeOptions: extra_query_labels: Dict[str, Any] = dataclasses.field( default_factory=dict, init=False ) + sem_ops_confirmation_threshold: Optional[int] = 25 def assign_extra_query_labels(self, **kwargs: Any) -> None: """ diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 79b92afe4f..1aa76dfb3a 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -115,6 +115,16 @@ def agg( self._validate_model(model) columns = self._parse_columns(instruction) + if max_agg_rows <= 1: + raise ValueError( + f"Invalid value for `max_agg_rows`: {max_agg_rows}." + "It must be greater than 1." 
+ ) + + work_estimate = int(len(self._df) * max_agg_rows / (max_agg_rows - 1)) + if not self._confirm_operation(work_estimate): + return None + df: bigframes.dataframe.DataFrame = self._df.copy() for column in columns: if column not in self._df.columns: @@ -135,12 +145,6 @@ def agg( "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) - if max_agg_rows <= 1: - raise ValueError( - f"Invalid value for `max_agg_rows`: {max_agg_rows}." - "It must be greater than 1." - ) - user_instruction = self._format_instruction(instruction, columns) num_cluster = 1 @@ -296,6 +300,9 @@ def cluster_by( "It must be greater than 1." ) + if not self._confirm_operation(len(self._df)): + return None + df: bigframes.dataframe.DataFrame = self._df.copy() embeddings_df = model.predict(df[column]) @@ -367,6 +374,9 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) + if not self._confirm_operation(len(self._df)): + return None + df: bigframes.dataframe.DataFrame = self._df[columns].copy() for column in columns: if df[column].dtype != dtypes.STRING_DTYPE: @@ -462,6 +472,9 @@ def map( "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) + if not self._confirm_operation(len(self._df)): + return None + df: bigframes.dataframe.DataFrame = self._df[columns].copy() for column in columns: if df[column].dtype != dtypes.STRING_DTYPE: @@ -490,7 +503,6 @@ def join( other, instruction: str, model, - max_rows: int = 1000, ground_with_google_search: bool = False, ): """ @@ -561,12 +573,9 @@ def join( "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) - joined_table_rows = len(self._df) * len(other) - - if joined_table_rows > max_rows: - raise ValueError( - f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}." 
- ) + work_estimate = len(self._df) * len(other) + if not self._confirm_operation(work_estimate): + return None left_columns = [] right_columns = [] @@ -680,6 +689,9 @@ def search( if search_column not in self._df.columns: raise ValueError(f"Column `{search_column}` not found") + if not self._confirm_operation(len(self._df)): + return None + import bigframes.ml.llm as llm if not isinstance(model, llm.TextEmbeddingGenerator): @@ -803,6 +815,10 @@ def top_k( "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) + work_estimate = int(len(self._df) * (len(self._df) - 1) / 2) + if not self._confirm_operation(work_estimate): + return None + df: bigframes.dataframe.DataFrame = self._df[columns].copy() column = columns[0] if df[column].dtype != dtypes.STRING_DTYPE: @@ -1001,6 +1017,10 @@ def sim_join( if top_k < 1: raise ValueError("top_k must be an integer greater than or equal to 1.") + work_estimate = len(self._df) * len(other) + if not self._confirm_operation(work_estimate): + return None + base_table_embedding_column = guid.generate_guid() base_table = self._attach_embedding( other, right_on, base_table_embedding_column, model @@ -1072,3 +1092,21 @@ def _validate_model(model): if not isinstance(model, GeminiTextGenerator): raise TypeError("Model is not GeminiText Generator") + + @staticmethod + def _confirm_operation(row_count: int) -> bool: + """Return True if the operation may proceed, False if the user declines. + + Prompts only when `row_count` exceeds the configured threshold. + """ + import bigframes + + threshold = bigframes.options.compute.sem_ops_confirmation_threshold + if threshold is None or row_count <= threshold: + return True + + # Separate the prompt out. In IDEs such as VS Code, leaving the prompt in + # the input function makes it less visible to the end user. + print(f"This operation will process about {row_count} rows. Proceed? 
[Y/n]") + reply = input().strip().lower() + return reply in ("", "y", "yes") diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb index 815e14284f..1193f0cb62 100644 --- a/notebooks/experimental/semantic_operators.ipynb +++ b/notebooks/experimental/semantic_operators.ipynb @@ -153,7 +153,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# API Syntax" + "*Note*: semantic operators could be expensive over a large set of data. As a result, our team added this option `bigframes.options.compute.sem_ops_confirmation_threshold` at `version 1.31.0` so that the BigQuery Dataframe will ask for your confirmation if the amount of data to be processed is too large. If the amount of rows exceeds your threshold, you will see a prompt for your keyboard input -- 'y' to proceed and 'n' to abort. If you abort the operation, no LLM processing will be done.\n", + "\n", + "The default threshold is 25 rows. You are free to adjust the value as needed. To skip confirmation for all operations, set it to `None`. To always have confirmations, set it to `0`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n", + " bigframes.options.compute.sem_ops_confirmation_threshold = 100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The API" ] }, { @@ -181,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -239,7 +258,7 @@ "[3 rows x 2 columns]" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -263,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -317,7 +336,7 @@ "[1 rows x 2 columns]" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -351,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -409,7 +428,7 @@ "[3 rows x 2 columns]" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -431,7 +450,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -504,7 +523,7 @@ "[3 rows x 3 columns]" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -531,7 +550,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -548,7 +567,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -620,7 +639,7 @@ "[4 rows x 2 columns]" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -633,7 +652,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "!! **Important:** Semantic join can trigger probihitively expensitve operations! 
This operation first cross joins two dataframes, then invokes semantic filter on each row. That means if you have two dataframes of sizes `M` and `N`, the total amount of queries sent to the LLM is on the scale of `M * N`. Therefore, our team has added a parameter `max_rows`, a threshold that guards against unexpected expensive calls. With this parameter, the operator first calculates the size of your cross-joined data, and compares it with the threshold. If the size exceeds your threshold, the fuction will abort early with a `ValueError`. You can manually set the value of `max_rows` to raise or lower the threshold." + "!! **Important:** Semantic join can trigger prohibitively expensive operations! This operation first cross joins two dataframes, then invokes semantic filter on each row. That means if you have two dataframes of sizes `M` and `N`, the total number of queries sent to the LLM is on the scale of `M * N`. " ] }, { @@ -654,7 +673,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -670,7 +689,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -754,7 +773,7 @@ "[6 rows x 2 columns]" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -781,7 +800,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -855,7 +874,7 @@ "[7 rows x 1 columns]" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -884,7 +903,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -903,7 +922,7 @@ "Name: Movies, dtype: string" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -936,7 +955,7 @@ }, { "cell_type": "code", - "execution_count": 17, + 
"execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -952,7 +971,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -1009,7 +1028,7 @@ "[2 rows x 1 columns]" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1041,7 +1060,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1105,7 +1124,7 @@ "[5 rows x 1 columns]" ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1124,7 +1143,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1188,7 +1207,7 @@ "[2 rows x 2 columns]" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1222,7 +1241,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -1239,7 +1258,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1325,7 +1344,7 @@ "[5 rows x 3 columns]" ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1338,7 +1357,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "!! **Important** Like semantic join, this operator can also be very expensive. To guard against unexpected processing of large dataset, use the `max_rows` parameter to specify a threshold. " + "!! **Important** Like semantic join, this operator can also be very expensive. To guard against unexpected processing of large dataset, use the `bigframes.options.compute.sem_ops_confirmation_threshold` option to specify a threshold. 
" ] }, { @@ -1357,7 +1376,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -1373,7 +1392,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1415,17 +1434,17 @@ " \n", " 0\n", " Smartphone\n", - " 2\n", + " 3\n", " \n", " \n", " 1\n", " Laptop\n", - " 2\n", + " 3\n", " \n", " \n", " 2\n", " Coffee Maker\n", - " 2\n", + " 1\n", " \n", " \n", " 3\n", @@ -1444,16 +1463,16 @@ ], "text/plain": [ " Product Cluster ID\n", - "0 Smartphone 2\n", - "1 Laptop 2\n", - "2 Coffee Maker 2\n", + "0 Smartphone 3\n", + "1 Laptop 3\n", + "2 Coffee Maker 1\n", "3 T-shirt 2\n", "4 Jeans 2\n", "\n", "[5 rows x 2 columns]" ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1487,7 +1506,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1836,7 +1855,7 @@ "[3000 rows x 6 columns]" ] }, - "execution_count": 25, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1855,16 +1874,16 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "2555" + "2556" ] }, - "execution_count": 26, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1883,16 +1902,16 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "390.61878669276047" + "390.29068857589976" ] }, - "execution_count": 27, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1901,6 +1920,23 @@ "hacker_news_with_texts['text'].str.len().mean()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Optional] You can raise the confirmation threshold for a smoother experience." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n", + " bigframes.options.compute.sem_ops_confirmation_threshold = 5000" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1910,9 +1946,16 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 30, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This operation will process about 2556 rows. Proceed? [Y/n]\n" + ] + }, { "name": "stderr", "output_type": "stream", @@ -1961,7 +2004,7 @@ " comment\n", " \n", " \n", - " 419\n", + " 420\n", " <NA>\n", " Well last time I got angry down votes for sayi...\n", " drieddust\n", @@ -1970,7 +2013,7 @@ " comment\n", " \n", " \n", - " 812\n", + " 814\n", " <NA>\n", " New iPhone should be announced on September. L...\n", " meerita\n", @@ -1979,7 +2022,7 @@ " comment\n", " \n", " \n", - " 1512\n", + " 1515\n", " <NA>\n", " Why would this take a week? i(phone)OS was ori...\n", " TheOtherHobbes\n", @@ -1988,7 +2031,7 @@ " comment\n", " \n", " \n", - " 1559\n", + " 1562\n", " <NA>\n", " &gt;or because Apple drama brings many clicks?...\n", " weberer\n", @@ -2004,22 +2047,22 @@ "text/plain": [ " title text by \\\n", "9 It doesn’t work on Safari, and WebKit based br... archiewood \n", - "419 Well last time I got angry down votes for sayi... drieddust \n", - "812 New iPhone should be announced on September. L... meerita \n", - "1512 Why would this take a week? i(phone)OS was ori... TheOtherHobbes \n", - "1559 >or because Apple drama brings many clicks?... weberer \n", + "420 Well last time I got angry down votes for sayi... drieddust \n", + "814 New iPhone should be announced on September. L... meerita \n", + "1515 Why would this take a week? i(phone)OS was ori... TheOtherHobbes \n", + "1562 >or because Apple drama brings many clicks?... 
weberer \n", "\n", " score timestamp type \n", "9 2023-04-21 16:45:13+00:00 comment \n", - "419 2021-01-11 19:27:27+00:00 comment \n", - "812 2019-07-30 20:54:42+00:00 comment \n", - "1512 2021-06-08 09:25:24+00:00 comment \n", - "1559 2022-09-05 13:16:02+00:00 comment \n", + "420 2021-01-11 19:27:27+00:00 comment \n", + "814 2019-07-30 20:54:42+00:00 comment \n", + "1515 2021-06-08 09:25:24+00:00 comment \n", + "1562 2022-09-05 13:16:02+00:00 comment \n", "\n", "[5 rows x 6 columns]" ] }, - "execution_count": 28, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -2051,7 +2094,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -2104,7 +2147,7 @@ " Frustrated, but hopeful.\n", " \n", " \n", - " 419\n", + " 420\n", " <NA>\n", " Well last time I got angry down votes for sayi...\n", " drieddust\n", @@ -2114,7 +2157,7 @@ " Frustrated and angry.\n", " \n", " \n", - " 812\n", + " 814\n", " <NA>\n", " New iPhone should be announced on September. L...\n", " meerita\n", @@ -2124,7 +2167,7 @@ " Excited anticipation.\n", " \n", " \n", - " 1512\n", + " 1515\n", " <NA>\n", " Why would this take a week? i(phone)OS was ori...\n", " TheOtherHobbes\n", @@ -2134,7 +2177,7 @@ " Frustrated, critical, obvious.\n", " \n", " \n", - " 1559\n", + " 1562\n", " <NA>\n", " &gt;or because Apple drama brings many clicks?...\n", " weberer\n", @@ -2151,34 +2194,34 @@ "text/plain": [ " title text by \\\n", "9 It doesn’t work on Safari, and WebKit based br... archiewood \n", - "419 Well last time I got angry down votes for sayi... drieddust \n", - "812 New iPhone should be announced on September. L... meerita \n", - "1512 Why would this take a week? i(phone)OS was ori... TheOtherHobbes \n", - "1559 >or because Apple drama brings many clicks?... weberer \n", + "420 Well last time I got angry down votes for sayi... drieddust \n", + "814 New iPhone should be announced on September. L... 
meerita \n", + "1515 Why would this take a week? i(phone)OS was ori... TheOtherHobbes \n", + "1562 >or because Apple drama brings many clicks?... weberer \n", "\n", " score timestamp type \\\n", "9 2023-04-21 16:45:13+00:00 comment \n", - "419 2021-01-11 19:27:27+00:00 comment \n", - "812 2019-07-30 20:54:42+00:00 comment \n", - "1512 2021-06-08 09:25:24+00:00 comment \n", - "1559 2022-09-05 13:16:02+00:00 comment \n", + "420 2021-01-11 19:27:27+00:00 comment \n", + "814 2019-07-30 20:54:42+00:00 comment \n", + "1515 2021-06-08 09:25:24+00:00 comment \n", + "1562 2022-09-05 13:16:02+00:00 comment \n", "\n", " sentiment \n", "9 Frustrated, but hopeful. \n", " \n", - "419 Frustrated and angry. \n", + "420 Frustrated and angry. \n", " \n", - "812 Excited anticipation. \n", + "814 Excited anticipation. \n", " \n", - "1512 Frustrated, critical, obvious. \n", + "1515 Frustrated, critical, obvious. \n", " \n", - "1559 Negative, clickbait, Apple. \n", + "1562 Negative, clickbait, Apple. \n", " \n", "\n", "[5 rows x 7 columns]" ] }, - "execution_count": 29, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -2196,14 +2239,14 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3577: UserWarning: Reading cached table from 2024-12-27 21:39:10.129973+00:00 to avoid incompatibilies with previous reads of this table. 
To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3577: UserWarning: Reading cached table from 2025-01-03 01:18:29.080474+00:00 to avoid incompatibilies with previous reads of this table. To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n", " exec(code_obj, self.user_global_ns, self.user_ns)\n" ] }, @@ -2553,7 +2596,7 @@ "[3000 rows x 6 columns]" ] }, - "execution_count": 30, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -2565,9 +2608,16 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 33, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This operation will process about 3000 rows. Proceed? 
[Y/n]\n" + ] + }, { "name": "stderr", "output_type": "stream", @@ -2643,7 +2693,7 @@ " comment\n", " \n", " \n", - " 208\n", + " 209\n", " <NA>\n", " I like the idea of moving that arrow the way h...\n", " rattray\n", @@ -2652,7 +2702,7 @@ " comment\n", " \n", " \n", - " 227\n", + " 228\n", " <NA>\n", " I don&#x27;t understand why a beginner would s...\n", " wolco\n", @@ -2661,7 +2711,7 @@ " comment\n", " \n", " \n", - " 289\n", + " 290\n", " <NA>\n", " I leaerned more with one minute of this than a...\n", " agumonkey\n", @@ -2670,7 +2720,7 @@ " comment\n", " \n", " \n", - " 302\n", + " 303\n", " <NA>\n", " I've suggested a <i>rationale</i> for the tabo...\n", " mechanical_fish\n", @@ -2679,7 +2729,7 @@ " comment\n", " \n", " \n", - " 311\n", + " 312\n", " <NA>\n", " Do you have any reference for this?<p>I&#x27;m...\n", " banashark\n", @@ -2688,7 +2738,7 @@ " comment\n", " \n", " \n", - " 321\n", + " 322\n", " <NA>\n", " Default search scope is an option in the Finde...\n", " kitsunesoba\n", @@ -2697,7 +2747,7 @@ " comment\n", " \n", " \n", - " 390\n", + " 391\n", " <NA>\n", " Orthogonality and biology aren&#x27;t friends.\n", " agumonkey\n", @@ -2706,7 +2756,7 @@ " comment\n", " \n", " \n", - " 395\n", + " 396\n", " <NA>\n", " I chose some random physics book that was good...\n", " prawn\n", @@ -2715,7 +2765,7 @@ " comment\n", " \n", " \n", - " 423\n", + " 424\n", " <NA>\n", " Seeing this get huge on Twitter. It&#x27;s the...\n", " shenanigoat\n", @@ -2724,7 +2774,7 @@ " comment\n", " \n", " \n", - " 427\n", + " 428\n", " <NA>\n", " Looking through the comments there are a numbe...\n", " moomin\n", @@ -2733,7 +2783,7 @@ " comment\n", " \n", " \n", - " 428\n", + " 429\n", " <NA>\n", " Legacy media is a tough business. 
GBTC is payi...\n", " arcticbull\n", @@ -2742,7 +2792,7 @@ " comment\n", " \n", " \n", - " 435\n", + " 436\n", " <NA>\n", " Same thing if you sell unsafe food, yet we hav...\n", " jabradoodle\n", @@ -2751,7 +2801,7 @@ " comment\n", " \n", " \n", - " 437\n", + " 438\n", " <NA>\n", " There was briefly a thing called HSCSD (&quot;...\n", " LeoPanthera\n", @@ -2760,7 +2810,7 @@ " comment\n", " \n", " \n", - " 445\n", + " 446\n", " <NA>\n", " &gt; This article is a bit comical to read and...\n", " lapcat\n", @@ -2769,7 +2819,7 @@ " comment\n", " \n", " \n", - " 452\n", + " 453\n", " <NA>\n", " Large positions are most likely sold off in sm...\n", " meowkit\n", @@ -2778,7 +2828,7 @@ " comment\n", " \n", " \n", - " 506\n", + " 507\n", " <NA>\n", " A US-based VPN (or really any VPN) is only goi...\n", " RandomBacon\n", @@ -2787,7 +2837,7 @@ " comment\n", " \n", " \n", - " 542\n", + " 543\n", " <NA>\n", " <a href=\"https:&#x2F;&#x2F;codeberg.org&#x2F;A...\n", " ElectronBadger\n", @@ -2796,7 +2846,7 @@ " comment\n", " \n", " \n", - " 564\n", + " 565\n", " <NA>\n", " It’s much harder for people without hands to w...\n", " Aeolun\n", @@ -2805,7 +2855,7 @@ " comment\n", " \n", " \n", - " 611\n", + " 612\n", " <NA>\n", " So by using ADMIN_SL0T instead was it just set...\n", " minitoar\n", @@ -2814,7 +2864,7 @@ " comment\n", " \n", " \n", - " 658\n", + " 660\n", " <NA>\n", " Outstanding!\n", " cafard\n", @@ -2823,7 +2873,7 @@ " comment\n", " \n", " \n", - " 671\n", + " 673\n", " <NA>\n", " On the other hand, something can be said for &...\n", " babby\n", @@ -2842,87 +2892,87 @@ "98 \n", "137 FDA reverses marketing ban on Juul e-cigarettes \n", "188 \n", - "208 \n", - "227 \n", - "289 \n", - "302 \n", - "311 \n", - "321 \n", - "390 \n", - "395 \n", - "423 \n", - "427 \n", + "209 \n", + "228 \n", + "290 \n", + "303 \n", + "312 \n", + "322 \n", + "391 \n", + "396 \n", + "424 \n", "428 \n", - "435 \n", - "437 \n", - "445 \n", - "452 \n", - "506 \n", - "542 \n", - "564 \n", - "611 
\n", - "658 \n", - "671 \n", + "429 \n", + "436 \n", + "438 \n", + "446 \n", + "453 \n", + "507 \n", + "543 \n", + "565 \n", + "612 \n", + "660 \n", + "673 \n", "\n", " text by \\\n", "24 GiraffeNecktie \n", "98 i resisted switching to chrome for months beca... catshirt \n", "137 anigbrowl \n", "188 I think it's more than hazing. It may be ... bayesianhorse \n", - "208 I like the idea of moving that arrow the way h... rattray \n", - "227 I don't understand why a beginner would s... wolco \n", - "289 I leaerned more with one minute of this than a... agumonkey \n", - "302 I've suggested a rationale for the tabo... mechanical_fish \n", - "311 Do you have any reference for this?

I'm... banashark \n", - "321 Default search scope is an option in the Finde... kitsunesoba \n", - "390 Orthogonality and biology aren't friends. agumonkey \n", - "395 I chose some random physics book that was good... prawn \n", - "423 Seeing this get huge on Twitter. It's the... shenanigoat \n", - "427 Looking through the comments there are a numbe... moomin \n", - "428 Legacy media is a tough business. GBTC is payi... arcticbull \n", - "435 Same thing if you sell unsafe food, yet we hav... jabradoodle \n", - "437 There was briefly a thing called HSCSD ("... LeoPanthera \n", - "445 > This article is a bit comical to read and... lapcat \n", - "452 Large positions are most likely sold off in sm... meowkit \n", - "506 A US-based VPN (or really any VPN) is only goi... RandomBacon \n", - "542 rationale for the tabo... mechanical_fish \n", + "312 Do you have any reference for this?

I'm... banashark \n", + "322 Default search scope is an option in the Finde... kitsunesoba \n", + "391 Orthogonality and biology aren't friends. agumonkey \n", + "396 I chose some random physics book that was good... prawn \n", + "424 Seeing this get huge on Twitter. It's the... shenanigoat \n", + "428 Looking through the comments there are a numbe... moomin \n", + "429 Legacy media is a tough business. GBTC is payi... arcticbull \n", + "436 Same thing if you sell unsafe food, yet we hav... jabradoodle \n", + "438 There was briefly a thing called HSCSD ("... LeoPanthera \n", + "446 > This article is a bit comical to read and... lapcat \n", + "453 Large positions are most likely sold off in sm... meowkit \n", + "507 A US-based VPN (or really any VPN) is only goi... RandomBacon \n", + "543 2011-04-06 08:02:24+00:00 comment \n", "137 2 2024-06-06 16:42:40+00:00 story \n", "188 2015-06-18 16:42:53+00:00 comment \n", - "208 2015-06-08 02:15:30+00:00 comment \n", - "227 2019-02-03 14:35:43+00:00 comment \n", - "289 2016-07-16 06:19:39+00:00 comment \n", - "302 2008-12-17 04:42:02+00:00 comment \n", - "311 2023-11-13 19:57:00+00:00 comment \n", - "321 2017-08-13 17:15:19+00:00 comment \n", - "390 2016-04-24 16:33:41+00:00 comment \n", - "395 2011-03-27 22:29:51+00:00 comment \n", - "423 2016-01-09 03:04:22+00:00 comment \n", - "427 2024-10-01 14:37:04+00:00 comment \n", - "428 2021-04-16 16:30:33+00:00 comment \n", - "435 2023-08-03 20:47:52+00:00 comment \n", - "437 2019-02-11 19:49:29+00:00 comment \n", - "445 2023-01-02 16:00:49+00:00 comment \n", - "452 2021-01-27 23:22:48+00:00 comment \n", - "506 2019-04-05 00:58:58+00:00 comment \n", - "542 2023-12-13 08:13:15+00:00 comment \n", - "564 2024-05-03 11:58:13+00:00 comment \n", - "611 2021-03-05 16:07:56+00:00 comment \n", - "658 2022-06-09 09:51:54+00:00 comment \n", - "671 2013-08-12 00:31:02+00:00 comment \n", + "209 2015-06-08 02:15:30+00:00 comment \n", + "228 2019-02-03 14:35:43+00:00 comment \n", + "290 
2016-07-16 06:19:39+00:00 comment \n", + "303 2008-12-17 04:42:02+00:00 comment \n", + "312 2023-11-13 19:57:00+00:00 comment \n", + "322 2017-08-13 17:15:19+00:00 comment \n", + "391 2016-04-24 16:33:41+00:00 comment \n", + "396 2011-03-27 22:29:51+00:00 comment \n", + "424 2016-01-09 03:04:22+00:00 comment \n", + "428 2024-10-01 14:37:04+00:00 comment \n", + "429 2021-04-16 16:30:33+00:00 comment \n", + "436 2023-08-03 20:47:52+00:00 comment \n", + "438 2019-02-11 19:49:29+00:00 comment \n", + "446 2023-01-02 16:00:49+00:00 comment \n", + "453 2021-01-27 23:22:48+00:00 comment \n", + "507 2019-04-05 00:58:58+00:00 comment \n", + "543 2023-12-13 08:13:15+00:00 comment \n", + "565 2024-05-03 11:58:13+00:00 comment \n", + "612 2021-03-05 16:07:56+00:00 comment \n", + "660 2022-06-09 09:51:54+00:00 comment \n", + "673 2013-08-12 00:31:02+00:00 comment \n", "...\n", "\n", "[123 rows x 6 columns]" ] }, - "execution_count": 31, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py index 7602be2fca..6ac9cfc7f0 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from unittest.mock import patch + import pandas as pd import pandas.testing import pytest @@ -45,6 +47,7 @@ def test_semantics_experiment_off_raise_error(): ) def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "Movies": [ @@ -73,8 +76,44 @@ def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column): pandas.testing.assert_series_equal(actual_s, expected_s, check_index_type=False) +@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", False)]) +def test_agg_with_confirmation( + session, gemini_flash_model, reply, should_execute, monkeypatch +): + bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 0 + df = dataframe.DataFrame( + data={ + "Movies": [ + "Titanic", + "The Wolf of Wall Street", + "Killers of the Flower Moon", + "The Revenant", + "Inception", + "Shuttle Island", + "The Great Gatsby", + ], + "Years": [1997, 2013, 2023, 2015, 2010, 2010, 2013], + }, + session=session, + ) + instruction = "Find the shared first name of actors in {Movies}. One word answer." 
+ monkeypatch.setattr("builtins.input", lambda *_: reply) + + result = df.semantics.agg( + instruction, + model=gemini_flash_model, + ) + + if should_execute: + assert result is not None + else: + assert result is None + + def test_agg_w_int_column(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "Movies": [ @@ -118,6 +157,7 @@ def test_agg_w_int_column(session, gemini_flash_model): ) def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "Movies": [ @@ -146,6 +186,7 @@ def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): ) def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_column): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "Movies": [ @@ -169,6 +210,7 @@ def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_colu ) def test_cluster_by(session, text_embedding_generator, n_clusters): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( ( { @@ -198,8 +240,46 @@ def test_cluster_by(session, text_embedding_generator, n_clusters): assert len(result[output_column].unique()) <= n_clusters +@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", False)]) +def test_cluster_by_with_confirmation( + session, text_embedding_generator, reply, should_execute, monkeypatch +): + bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 0 + df = dataframe.DataFrame( + ( + { + "Item": [ + "Orange", + "Cantaloupe", + "Watermelon", + 
"Chicken", + "Duck", + "Hen", + "Rooster", + ] + } + ), + session=session, + ) + monkeypatch.setattr("builtins.input", lambda *_: reply) + + result = df.semantics.cluster_by( + "Item", + "cluster id", + text_embedding_generator, + n_clusters=2, + ) + + if should_execute: + assert result is not None + else: + assert result is None + + def test_cluster_by_invalid_column(session, text_embedding_generator): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( ({"Product": ["Smartphone", "Laptop", "Coffee Maker", "T-shirt", "Jeans"]}), @@ -218,6 +298,7 @@ def test_cluster_by_invalid_column(session, text_embedding_generator): def test_cluster_by_invalid_model(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( ({"Product": ["Smartphone", "Laptop", "Coffee Maker", "T-shirt", "Jeans"]}), @@ -236,6 +317,7 @@ def test_cluster_by_invalid_model(session, gemini_flash_model): def test_filter(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "country": ["USA", "Germany"], @@ -257,8 +339,35 @@ def test_filter(session, gemini_flash_model): ) +@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", False)]) +def test_filter_with_confirmation( + session, gemini_flash_model, reply, should_execute, monkeypatch +): + bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 0 + df = dataframe.DataFrame( + data={ + "country": ["USA", "Germany"], + "city": ["Seattle", "Berlin"], + "year": [2023, 2024], + }, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda *_: reply) + + result = df.semantics.filter( + "{city} is the capital of {country} in {year}", 
gemini_flash_model + ) + + if should_execute: + assert result is not None + else: + assert result is None + + def test_filter_single_column_reference(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, session=session, @@ -296,6 +405,7 @@ def test_filter_single_column_reference(session, gemini_flash_model): ) def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame({"id": [1, 2], "city": ["Seattle", "Berlin"]}) with pytest.raises(ValueError): @@ -304,6 +414,7 @@ def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model) def test_filter_invalid_model_raise_error(): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} ) @@ -314,6 +425,7 @@ def test_filter_invalid_model_raise_error(): def test_map(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "ingredient_1": ["Burger Bun", "Soy Bean"], @@ -348,6 +460,34 @@ def test_map(session, gemini_flash_model): ) +@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", False)]) +def test_map_with_confirmation( + session, gemini_flash_model, reply, should_execute, monkeypatch +): + bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 0 + df = dataframe.DataFrame( + data={ + "ingredient_1": ["Burger Bun", "Soy Bean"], + "ingredient_2": ["Beef Patty", "Bittern"], + "gluten-free": [True, 
True], + }, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda _: reply) + + result = df.semantics.map( + "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", + "food", + gemini_flash_model, + ) + + if should_execute: + assert result is not None + else: + assert result is None + + @pytest.mark.parametrize( "instruction", [ @@ -370,6 +510,7 @@ def test_map(session, gemini_flash_model): ) def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "id": [1, 2], @@ -384,6 +525,7 @@ def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): def test_map_invalid_model_raise_error(): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "ingredient_1": ["Burger Bun", "Soy Bean"], @@ -415,6 +557,7 @@ def test_map_invalid_model_raise_error(): ) def test_join(instruction, session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 cities = dataframe.DataFrame( data={ "city": ["Seattle", "Berlin"], @@ -447,8 +590,39 @@ def test_join(instruction, session, gemini_flash_model): ) +@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", False)]) +def test_join_with_confirmation( + session, gemini_flash_model, reply, should_execute, monkeypatch +): + bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 0 + cities = dataframe.DataFrame( + data={ + "city": ["Seattle", "Berlin"], + }, + session=session, + ) + countries = dataframe.DataFrame( + data={"country": ["USA", "UK", "Germany"]}, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda _: reply) + + 
result = cities.semantics.join( + countries, + "{city} is in {country}", + gemini_flash_model, + ) + + if should_execute: + assert result is not None + else: + assert result is None + + def test_self_join(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 animals = dataframe.DataFrame( data={ "animal": ["spider", "capybara"], @@ -479,6 +653,7 @@ def test_self_join(session, gemini_flash_model): def test_join_data_too_large_raise_error(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 cities = dataframe.DataFrame( data={ "city": ["Seattle", "Berlin"], @@ -522,6 +697,7 @@ def test_join_invalid_instruction_raise_error( instruction, error_pattern, gemini_flash_model ): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( {"city": ["Seattle", "Berlin"], "country": ["USA", "Germany"]} ) @@ -538,6 +714,7 @@ def test_join_invalid_instruction_raise_error( def test_join_invalid_model_raise_error(): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 cities = dataframe.DataFrame({"city": ["Seattle", "Berlin"]}) countries = dataframe.DataFrame({"country": ["USA", "UK", "Germany"]}) @@ -554,6 +731,7 @@ def test_join_invalid_model_raise_error(): ) def test_search(session, text_embedding_generator, score_column): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, @@ -583,8 +761,34 @@ def test_search(session, text_embedding_generator, score_column): assert score_column in actual_result.columns +@pytest.mark.parametrize(("reply", 
"should_execute"), [("y", True), ("n", False)]) +def test_search_with_confirmation( + session, text_embedding_generator, reply, should_execute, monkeypatch +): + bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 0 + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda _: reply) + + result = df.semantics.search( + "creatures", + "monkey", + top_k=2, + model=text_embedding_generator, + ) + + if should_execute: + assert result is not None + else: + assert result is None + + def test_search_invalid_column_raises_error(session, text_embedding_generator): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, @@ -598,6 +802,7 @@ def test_search_invalid_column_raises_error(session, text_embedding_generator): def test_search_invalid_model_raises_error(session): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, @@ -609,6 +814,7 @@ def test_search_invalid_model_raises_error(session): def test_search_invalid_top_k_raises_error(session, text_embedding_generator): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, @@ -629,6 +835,7 @@ def test_search_invalid_top_k_raises_error(session, text_embedding_generator): ) def test_sim_join(session, text_embedding_generator, score_column): bigframes.options.experiments.semantic_operators = 
True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -663,6 +870,36 @@ def test_sim_join(session, text_embedding_generator, score_column): assert score_column in actual_result.columns +@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", False)]) +def test_sim_join_with_confirmation( + session, text_embedding_generator, reply, should_execute, monkeypatch +): + bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 0 + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + monkeypatch.setattr("builtins.input", lambda _: reply) + + result = df1.semantics.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + top_k=1, + ) + + if should_execute: + assert result is not None + else: + assert result is None + + @pytest.mark.parametrize( ("left_on", "right_on"), [ @@ -674,6 +911,7 @@ def test_sim_join_invalid_column_raises_error( session, text_embedding_generator, left_on, right_on ): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -691,6 +929,7 @@ def test_sim_join_invalid_column_raises_error( def test_sim_join_invalid_model_raises_error(session): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -708,6 +947,7 @@ def test_sim_join_invalid_model_raises_error(session): def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): bigframes.options.experiments.semantic_operators = True + 
bigframes.options.compute.sem_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -729,6 +969,7 @@ def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): def test_sim_join_data_too_large_raises_error(session, text_embedding_generator): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -775,6 +1016,7 @@ def test_sim_join_data_too_large_raises_error(session, text_embedding_generator) ) def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame( { "Animals": ["Dog", "Cat", "Bird", "Horse"], @@ -787,6 +1029,7 @@ def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): def test_top_k_invalid_k_raise_error(gemini_flash_model): bigframes.options.experiments.semantic_operators = True + bigframes.options.compute.sem_ops_confirmation_threshold = 10 df = dataframe.DataFrame({"Animals": ["Dog", "Cat", "Bird", "Horse"]}) with pytest.raises(ValueError): df.semantics.top_k( @@ -794,3 +1037,40 @@ def test_top_k_invalid_k_raise_error(gemini_flash_model): gemini_flash_model, k=0, ) + + +@patch("builtins.input", return_value="") +def test_confirm_operation__below_threshold_do_not_confirm(mock_input): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame({}) + + bigframes.options.compute.sem_ops_confirmation_threshold = 3 + df.semantics._confirm_operation(1) + + mock_input.assert_not_called() + + +@patch("builtins.input", return_value="") +def test_confirm_operation__threshold_is_none_do_not_confirm(mock_input): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame({}) + + 
bigframes.options.compute.sem_ops_confirmation_threshold = None + df.semantics._confirm_operation(100) + + mock_input.assert_not_called() + + +@pytest.mark.parametrize( + ("reply", "expected"), + [("y", True), ("yes", True), ("", True), ("n", False), ("something", False)], +) +def test_confirm_operation__above_threshold_confirm(reply, expected, monkeypatch): + bigframes.options.experiments.semantic_operators = True + monkeypatch.setattr("builtins.input", lambda _: reply) + df = dataframe.DataFrame({}) + + bigframes.options.compute.sem_ops_confirmation_threshold = 3 + actual = df.semantics._confirm_operation(4) + + assert actual is expected From 80b346d8e3bf2e3be3f919a01c2bfde2bd2ef3c0 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 3 Jan 2025 01:35:47 +0000 Subject: [PATCH 02/12] fix format --- bigframes/operations/semantics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 1aa76dfb3a..b9cb712533 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -1102,8 +1102,8 @@ def _confirm_operation(row_count: int) -> bool: if threshold is None or row_count <= threshold: return True - # Separate the prompt out. In IDE such VS Code, leaving prompt in the - # input function makes it less visible to the end user. + # Separate the prompt out. In IDE such VS Code, leaving prompt in the + # input function makes it less visible to the end user. print(f"This operation will process about {row_count} rows. Proceed? 
[Y/n]") reply = input().lower() if reply == "" or reply == "y" or reply == "yes": From c93fb38dd8a1b1bd00f294948699647105152a9c Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Fri, 3 Jan 2025 01:36:07 +0000 Subject: [PATCH 03/12] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/operations/semantics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 1aa76dfb3a..b9cb712533 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -1102,8 +1102,8 @@ def _confirm_operation(row_count: int) -> bool: if threshold is None or row_count <= threshold: return True - # Separate the prompt out. In IDE such VS Code, leaving prompt in the - # input function makes it less visible to the end user. + # Separate the prompt out. In IDE such VS Code, leaving prompt in the + # input function makes it less visible to the end user. print(f"This operation will process about {row_count} rows. Proceed? 
[Y/n]") reply = input().lower() if reply == "" or reply == "y" or reply == "yes": From 1ed263dd2e6890b15c01207e694611b0171bae08 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 3 Jan 2025 17:06:38 +0000 Subject: [PATCH 04/12] add doc --- bigframes/operations/semantics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index b9cb712533..f7f111d599 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -1095,6 +1095,7 @@ def _validate_model(model): @staticmethod def _confirm_operation(row_count: int) -> bool: + """Returns true only if we can proceed with the operation.""" import bigframes threshold = bigframes.options.compute.sem_ops_confirmation_threshold From f4730f4fb1e6d53c69d66866549b2599b19e7001 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 3 Jan 2025 18:48:29 +0000 Subject: [PATCH 05/12] raise exception when the user didn't confirm to proceed --- bigframes/_config/compute_options.py | 6 +- bigframes/exceptions.py | 4 + bigframes/operations/semantics.py | 52 ++-- .../experimental/semantic_operators.ipynb | 6 +- .../system/large/operations/test_semantics.py | 257 ++++++++++-------- 5 files changed, 172 insertions(+), 153 deletions(-) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index 33e978e51f..62d9ddcd07 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -66,10 +66,10 @@ class ComputeOptions: engine to handle. However this comes at the cost of increase cost and latency. extra_query_labels (Dict[str, Any], Options): Stores additional custom labels for query configuration. - sem_ops_confirmation_threshold (int, Options): + semmantic_ops_confirmation_threshold (int, optional): Guards against unexepcted processing of large amount of rows by semantic operators. 
If the number of rows exceeds the threshold, the user will be asked to confirm - their operations to resume. The default value is 25. Set the value to None + their operations to resume. The default value is 0. Set the value to None to turn off the guard. """ @@ -78,7 +78,7 @@ class ComputeOptions: extra_query_labels: Dict[str, Any] = dataclasses.field( default_factory=dict, init=False ) - sem_ops_confirmation_threshold: Optional[int] = 25 + semantic_ops_confirmation_threshold: Optional[int] = 0 def assign_extra_query_labels(self, **kwargs: Any) -> None: """ diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 27f3508ff4..3cb5f3665d 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -59,6 +59,10 @@ class QueryComplexityError(RuntimeError): """Query plan is too complex to execute.""" +class OperationAbortedError(RuntimeError): + """Operation is aborted.""" + + class TimeTravelDisabledWarning(Warning): """A query was reattempted without time travel.""" diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index f7f111d599..c0923bd065 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -20,8 +20,8 @@ import numpy as np -import bigframes.core.guid as guid -import bigframes.dtypes as dtypes +from bigframes import dtypes, exceptions +from bigframes.core import guid class Semantics: @@ -122,8 +122,7 @@ def agg( ) work_estimate = len(self._df) * int(max_agg_rows / (max_agg_rows - 1)) - if not self._confirm_operation(work_estimate): - return None + self._confirm_operation(work_estimate) df: bigframes.dataframe.DataFrame = self._df.copy() for column in columns: @@ -300,8 +299,7 @@ def cluster_by( "It must be greater than 1." 
) - if not self._confirm_operation(len(self._df)): - return None + self._confirm_operation(len(self._df)) df: bigframes.dataframe.DataFrame = self._df.copy() embeddings_df = model.predict(df[column]) @@ -374,8 +372,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) - if not self._confirm_operation(len(self._df)): - return None + self._confirm_operation(len(self._df)) df: bigframes.dataframe.DataFrame = self._df[columns].copy() for column in columns: @@ -472,8 +469,7 @@ def map( "details: https://cloud.google.com/vertex-ai/generative-ai/pricing#google_models" ) - if not self._confirm_operation(len(self._df)): - return None + self._confirm_operation(len(self._df)) df: bigframes.dataframe.DataFrame = self._df[columns].copy() for column in columns: @@ -574,8 +570,7 @@ def join( ) work_estimate = len(self._df) * len(other) - if not self._confirm_operation(work_estimate): - return None + self._confirm_operation(work_estimate) left_columns = [] right_columns = [] @@ -689,8 +684,7 @@ def search( if search_column not in self._df.columns: raise ValueError(f"Column `{search_column}` not found") - if not self._confirm_operation(len(self._df)): - return None + self._confirm_operation(len(self._df)) import bigframes.ml.llm as llm @@ -816,8 +810,7 @@ def top_k( ) work_estimate = int(len(self._df) * (len(self._df) - 1) / 2) - if not self._confirm_operation(work_estimate): - return None + self._confirm_operation(work_estimate) df: bigframes.dataframe.DataFrame = self._df[columns].copy() column = columns[0] @@ -1018,8 +1011,7 @@ def sim_join( raise ValueError("top_k must be an integer greater than or equal to 1.") work_estimate = len(self._df) * len(other) - if not self._confirm_operation(work_estimate): - return None + self._confirm_operation(work_estimate) base_table_embedding_column = guid.generate_guid() base_table = self._attach_embedding( @@ -1094,20 +1086,24 
@@ def _validate_model(model): raise TypeError("Model is not GeminiText Generator") @staticmethod - def _confirm_operation(row_count: int) -> bool: - """Returns true only if we can proceed with the operation.""" + def _confirm_operation(row_count: int): + """Raises OperationAbortedError when the confirmation fails""" import bigframes - threshold = bigframes.options.compute.sem_ops_confirmation_threshold + threshold = bigframes.options.compute.semantic_ops_confirmation_threshold if threshold is None or row_count <= threshold: - return True + return # Separate the prompt out. In IDE such VS Code, leaving prompt in the # input function makes it less visible to the end user. - print(f"This operation will process about {row_count} rows. Proceed? [Y/n]") - reply = input().lower() - if reply == "" or reply == "y" or reply == "yes": - return True - - return False + print(f"This operation will process about {row_count} rows.") + print( + """You can raise the confirmation threshold by setting + `bigframes.options.compute.semantic_ops_confirmation_threshold` to a higher value. + To completely turn off the confirmation check, set the threshold to `None`.""" + ) + print("Proceed? [Y/n]") + reply = input().casefold() + if reply not in {"y", "yes", ""}: + raise exceptions.OperationAbortedError("Operation was cancelled.") diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb index 1193f0cb62..7885460558 100644 --- a/notebooks/experimental/semantic_operators.ipynb +++ b/notebooks/experimental/semantic_operators.ipynb @@ -155,7 +155,7 @@ "source": [ "*Note*: semantic operators could be expensive over a large set of data. As a result, our team added this option `bigframes.options.compute.sem_ops_confirmation_threshold` at `version 1.31.0` so that the BigQuery Dataframe will ask for your confirmation if the amount of data to be processed is too large. 
If the amount of rows exceeds your threshold, you will see a prompt for your keyboard input -- 'y' to proceed and 'n' to abort. If you abort the operation, no LLM processing will be done.\n", "\n", - "The default threshold is 25 rows. You are free to adjust the value as needed. To skip confirmation for all operations, set it to `None`. To always have confirmations, set it to `0`." + "The default threshold is 0, which means the operators will always ask for confirmations. You are free to adjust the value as needed. You can also set the threshold to `None` to disable this feature." ] }, { @@ -165,7 +165,7 @@ "outputs": [], "source": [ "if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n", - " bigframes.options.compute.sem_ops_confirmation_threshold = 100" + " bigframes.options.compute.semantic_ops_confirmation_threshold = 100" ] }, { @@ -1934,7 +1934,7 @@ "outputs": [], "source": [ "if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n", - " bigframes.options.compute.sem_ops_confirmation_threshold = 5000" + " bigframes.options.compute.semantic_ops_confirmation_threshold = 5000" ] }, { diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py index 6ac9cfc7f0..edfe00c8df 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from contextlib import nullcontext from unittest.mock import patch import pandas as pd @@ -19,8 +20,7 @@ import pytest import bigframes -import bigframes.dataframe as dataframe -import bigframes.dtypes as dtypes +from bigframes import dataframe, dtypes, exceptions def test_semantics_experiment_off_raise_error(): @@ -47,7 +47,7 @@ def test_semantics_experiment_off_raise_error(): ) def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "Movies": [ @@ -76,12 +76,18 @@ def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column): pandas.testing.assert_series_equal(actual_s, expected_s, check_index_type=False) -@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", False)]) -def test_agg_with_confirmation( - session, gemini_flash_model, reply, should_execute, monkeypatch -): +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_agg_with_confirmation(session, gemini_flash_model, reply, monkeypatch): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 0 + bigframes.options.compute.semantic_ops_confirmation_threshold = 0 df = dataframe.DataFrame( data={ "Movies": [ @@ -98,22 +104,17 @@ def test_agg_with_confirmation( session=session, ) instruction = "Find the shared first name of actors in {Movies}. One word answer." 
- monkeypatch.setattr("builtins.input", lambda _: reply) + monkeypatch.setattr("builtins.input", lambda: reply) - result = df.semantics.agg( + df.semantics.agg( instruction, model=gemini_flash_model, ) - if should_execute: - assert result is not None - else: - assert result is None - def test_agg_w_int_column(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "Movies": [ @@ -157,7 +158,7 @@ def test_agg_w_int_column(session, gemini_flash_model): ) def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "Movies": [ @@ -186,7 +187,7 @@ def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): ) def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_column): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "Movies": [ @@ -210,7 +211,7 @@ def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_colu ) def test_cluster_by(session, text_embedding_generator, n_clusters): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( ( { @@ -240,12 +241,20 @@ def test_cluster_by(session, text_embedding_generator, n_clusters): assert len(result[output_column].unique()) <= n_clusters -@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", 
False)]) +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) def test_cluster_by_with_confirmation( - session, text_embedding_generator, reply, should_execute, monkeypatch + session, text_embedding_generator, reply, monkeypatch ): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 0 + bigframes.options.compute.semantic_ops_confirmation_threshold = 0 df = dataframe.DataFrame( ( { @@ -262,24 +271,19 @@ def test_cluster_by_with_confirmation( ), session=session, ) - monkeypatch.setattr("builtins.input", lambda _: reply) + monkeypatch.setattr("builtins.input", lambda: reply) - result = df.semantics.cluster_by( + df.semantics.cluster_by( "Item", "cluster id", text_embedding_generator, n_clusters=2, ) - if should_execute: - assert result is not None - else: - assert result is None - def test_cluster_by_invalid_column(session, text_embedding_generator): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( ({"Product": ["Smartphone", "Laptop", "Coffee Maker", "T-shirt", "Jeans"]}), @@ -298,7 +302,7 @@ def test_cluster_by_invalid_column(session, text_embedding_generator): def test_cluster_by_invalid_model(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( ({"Product": ["Smartphone", "Laptop", "Coffee Maker", "T-shirt", "Jeans"]}), @@ -317,7 +321,7 @@ def test_cluster_by_invalid_model(session, gemini_flash_model): def test_filter(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - 
bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "country": ["USA", "Germany"], @@ -339,12 +343,18 @@ def test_filter(session, gemini_flash_model): ) -@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", False)]) -def test_filter_with_confirmation( - session, gemini_flash_model, reply, should_execute, monkeypatch -): +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_filter_with_confirmation(session, gemini_flash_model, reply, monkeypatch): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 0 + bigframes.options.compute.semantic_ops_confirmation_threshold = 0 df = dataframe.DataFrame( data={ "country": ["USA", "Germany"], @@ -353,21 +363,16 @@ def test_filter_with_confirmation( }, session=session, ) - monkeypatch.setattr("builtins.input", lambda _: reply) + monkeypatch.setattr("builtins.input", lambda: reply) - result = df.semantics.filter( + df.semantics.filter( "{city} is the capital of {country} in {year}", gemini_flash_model ) - if should_execute: - assert result is not None - else: - assert result is None - def test_filter_single_column_reference(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, session=session, @@ -405,7 +410,7 @@ def test_filter_single_column_reference(session, gemini_flash_model): ) def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - 
bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame({"id": [1, 2], "city": ["Seattle", "Berlin"]}) with pytest.raises(ValueError): @@ -414,7 +419,7 @@ def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model) def test_filter_invalid_model_raise_error(): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} ) @@ -425,7 +430,7 @@ def test_filter_invalid_model_raise_error(): def test_map(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "ingredient_1": ["Burger Bun", "Soy Bean"], @@ -460,12 +465,18 @@ def test_map(session, gemini_flash_model): ) -@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", False)]) -def test_map_with_confirmation( - session, gemini_flash_model, reply, should_execute, monkeypatch -): +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_map_with_confirmation(session, gemini_flash_model, reply, monkeypatch): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 0 + bigframes.options.compute.semantic_ops_confirmation_threshold = 0 df = dataframe.DataFrame( data={ "ingredient_1": ["Burger Bun", "Soy Bean"], @@ -474,19 +485,14 @@ def test_map_with_confirmation( }, session=session, ) - monkeypatch.setattr("builtins.input", lambda _: reply) + monkeypatch.setattr("builtins.input", lambda: reply) - result = 
df.semantics.map( + df.semantics.map( "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", "food", gemini_flash_model, ) - if should_execute: - assert result is not None - else: - assert result is None - @pytest.mark.parametrize( "instruction", @@ -510,7 +516,7 @@ def test_map_with_confirmation( ) def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "id": [1, 2], @@ -525,7 +531,7 @@ def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): def test_map_invalid_model_raise_error(): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "ingredient_1": ["Burger Bun", "Soy Bean"], @@ -557,7 +563,7 @@ def test_map_invalid_model_raise_error(): ) def test_join(instruction, session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 cities = dataframe.DataFrame( data={ "city": ["Seattle", "Berlin"], @@ -590,12 +596,18 @@ def test_join(instruction, session, gemini_flash_model): ) -@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", False)]) -def test_join_with_confirmation( - session, gemini_flash_model, reply, should_execute, monkeypatch -): +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) +def test_join_with_confirmation(session, gemini_flash_model, reply, monkeypatch): 
bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 0 + bigframes.options.compute.semantic_ops_confirmation_threshold = 0 cities = dataframe.DataFrame( data={ "city": ["Seattle", "Berlin"], @@ -606,23 +618,18 @@ def test_join_with_confirmation( data={"country": ["USA", "UK", "Germany"]}, session=session, ) - monkeypatch.setattr("builtins.input", lambda _: reply) + monkeypatch.setattr("builtins.input", lambda: reply) - result = cities.semantics.join( + cities.semantics.join( countries, "{city} is in {country}", gemini_flash_model, ) - if should_execute: - assert result is not None - else: - assert result is None - def test_self_join(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 animals = dataframe.DataFrame( data={ "animal": ["spider", "capybara"], @@ -653,7 +660,7 @@ def test_self_join(session, gemini_flash_model): def test_join_data_too_large_raise_error(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 cities = dataframe.DataFrame( data={ "city": ["Seattle", "Berlin"], @@ -697,7 +704,7 @@ def test_join_invalid_instruction_raise_error( instruction, error_pattern, gemini_flash_model ): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( {"city": ["Seattle", "Berlin"], "country": ["USA", "Germany"]} ) @@ -714,7 +721,7 @@ def test_join_invalid_instruction_raise_error( def test_join_invalid_model_raise_error(): bigframes.options.experiments.semantic_operators = True - 
bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 cities = dataframe.DataFrame({"city": ["Seattle", "Berlin"]}) countries = dataframe.DataFrame({"country": ["USA", "UK", "Germany"]}) @@ -731,7 +738,7 @@ def test_join_invalid_model_raise_error(): ) def test_search(session, text_embedding_generator, score_column): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, @@ -761,34 +768,37 @@ def test_search(session, text_embedding_generator, score_column): assert score_column in actual_result.columns -@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", False)]) +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) def test_search_with_confirmation( - session, text_embedding_generator, reply, should_execute, monkeypatch + session, text_embedding_generator, reply, monkeypatch ): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 0 + bigframes.options.compute.semantic_ops_confirmation_threshold = 0 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, ) - monkeypatch.setattr("builtins.input", lambda _: reply) + monkeypatch.setattr("builtins.input", lambda: reply) - result = df.semantics.search( + df.semantics.search( "creatures", "monkey", top_k=2, model=text_embedding_generator, ) - if should_execute: - assert result is not None - else: - assert result is None - def test_search_invalid_column_raises_error(session, text_embedding_generator): 
bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, @@ -802,7 +812,7 @@ def test_search_invalid_column_raises_error(session, text_embedding_generator): def test_search_invalid_model_raises_error(session): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, @@ -814,7 +824,7 @@ def test_search_invalid_model_raises_error(session): def test_search_invalid_top_k_raises_error(session, text_embedding_generator): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, @@ -835,7 +845,7 @@ def test_search_invalid_top_k_raises_error(session, text_embedding_generator): ) def test_sim_join(session, text_embedding_generator, score_column): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -870,12 +880,20 @@ def test_sim_join(session, text_embedding_generator, score_column): assert score_column in actual_result.columns -@pytest.mark.parametrize(("reply", "should_execute"), [("y", True), ("n", False)]) +@pytest.mark.parametrize( + ("reply"), + [ + pytest.param("y"), + pytest.param( + "n", 
marks=pytest.mark.xfail(raises=exceptions.OperationAbortedError) + ), + ], +) def test_sim_join_with_confirmation( - session, text_embedding_generator, reply, should_execute, monkeypatch + session, text_embedding_generator, reply, monkeypatch ): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 0 + bigframes.options.compute.semantic_ops_confirmation_threshold = 0 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -884,9 +902,9 @@ def test_sim_join_with_confirmation( data={"creatures": ["dog", "tuna"]}, session=session, ) - monkeypatch.setattr("builtins.input", lambda _: reply) + monkeypatch.setattr("builtins.input", lambda: reply) - result = df1.semantics.sim_join( + df1.semantics.sim_join( df2, left_on="creatures", right_on="creatures", @@ -894,11 +912,6 @@ def test_sim_join_with_confirmation( top_k=1, ) - if should_execute: - assert result is not None - else: - assert result is None - @pytest.mark.parametrize( ("left_on", "right_on"), @@ -911,7 +924,7 @@ def test_sim_join_invalid_column_raises_error( session, text_embedding_generator, left_on, right_on ): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -929,7 +942,7 @@ def test_sim_join_invalid_column_raises_error( def test_sim_join_invalid_model_raises_error(session): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -947,7 +960,7 @@ def test_sim_join_invalid_model_raises_error(session): def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): 
bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -969,7 +982,7 @@ def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): def test_sim_join_data_too_large_raises_error(session, text_embedding_generator): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -1016,7 +1029,7 @@ def test_sim_join_data_too_large_raises_error(session, text_embedding_generator) ) def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( { "Animals": ["Dog", "Cat", "Bird", "Horse"], @@ -1029,7 +1042,7 @@ def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): def test_top_k_invalid_k_raise_error(gemini_flash_model): bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.sem_ops_confirmation_threshold = 10 + bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame({"Animals": ["Dog", "Cat", "Bird", "Horse"]}) with pytest.raises(ValueError): df.semantics.top_k( @@ -1044,7 +1057,7 @@ def test_confirm_operation__below_threshold_do_not_confirm(mock_input): bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame({}) - bigframes.options.compute.sem_ops_confirmation_threshold = 3 + bigframes.options.compute.semantic_ops_confirmation_threshold = 3 df.semantics._confirm_operation(1) 
mock_input.assert_not_called() @@ -1055,22 +1068,28 @@ def test_confirm_operation__threshold_is_none_do_not_confirm(mock_input): bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame({}) - bigframes.options.compute.sem_ops_confirmation_threshold = None + bigframes.options.compute.semantic_ops_confirmation_threshold = None df.semantics._confirm_operation(100) mock_input.assert_not_called() @pytest.mark.parametrize( - ("reply", "expected"), - [("y", True), ("yes", True), ("", True), ("n", False), ("something", False)], + ("reply", "expectation"), + [ + ("y", nullcontext()), + ("yes", nullcontext()), + ("", nullcontext()), + ("n", pytest.raises(exceptions.OperationAbortedError)), + ("something", pytest.raises(exceptions.OperationAbortedError)), + ], ) -def test_confirm_operation__above_threshold_confirm(reply, expected, monkeypatch): +def test_confirm_operation__above_threshold_confirm(reply, expectation, monkeypatch): bigframes.options.experiments.semantic_operators = True - monkeypatch.setattr("builtins.input", lambda _: reply) + monkeypatch.setattr("builtins.input", lambda: reply) df = dataframe.DataFrame({}) - bigframes.options.compute.sem_ops_confirmation_threshold = 3 - actual = df.semantics._confirm_operation(4) + bigframes.options.compute.semantic_ops_confirmation_threshold = 3 - assert actual is expected + with expectation as e: + assert df.semantics._confirm_operation(4) == e From 6b824516dd2df205f0327acb13636c5b6ec6b15e Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 3 Jan 2025 18:51:00 +0000 Subject: [PATCH 06/12] fix prompt format --- bigframes/operations/semantics.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index c0923bd065..61330a154f 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -1099,9 +1099,7 @@ def _confirm_operation(row_count: int): # input function makes it less visible to 
the end user. print(f"This operation will process about {row_count} rows.") print( - """You can raise the confirmation threshold by setting - `bigframes.options.compute.semantic_ops_confirmation_threshold` to a higher value. - To completely turn off the confirmation check, set the threshold to `None`.""" + "You can raise the confirmation threshold by setting `bigframes.options.compute.semantic_ops_confirmation_threshold` to a higher value. To completely turn off the confirmation check, set the threshold to `None`." ) print("Proceed? [Y/n]") reply = input().casefold() From 3ef109d00ca690e58ffc2da3a228f24a9a6ece12 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 3 Jan 2025 19:02:09 +0000 Subject: [PATCH 07/12] add sem ops autofail option --- bigframes/_config/compute_options.py | 4 ++++ bigframes/operations/semantics.py | 5 +++++ notebooks/experimental/semantic_operators.ipynb | 17 +++++++++++++++++ tests/system/large/operations/test_semantics.py | 14 ++++++++++++++ 4 files changed, 40 insertions(+) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index 62d9ddcd07..4561ea6e67 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -71,6 +71,9 @@ class ComputeOptions: If the number of rows exceeds the threshold, the user will be asked to confirm their operations to resume. The default value is 0. Set the value to None to turn off the guard. + semmantic_ops_confirmation_threshold (bool): + Guards against unexepcted processing of large amount of rows by semantic operators. + When set to True, the operation automatically fails without asking for user inputs. 
""" maximum_bytes_billed: Optional[int] = None @@ -79,6 +82,7 @@ class ComputeOptions: default_factory=dict, init=False ) semantic_ops_confirmation_threshold: Optional[int] = 0 + semantic_ops_threshold_autofail = False def assign_extra_query_labels(self, **kwargs: Any) -> None: """ diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 61330a154f..5777bf6c31 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -1095,6 +1095,11 @@ def _confirm_operation(row_count: int): if threshold is None or row_count <= threshold: return + if bigframes.options.compute.semantic_ops_threshold_autofail: + raise exceptions.OperationAbortedError( + f"Operation was cancelled because your work estimate is {row_count} rows, which exceeds the threshold {threshold} rows." + ) + # Separate the prompt out. In IDE such VS Code, leaving prompt in the # input function makes it less visible to the end user. print(f"This operation will process about {row_count} rows.") diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb index 7885460558..d07a5d7911 100644 --- a/notebooks/experimental/semantic_operators.ipynb +++ b/notebooks/experimental/semantic_operators.ipynb @@ -168,6 +168,23 @@ " bigframes.options.compute.semantic_ops_confirmation_threshold = 100" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you would like your operations to fail automatically when the data is too large, set `bigframes.options.compute.semantic_ops_threshold_autofail` to `True`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# if Version(bigframes.__version__) >= Version(\"1.31.0\"):\n", + "# bigframes.options.compute.semantic_ops_threshold_autofail = True" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/tests/system/large/operations/test_semantics.py 
b/tests/system/large/operations/test_semantics.py index edfe00c8df..a8356d70f6 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -1074,6 +1074,20 @@ def test_confirm_operation__threshold_is_none_do_not_confirm(mock_input): mock_input.assert_not_called() +@patch("builtins.input", return_value="") +def test_confirm_operation__threshold_autofail_do_not_confirm(mock_input): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame({}) + + bigframes.options.compute.semantic_ops_confirmation_threshold = 1 + bigframes.options.compute.semantic_ops_threshold_autofail = True + + with pytest.raises(exceptions.OperationAbortedError): + df.semantics._confirm_operation(100) + + mock_input.assert_not_called() + + @pytest.mark.parametrize( ("reply", "expectation"), [ From 42ad24449da5708f302bee2606ad643c0055fd4b Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 3 Jan 2025 19:05:52 +0000 Subject: [PATCH 08/12] fix doc --- bigframes/_config/compute_options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/_config/compute_options.py b/bigframes/_config/compute_options.py index 4561ea6e67..21b41eb185 100644 --- a/bigframes/_config/compute_options.py +++ b/bigframes/_config/compute_options.py @@ -71,7 +71,7 @@ class ComputeOptions: If the number of rows exceeds the threshold, the user will be asked to confirm their operations to resume. The default value is 0. Set the value to None to turn off the guard. - semmantic_ops_confirmation_threshold (bool): + semantic_ops_threshold_autofail (bool): Guards against unexepcted processing of large amount of rows by semantic operators. When set to True, the operation automatically fails without asking for user inputs. 
""" From 55c5ab6134fd212174fe16f7b1d87b6be69f79ba Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 3 Jan 2025 21:49:46 +0000 Subject: [PATCH 09/12] use option_context to set options in tests --- .../system/large/operations/test_semantics.py | 562 +++++++++++------- 1 file changed, 358 insertions(+), 204 deletions(-) diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py index a8356d70f6..fc4fef7677 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -22,14 +22,19 @@ import bigframes from bigframes import dataframe, dtypes, exceptions +EXPERIMENT_OPTION = "experiments.semantic_operators" +THRESHOLD_OPTION = "compute.semantic_ops_confirmation_threshold" + def test_semantics_experiment_off_raise_error(): - bigframes.options.experiments.semantic_operators = False + # bigframes.options.experiments.semantic_operators = False df = dataframe.DataFrame( {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} ) - with pytest.raises(NotImplementedError): + with bigframes.option_context(EXPERIMENT_OPTION, False), pytest.raises( + NotImplementedError + ): df.semantics @@ -46,8 +51,6 @@ def test_semantics_experiment_off_raise_error(): ], ) def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "Movies": [ @@ -64,12 +67,19 @@ def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column): session=session, ) instruction = "Find the shared first name of actors in {Movies}. One word answer." 
- actual_s = df.semantics.agg( - instruction, - model=gemini_flash_model, - max_agg_rows=max_agg_rows, - cluster_column=cluster_column, - ).to_pandas() + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + actual_s = df.semantics.agg( + instruction, + model=gemini_flash_model, + max_agg_rows=max_agg_rows, + cluster_column=cluster_column, + ).to_pandas() expected_s = pd.Series(["Leonardo \n"], dtype=dtypes.STRING_DTYPE) expected_s.name = "Movies" @@ -86,8 +96,6 @@ def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column): ], ) def test_agg_with_confirmation(session, gemini_flash_model, reply, monkeypatch): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 0 df = dataframe.DataFrame( data={ "Movies": [ @@ -106,15 +114,19 @@ def test_agg_with_confirmation(session, gemini_flash_model, reply, monkeypatch): instruction = "Find the shared first name of actors in {Movies}. One word answer." monkeypatch.setattr("builtins.input", lambda: reply) - df.semantics.agg( - instruction, - model=gemini_flash_model, - ) + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.semantics.agg( + instruction, + model=gemini_flash_model, + ) def test_agg_w_int_column(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "Movies": [ @@ -126,10 +138,17 @@ def test_agg_w_int_column(session, gemini_flash_model): session=session, ) instruction = "Find the {Years} Leonardo DiCaprio acted in the most movies. Answer with the year only." 
- actual_s = df.semantics.agg( - instruction, - model=gemini_flash_model, - ).to_pandas() + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_s = df.semantics.agg( + instruction, + model=gemini_flash_model, + ).to_pandas() expected_s = pd.Series(["2013 \n"], dtype=dtypes.STRING_DTYPE) expected_s.name = "Years" @@ -157,8 +176,6 @@ def test_agg_w_int_column(session, gemini_flash_model): ], ) def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "Movies": [ @@ -169,7 +186,14 @@ def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): "Year": [1997, 2013, 2023], }, ) - df.semantics.agg(instruction, gemini_flash_model) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df.semantics.agg(instruction, gemini_flash_model) @pytest.mark.parametrize( @@ -186,8 +210,6 @@ def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): ], ) def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_column): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "Movies": [ @@ -199,7 +221,14 @@ def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_colu }, ) instruction = "Find the shared first name of actors in {Movies}. One word answer." 
- df.semantics.agg(instruction, gemini_flash_model, cluster_column=cluster_column) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df.semantics.agg(instruction, gemini_flash_model, cluster_column=cluster_column) @pytest.mark.parametrize( @@ -210,8 +239,6 @@ def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_colu ], ) def test_cluster_by(session, text_embedding_generator, n_clusters): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( ( { @@ -229,12 +256,19 @@ def test_cluster_by(session, text_embedding_generator, n_clusters): session=session, ) output_column = "cluster id" - result = df.semantics.cluster_by( - "Item", - output_column, - text_embedding_generator, - n_clusters=n_clusters, - ) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + result = df.semantics.cluster_by( + "Item", + output_column, + text_embedding_generator, + n_clusters=n_clusters, + ) assert output_column in result # In rare cases, it's possible to have fewer than K clusters due to randomness. 
@@ -253,8 +287,6 @@ def test_cluster_by(session, text_embedding_generator, n_clusters): def test_cluster_by_with_confirmation( session, text_embedding_generator, reply, monkeypatch ): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 0 df = dataframe.DataFrame( ( { @@ -273,25 +305,33 @@ def test_cluster_by_with_confirmation( ) monkeypatch.setattr("builtins.input", lambda: reply) - df.semantics.cluster_by( - "Item", - "cluster id", - text_embedding_generator, - n_clusters=2, - ) + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.semantics.cluster_by( + "Item", + "cluster id", + text_embedding_generator, + n_clusters=2, + ) def test_cluster_by_invalid_column(session, text_embedding_generator): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 - df = dataframe.DataFrame( ({"Product": ["Smartphone", "Laptop", "Coffee Maker", "T-shirt", "Jeans"]}), session=session, ) - output_column = "cluster id" - with pytest.raises(ValueError): + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df.semantics.cluster_by( "unknown_column", output_column, @@ -301,16 +341,18 @@ def test_cluster_by_invalid_column(session, text_embedding_generator): def test_cluster_by_invalid_model(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 - df = dataframe.DataFrame( ({"Product": ["Smartphone", "Laptop", "Coffee Maker", "T-shirt", "Jeans"]}), session=session, ) - output_column = "cluster id" - with pytest.raises(TypeError): + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): df.semantics.cluster_by( "Product", output_column, @@ -320,8 +362,6 @@ def 
test_cluster_by_invalid_model(session, gemini_flash_model): def test_filter(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "country": ["USA", "Germany"], @@ -331,9 +371,15 @@ def test_filter(session, gemini_flash_model): session=session, ) - actual_df = df.semantics.filter( - "{city} is the capital of {country} in {year}", gemini_flash_model - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.semantics.filter( + "{city} is the capital of {country} in {year}", gemini_flash_model + ).to_pandas() expected_df = pd.DataFrame( {"country": ["Germany"], "city": ["Berlin"], "year": [2024]}, index=[1] @@ -353,8 +399,6 @@ def test_filter(session, gemini_flash_model): ], ) def test_filter_with_confirmation(session, gemini_flash_model, reply, monkeypatch): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 0 df = dataframe.DataFrame( data={ "country": ["USA", "Germany"], @@ -365,18 +409,28 @@ def test_filter_with_confirmation(session, gemini_flash_model, reply, monkeypatc ) monkeypatch.setattr("builtins.input", lambda: reply) - df.semantics.filter( - "{city} is the capital of {country} in {year}", gemini_flash_model - ) + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.semantics.filter( + "{city} is the capital of {country} in {year}", gemini_flash_model + ) def test_filter_single_column_reference(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 - df = dataframe.DataFrame( - data={"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, - session=session, - ) + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + 
THRESHOLD_OPTION, + 10, + ): + df = dataframe.DataFrame( + data={"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, + session=session, + ) actual_df = df.semantics.filter( "{country} is in Europe", gemini_flash_model @@ -409,28 +463,32 @@ def test_filter_single_column_reference(session, gemini_flash_model): ], ) def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame({"id": [1, 2], "city": ["Seattle", "Berlin"]}) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df.semantics.filter(instruction, gemini_flash_model) def test_filter_invalid_model_raise_error(): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} ) - with pytest.raises(TypeError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): df.semantics.filter("{city} is the capital of {country}", None) def test_map(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "ingredient_1": ["Burger Bun", "Soy Bean"], @@ -440,11 +498,17 @@ def test_map(session, gemini_flash_model): session=session, ) - actual_df = df.semantics.map( - "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? 
One word only.", - "food", - gemini_flash_model, - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = df.semantics.map( + "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", + "food", + gemini_flash_model, + ).to_pandas() # Result sanitation actual_df["food"] = actual_df["food"].str.strip().str.lower() @@ -475,8 +539,6 @@ def test_map(session, gemini_flash_model): ], ) def test_map_with_confirmation(session, gemini_flash_model, reply, monkeypatch): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 0 df = dataframe.DataFrame( data={ "ingredient_1": ["Burger Bun", "Soy Bean"], @@ -487,11 +549,17 @@ def test_map_with_confirmation(session, gemini_flash_model, reply, monkeypatch): ) monkeypatch.setattr("builtins.input", lambda: reply) - df.semantics.map( - "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", - "food", - gemini_flash_model, - ) + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.semantics.map( + "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? 
One word only.", + "food", + gemini_flash_model, + ) @pytest.mark.parametrize( @@ -515,8 +583,6 @@ def test_map_with_confirmation(session, gemini_flash_model, reply, monkeypatch): ], ) def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "id": [1, 2], @@ -525,13 +591,16 @@ def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): } ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df.semantics.map(instruction, "food", gemini_flash_model) def test_map_invalid_model_raise_error(): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={ "ingredient_1": ["Burger Bun", "Soy Bean"], @@ -539,7 +608,12 @@ def test_map_invalid_model_raise_error(): }, ) - with pytest.raises(TypeError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): df.semantics.map( "What is the food made from {ingredient_1} and {ingredient_2}? 
One word only.", "food", @@ -562,8 +636,6 @@ def test_map_invalid_model_raise_error(): ], ) def test_join(instruction, session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 cities = dataframe.DataFrame( data={ "city": ["Seattle", "Berlin"], @@ -575,11 +647,17 @@ def test_join(instruction, session, gemini_flash_model): session=session, ) - actual_df = cities.semantics.join( - countries, - instruction, - gemini_flash_model, - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = cities.semantics.join( + countries, + instruction, + gemini_flash_model, + ).to_pandas() expected_df = pd.DataFrame( { @@ -606,8 +684,6 @@ def test_join(instruction, session, gemini_flash_model): ], ) def test_join_with_confirmation(session, gemini_flash_model, reply, monkeypatch): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 0 cities = dataframe.DataFrame( data={ "city": ["Seattle", "Berlin"], @@ -620,16 +696,20 @@ def test_join_with_confirmation(session, gemini_flash_model, reply, monkeypatch) ) monkeypatch.setattr("builtins.input", lambda: reply) - cities.semantics.join( - countries, - "{city} is in {country}", - gemini_flash_model, - ) + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + cities.semantics.join( + countries, + "{city} is in {country}", + gemini_flash_model, + ) def test_self_join(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 animals = dataframe.DataFrame( data={ "animal": ["spider", "capybara"], @@ -637,11 +717,17 @@ def test_self_join(session, gemini_flash_model): session=session, ) - actual_df = animals.semantics.join( - animals, - "{left.animal} is heavier than {right.animal}", 
- gemini_flash_model, - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_df = animals.semantics.join( + animals, + "{left.animal} is heavier than {right.animal}", + gemini_flash_model, + ).to_pandas() expected_df = pd.DataFrame( { @@ -659,8 +745,6 @@ def test_self_join(session, gemini_flash_model): def test_join_data_too_large_raise_error(session, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 cities = dataframe.DataFrame( data={ "city": ["Seattle", "Berlin"], @@ -672,7 +756,12 @@ def test_join_data_too_large_raise_error(session, gemini_flash_model): session=session, ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): cities.semantics.join( countries, "{city} belongs to {country}", gemini_flash_model, max_rows=1 ) @@ -703,8 +792,6 @@ def test_join_data_too_large_raise_error(session, gemini_flash_model): def test_join_invalid_instruction_raise_error( instruction, error_pattern, gemini_flash_model ): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( {"city": ["Seattle", "Berlin"], "country": ["USA", "Germany"]} ) @@ -715,17 +802,25 @@ def test_join_invalid_instruction_raise_error( } ) - with pytest.raises(ValueError, match=error_pattern): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError, match=error_pattern): df1.semantics.join(df2, instruction, gemini_flash_model) def test_join_invalid_model_raise_error(): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 cities = dataframe.DataFrame({"city": ["Seattle", "Berlin"]}) countries = 
dataframe.DataFrame({"country": ["USA", "UK", "Germany"]}) - with pytest.raises(TypeError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): cities.semantics.join(countries, "{city} is in {country}", None) @@ -737,20 +832,24 @@ def test_join_invalid_model_raise_error(): ], ) def test_search(session, text_embedding_generator, score_column): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, ) - actual_result = df.semantics.search( - "creatures", - "monkey", - top_k=2, - model=text_embedding_generator, - score_column=score_column, - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_result = df.semantics.search( + "creatures", + "monkey", + top_k=2, + model=text_embedding_generator, + score_column=score_column, + ).to_pandas() expected_result = pd.Series( ["baboons", "chimpanzee"], index=[2, 4], name="creatures" @@ -780,57 +879,70 @@ def test_search(session, text_embedding_generator, score_column): def test_search_with_confirmation( session, text_embedding_generator, reply, monkeypatch ): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 0 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, ) monkeypatch.setattr("builtins.input", lambda: reply) - df.semantics.search( - "creatures", - "monkey", - top_k=2, - model=text_embedding_generator, - ) + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df.semantics.search( + "creatures", + "monkey", + top_k=2, + model=text_embedding_generator, + ) def test_search_invalid_column_raises_error(session, 
text_embedding_generator): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df.semantics.search( "whatever", "monkey", top_k=2, model=text_embedding_generator ) def test_search_invalid_model_raises_error(session): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, ) - with pytest.raises(TypeError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): df.semantics.search("creatures", "monkey", top_k=2, model=None) def test_search_invalid_top_k_raises_error(session, text_embedding_generator): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, session=session, ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df.semantics.search( "creatures", "monkey", top_k=0, model=text_embedding_generator ) @@ -844,8 +956,6 @@ def test_search_invalid_top_k_raises_error(session, text_embedding_generator): ], ) def test_sim_join(session, text_embedding_generator, score_column): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -855,14 
+965,20 @@ def test_sim_join(session, text_embedding_generator, score_column): session=session, ) - actual_result = df1.semantics.sim_join( - df2, - left_on="creatures", - right_on="creatures", - model=text_embedding_generator, - top_k=1, - score_column=score_column, - ).to_pandas() + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + actual_result = df1.semantics.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + top_k=1, + score_column=score_column, + ).to_pandas() expected_result = pd.DataFrame( {"creatures": ["salmon", "cat"], "creatures_1": ["tuna", "dog"]} @@ -892,8 +1008,6 @@ def test_sim_join(session, text_embedding_generator, score_column): def test_sim_join_with_confirmation( session, text_embedding_generator, reply, monkeypatch ): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 0 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -904,13 +1018,19 @@ def test_sim_join_with_confirmation( ) monkeypatch.setattr("builtins.input", lambda: reply) - df1.semantics.sim_join( - df2, - left_on="creatures", - right_on="creatures", - model=text_embedding_generator, - top_k=1, - ) + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 0, + ): + df1.semantics.sim_join( + df2, + left_on="creatures", + right_on="creatures", + model=text_embedding_generator, + top_k=1, + ) @pytest.mark.parametrize( @@ -923,8 +1043,6 @@ def test_sim_join_with_confirmation( def test_sim_join_invalid_column_raises_error( session, text_embedding_generator, left_on, right_on ): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -934,15 +1052,18 @@ def test_sim_join_invalid_column_raises_error( 
session=session, ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df1.semantics.sim_join( df2, left_on=left_on, right_on=right_on, model=text_embedding_generator ) def test_sim_join_invalid_model_raises_error(session): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -952,15 +1073,18 @@ def test_sim_join_invalid_model_raises_error(session): session=session, ) - with pytest.raises(TypeError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(TypeError): df1.semantics.sim_join( df2, left_on="creatures", right_on="creatures", model=None ) def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -970,7 +1094,12 @@ def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): session=session, ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df1.semantics.sim_join( df2, left_on="creatures", @@ -981,8 +1110,6 @@ def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): def test_sim_join_data_too_large_raises_error(session, text_embedding_generator): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df1 = dataframe.DataFrame( data={"creatures": ["salmon", "cat"]}, session=session, @@ -992,7 +1119,12 @@ def test_sim_join_data_too_large_raises_error(session, text_embedding_generator) 
session=session, ) - with pytest.raises(ValueError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df1.semantics.sim_join( df2, left_on="creatures", @@ -1028,8 +1160,6 @@ def test_sim_join_data_too_large_raises_error(session, text_embedding_generator) ], ) def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame( { "Animals": ["Dog", "Cat", "Bird", "Horse"], @@ -1037,14 +1167,25 @@ def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): "index": ["a", "b", "c", "d"], } ) - df.semantics.top_k(instruction, model=gemini_flash_model, k=2) + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df.semantics.top_k(instruction, model=gemini_flash_model, k=2) def test_top_k_invalid_k_raise_error(gemini_flash_model): - bigframes.options.experiments.semantic_operators = True - bigframes.options.compute.semantic_ops_confirmation_threshold = 10 df = dataframe.DataFrame({"Animals": ["Dog", "Cat", "Bird", "Horse"]}) - with pytest.raises(ValueError): + + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 10, + ), pytest.raises(ValueError): df.semantics.top_k( "{Animals} are more popular as pets", gemini_flash_model, @@ -1054,35 +1195,46 @@ def test_top_k_invalid_k_raise_error(gemini_flash_model): @patch("builtins.input", return_value="") def test_confirm_operation__below_threshold_do_not_confirm(mock_input): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame({}) - bigframes.options.compute.semantic_ops_confirmation_threshold = 3 - df.semantics._confirm_operation(1) + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 3, + ): + df.semantics._confirm_operation(1) 
mock_input.assert_not_called() @patch("builtins.input", return_value="") def test_confirm_operation__threshold_is_none_do_not_confirm(mock_input): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame({}) - bigframes.options.compute.semantic_ops_confirmation_threshold = None - df.semantics._confirm_operation(100) + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + None, + ): + df.semantics._confirm_operation(100) mock_input.assert_not_called() @patch("builtins.input", return_value="") def test_confirm_operation__threshold_autofail_do_not_confirm(mock_input): - bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame({}) - bigframes.options.compute.semantic_ops_confirmation_threshold = 1 - bigframes.options.compute.semantic_ops_threshold_autofail = True - - with pytest.raises(exceptions.OperationAbortedError): + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 1, + "compute.semanti_ops_threshold_autofail", + True, + ), pytest.raises(exceptions.OperationAbortedError): df.semantics._confirm_operation(100) mock_input.assert_not_called() @@ -1099,11 +1251,13 @@ def test_confirm_operation__threshold_autofail_do_not_confirm(mock_input): ], ) def test_confirm_operation__above_threshold_confirm(reply, expectation, monkeypatch): - bigframes.options.experiments.semantic_operators = True monkeypatch.setattr("builtins.input", lambda: reply) df = dataframe.DataFrame({}) - bigframes.options.compute.semantic_ops_confirmation_threshold = 3 - - with expectation as e: + with bigframes.option_context( + EXPERIMENT_OPTION, + True, + THRESHOLD_OPTION, + 3, + ), expectation as e: assert df.semantics._confirm_operation(4) == e From 7c216d8afd8a99011d763c4ca464a39d326a4337 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 3 Jan 2025 21:51:11 +0000 Subject: [PATCH 10/12] remove redundant code --- tests/system/large/operations/test_semantics.py | 1 - 1 file changed, 1 
deletion(-) diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py index fc4fef7677..fa18b9bd7c 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -27,7 +27,6 @@ def test_semantics_experiment_off_raise_error(): - # bigframes.options.experiments.semantic_operators = False df = dataframe.DataFrame( {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} ) From 0e5d59867cb9d2b7df97a53322ba0c261e73cc51 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 3 Jan 2025 21:54:35 +0000 Subject: [PATCH 11/12] fix tests --- .../system/large/operations/test_semantics.py | 41 ++++--------------- 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py index fa18b9bd7c..20219ef46e 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -420,20 +420,20 @@ def test_filter_with_confirmation(session, gemini_flash_model, reply, monkeypatc def test_filter_single_column_reference(session, gemini_flash_model): + df = dataframe.DataFrame( + data={"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, + session=session, + ) + with bigframes.option_context( EXPERIMENT_OPTION, True, THRESHOLD_OPTION, 10, ): - df = dataframe.DataFrame( - data={"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, - session=session, - ) - - actual_df = df.semantics.filter( - "{country} is in Europe", gemini_flash_model - ).to_pandas() + actual_df = df.semantics.filter( + "{country} is in Europe", gemini_flash_model + ).to_pandas() expected_df = pd.DataFrame({"country": ["Germany"], "city": ["Berlin"]}, index=[1]) pandas.testing.assert_frame_equal( @@ -743,29 +743,6 @@ def test_self_join(session, gemini_flash_model): ) -def test_join_data_too_large_raise_error(session, gemini_flash_model): - cities = 
dataframe.DataFrame( - data={ - "city": ["Seattle", "Berlin"], - }, - session=session, - ) - countries = dataframe.DataFrame( - data={"country": ["USA", "UK", "Germany"]}, - session=session, - ) - - with bigframes.option_context( - EXPERIMENT_OPTION, - True, - THRESHOLD_OPTION, - 10, - ), pytest.raises(ValueError): - cities.semantics.join( - countries, "{city} belongs to {country}", gemini_flash_model, max_rows=1 - ) - - @pytest.mark.parametrize( ("instruction", "error_pattern"), [ @@ -1231,7 +1208,7 @@ def test_confirm_operation__threshold_autofail_do_not_confirm(mock_input): True, THRESHOLD_OPTION, 1, - "compute.semanti_ops_threshold_autofail", + "compute.semantic_ops_threshold_autofail", True, ), pytest.raises(exceptions.OperationAbortedError): df.semantics._confirm_operation(100) From da7211daf689b6046c87e566a0d670a27c62cf69 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Fri, 3 Jan 2025 22:47:25 +0000 Subject: [PATCH 12/12] fix doctest --- bigframes/operations/semantics.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 5777bf6c31..6a537db4f3 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -53,6 +53,7 @@ def agg( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -246,6 +247,7 @@ def cluster_by( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.TextEmbeddingGenerator() @@ -319,6 +321,7 @@ def filter(self, instruction: str, model, ground_with_google_search: bool 
= Fals >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -410,6 +413,7 @@ def map( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -510,6 +514,7 @@ def join( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -649,6 +654,7 @@ def search( >>> import bigframes >>> bigframes.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") @@ -749,6 +755,7 @@ def top_k( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") @@ -949,9 +956,8 @@ def sim_join( >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None - - >>> import bigframes - >>> bigframes.options.experiments.semantic_operators = True + >>> bpd.options.experiments.semantic_operators = True + >>> bpd.options.compute.semantic_ops_confirmation_threshold = 25 >>> import bigframes.ml.llm as llm >>> model = 
llm.TextEmbeddingGenerator(model_name="text-embedding-005")