From 9182de16923a21e6201f3ad8e3cdde341fe01f25 Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Sat, 1 Jun 2024 01:15:33 +0000
Subject: [PATCH 1/3] feat: support score in GeminiTextGenerator

---
 bigframes/ml/llm.py           | 62 +++++++++++++++++++++++++++++++++++
 tests/system/load/test_llm.py | 41 +++++++++++++++++++++++
 2 files changed, 103 insertions(+)

diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index 7fa0e236eb..8e29682761 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -732,6 +732,68 @@ def predict(
 
         return df
 
+    def score(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        y: Union[bpd.DataFrame, bpd.Series],
+        task_type: Literal[
+            "text_generation", "classification", "summarization", "question_answering"
+        ] = "text_generation",
+    ) -> bpd.DataFrame:
+        """Calculate evaluation metrics of the model.
+
+        .. note::
+
+            This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the
+            Service Specific Terms (https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is"
+            and might have limited support. For more information, see the launch stage descriptions
+            (https://cloud.google.com/products#product-launch-stages).
+
+        .. note::
+
+            Output matches that of the BigQuery ML.EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#remote-model-llm
+            for the outputs relevant to this model type.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation data, which contains only one column of input_text
+                that contains the prompt text to use when evaluating the model.
+            y (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation labels, which contains only one column of output_text
+                that you would expect to be returned by the model.
+            task_type (str):
+                The type of the task for LLM model. Defaults to "text_generation".
+                Possible values: "text_generation", "classification", "summarization", and "question_answering".
+
+        Returns:
+            bigframes.dataframe.DataFrame: The DataFrame as evaluation result.
+        """
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before score")
+
+        if self._bqml_model.model_name.startswith("gemini-1.5"):
+            raise NotImplementedError(
+                "Score is not supported for gemini-1.5 model. Please use gemini-pro-1.0 model instead."
+            )
+
+        X, y = utils.convert_to_dataframe(X, y)
+
+        if len(X.columns) != 1 or len(y.columns) != 1:
+            raise ValueError(
+                f"Only support one column as input for X and y. {constants.FEEDBACK_LINK}"
+            )
+
+        # BQML identified the column by name
+        X_col_label = cast(blocks.Label, X.columns[0])
+        y_col_label = cast(blocks.Label, y.columns[0])
+        X = X.rename(columns={X_col_label: "input_text"})
+        y = y.rename(columns={y_col_label: "output_text"})
+
+        input_data = X.join(y, how="outer")
+
+        return self._bqml_model.llm_evaluate(input_data, task_type)
+
     def to_gbq(self, model_name: str, replace: bool = False) -> GeminiTextGenerator:
         """Save the model to BigQuery.
 
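For reference, this is roughly how the new score() API is exercised -- a minimal usage sketch, not part of the patch, assuming an authenticated BigQuery DataFrames session and small illustrative prompt/label data:

    import bigframes.pandas as bpd
    from bigframes.ml import llm

    # Illustrative evaluation data: one prompt column and one expected-output column.
    eval_df = bpd.DataFrame(
        {
            "prompt": [
                "Do sentiment analysis on the following text. Text: i feel happy",
                "Do sentiment analysis on the following text. Text: i feel devastated",
            ],
            "label": ["1", "0"],
        }
    )

    model = llm.GeminiTextGenerator(model_name="gemini-pro")

    # Column names are arbitrary: score() renames them to input_text/output_text
    # and joins them before running the BQML evaluation on the remote model.
    metrics = model.score(
        X=eval_df[["prompt"]],
        y=eval_df[["label"]],
        task_type="classification",
    )
    print(metrics.to_pandas())
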
diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py
index fd13662275..527356d5e6 100644
--- a/tests/system/load/test_llm.py
+++ b/tests/system/load/test_llm.py
@@ -112,3 +112,44 @@ def test_llm_palm_score_params(llm_fine_tune_df_default_index):
         "evaluation_status",
     ]
     assert all(col in score_result_col for col in expected_col)
+
+
+@pytest.mark.flaky(retries=2)
+def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
+    model = bigframes.ml.llm.GeminiTextGenerator(model_name="gemini-pro")
+
+    # Check score to ensure the model was fitted
+    score_result = model.score(
+        X=llm_fine_tune_df_default_index[["prompt"]],
+        y=llm_fine_tune_df_default_index[["label"]],
+    ).to_pandas()
+    score_result_col = score_result.columns.to_list()
+    expected_col = [
+        "bleu4_score",
+        "rouge-l_precision",
+        "rouge-l_recall",
+        "rouge-l_f1_score",
+        "evaluation_status",
+    ]
+    assert all(col in score_result_col for col in expected_col)
+
+
+@pytest.mark.flaky(retries=2)
+def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
+    model = bigframes.ml.llm.GeminiTextGenerator(model_name="gemini-pro")
+
+    # Check score to ensure the model was fitted
+    score_result = model.score(
+        X=llm_fine_tune_df_default_index["prompt"],
+        y=llm_fine_tune_df_default_index["label"],
+        task_type="classification",
+    ).to_pandas()
+    score_result_col = score_result.columns.to_list()
+    expected_col = [
+        "precision",
+        "recall",
+        "f1_score",
+        "label",
+        "evaluation_status",
+    ]
+    assert all(col in score_result_col for col in expected_col)

From a68312d0473f19b37a9a2b2ad6dab81aa50f55d9 Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Tue, 4 Jun 2024 16:58:05 +0000
Subject: [PATCH 2/3] address comments

---
 bigframes/ml/llm.py           |  7 +++---
 tests/system/load/test_llm.py | 43 +++++++++++++++++++----------------
 2 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index 8e29682761..c94a1184e7 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -740,7 +740,7 @@ def score(
             "text_generation", "classification", "summarization", "question_answering"
         ] = "text_generation",
     ) -> bpd.DataFrame:
-        """Calculate evaluation metrics of the model.
+        """Calculate evaluation metrics of the model. Only "gemini-pro" model is supported for now.
 
         .. note::
 
@@ -772,10 +772,9 @@ def score(
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before score")
 
+        # TODO(ashleyxu): Support gemini-1.5 when the rollout is ready. b/344891364.
         if self._bqml_model.model_name.startswith("gemini-1.5"):
-            raise NotImplementedError(
-                "Score is not supported for gemini-1.5 model. Please use gemini-pro-1.0 model instead."
-            )
+            raise NotImplementedError("Score is not supported for gemini-1.5 model.")
 
         X, y = utils.convert_to_dataframe(X, y)
 
diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py
index 527356d5e6..9aefc8e722 100644
--- a/tests/system/load/test_llm.py
+++ b/tests/system/load/test_llm.py
@@ -16,6 +16,7 @@
 import pytest
 
 import bigframes.ml.llm
+from tests.system import utils
 
 
 @pytest.fixture(scope="session")
@@ -114,7 +115,6 @@ def test_llm_palm_score_params(llm_fine_tune_df_default_index):
     assert all(col in score_result_col for col in expected_col)
 
 
-@pytest.mark.flaky(retries=2)
 def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
     model = bigframes.ml.llm.GeminiTextGenerator(model_name="gemini-pro")
 
@@ -123,18 +123,19 @@ def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
         X=llm_fine_tune_df_default_index[["prompt"]],
         y=llm_fine_tune_df_default_index[["label"]],
     ).to_pandas()
-    score_result_col = score_result.columns.to_list()
-    expected_col = [
-        "bleu4_score",
-        "rouge-l_precision",
-        "rouge-l_recall",
-        "rouge-l_f1_score",
-        "evaluation_status",
-    ]
-    assert all(col in score_result_col for col in expected_col)
+    utils.check_pandas_df_schema_and_index(
+        score_result,
+        columns=[
+            "bleu4_score",
+            "rouge-l_precision",
+            "rouge-l_recall",
+            "rouge-l_f1_score",
+            "evaluation_status",
+        ],
+        index=1,
+    )
 
 
-@pytest.mark.flaky(retries=2)
 def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
     model = bigframes.ml.llm.GeminiTextGenerator(model_name="gemini-pro")
 
@@ -144,12 +145,14 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
         y=llm_fine_tune_df_default_index["label"],
         task_type="classification",
     ).to_pandas()
-    score_result_col = score_result.columns.to_list()
-    expected_col = [
-        "precision",
-        "recall",
-        "f1_score",
-        "label",
-        "evaluation_status",
-    ]
-    assert all(col in score_result_col for col in expected_col)
+    utils.check_pandas_df_schema_and_index(
+        score_result,
+        columns=[
+            "precision",
+            "recall",
+            "f1_score",
+            "label",
+            "evaluation_status",
+        ],
+        index=1,
+    )

From 1b96f6f3f45bfa0f513c83730dc5155491ac9996 Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Tue, 4 Jun 2024 19:11:36 +0000
Subject: [PATCH 3/3] reorganize the tests

---
 tests/system/load/test_llm.py     | 44 -----------------------------
 tests/system/small/ml/conftest.py | 12 ++++++++
 tests/system/small/ml/test_llm.py | 46 +++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 44 deletions(-)

diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py
index 9aefc8e722..fd13662275 100644
--- a/tests/system/load/test_llm.py
+++ b/tests/system/load/test_llm.py
@@ -16,7 +16,6 @@
 import pytest
 
 import bigframes.ml.llm
-from tests.system import utils
 
 
 @pytest.fixture(scope="session")
@@ -113,46 +112,3 @@ def test_llm_palm_score_params(llm_fine_tune_df_default_index):
         "evaluation_status",
     ]
     assert all(col in score_result_col for col in expected_col)
-
-
-def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
-    model = bigframes.ml.llm.GeminiTextGenerator(model_name="gemini-pro")
-
-    # Check score to ensure the model was fitted
-    score_result = model.score(
-        X=llm_fine_tune_df_default_index[["prompt"]],
-        y=llm_fine_tune_df_default_index[["label"]],
-    ).to_pandas()
-    utils.check_pandas_df_schema_and_index(
-        score_result,
-        columns=[
-            "bleu4_score",
-            "rouge-l_precision",
-            "rouge-l_recall",
-            "rouge-l_f1_score",
-            "evaluation_status",
-        ],
-        index=1,
-    )
-
-
-def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
-    model = bigframes.ml.llm.GeminiTextGenerator(model_name="gemini-pro")
-
-    # Check score to ensure the model was fitted
-    score_result = model.score(
-        X=llm_fine_tune_df_default_index["prompt"],
-        y=llm_fine_tune_df_default_index["label"],
-        task_type="classification",
-    ).to_pandas()
-    utils.check_pandas_df_schema_and_index(
-        score_result,
-        columns=[
-            "precision",
-            "recall",
-            "f1_score",
-            "label",
-            "evaluation_status",
-        ],
-        index=1,
-    )
diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py
index ee96646687..ab75053b0e 100644
--- a/tests/system/small/ml/conftest.py
+++ b/tests/system/small/ml/conftest.py
@@ -171,6 +171,18 @@ def llm_text_pandas_df():
     )
 
 
+@pytest.fixture(scope="session")
+def llm_fine_tune_df_default_index(
+    session: bigframes.Session,
+) -> bigframes.dataframe.DataFrame:
+    training_table_name = "llm_tuning.emotion_classification_train"
+    df = session.read_gbq(training_table_name)
+    prefix = "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: "
+    df["prompt"] = prefix + df["text"]
+    df["label"] = df["label"].astype("string")
+    return df
+
+
 @pytest.fixture(scope="session")
 def onnx_iris_pandas_df():
     """Data matching the iris dataset."""
diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py
index 20e8dd0c19..36d01e126f 100644
--- a/tests/system/small/ml/test_llm.py
+++ b/tests/system/small/ml/test_llm.py
@@ -15,6 +15,7 @@
 import pytest
 
 from bigframes.ml import llm
+from tests.system import utils
 
 
 def test_create_text_generator_model(
@@ -366,3 +367,48 @@ def test_gemini_text_generator_predict_with_params_success(
     assert "ml_generate_text_llm_result" in df.columns
     series = df["ml_generate_text_llm_result"]
     assert all(series.str.len() > 20)
+
+
+@pytest.mark.flaky(retries=2)
+def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
+    model = llm.GeminiTextGenerator(model_name="gemini-pro")
+
+    # Check score to ensure the model was fitted
+    score_result = model.score(
+        X=llm_fine_tune_df_default_index[["prompt"]],
+        y=llm_fine_tune_df_default_index[["label"]],
+    ).to_pandas()
+    utils.check_pandas_df_schema_and_index(
+        score_result,
+        columns=[
+            "bleu4_score",
+            "rouge-l_precision",
+            "rouge-l_recall",
+            "rouge-l_f1_score",
+            "evaluation_status",
+        ],
+        index=1,
+    )
+
+
+@pytest.mark.flaky(retries=2)
+def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
+    model = llm.GeminiTextGenerator(model_name="gemini-pro")
+
+    # Check score to ensure the model was fitted
+    score_result = model.score(
+        X=llm_fine_tune_df_default_index["prompt"],
+        y=llm_fine_tune_df_default_index["label"],
+        task_type="classification",
+    ).to_pandas()
+    utils.check_pandas_df_schema_and_index(
+        score_result,
+        columns=[
+            "precision",
+            "recall",
+            "f1_score",
+            "label",
+            "evaluation_status",
+        ],
+        index=6,
+    )
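
The reorganized tests rely on the shared check_pandas_df_schema_and_index helper from tests/system/utils.py instead of hand-rolled column assertions. Roughly, it is assumed to verify the expected column list and, when given an integer, the expected row count: one aggregate metrics row for text generation (index=1), and presumably one row per label of the six emotion classes for classification (index=6). A simplified sketch under those assumptions -- the real helper may check more:

    import pandas as pd

    def check_schema_and_row_count(pd_df: pd.DataFrame, columns, index) -> None:
        # Hypothetical stand-in for tests.system.utils.check_pandas_df_schema_and_index.
        assert list(pd_df.columns) == list(columns)  # expected columns, in order
        assert len(pd_df.index) == index  # integer `index` read as expected row count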