From 9182de16923a21e6201f3ad8e3cdde341fe01f25 Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Sat, 1 Jun 2024 01:15:33 +0000
Subject: [PATCH 1/3] feat: support score in GeminiTextGenerator

---
 bigframes/ml/llm.py           | 62 +++++++++++++++++++++++++++++++++++
 tests/system/load/test_llm.py | 41 +++++++++++++++++++++++
 2 files changed, 103 insertions(+)

diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index 7fa0e236eb..8e29682761 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -732,6 +732,68 @@ def predict(
 
         return df
 
+    def score(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        y: Union[bpd.DataFrame, bpd.Series],
+        task_type: Literal[
+            "text_generation", "classification", "summarization", "question_answering"
+        ] = "text_generation",
+    ) -> bpd.DataFrame:
+        """Calculate evaluation metrics of the model.
+
+        .. note::
+
+            This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the
+            Service Specific Terms (https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is"
+            and might have limited support. For more information, see the launch stage descriptions
+            (https://cloud.google.com/products#product-launch-stages).
+
+        .. note::
+
+            Output matches that of the BigQuery ML.EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#remote-model-llm
+            for the outputs relevant to this model type.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation data, which contains only one column of input_text
+                that contains the prompt text to use when evaluating the model.
+            y (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation labels, which contains only one column of output_text
+                that you would expect to be returned by the model.
+            task_type (str):
+                The type of the task for LLM model. Defaults to "text_generation".
+                Possible values: "text_generation", "classification", "summarization", and "question_answering".
+
+        Returns:
+            bigframes.dataframe.DataFrame: The DataFrame as evaluation result.
+        """
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before score")
+
+        if self._bqml_model.model_name.startswith("gemini-1.5"):
+            raise NotImplementedError(
+                "Score is not supported for gemini-1.5 model. Please use gemini-pro-1.0 model instead."
+            )
+
+        X, y = utils.convert_to_dataframe(X, y)
+
+        if len(X.columns) != 1 or len(y.columns) != 1:
+            raise ValueError(
+                f"Only support one column as input for X and y. {constants.FEEDBACK_LINK}"
+            )
+
+        # BQML identified the column by name
+        X_col_label = cast(blocks.Label, X.columns[0])
+        y_col_label = cast(blocks.Label, y.columns[0])
+        X = X.rename(columns={X_col_label: "input_text"})
+        y = y.rename(columns={y_col_label: "output_text"})
+
+        input_data = X.join(y, how="outer")
+
+        return self._bqml_model.llm_evaluate(input_data, task_type)
+
     def to_gbq(self, model_name: str, replace: bool = False) -> GeminiTextGenerator:
         """Save the model to BigQuery.
 
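For reference, this is roughly how the new score() API is exercised -- a minimal usage sketch, not part of the patch, assuming an authenticated BigQuery DataFrames session and small illustrative prompt/label data:

    import bigframes.pandas as bpd
    from bigframes.ml import llm

    # Illustrative evaluation data: one prompt column and one expected-output column.
    eval_df = bpd.DataFrame(
        {
            "prompt": [
                "Do sentiment analysis on the following text. Text: i feel happy",
                "Do sentiment analysis on the following text. Text: i feel devastated",
            ],
            "label": ["1", "0"],
        }
    )

    model = llm.GeminiTextGenerator(model_name="gemini-pro")

    # Column names are arbitrary: score() renames them to input_text/output_text
    # and joins them before running the BQML evaluation on the remote model.
    metrics = model.score(
        X=eval_df[["prompt"]],
        y=eval_df[["label"]],
        task_type="classification",
    )
    print(metrics.to_pandas())
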
diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py
index fd13662275..527356d5e6 100644
--- a/tests/system/load/test_llm.py
+++ b/tests/system/load/test_llm.py
@@ -112,3 +112,44 @@ def test_llm_palm_score_params(llm_fine_tune_df_default_index):
         "evaluation_status",
     ]
     assert all(col in score_result_col for col in expected_col)
+
+
+@pytest.mark.flaky(retries=2)
+def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
+    model = bigframes.ml.llm.GeminiTextGenerator(model_name="gemini-pro")
+
+    # Check score to ensure the model was fitted
+    score_result = model.score(
+        X=llm_fine_tune_df_default_index[["prompt"]],
+        y=llm_fine_tune_df_default_index[["label"]],
+    ).to_pandas()
+    score_result_col = score_result.columns.to_list()
+    expected_col = [
+        "bleu4_score",
+        "rouge-l_precision",
+        "rouge-l_recall",
+        "rouge-l_f1_score",
+        "evaluation_status",
+    ]
+    assert all(col in score_result_col for col in expected_col)
+
+
+@pytest.mark.flaky(retries=2)
+def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
+    model = bigframes.ml.llm.GeminiTextGenerator(model_name="gemini-pro")
+
+    # Check score to ensure the model was fitted
+    score_result = model.score(
+        X=llm_fine_tune_df_default_index["prompt"],
+        y=llm_fine_tune_df_default_index["label"],
+        task_type="classification",
+    ).to_pandas()
+    score_result_col = score_result.columns.to_list()
+    expected_col = [
+        "precision",
+        "recall",
+        "f1_score",
+        "label",
+        "evaluation_status",
+    ]
+    assert all(col in score_result_col for col in expected_col)

From a68312d0473f19b37a9a2b2ad6dab81aa50f55d9 Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Tue, 4 Jun 2024 16:58:05 +0000
Subject: [PATCH 2/3] address comments

---
 bigframes/ml/llm.py           |  7 +++---
 tests/system/load/test_llm.py | 43 +++++++++++++++++++----------------
 2 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index 8e29682761..c94a1184e7 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -740,7 +740,7 @@ def score(
             "text_generation", "classification", "summarization", "question_answering"
         ] = "text_generation",
     ) -> bpd.DataFrame:
-        """Calculate evaluation metrics of the model.
+        """Calculate evaluation metrics of the model. Only "gemini-pro" model is supported for now.
 
         .. note::
 
@@ -772,10 +772,9 @@ def score(
         if not self._bqml_model:
             raise RuntimeError("A model must be fitted before score")
 
+        # TODO(ashleyxu): Support gemini-1.5 when the rollout is ready. b/344891364.
         if self._bqml_model.model_name.startswith("gemini-1.5"):
-            raise NotImplementedError(
-                "Score is not supported for gemini-1.5 model. Please use gemini-pro-1.0 model instead."
-            )
+            raise NotImplementedError("Score is not supported for gemini-1.5 model.")
 
         X, y = utils.convert_to_dataframe(X, y)
 
diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py
index 527356d5e6..9aefc8e722 100644
--- a/tests/system/load/test_llm.py
+++ b/tests/system/load/test_llm.py
@@ -16,6 +16,7 @@
 import pytest
 
 import bigframes.ml.llm
+from tests.system import utils
 
 
 @pytest.fixture(scope="session")
@@ -114,7 +115,6 @@ def test_llm_palm_score_params(llm_fine_tune_df_default_index):
     assert all(col in score_result_col for col in expected_col)
 
 
-@pytest.mark.flaky(retries=2)
 def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
     model = bigframes.ml.llm.GeminiTextGenerator(model_name="gemini-pro")
 
@@ -123,18 +123,19 @@ def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
         X=llm_fine_tune_df_default_index[["prompt"]],
         y=llm_fine_tune_df_default_index[["label"]],
     ).to_pandas()
-    score_result_col = score_result.columns.to_list()
-    expected_col = [
-        "bleu4_score",
-        "rouge-l_precision",
-        "rouge-l_recall",
-        "rouge-l_f1_score",
-        "evaluation_status",
-    ]
-    assert all(col in score_result_col for col in expected_col)
+    utils.check_pandas_df_schema_and_index(
+        score_result,
+        columns=[
+            "bleu4_score",
+            "rouge-l_precision",
+            "rouge-l_recall",
+            "rouge-l_f1_score",
+            "evaluation_status",
+        ],
+        index=1,
+    )
 
 
-@pytest.mark.flaky(retries=2)
 def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
     model = bigframes.ml.llm.GeminiTextGenerator(model_name="gemini-pro")
 
@@ -144,12 +145,14 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
         y=llm_fine_tune_df_default_index["label"],
         task_type="classification",
     ).to_pandas()
-    score_result_col = score_result.columns.to_list()
-    expected_col = [
-        "precision",
-        "recall",
-        "f1_score",
-        "label",
-        "evaluation_status",
-    ]
-    assert all(col in score_result_col for col in expected_col)
+    utils.check_pandas_df_schema_and_index(
+        score_result,
+        columns=[
+            "precision",
+            "recall",
+            "f1_score",
+            "label",
+            "evaluation_status",
+        ],
+        index=1,
+    )

From 1b96f6f3f45bfa0f513c83730dc5155491ac9996 Mon Sep 17 00:00:00 2001
From: Ashley Xu
Date: Tue, 4 Jun 2024 19:11:36 +0000
Subject: [PATCH 3/3] reorganize the tests

---
 tests/system/load/test_llm.py     | 44 -----------------------------
 tests/system/small/ml/conftest.py | 12 ++++++++
 tests/system/small/ml/test_llm.py | 46 +++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 44 deletions(-)

diff --git a/tests/system/load/test_llm.py b/tests/system/load/test_llm.py
index 9aefc8e722..fd13662275 100644
--- a/tests/system/load/test_llm.py
+++ b/tests/system/load/test_llm.py
@@ -16,7 +16,6 @@
 import pytest
 
 import bigframes.ml.llm
-from tests.system import utils
 
 
 @pytest.fixture(scope="session")
@@ -113,46 +112,3 @@ def test_llm_palm_score_params(llm_fine_tune_df_default_index):
         "evaluation_status",
     ]
     assert all(col in score_result_col for col in expected_col)
-
-
-def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
-    model = bigframes.ml.llm.GeminiTextGenerator(model_name="gemini-pro")
-
-    # Check score to ensure the model was fitted
-    score_result = model.score(
-        X=llm_fine_tune_df_default_index[["prompt"]],
-        y=llm_fine_tune_df_default_index[["label"]],
-    ).to_pandas()
-    utils.check_pandas_df_schema_and_index(
-        score_result,
-        columns=[
-            "bleu4_score",
-            "rouge-l_precision",
-            "rouge-l_recall",
-            "rouge-l_f1_score",
-            "evaluation_status",
-        ],
-        index=1,
-    )
-
-
-def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
-    model = bigframes.ml.llm.GeminiTextGenerator(model_name="gemini-pro")
-
-    # Check score to ensure the model was fitted
-    score_result = model.score(
-        X=llm_fine_tune_df_default_index["prompt"],
-        y=llm_fine_tune_df_default_index["label"],
-        task_type="classification",
-    ).to_pandas()
-    utils.check_pandas_df_schema_and_index(
-        score_result,
-        columns=[
-            "precision",
-            "recall",
-            "f1_score",
-            "label",
-            "evaluation_status",
-        ],
-        index=1,
-    )
diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py
index ee96646687..ab75053b0e 100644
--- a/tests/system/small/ml/conftest.py
+++ b/tests/system/small/ml/conftest.py
@@ -171,6 +171,18 @@ def llm_text_pandas_df():
     )
 
 
+@pytest.fixture(scope="session")
+def llm_fine_tune_df_default_index(
+    session: bigframes.Session,
+) -> bigframes.dataframe.DataFrame:
+    training_table_name = "llm_tuning.emotion_classification_train"
+    df = session.read_gbq(training_table_name)
+    prefix = "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: "
+    df["prompt"] = prefix + df["text"]
+    df["label"] = df["label"].astype("string")
+    return df
+
+
 @pytest.fixture(scope="session")
 def onnx_iris_pandas_df():
     """Data matching the iris dataset."""
diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py
index 20e8dd0c19..36d01e126f 100644
--- a/tests/system/small/ml/test_llm.py
+++ b/tests/system/small/ml/test_llm.py
@@ -15,6 +15,7 @@
 import pytest
 
 from bigframes.ml import llm
+from tests.system import utils
 
 
 def test_create_text_generator_model(
@@ -366,3 +367,48 @@ def test_gemini_text_generator_predict_with_params_success(
     assert "ml_generate_text_llm_result" in df.columns
     series = df["ml_generate_text_llm_result"]
     assert all(series.str.len() > 20)
+
+
+@pytest.mark.flaky(retries=2)
+def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
+    model = llm.GeminiTextGenerator(model_name="gemini-pro")
+
+    # Check score to ensure the model was fitted
+    score_result = model.score(
+        X=llm_fine_tune_df_default_index[["prompt"]],
+        y=llm_fine_tune_df_default_index[["label"]],
+    ).to_pandas()
+    utils.check_pandas_df_schema_and_index(
+        score_result,
+        columns=[
+            "bleu4_score",
+            "rouge-l_precision",
+            "rouge-l_recall",
+            "rouge-l_f1_score",
+            "evaluation_status",
+        ],
+        index=1,
+    )
+
+
+@pytest.mark.flaky(retries=2)
+def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
+    model = llm.GeminiTextGenerator(model_name="gemini-pro")
+
+    # Check score to ensure the model was fitted
+    score_result = model.score(
+        X=llm_fine_tune_df_default_index["prompt"],
+        y=llm_fine_tune_df_default_index["label"],
+        task_type="classification",
+    ).to_pandas()
+    utils.check_pandas_df_schema_and_index(
+        score_result,
+        columns=[
+            "precision",
+            "recall",
+            "f1_score",
+            "label",
+            "evaluation_status",
+        ],
+        index=6,
+    )
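
The reorganized tests rely on the shared check_pandas_df_schema_and_index helper from tests/system/utils.py instead of hand-rolled column assertions. Roughly, it is assumed to verify the expected column list and, when given an integer, the expected row count: one aggregate metrics row for text generation (index=1), and presumably one row per label of the six emotion classes for classification (index=6). A simplified sketch under those assumptions -- the real helper may check more:

    import pandas as pd

    def check_schema_and_row_count(pd_df: pd.DataFrame, columns, index) -> None:
        # Hypothetical stand-in for tests.system.utils.check_pandas_df_schema_and_index.
        assert list(pd_df.columns) == list(columns)  # expected columns, in order
        assert len(pd_df.index) == index  # integer `index` read as expected row count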