diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 27cd8207b2..427e99583d 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -47,9 +47,11 @@ _EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT, ) +_TEXT_EMBEDDING_005_ENDPOINT = "text-embedding-005" _TEXT_EMBEDDING_004_ENDPOINT = "text-embedding-004" _TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT = "text-multilingual-embedding-002" _TEXT_EMBEDDING_ENDPOINTS = ( + _TEXT_EMBEDDING_005_ENDPOINT, _TEXT_EMBEDDING_004_ENDPOINT, _TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT, ) @@ -606,8 +608,8 @@ class TextEmbeddingGenerator(base.BaseEstimator): Args: model_name (str, Default to "text-embedding-004"): - The model for text embedding. Possible values are "text-embedding-004" or "text-multilingual-embedding-002". - text-embedding models returns model embeddings for text inputs. + The model for text embedding. Possible values are "text-embedding-005", "text-embedding-004" + or "text-multilingual-embedding-002". text-embedding models return model embeddings for text inputs. text-multilingual-embedding models returns model embeddings for text inputs which support over 100 languages. Default to "text-embedding-004". 
session (bigframes.Session or None): @@ -621,7 +623,9 @@ def __init__( self, *, model_name: Literal[ - "text-embedding-004", "text-multilingual-embedding-002" + "text-embedding-005", + "text-embedding-004", + "text-multilingual-embedding-002", ] = "text-embedding-004", session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 0ebf65b893..6a14fb3451 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -71,6 +71,7 @@ llm._CLAUDE_3_SONNET_ENDPOINT: llm.Claude3TextGenerator, llm._CLAUDE_3_5_SONNET_ENDPOINT: llm.Claude3TextGenerator, llm._CLAUDE_3_OPUS_ENDPOINT: llm.Claude3TextGenerator, + llm._TEXT_EMBEDDING_005_ENDPOINT: llm.TextEmbeddingGenerator, llm._TEXT_EMBEDDING_004_ENDPOINT: llm.TextEmbeddingGenerator, llm._TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT: llm.TextEmbeddingGenerator, } diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 46ed5b8b3e..c605f30765 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -647,12 +647,12 @@ def search( >>> bigframes.options.experiments.semantic_operators = True >>> import bigframes.ml.llm as llm - >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-004") + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") >>> df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "frog", "chimpanzee"]}) >>> df.semantics.search("creatures", "monkey", top_k=1, model=model, score_column='distance') creatures distance - 3 chimpanzee 0.781101 + 3 chimpanzee 0.635844 [1 rows x 2 columns] @@ -945,7 +945,7 @@ def sim_join( >>> bigframes.options.experiments.semantic_operators = True >>> import bigframes.ml.llm as llm - >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-004") + >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005") >>> df1 = bpd.DataFrame({'animal': ['monkey', 'spider']}) >>> df2 = 
bpd.DataFrame({'animal': ['scorpion', 'baboon']}) diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb index 831fc2ab9b..6739c299d2 100644 --- a/notebooks/experimental/semantic_operators.ipynb +++ b/notebooks/experimental/semantic_operators.ipynb @@ -151,7 +151,7 @@ "source": [ "import bigframes.ml.llm as llm\n", "gemini_model = llm.GeminiTextGenerator(model_name=llm._GEMINI_1P5_FLASH_001_ENDPOINT)\n", - "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-004\")" + "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")" ] }, { diff --git a/owlbot.py b/owlbot.py index b7286565c9..1ede2e0453 100644 --- a/owlbot.py +++ b/owlbot.py @@ -104,7 +104,7 @@ # Use a custom table of contents since the default one isn't organized well # enough for the number of classes we have. -assert 1 == s.replace( # publish-docs.sh +assert 1 == s.replace( # publish-docs.sh [".kokoro/publish-docs.sh"], ( re.escape("# upload docs") @@ -122,14 +122,14 @@ ) # Fixup the documentation. 
-assert 1 == s.replace( # docs/conf.py +assert 1 == s.replace( # docs/conf.py ["docs/conf.py"], re.escape("Google Cloud Client Libraries for bigframes"), "BigQuery DataFrames provides DataFrame APIs on the BigQuery engine", ) # Don't omit `*/core/*.py` when counting test coverages -assert 1 == s.replace( # .coveragerc +assert 1 == s.replace( # .coveragerc [".coveragerc"], re.escape(" */core/*.py\n"), "", diff --git a/tests/system/large/operations/conftest.py b/tests/system/large/operations/conftest.py index 7ab3811f10..4f6e2d1704 100644 --- a/tests/system/large/operations/conftest.py +++ b/tests/system/large/operations/conftest.py @@ -29,5 +29,5 @@ def gemini_flash_model(session, bq_connection) -> llm.GeminiTextGenerator: @pytest.fixture(scope="session") def text_embedding_generator(session, bq_connection) -> llm.TextEmbeddingGenerator: return llm.TextEmbeddingGenerator( - session=session, connection_name=bq_connection, model_name="text-embedding-004" + session=session, connection_name=bq_connection, model_name="text-embedding-005" ) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 40862b3086..4bc1bd63be 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -196,7 +196,7 @@ def test_text_generator_predict_with_params_success( @pytest.mark.parametrize( "model_name", - ("text-embedding-004", "text-multilingual-embedding-002"), + ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), ) def test_create_load_text_embedding_generator_model( dataset_id, model_name, session, bq_connection @@ -218,7 +218,7 @@ def test_create_load_text_embedding_generator_model( @pytest.mark.parametrize( "model_name", - ("text-embedding-004", "text-multilingual-embedding-002"), + ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), ) @pytest.mark.flaky(retries=2) def test_text_embedding_generator_predict_default_params_success( @@ -236,7 +236,7 @@ def 
test_text_embedding_generator_predict_default_params_success( @pytest.mark.parametrize( "model_name", - ("text-embedding-004", "text-multilingual-embedding-002"), + ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"), ) @pytest.mark.flaky(retries=2) def test_text_embedding_generator_multi_cols_predict_success(