feat: Update llm.TextEmbeddingGenerator to 005 #1186


Merged: 7 commits merged on Dec 6, 2024
Changes from all commits
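For quick orientation, here is a minimal usage sketch of the updated embedding generator, assembled from the doctest changes in this PR. It is a hedged illustration rather than code taken from the PR, and it assumes a BigQuery DataFrames session and connection are already configured.

import bigframes
import bigframes.pandas as bpd
import bigframes.ml.llm as llm

# "text-embedding-005" is the endpoint added by this PR; "text-embedding-004"
# and "text-multilingual-embedding-002" remain supported.
model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")

# Semantic search over a string column, mirroring the doctest in
# bigframes/operations/semantics.py further down in this diff.
bigframes.options.experiments.semantic_operators = True
df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "frog", "chimpanzee"]})
result = df.semantics.search(
    "creatures", "monkey", top_k=1, model=model, score_column="distance"
)
print(result)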
bigframes/ml/llm.py (10 changes: 7 additions & 3 deletions)
@@ -47,9 +47,11 @@
_EMBEDDING_GENERATOR_GECKO_MULTILINGUAL_ENDPOINT,
)

+ _TEXT_EMBEDDING_005_ENDPOINT = "text-embedding-005"
_TEXT_EMBEDDING_004_ENDPOINT = "text-embedding-004"
_TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT = "text-multilingual-embedding-002"
_TEXT_EMBEDDING_ENDPOINTS = (
+ _TEXT_EMBEDDING_005_ENDPOINT,
_TEXT_EMBEDDING_004_ENDPOINT,
_TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT,
)
@@ -606,8 +608,8 @@ class TextEmbeddingGenerator(base.BaseEstimator):

Args:
model_name (str, Default to "text-embedding-004"):
- The model for text embedding. Possible values are "text-embedding-004" or "text-multilingual-embedding-002".
- text-embedding models returns model embeddings for text inputs.
+ The model for text embedding. Possible values are "text-embedding-005", "text-embedding-004"
+ or "text-multilingual-embedding-002". text-embedding models returns model embeddings for text inputs.
text-multilingual-embedding models returns model embeddings for text inputs which support over 100 languages.
Default to "text-embedding-004".
session (bigframes.Session or None):
@@ -621,7 +623,9 @@ def __init__(
self,
*,
model_name: Literal[
- "text-embedding-004", "text-multilingual-embedding-002"
+ "text-embedding-005",
+ "text-embedding-004",
+ "text-multilingual-embedding-002",
] = "text-embedding-004",
session: Optional[bigframes.Session] = None,
connection_name: Optional[str] = None,
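The widened Literal above only changes which model names the constructor accepts. As a hedged, standalone sketch of constructing the model with an explicit connection, mirroring the test fixture later in this diff (the connection name below is a placeholder, not taken from the PR):

import bigframes.ml.llm as llm

# Per the docstring above, session may be None, in which case bigframes is
# expected to fall back to its default session. The connection name follows
# the usual "<project>.<location>.<connection_id>" form and is illustrative.
model = llm.TextEmbeddingGenerator(
    model_name="text-embedding-005",
    connection_name="my-project.us.my-connection",
)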
bigframes/ml/loader.py (1 change: 1 addition & 0 deletions)
@@ -71,6 +71,7 @@
llm._CLAUDE_3_SONNET_ENDPOINT: llm.Claude3TextGenerator,
llm._CLAUDE_3_5_SONNET_ENDPOINT: llm.Claude3TextGenerator,
llm._CLAUDE_3_OPUS_ENDPOINT: llm.Claude3TextGenerator,
+ llm._TEXT_EMBEDDING_005_ENDPOINT: llm.TextEmbeddingGenerator,
llm._TEXT_EMBEDDING_004_ENDPOINT: llm.TextEmbeddingGenerator,
llm._TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT: llm.TextEmbeddingGenerator,
}
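The entry above registers the new endpoint in the loader's endpoint-to-class mapping, so a text-embedding-005 model read back from BigQuery resolves to TextEmbeddingGenerator. Below is a simplified, hypothetical illustration of that dispatch pattern; the names _ENDPOINT_TO_CLASS and wrapper_for are invented for this sketch and are not the actual bigframes loader code.

from typing import Dict, Type

class TextEmbeddingGenerator:
    """Stand-in for bigframes.ml.llm.TextEmbeddingGenerator (hypothetical)."""

# Hypothetical stand-in for the mapping in bigframes/ml/loader.py, limited to
# the endpoint values visible in the llm.py diff above.
_ENDPOINT_TO_CLASS: Dict[str, Type] = {
    "text-embedding-005": TextEmbeddingGenerator,
    "text-embedding-004": TextEmbeddingGenerator,
    "text-multilingual-embedding-002": TextEmbeddingGenerator,
}

def wrapper_for(endpoint: str) -> Type:
    """Pick the wrapper class registered for a model endpoint."""
    try:
        return _ENDPOINT_TO_CLASS[endpoint]
    except KeyError:
        raise ValueError(f"Unsupported model endpoint: {endpoint}")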
bigframes/operations/semantics.py (6 changes: 3 additions & 3 deletions)
@@ -647,12 +647,12 @@ def search(
>>> bigframes.options.experiments.semantic_operators = True

>>> import bigframes.ml.llm as llm
- >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-004")
+ >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")
Reviewer comment (Contributor) on the line above: "docs tests fail on semantic. These changes are unnecessary. Maybe just revert it."


>>> df = bpd.DataFrame({"creatures": ["salmon", "sea urchin", "frog", "chimpanzee"]})
>>> df.semantics.search("creatures", "monkey", top_k=1, model=model, score_column='distance')
creatures distance
- 3 chimpanzee 0.781101
+ 3 chimpanzee 0.635844
<BLANKLINE>
[1 rows x 2 columns]

@@ -945,7 +945,7 @@ def sim_join(
>>> bigframes.options.experiments.semantic_operators = True

>>> import bigframes.ml.llm as llm
- >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-004")
+ >>> model = llm.TextEmbeddingGenerator(model_name="text-embedding-005")

>>> df1 = bpd.DataFrame({'animal': ['monkey', 'spider']})
>>> df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']})
notebooks/experimental/semantic_operators.ipynb (2 changes: 1 addition & 1 deletion)
@@ -151,7 +151,7 @@
"source": [
"import bigframes.ml.llm as llm\n",
"gemini_model = llm.GeminiTextGenerator(model_name=llm._GEMINI_1P5_FLASH_001_ENDPOINT)\n",
- "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-004\")"
+ "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")"
]
},
{
owlbot.py (6 changes: 3 additions & 3 deletions)
@@ -104,7 +104,7 @@

# Use a custom table of contents since the default one isn't organized well
# enough for the number of classes we have.
- assert 1 == s.replace( # publish-docs.sh
+ assert 1 == s.replace( # publish-docs.sh
[".kokoro/publish-docs.sh"],
(
re.escape("# upload docs")
@@ -122,14 +122,14 @@
)

# Fixup the documentation.
- assert 1 == s.replace( # docs/conf.py
+ assert 1 == s.replace( # docs/conf.py
["docs/conf.py"],
re.escape("Google Cloud Client Libraries for bigframes"),
"BigQuery DataFrames provides DataFrame APIs on the BigQuery engine",
)

# Don't omit `*/core/*.py` when counting test coverages
- assert 1 == s.replace( # .coveragerc
+ assert 1 == s.replace( # .coveragerc
[".coveragerc"],
re.escape(" */core/*.py\n"),
"",
tests/system/large/operations/conftest.py (2 changes: 1 addition & 1 deletion)
@@ -29,5 +29,5 @@ def gemini_flash_model(session, bq_connection) -> llm.GeminiTextGenerator:
@pytest.fixture(scope="session")
def text_embedding_generator(session, bq_connection) -> llm.TextEmbeddingGenerator:
return llm.TextEmbeddingGenerator(
- session=session, connection_name=bq_connection, model_name="text-embedding-004"
+ session=session, connection_name=bq_connection, model_name="text-embedding-005"
)
tests/system/small/ml/test_llm.py (6 changes: 3 additions & 3 deletions)
@@ -196,7 +196,7 @@ def test_text_generator_predict_with_params_success(

@pytest.mark.parametrize(
"model_name",
- ("text-embedding-004", "text-multilingual-embedding-002"),
+ ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"),
)
def test_create_load_text_embedding_generator_model(
dataset_id, model_name, session, bq_connection
@@ -218,7 +218,7 @@ def test_create_load_text_embedding_generator_model(

@pytest.mark.parametrize(
"model_name",
- ("text-embedding-004", "text-multilingual-embedding-002"),
+ ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"),
)
@pytest.mark.flaky(retries=2)
def test_text_embedding_generator_predict_default_params_success(
@@ -236,7 +236,7 @@ def test_text_embedding_generator_predict_default_params_success(

@pytest.mark.parametrize(
"model_name",
- ("text-embedding-004", "text-multilingual-embedding-002"),
+ ("text-embedding-005", "text-embedding-004", "text-multilingual-embedding-002"),
)
@pytest.mark.flaky(retries=2)
def test_text_embedding_generator_multi_cols_predict_success(