From 407db5edcae2dc02c64bee56f5bcc70aa24dd861 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Thu, 16 May 2024 18:05:57 +0000 Subject: [PATCH 1/8] feat: supoort bigquery.vector_search() --- bigframes/bigquery/__init__.py | 150 +++++++++++++++++- bigframes/bigquery/utils.py | 85 ++++++++++ .../small/bigquery/test_vector_search.py | 136 ++++++++++++++++ tests/unit/bigquery/__init__.py | 13 ++ tests/unit/bigquery/test_utils.py | 149 +++++++++++++++++ 5 files changed, 525 insertions(+), 8 deletions(-) create mode 100644 bigframes/bigquery/utils.py create mode 100644 tests/system/small/bigquery/test_vector_search.py create mode 100644 tests/unit/bigquery/__init__.py create mode 100644 tests/unit/bigquery/test_utils.py diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 5808aa28bf..33e65bbe76 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -20,19 +20,17 @@ from __future__ import annotations -import typing +from typing import Literal, Optional, Union +import bigframes.bigquery.utils as utils import bigframes.constants as constants import bigframes.core.groupby as groupby import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +import bigframes.pandas as bpd -if typing.TYPE_CHECKING: - import bigframes.dataframe as dataframe - import bigframes.series as series - -def array_length(series: series.Series) -> series.Series: +def array_length(series: bpd.Series) -> bpd.Series: """Compute the length of each array element in the Series. **Examples:** @@ -69,7 +67,7 @@ def array_length(series: series.Series) -> series.Series: def array_agg( obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy, -) -> series.Series | dataframe.DataFrame: +) -> bpd.Series | bpd.DataFrame: """Group data and create arrays from selected columns, omitting NULLs to avoid BigQuery errors (NULLs not allowed in arrays). 
@@ -120,7 +118,7 @@ def array_agg( ) -def array_to_string(series: series.Series, delimiter: str) -> series.Series: +def array_to_string(series: bpd.Series, delimiter: str) -> bpd.Series: """Converts array elements within a Series into delimited strings. **Examples:** @@ -148,3 +146,139 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series: """ return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) + + +def vector_search( + base_table: str, + column_to_search: str, + query: Union[bpd.DataFrame, bpd.Series], + *, + query_column_to_search: Optional[str] = None, + top_k: Optional[int] = 10, + distance_type: Literal["euclidean", "cosine"] = "euclidean", + fraction_lists_to_search: Optional[float] = None, + use_brute_force: bool = False, +) -> bpd.DataFrame: + """ + Conduct vector search to earch embeddings to find semantically similar entities. + + **Examples:** + + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + DataFrame embeddings for which to find nearest neighbors: + + >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], + ... "embedding": [[1.0, 2.0], [3.0, 5.2]]}) + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... top_k=2) + query_id embedding id my_embedding distance + 1 cat [3. 5.2] 5 [5. 5.4] 2.009975 + 0 dog [1. 2.] 1 [1. 2.] 0.0 + 0 dog [1. 2.] 4 [1. 3.2] 1.2 + 1 cat [3. 5.2] 2 [2. 4.] 1.56205 + + [4 rows x 5 columns] + + Series embeddings for which to find nearest neighbors: + + >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]], + ... index=["dog", "cat"], + ... name="embedding") + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... top_k=2) + embedding id my_embedding distance + dog [1. 2.] 1 [1. 2.] 
0.0 + cat [3. 5.2] 5 [5. 5.4] 2.009975 + dog [1. 2.] 4 [1. 3.2] 1.2 + cat [3. 5.2] 2 [2. 4.] 1.56205 + + [4 rows x 4 columns] + + You can specify the name of the column in the query DataFrame embeddings and distance type: + + >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], + ... "embedding": [[1.0, 2.0], [3.0, 5.2]], + ... "another_embedding": [[0.7, 2.2], [3.3, 5.2]]}) + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... distance_type="cosine", + ... query_column_to_search="another_embedding", + ... top_k=2) + query_id embedding another_embedding id my_embedding distance + 1 cat [3. 5.2] [3.3 5.2] 2 [2. 4.] 0.005181 + 0 dog [1. 2.] [0.7 2.2] 4 [1. 3.2] 0.000013 + 1 cat [3. 5.2] [3.3 5.2] 1 [1. 2.] 0.005181 + 0 dog [1. 2.] [0.7 2.2] 3 [1.5 7. ] 0.004697 + + [4 rows x 6 columns] + + Args: + base_table (str): + The table to search for nearest neighbor embeddings. + column_to_search (groupby.SeriesGroupBy | groupby.DataFrameGroupBy): + The name of the base table column to search for nearest neighbor embeddings. + The column must have a type of ``ARRAY``. All elements in the array must be non-NULL. + query (bigframes.dataframe.DataFrame | bigframes.dataframe.Series): + A Series or DataFrame that provides the embeddings for which to find nearest neighbors. + query_column_to_search (str): + Specifies the name of the column in the query that contains the embeddings for which to + find nearest neighbors. The column must have a type of ``ARRAY``. All elements in + the array must be non-NULL and all values in the column must have the same array dimensions + as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame. + top_k (int, default 10): + Sepecifies the number of nearest neighbors to return. Default to 10. 
+ distance_type (str, defalt "euclidean"): + Specifies the type of metric to use to compute the distance between two vectors. + Possible values are "euclidean" and "cosine". Default to "euclidean". + fraction_lists_to_search (float, range in [0.0, 1.0]): + Specifies the percentage of lists to search. Specifying a higher percentage leads to + higher recall and slower performance, and the converse is true when specifying a lower + percentage. It is only used when a vector index is also used. You can only specify + ``fraction_lists_to_search`` when ``use_brute_force`` is set to False. + use_brute_force (bool, default False): + Determines whether to use brute force search by skipping the vector index if one is available. + Default to False. + + Returns: + bigframes.dataframe.DataFrame: A DataFrame containing vector search result. + """ + if not fraction_lists_to_search and use_brute_force is True: + raise ValueError( + "You can't specify fraction_lists_to_search when use_brute_force is set to True." + ) + if isinstance(query, bpd.Series) and query_column_to_search is not None: + raise ValueError( + "You can't specify query_column_to_search when query is a Series." + ) + ## (TODO: ashleyxu. Support options in vector search.) + if fraction_lists_to_search is not None or use_brute_force is True: + raise NotImplementedError( + f"fraction_lists_to_search and use_brute_force is not supported. 
{constants.FEEDBACK_LINK}" + ) + options = { + "base_table": base_table, + "column_to_search": column_to_search, + "query_column_to_search": query_column_to_search, + "distance_type": distance_type, + "top_k": top_k, + "fraction_lists_to_search": fraction_lists_to_search, + "use_brute_force": use_brute_force, + } + + df = utils.apply_sql( + query, + options, # type:ignore + ) + return df diff --git a/bigframes/bigquery/utils.py b/bigframes/bigquery/utils.py new file mode 100644 index 0000000000..b311d6e5df --- /dev/null +++ b/bigframes/bigquery/utils.py @@ -0,0 +1,85 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Iterable, Mapping, Union + +import bigframes.ml.utils as utils +import bigframes.pandas as bpd + + +def create_vector_search_sql( + sql_string: str, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, +) -> str: + """Encode the VECTOR SEARCH statement for BigQuery Vector Search.""" + + base_table = options["base_table"] + column_to_search = options["column_to_search"] + distance_type = options["distance_type"] + top_k = options["top_k"] + query_column_to_search = options.get("query_column_to_search", None) + + if query_column_to_search is not None: + query_str = f""" + SELECT + query.*, + base.*, + distance, + FROM VECTOR_SEARCH( + TABLE `{base_table}`, + "{column_to_search}", + ({sql_string}), + "{query_column_to_search}", + distance_type => "{distance_type}", + top_k => {top_k} + ) + """ + else: + query_str = f""" + SELECT + query.*, + base.*, + distance, + FROM VECTOR_SEARCH( + TABLE `{base_table}`, + "{column_to_search}", + ({sql_string}), + distance_type => "{distance_type}", + top_k => {top_k} + ) + """ + return query_str + + +def apply_sql( + query: Union[bpd.DataFrame, bpd.Series], + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, +) -> bpd.DataFrame: + """Helper to wrap a dataframe in a SQL query, keeping the index intact. + + Args: + query (bigframes.dataframe.DataFrame): + The dataframe to be wrapped. 
+ """ + (query,) = utils.convert_to_dataframe(query) + sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True) + + sql = create_vector_search_sql(sql_string=sql_string, options=options) + if index_col_ids is not None: + df = query._session.read_gbq(sql, index_col=index_col_ids) + else: + df = query._session.read_gbq(sql) + df.index.names = index_labels + + return df diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py new file mode 100644 index 0000000000..4280c0a888 --- /dev/null +++ b/tests/system/small/bigquery/test_vector_search.py @@ -0,0 +1,136 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd + +import bigframes.bigquery as bbq +import bigframes.pandas as bpd + + +def test_vector_search_basic_params_with_df(): + search_query = bpd.DataFrame( + { + "query_id": ["dog", "cat"], + "embedding": [[1.0, 2.0], [3.0, 5.2]], + } + ) + vector_search_result = bbq.vector_search( + base_table="bigframes-dev.bigframes_tests_sys.base_table", + column_to_search="my_embedding", + query=search_query, + top_k=2, + ).to_pandas() # type:ignore + expected = pd.DataFrame( + { + "query_id": ["cat", "dog", "dog", "cat"], + "embedding": [ + np.array([3.0, 5.2]), + np.array([1.0, 2.0]), + np.array([1.0, 2.0]), + np.array([3.0, 5.2]), + ], + "id": [5, 1, 4, 2], + "my_embedding": [ + np.array([5.0, 5.4]), + np.array([1.0, 2.0]), + np.array([1.0, 3.2]), + np.array([2.0, 4.0]), + ], + "distance": [2.009975, 0.0, 1.2, 1.56205], + }, + index=pd.Index([1, 0, 0, 1], dtype="Int64"), + ) + pd.testing.assert_frame_equal( + vector_search_result, expected, check_dtype=False, rtol=0.1 + ) + + +def test_vector_search_different_params_with_query(): + search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]]) + vector_search_result = bbq.vector_search( + base_table="bigframes-dev.bigframes_tests_sys.base_table", + column_to_search="my_embedding", + query=search_query, + distance_type="cosine", + top_k=2, + ).to_pandas() # type:ignore + expected = pd.DataFrame( + { + "0": [ + np.array([1.0, 2.0]), + np.array([1.0, 2.0]), + np.array([3.0, 5.2]), + np.array([3.0, 5.2]), + ], + "id": [2, 1, 1, 2], + "my_embedding": [ + np.array([2.0, 4.0]), + np.array([1.0, 2.0]), + np.array([1.0, 2.0]), + np.array([2.0, 4.0]), + ], + "distance": [0.0, 0.0, 0.001777, 0.001777], + }, + index=pd.Index([0, 0, 1, 1], dtype="Int64"), + ) + pd.testing.assert_frame_equal( + vector_search_result, expected, check_dtype=False, rtol=0.1 + ) + + +def test_vector_search_df_with_query_column_to_search(): + search_query = bpd.DataFrame( + { + "query_id": ["dog", "cat"], + "embedding": [[1.0, 2.0], 
[3.0, 5.2]], + "another_embedding": [[1.0, 2.5], [3.3, 5.2]], + } + ) + vector_search_result = bbq.vector_search( + base_table="bigframes-dev.bigframes_tests_sys.base_table", + column_to_search="my_embedding", + query=search_query, + query_column_to_search="another_embedding", + top_k=2, + ).to_pandas() # type:ignore + expected = pd.DataFrame( + { + "query_id": ["dog", "dog", "cat", "cat"], + "embedding": [ + np.array([1.0, 2.0]), + np.array([1.0, 2.0]), + np.array([3.0, 5.2]), + np.array([3.0, 5.2]), + ], + "another_embedding": [ + np.array([1.0, 2.5]), + np.array([1.0, 2.5]), + np.array([3.3, 5.2]), + np.array([3.3, 5.2]), + ], + "id": [1, 4, 2, 5], + "my_embedding": [ + np.array([1.0, 2.0]), + np.array([1.0, 3.2]), + np.array([2.0, 4.0]), + np.array([5.0, 5.4]), + ], + "distance": [0.5, 0.7, 1.769181, 1.711724], + }, + index=pd.Index([0, 0, 1, 1], dtype="Int64"), + ) + pd.testing.assert_frame_equal( + vector_search_result, expected, check_dtype=False, rtol=0.1 + ) diff --git a/tests/unit/bigquery/__init__.py b/tests/unit/bigquery/__init__.py new file mode 100644 index 0000000000..6d5e14bcf4 --- /dev/null +++ b/tests/unit/bigquery/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/unit/bigquery/test_utils.py b/tests/unit/bigquery/test_utils.py new file mode 100644 index 0000000000..56323d1b02 --- /dev/null +++ b/tests/unit/bigquery/test_utils.py @@ -0,0 +1,149 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +import bigframes.bigquery as bbq +import bigframes.pandas as bpd + + +def test_create_vector_search_sql_simple(): + sql_string = "SELECT embedding FROM my_embeddings_table WHERE id = 1" + options = { + "base_table": "my_base_table", + "column_to_search": "my_embedding_column", + "distance_type": "COSINE", + "top_k": 10, + "use_brute_force": False, + } + + expected_query = f""" + SELECT + query.*, + base.*, + distance, + FROM VECTOR_SEARCH( + TABLE `my_base_table`, + "my_embedding_column", + ({sql_string}), + distance_type => "COSINE", + top_k => 10 + ) + """ + + result_query = bbq.utils.create_vector_search_sql( + sql_string, options # type:ignore + ) + assert result_query == expected_query + + +def test_create_vector_search_sql_query_column_to_search(): + sql_string = "SELECT embedding FROM my_embeddings_table WHERE id = 1" + options = { + "base_table": "my_base_table", + "column_to_search": "my_embedding_column", + "distance_type": "COSINE", + "top_k": 10, + "query_column_to_search": "new_embedding_column", + "use_brute_force": False, + } + + expected_query = f""" + SELECT + query.*, + base.*, + distance, + FROM VECTOR_SEARCH( + TABLE 
`my_base_table`, + "my_embedding_column", + ({sql_string}), + "new_embedding_column", + distance_type => "COSINE", + top_k => 10 + ) + """ + + result_query = bbq.utils.create_vector_search_sql( + sql_string, options # type:ignore + ) + assert result_query == expected_query + + +def test_apply_sql_df_query(): + query = bpd.DataFrame( + { + "query_id": ["dog", "cat"], + "embedding": [[1.0, 2.0], [3.0, 5.2]], + } + ) + options = { + "base_table": "bigframes-dev.bigframes_tests_sys.base_table", + "column_to_search": "my_embedding", + "distance_type": "cosine", + "top_k": 2, + } + result = bbq.utils.apply_sql(query, options).to_pandas() # type:ignore + expected = pd.DataFrame( + { + "query_id": ["cat", "dog", "dog", "cat"], + "embedding": [ + np.array([3.0, 5.2]), + np.array([1.0, 2.0]), + np.array([1.0, 2.0]), + np.array([3.0, 5.2]), + ], + "id": [1, 2, 1, 2], + "my_embedding": [ + np.array([1.0, 2.0]), + np.array([2.0, 4.0]), + np.array([1.0, 2.0]), + np.array([2.0, 4.0]), + ], + "distance": [0.001777, 0.0, 0.0, 0.001777], + }, + index=pd.Index([1, 0, 0, 1], dtype="Int64"), + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1) + + +def test_apply_sql_series_query(): + query = bpd.Series([[1.0, 2.0], [3.0, 5.2]]) + options = { + "base_table": "bigframes-dev.bigframes_tests_sys.base_table", + "column_to_search": "my_embedding", + "distance_type": "euclidean", + "top_k": 2, + } + result = bbq.utils.apply_sql(query, options).to_pandas() # type:ignore + expected = pd.DataFrame( + { + "0": [ + np.array([3.0, 5.2]), + np.array([1.0, 2.0]), + np.array([3.0, 5.2]), + np.array([1.0, 2.0]), + ], + "id": [2, 4, 5, 1], + "my_embedding": [ + np.array([2.0, 4.0]), + np.array([1.0, 3.2]), + np.array([5.0, 5.4]), + np.array([1.0, 2.0]), + ], + "distance": [1.562049935181331, 1.2000000000000002, 2.009975124224178, 0.0], + }, + index=pd.Index([1, 0, 1, 0], dtype="Int64"), + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1) From 
86a26bd84b052b33b61bf6a8471d4662a540ba96 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 31 May 2024 16:36:48 +0000 Subject: [PATCH 2/8] minor fix --- bigframes/bigquery/__init__.py | 2 +- tests/system/small/bigquery/test_utils.py | 87 +++++++++++++++++++++++ tests/unit/bigquery/test_utils.py | 72 ------------------- 3 files changed, 88 insertions(+), 73 deletions(-) create mode 100644 tests/system/small/bigquery/test_utils.py diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 33e65bbe76..21c9c9edc4 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -178,7 +178,7 @@ def vector_search( ... column_to_search="my_embedding", ... query=search_query, ... top_k=2) - query_id embedding id my_embedding distance + query_id embedding id my_embedding distance 1 cat [3. 5.2] 5 [5. 5.4] 2.009975 0 dog [1. 2.] 1 [1. 2.] 0.0 0 dog [1. 2.] 4 [1. 3.2] 1.2 diff --git a/tests/system/small/bigquery/test_utils.py b/tests/system/small/bigquery/test_utils.py new file mode 100644 index 0000000000..535a645fe4 --- /dev/null +++ b/tests/system/small/bigquery/test_utils.py @@ -0,0 +1,87 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pandas as pd + +import bigframes.bigquery as bbq +import bigframes.pandas as bpd + + +def test_apply_sql_df_query(): + query = bpd.DataFrame( + { + "query_id": ["dog", "cat"], + "embedding": [[1.0, 2.0], [3.0, 5.2]], + } + ) + options = { + "base_table": "bigframes-dev.bigframes_tests_sys.base_table", + "column_to_search": "my_embedding", + "distance_type": "cosine", + "top_k": 2, + } + result = bbq.utils.apply_sql(query, options).to_pandas() # type:ignore + expected = pd.DataFrame( + { + "query_id": ["cat", "dog", "dog", "cat"], + "embedding": [ + np.array([3.0, 5.2]), + np.array([1.0, 2.0]), + np.array([1.0, 2.0]), + np.array([3.0, 5.2]), + ], + "id": [1, 2, 1, 2], + "my_embedding": [ + np.array([1.0, 2.0]), + np.array([2.0, 4.0]), + np.array([1.0, 2.0]), + np.array([2.0, 4.0]), + ], + "distance": [0.001777, 0.0, 0.0, 0.001777], + }, + index=pd.Index([1, 0, 0, 1], dtype="Int64"), + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1) + + +def test_apply_sql_series_query(): + query = bpd.Series([[1.0, 2.0], [3.0, 5.2]]) + options = { + "base_table": "bigframes-dev.bigframes_tests_sys.base_table", + "column_to_search": "my_embedding", + "distance_type": "euclidean", + "top_k": 2, + } + result = bbq.utils.apply_sql(query, options).to_pandas() # type:ignore + expected = pd.DataFrame( + { + "0": [ + np.array([3.0, 5.2]), + np.array([1.0, 2.0]), + np.array([3.0, 5.2]), + np.array([1.0, 2.0]), + ], + "id": [2, 4, 5, 1], + "my_embedding": [ + np.array([2.0, 4.0]), + np.array([1.0, 3.2]), + np.array([5.0, 5.4]), + np.array([1.0, 2.0]), + ], + "distance": [1.562049935181331, 1.2000000000000002, 2.009975124224178, 0.0], + }, + index=pd.Index([1, 0, 1, 0], dtype="Int64"), + ) + pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1) diff --git a/tests/unit/bigquery/test_utils.py b/tests/unit/bigquery/test_utils.py index 56323d1b02..1f781701d8 100644 --- a/tests/unit/bigquery/test_utils.py +++ 
b/tests/unit/bigquery/test_utils.py @@ -12,11 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import pandas as pd - import bigframes.bigquery as bbq -import bigframes.pandas as bpd def test_create_vector_search_sql_simple(): @@ -79,71 +75,3 @@ def test_create_vector_search_sql_query_column_to_search(): sql_string, options # type:ignore ) assert result_query == expected_query - - -def test_apply_sql_df_query(): - query = bpd.DataFrame( - { - "query_id": ["dog", "cat"], - "embedding": [[1.0, 2.0], [3.0, 5.2]], - } - ) - options = { - "base_table": "bigframes-dev.bigframes_tests_sys.base_table", - "column_to_search": "my_embedding", - "distance_type": "cosine", - "top_k": 2, - } - result = bbq.utils.apply_sql(query, options).to_pandas() # type:ignore - expected = pd.DataFrame( - { - "query_id": ["cat", "dog", "dog", "cat"], - "embedding": [ - np.array([3.0, 5.2]), - np.array([1.0, 2.0]), - np.array([1.0, 2.0]), - np.array([3.0, 5.2]), - ], - "id": [1, 2, 1, 2], - "my_embedding": [ - np.array([1.0, 2.0]), - np.array([2.0, 4.0]), - np.array([1.0, 2.0]), - np.array([2.0, 4.0]), - ], - "distance": [0.001777, 0.0, 0.0, 0.001777], - }, - index=pd.Index([1, 0, 0, 1], dtype="Int64"), - ) - pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1) - - -def test_apply_sql_series_query(): - query = bpd.Series([[1.0, 2.0], [3.0, 5.2]]) - options = { - "base_table": "bigframes-dev.bigframes_tests_sys.base_table", - "column_to_search": "my_embedding", - "distance_type": "euclidean", - "top_k": 2, - } - result = bbq.utils.apply_sql(query, options).to_pandas() # type:ignore - expected = pd.DataFrame( - { - "0": [ - np.array([3.0, 5.2]), - np.array([1.0, 2.0]), - np.array([3.0, 5.2]), - np.array([1.0, 2.0]), - ], - "id": [2, 4, 5, 1], - "my_embedding": [ - np.array([2.0, 4.0]), - np.array([1.0, 3.2]), - np.array([5.0, 5.4]), - np.array([1.0, 2.0]), - ], - "distance": 
[1.562049935181331, 1.2000000000000002, 2.009975124224178, 0.0], - }, - index=pd.Index([1, 0, 1, 0], dtype="Int64"), - ) - pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1) From 5e6fe306c0d6c44c033aef7ef6cd77a5075edd38 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Fri, 31 May 2024 22:19:10 +0000 Subject: [PATCH 3/8] address comments --- bigframes/bigquery/__init__.py | 41 ++++++--- bigframes/bigquery/utils.py | 85 ------------------ bigframes/core/sql.py | 46 +++++++++- tests/system/small/bigquery/test_utils.py | 87 ------------------- tests/unit/bigquery/__init__.py | 13 --- .../test_utils.py => core/test_sql.py} | 7 +- 6 files changed, 78 insertions(+), 201 deletions(-) delete mode 100644 bigframes/bigquery/utils.py delete mode 100644 tests/system/small/bigquery/test_utils.py delete mode 100644 tests/unit/bigquery/__init__.py rename tests/unit/{bigquery/test_utils.py => core/test_sql.py} (93%) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 21c9c9edc4..73647b8573 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -20,17 +20,23 @@ from __future__ import annotations +import typing from typing import Literal, Optional, Union -import bigframes.bigquery.utils as utils import bigframes.constants as constants import bigframes.core.groupby as groupby +import bigframes.core.sql +import bigframes.ml.utils as utils import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops -import bigframes.pandas as bpd +import bigframes.series +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + import bigframes.series as series -def array_length(series: bpd.Series) -> bpd.Series: + +def array_length(series: series.Series) -> series.Series: """Compute the length of each array element in the Series. 
**Examples:** @@ -67,7 +73,7 @@ def array_length(series: bpd.Series) -> bpd.Series: def array_agg( obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy, -) -> bpd.Series | bpd.DataFrame: +) -> series.Series | dataframe.DataFrame: """Group data and create arrays from selected columns, omitting NULLs to avoid BigQuery errors (NULLs not allowed in arrays). @@ -118,7 +124,7 @@ def array_agg( ) -def array_to_string(series: bpd.Series, delimiter: str) -> bpd.Series: +def array_to_string(series: series.Series, delimiter: str) -> series.Series: """Converts array elements within a Series into delimited strings. **Examples:** @@ -151,14 +157,14 @@ def array_to_string(series: bpd.Series, delimiter: str) -> bpd.Series: def vector_search( base_table: str, column_to_search: str, - query: Union[bpd.DataFrame, bpd.Series], + query: Union[dataframe.DataFrame, series.Series], *, query_column_to_search: Optional[str] = None, top_k: Optional[int] = 10, distance_type: Literal["euclidean", "cosine"] = "euclidean", fraction_lists_to_search: Optional[float] = None, use_brute_force: bool = False, -) -> bpd.DataFrame: +) -> dataframe.DataFrame: """ Conduct vector search to earch embeddings to find semantically similar entities. @@ -258,11 +264,14 @@ def vector_search( raise ValueError( "You can't specify fraction_lists_to_search when use_brute_force is set to True." ) - if isinstance(query, bpd.Series) and query_column_to_search is not None: + if ( + isinstance(query, bigframes.series.Series) + and query_column_to_search is not None + ): raise ValueError( "You can't specify query_column_to_search when query is a Series." ) - ## (TODO: ashleyxu. Support options in vector search.) + # TODO(ashleyxu): ashleyxu. Support options in vector search. b/344019989 if fraction_lists_to_search is not None or use_brute_force is True: raise NotImplementedError( f"fraction_lists_to_search and use_brute_force is not supported. 
{constants.FEEDBACK_LINK}" @@ -277,8 +286,16 @@ def vector_search( "use_brute_force": use_brute_force, } - df = utils.apply_sql( - query, - options, # type:ignore + (query,) = utils.convert_to_dataframe(query) + sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True) + + sql = bigframes.core.sql.create_vector_search_sql( + sql_string=sql_string, options=options # type: ignore ) + if index_col_ids is not None: + df = query._session.read_gbq(sql, index_col=index_col_ids) + else: + df = query._session.read_gbq(sql) + df.index.names = index_labels + return df diff --git a/bigframes/bigquery/utils.py b/bigframes/bigquery/utils.py deleted file mode 100644 index b311d6e5df..0000000000 --- a/bigframes/bigquery/utils.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Iterable, Mapping, Union - -import bigframes.ml.utils as utils -import bigframes.pandas as bpd - - -def create_vector_search_sql( - sql_string: str, - options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, -) -> str: - """Encode the VECTOR SEARCH statement for BigQuery Vector Search.""" - - base_table = options["base_table"] - column_to_search = options["column_to_search"] - distance_type = options["distance_type"] - top_k = options["top_k"] - query_column_to_search = options.get("query_column_to_search", None) - - if query_column_to_search is not None: - query_str = f""" - SELECT - query.*, - base.*, - distance, - FROM VECTOR_SEARCH( - TABLE `{base_table}`, - "{column_to_search}", - ({sql_string}), - "{query_column_to_search}", - distance_type => "{distance_type}", - top_k => {top_k} - ) - """ - else: - query_str = f""" - SELECT - query.*, - base.*, - distance, - FROM VECTOR_SEARCH( - TABLE `{base_table}`, - "{column_to_search}", - ({sql_string}), - distance_type => "{distance_type}", - top_k => {top_k} - ) - """ - return query_str - - -def apply_sql( - query: Union[bpd.DataFrame, bpd.Series], - options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, -) -> bpd.DataFrame: - """Helper to wrap a dataframe in a SQL query, keeping the index intact. - - Args: - query (bigframes.dataframe.DataFrame): - The dataframe to be wrapped. 
- """ - (query,) = utils.convert_to_dataframe(query) - sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True) - - sql = create_vector_search_sql(sql_string=sql_string, options=options) - if index_col_ids is not None: - df = query._session.read_gbq(sql, index_col=index_col_ids) - else: - df = query._session.read_gbq(sql) - df.index.names = index_labels - - return df diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index c1e319b860..0fb99d67eb 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -20,7 +20,7 @@ import datetime import math import textwrap -from typing import Iterable, TYPE_CHECKING +from typing import Iterable, Mapping, TYPE_CHECKING, Union # Literals and identifiers matching this pattern can be unquoted unquoted = r"^[A-Za-z_][A-Za-z_0-9]*$" @@ -169,3 +169,47 @@ def ordering_clause( part = f"`{ordering_expr.id}` {asc_desc} {null_clause}" parts.append(part) return f"ORDER BY {' ,'.join(parts)}" + + +def create_vector_search_sql( + sql_string: str, + options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, +) -> str: + """Encode the VECTOR SEARCH statement for BigQuery Vector Search.""" + + base_table = options["base_table"] + column_to_search = options["column_to_search"] + distance_type = options["distance_type"] + top_k = options["top_k"] + query_column_to_search = options.get("query_column_to_search", None) + + if query_column_to_search is not None: + query_str = f""" + SELECT + query.*, + base.*, + distance, + FROM VECTOR_SEARCH( + TABLE `{base_table}`, + "{column_to_search}", + ({sql_string}), + "{query_column_to_search}", + distance_type => "{distance_type}", + top_k => {top_k} + ) + """ + else: + query_str = f""" + SELECT + query.*, + base.*, + distance, + FROM VECTOR_SEARCH( + TABLE `{base_table}`, + "{column_to_search}", + ({sql_string}), + distance_type => "{distance_type}", + top_k => {top_k} + ) + """ + return query_str diff --git a/tests/system/small/bigquery/test_utils.py 
b/tests/system/small/bigquery/test_utils.py deleted file mode 100644 index 535a645fe4..0000000000 --- a/tests/system/small/bigquery/test_utils.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import pandas as pd - -import bigframes.bigquery as bbq -import bigframes.pandas as bpd - - -def test_apply_sql_df_query(): - query = bpd.DataFrame( - { - "query_id": ["dog", "cat"], - "embedding": [[1.0, 2.0], [3.0, 5.2]], - } - ) - options = { - "base_table": "bigframes-dev.bigframes_tests_sys.base_table", - "column_to_search": "my_embedding", - "distance_type": "cosine", - "top_k": 2, - } - result = bbq.utils.apply_sql(query, options).to_pandas() # type:ignore - expected = pd.DataFrame( - { - "query_id": ["cat", "dog", "dog", "cat"], - "embedding": [ - np.array([3.0, 5.2]), - np.array([1.0, 2.0]), - np.array([1.0, 2.0]), - np.array([3.0, 5.2]), - ], - "id": [1, 2, 1, 2], - "my_embedding": [ - np.array([1.0, 2.0]), - np.array([2.0, 4.0]), - np.array([1.0, 2.0]), - np.array([2.0, 4.0]), - ], - "distance": [0.001777, 0.0, 0.0, 0.001777], - }, - index=pd.Index([1, 0, 0, 1], dtype="Int64"), - ) - pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1) - - -def test_apply_sql_series_query(): - query = bpd.Series([[1.0, 2.0], [3.0, 5.2]]) - options = { - "base_table": "bigframes-dev.bigframes_tests_sys.base_table", - "column_to_search": "my_embedding", - "distance_type": 
"euclidean", - "top_k": 2, - } - result = bbq.utils.apply_sql(query, options).to_pandas() # type:ignore - expected = pd.DataFrame( - { - "0": [ - np.array([3.0, 5.2]), - np.array([1.0, 2.0]), - np.array([3.0, 5.2]), - np.array([1.0, 2.0]), - ], - "id": [2, 4, 5, 1], - "my_embedding": [ - np.array([2.0, 4.0]), - np.array([1.0, 3.2]), - np.array([5.0, 5.4]), - np.array([1.0, 2.0]), - ], - "distance": [1.562049935181331, 1.2000000000000002, 2.009975124224178, 0.0], - }, - index=pd.Index([1, 0, 1, 0], dtype="Int64"), - ) - pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1) diff --git a/tests/unit/bigquery/__init__.py b/tests/unit/bigquery/__init__.py deleted file mode 100644 index 6d5e14bcf4..0000000000 --- a/tests/unit/bigquery/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/unit/bigquery/test_utils.py b/tests/unit/core/test_sql.py similarity index 93% rename from tests/unit/bigquery/test_utils.py rename to tests/unit/core/test_sql.py index 1f781701d8..b5b2e1c14e 100644 --- a/tests/unit/bigquery/test_utils.py +++ b/tests/unit/core/test_sql.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import bigframes.bigquery as bbq + +from bigframes.core import sql def test_create_vector_search_sql_simple(): @@ -39,7 +40,7 @@ def test_create_vector_search_sql_simple(): ) """ - result_query = bbq.utils.create_vector_search_sql( + result_query = sql.create_vector_search_sql( sql_string, options # type:ignore ) assert result_query == expected_query @@ -71,7 +72,7 @@ def test_create_vector_search_sql_query_column_to_search(): ) """ - result_query = bbq.utils.create_vector_search_sql( + result_query = sql.create_vector_search_sql( sql_string, options # type:ignore ) assert result_query == expected_query From b7e347016ee9ee380574a5b73e3707ad21d82b86 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Sat, 1 Jun 2024 01:23:31 +0000 Subject: [PATCH 4/8] docstring fix --- bigframes/bigquery/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 73647b8573..b690e83e27 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -166,7 +166,7 @@ def vector_search( use_brute_force: bool = False, ) -> dataframe.DataFrame: """ - Conduct vector search to earch embeddings to find semantically similar entities. + Conduct vector search which searches embeddings to find semantically similar entities. **Examples:** @@ -233,7 +233,7 @@ def vector_search( Args: base_table (str): The table to search for nearest neighbor embeddings. - column_to_search (groupby.SeriesGroupBy | groupby.DataFrameGroupBy): + column_to_search (str): The name of the base table column to search for nearest neighbor embeddings. The column must have a type of ``ARRAY``. All elements in the array must be non-NULL. query (bigframes.dataframe.DataFrame | bigframes.dataframe.Series): @@ -271,7 +271,7 @@ def vector_search( raise ValueError( "You can't specify query_column_to_search when query is a Series." ) - # TODO(ashleyxu): ashleyxu. Support options in vector search. 
b/344019989 + # TODO(ashleyxu): Support options in vector search. b/344019989 if fraction_lists_to_search is not None or use_brute_force is True: raise NotImplementedError( f"fraction_lists_to_search and use_brute_force is not supported. {constants.FEEDBACK_LINK}" From 2a5de8ed2bd2b0bc7e7d4c028465b200c7959523 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Thu, 6 Jun 2024 17:40:05 +0000 Subject: [PATCH 5/8] address comments --- bigframes/core/sql.py | 16 ++++++++-------- tests/unit/core/test_sql.py | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index 0fb99d67eb..a011bc9965 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -173,7 +173,7 @@ def ordering_clause( def create_vector_search_sql( sql_string: str, - options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, + options: Mapping[str, Union[str | int | bool | float]] = {}, ) -> str: """Encode the VECTOR SEARCH statement for BigQuery Vector Search.""" @@ -191,11 +191,11 @@ def create_vector_search_sql( distance, FROM VECTOR_SEARCH( TABLE `{base_table}`, - "{column_to_search}", + {simple_literal(column_to_search)}, ({sql_string}), - "{query_column_to_search}", - distance_type => "{distance_type}", - top_k => {top_k} + {simple_literal(query_column_to_search)}, + distance_type => {simple_literal(distance_type)}, + top_k => {simple_literal(top_k)} ) """ else: @@ -206,10 +206,10 @@ def create_vector_search_sql( distance, FROM VECTOR_SEARCH( TABLE `{base_table}`, - "{column_to_search}", + {simple_literal(column_to_search)}, ({sql_string}), - distance_type => "{distance_type}", - top_k => {top_k} + distance_type => {simple_literal(distance_type)}, + top_k => {simple_literal(top_k)} ) """ return query_str diff --git a/tests/unit/core/test_sql.py b/tests/unit/core/test_sql.py index b5b2e1c14e..feef772058 100644 --- a/tests/unit/core/test_sql.py +++ b/tests/unit/core/test_sql.py @@ -33,9 +33,9 @@ def 
test_create_vector_search_sql_simple(): distance, FROM VECTOR_SEARCH( TABLE `my_base_table`, - "my_embedding_column", + 'my_embedding_column', ({sql_string}), - distance_type => "COSINE", + distance_type => 'COSINE', top_k => 10 ) """ @@ -63,11 +63,11 @@ def test_create_vector_search_sql_query_column_to_search(): base.*, distance, FROM VECTOR_SEARCH( - TABLE `my_base_table`, - "my_embedding_column", + TABLE my_base_table, + 'my_embedding_column', ({sql_string}), - "new_embedding_column", - distance_type => "COSINE", + 'new_embedding_column', + distance_type => 'COSINE', top_k => 10 ) """ From 2bbbaf4e1fad3f4587de6fab4ca7797beabcf8b0 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Thu, 6 Jun 2024 19:09:10 +0000 Subject: [PATCH 6/8] small fix --- tests/unit/core/test_sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/core/test_sql.py b/tests/unit/core/test_sql.py index feef772058..29f1e48a70 100644 --- a/tests/unit/core/test_sql.py +++ b/tests/unit/core/test_sql.py @@ -63,7 +63,7 @@ def test_create_vector_search_sql_query_column_to_search(): base.*, distance, FROM VECTOR_SEARCH( - TABLE my_base_table, + TABLE `my_base_table`, 'my_embedding_column', ({sql_string}), 'new_embedding_column', From f2859f3da12ace9fe4debd256d5a3448c0b49413 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Thu, 6 Jun 2024 20:36:38 +0000 Subject: [PATCH 7/8] add docstring clarification --- bigframes/bigquery/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index b690e83e27..9cc97d949f 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -175,7 +175,8 @@ def vector_search( >>> import bigframes.bigquery as bbq >>> bpd.options.display.progress_bar = None - DataFrame embeddings for which to find nearest neighbors: + DataFrame embeddings for which to find nearest neighbors, and ARRAY column + is used as the search query: >>> search_query = 
bpd.DataFrame({"query_id": ["dog", "cat"], ... "embedding": [[1.0, 2.0], [3.0, 5.2]]}) @@ -210,7 +211,9 @@ def vector_search( [4 rows x 4 columns] - You can specify the name of the column in the query DataFrame embeddings and distance type: + You can specify the name of the column in the query DataFrame embeddings and distance type. + If you specify a query_column_to_search value, it will use the provided column which contains + the embeddings for which to find nearest neighbors. Otherwise, it uses the column_to_search value. >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], ... "embedding": [[1.0, 2.0], [3.0, 5.2]], From a2f3b1d6316e824cdf5e4c8e26d503bcb5a8871f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 6 Jun 2024 16:19:53 -0500 Subject: [PATCH 8/8] Update bigframes/bigquery/__init__.py --- bigframes/bigquery/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 9cc97d949f..85a9010a7d 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -175,7 +175,7 @@ def vector_search( >>> import bigframes.bigquery as bbq >>> bpd.options.display.progress_bar = None - DataFrame embeddings for which to find nearest neighbors, and ARRAY column + DataFrame embeddings for which to find nearest neighbors. The ``ARRAY`` column is used as the search query: >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],