From 407db5edcae2dc02c64bee56f5bcc70aa24dd861 Mon Sep 17 00:00:00 2001
From: Ashley Xu <ashleyxu@google.com>
Date: Thu, 16 May 2024 18:05:57 +0000
Subject: [PATCH 1/8] feat: support bigquery.vector_search()

---
 bigframes/bigquery/__init__.py                | 150 +++++++++++++++++-
 bigframes/bigquery/utils.py                   |  85 ++++++++++
 .../small/bigquery/test_vector_search.py      | 136 ++++++++++++++++
 tests/unit/bigquery/__init__.py               |  13 ++
 tests/unit/bigquery/test_utils.py             | 149 +++++++++++++++++
 5 files changed, 525 insertions(+), 8 deletions(-)
 create mode 100644 bigframes/bigquery/utils.py
 create mode 100644 tests/system/small/bigquery/test_vector_search.py
 create mode 100644 tests/unit/bigquery/__init__.py
 create mode 100644 tests/unit/bigquery/test_utils.py
diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index 5808aa28bf..33e65bbe76 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -20,19 +20,17 @@
 
 from __future__ import annotations
 
-import typing
+from typing import Literal, Optional, Union
 
+import bigframes.bigquery.utils as utils
 import bigframes.constants as constants
 import bigframes.core.groupby as groupby
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
+import bigframes.pandas as bpd
 
-if typing.TYPE_CHECKING:
-    import bigframes.dataframe as dataframe
-    import bigframes.series as series
 
-
-def array_length(series: series.Series) -> series.Series:
+def array_length(series: bpd.Series) -> bpd.Series:
     """Compute the length of each array element in the Series.
 
     **Examples:**
@@ -69,7 +67,7 @@ def array_length(series: series.Series) -> series.Series:
 
 def array_agg(
     obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy,
-) -> series.Series | dataframe.DataFrame:
+) -> bpd.Series | bpd.DataFrame:
     """Group data and create arrays from selected columns, omitting NULLs to avoid
     BigQuery errors (NULLs not allowed in arrays).
 
@@ -120,7 +118,7 @@ def array_agg(
         )
 
 
-def array_to_string(series: series.Series, delimiter: str) -> series.Series:
+def array_to_string(series: bpd.Series, delimiter: str) -> bpd.Series:
     """Converts array elements within a Series into delimited strings.
 
     **Examples:**
@@ -148,3 +146,139 @@ def array_to_string(series: series.Series, delimiter: str) -> series.Series:
 
     """
     return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter))
+
+
+def vector_search(
+    base_table: str,
+    column_to_search: str,
+    query: Union[bpd.DataFrame, bpd.Series],
+    *,
+    query_column_to_search: Optional[str] = None,
+    top_k: Optional[int] = 10,
+    distance_type: Literal["euclidean", "cosine"] = "euclidean",
+    fraction_lists_to_search: Optional[float] = None,
+    use_brute_force: bool = False,
+) -> bpd.DataFrame:
+    """
+    Conduct vector search to earch embeddings to find semantically similar entities.
+
+    **Examples:**
+
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+
+    DataFrame embeddings for which to find nearest neighbors:
+
+        >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
+        ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]]})
+        >>> bbq.vector_search(
+        ...             base_table="bigframes-dev.bigframes_tests_sys.base_table",
+        ...             column_to_search="my_embedding",
+        ...             query=search_query,
+        ...             top_k=2)
+            query_id  embedding  id my_embedding  distance
+        1      cat  [3.  5.2]   5    [5.  5.4]  2.009975
+        0      dog    [1. 2.]   1      [1. 2.]       0.0
+        0      dog    [1. 2.]   4    [1.  3.2]       1.2
+        1      cat  [3.  5.2]   2      [2. 4.]   1.56205
+        <BLANKLINE>
+        [4 rows x 5 columns]
+
+    Series embeddings for which to find nearest neighbors:
+
+        >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]],
+        ...                            index=["dog", "cat"],
+        ...                            name="embedding")
+        >>> bbq.vector_search(
+        ...             base_table="bigframes-dev.bigframes_tests_sys.base_table",
+        ...             column_to_search="my_embedding",
+        ...             query=search_query,
+        ...             top_k=2)
+             embedding  id my_embedding  distance
+        dog    [1. 2.]   1      [1. 2.]       0.0
+        cat  [3.  5.2]   5    [5.  5.4]  2.009975
+        dog    [1. 2.]   4    [1.  3.2]       1.2
+        cat  [3.  5.2]   2      [2. 4.]   1.56205
+        <BLANKLINE>
+        [4 rows x 4 columns]
+
+    You can specify the name of the column in the query DataFrame embeddings and distance type:
+
+        >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
+        ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]],
+        ...                               "another_embedding": [[0.7, 2.2], [3.3, 5.2]]})
+        >>> bbq.vector_search(
+        ...             base_table="bigframes-dev.bigframes_tests_sys.base_table",
+        ...             column_to_search="my_embedding",
+        ...             query=search_query,
+        ...             distance_type="cosine",
+        ...             query_column_to_search="another_embedding",
+        ...             top_k=2)
+          query_id  embedding another_embedding  id my_embedding  distance
+        1      cat  [3.  5.2]         [3.3 5.2]   2      [2. 4.]  0.005181
+        0      dog    [1. 2.]         [0.7 2.2]   4    [1.  3.2]  0.000013
+        1      cat  [3.  5.2]         [3.3 5.2]   1      [1. 2.]  0.005181
+        0      dog    [1. 2.]         [0.7 2.2]   3    [1.5 7. ]  0.004697
+        <BLANKLINE>
+        [4 rows x 6 columns]
+
+    Args:
+        base_table (str):
+            The table to search for nearest neighbor embeddings.
+        column_to_search (str):
+            The name of the base table column to search for nearest neighbor embeddings.
+            The column must have a type of ``ARRAY<FLOAT64>``. All elements in the array must be non-NULL.
+        query (bigframes.dataframe.DataFrame | bigframes.series.Series):
+            A Series or DataFrame that provides the embeddings for which to find nearest neighbors.
+        query_column_to_search (str):
+            Specifies the name of the column in the query that contains the embeddings for which to
+            find nearest neighbors. The column must have a type of ``ARRAY<FLOAT64>``. All elements in
+            the array must be non-NULL and all values in the column must have the same array dimensions
+            as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame.
+        top_k (int, default 10):
+            Specifies the number of nearest neighbors to return. Default to 10.
+        distance_type (str, default "euclidean"):
+            Specifies the type of metric to use to compute the distance between two vectors.
+            Possible values are "euclidean" and "cosine". Default to "euclidean".
+        fraction_lists_to_search (float, range in [0.0, 1.0]):
+            Specifies the percentage of lists to search. Specifying a higher percentage leads to
+            higher recall and slower performance, and the converse is true when specifying a lower
+            percentage. It is only used when a vector index is also used. You can only specify
+            ``fraction_lists_to_search`` when ``use_brute_force`` is set to False.
+        use_brute_force (bool, default False):
+            Determines whether to use brute force search by skipping the vector index if one is available.
+            Default to False.
+
+    Returns:
+        bigframes.dataframe.DataFrame: A DataFrame containing vector search result.
+    """
+    if fraction_lists_to_search is not None and use_brute_force is True:
+        raise ValueError(
+            "You can't specify fraction_lists_to_search when use_brute_force is set to True."
+        )
+    if isinstance(query, bpd.Series) and query_column_to_search is not None:
+        raise ValueError(
+            "You can't specify query_column_to_search when query is a Series."
+        )
+    ## (TODO: ashleyxu. Support options in vector search.)
+    if fraction_lists_to_search is not None or use_brute_force is True:
+        raise NotImplementedError(
+            f"fraction_lists_to_search and use_brute_force is not supported. {constants.FEEDBACK_LINK}"
+        )
+    options = {
+        "base_table": base_table,
+        "column_to_search": column_to_search,
+        "query_column_to_search": query_column_to_search,
+        "distance_type": distance_type,
+        "top_k": top_k,
+        "fraction_lists_to_search": fraction_lists_to_search,
+        "use_brute_force": use_brute_force,
+    }
+
+    df = utils.apply_sql(
+        query,
+        options,  # type:ignore
+    )
+    return df
diff --git a/bigframes/bigquery/utils.py b/bigframes/bigquery/utils.py
new file mode 100644
index 0000000000..b311d6e5df
--- /dev/null
+++ b/bigframes/bigquery/utils.py
@@ -0,0 +1,85 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Iterable, Mapping, Union
+
+import bigframes.ml.utils as utils
+import bigframes.pandas as bpd
+
+
+def create_vector_search_sql(
+    sql_string: str,
+    options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
+) -> str:
+    """Encode the VECTOR SEARCH statement for BigQuery Vector Search."""
+
+    base_table = options["base_table"]
+    column_to_search = options["column_to_search"]
+    distance_type = options["distance_type"]
+    top_k = options["top_k"]
+    query_column_to_search = options.get("query_column_to_search", None)
+
+    if query_column_to_search is not None:
+        query_str = f"""
+    SELECT
+        query.*,
+        base.*,
+        distance,
+    FROM VECTOR_SEARCH(
+        TABLE `{base_table}`,
+        "{column_to_search}",
+        ({sql_string}),
+        "{query_column_to_search}",
+        distance_type => "{distance_type}",
+        top_k => {top_k}
+    )
+    """
+    else:
+        query_str = f"""
+    SELECT
+        query.*,
+        base.*,
+        distance,
+    FROM VECTOR_SEARCH(
+        TABLE `{base_table}`,
+        "{column_to_search}",
+        ({sql_string}),
+        distance_type => "{distance_type}",
+        top_k => {top_k}
+    )
+    """
+    return query_str
+
+
+def apply_sql(
+    query: Union[bpd.DataFrame, bpd.Series],
+    options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
+) -> bpd.DataFrame:
+    """Helper to wrap a dataframe in a SQL query, keeping the index intact.
+
+    Args:
+        query (bigframes.dataframe.DataFrame):
+            The dataframe to be wrapped.
+    """
+    (query,) = utils.convert_to_dataframe(query)
+    sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True)
+
+    sql = create_vector_search_sql(sql_string=sql_string, options=options)
+    if index_col_ids is not None:
+        df = query._session.read_gbq(sql, index_col=index_col_ids)
+    else:
+        df = query._session.read_gbq(sql)
+    df.index.names = index_labels
+
+    return df
diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py
new file mode 100644
index 0000000000..4280c0a888
--- /dev/null
+++ b/tests/system/small/bigquery/test_vector_search.py
@@ -0,0 +1,136 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pandas as pd
+
+import bigframes.bigquery as bbq
+import bigframes.pandas as bpd
+
+
+def test_vector_search_basic_params_with_df():
+    search_query = bpd.DataFrame(
+        {
+            "query_id": ["dog", "cat"],
+            "embedding": [[1.0, 2.0], [3.0, 5.2]],
+        }
+    )
+    vector_search_result = bbq.vector_search(
+        base_table="bigframes-dev.bigframes_tests_sys.base_table",
+        column_to_search="my_embedding",
+        query=search_query,
+        top_k=2,
+    ).to_pandas()  # type:ignore
+    expected = pd.DataFrame(
+        {
+            "query_id": ["cat", "dog", "dog", "cat"],
+            "embedding": [
+                np.array([3.0, 5.2]),
+                np.array([1.0, 2.0]),
+                np.array([1.0, 2.0]),
+                np.array([3.0, 5.2]),
+            ],
+            "id": [5, 1, 4, 2],
+            "my_embedding": [
+                np.array([5.0, 5.4]),
+                np.array([1.0, 2.0]),
+                np.array([1.0, 3.2]),
+                np.array([2.0, 4.0]),
+            ],
+            "distance": [2.009975, 0.0, 1.2, 1.56205],
+        },
+        index=pd.Index([1, 0, 0, 1], dtype="Int64"),
+    )
+    pd.testing.assert_frame_equal(
+        vector_search_result, expected, check_dtype=False, rtol=0.1
+    )
+
+
+def test_vector_search_different_params_with_query():
+    search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]])
+    vector_search_result = bbq.vector_search(
+        base_table="bigframes-dev.bigframes_tests_sys.base_table",
+        column_to_search="my_embedding",
+        query=search_query,
+        distance_type="cosine",
+        top_k=2,
+    ).to_pandas()  # type:ignore
+    expected = pd.DataFrame(
+        {
+            "0": [
+                np.array([1.0, 2.0]),
+                np.array([1.0, 2.0]),
+                np.array([3.0, 5.2]),
+                np.array([3.0, 5.2]),
+            ],
+            "id": [2, 1, 1, 2],
+            "my_embedding": [
+                np.array([2.0, 4.0]),
+                np.array([1.0, 2.0]),
+                np.array([1.0, 2.0]),
+                np.array([2.0, 4.0]),
+            ],
+            "distance": [0.0, 0.0, 0.001777, 0.001777],
+        },
+        index=pd.Index([0, 0, 1, 1], dtype="Int64"),
+    )
+    pd.testing.assert_frame_equal(
+        vector_search_result, expected, check_dtype=False, rtol=0.1
+    )
+
+
+def test_vector_search_df_with_query_column_to_search():
+    search_query = bpd.DataFrame(
+        {
+            "query_id": ["dog", "cat"],
+            "embedding": [[1.0, 2.0], [3.0, 5.2]],
+            "another_embedding": [[1.0, 2.5], [3.3, 5.2]],
+        }
+    )
+    vector_search_result = bbq.vector_search(
+        base_table="bigframes-dev.bigframes_tests_sys.base_table",
+        column_to_search="my_embedding",
+        query=search_query,
+        query_column_to_search="another_embedding",
+        top_k=2,
+    ).to_pandas()  # type:ignore
+    expected = pd.DataFrame(
+        {
+            "query_id": ["dog", "dog", "cat", "cat"],
+            "embedding": [
+                np.array([1.0, 2.0]),
+                np.array([1.0, 2.0]),
+                np.array([3.0, 5.2]),
+                np.array([3.0, 5.2]),
+            ],
+            "another_embedding": [
+                np.array([1.0, 2.5]),
+                np.array([1.0, 2.5]),
+                np.array([3.3, 5.2]),
+                np.array([3.3, 5.2]),
+            ],
+            "id": [1, 4, 2, 5],
+            "my_embedding": [
+                np.array([1.0, 2.0]),
+                np.array([1.0, 3.2]),
+                np.array([2.0, 4.0]),
+                np.array([5.0, 5.4]),
+            ],
+            "distance": [0.5, 0.7, 1.769181, 1.711724],
+        },
+        index=pd.Index([0, 0, 1, 1], dtype="Int64"),
+    )
+    pd.testing.assert_frame_equal(
+        vector_search_result, expected, check_dtype=False, rtol=0.1
+    )
diff --git a/tests/unit/bigquery/__init__.py b/tests/unit/bigquery/__init__.py
new file mode 100644
index 0000000000..6d5e14bcf4
--- /dev/null
+++ b/tests/unit/bigquery/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/unit/bigquery/test_utils.py b/tests/unit/bigquery/test_utils.py
new file mode 100644
index 0000000000..56323d1b02
--- /dev/null
+++ b/tests/unit/bigquery/test_utils.py
@@ -0,0 +1,149 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pandas as pd
+
+import bigframes.bigquery as bbq
+import bigframes.pandas as bpd
+
+
+def test_create_vector_search_sql_simple():
+    sql_string = "SELECT embedding FROM my_embeddings_table WHERE id = 1"
+    options = {
+        "base_table": "my_base_table",
+        "column_to_search": "my_embedding_column",
+        "distance_type": "COSINE",
+        "top_k": 10,
+        "use_brute_force": False,
+    }
+
+    expected_query = f"""
+    SELECT
+        query.*,
+        base.*,
+        distance,
+    FROM VECTOR_SEARCH(
+        TABLE `my_base_table`,
+        "my_embedding_column",
+        ({sql_string}),
+        distance_type => "COSINE",
+        top_k => 10
+    )
+    """
+
+    result_query = bbq.utils.create_vector_search_sql(
+        sql_string, options  # type:ignore
+    )
+    assert result_query == expected_query
+
+
+def test_create_vector_search_sql_query_column_to_search():
+    sql_string = "SELECT embedding FROM my_embeddings_table WHERE id = 1"
+    options = {
+        "base_table": "my_base_table",
+        "column_to_search": "my_embedding_column",
+        "distance_type": "COSINE",
+        "top_k": 10,
+        "query_column_to_search": "new_embedding_column",
+        "use_brute_force": False,
+    }
+
+    expected_query = f"""
+    SELECT
+        query.*,
+        base.*,
+        distance,
+    FROM VECTOR_SEARCH(
+        TABLE `my_base_table`,
+        "my_embedding_column",
+        ({sql_string}),
+        "new_embedding_column",
+        distance_type => "COSINE",
+        top_k => 10
+    )
+    """
+
+    result_query = bbq.utils.create_vector_search_sql(
+        sql_string, options  # type:ignore
+    )
+    assert result_query == expected_query
+
+
+def test_apply_sql_df_query():
+    query = bpd.DataFrame(
+        {
+            "query_id": ["dog", "cat"],
+            "embedding": [[1.0, 2.0], [3.0, 5.2]],
+        }
+    )
+    options = {
+        "base_table": "bigframes-dev.bigframes_tests_sys.base_table",
+        "column_to_search": "my_embedding",
+        "distance_type": "cosine",
+        "top_k": 2,
+    }
+    result = bbq.utils.apply_sql(query, options).to_pandas()  # type:ignore
+    expected = pd.DataFrame(
+        {
+            "query_id": ["cat", "dog", "dog", "cat"],
+            "embedding": [
+                np.array([3.0, 5.2]),
+                np.array([1.0, 2.0]),
+                np.array([1.0, 2.0]),
+                np.array([3.0, 5.2]),
+            ],
+            "id": [1, 2, 1, 2],
+            "my_embedding": [
+                np.array([1.0, 2.0]),
+                np.array([2.0, 4.0]),
+                np.array([1.0, 2.0]),
+                np.array([2.0, 4.0]),
+            ],
+            "distance": [0.001777, 0.0, 0.0, 0.001777],
+        },
+        index=pd.Index([1, 0, 0, 1], dtype="Int64"),
+    )
+    pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1)
+
+
+def test_apply_sql_series_query():
+    query = bpd.Series([[1.0, 2.0], [3.0, 5.2]])
+    options = {
+        "base_table": "bigframes-dev.bigframes_tests_sys.base_table",
+        "column_to_search": "my_embedding",
+        "distance_type": "euclidean",
+        "top_k": 2,
+    }
+    result = bbq.utils.apply_sql(query, options).to_pandas()  # type:ignore
+    expected = pd.DataFrame(
+        {
+            "0": [
+                np.array([3.0, 5.2]),
+                np.array([1.0, 2.0]),
+                np.array([3.0, 5.2]),
+                np.array([1.0, 2.0]),
+            ],
+            "id": [2, 4, 5, 1],
+            "my_embedding": [
+                np.array([2.0, 4.0]),
+                np.array([1.0, 3.2]),
+                np.array([5.0, 5.4]),
+                np.array([1.0, 2.0]),
+            ],
+            "distance": [1.562049935181331, 1.2000000000000002, 2.009975124224178, 0.0],
+        },
+        index=pd.Index([1, 0, 1, 0], dtype="Int64"),
+    )
+    pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1)

From 86a26bd84b052b33b61bf6a8471d4662a540ba96 Mon Sep 17 00:00:00 2001
From: Ashley Xu <ashleyxu@google.com>
Date: Fri, 31 May 2024 16:36:48 +0000
Subject: [PATCH 2/8] minor fix

---
 bigframes/bigquery/__init__.py            |  2 +-
 tests/system/small/bigquery/test_utils.py | 87 +++++++++++++++++++++++
 tests/unit/bigquery/test_utils.py         | 72 -------------------
 3 files changed, 88 insertions(+), 73 deletions(-)
 create mode 100644 tests/system/small/bigquery/test_utils.py

diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index 33e65bbe76..21c9c9edc4 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -178,7 +178,7 @@ def vector_search(
         ...             column_to_search="my_embedding",
         ...             query=search_query,
         ...             top_k=2)
-            query_id  embedding  id my_embedding  distance
+          query_id  embedding  id my_embedding  distance
         1      cat  [3.  5.2]   5    [5.  5.4]  2.009975
         0      dog    [1. 2.]   1      [1. 2.]       0.0
         0      dog    [1. 2.]   4    [1.  3.2]       1.2
diff --git a/tests/system/small/bigquery/test_utils.py b/tests/system/small/bigquery/test_utils.py
new file mode 100644
index 0000000000..535a645fe4
--- /dev/null
+++ b/tests/system/small/bigquery/test_utils.py
@@ -0,0 +1,87 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pandas as pd
+
+import bigframes.bigquery as bbq
+import bigframes.pandas as bpd
+
+
+def test_apply_sql_df_query():
+    query = bpd.DataFrame(
+        {
+            "query_id": ["dog", "cat"],
+            "embedding": [[1.0, 2.0], [3.0, 5.2]],
+        }
+    )
+    options = {
+        "base_table": "bigframes-dev.bigframes_tests_sys.base_table",
+        "column_to_search": "my_embedding",
+        "distance_type": "cosine",
+        "top_k": 2,
+    }
+    result = bbq.utils.apply_sql(query, options).to_pandas()  # type:ignore
+    expected = pd.DataFrame(
+        {
+            "query_id": ["cat", "dog", "dog", "cat"],
+            "embedding": [
+                np.array([3.0, 5.2]),
+                np.array([1.0, 2.0]),
+                np.array([1.0, 2.0]),
+                np.array([3.0, 5.2]),
+            ],
+            "id": [1, 2, 1, 2],
+            "my_embedding": [
+                np.array([1.0, 2.0]),
+                np.array([2.0, 4.0]),
+                np.array([1.0, 2.0]),
+                np.array([2.0, 4.0]),
+            ],
+            "distance": [0.001777, 0.0, 0.0, 0.001777],
+        },
+        index=pd.Index([1, 0, 0, 1], dtype="Int64"),
+    )
+    pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1)
+
+
+def test_apply_sql_series_query():
+    query = bpd.Series([[1.0, 2.0], [3.0, 5.2]])
+    options = {
+        "base_table": "bigframes-dev.bigframes_tests_sys.base_table",
+        "column_to_search": "my_embedding",
+        "distance_type": "euclidean",
+        "top_k": 2,
+    }
+    result = bbq.utils.apply_sql(query, options).to_pandas()  # type:ignore
+    expected = pd.DataFrame(
+        {
+            "0": [
+                np.array([3.0, 5.2]),
+                np.array([1.0, 2.0]),
+                np.array([3.0, 5.2]),
+                np.array([1.0, 2.0]),
+            ],
+            "id": [2, 4, 5, 1],
+            "my_embedding": [
+                np.array([2.0, 4.0]),
+                np.array([1.0, 3.2]),
+                np.array([5.0, 5.4]),
+                np.array([1.0, 2.0]),
+            ],
+            "distance": [1.562049935181331, 1.2000000000000002, 2.009975124224178, 0.0],
+        },
+        index=pd.Index([1, 0, 1, 0], dtype="Int64"),
+    )
+    pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1)
diff --git a/tests/unit/bigquery/test_utils.py b/tests/unit/bigquery/test_utils.py
index 56323d1b02..1f781701d8 100644
--- a/tests/unit/bigquery/test_utils.py
+++ b/tests/unit/bigquery/test_utils.py
@@ -12,11 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
-import pandas as pd
-
 import bigframes.bigquery as bbq
-import bigframes.pandas as bpd
 
 
 def test_create_vector_search_sql_simple():
@@ -79,71 +75,3 @@ def test_create_vector_search_sql_query_column_to_search():
         sql_string, options  # type:ignore
     )
     assert result_query == expected_query
-
-
-def test_apply_sql_df_query():
-    query = bpd.DataFrame(
-        {
-            "query_id": ["dog", "cat"],
-            "embedding": [[1.0, 2.0], [3.0, 5.2]],
-        }
-    )
-    options = {
-        "base_table": "bigframes-dev.bigframes_tests_sys.base_table",
-        "column_to_search": "my_embedding",
-        "distance_type": "cosine",
-        "top_k": 2,
-    }
-    result = bbq.utils.apply_sql(query, options).to_pandas()  # type:ignore
-    expected = pd.DataFrame(
-        {
-            "query_id": ["cat", "dog", "dog", "cat"],
-            "embedding": [
-                np.array([3.0, 5.2]),
-                np.array([1.0, 2.0]),
-                np.array([1.0, 2.0]),
-                np.array([3.0, 5.2]),
-            ],
-            "id": [1, 2, 1, 2],
-            "my_embedding": [
-                np.array([1.0, 2.0]),
-                np.array([2.0, 4.0]),
-                np.array([1.0, 2.0]),
-                np.array([2.0, 4.0]),
-            ],
-            "distance": [0.001777, 0.0, 0.0, 0.001777],
-        },
-        index=pd.Index([1, 0, 0, 1], dtype="Int64"),
-    )
-    pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1)
-
-
-def test_apply_sql_series_query():
-    query = bpd.Series([[1.0, 2.0], [3.0, 5.2]])
-    options = {
-        "base_table": "bigframes-dev.bigframes_tests_sys.base_table",
-        "column_to_search": "my_embedding",
-        "distance_type": "euclidean",
-        "top_k": 2,
-    }
-    result = bbq.utils.apply_sql(query, options).to_pandas()  # type:ignore
-    expected = pd.DataFrame(
-        {
-            "0": [
-                np.array([3.0, 5.2]),
-                np.array([1.0, 2.0]),
-                np.array([3.0, 5.2]),
-                np.array([1.0, 2.0]),
-            ],
-            "id": [2, 4, 5, 1],
-            "my_embedding": [
-                np.array([2.0, 4.0]),
-                np.array([1.0, 3.2]),
-                np.array([5.0, 5.4]),
-                np.array([1.0, 2.0]),
-            ],
-            "distance": [1.562049935181331, 1.2000000000000002, 2.009975124224178, 0.0],
-        },
-        index=pd.Index([1, 0, 1, 0], dtype="Int64"),
-    )
-    pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1)

From 5e6fe306c0d6c44c033aef7ef6cd77a5075edd38 Mon Sep 17 00:00:00 2001
From: Ashley Xu <ashleyxu@google.com>
Date: Fri, 31 May 2024 22:19:10 +0000
Subject: [PATCH 3/8] address comments

---
 bigframes/bigquery/__init__.py                | 41 ++++++---
 bigframes/bigquery/utils.py                   | 85 ------------------
 bigframes/core/sql.py                         | 46 +++++++++-
 tests/system/small/bigquery/test_utils.py     | 87 -------------------
 tests/unit/bigquery/__init__.py               | 13 ---
 .../test_utils.py => core/test_sql.py}        |  7 +-
 6 files changed, 78 insertions(+), 201 deletions(-)
 delete mode 100644 bigframes/bigquery/utils.py
 delete mode 100644 tests/system/small/bigquery/test_utils.py
 delete mode 100644 tests/unit/bigquery/__init__.py
 rename tests/unit/{bigquery/test_utils.py => core/test_sql.py} (93%)

diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index 21c9c9edc4..73647b8573 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -20,17 +20,23 @@
 
 from __future__ import annotations
 
+import typing
 from typing import Literal, Optional, Union
 
-import bigframes.bigquery.utils as utils
 import bigframes.constants as constants
 import bigframes.core.groupby as groupby
+import bigframes.core.sql
+import bigframes.ml.utils as utils
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
-import bigframes.pandas as bpd
+import bigframes.series
 
+if typing.TYPE_CHECKING:
+    import bigframes.dataframe as dataframe
+    import bigframes.series as series
 
-def array_length(series: bpd.Series) -> bpd.Series:
+
+def array_length(series: series.Series) -> series.Series:
     """Compute the length of each array element in the Series.
 
     **Examples:**
@@ -67,7 +73,7 @@ def array_length(series: bpd.Series) -> bpd.Series:
 
 def array_agg(
     obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy,
-) -> bpd.Series | bpd.DataFrame:
+) -> series.Series | dataframe.DataFrame:
     """Group data and create arrays from selected columns, omitting NULLs to avoid
     BigQuery errors (NULLs not allowed in arrays).
 
@@ -118,7 +124,7 @@ def array_agg(
         )
 
 
-def array_to_string(series: bpd.Series, delimiter: str) -> bpd.Series:
+def array_to_string(series: series.Series, delimiter: str) -> series.Series:
     """Converts array elements within a Series into delimited strings.
 
     **Examples:**
@@ -151,14 +157,14 @@ def array_to_string(series: bpd.Series, delimiter: str) -> bpd.Series:
 def vector_search(
     base_table: str,
     column_to_search: str,
-    query: Union[bpd.DataFrame, bpd.Series],
+    query: Union[dataframe.DataFrame, series.Series],
     *,
     query_column_to_search: Optional[str] = None,
     top_k: Optional[int] = 10,
     distance_type: Literal["euclidean", "cosine"] = "euclidean",
     fraction_lists_to_search: Optional[float] = None,
     use_brute_force: bool = False,
-) -> bpd.DataFrame:
+) -> dataframe.DataFrame:
     """
     Conduct vector search to earch embeddings to find semantically similar entities.
 
@@ -258,11 +264,14 @@ def vector_search(
         raise ValueError(
             "You can't specify fraction_lists_to_search when use_brute_force is set to True."
         )
-    if isinstance(query, bpd.Series) and query_column_to_search is not None:
+    if (
+        isinstance(query, bigframes.series.Series)
+        and query_column_to_search is not None
+    ):
         raise ValueError(
             "You can't specify query_column_to_search when query is a Series."
         )
-    ## (TODO: ashleyxu. Support options in vector search.)
+    # TODO(ashleyxu): ashleyxu. Support options in vector search. b/344019989
     if fraction_lists_to_search is not None or use_brute_force is True:
         raise NotImplementedError(
             f"fraction_lists_to_search and use_brute_force is not supported. {constants.FEEDBACK_LINK}"
@@ -277,8 +286,16 @@ def vector_search(
         "use_brute_force": use_brute_force,
     }
 
-    df = utils.apply_sql(
-        query,
-        options,  # type:ignore
+    (query,) = utils.convert_to_dataframe(query)
+    sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True)
+
+    sql = bigframes.core.sql.create_vector_search_sql(
+        sql_string=sql_string, options=options  # type: ignore
     )
+    if index_col_ids is not None:
+        df = query._session.read_gbq(sql, index_col=index_col_ids)
+    else:
+        df = query._session.read_gbq(sql)
+    df.index.names = index_labels
+
     return df
diff --git a/bigframes/bigquery/utils.py b/bigframes/bigquery/utils.py
deleted file mode 100644
index b311d6e5df..0000000000
--- a/bigframes/bigquery/utils.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright 2024 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Iterable, Mapping, Union
-
-import bigframes.ml.utils as utils
-import bigframes.pandas as bpd
-
-
-def create_vector_search_sql(
-    sql_string: str,
-    options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
-) -> str:
-    """Encode the VECTOR SEARCH statement for BigQuery Vector Search."""
-
-    base_table = options["base_table"]
-    column_to_search = options["column_to_search"]
-    distance_type = options["distance_type"]
-    top_k = options["top_k"]
-    query_column_to_search = options.get("query_column_to_search", None)
-
-    if query_column_to_search is not None:
-        query_str = f"""
-    SELECT
-        query.*,
-        base.*,
-        distance,
-    FROM VECTOR_SEARCH(
-        TABLE `{base_table}`,
-        "{column_to_search}",
-        ({sql_string}),
-        "{query_column_to_search}",
-        distance_type => "{distance_type}",
-        top_k => {top_k}
-    )
-    """
-    else:
-        query_str = f"""
-    SELECT
-        query.*,
-        base.*,
-        distance,
-    FROM VECTOR_SEARCH(
-        TABLE `{base_table}`,
-        "{column_to_search}",
-        ({sql_string}),
-        distance_type => "{distance_type}",
-        top_k => {top_k}
-    )
-    """
-    return query_str
-
-
-def apply_sql(
-    query: Union[bpd.DataFrame, bpd.Series],
-    options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
-) -> bpd.DataFrame:
-    """Helper to wrap a dataframe in a SQL query, keeping the index intact.
-
-    Args:
-        query (bigframes.dataframe.DataFrame):
-            The dataframe to be wrapped.
-    """
-    (query,) = utils.convert_to_dataframe(query)
-    sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True)
-
-    sql = create_vector_search_sql(sql_string=sql_string, options=options)
-    if index_col_ids is not None:
-        df = query._session.read_gbq(sql, index_col=index_col_ids)
-    else:
-        df = query._session.read_gbq(sql)
-    df.index.names = index_labels
-
-    return df
diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py
index c1e319b860..0fb99d67eb 100644
--- a/bigframes/core/sql.py
+++ b/bigframes/core/sql.py
@@ -20,7 +20,7 @@
 import datetime
 import math
 import textwrap
-from typing import Iterable, TYPE_CHECKING
+from typing import Iterable, Mapping, TYPE_CHECKING, Union
 
 # Literals and identifiers matching this pattern can be unquoted
 unquoted = r"^[A-Za-z_][A-Za-z_0-9]*$"
@@ -169,3 +169,47 @@ def ordering_clause(
         part = f"`{ordering_expr.id}` {asc_desc} {null_clause}"
         parts.append(part)
     return f"ORDER BY {' ,'.join(parts)}"
+
+
+def create_vector_search_sql(
+    sql_string: str,
+    options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
+) -> str:
+    """Encode the VECTOR SEARCH statement for BigQuery Vector Search."""
+
+    base_table = options["base_table"]
+    column_to_search = options["column_to_search"]
+    distance_type = options["distance_type"]
+    top_k = options["top_k"]
+    query_column_to_search = options.get("query_column_to_search", None)
+
+    if query_column_to_search is not None:
+        query_str = f"""
+    SELECT
+        query.*,
+        base.*,
+        distance,
+    FROM VECTOR_SEARCH(
+        TABLE `{base_table}`,
+        "{column_to_search}",
+        ({sql_string}),
+        "{query_column_to_search}",
+        distance_type => "{distance_type}",
+        top_k => {top_k}
+    )
+    """
+    else:
+        query_str = f"""
+    SELECT
+        query.*,
+        base.*,
+        distance,
+    FROM VECTOR_SEARCH(
+        TABLE `{base_table}`,
+        "{column_to_search}",
+        ({sql_string}),
+        distance_type => "{distance_type}",
+        top_k => {top_k}
+    )
+    """
+    return query_str
diff --git a/tests/system/small/bigquery/test_utils.py b/tests/system/small/bigquery/test_utils.py
deleted file mode 100644
index 535a645fe4..0000000000
--- a/tests/system/small/bigquery/test_utils.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright 2024 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import pandas as pd
-
-import bigframes.bigquery as bbq
-import bigframes.pandas as bpd
-
-
-def test_apply_sql_df_query():
-    query = bpd.DataFrame(
-        {
-            "query_id": ["dog", "cat"],
-            "embedding": [[1.0, 2.0], [3.0, 5.2]],
-        }
-    )
-    options = {
-        "base_table": "bigframes-dev.bigframes_tests_sys.base_table",
-        "column_to_search": "my_embedding",
-        "distance_type": "cosine",
-        "top_k": 2,
-    }
-    result = bbq.utils.apply_sql(query, options).to_pandas()  # type:ignore
-    expected = pd.DataFrame(
-        {
-            "query_id": ["cat", "dog", "dog", "cat"],
-            "embedding": [
-                np.array([3.0, 5.2]),
-                np.array([1.0, 2.0]),
-                np.array([1.0, 2.0]),
-                np.array([3.0, 5.2]),
-            ],
-            "id": [1, 2, 1, 2],
-            "my_embedding": [
-                np.array([1.0, 2.0]),
-                np.array([2.0, 4.0]),
-                np.array([1.0, 2.0]),
-                np.array([2.0, 4.0]),
-            ],
-            "distance": [0.001777, 0.0, 0.0, 0.001777],
-        },
-        index=pd.Index([1, 0, 0, 1], dtype="Int64"),
-    )
-    pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1)
-
-
-def test_apply_sql_series_query():
-    query = bpd.Series([[1.0, 2.0], [3.0, 5.2]])
-    options = {
-        "base_table": "bigframes-dev.bigframes_tests_sys.base_table",
-        "column_to_search": "my_embedding",
-        "distance_type": "euclidean",
-        "top_k": 2,
-    }
-    result = bbq.utils.apply_sql(query, options).to_pandas()  # type:ignore
-    expected = pd.DataFrame(
-        {
-            "0": [
-                np.array([3.0, 5.2]),
-                np.array([1.0, 2.0]),
-                np.array([3.0, 5.2]),
-                np.array([1.0, 2.0]),
-            ],
-            "id": [2, 4, 5, 1],
-            "my_embedding": [
-                np.array([2.0, 4.0]),
-                np.array([1.0, 3.2]),
-                np.array([5.0, 5.4]),
-                np.array([1.0, 2.0]),
-            ],
-            "distance": [1.562049935181331, 1.2000000000000002, 2.009975124224178, 0.0],
-        },
-        index=pd.Index([1, 0, 1, 0], dtype="Int64"),
-    )
-    pd.testing.assert_frame_equal(result, expected, check_dtype=False, rtol=0.1)
diff --git a/tests/unit/bigquery/__init__.py b/tests/unit/bigquery/__init__.py
deleted file mode 100644
index 6d5e14bcf4..0000000000
--- a/tests/unit/bigquery/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright 2024 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/tests/unit/bigquery/test_utils.py b/tests/unit/core/test_sql.py
similarity index 93%
rename from tests/unit/bigquery/test_utils.py
rename to tests/unit/core/test_sql.py
index 1f781701d8..b5b2e1c14e 100644
--- a/tests/unit/bigquery/test_utils.py
+++ b/tests/unit/core/test_sql.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import bigframes.bigquery as bbq
+
+from bigframes.core import sql
 
 
 def test_create_vector_search_sql_simple():
@@ -39,7 +40,7 @@ def test_create_vector_search_sql_simple():
     )
     """
 
-    result_query = bbq.utils.create_vector_search_sql(
+    result_query = sql.create_vector_search_sql(
         sql_string, options  # type:ignore
     )
     assert result_query == expected_query
@@ -71,7 +72,7 @@ def test_create_vector_search_sql_query_column_to_search():
     )
     """
 
-    result_query = bbq.utils.create_vector_search_sql(
+    result_query = sql.create_vector_search_sql(
         sql_string, options  # type:ignore
     )
     assert result_query == expected_query

From b7e347016ee9ee380574a5b73e3707ad21d82b86 Mon Sep 17 00:00:00 2001
From: Ashley Xu <ashleyxu@google.com>
Date: Sat, 1 Jun 2024 01:23:31 +0000
Subject: [PATCH 4/8] docstring fix

---
 bigframes/bigquery/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index 73647b8573..b690e83e27 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -166,7 +166,7 @@ def vector_search(
     use_brute_force: bool = False,
 ) -> dataframe.DataFrame:
     """
-    Conduct vector search to earch embeddings to find semantically similar entities.
+    Conduct a vector search, which searches embeddings to find semantically similar entities.
 
     **Examples:**
 
@@ -233,7 +233,7 @@ def vector_search(
     Args:
         base_table (str):
             The table to search for nearest neighbor embeddings.
-        column_to_search (groupby.SeriesGroupBy | groupby.DataFrameGroupBy):
+        column_to_search (str):
             The name of the base table column to search for nearest neighbor embeddings.
             The column must have a type of ``ARRAY<FLOAT64>``. All elements in the array must be non-NULL.
         query (bigframes.dataframe.DataFrame | bigframes.dataframe.Series):
@@ -271,7 +271,7 @@ def vector_search(
         raise ValueError(
             "You can't specify query_column_to_search when query is a Series."
         )
-    # TODO(ashleyxu): ashleyxu. Support options in vector search. b/344019989
+    # TODO(ashleyxu): Support options in vector search. b/344019989
     if fraction_lists_to_search is not None or use_brute_force is True:
         raise NotImplementedError(
             f"fraction_lists_to_search and use_brute_force is not supported. {constants.FEEDBACK_LINK}"

From 2a5de8ed2bd2b0bc7e7d4c028465b200c7959523 Mon Sep 17 00:00:00 2001
From: Ashley Xu <ashleyxu@google.com>
Date: Thu, 6 Jun 2024 17:40:05 +0000
Subject: [PATCH 5/8] address comments

---
 bigframes/core/sql.py       | 16 ++++++++--------
 tests/unit/core/test_sql.py | 12 ++++++------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py
index 0fb99d67eb..a011bc9965 100644
--- a/bigframes/core/sql.py
+++ b/bigframes/core/sql.py
@@ -173,7 +173,7 @@ def ordering_clause(
 
 def create_vector_search_sql(
     sql_string: str,
-    options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
+    options: Mapping[str, Union[str | int | bool | float]] = {},
 ) -> str:
     """Encode the VECTOR SEARCH statement for BigQuery Vector Search."""
 
@@ -191,11 +191,11 @@ def create_vector_search_sql(
         distance,
     FROM VECTOR_SEARCH(
         TABLE `{base_table}`,
-        "{column_to_search}",
+        {simple_literal(column_to_search)},
         ({sql_string}),
-        "{query_column_to_search}",
-        distance_type => "{distance_type}",
-        top_k => {top_k}
+        {simple_literal(query_column_to_search)},
+        distance_type => {simple_literal(distance_type)},
+        top_k => {simple_literal(top_k)}
     )
     """
     else:
@@ -206,10 +206,10 @@ def create_vector_search_sql(
         distance,
     FROM VECTOR_SEARCH(
         TABLE `{base_table}`,
-        "{column_to_search}",
+        {simple_literal(column_to_search)},
         ({sql_string}),
-        distance_type => "{distance_type}",
-        top_k => {top_k}
+        distance_type => {simple_literal(distance_type)},
+        top_k => {simple_literal(top_k)}
     )
     """
     return query_str
diff --git a/tests/unit/core/test_sql.py b/tests/unit/core/test_sql.py
index b5b2e1c14e..feef772058 100644
--- a/tests/unit/core/test_sql.py
+++ b/tests/unit/core/test_sql.py
@@ -33,9 +33,9 @@ def test_create_vector_search_sql_simple():
         distance,
     FROM VECTOR_SEARCH(
         TABLE `my_base_table`,
-        "my_embedding_column",
+        'my_embedding_column',
         ({sql_string}),
-        distance_type => "COSINE",
+        distance_type => 'COSINE',
         top_k => 10
     )
     """
@@ -63,11 +63,11 @@ def test_create_vector_search_sql_query_column_to_search():
         base.*,
         distance,
     FROM VECTOR_SEARCH(
-        TABLE `my_base_table`,
-        "my_embedding_column",
+        TABLE my_base_table,
+        'my_embedding_column',
         ({sql_string}),
-        "new_embedding_column",
-        distance_type => "COSINE",
+        'new_embedding_column',
+        distance_type => 'COSINE',
         top_k => 10
     )
     """

From 2bbbaf4e1fad3f4587de6fab4ca7797beabcf8b0 Mon Sep 17 00:00:00 2001
From: Ashley Xu <ashleyxu@google.com>
Date: Thu, 6 Jun 2024 19:09:10 +0000
Subject: [PATCH 6/8] small fix

---
 tests/unit/core/test_sql.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/core/test_sql.py b/tests/unit/core/test_sql.py
index feef772058..29f1e48a70 100644
--- a/tests/unit/core/test_sql.py
+++ b/tests/unit/core/test_sql.py
@@ -63,7 +63,7 @@ def test_create_vector_search_sql_query_column_to_search():
         base.*,
         distance,
     FROM VECTOR_SEARCH(
-        TABLE my_base_table,
+        TABLE `my_base_table`,
         'my_embedding_column',
         ({sql_string}),
         'new_embedding_column',

From f2859f3da12ace9fe4debd256d5a3448c0b49413 Mon Sep 17 00:00:00 2001
From: Ashley Xu <ashleyxu@google.com>
Date: Thu, 6 Jun 2024 20:36:38 +0000
Subject: [PATCH 7/8] add docstring clarification

---
 bigframes/bigquery/__init__.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index b690e83e27..9cc97d949f 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -175,7 +175,8 @@ def vector_search(
         >>> import bigframes.bigquery as bbq
         >>> bpd.options.display.progress_bar = None
 
-    DataFrame embeddings for which to find nearest neighbors:
+    DataFrame embeddings for which to find nearest neighbors, and ARRAY<FLOAT> column
+    is used as the search query:
 
         >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
         ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]]})
@@ -210,7 +211,9 @@ def vector_search(
         <BLANKLINE>
         [4 rows x 4 columns]
 
-    You can specify the name of the column in the query DataFrame embeddings and distance type:
+    You can specify the name of the column in the query DataFrame embeddings and distance type.
+    If you specify ``query_column_to_search``, it will use the provided column which contains
+    the embeddings for which to find nearest neighbors. Otherwise, it uses the ``column_to_search`` value.
 
         >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],
         ...                               "embedding": [[1.0, 2.0], [3.0, 5.2]],

From a2f3b1d6316e824cdf5e4c8e26d503bcb5a8871f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= <swast@google.com>
Date: Thu, 6 Jun 2024 16:19:53 -0500
Subject: [PATCH 8/8] Update bigframes/bigquery/__init__.py

---
 bigframes/bigquery/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index 9cc97d949f..85a9010a7d 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -175,7 +175,7 @@ def vector_search(
         >>> import bigframes.bigquery as bbq
         >>> bpd.options.display.progress_bar = None
 
-    DataFrame embeddings for which to find nearest neighbors, and ARRAY<FLOAT> column
+    DataFrame embeddings for which to find nearest neighbors. The ``ARRAY<FLOAT64>`` column
     is used as the search query:
 
         >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"],