From 72fc0401ec36cc2e66c1111870c0c59928ce0024 Mon Sep 17 00:00:00 2001
From: Tim Swast <swast@google.com>
Date: Wed, 26 Jun 2024 21:17:39 +0000
Subject: [PATCH 1/5] feat: add `DataFrame.to_arrow` to create Arrow Table from
 DataFrame

---
 bigframes/core/blocks.py                      |  22 ++
 bigframes/dataframe.py                        |  24 ++
 .../create_polars_df_with_to_arrow_test.py    |  39 +++
 samples/polars/noxfile.py                     | 292 ++++++++++++++++++
 samples/polars/noxfile_config.py              |  42 +++
 samples/polars/requirements-test.txt          |   3 +
 samples/polars/requirements.txt               |   3 +
 tests/system/small/test_dataframe_io.py       |  26 ++
 8 files changed, 451 insertions(+)
 create mode 100644 samples/polars/create_polars_df_with_to_arrow_test.py
 create mode 100644 samples/polars/noxfile.py
 create mode 100644 samples/polars/noxfile_config.py
 create mode 100644 samples/polars/requirements-test.txt
 create mode 100644 samples/polars/requirements.txt

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index f40dfc0071..4d22a37372 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -467,6 +467,28 @@ def _validate_result_schema(self, result_df: pd.DataFrame):
                 f"This error should only occur while testing. Ibis schema: {ibis_schema} does not match actual schema: {actual_schema}"
             )
 
+    def to_arrow(
+        self,
+        *,
+        ordered: bool = True,
+    ) -> Tuple[pa.Table, bigquery.QueryJob]:
+        """Run query and download results as a pyarrow Table."""
+        _, query_job = self.session._query_to_destination(
+            self.session._to_sql(self.expr, ordered=ordered),
+            list(self.index_columns),
+            api_name="cached",
+            do_clustering=False,
+        )
+        results_iterator = query_job.result()
+        pa_table = results_iterator.to_arrow()
+
+        # TODO(tswast): Include index columns with same names as pa.Table.from_pandas.
+        if len(self.index_columns) > 0:
+            pa_table = pa_table.drop_columns(self.index_columns)
+
+        pa_table = pa_table.rename_columns(self.column_labels)
+        return pa_table, query_job
+
     def to_pandas(
         self,
         max_download_size: Optional[int] = None,
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 75420ca957..2fb733dc9b 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -44,6 +44,7 @@
 import numpy
 import pandas
 import pandas.io.formats.format
+import pyarrow
 import tabulate
 
 import bigframes
@@ -1183,6 +1184,29 @@ def cov(self, *, numeric_only: bool = False) -> DataFrame:
 
         return DataFrame(frame._block.calculate_pairwise_metric(agg_ops.CovOp()))
 
+    def to_arrow(
+        self,
+        *,
+        ordered: Optional[bool] = None,
+    ) -> pyarrow.Table:
+        """Write DataFrame to an Arrow table.
+
+        Args:
+            ordered (bool, default None):
+                Determines whether the resulting Arrow table will be deterministically ordered.
+                In some cases, unordered may result in a faster-executing query. If set to a value
+                other than None, will override Session default.
+
+        Returns:
+            pyarrow.Table: A pyarrow Table with all rows and columns of this DataFrame.
+        """
+        self._optimize_query_complexity()
+        pa_table, query_job = self._block.to_arrow(
+            ordered=ordered if ordered is not None else self._session._strictly_ordered,
+        )
+        self._set_internal_query_job(query_job)
+        return pa_table
+
     def to_pandas(
         self,
         max_download_size: Optional[int] = None,
diff --git a/samples/polars/create_polars_df_with_to_arrow_test.py b/samples/polars/create_polars_df_with_to_arrow_test.py
new file mode 100644
index 0000000000..f4738c5603
--- /dev/null
+++ b/samples/polars/create_polars_df_with_to_arrow_test.py
@@ -0,0 +1,39 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def test_create_polars_df():
+    # [START bigquery_dataframes_to_polars]
+    import polars
+
+    import bigframes.enums
+    import bigframes.pandas as bpd
+
+    bf_df = bpd.read_gbq_table(
+        "bigquery-public-data.usa_names.usa_1910_current",
+        # Setting index_col to either a unique column or NULL will give the
+        # best performance.
+        index_col=bigframes.enums.DefaultIndexKind.NULL,
+    )
+    # TODO(developer): Do some analysis using BigQuery DataFrames.
+    # ...
+
+    # Run the query and download the results as an Arrow table to convert into
+    # a Polars DataFrame. Use ordered=False if your polars analysis is OK with
+    # non-deterministic ordering.
+    arrow_table = bf_df.to_arrow(ordered=False)
+    polars_df = polars.from_arrow(arrow_table)
+    # [END bigquery_dataframes_to_polars]
+
+    assert polars_df is not None
diff --git a/samples/polars/noxfile.py b/samples/polars/noxfile.py
new file mode 100644
index 0000000000..17cc27cbe9
--- /dev/null
+++ b/samples/polars/noxfile.py
@@ -0,0 +1,292 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import glob
+import os
+from pathlib import Path
+import sys
+from typing import Callable, Dict, Optional
+
+import nox
+
+# WARNING - WARNING - WARNING - WARNING - WARNING
+# WARNING - WARNING - WARNING - WARNING - WARNING
+#           DO NOT EDIT THIS FILE EVER!
+# WARNING - WARNING - WARNING - WARNING - WARNING
+# WARNING - WARNING - WARNING - WARNING - WARNING
+
+BLACK_VERSION = "black==22.3.0"
+ISORT_VERSION = "isort==5.10.1"
+
+# Copy `noxfile_config.py` to your directory and modify it instead.
+
+# `TEST_CONFIG` dict is a configuration hook that allows users to
+# modify the test configurations. The values here should be in sync
+# with `noxfile_config.py`. Users will copy `noxfile_config.py` into
+# their directory and modify it.
+
+TEST_CONFIG = {
+    # You can opt out from the test for specific Python versions.
+    "ignored_versions": [],
+    # Old samples are opted out of enforcing Python type hints
+    # All new samples should feature them
+    "enforce_type_hints": False,
+    # An envvar key for determining the project id to use. Change it
+    # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a
+    # build specific Cloud project. You can also use your own string
+    # to use your own Cloud project.
+    "gcloud_project_env": "GOOGLE_CLOUD_PROJECT",
+    # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',
+    # If you need to use a specific version of pip,
+    # change pip_version_override to the string representation
+    # of the version number, for example, "20.2.4"
+    "pip_version_override": None,
+    # A dictionary you want to inject into your test. Don't put any
+    # secrets here. These values will override predefined values.
+    "envs": {},
+}
+
+
+try:
+    # Ensure we can import noxfile_config in the project's directory.
+    sys.path.append(".")
+    from noxfile_config import TEST_CONFIG_OVERRIDE
+except ImportError as e:
+    print("No user noxfile_config found: detail: {}".format(e))
+    TEST_CONFIG_OVERRIDE = {}
+
+# Update the TEST_CONFIG with the user supplied values.
+TEST_CONFIG.update(TEST_CONFIG_OVERRIDE)
+
+
+def get_pytest_env_vars() -> Dict[str, str]:
+    """Returns a dict for pytest invocation."""
+    ret = {}
+
+    # Override the GCLOUD_PROJECT and the alias.
+    env_key = TEST_CONFIG["gcloud_project_env"]
+    # This should error out if not set.
+    ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key]  # type: ignore
+
+    # Apply user supplied envs.
+    ret.update(TEST_CONFIG["envs"])  # type: ignore
+    return ret
+
+
+# DO NOT EDIT - automatically generated.
+# All versions used to test samples.
+ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
+
+# Any default versions that should be ignored.
+IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"]
+
+TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS])  # type: ignore
+
+INSTALL_LIBRARY_FROM_SOURCE = os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False) in (
+    "True",
+    "true",
+)
+
+# Error if a python version is missing
+nox.options.error_on_missing_interpreters = True
+
+#
+# Style Checks
+#
+
+
+# Linting with flake8.
+#
+# We ignore the following rules:
+#   E203: whitespace before ‘:’
+#   E266: too many leading ‘#’ for block comment
+#   E501: line too long
+#   I202: Additional newline in a section of imports
+#
+# We also need to specify the rules which are ignored by default:
+# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121']
+FLAKE8_COMMON_ARGS = [
+    "--show-source",
+    "--builtin=gettext",
+    "--max-complexity=20",
+    "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py",
+    "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202",
+    "--max-line-length=88",
+]
+
+
+@nox.session
+def lint(session: nox.sessions.Session) -> None:
+    if not TEST_CONFIG["enforce_type_hints"]:
+        session.install("flake8")
+    else:
+        session.install("flake8", "flake8-annotations")
+
+    args = FLAKE8_COMMON_ARGS + [
+        ".",
+    ]
+    session.run("flake8", *args)
+
+
+#
+# Black
+#
+
+
+@nox.session
+def blacken(session: nox.sessions.Session) -> None:
+    """Run black. Format code to uniform standard."""
+    session.install(BLACK_VERSION)
+    python_files = [path for path in os.listdir(".") if path.endswith(".py")]
+
+    session.run("black", *python_files)
+
+
+#
+# format = isort + black
+#
+
+
+@nox.session
+def format(session: nox.sessions.Session) -> None:
+    """
+    Run isort to sort imports. Then run black
+    to format code to uniform standard.
+    """
+    session.install(BLACK_VERSION, ISORT_VERSION)
+    python_files = [path for path in os.listdir(".") if path.endswith(".py")]
+
+    # Use the --fss option to sort imports using strict alphabetical order.
+    # See https://pycqa.github.io/isort/docs/configuration/options.html#force-sort-within-sections
+    session.run("isort", "--fss", *python_files)
+    session.run("black", *python_files)
+
+
+#
+# Sample Tests
+#
+
+
+PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"]
+
+
+def _session_tests(
+    session: nox.sessions.Session, post_install: Callable = None  # type: ignore
+) -> None:
+    # check for presence of tests
+    test_list = glob.glob("**/*_test.py", recursive=True) + glob.glob(
+        "**/test_*.py", recursive=True
+    )
+    test_list.extend(glob.glob("**/tests", recursive=True))
+
+    if len(test_list) == 0:
+        print("No tests found, skipping directory.")
+        return
+
+    if TEST_CONFIG["pip_version_override"]:
+        pip_version = TEST_CONFIG["pip_version_override"]
+        session.install(f"pip=={pip_version}")
+    """Runs py.test for a particular project."""
+    concurrent_args = []
+    if os.path.exists("requirements.txt"):
+        if os.path.exists("constraints.txt"):
+            session.install("-r", "requirements.txt", "-c", "constraints.txt")
+        else:
+            session.install("-r", "requirements.txt")
+        with open("requirements.txt") as rfile:
+            packages = rfile.read()
+
+    if os.path.exists("requirements-test.txt"):
+        if os.path.exists("constraints-test.txt"):
+            session.install("-r", "requirements-test.txt", "-c", "constraints-test.txt")
+        else:
+            session.install("-r", "requirements-test.txt")
+        with open("requirements-test.txt") as rtfile:
+            packages += rtfile.read()
+
+    if INSTALL_LIBRARY_FROM_SOURCE:
+        session.install("-e", _get_repo_root())
+
+    if post_install:  # type: ignore
+        post_install(session)
+
+    if "pytest-parallel" in packages:
+        concurrent_args.extend(["--workers", "auto", "--tests-per-worker", "auto"])
+    elif "pytest-xdist" in packages:
+        concurrent_args.extend(["-n", "auto"])
+
+    session.run(
+        "pytest",
+        *(PYTEST_COMMON_ARGS + session.posargs + concurrent_args),
+        # Pytest will return 5 when no tests are collected. This can happen
+        # on travis where slow and flaky tests are excluded.
+        # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html
+        success_codes=[0, 5],
+        env=get_pytest_env_vars(),
+    )
+
+
+@nox.session(python=ALL_VERSIONS)
+def py(session: nox.sessions.Session) -> None:
+    """Runs py.test for a sample using the specified version of Python."""
+    if session.python in TESTED_VERSIONS:
+        _session_tests(session)
+    else:
+        session.skip(
+            "SKIPPED: {} tests are disabled for this sample.".format(session.python)
+        )
+
+
+#
+# Readmegen
+#
+
+
+def _get_repo_root() -> Optional[str]:
+    """Returns the root folder of the project."""
+    # Get root of this repository. Assume we don't have directories nested deeper than 10 items.
+    p = Path(os.getcwd())
+    for i in range(10):
+        if p is None:
+            break
+        if Path(p / ".git").exists():
+            return str(p)
+        # .git is not available in repos cloned via Cloud Build
+        # setup.py is always in the library's root, so use that instead
+        # https://github.com/googleapis/synthtool/issues/792
+        if Path(p / "setup.py").exists():
+            return str(p)
+        p = p.parent
+    raise Exception("Unable to detect repository root.")
+
+
+GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")])
+
+
+@nox.session
+@nox.parametrize("path", GENERATED_READMES)
+def readmegen(session: nox.sessions.Session, path: str) -> None:
+    """(Re-)generates the readme for a sample."""
+    session.install("jinja2", "pyyaml")
+    dir_ = os.path.dirname(path)
+
+    if os.path.exists(os.path.join(dir_, "requirements.txt")):
+        session.install("-r", os.path.join(dir_, "requirements.txt"))
+
+    in_file = os.path.join(dir_, "README.rst.in")
+    session.run(
+        "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file  # type: ignore
+    )
diff --git a/samples/polars/noxfile_config.py b/samples/polars/noxfile_config.py
new file mode 100644
index 0000000000..211d6974b9
--- /dev/null
+++ b/samples/polars/noxfile_config.py
@@ -0,0 +1,42 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Default TEST_CONFIG_OVERRIDE for python repos.
+
+# You can copy this file into your directory, then it will be imported from
+# the noxfile.py.
+
+# The source of truth:
+# https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/noxfile_config.py
+
+TEST_CONFIG_OVERRIDE = {
+    # You can opt out from the test for specific Python versions.
+    "ignored_versions": ["2.7", "3.7", "3.8"],
+    # Old samples are opted out of enforcing Python type hints
+    # All new samples should feature them
+    "enforce_type_hints": True,
+    # An envvar key for determining the project id to use. Change it
+    # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a
+    # build specific Cloud project. You can also use your own string
+    # to use your own Cloud project.
+    "gcloud_project_env": "GOOGLE_CLOUD_PROJECT",
+    # "gcloud_project_env": "BUILD_SPECIFIC_GCLOUD_PROJECT",
+    # If you need to use a specific version of pip,
+    # change pip_version_override to the string representation
+    # of the version number, for example, "20.2.4"
+    "pip_version_override": None,
+    # A dictionary you want to inject into your test. Don't put any
+    # secrets here. These values will override predefined values.
+    "envs": {},
+}
diff --git a/samples/polars/requirements-test.txt b/samples/polars/requirements-test.txt
new file mode 100644
index 0000000000..beca2e44d9
--- /dev/null
+++ b/samples/polars/requirements-test.txt
@@ -0,0 +1,3 @@
+# samples/polars should be runnable with no "extras"
+google-cloud-testutils==1.4.0
+pytest==8.2.0
diff --git a/samples/polars/requirements.txt b/samples/polars/requirements.txt
new file mode 100644
index 0000000000..e3f886e7e3
--- /dev/null
+++ b/samples/polars/requirements.txt
@@ -0,0 +1,3 @@
+bigframes==1.6.0
+polars==0.20.31
+pyarrow==15.0.0
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 8adbea88e4..de34e3c73d 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -132,6 +132,32 @@ def test_sql_executes_and_includes_named_multiindex(
     )
 
 
+def test_to_arrow(scalars_df_default_index, scalars_pandas_df_default_index):
+    """Verify to_arrow() APIs returns the expected data."""
+    expected = pa.Table.from_pandas(
+        scalars_pandas_df_default_index.drop(columns=["geography_col"])
+        # TODO(tswast): Add option for to_arrow() to include unnamed indexes.
+    ).drop_columns(["__index_level_0__"])
+    actual = scalars_df_default_index.drop(columns=["geography_col"]).to_arrow()
+
+    # Make string_col match type. Otherwise, one might use
+    # LargeStringArray and one might use StringArray.
+    expected = expected.set_column(
+        expected.column_names.index("string_col"),
+        pa.field("string_col", pa.string()),
+        expected["string_col"].cast(pa.string()),
+    )
+    actual = expected.set_column(
+        actual.column_names.index("string_col"),
+        pa.field("string_col", pa.string()),
+        actual["string_col"].cast(pa.string()),
+    )
+
+    for column in actual.column_names:
+        assert actual[column].equals(expected[column])
+    assert actual.equals(expected)
+
+
 def test_to_pandas_w_correct_dtypes(scalars_df_default_index):
     """Verify to_pandas() APIs returns the expected dtypes."""
     actual = scalars_df_default_index.to_pandas().dtypes

From 9264e85fe1b4c922882f0b91f691190369240a4a Mon Sep 17 00:00:00 2001
From: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Date: Wed, 26 Jun 2024 21:21:14 +0000
Subject: [PATCH 2/5] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?=
 =?UTF-8?q?st-processor?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
---
 samples/polars/noxfile.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/samples/polars/noxfile.py b/samples/polars/noxfile.py
index 17cc27cbe9..c36d5f2d81 100644
--- a/samples/polars/noxfile.py
+++ b/samples/polars/noxfile.py
@@ -79,10 +79,10 @@ def get_pytest_env_vars() -> Dict[str, str]:
     # Override the GCLOUD_PROJECT and the alias.
     env_key = TEST_CONFIG["gcloud_project_env"]
     # This should error out if not set.
-    ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key]  # type: ignore
+    ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key]
 
     # Apply user supplied envs.
-    ret.update(TEST_CONFIG["envs"])  # type: ignore
+    ret.update(TEST_CONFIG["envs"])
     return ret
 
 
@@ -93,7 +93,7 @@ def get_pytest_env_vars() -> Dict[str, str]:
 # Any default versions that should be ignored.
 IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"]
 
-TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS])  # type: ignore
+TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS])
 
 INSTALL_LIBRARY_FROM_SOURCE = os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False) in (
     "True",
@@ -184,7 +184,7 @@ def format(session: nox.sessions.Session) -> None:
 
 
 def _session_tests(
-    session: nox.sessions.Session, post_install: Callable = None  # type: ignore
+    session: nox.sessions.Session, post_install: Callable = None
 ) -> None:
     # check for presence of tests
     test_list = glob.glob("**/*_test.py", recursive=True) + glob.glob(
@@ -220,7 +220,7 @@ def _session_tests(
     if INSTALL_LIBRARY_FROM_SOURCE:
         session.install("-e", _get_repo_root())
 
-    if post_install:  # type: ignore
+    if post_install:
         post_install(session)
 
     if "pytest-parallel" in packages:
@@ -288,5 +288,5 @@ def readmegen(session: nox.sessions.Session, path: str) -> None:
 
     in_file = os.path.join(dir_, "README.rst.in")
     session.run(
-        "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file  # type: ignore
+        "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file
     )

From b9143d5647a6d56fe53f9537962b3d02293fcfcd Mon Sep 17 00:00:00 2001
From: Tim Swast <swast@google.com>
Date: Wed, 26 Jun 2024 22:06:13 +0000
Subject: [PATCH 3/5] type annotation for sample

---
 samples/polars/create_polars_df_with_to_arrow_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/polars/create_polars_df_with_to_arrow_test.py b/samples/polars/create_polars_df_with_to_arrow_test.py
index f4738c5603..6d4853cd62 100644
--- a/samples/polars/create_polars_df_with_to_arrow_test.py
+++ b/samples/polars/create_polars_df_with_to_arrow_test.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 
-def test_create_polars_df():
+def test_create_polars_df() -> None:
     # [START bigquery_dataframes_to_polars]
     import polars
 

From 7bfd374ceab53bd82be7cc4c4b69a26ea36ee89c Mon Sep 17 00:00:00 2001
From: Tim Swast <swast@google.com>
Date: Thu, 27 Jun 2024 15:39:06 +0000
Subject: [PATCH 4/5] align index names in to_arrow

---
 bigframes/core/blocks.py                | 18 ++++++---
 bigframes/dataframe.py                  |  5 +++
 tests/system/small/test_dataframe_io.py | 51 +++++++++++++++++++++----
 3 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 4d22a37372..379c661179 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -473,8 +473,13 @@ def to_arrow(
         ordered: bool = True,
     ) -> Tuple[pa.Table, bigquery.QueryJob]:
         """Run query and download results as a pyarrow Table."""
+        # pa.Table.from_pandas puts index columns last, so update the expression to match.
+        expr = self.expr.select_columns(
+            list(self.value_columns) + list(self.index_columns)
+        )
+
         _, query_job = self.session._query_to_destination(
-            self.session._to_sql(self.expr, ordered=ordered),
+            self.session._to_sql(expr, ordered=ordered),
             list(self.index_columns),
             api_name="cached",
             do_clustering=False,
@@ -482,11 +487,14 @@ def to_arrow(
         results_iterator = query_job.result()
         pa_table = results_iterator.to_arrow()
 
-        # TODO(tswast): Include index columns with same names as pa.Table.from_pandas.
-        if len(self.index_columns) > 0:
-            pa_table = pa_table.drop_columns(self.index_columns)
+        pa_index_labels = []
+        for index_level, index_label in enumerate(self._index_labels):
+            if isinstance(index_label, str):
+                pa_index_labels.append(index_label)
+            else:
+                pa_index_labels.append(f"__index_level_{index_level}__")
 
-        pa_table = pa_table.rename_columns(self.column_labels)
+        pa_table = pa_table.rename_columns(list(self.column_labels) + pa_index_labels)
         return pa_table, query_job
 
     def to_pandas(
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 2fb733dc9b..274e176dd5 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1200,6 +1200,11 @@ def to_arrow(
         Returns:
             pyarrow.Table: A pyarrow Table with all rows and columns of this DataFrame.
         """
+        warnings.warn(
+            "to_arrow is in preview. Types and unnamed / duplicate name columns may change in future.",
+            category=bigframes.exceptions.PreviewWarning,
+        )
+
         self._optimize_query_complexity()
         pa_table, query_job = self._block.to_arrow(
             ordered=ordered if ordered is not None else self._session._strictly_ordered,
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index de34e3c73d..ab1fdceae5 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -136,23 +136,58 @@ def test_to_arrow(scalars_df_default_index, scalars_pandas_df_default_index):
     """Verify to_arrow() APIs returns the expected data."""
     expected = pa.Table.from_pandas(
         scalars_pandas_df_default_index.drop(columns=["geography_col"])
-        # TODO(tswast): Add option for to_arrow() to include unnamed indexes.
-    ).drop_columns(["__index_level_0__"])
-    actual = scalars_df_default_index.drop(columns=["geography_col"]).to_arrow()
+    )
+
+    with pytest.warns(
+        bigframes.exceptions.PreviewWarning,
+        match="to_arrow",
+    ):
+        actual = scalars_df_default_index.drop(columns=["geography_col"]).to_arrow()
 
-    # Make string_col match type. Otherwise, one might use
-    # LargeStringArray and one might use StringArray.
+    # Make string_col match type. Otherwise, pa.Table.from_pandas uses
+    # LargeStringArray. LargeStringArray is unnecessary because our strings are
+    # less than 2 GB.
     expected = expected.set_column(
         expected.column_names.index("string_col"),
         pa.field("string_col", pa.string()),
         expected["string_col"].cast(pa.string()),
     )
-    actual = expected.set_column(
-        actual.column_names.index("string_col"),
+
+    # Note: the final .equals assertion covers all these checks, but these
+    # finer-grained assertions are easier to debug.
+    assert actual.column_names == expected.column_names
+    for column in actual.column_names:
+        assert actual[column].equals(expected[column])
+    assert actual.equals(expected)
+
+
+def test_to_arrow_multiindex(scalars_df_index, scalars_pandas_df_index):
+    scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"])
+    scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index(
+        ["string_col", "int64_col"]
+    )
+    expected = pa.Table.from_pandas(
+        scalars_pandas_df_multiindex.drop(columns=["geography_col"])
+    )
+
+    with pytest.warns(
+        bigframes.exceptions.PreviewWarning,
+        match="to_arrow",
+    ):
+        actual = scalars_df_multiindex.drop(columns=["geography_col"]).to_arrow()
+
+    # Make string_col match type. Otherwise, pa.Table.from_pandas uses
+    # LargeStringArray. LargeStringArray is unnecessary because our strings are
+    # less than 2 GB.
+    expected = expected.set_column(
+        expected.column_names.index("string_col"),
         pa.field("string_col", pa.string()),
-        actual["string_col"].cast(pa.string()),
+        expected["string_col"].cast(pa.string()),
     )
 
+    # Note: the final .equals assertion covers all these checks, but these
+    # finer-grained assertions are easier to debug.
+    assert actual.column_names == expected.column_names
     for column in actual.column_names:
         assert actual[column].equals(expected[column])
     assert actual.equals(expected)

From 19bee2147655d618a7aaf59141f647e1f3c59b2e Mon Sep 17 00:00:00 2001
From: Tim Swast <swast@google.com>
Date: Thu, 27 Jun 2024 22:06:05 +0000
Subject: [PATCH 5/5] better assertions

---
 samples/polars/create_polars_df_with_to_arrow_test.py | 3 ++-
 samples/polars/noxfile_config.py                      | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/samples/polars/create_polars_df_with_to_arrow_test.py b/samples/polars/create_polars_df_with_to_arrow_test.py
index 6d4853cd62..acb79f23c8 100644
--- a/samples/polars/create_polars_df_with_to_arrow_test.py
+++ b/samples/polars/create_polars_df_with_to_arrow_test.py
@@ -36,4 +36,5 @@ def test_create_polars_df() -> None:
     polars_df = polars.from_arrow(arrow_table)
     # [END bigquery_dataframes_to_polars]
 
-    assert polars_df is not None
+    assert polars_df.shape == bf_df.shape
+    assert polars_df["number"].sum() == bf_df["number"].sum()
diff --git a/samples/polars/noxfile_config.py b/samples/polars/noxfile_config.py
index 211d6974b9..91238e9e2f 100644
--- a/samples/polars/noxfile_config.py
+++ b/samples/polars/noxfile_config.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Google LLC
+# Copyright 2024 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.