Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

fix: dtype parameter ineffective in Series/DataFrame construction #1354

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion 5 bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,10 @@ def is_object_like(type_: Union[ExpressionType, str]) -> bool:
# See: https://stackoverflow.com/a/40312924/101923 and
# https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html
# for the way to identify object type.
return type_ in ("object", "O") or getattr(type_, "kind", None) == "O"
return type_ in ("object", "O") or (
getattr(type_, "kind", None) == "O"
and getattr(type_, "storage", None) != "pyarrow"
)


def is_string_like(type_: ExpressionType) -> bool:
Expand Down
5 changes: 3 additions & 2 deletions 5 tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import geopandas as gpd # type: ignore
import pandas as pd
import pyarrow as pa
import pytest

import bigframes.bigquery as bbq
Expand Down Expand Up @@ -174,7 +175,7 @@ def test_json_extract_array_from_json_strings():
actual = bbq.json_extract_array(s, "$.a")
expected = bpd.Series(
[['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None],
dtype=pd.StringDtype(storage="pyarrow"),
dtype=pd.ArrowDtype(pa.list_(pa.string())),
)
pd.testing.assert_series_equal(
actual.to_pandas(),
Expand All @@ -190,7 +191,7 @@ def test_json_extract_array_from_json_array_strings():
actual = bbq.json_extract_array(s)
expected = bpd.Series(
[["1", "2", "3"], [], ["4", "5"]],
dtype=pd.StringDtype(storage="pyarrow"),
dtype=pd.ArrowDtype(pa.list_(pa.string())),
)
pd.testing.assert_series_equal(
actual.to_pandas(),
Expand Down
13 changes: 13 additions & 0 deletions 13 tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,19 @@ def test_df_construct_inline_respects_location():
assert table.location == "europe-west1"


def test_df_construct_dtype():
    """Constructing a DataFrame with an explicit ``dtype`` should match pandas."""
    columns = {
        "int_col": [1, 2, 3],
        "string_col": ["1.1", "2.0", "3.5"],
        "float_col": [1.0, 2.0, 3.0],
    }
    target_dtype = pd.StringDtype(storage="pyarrow")
    bf_result = dataframe.DataFrame(columns, dtype=target_dtype)
    pd_result = pd.DataFrame(columns, dtype=target_dtype)
    # BigQuery DataFrames uses an Int64 default index; align pandas before comparing.
    pd_result.index = pd_result.index.astype("Int64")
    pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result)


def test_get_column(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "int64_col"
Expand Down
74 changes: 74 additions & 0 deletions 74 tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import pytest
import shapely # type: ignore

import bigframes.features
import bigframes.pandas
import bigframes.series as series
from tests.system.utils import (
Expand Down Expand Up @@ -228,6 +229,79 @@ def test_series_construct_geodata():
)


@pytest.mark.parametrize(
    "dtype",
    [
        pytest.param(pd.Int64Dtype(), id="int"),
        pytest.param(pd.Float64Dtype(), id="float"),
        pytest.param(pd.StringDtype(storage="pyarrow"), id="string"),
    ],
)
def test_series_construct_w_dtype_for_int(dtype):
    """Series construction honors the ``dtype`` argument for scalar dtypes.

    NOTE(review): despite the ``_for_int`` suffix, the parametrization also
    covers float and pyarrow-backed string dtypes.
    """
    data = [1, 2, 3]
    expected = pd.Series(data, dtype=dtype)
    # bigframes uses an Int64 default index; align pandas before comparing.
    expected.index = expected.index.astype("Int64")
    # Fix: the original bound the local name ``series``, shadowing the
    # ``import bigframes.series as series`` module alias used elsewhere.
    bf_series = bigframes.pandas.Series(data, dtype=dtype)
    pd.testing.assert_series_equal(bf_series.to_pandas(), expected)


def test_series_construct_w_dtype_for_struct():
    """Series construction honors an ArrowDtype struct ``dtype``.

    The input dicts deliberately list their fields in a different order than
    the declared struct type, to verify field reordering is handled during
    construction.
    """
    data = [
        {"a": 1, "c": "pandas", "b": dt.datetime(2020, 1, 20, 20, 20, 20, 20)},
        {"a": 2, "c": "pandas", "b": dt.datetime(2019, 1, 20, 20, 20, 20, 20)},
        {"a": 1, "c": "numpy", "b": None},
    ]
    dtype = pd.ArrowDtype(
        pa.struct([("a", pa.int64()), ("c", pa.string()), ("b", pa.timestamp("us"))])
    )
    # Fix: the original bound the local name ``series``, shadowing the
    # ``import bigframes.series as series`` module alias used elsewhere.
    bf_series = bigframes.pandas.Series(data, dtype=dtype)
    expected = pd.Series(data, dtype=dtype)
    # bigframes uses an Int64 default index; align pandas before comparing.
    expected.index = expected.index.astype("Int64")
    pd.testing.assert_series_equal(bf_series.to_pandas(), expected)


def test_series_construct_w_dtype_for_array_string():
    """Series construction honors an ArrowDtype list-of-string ``dtype``."""
    data = [["1", "2", "3"], [], ["4", "5"]]
    dtype = pd.ArrowDtype(pa.list_(pa.string()))
    bf_series = bigframes.pandas.Series(data, dtype=dtype)
    expected = pd.Series(data, dtype=dtype)
    # bigframes uses an Int64 default index; align pandas before comparing.
    expected.index = expected.index.astype("Int64")

    # Skip dtype check due to internal issue b/321013333: on affected pandas
    # versions, array types degrade to ``object`` dtype in ``to_pandas()``,
    # which would mismatch the expected dtype.
    # (Idiom fix: assign the feature flag directly instead of if/else.)
    check_dtype = bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable

    pd.testing.assert_series_equal(
        bf_series.to_pandas(), expected, check_dtype=check_dtype
    )


def test_series_construct_w_dtype_for_array_struct():
    """Series construction honors an ArrowDtype list-of-struct ``dtype``."""
    data = [[{"a": 1, "c": "aa"}, {"a": 2, "c": "bb"}], [], [{"a": 3, "c": "cc"}]]
    dtype = pd.ArrowDtype(pa.list_(pa.struct([("a", pa.int64()), ("c", pa.string())])))
    bf_series = bigframes.pandas.Series(data, dtype=dtype)
    expected = pd.Series(data, dtype=dtype)
    # bigframes uses an Int64 default index; align pandas before comparing.
    expected.index = expected.index.astype("Int64")

    # Skip dtype check due to internal issue b/321013333: on affected pandas
    # versions, array types degrade to ``object`` dtype in ``to_pandas()``,
    # which would mismatch the expected dtype.
    # (Idiom fix: assign the feature flag directly instead of if/else.)
    check_dtype = bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable

    pd.testing.assert_series_equal(
        bf_series.to_pandas(), expected, check_dtype=check_dtype
    )


def test_series_keys(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = scalars_df["int64_col"].keys().to_pandas()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,12 @@ def field(self, name_or_index: str | int):
>>> bpd.options.display.progress_bar = None
>>> s = bpd.Series(
... [
... {"project": "pandas", "version": 1},
... {"project": "pandas", "version": 2},
... {"project": "numpy", "version": 1},
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=bpd.ArrowDtype(pa.struct(
... [("project", pa.string()), ("version", pa.int64())]
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )

Expand All @@ -106,7 +106,7 @@ def field(self, name_or_index: str | int):

Extract by field index.

>>> s.struct.field(1)
>>> s.struct.field(0)
0 1
1 2
2 1
Expand All @@ -133,22 +133,22 @@ def explode(self):
>>> bpd.options.display.progress_bar = None
>>> s = bpd.Series(
... [
... {"project": "pandas", "version": 1},
... {"project": "pandas", "version": 2},
... {"project": "numpy", "version": 1},
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=bpd.ArrowDtype(pa.struct(
... [("project", pa.string()), ("version", pa.int64())]
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )

Extract all child fields.

>>> s.struct.explode()
project version
0 pandas 1
1 pandas 2
2 numpy 1
version project
0 1 pandas
1 2 pandas
2 1 numpy
<BLANKLINE>
[3 rows x 2 columns]

Expand Down Expand Up @@ -178,8 +178,8 @@ def dtypes(self):
... ))
... )
>>> s.struct.dtypes()
project string[pyarrow]
version Int64
project string[pyarrow]
dtype: object

Returns:
Expand All @@ -205,21 +205,21 @@ def explode(self, column, *, separator: str = "."):
>>> countries = bpd.Series(["cn", "es", "us"])
>>> files = bpd.Series(
... [
... {"project": "pandas", "version": 1},
... {"project": "pandas", "version": 2},
... {"project": "numpy", "version": 1},
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=bpd.ArrowDtype(pa.struct(
... [("project", pa.string()), ("version", pa.int64())]
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> downloads = bpd.Series([100, 200, 300])
>>> df = bpd.DataFrame({"country": countries, "file": files, "download_count": downloads})
>>> df.struct.explode("file")
country file.project file.version download_count
0 cn pandas 1 100
1 es pandas 2 200
2 us numpy 1 300
country file.version file.project download_count
0 cn 1 pandas 100
1 es 2 pandas 200
2 us 1 numpy 300
<BLANKLINE>
[3 rows x 4 columns]

Expand Down
Morty Proxy This is a proxified and sanitized view of the page, visit original site.