diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 8b1ca3b0c8..b06046a027 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -295,7 +295,10 @@ def is_object_like(type_: Union[ExpressionType, str]) -> bool:
     # See: https://stackoverflow.com/a/40312924/101923 and
     # https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html
     # for the way to identify object type.
-    return type_ in ("object", "O") or getattr(type_, "kind", None) == "O"
+    return type_ in ("object", "O") or (
+        getattr(type_, "kind", None) == "O"
+        and getattr(type_, "storage", None) != "pyarrow"
+    )
 
 
 def is_string_like(type_: ExpressionType) -> bool:
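Why the extra storage guard above is needed: pandas reports kind == "O" for StringDtype even when it is Arrow-backed, so the old check misclassified pyarrow-backed string columns as object-like. A minimal sketch of that behavior (not part of the patch), assuming a recent pandas with pyarrow installed:

    import pandas as pd

    dtype = pd.StringDtype(storage="pyarrow")
    print(dtype.kind)     # "O" -- indistinguishable from object by kind alone
    print(dtype.storage)  # "pyarrow" -- the new guard keys off this attribute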
diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py
index aa490749ae..8f97856eea 100644
--- a/tests/system/small/bigquery/test_json.py
+++ b/tests/system/small/bigquery/test_json.py
@@ -16,6 +16,7 @@
 
 import geopandas as gpd  # type: ignore
 import pandas as pd
+import pyarrow as pa
 import pytest
 
 import bigframes.bigquery as bbq
@@ -174,7 +175,7 @@ def test_json_extract_array_from_json_strings():
     actual = bbq.json_extract_array(s, "$.a")
     expected = bpd.Series(
         [['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None],
-        dtype=pd.StringDtype(storage="pyarrow"),
+        dtype=pd.ArrowDtype(pa.list_(pa.string())),
     )
     pd.testing.assert_series_equal(
         actual.to_pandas(),
@@ -190,7 +191,7 @@ def test_json_extract_array_from_json_array_strings():
     actual = bbq.json_extract_array(s)
     expected = bpd.Series(
         [["1", "2", "3"], [], ["4", "5"]],
-        dtype=pd.StringDtype(storage="pyarrow"),
+        dtype=pd.ArrowDtype(pa.list_(pa.string())),
     )
     pd.testing.assert_series_equal(
         actual.to_pandas(),
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index aa038c62d8..e7556043af 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -166,6 +166,19 @@ def test_df_construct_inline_respects_location():
     assert table.location == "europe-west1"
 
 
+def test_df_construct_dtype():
+    data = {
+        "int_col": [1, 2, 3],
+        "string_col": ["1.1", "2.0", "3.5"],
+        "float_col": [1.0, 2.0, 3.0],
+    }
+    dtype = pd.StringDtype(storage="pyarrow")
+    bf_result = dataframe.DataFrame(data, dtype=dtype)
+    pd_result = pd.DataFrame(data, dtype=dtype)
+    pd_result.index = pd_result.index.astype("Int64")
+    pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result)
+
+
 def test_get_column(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     col_name = "int64_col"
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index fb48bf58b4..cdda7c753d 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -26,6 +26,7 @@
 import pytest
 import shapely  # type: ignore
 
+import bigframes.features
 import bigframes.pandas
 import bigframes.series as series
 from tests.system.utils import (
@@ -228,6 +229,79 @@ def test_series_construct_geodata():
     )
 
 
+@pytest.mark.parametrize(
+    ("dtype"),
+    [
+        pytest.param(pd.Int64Dtype(), id="int"),
+        pytest.param(pd.Float64Dtype(), id="float"),
+        pytest.param(pd.StringDtype(storage="pyarrow"), id="string"),
+    ],
+)
+def test_series_construct_w_dtype(dtype):
+    data = [1, 2, 3]
+    expected = pd.Series(data, dtype=dtype)
+    expected.index = expected.index.astype("Int64")
+    series = bigframes.pandas.Series(data, dtype=dtype)
+    pd.testing.assert_series_equal(series.to_pandas(), expected)
+
+
+def test_series_construct_w_dtype_for_struct():
+    # The struct fields are intentionally out of order relative to the dtype to
+    # verify they are reordered correctly during construction.
+    data = [
+        {"a": 1, "c": "pandas", "b": dt.datetime(2020, 1, 20, 20, 20, 20, 20)},
+        {"a": 2, "c": "pandas", "b": dt.datetime(2019, 1, 20, 20, 20, 20, 20)},
+        {"a": 1, "c": "numpy", "b": None},
+    ]
+    dtype = pd.ArrowDtype(
+        pa.struct([("a", pa.int64()), ("c", pa.string()), ("b", pa.timestamp("us"))])
+    )
+    series = bigframes.pandas.Series(data, dtype=dtype)
+    expected = pd.Series(data, dtype=dtype)
+    expected.index = expected.index.astype("Int64")
+    pd.testing.assert_series_equal(series.to_pandas(), expected)
+
+
+def test_series_construct_w_dtype_for_array_string():
+    data = [["1", "2", "3"], [], ["4", "5"]]
+    dtype = pd.ArrowDtype(pa.list_(pa.string()))
+    series = bigframes.pandas.Series(data, dtype=dtype)
+    expected = pd.Series(data, dtype=dtype)
+    expected.index = expected.index.astype("Int64")
+
+    # Skip the dtype check due to internal issue b/321013333, which causes array
+    # types to be converted to the `object` dtype when calling `to_pandas()`,
+    # resulting in a mismatch with the expected pandas type.
+    if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable:
+        check_dtype = True
+    else:
+        check_dtype = False
+
+    pd.testing.assert_series_equal(
+        series.to_pandas(), expected, check_dtype=check_dtype
+    )
+
+
+def test_series_construct_w_dtype_for_array_struct():
+    data = [[{"a": 1, "c": "aa"}, {"a": 2, "c": "bb"}], [], [{"a": 3, "c": "cc"}]]
+    dtype = pd.ArrowDtype(pa.list_(pa.struct([("a", pa.int64()), ("c", pa.string())])))
+    series = bigframes.pandas.Series(data, dtype=dtype)
+    expected = pd.Series(data, dtype=dtype)
+    expected.index = expected.index.astype("Int64")
+
+    # Skip the dtype check due to internal issue b/321013333, which causes array
+    # types to be converted to the `object` dtype when calling `to_pandas()`,
+    # resulting in a mismatch with the expected pandas type.
+    if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable:
+        check_dtype = True
+    else:
+        check_dtype = False
+
+    pd.testing.assert_series_equal(
+        series.to_pandas(), expected, check_dtype=check_dtype
+    )
+
+
 def test_series_keys(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     bf_result = scalars_df["int64_col"].keys().to_pandas()
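The new tests above lean on pd.ArrowDtype so that list- and struct-valued data keeps a typed Arrow dtype instead of decaying to object. A minimal local-pandas sketch of the same construction (not part of the patch), assuming pandas >= 2.0 with pyarrow:

    import pandas as pd
    import pyarrow as pa

    data = [["1", "2", "3"], [], ["4", "5"]]
    s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.string())))
    print(s.dtype)  # list<item: string>[pyarrow] -- Arrow-backed, not object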
{"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, ... ], ... dtype=bpd.ArrowDtype(pa.struct( - ... [("project", pa.string()), ("version", pa.int64())] + ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) Extract all child fields. >>> s.struct.explode() - project version - 0 pandas 1 - 1 pandas 2 - 2 numpy 1 + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy [3 rows x 2 columns] @@ -178,8 +178,8 @@ def dtypes(self): ... )) ... ) >>> s.struct.dtypes() - project string[pyarrow] version Int64 + project string[pyarrow] dtype: object Returns: @@ -205,21 +205,21 @@ def explode(self, column, *, separator: str = "."): >>> countries = bpd.Series(["cn", "es", "us"]) >>> files = bpd.Series( ... [ - ... {"project": "pandas", "version": 1}, - ... {"project": "pandas", "version": 2}, - ... {"project": "numpy", "version": 1}, + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, ... ], ... dtype=bpd.ArrowDtype(pa.struct( - ... [("project", pa.string()), ("version", pa.int64())] + ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) >>> downloads = bpd.Series([100, 200, 300]) >>> df = bpd.DataFrame({"country": countries, "file": files, "download_count": downloads}) >>> df.struct.explode("file") - country file.project file.version download_count - 0 cn pandas 1 100 - 1 es pandas 2 200 - 2 us numpy 1 300 + country file.version file.project download_count + 0 cn 1 pandas 100 + 1 es 2 pandas 200 + 2 us 1 numpy 300 [3 rows x 4 columns]