Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

fix: dtype parameter ineffective in Series/DataFrame construction #1354

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion 5 bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,10 @@ def is_object_like(type_: Union[ExpressionType, str]) -> bool:
# See: https://stackoverflow.com/a/40312924/101923 and
# https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html
# for the way to identify object type.
return type_ in ("object", "O") or getattr(type_, "kind", None) == "O"
return type_ in ("object", "O") or (
getattr(type_, "kind", None) == "O"
and getattr(type_, "storage", None) != "pyarrow"
)


def is_string_like(type_: ExpressionType) -> bool:
Expand Down
5 changes: 3 additions & 2 deletions 5 tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import geopandas as gpd # type: ignore
import pandas as pd
import pyarrow as pa
import pytest

import bigframes.bigquery as bbq
Expand Down Expand Up @@ -174,7 +175,7 @@ def test_json_extract_array_from_json_strings():
actual = bbq.json_extract_array(s, "$.a")
expected = bpd.Series(
[['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None],
dtype=pd.StringDtype(storage="pyarrow"),
dtype=pd.ArrowDtype(pa.list_(pa.string())),
)
pd.testing.assert_series_equal(
actual.to_pandas(),
Expand All @@ -190,7 +191,7 @@ def test_json_extract_array_from_json_array_strings():
actual = bbq.json_extract_array(s)
expected = bpd.Series(
[["1", "2", "3"], [], ["4", "5"]],
dtype=pd.StringDtype(storage="pyarrow"),
dtype=pd.ArrowDtype(pa.list_(pa.string())),
)
pd.testing.assert_series_equal(
actual.to_pandas(),
Expand Down
13 changes: 13 additions & 0 deletions 13 tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,19 @@ def test_df_construct_inline_respects_location():
assert table.location == "europe-west1"


def test_df_construct_dtype():
    """Constructing a DataFrame with an explicit ``dtype`` should match pandas."""
    columns = {
        "int_col": [1, 2, 3],
        "string_col": ["1.1", "2.0", "3.5"],
        "float_col": [1.0, 2.0, 3.0],
    }
    target_dtype = pd.StringDtype(storage="pyarrow")
    bf_result = dataframe.DataFrame(columns, dtype=target_dtype)
    pd_result = pd.DataFrame(columns, dtype=target_dtype)
    # BigQuery DataFrames uses an Int64 default index; align pandas before comparing.
    pd_result.index = pd_result.index.astype("Int64")
    pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result)


def test_get_column(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "int64_col"
Expand Down
74 changes: 74 additions & 0 deletions 74 tests/system/small/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import pytest
import shapely # type: ignore

import bigframes.features
import bigframes.pandas
import bigframes.series as series
from tests.system.utils import (
Expand Down Expand Up @@ -228,6 +229,79 @@ def test_series_construct_geodata():
)


@pytest.mark.parametrize(
    "dtype",
    [
        pytest.param(pd.Int64Dtype(), id="int"),
        pytest.param(pd.Float64Dtype(), id="float"),
        pytest.param(pd.StringDtype(storage="pyarrow"), id="string"),
    ],
)
def test_series_construct_w_dtype_for_int(dtype):
    """Series construction honors the ``dtype`` argument for scalar dtypes.

    NOTE(review): despite the ``_for_int`` suffix, the parametrization also
    covers float and pyarrow-backed string dtypes.
    """
    data = [1, 2, 3]
    expected = pd.Series(data, dtype=dtype)
    # bigframes uses an Int64 default index; align pandas before comparing.
    expected.index = expected.index.astype("Int64")
    # Fix: the original bound the local name ``series``, shadowing the
    # ``import bigframes.series as series`` module alias used elsewhere.
    bf_series = bigframes.pandas.Series(data, dtype=dtype)
    pd.testing.assert_series_equal(bf_series.to_pandas(), expected)


def test_series_construct_w_dtype_for_struct():
    """Series construction honors an ArrowDtype struct ``dtype``.

    The input dicts deliberately list their fields in a different order than
    the declared struct type, to verify field reordering is handled during
    construction.
    """
    data = [
        {"a": 1, "c": "pandas", "b": dt.datetime(2020, 1, 20, 20, 20, 20, 20)},
        {"a": 2, "c": "pandas", "b": dt.datetime(2019, 1, 20, 20, 20, 20, 20)},
        {"a": 1, "c": "numpy", "b": None},
    ]
    dtype = pd.ArrowDtype(
        pa.struct([("a", pa.int64()), ("c", pa.string()), ("b", pa.timestamp("us"))])
    )
    # Fix: the original bound the local name ``series``, shadowing the
    # ``import bigframes.series as series`` module alias used elsewhere.
    bf_series = bigframes.pandas.Series(data, dtype=dtype)
    expected = pd.Series(data, dtype=dtype)
    # bigframes uses an Int64 default index; align pandas before comparing.
    expected.index = expected.index.astype("Int64")
    pd.testing.assert_series_equal(bf_series.to_pandas(), expected)


def test_series_construct_w_dtype_for_array_string():
    """Series construction honors an ArrowDtype list-of-string ``dtype``."""
    data = [["1", "2", "3"], [], ["4", "5"]]
    dtype = pd.ArrowDtype(pa.list_(pa.string()))
    bf_series = bigframes.pandas.Series(data, dtype=dtype)
    expected = pd.Series(data, dtype=dtype)
    # bigframes uses an Int64 default index; align pandas before comparing.
    expected.index = expected.index.astype("Int64")

    # Skip dtype check due to internal issue b/321013333: on affected pandas
    # versions, array types degrade to ``object`` dtype in ``to_pandas()``,
    # which would mismatch the expected dtype.
    # (Idiom fix: assign the feature flag directly instead of if/else.)
    check_dtype = bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable

    pd.testing.assert_series_equal(
        bf_series.to_pandas(), expected, check_dtype=check_dtype
    )


def test_series_construct_w_dtype_for_array_struct():
    """Series construction honors an ArrowDtype list-of-struct ``dtype``."""
    data = [[{"a": 1, "c": "aa"}, {"a": 2, "c": "bb"}], [], [{"a": 3, "c": "cc"}]]
    dtype = pd.ArrowDtype(pa.list_(pa.struct([("a", pa.int64()), ("c", pa.string())])))
    bf_series = bigframes.pandas.Series(data, dtype=dtype)
    expected = pd.Series(data, dtype=dtype)
    # bigframes uses an Int64 default index; align pandas before comparing.
    expected.index = expected.index.astype("Int64")

    # Skip dtype check due to internal issue b/321013333: on affected pandas
    # versions, array types degrade to ``object`` dtype in ``to_pandas()``,
    # which would mismatch the expected dtype.
    # (Idiom fix: assign the feature flag directly instead of if/else.)
    check_dtype = bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable

    pd.testing.assert_series_equal(
        bf_series.to_pandas(), expected, check_dtype=check_dtype
    )


def test_series_keys(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = scalars_df["int64_col"].keys().to_pandas()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,12 @@ def field(self, name_or_index: str | int):
>>> bpd.options.display.progress_bar = None
>>> s = bpd.Series(
... [
... {"project": "pandas", "version": 1},
... {"project": "pandas", "version": 2},
... {"project": "numpy", "version": 1},
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=bpd.ArrowDtype(pa.struct(
... [("project", pa.string()), ("version", pa.int64())]
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )

Expand All @@ -106,7 +106,7 @@ def field(self, name_or_index: str | int):

Extract by field index.

>>> s.struct.field(1)
>>> s.struct.field(0)
0 1
1 2
2 1
Expand All @@ -133,22 +133,22 @@ def explode(self):
>>> bpd.options.display.progress_bar = None
>>> s = bpd.Series(
... [
... {"project": "pandas", "version": 1},
... {"project": "pandas", "version": 2},
... {"project": "numpy", "version": 1},
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=bpd.ArrowDtype(pa.struct(
... [("project", pa.string()), ("version", pa.int64())]
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )

Extract all child fields.

>>> s.struct.explode()
project version
0 pandas 1
1 pandas 2
2 numpy 1
version project
0 1 pandas
1 2 pandas
2 1 numpy
<BLANKLINE>
[3 rows x 2 columns]

Expand Down Expand Up @@ -178,8 +178,8 @@ def dtypes(self):
... ))
... )
>>> s.struct.dtypes()
project string[pyarrow]
version Int64
project string[pyarrow]
dtype: object

Returns:
Expand All @@ -205,21 +205,21 @@ def explode(self, column, *, separator: str = "."):
>>> countries = bpd.Series(["cn", "es", "us"])
>>> files = bpd.Series(
... [
... {"project": "pandas", "version": 1},
... {"project": "pandas", "version": 2},
... {"project": "numpy", "version": 1},
... {"version": 1, "project": "pandas"},
... {"version": 2, "project": "pandas"},
... {"version": 1, "project": "numpy"},
... ],
... dtype=bpd.ArrowDtype(pa.struct(
... [("project", pa.string()), ("version", pa.int64())]
... [("version", pa.int64()), ("project", pa.string())]
... ))
... )
>>> downloads = bpd.Series([100, 200, 300])
>>> df = bpd.DataFrame({"country": countries, "file": files, "download_count": downloads})
>>> df.struct.explode("file")
country file.project file.version download_count
0 cn pandas 1 100
1 es pandas 2 200
2 us numpy 1 300
country file.version file.project download_count
0 cn 1 pandas 100
1 es 2 pandas 200
2 us 1 numpy 300
<BLANKLINE>
[3 rows x 4 columns]

Expand Down
Morty Proxy This is a proxified and sanitized view of the page, visit original site.