From b2d49e52895f2be3cd018b7d29d1dac19cd25e18 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 23 Jan 2025 18:46:28 +0000 Subject: [PATCH 01/10] feat: Support python type as astype arg --- bigframes/core/compile/ibis_types.py | 32 +------ bigframes/core/indexes/base.py | 3 +- bigframes/dataframe.py | 15 ++-- bigframes/dtypes.py | 90 +++++++++++++++---- bigframes/operations/generic_ops.py | 11 +-- bigframes/series.py | 1 + tests/system/small/test_dataframe.py | 16 +++- tests/system/small/test_index.py | 6 ++ tests/system/small/test_series.py | 11 +++ .../bigframes_vendored/pandas/core/generic.py | 2 +- .../pandas/core/indexes/base.py | 2 +- 11 files changed, 124 insertions(+), 65 deletions(-) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 18f0834903..e91f456b97 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -13,9 +13,8 @@ # limitations under the License. from __future__ import annotations -import textwrap import typing -from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union +from typing import cast, Dict, Iterable, Optional, Tuple, Union import warnings import bigframes_vendored.constants as constants @@ -28,7 +27,6 @@ import bigframes_vendored.ibis.expr.types as ibis_types import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery -import numpy as np import pandas as pd import pyarrow as pa @@ -228,9 +226,7 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: def bigframes_dtype_to_ibis_dtype( - bigframes_dtype: Union[ - bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[Any] - ] + bigframes_dtype: bigframes.dtypes.Dtype, ) -> ibis_dtypes.DataType: """Converts a BigQuery DataFrames supported dtype to an Ibis dtype. @@ -244,11 +240,6 @@ def bigframes_dtype_to_ibis_dtype( Raises: ValueError: If passed a dtype not supported by BigQuery DataFrames. """ - if str(bigframes_dtype) in bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES: - bigframes_dtype = bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[ - cast(bigframes.dtypes.DtypeString, str(bigframes_dtype)) - ] - if bigframes_dtype in BIGFRAMES_TO_IBIS.keys(): return BIGFRAMES_TO_IBIS[bigframes_dtype] @@ -256,24 +247,7 @@ def bigframes_dtype_to_ibis_dtype( return _arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype) else: - raise ValueError( - textwrap.dedent( - f""" - Unexpected data type {bigframes_dtype}. The following - str dtypes are supppted: 'boolean','Float64','Int64', - 'int64[pyarrow]','string','string[pyarrow]', - 'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]', - 'date32[day][pyarrow]','time64[us][pyarrow]'. - The following pandas.ExtensionDtype are supported: - pandas.BooleanDtype(), pandas.Float64Dtype(), - pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"), - pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")), - pd.ArrowDtype(pa.timestamp("us")), - pd.ArrowDtype(pa.timestamp("us", tz="UTC")). - {constants.FEEDBACK_LINK} - """ - ) - ) + raise ValueError(f"Datatype has not ibis type mapping: {bigframes_dtype}") def ibis_dtype_to_bigframes_dtype( diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index da0daf027a..dcfe252708 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -310,7 +310,7 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"): def astype( self, - dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], + dtype, *, errors: Literal["raise", "null"] = "raise", ) -> Index: @@ -318,6 +318,7 @@ def astype( raise ValueError("Argument 'errors' must be one of 'raise' or 'null'") if self.nlevels > 1: raise TypeError("Multiindex does not support 'astype'") + dtype = bigframes.dtypes.bigframes_type(dtype) return self._apply_unary_expr( ops.AsTypeOp(to_type=dtype, safe=(errors == "null")).as_expr( ex.free_var("arg") diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7f60f1c769..8f5a18afd0 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -368,6 +368,7 @@ def astype( dtype: Union[ bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, + type, dict[str, Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype]], ], *, @@ -378,6 +379,14 @@ def astype( safe_cast = errors == "null" + if isinstance(dtype, dict): + result = self.copy() + for col, to_type in dtype.items(): + result[col] = result[col].astype(to_type) + return result + + dtype = bigframes.dtypes.bigframes_type(dtype) + # Type strings check if dtype in bigframes.dtypes.DTYPE_STRINGS: return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast)) @@ -386,12 +395,6 @@ def astype( if type(dtype) in bigframes.dtypes.DTYPES: return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast)) - if isinstance(dtype, dict): - result = self.copy() - for col, to_type in dtype.items(): - result[col] = result[col].astype(to_type) - return result - raise TypeError( f"Invalid type {type(dtype)} for dtype input. {constants.FEEDBACK_LINK}" ) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 3da3fa24f3..3fc1571e4b 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -17,6 +17,7 @@ from dataclasses import dataclass import datetime import decimal +import textwrap import typing from typing import Any, Dict, List, Literal, Union @@ -519,6 +520,76 @@ def arrow_type_to_literal( ) +def bigframes_type(dtype) -> Dtype: + if _is_bigframes_dtype(dtype): + return dtype + elif isinstance(dtype, str): + return _dtype_from_string(dtype) + elif isinstance(dtype, type): + return _infer_dtype_from_python_type(dtype) + else: + raise ValueError(f"Cannot infer supported datatype for: {dtype}") + + +def _is_bigframes_dtype(dtype) -> bool: + """True iff dtyps is a canonical bigframes dtype""" + if dtype in set(item.dtype for item in SIMPLE_TYPES): + return True + if isinstance(dtype, pd.ArrowDtype): + try: + _ = arrow_dtype_to_bigframes_dtype(dtype.pyarrow_dtype) + return True + except ValueError: + return False + return False + + +def _infer_dtype_from_python_type(type: type) -> Dtype: + if issubclass(type, (bool, np.bool_)): + return BOOL_DTYPE + if issubclass(type, (int, np.integer)): + return INT_DTYPE + if issubclass(type, (float, np.floating)): + return FLOAT_DTYPE + if issubclass(type, decimal.Decimal): + return NUMERIC_DTYPE + if issubclass(type, (str, np.str_)): + return STRING_DTYPE + if issubclass(type, (bytes, np.bytes_)): + return BYTES_DTYPE + if issubclass(type, datetime.date): + return DATE_DTYPE + if issubclass(type, datetime.time): + return TIME_DTYPE + else: + raise ValueError(f"No matching datatype for python type: {type}") + + +def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]: + if str(dtype_string) in BIGFRAMES_STRING_TO_BIGFRAMES: + return BIGFRAMES_STRING_TO_BIGFRAMES[ + typing.cast(DtypeString, str(dtype_string)) + ] + raise ValueError( + textwrap.dedent( + f""" + Unexpected data type string {dtype_string}. The following + dtypes are supppted: 'boolean','Float64','Int64', + 'int64[pyarrow]','string','string[pyarrow]', + 'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]', + 'date32[day][pyarrow]','time64[us][pyarrow]'. + The following pandas.ExtensionDtype are supported: + pandas.BooleanDtype(), pandas.Float64Dtype(), + pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"), + pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")), + pd.ArrowDtype(pa.timestamp("us")), + pd.ArrowDtype(pa.timestamp("us", tz="UTC")). + {constants.FEEDBACK_LINK} + """ + ) + ) + + def infer_literal_type(literal) -> typing.Optional[Dtype]: # Maybe also normalize literal to canonical python representation to remove this burden from compilers? if pd.api.types.is_list_like(literal): @@ -538,28 +609,15 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]: return pd.ArrowDtype(pa.struct(fields)) if pd.isna(literal): return None # Null value without a definite type - if isinstance(literal, (bool, np.bool_)): - return BOOL_DTYPE - if isinstance(literal, (int, np.integer)): - return INT_DTYPE - if isinstance(literal, (float, np.floating)): - return FLOAT_DTYPE - if isinstance(literal, decimal.Decimal): - return NUMERIC_DTYPE - if isinstance(literal, (str, np.str_)): - return STRING_DTYPE - if isinstance(literal, (bytes, np.bytes_)): - return BYTES_DTYPE # Make sure to check datetime before date as datetimes are also dates if isinstance(literal, (datetime.datetime, pd.Timestamp)): if literal.tzinfo is not None: return TIMESTAMP_DTYPE else: return DATETIME_DTYPE - if isinstance(literal, datetime.date): - return DATE_DTYPE - if isinstance(literal, datetime.time): - return TIME_DTYPE + from_python_type = _infer_dtype_from_python_type(type(literal)) + if from_python_type is not None: + return from_python_type else: raise ValueError(f"Unable to infer type for value: {literal}") diff --git a/bigframes/operations/generic_ops.py b/bigframes/operations/generic_ops.py index ef7e1f5cea..b90a43b091 100644 --- a/bigframes/operations/generic_ops.py +++ b/bigframes/operations/generic_ops.py @@ -16,8 +16,6 @@ import functools import typing -import pyarrow as pa - from bigframes import dtypes from bigframes.operations import base_ops import bigframes.operations.type as op_typing @@ -56,17 +54,10 @@ class AsTypeOp(base_ops.UnaryOp): name: typing.ClassVar[str] = "astype" # TODO: Convert strings to dtype earlier - to_type: typing.Union[dtypes.DtypeString, dtypes.Dtype] + to_type: dtypes.Dtype safe: bool = False def output_type(self, *input_types): - # TODO: We should do this conversion earlier - if self.to_type == pa.string(): - return dtypes.STRING_DTYPE - if isinstance(self.to_type, str): - return dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[ - typing.cast(dtypes.DtypeString, self.to_type) - ] return self.to_type diff --git a/bigframes/series.py b/bigframes/series.py index 46847996f1..eddb30330b 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -362,6 +362,7 @@ def astype( ) -> Series: if errors not in ["raise", "null"]: raise ValueError("Argument 'errors' must be one of 'raise' or 'null'") + dtype = bigframes.dtypes.bigframes_type(dtype) return self._apply_unary_op( bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null")) ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 93c865536c..aaa111a437 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5249,7 +5249,7 @@ def test__resample_start_time(rule, origin, data): ), ], ) -def test_astype(scalars_dfs, dtype): +def test_df_astype(scalars_dfs, dtype): bf_df, pd_df = scalars_dfs target_cols = ["bool_col", "int64_col"] bf_df = bf_df[target_cols] @@ -5261,6 +5261,20 @@ def test_astype(scalars_dfs, dtype): pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) +def test_df_astype_python_types(scalars_dfs): + bf_df, pd_df = scalars_dfs + target_cols = ["bool_col", "int64_col"] + bf_df = bf_df[target_cols] + pd_df = pd_df[target_cols] + + bf_result = bf_df.astype({"bool_col": str, "int64_col": float}).to_pandas() + pd_result = pd_df.astype( + {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()} + ) + + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + def test_astype_invalid_type_fail(scalars_dfs): bf_df, _ = scalars_dfs diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index cdf4fa6511..4d01bc5ee9 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -123,6 +123,12 @@ def test_index_astype(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_index_equal(bf_result, pd_result) +def test_index_astype_python(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index("int64_col").index.astype(float).to_pandas() + pd_result = scalars_pandas_df_index.set_index("int64_col").index.astype("Float64") + pd.testing.assert_index_equal(bf_result, pd_result) + + def test_index_astype_error_error(session): input = pd.Index(["hello", "world", "3.11", "4000"]) with pytest.raises(ValueError): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 670828f616..f3f22470fe 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -3218,6 +3218,17 @@ def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, erro pd.testing.assert_series_equal(bf_result, pd_result) +def test_series_astype_python(session): + input = pd.Series(["hello", "world", "3.11", "4000"]) + exepcted = pd.Series( + [None, None, 3.11, 4000], + dtype="Float64", + index=pd.Index([0, 1, 2, 3], dtype="Int64"), + ) + result = session.read_pandas(input).astype(float, errors="null").to_pandas() + pd.testing.assert_series_equal(result, exepcted) + + def test_astype_safe(session): input = pd.Series(["hello", "world", "3.11", "4000"]) exepcted = pd.Series( diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 83a24f7a9c..9dae802b6e 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -165,7 +165,7 @@ def astype(self, dtype): dtype: Int64 Args: - dtype (str or pandas.ExtensionDtype): + dtype (str, data type or pandas.ExtensionDtype): A dtype supported by BigQuery DataFrame include ``'boolean'``, ``'Float64'``, ``'Int64'``, ``'int64\\[pyarrow\\]'``, ``'string'``, ``'string\\[pyarrow\\]'``, diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index c48c07424d..59504ee68c 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -445,7 +445,7 @@ def astype(self, dtype): Args: - dtype (str or pandas.ExtensionDtype): + dtype (str, data type, or pandas.ExtensionDtype): A dtype supported by BigQuery DataFrame include ``'boolean'``, ``'Float64'``, ``'Int64'``, ``'int64\\[pyarrow\\]'``, ``'string'``, ``'string\\[pyarrow\\]'``, From 48829e6e451abd217e2d20a2052c165703f3f9d6 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 23 Jan 2025 18:50:27 +0000 Subject: [PATCH 02/10] remove tests --- tests/unit/core/test_dtypes.py | 40 ---------------------------------- 1 file changed, 40 deletions(-) diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py index e1fac624d7..035af5d9f1 100644 --- a/tests/unit/core/test_dtypes.py +++ b/tests/unit/core/test_dtypes.py @@ -16,7 +16,6 @@ import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.types as ibis_types import geopandas as gpd # type: ignore -import numpy as np import pandas as pd import pyarrow as pa # type: ignore import pytest @@ -197,45 +196,6 @@ def test_bigframes_dtype_converts(ibis_dtype, bigframes_dtype): assert result == ibis_dtype -@pytest.mark.parametrize( - ["bigframes_dtype_str", "ibis_dtype"], - [ - # This test covers all dtypes that BigQuery DataFrames can exactly map to Ibis - ("boolean", ibis_dtypes.boolean), - ("date32[day][pyarrow]", ibis_dtypes.date), - ("timestamp[us][pyarrow]", ibis_dtypes.Timestamp()), - ("Float64", ibis_dtypes.float64), - ("Int64", ibis_dtypes.int64), - ("string[pyarrow]", ibis_dtypes.string), - ("time64[us][pyarrow]", ibis_dtypes.time), - ( - "timestamp[us, tz=UTC][pyarrow]", - ibis_dtypes.Timestamp(timezone="UTC"), - ), - # Special case - "string" is acceptable for "string[pyarrow]" - ("string", ibis_dtypes.string), - ], -) -def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str): - """Test all the Ibis data types needed to read BigQuery tables""" - result = bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype( - bigframes_dtype_str - ) - assert result == ibis_dtype - - -def test_unsupported_dtype_raises_unexpected_datatype(): - """Incompatible dtypes should fail when passed into BigQuery DataFrames""" - with pytest.raises(ValueError, match="Unexpected data type"): - bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(np.float32) - - -def test_unsupported_dtype_str_raises_unexpected_datatype(): - """Incompatible dtypes should fail when passed into BigQuery DataFrames""" - with pytest.raises(ValueError, match="Unexpected data type"): - bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype("int64") - - @pytest.mark.parametrize( ["literal", "ibis_scalar"], [ From 59a6dbc34c5404a8613b293dd9dd138f483090cf Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 23 Jan 2025 19:16:20 +0000 Subject: [PATCH 03/10] fix astype with dtype strings --- bigframes/core/blocks.py | 10 +++++++--- bigframes/dataframe.py | 12 +----------- tests/unit/core/test_expression.py | 2 +- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index afc03dbdea..37663f749b 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -707,7 +707,7 @@ def split( # Create an ordering col and convert to string block, ordering_col = block.promote_offsets() block, string_ordering_col = block.apply_unary_op( - ordering_col, ops.AsTypeOp(to_type="string[pyarrow]") + ordering_col, ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE) ) # Apply hash method to sum col and order by it. @@ -1479,7 +1479,9 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: expr, new_col = expr.project_to_id( expression=ops.add_op.as_expr( ex.const(prefix), - ops.AsTypeOp(to_type="string").as_expr(index_col), + ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr( + index_col + ), ), ) new_index_cols.append(new_col) @@ -1502,7 +1504,9 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: for index_col in self._index_columns: expr, new_col = expr.project_to_id( expression=ops.add_op.as_expr( - ops.AsTypeOp(to_type="string").as_expr(index_col), + ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr( + index_col + ), ex.const(suffix), ), ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 8f5a18afd0..d307820bf7 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -387,17 +387,7 @@ def astype( dtype = bigframes.dtypes.bigframes_type(dtype) - # Type strings check - if dtype in bigframes.dtypes.DTYPE_STRINGS: - return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast)) - - # Type instances check - if type(dtype) in bigframes.dtypes.DTYPES: - return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast)) - - raise TypeError( - f"Invalid type {type(dtype)} for dtype input. {constants.FEEDBACK_LINK}" - ) + return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast)) def _to_sql_query( self, include_index: bool, enable_cache: bool = True diff --git a/tests/unit/core/test_expression.py b/tests/unit/core/test_expression.py index 72e200f007..ab6402a909 100644 --- a/tests/unit/core/test_expression.py +++ b/tests/unit/core/test_expression.py @@ -47,7 +47,7 @@ def test_expression_dtype_where(): def test_expression_dtype_astype(): - expression = ops.AsTypeOp("Int64").as_expr(ex.const(3.14159)) + expression = ops.AsTypeOp(dtypes.INT_DTYPE).as_expr(ex.const(3.14159)) result = expression.output_type({}) From 19172c1e7ff619f1edc6e67ec6bc4c1739a6b781 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 23 Jan 2025 21:57:55 +0000 Subject: [PATCH 04/10] handle object constructors and use TypeError --- bigframes/core/indexes/base.py | 3 ++- bigframes/dataframe.py | 3 ++- bigframes/dtypes.py | 26 +++++++++++++++----------- bigframes/operations/base.py | 3 ++- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index dcfe252708..6ad0973262 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -78,7 +78,8 @@ def __new__( if name is not None: index.name = name if dtype is not None: - index = index.astype(dtype) + bf_dtype = bigframes.dtypes.bigframes_type(dtype) + index = index.astype(bf_dtype) block = index._block elif isinstance(data, pandas.Index): pd_df = pandas.DataFrame(index=data) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d307820bf7..3d1d2f31e0 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -180,7 +180,8 @@ def __init__( if columns: block = block.select_columns(list(columns)) # type:ignore if dtype: - block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=dtype)) + bf_dtype = bigframes.dtypes.bigframes_type(dtype) + block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) self._block = block else: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 3fc1571e4b..200bb1f7bf 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -422,7 +422,7 @@ def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: return DEFAULT_DTYPE # No other types matched. - raise ValueError( + raise TypeError( f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}" ) @@ -447,7 +447,7 @@ def bigframes_dtype_to_arrow_dtype( if pa.types.is_struct(bigframes_dtype.pyarrow_dtype): return bigframes_dtype.pyarrow_dtype else: - raise ValueError( + raise TypeError( f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}" ) @@ -474,7 +474,7 @@ def bigframes_dtype_to_literal( if isinstance(bigframes_dtype, gpd.array.GeometryDtype): return shapely.Point((0, 0)) - raise ValueError( + raise TypeError( f"No literal conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}" ) @@ -515,7 +515,7 @@ def arrow_type_to_literal( if pa.types.is_time(arrow_type): return datetime.time(1, 1, 1) - raise ValueError( + raise TypeError( f"No literal conversion for {arrow_type}. {constants.FEEDBACK_LINK}" ) @@ -528,7 +528,9 @@ def bigframes_type(dtype) -> Dtype: elif isinstance(dtype, type): return _infer_dtype_from_python_type(dtype) else: - raise ValueError(f"Cannot infer supported datatype for: {dtype}") + raise TypeError( + f"Cannot infer supported datatype for: {dtype}. {constants.FEEDBACK_LINK}" + ) def _is_bigframes_dtype(dtype) -> bool: @@ -539,7 +541,7 @@ def _is_bigframes_dtype(dtype) -> bool: try: _ = arrow_dtype_to_bigframes_dtype(dtype.pyarrow_dtype) return True - except ValueError: + except TypeError: return False return False @@ -562,7 +564,9 @@ def _infer_dtype_from_python_type(type: type) -> Dtype: if issubclass(type, datetime.time): return TIME_DTYPE else: - raise ValueError(f"No matching datatype for python type: {type}") + raise TypeError( + f"No matching datatype for python type: {type}. {constants.FEEDBACK_LINK}" + ) def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]: @@ -570,7 +574,7 @@ def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]: return BIGFRAMES_STRING_TO_BIGFRAMES[ typing.cast(DtypeString, str(dtype_string)) ] - raise ValueError( + raise TypeError( textwrap.dedent( f""" Unexpected data type string {dtype_string}. The following @@ -619,7 +623,7 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]: if from_python_type is not None: return from_python_type else: - raise ValueError(f"Unable to infer type for value: {literal}") + raise TypeError(f"Unable to infer type for value: {literal}") def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]: @@ -659,7 +663,7 @@ def convert_schema_field( return field.name, pd.ArrowDtype(pa_type) return field.name, _TK_TO_BIGFRAMES[field.field_type] else: - raise ValueError(f"Cannot handle type: {field.field_type}") + raise TypeError(f"Cannot handle type: {field.field_type}") def convert_to_schema_field( @@ -690,7 +694,7 @@ def convert_to_schema_field( return google.cloud.bigquery.SchemaField( name, "RECORD", fields=inner_fields ) - raise ValueError( + raise TypeError( f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}" ) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index f6e8223aa0..75db2f48e9 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -87,7 +87,8 @@ def __init__( if name is not None: data.name = name if dtype is not None: - data = data.astype(dtype) + bf_dtype = bigframes.dtypes.bigframes_type(dtype) + data = data.astype(bf_dtype) else: # local dict-like data data = read_pandas_func(pd.Series(data, name=name, dtype=dtype)) # type: ignore data_block = data._block From de2026c845c8cff4238d87173fdc5772e58bb70e Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 23 Jan 2025 22:07:07 +0000 Subject: [PATCH 05/10] allow astype with pyarrow type --- bigframes/dtypes.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 200bb1f7bf..fb17d4d54f 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -521,12 +521,15 @@ def arrow_type_to_literal( def bigframes_type(dtype) -> Dtype: + """Convert type object to canoncial bigframes dtype.""" if _is_bigframes_dtype(dtype): return dtype elif isinstance(dtype, str): return _dtype_from_string(dtype) elif isinstance(dtype, type): return _infer_dtype_from_python_type(dtype) + elif isinstance(dtype, pa.DataType): + return arrow_dtype_to_bigframes_dtype(dtype) else: raise TypeError( f"Cannot infer supported datatype for: {dtype}. {constants.FEEDBACK_LINK}" From 18d7d27e5e3ba2d534912b51229056acd7df70c0 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 24 Jan 2025 00:09:42 +0000 Subject: [PATCH 06/10] fix _is_bigframes_type for strings --- bigframes/dtypes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 7a90471110..0a3e2fc875 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -538,7 +538,10 @@ def bigframes_type(dtype) -> Dtype: def _is_bigframes_dtype(dtype) -> bool: """True iff dtyps is a canonical bigframes dtype""" - if dtype in set(item.dtype for item in SIMPLE_TYPES): + # have to be quite strict, as pyarrow dtypes equal their string form, and we don't consider that a canonical form. + if (type(type), dtype) in set( + (type(item.dtype), item.dtype) for item in SIMPLE_TYPES + ): return True if isinstance(dtype, pd.ArrowDtype): try: From 7dc19387c551c153d6256567dfd3f571796f0169 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 27 Jan 2025 22:05:04 +0000 Subject: [PATCH 07/10] fix typo in _is_bigframes_dtype --- bigframes/dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 0a3e2fc875..2382861971 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -539,7 +539,7 @@ def bigframes_type(dtype) -> Dtype: def _is_bigframes_dtype(dtype) -> bool: """True iff dtyps is a canonical bigframes dtype""" # have to be quite strict, as pyarrow dtypes equal their string form, and we don't consider that a canonical form. - if (type(type), dtype) in set( + if (type(dtype), dtype) in set( (type(item.dtype), item.dtype) for item in SIMPLE_TYPES ): return True From 489896f492f305451d16c65c467081623fa116e3 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 28 Jan 2025 05:03:57 +0000 Subject: [PATCH 08/10] normalize decimal arrow types --- bigframes/core/local_data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index 573562cefa..8e00ec020e 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -59,6 +59,10 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType: if pa.types.is_time64(type): # This is potentially lossy, but BigFrames doesn't support ns return pa.time64("us") + if pa.types.is_decimal128(type): + return pa.decimal128(38, 9) + if pa.types.is_decimal256(type): + return pa.decimal256(76, 38) if pa.types.is_large_string(type): # simple string type can handle the largest strings needed return pa.string() From bd7be357592b1c2886baa12d247eb0297e234df1 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 28 Jan 2025 20:27:19 +0000 Subject: [PATCH 09/10] handle type errors better, and handle cat type --- bigframes/core/local_data.py | 2 ++ bigframes/session/__init__.py | 1 + 2 files changed, 3 insertions(+) diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index 8e00ec020e..f665948be2 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -63,6 +63,8 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType: return pa.decimal128(38, 9) if pa.types.is_decimal256(type): return pa.decimal256(76, 38) + if pa.types.is_dictionary(type): + return arrow_type_replacements(type.value_type) if pa.types.is_large_string(type): # simple string type can handle the largest strings needed return pa.string() diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 89ea0eee69..686545bed3 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -806,6 +806,7 @@ def _read_pandas_inline( pa.ArrowInvalid, # Thrown by arrow for unsupported types, such as geo. pa.ArrowTypeError, # Thrown by arrow for types without mapping (geo). ValueError, # Thrown by ibis for some unhandled types + TypeError, # Not all types handleable by local code path ) as exc: if should_raise: raise ValueError( From e8687cdc62d977cd446ab25813bdea24c895c4fd Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Wed, 29 Jan 2025 21:22:27 +0000 Subject: [PATCH 10/10] reinstate unit tests --- bigframes/core/compile/ibis_types.py | 2 +- tests/unit/core/test_dtypes.py | 40 ++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 74de95d49d..8a55f6775d 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -247,7 +247,7 @@ def bigframes_dtype_to_ibis_dtype( return _arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype) else: - raise ValueError(f"Datatype has not ibis type mapping: {bigframes_dtype}") + raise ValueError(f"Datatype has no ibis type mapping: {bigframes_dtype}") def ibis_dtype_to_bigframes_dtype( diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py index 035af5d9f1..3d420de51f 100644 --- a/tests/unit/core/test_dtypes.py +++ b/tests/unit/core/test_dtypes.py @@ -16,6 +16,7 @@ import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.types as ibis_types import geopandas as gpd # type: ignore +import numpy as np import pandas as pd import pyarrow as pa # type: ignore import pytest @@ -196,6 +197,45 @@ def test_bigframes_dtype_converts(ibis_dtype, bigframes_dtype): assert result == ibis_dtype +@pytest.mark.parametrize( + ["bigframes_dtype_str", "ibis_dtype"], + [ + # This test covers all dtypes that BigQuery DataFrames can exactly map to Ibis + ("boolean", ibis_dtypes.boolean), + ("date32[day][pyarrow]", ibis_dtypes.date), + ("timestamp[us][pyarrow]", ibis_dtypes.Timestamp()), + ("Float64", ibis_dtypes.float64), + ("Int64", ibis_dtypes.int64), + ("string[pyarrow]", ibis_dtypes.string), + ("time64[us][pyarrow]", ibis_dtypes.time), + ( + "timestamp[us, tz=UTC][pyarrow]", + ibis_dtypes.Timestamp(timezone="UTC"), + ), + # Special case - "string" is acceptable for "string[pyarrow]" + ("string", ibis_dtypes.string), + ], +) +def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str): + """Test all the Ibis data types needed to read BigQuery tables""" + result = bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype( + bigframes.dtypes.bigframes_type(bigframes_dtype_str) + ) + assert result == ibis_dtype + + +def test_unsupported_dtype_raises_unexpected_datatype(): + """Incompatible dtypes should fail when passed into BigQuery DataFrames""" + with pytest.raises(ValueError, match="Datatype has no ibis type mapping"): + bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(np.float32) + + +def test_unsupported_dtype_str_raises_unexpected_datatype(): + """Incompatible dtypes should fail when passed into BigQuery DataFrames""" + with pytest.raises(ValueError, match="Datatype has no ibis type mapping"): + bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype("int64") + + @pytest.mark.parametrize( ["literal", "ibis_scalar"], [