Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

feat: Support python type as astype arg #1316

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions 10 bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,7 @@ def split(
# Create an ordering col and convert to string
block, ordering_col = block.promote_offsets()
block, string_ordering_col = block.apply_unary_op(
ordering_col, ops.AsTypeOp(to_type="string[pyarrow]")
ordering_col, ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE)
)

# Apply hash method to sum col and order by it.
Expand Down Expand Up @@ -1479,7 +1479,9 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block:
expr, new_col = expr.project_to_id(
expression=ops.add_op.as_expr(
ex.const(prefix),
ops.AsTypeOp(to_type="string").as_expr(index_col),
ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr(
index_col
),
),
)
new_index_cols.append(new_col)
Expand All @@ -1502,7 +1504,9 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block:
for index_col in self._index_columns:
expr, new_col = expr.project_to_id(
expression=ops.add_op.as_expr(
ops.AsTypeOp(to_type="string").as_expr(index_col),
ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr(
index_col
),
ex.const(suffix),
),
)
Expand Down
32 changes: 3 additions & 29 deletions 32 bigframes/core/compile/ibis_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@
# limitations under the License.
from __future__ import annotations

import textwrap
import typing
from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union
from typing import cast, Dict, Iterable, Optional, Tuple, Union

import bigframes_vendored.constants as constants
import bigframes_vendored.ibis
Expand All @@ -28,7 +27,6 @@
import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import google.cloud.bigquery as bigquery
import numpy as np
import pandas as pd
import pyarrow as pa

Expand Down Expand Up @@ -228,9 +226,7 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value:


def bigframes_dtype_to_ibis_dtype(
bigframes_dtype: Union[
bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[Any]
]
bigframes_dtype: bigframes.dtypes.Dtype,
) -> ibis_dtypes.DataType:
"""Converts a BigQuery DataFrames supported dtype to an Ibis dtype.

Expand All @@ -244,36 +240,14 @@ def bigframes_dtype_to_ibis_dtype(
Raises:
ValueError: If passed a dtype not supported by BigQuery DataFrames.
"""
if str(bigframes_dtype) in bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES:
bigframes_dtype = bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[
cast(bigframes.dtypes.DtypeString, str(bigframes_dtype))
]

if bigframes_dtype in BIGFRAMES_TO_IBIS.keys():
return BIGFRAMES_TO_IBIS[bigframes_dtype]

elif isinstance(bigframes_dtype, pd.ArrowDtype) and bigframes_dtype.pyarrow_dtype:
return _arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype)

else:
raise ValueError(
textwrap.dedent(
f"""
Unexpected data type {bigframes_dtype}. The following
str dtypes are supppted: 'boolean','Float64','Int64',
'int64[pyarrow]','string','string[pyarrow]',
'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
'date32[day][pyarrow]','time64[us][pyarrow]'.
The following pandas.ExtensionDtype are supported:
pandas.BooleanDtype(), pandas.Float64Dtype(),
pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
pd.ArrowDtype(pa.timestamp("us")),
pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
{constants.FEEDBACK_LINK}
"""
)
)
raise ValueError(f"Datatype has no ibis type mapping: {bigframes_dtype}")


def ibis_dtype_to_bigframes_dtype(
Expand Down
6 changes: 4 additions & 2 deletions 6 bigframes/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ def __new__(
if name is not None:
index.name = name
if dtype is not None:
index = index.astype(dtype)
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
index = index.astype(bf_dtype)
block = index._block
elif isinstance(data, pandas.Index):
pd_df = pandas.DataFrame(index=data)
Expand Down Expand Up @@ -310,14 +311,15 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"):

def astype(
self,
dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype],
dtype,
*,
errors: Literal["raise", "null"] = "raise",
) -> Index:
if errors not in ["raise", "null"]:
raise ValueError("Argument 'errors' must be one of 'raise' or 'null'")
if self.nlevels > 1:
raise TypeError("Multiindex does not support 'astype'")
dtype = bigframes.dtypes.bigframes_type(dtype)
return self._apply_unary_expr(
ops.AsTypeOp(to_type=dtype, safe=(errors == "null")).as_expr(
ex.free_var("arg")
Expand Down
6 changes: 6 additions & 0 deletions 6 bigframes/core/local_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType:
if pa.types.is_time64(type):
# This is potentially lossy, but BigFrames doesn't support ns
return pa.time64("us")
if pa.types.is_decimal128(type):
return pa.decimal128(38, 9)
if pa.types.is_decimal256(type):
return pa.decimal256(76, 38)
if pa.types.is_dictionary(type):
return arrow_type_replacements(type.value_type)
if pa.types.is_large_string(type):
# simple string type can handle the largest strings needed
return pa.string()
Expand Down
18 changes: 6 additions & 12 deletions 18 bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,8 @@ def __init__(
if columns:
block = block.select_columns(list(columns)) # type:ignore
if dtype:
block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=dtype))
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
self._block = block

else:
Expand Down Expand Up @@ -368,6 +369,7 @@ def astype(
dtype: Union[
bigframes.dtypes.DtypeString,
bigframes.dtypes.Dtype,
type,
dict[str, Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype]],
],
*,
Expand All @@ -378,23 +380,15 @@ def astype(

safe_cast = errors == "null"

# Type strings check
if dtype in bigframes.dtypes.DTYPE_STRINGS:
return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))

# Type instances check
if type(dtype) in bigframes.dtypes.DTYPES:
return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))

if isinstance(dtype, dict):
result = self.copy()
for col, to_type in dtype.items():
result[col] = result[col].astype(to_type)
return result

raise TypeError(
f"Invalid type {type(dtype)} for dtype input. {constants.FEEDBACK_LINK}"
)
dtype = bigframes.dtypes.bigframes_type(dtype)

return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))

def _to_sql_query(
self, include_index: bool, enable_cache: bool = True
Expand Down
114 changes: 91 additions & 23 deletions 114 bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from dataclasses import dataclass
import datetime
import decimal
import textwrap
import typing
from typing import Any, Dict, List, Literal, Union

Expand Down Expand Up @@ -422,7 +423,7 @@ def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype:
return DEFAULT_DTYPE

# No other types matched.
raise ValueError(
raise TypeError(
f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}"
)

Expand All @@ -447,7 +448,7 @@ def bigframes_dtype_to_arrow_dtype(
if pa.types.is_struct(bigframes_dtype.pyarrow_dtype):
return bigframes_dtype.pyarrow_dtype
else:
raise ValueError(
raise TypeError(
f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
)

Expand All @@ -474,7 +475,7 @@ def bigframes_dtype_to_literal(
if isinstance(bigframes_dtype, gpd.array.GeometryDtype):
return shapely.Point((0, 0))

raise ValueError(
raise TypeError(
f"No literal conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
)

Expand Down Expand Up @@ -515,11 +516,91 @@ def arrow_type_to_literal(
if pa.types.is_time(arrow_type):
return datetime.time(1, 1, 1)

raise ValueError(
raise TypeError(
f"No literal conversion for {arrow_type}. {constants.FEEDBACK_LINK}"
)


def bigframes_type(dtype) -> Dtype:
    """Convert type object to canonical bigframes dtype.

    Accepts, in order of checking: an already-canonical bigframes dtype
    (returned unchanged), a dtype string, a python ``type`` object
    (e.g. ``int``, ``str``), or a pyarrow ``DataType``.

    Raises:
        TypeError: If the input cannot be mapped to a supported bigframes dtype.
    """
    if _is_bigframes_dtype(dtype):
        return dtype
    elif isinstance(dtype, str):
        return _dtype_from_string(dtype)
    elif isinstance(dtype, type):
        return _infer_dtype_from_python_type(dtype)
    elif isinstance(dtype, pa.DataType):
        return arrow_dtype_to_bigframes_dtype(dtype)
    else:
        raise TypeError(
            f"Cannot infer supported datatype for: {dtype}. {constants.FEEDBACK_LINK}"
        )


def _is_bigframes_dtype(dtype) -> bool:
    """True iff dtype is a canonical bigframes dtype."""
    # have to be quite strict, as pyarrow dtypes equal their string form, and we don't consider that a canonical form.
    # Comparing (type, value) pairs rather than values alone enforces that strictness.
    if (type(dtype), dtype) in set(
        (type(item.dtype), item.dtype) for item in SIMPLE_TYPES
    ):
        return True
    if isinstance(dtype, pd.ArrowDtype):
        # An ArrowDtype is canonical iff its pyarrow type has a bigframes mapping;
        # probe the converter and treat a TypeError as "not supported".
        try:
            _ = arrow_dtype_to_bigframes_dtype(dtype.pyarrow_dtype)
            return True
        except TypeError:
            return False
    return False


def _infer_dtype_from_python_type(type: type) -> Dtype:
    """Map a python builtin or numpy scalar type to the canonical bigframes dtype.

    Args:
        type: A python type such as ``int``, ``str``, ``datetime.date``, or a
            numpy scalar type such as ``np.int64``.

    Raises:
        TypeError: If the type has no bigframes dtype mapping.
    """
    # bool must be checked before int, as bool is a subclass of int.
    if issubclass(type, (bool, np.bool_)):
        return BOOL_DTYPE
    if issubclass(type, (int, np.integer)):
        return INT_DTYPE
    if issubclass(type, (float, np.floating)):
        return FLOAT_DTYPE
    if issubclass(type, decimal.Decimal):
        return NUMERIC_DTYPE
    if issubclass(type, (str, np.str_)):
        return STRING_DTYPE
    if issubclass(type, (bytes, np.bytes_)):
        return BYTES_DTYPE
    # Make sure to check datetime before date, as datetime is a subclass of
    # date and would otherwise be silently mapped to DATE_DTYPE.
    if issubclass(type, datetime.datetime):
        return DATETIME_DTYPE
    if issubclass(type, datetime.date):
        return DATE_DTYPE
    if issubclass(type, datetime.time):
        return TIME_DTYPE
    raise TypeError(
        f"No matching datatype for python type: {type}. {constants.FEEDBACK_LINK}"
    )


def _dtype_from_string(dtype_string: str) -> Dtype:
    """Convert a dtype string (e.g. "Int64", "string[pyarrow]") to the
    canonical bigframes dtype.

    Raises:
        TypeError: If the string does not name a supported dtype.
    """
    # Note: return type is Dtype (not Optional) — this function either
    # returns a mapping or raises; it never returns None.
    if dtype_string in BIGFRAMES_STRING_TO_BIGFRAMES:
        return BIGFRAMES_STRING_TO_BIGFRAMES[
            typing.cast(DtypeString, dtype_string)
        ]
    raise TypeError(
        textwrap.dedent(
            f"""
            Unexpected data type string {dtype_string}. The following
            dtypes are supported: 'boolean','Float64','Int64',
            'int64[pyarrow]','string','string[pyarrow]',
            'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
            'date32[day][pyarrow]','time64[us][pyarrow]'.
            The following pandas.ExtensionDtype are supported:
            pandas.BooleanDtype(), pandas.Float64Dtype(),
            pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
            pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
            pd.ArrowDtype(pa.timestamp("us")),
            pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
            {constants.FEEDBACK_LINK}
            """
        )
    )


def infer_literal_type(literal) -> typing.Optional[Dtype]:
# Maybe also normalize literal to canonical python representation to remove this burden from compilers?
if pd.api.types.is_list_like(literal):
Expand All @@ -539,30 +620,17 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]:
return pd.ArrowDtype(pa.struct(fields))
if pd.isna(literal):
return None # Null value without a definite type
if isinstance(literal, (bool, np.bool_)):
return BOOL_DTYPE
if isinstance(literal, (int, np.integer)):
return INT_DTYPE
if isinstance(literal, (float, np.floating)):
return FLOAT_DTYPE
if isinstance(literal, decimal.Decimal):
return NUMERIC_DTYPE
if isinstance(literal, (str, np.str_)):
return STRING_DTYPE
if isinstance(literal, (bytes, np.bytes_)):
return BYTES_DTYPE
# Make sure to check datetime before date as datetimes are also dates
if isinstance(literal, (datetime.datetime, pd.Timestamp)):
if literal.tzinfo is not None:
return TIMESTAMP_DTYPE
else:
return DATETIME_DTYPE
if isinstance(literal, datetime.date):
return DATE_DTYPE
if isinstance(literal, datetime.time):
return TIME_DTYPE
from_python_type = _infer_dtype_from_python_type(type(literal))
if from_python_type is not None:
return from_python_type
else:
raise ValueError(f"Unable to infer type for value: {literal}")
raise TypeError(f"Unable to infer type for value: {literal}")


def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]:
Expand Down Expand Up @@ -602,7 +670,7 @@ def convert_schema_field(
return field.name, pd.ArrowDtype(pa_type)
return field.name, _TK_TO_BIGFRAMES[field.field_type]
else:
raise ValueError(f"Cannot handle type: {field.field_type}")
raise TypeError(f"Cannot handle type: {field.field_type}")


def convert_to_schema_field(
Expand Down Expand Up @@ -636,7 +704,7 @@ def convert_to_schema_field(
if bigframes_dtype.pyarrow_dtype == pa.duration("us"):
# Timedeltas are represented as integers in microseconds.
return google.cloud.bigquery.SchemaField(name, "INTEGER")
raise ValueError(
raise TypeError(
f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
)

Expand Down
3 changes: 2 additions & 1 deletion 3 bigframes/operations/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ def __init__(
if name is not None:
data.name = name
if dtype is not None:
data = data.astype(dtype)
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
data = data.astype(bf_dtype)
else: # local dict-like data
data = read_pandas_func(pd.Series(data, name=name, dtype=dtype)) # type: ignore
data_block = data._block
Expand Down
Loading
Morty Proxy This is a proxified and sanitized view of the page, visit original site.