From b2d49e52895f2be3cd018b7d29d1dac19cd25e18 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Thu, 23 Jan 2025 18:46:28 +0000
Subject: [PATCH 01/10] feat: Support python type as astype arg

---
 bigframes/core/compile/ibis_types.py          | 32 +------
 bigframes/core/indexes/base.py                |  3 +-
 bigframes/dataframe.py                        | 15 ++--
 bigframes/dtypes.py                           | 90 +++++++++++++++----
 bigframes/operations/generic_ops.py           | 11 +--
 bigframes/series.py                           |  1 +
 tests/system/small/test_dataframe.py          | 16 +++-
 tests/system/small/test_index.py              |  6 ++
 tests/system/small/test_series.py             | 11 +++
 .../bigframes_vendored/pandas/core/generic.py |  2 +-
 .../pandas/core/indexes/base.py               |  2 +-
 11 files changed, 124 insertions(+), 65 deletions(-)

diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py
index 18f0834903..e91f456b97 100644
--- a/bigframes/core/compile/ibis_types.py
+++ b/bigframes/core/compile/ibis_types.py
@@ -13,9 +13,8 @@
 # limitations under the License.
 from __future__ import annotations
 
-import textwrap
 import typing
-from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union
+from typing import cast, Dict, Iterable, Optional, Tuple, Union
 import warnings
 
 import bigframes_vendored.constants as constants
@@ -28,7 +27,6 @@
 import bigframes_vendored.ibis.expr.types as ibis_types
 import geopandas as gpd  # type: ignore
 import google.cloud.bigquery as bigquery
-import numpy as np
 import pandas as pd
 import pyarrow as pa
 
@@ -228,9 +226,7 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value:
 
 
 def bigframes_dtype_to_ibis_dtype(
-    bigframes_dtype: Union[
-        bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[Any]
-    ]
+    bigframes_dtype: bigframes.dtypes.Dtype,
 ) -> ibis_dtypes.DataType:
     """Converts a BigQuery DataFrames supported dtype to an Ibis dtype.
 
@@ -244,11 +240,6 @@ def bigframes_dtype_to_ibis_dtype(
     Raises:
         ValueError: If passed a dtype not supported by BigQuery DataFrames.
     """
-    if str(bigframes_dtype) in bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES:
-        bigframes_dtype = bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[
-            cast(bigframes.dtypes.DtypeString, str(bigframes_dtype))
-        ]
-
     if bigframes_dtype in BIGFRAMES_TO_IBIS.keys():
         return BIGFRAMES_TO_IBIS[bigframes_dtype]
 
@@ -256,24 +247,7 @@ def bigframes_dtype_to_ibis_dtype(
         return _arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype)
 
     else:
-        raise ValueError(
-            textwrap.dedent(
-                f"""
-                Unexpected data type {bigframes_dtype}. The following
-                        str dtypes are supppted: 'boolean','Float64','Int64',
-                        'int64[pyarrow]','string','string[pyarrow]',
-                        'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
-                        'date32[day][pyarrow]','time64[us][pyarrow]'.
-                        The following pandas.ExtensionDtype are supported:
-                        pandas.BooleanDtype(), pandas.Float64Dtype(),
-                        pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
-                        pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
-                        pd.ArrowDtype(pa.timestamp("us")),
-                        pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
-                {constants.FEEDBACK_LINK}
-                """
-            )
-        )
+        raise ValueError(f"Datatype has not ibis type mapping: {bigframes_dtype}")
 
 
 def ibis_dtype_to_bigframes_dtype(
diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
index da0daf027a..dcfe252708 100644
--- a/bigframes/core/indexes/base.py
+++ b/bigframes/core/indexes/base.py
@@ -310,7 +310,7 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"):
 
     def astype(
         self,
-        dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype],
+        dtype,
         *,
         errors: Literal["raise", "null"] = "raise",
     ) -> Index:
@@ -318,6 +318,7 @@ def astype(
             raise ValueError("Argument 'errors' must be one of 'raise' or 'null'")
         if self.nlevels > 1:
             raise TypeError("Multiindex does not support 'astype'")
+        dtype = bigframes.dtypes.bigframes_type(dtype)
         return self._apply_unary_expr(
             ops.AsTypeOp(to_type=dtype, safe=(errors == "null")).as_expr(
                 ex.free_var("arg")
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 7f60f1c769..8f5a18afd0 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -368,6 +368,7 @@ def astype(
         dtype: Union[
             bigframes.dtypes.DtypeString,
             bigframes.dtypes.Dtype,
+            type,
             dict[str, Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype]],
         ],
         *,
@@ -378,6 +379,14 @@ def astype(
 
         safe_cast = errors == "null"
 
+        if isinstance(dtype, dict):
+            result = self.copy()
+            for col, to_type in dtype.items():
+                result[col] = result[col].astype(to_type)
+            return result
+
+        dtype = bigframes.dtypes.bigframes_type(dtype)
+
         # Type strings check
         if dtype in bigframes.dtypes.DTYPE_STRINGS:
             return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))
@@ -386,12 +395,6 @@ def astype(
         if type(dtype) in bigframes.dtypes.DTYPES:
             return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))
 
-        if isinstance(dtype, dict):
-            result = self.copy()
-            for col, to_type in dtype.items():
-                result[col] = result[col].astype(to_type)
-            return result
-
         raise TypeError(
             f"Invalid type {type(dtype)} for dtype input. {constants.FEEDBACK_LINK}"
         )
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 3da3fa24f3..3fc1571e4b 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -17,6 +17,7 @@
 from dataclasses import dataclass
 import datetime
 import decimal
+import textwrap
 import typing
 from typing import Any, Dict, List, Literal, Union
 
@@ -519,6 +520,76 @@ def arrow_type_to_literal(
     )
 
 
+def bigframes_type(dtype) -> Dtype:
+    if _is_bigframes_dtype(dtype):
+        return dtype
+    elif isinstance(dtype, str):
+        return _dtype_from_string(dtype)
+    elif isinstance(dtype, type):
+        return _infer_dtype_from_python_type(dtype)
+    else:
+        raise ValueError(f"Cannot infer supported datatype for: {dtype}")
+
+
+def _is_bigframes_dtype(dtype) -> bool:
+    """True iff dtyps is a canonical bigframes dtype"""
+    if dtype in set(item.dtype for item in SIMPLE_TYPES):
+        return True
+    if isinstance(dtype, pd.ArrowDtype):
+        try:
+            _ = arrow_dtype_to_bigframes_dtype(dtype.pyarrow_dtype)
+            return True
+        except ValueError:
+            return False
+    return False
+
+
+def _infer_dtype_from_python_type(type: type) -> Dtype:
+    if issubclass(type, (bool, np.bool_)):
+        return BOOL_DTYPE
+    if issubclass(type, (int, np.integer)):
+        return INT_DTYPE
+    if issubclass(type, (float, np.floating)):
+        return FLOAT_DTYPE
+    if issubclass(type, decimal.Decimal):
+        return NUMERIC_DTYPE
+    if issubclass(type, (str, np.str_)):
+        return STRING_DTYPE
+    if issubclass(type, (bytes, np.bytes_)):
+        return BYTES_DTYPE
+    if issubclass(type, datetime.date):
+        return DATE_DTYPE
+    if issubclass(type, datetime.time):
+        return TIME_DTYPE
+    else:
+        raise ValueError(f"No matching datatype for python type: {type}")
+
+
+def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]:
+    if str(dtype_string) in BIGFRAMES_STRING_TO_BIGFRAMES:
+        return BIGFRAMES_STRING_TO_BIGFRAMES[
+            typing.cast(DtypeString, str(dtype_string))
+        ]
+    raise ValueError(
+        textwrap.dedent(
+            f"""
+                Unexpected data type string {dtype_string}. The following
+                        dtypes are supppted: 'boolean','Float64','Int64',
+                        'int64[pyarrow]','string','string[pyarrow]',
+                        'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
+                        'date32[day][pyarrow]','time64[us][pyarrow]'.
+                        The following pandas.ExtensionDtype are supported:
+                        pandas.BooleanDtype(), pandas.Float64Dtype(),
+                        pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
+                        pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
+                        pd.ArrowDtype(pa.timestamp("us")),
+                        pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
+                {constants.FEEDBACK_LINK}
+                """
+        )
+    )
+
+
 def infer_literal_type(literal) -> typing.Optional[Dtype]:
     # Maybe also normalize literal to canonical python representation to remove this burden from compilers?
     if pd.api.types.is_list_like(literal):
@@ -538,28 +609,15 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]:
         return pd.ArrowDtype(pa.struct(fields))
     if pd.isna(literal):
         return None  # Null value without a definite type
-    if isinstance(literal, (bool, np.bool_)):
-        return BOOL_DTYPE
-    if isinstance(literal, (int, np.integer)):
-        return INT_DTYPE
-    if isinstance(literal, (float, np.floating)):
-        return FLOAT_DTYPE
-    if isinstance(literal, decimal.Decimal):
-        return NUMERIC_DTYPE
-    if isinstance(literal, (str, np.str_)):
-        return STRING_DTYPE
-    if isinstance(literal, (bytes, np.bytes_)):
-        return BYTES_DTYPE
     # Make sure to check datetime before date as datetimes are also dates
     if isinstance(literal, (datetime.datetime, pd.Timestamp)):
         if literal.tzinfo is not None:
             return TIMESTAMP_DTYPE
         else:
             return DATETIME_DTYPE
-    if isinstance(literal, datetime.date):
-        return DATE_DTYPE
-    if isinstance(literal, datetime.time):
-        return TIME_DTYPE
+    from_python_type = _infer_dtype_from_python_type(type(literal))
+    if from_python_type is not None:
+        return from_python_type
     else:
         raise ValueError(f"Unable to infer type for value: {literal}")
 
diff --git a/bigframes/operations/generic_ops.py b/bigframes/operations/generic_ops.py
index ef7e1f5cea..b90a43b091 100644
--- a/bigframes/operations/generic_ops.py
+++ b/bigframes/operations/generic_ops.py
@@ -16,8 +16,6 @@
 import functools
 import typing
 
-import pyarrow as pa
-
 from bigframes import dtypes
 from bigframes.operations import base_ops
 import bigframes.operations.type as op_typing
@@ -56,17 +54,10 @@
 class AsTypeOp(base_ops.UnaryOp):
     name: typing.ClassVar[str] = "astype"
     # TODO: Convert strings to dtype earlier
-    to_type: typing.Union[dtypes.DtypeString, dtypes.Dtype]
+    to_type: dtypes.Dtype
     safe: bool = False
 
     def output_type(self, *input_types):
-        # TODO: We should do this conversion earlier
-        if self.to_type == pa.string():
-            return dtypes.STRING_DTYPE
-        if isinstance(self.to_type, str):
-            return dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[
-                typing.cast(dtypes.DtypeString, self.to_type)
-            ]
         return self.to_type
 
 
diff --git a/bigframes/series.py b/bigframes/series.py
index 46847996f1..eddb30330b 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -362,6 +362,7 @@ def astype(
     ) -> Series:
         if errors not in ["raise", "null"]:
             raise ValueError("Argument 'errors' must be one of 'raise' or 'null'")
+        dtype = bigframes.dtypes.bigframes_type(dtype)
         return self._apply_unary_op(
             bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null"))
         )
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 93c865536c..aaa111a437 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -5249,7 +5249,7 @@ def test__resample_start_time(rule, origin, data):
         ),
     ],
 )
-def test_astype(scalars_dfs, dtype):
+def test_df_astype(scalars_dfs, dtype):
     bf_df, pd_df = scalars_dfs
     target_cols = ["bool_col", "int64_col"]
     bf_df = bf_df[target_cols]
@@ -5261,6 +5261,20 @@ def test_astype(scalars_dfs, dtype):
     pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
 
 
+def test_df_astype_python_types(scalars_dfs):
+    bf_df, pd_df = scalars_dfs
+    target_cols = ["bool_col", "int64_col"]
+    bf_df = bf_df[target_cols]
+    pd_df = pd_df[target_cols]
+
+    bf_result = bf_df.astype({"bool_col": str, "int64_col": float}).to_pandas()
+    pd_result = pd_df.astype(
+        {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()}
+    )
+
+    pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+
+
 def test_astype_invalid_type_fail(scalars_dfs):
     bf_df, _ = scalars_dfs
 
diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py
index cdf4fa6511..4d01bc5ee9 100644
--- a/tests/system/small/test_index.py
+++ b/tests/system/small/test_index.py
@@ -123,6 +123,12 @@ def test_index_astype(scalars_df_index, scalars_pandas_df_index):
     pd.testing.assert_index_equal(bf_result, pd_result)
 
 
+def test_index_astype_python(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.set_index("int64_col").index.astype(float).to_pandas()
+    pd_result = scalars_pandas_df_index.set_index("int64_col").index.astype("Float64")
+    pd.testing.assert_index_equal(bf_result, pd_result)
+
+
 def test_index_astype_error_error(session):
     input = pd.Index(["hello", "world", "3.11", "4000"])
     with pytest.raises(ValueError):
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 670828f616..f3f22470fe 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -3218,6 +3218,17 @@ def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, erro
     pd.testing.assert_series_equal(bf_result, pd_result)
 
 
+def test_series_astype_python(session):
+    input = pd.Series(["hello", "world", "3.11", "4000"])
+    exepcted = pd.Series(
+        [None, None, 3.11, 4000],
+        dtype="Float64",
+        index=pd.Index([0, 1, 2, 3], dtype="Int64"),
+    )
+    result = session.read_pandas(input).astype(float, errors="null").to_pandas()
+    pd.testing.assert_series_equal(result, exepcted)
+
+
 def test_astype_safe(session):
     input = pd.Series(["hello", "world", "3.11", "4000"])
     exepcted = pd.Series(
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
index 83a24f7a9c..9dae802b6e 100644
--- a/third_party/bigframes_vendored/pandas/core/generic.py
+++ b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -165,7 +165,7 @@ def astype(self, dtype):
             dtype: Int64
 
         Args:
-            dtype (str or pandas.ExtensionDtype):
+            dtype (str, data type or pandas.ExtensionDtype):
                 A dtype supported by BigQuery DataFrame include ``'boolean'``,
                 ``'Float64'``, ``'Int64'``, ``'int64\\[pyarrow\\]'``,
                 ``'string'``, ``'string\\[pyarrow\\]'``,
diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py
index c48c07424d..59504ee68c 100644
--- a/third_party/bigframes_vendored/pandas/core/indexes/base.py
+++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py
@@ -445,7 +445,7 @@ def astype(self, dtype):
 
 
         Args:
-            dtype (str or pandas.ExtensionDtype):
+            dtype (str, data type, or pandas.ExtensionDtype):
                 A dtype supported by BigQuery DataFrame include ``'boolean'``,
                 ``'Float64'``, ``'Int64'``, ``'int64\\[pyarrow\\]'``,
                 ``'string'``, ``'string\\[pyarrow\\]'``,

From 48829e6e451abd217e2d20a2052c165703f3f9d6 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Thu, 23 Jan 2025 18:50:27 +0000
Subject: [PATCH 02/10] remove obsolete tests for string/numpy inputs to bigframes_dtype_to_ibis_dtype

---
 tests/unit/core/test_dtypes.py | 40 ----------------------------------
 1 file changed, 40 deletions(-)

diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py
index e1fac624d7..035af5d9f1 100644
--- a/tests/unit/core/test_dtypes.py
+++ b/tests/unit/core/test_dtypes.py
@@ -16,7 +16,6 @@
 import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
 import bigframes_vendored.ibis.expr.types as ibis_types
 import geopandas as gpd  # type: ignore
-import numpy as np
 import pandas as pd
 import pyarrow as pa  # type: ignore
 import pytest
@@ -197,45 +196,6 @@ def test_bigframes_dtype_converts(ibis_dtype, bigframes_dtype):
     assert result == ibis_dtype
 
 
-@pytest.mark.parametrize(
-    ["bigframes_dtype_str", "ibis_dtype"],
-    [
-        # This test covers all dtypes that BigQuery DataFrames can exactly map to Ibis
-        ("boolean", ibis_dtypes.boolean),
-        ("date32[day][pyarrow]", ibis_dtypes.date),
-        ("timestamp[us][pyarrow]", ibis_dtypes.Timestamp()),
-        ("Float64", ibis_dtypes.float64),
-        ("Int64", ibis_dtypes.int64),
-        ("string[pyarrow]", ibis_dtypes.string),
-        ("time64[us][pyarrow]", ibis_dtypes.time),
-        (
-            "timestamp[us, tz=UTC][pyarrow]",
-            ibis_dtypes.Timestamp(timezone="UTC"),
-        ),
-        # Special case - "string" is acceptable for "string[pyarrow]"
-        ("string", ibis_dtypes.string),
-    ],
-)
-def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str):
-    """Test all the Ibis data types needed to read BigQuery tables"""
-    result = bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(
-        bigframes_dtype_str
-    )
-    assert result == ibis_dtype
-
-
-def test_unsupported_dtype_raises_unexpected_datatype():
-    """Incompatible dtypes should fail when passed into BigQuery DataFrames"""
-    with pytest.raises(ValueError, match="Unexpected data type"):
-        bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(np.float32)
-
-
-def test_unsupported_dtype_str_raises_unexpected_datatype():
-    """Incompatible dtypes should fail when passed into BigQuery DataFrames"""
-    with pytest.raises(ValueError, match="Unexpected data type"):
-        bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype("int64")
-
-
 @pytest.mark.parametrize(
     ["literal", "ibis_scalar"],
     [

From 59a6dbc34c5404a8613b293dd9dd138f483090cf Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Thu, 23 Jan 2025 19:16:20 +0000
Subject: [PATCH 03/10] fix astype with dtype strings

---
 bigframes/core/blocks.py           | 10 +++++++---
 bigframes/dataframe.py             | 12 +-----------
 tests/unit/core/test_expression.py |  2 +-
 3 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index afc03dbdea..37663f749b 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -707,7 +707,7 @@ def split(
         # Create an ordering col and convert to string
         block, ordering_col = block.promote_offsets()
         block, string_ordering_col = block.apply_unary_op(
-            ordering_col, ops.AsTypeOp(to_type="string[pyarrow]")
+            ordering_col, ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE)
         )
 
         # Apply hash method to sum col and order by it.
@@ -1479,7 +1479,9 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block:
                 expr, new_col = expr.project_to_id(
                     expression=ops.add_op.as_expr(
                         ex.const(prefix),
-                        ops.AsTypeOp(to_type="string").as_expr(index_col),
+                        ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr(
+                            index_col
+                        ),
                     ),
                 )
                 new_index_cols.append(new_col)
@@ -1502,7 +1504,9 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block:
             for index_col in self._index_columns:
                 expr, new_col = expr.project_to_id(
                     expression=ops.add_op.as_expr(
-                        ops.AsTypeOp(to_type="string").as_expr(index_col),
+                        ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr(
+                            index_col
+                        ),
                         ex.const(suffix),
                     ),
                 )
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 8f5a18afd0..d307820bf7 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -387,17 +387,7 @@ def astype(
 
         dtype = bigframes.dtypes.bigframes_type(dtype)
 
-        # Type strings check
-        if dtype in bigframes.dtypes.DTYPE_STRINGS:
-            return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))
-
-        # Type instances check
-        if type(dtype) in bigframes.dtypes.DTYPES:
-            return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))
-
-        raise TypeError(
-            f"Invalid type {type(dtype)} for dtype input. {constants.FEEDBACK_LINK}"
-        )
+        return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))
 
     def _to_sql_query(
         self, include_index: bool, enable_cache: bool = True
diff --git a/tests/unit/core/test_expression.py b/tests/unit/core/test_expression.py
index 72e200f007..ab6402a909 100644
--- a/tests/unit/core/test_expression.py
+++ b/tests/unit/core/test_expression.py
@@ -47,7 +47,7 @@ def test_expression_dtype_where():
 
 
 def test_expression_dtype_astype():
-    expression = ops.AsTypeOp("Int64").as_expr(ex.const(3.14159))
+    expression = ops.AsTypeOp(dtypes.INT_DTYPE).as_expr(ex.const(3.14159))
 
     result = expression.output_type({})
 

From 19172c1e7ff619f1edc6e67ec6bc4c1739a6b781 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Thu, 23 Jan 2025 21:57:55 +0000
Subject: [PATCH 04/10] handle object constructors and use TypeError

---
 bigframes/core/indexes/base.py |  3 ++-
 bigframes/dataframe.py         |  3 ++-
 bigframes/dtypes.py            | 26 +++++++++++++++-----------
 bigframes/operations/base.py   |  3 ++-
 4 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
index dcfe252708..6ad0973262 100644
--- a/bigframes/core/indexes/base.py
+++ b/bigframes/core/indexes/base.py
@@ -78,7 +78,8 @@ def __new__(
             if name is not None:
                 index.name = name
             if dtype is not None:
-                index = index.astype(dtype)
+                bf_dtype = bigframes.dtypes.bigframes_type(dtype)
+                index = index.astype(bf_dtype)
             block = index._block
         elif isinstance(data, pandas.Index):
             pd_df = pandas.DataFrame(index=data)
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index d307820bf7..3d1d2f31e0 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -180,7 +180,8 @@ def __init__(
             if columns:
                 block = block.select_columns(list(columns))  # type:ignore
             if dtype:
-                block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=dtype))
+                bf_dtype = bigframes.dtypes.bigframes_type(dtype)
+                block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
             self._block = block
 
         else:
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 3fc1571e4b..200bb1f7bf 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -422,7 +422,7 @@ def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype:
         return DEFAULT_DTYPE
 
     # No other types matched.
-    raise ValueError(
+    raise TypeError(
         f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}"
     )
 
@@ -447,7 +447,7 @@ def bigframes_dtype_to_arrow_dtype(
         if pa.types.is_struct(bigframes_dtype.pyarrow_dtype):
             return bigframes_dtype.pyarrow_dtype
     else:
-        raise ValueError(
+        raise TypeError(
             f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
         )
 
@@ -474,7 +474,7 @@ def bigframes_dtype_to_literal(
     if isinstance(bigframes_dtype, gpd.array.GeometryDtype):
         return shapely.Point((0, 0))
 
-    raise ValueError(
+    raise TypeError(
         f"No literal  conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
     )
 
@@ -515,7 +515,7 @@ def arrow_type_to_literal(
     if pa.types.is_time(arrow_type):
         return datetime.time(1, 1, 1)
 
-    raise ValueError(
+    raise TypeError(
         f"No literal  conversion for {arrow_type}. {constants.FEEDBACK_LINK}"
     )
 
@@ -528,7 +528,9 @@ def bigframes_type(dtype) -> Dtype:
     elif isinstance(dtype, type):
         return _infer_dtype_from_python_type(dtype)
     else:
-        raise ValueError(f"Cannot infer supported datatype for: {dtype}")
+        raise TypeError(
+            f"Cannot infer supported datatype for: {dtype}. {constants.FEEDBACK_LINK}"
+        )
 
 
 def _is_bigframes_dtype(dtype) -> bool:
@@ -539,7 +541,7 @@ def _is_bigframes_dtype(dtype) -> bool:
         try:
             _ = arrow_dtype_to_bigframes_dtype(dtype.pyarrow_dtype)
             return True
-        except ValueError:
+        except TypeError:
             return False
     return False
 
@@ -562,7 +564,9 @@ def _infer_dtype_from_python_type(type: type) -> Dtype:
     if issubclass(type, datetime.time):
         return TIME_DTYPE
     else:
-        raise ValueError(f"No matching datatype for python type: {type}")
+        raise TypeError(
+            f"No matching datatype for python type: {type}. {constants.FEEDBACK_LINK}"
+        )
 
 
 def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]:
@@ -570,7 +574,7 @@ def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]:
         return BIGFRAMES_STRING_TO_BIGFRAMES[
             typing.cast(DtypeString, str(dtype_string))
         ]
-    raise ValueError(
+    raise TypeError(
         textwrap.dedent(
             f"""
                 Unexpected data type string {dtype_string}. The following
@@ -619,7 +623,7 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]:
     if from_python_type is not None:
         return from_python_type
     else:
-        raise ValueError(f"Unable to infer type for value: {literal}")
+        raise TypeError(f"Unable to infer type for value: {literal}")
 
 
 def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]:
@@ -659,7 +663,7 @@ def convert_schema_field(
             return field.name, pd.ArrowDtype(pa_type)
         return field.name, _TK_TO_BIGFRAMES[field.field_type]
     else:
-        raise ValueError(f"Cannot handle type: {field.field_type}")
+        raise TypeError(f"Cannot handle type: {field.field_type}")
 
 
 def convert_to_schema_field(
@@ -690,7 +694,7 @@ def convert_to_schema_field(
             return google.cloud.bigquery.SchemaField(
                 name, "RECORD", fields=inner_fields
             )
-    raise ValueError(
+    raise TypeError(
         f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
     )
 
diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py
index f6e8223aa0..75db2f48e9 100644
--- a/bigframes/operations/base.py
+++ b/bigframes/operations/base.py
@@ -87,7 +87,8 @@ def __init__(
                 if name is not None:
                     data.name = name
                 if dtype is not None:
-                    data = data.astype(dtype)
+                    bf_dtype = bigframes.dtypes.bigframes_type(dtype)
+                    data = data.astype(bf_dtype)
             else:  # local dict-like data
                 data = read_pandas_func(pd.Series(data, name=name, dtype=dtype))  # type: ignore
             data_block = data._block

From de2026c845c8cff4238d87173fdc5772e58bb70e Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Thu, 23 Jan 2025 22:07:07 +0000
Subject: [PATCH 05/10] allow astype with pyarrow type

---
 bigframes/dtypes.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 200bb1f7bf..fb17d4d54f 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -521,12 +521,15 @@ def arrow_type_to_literal(
 
 
 def bigframes_type(dtype) -> Dtype:
+    """Convert type object to canoncial bigframes dtype."""
     if _is_bigframes_dtype(dtype):
         return dtype
     elif isinstance(dtype, str):
         return _dtype_from_string(dtype)
     elif isinstance(dtype, type):
         return _infer_dtype_from_python_type(dtype)
+    elif isinstance(dtype, pa.DataType):
+        return arrow_dtype_to_bigframes_dtype(dtype)
     else:
         raise TypeError(
             f"Cannot infer supported datatype for: {dtype}. {constants.FEEDBACK_LINK}"

From 18d7d27e5e3ba2d534912b51229056acd7df70c0 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Fri, 24 Jan 2025 00:09:42 +0000
Subject: [PATCH 06/10] fix _is_bigframes_dtype for strings

---
 bigframes/dtypes.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 7a90471110..0a3e2fc875 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -538,7 +538,10 @@ def bigframes_type(dtype) -> Dtype:
 
 def _is_bigframes_dtype(dtype) -> bool:
     """True iff dtyps is a canonical bigframes dtype"""
-    if dtype in set(item.dtype for item in SIMPLE_TYPES):
+    # have to be quite strict, as pyarrow dtypes equal their string form, and we don't consider that a canonical form.
+    if (type(type), dtype) in set(
+        (type(item.dtype), item.dtype) for item in SIMPLE_TYPES
+    ):
         return True
     if isinstance(dtype, pd.ArrowDtype):
         try:

From 7dc19387c551c153d6256567dfd3f571796f0169 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Mon, 27 Jan 2025 22:05:04 +0000
Subject: [PATCH 07/10] fix typo in _is_bigframes_dtype

---
 bigframes/dtypes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 0a3e2fc875..2382861971 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -539,7 +539,7 @@ def bigframes_type(dtype) -> Dtype:
 def _is_bigframes_dtype(dtype) -> bool:
     """True iff dtyps is a canonical bigframes dtype"""
     # have to be quite strict, as pyarrow dtypes equal their string form, and we don't consider that a canonical form.
-    if (type(type), dtype) in set(
+    if (type(dtype), dtype) in set(
         (type(item.dtype), item.dtype) for item in SIMPLE_TYPES
     ):
         return True

From 489896f492f305451d16c65c467081623fa116e3 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Tue, 28 Jan 2025 05:03:57 +0000
Subject: [PATCH 08/10] normalize decimal arrow types

---
 bigframes/core/local_data.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py
index 573562cefa..8e00ec020e 100644
--- a/bigframes/core/local_data.py
+++ b/bigframes/core/local_data.py
@@ -59,6 +59,10 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType:
     if pa.types.is_time64(type):
         # This is potentially lossy, but BigFrames doesn't support ns
         return pa.time64("us")
+    if pa.types.is_decimal128(type):
+        return pa.decimal128(38, 9)
+    if pa.types.is_decimal256(type):
+        return pa.decimal256(76, 38)
     if pa.types.is_large_string(type):
         # simple string type can handle the largest strings needed
         return pa.string()

From bd7be357592b1c2886baa12d247eb0297e234df1 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Tue, 28 Jan 2025 20:27:19 +0000
Subject: [PATCH 09/10] handle type errors better, and handle cat type

---
 bigframes/core/local_data.py  | 2 ++
 bigframes/session/__init__.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py
index 8e00ec020e..f665948be2 100644
--- a/bigframes/core/local_data.py
+++ b/bigframes/core/local_data.py
@@ -63,6 +63,8 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType:
         return pa.decimal128(38, 9)
     if pa.types.is_decimal256(type):
         return pa.decimal256(76, 38)
+    if pa.types.is_dictionary(type):
+        return arrow_type_replacements(type.value_type)
     if pa.types.is_large_string(type):
         # simple string type can handle the largest strings needed
         return pa.string()
diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
index 89ea0eee69..686545bed3 100644
--- a/bigframes/session/__init__.py
+++ b/bigframes/session/__init__.py
@@ -806,6 +806,7 @@ def _read_pandas_inline(
             pa.ArrowInvalid,  # Thrown by arrow for unsupported types, such as geo.
             pa.ArrowTypeError,  # Thrown by arrow for types without mapping (geo).
             ValueError,  # Thrown by ibis for some unhandled types
+            TypeError,  # Thrown for types the local code path cannot handle
         ) as exc:
             if should_raise:
                 raise ValueError(

From e8687cdc62d977cd446ab25813bdea24c895c4fd Mon Sep 17 00:00:00 2001
From: Trevor Bergeron <tbergeron@google.com>
Date: Wed, 29 Jan 2025 21:22:27 +0000
Subject: [PATCH 10/10] reinstate unit tests

---
 bigframes/core/compile/ibis_types.py |  2 +-
 tests/unit/core/test_dtypes.py       | 40 ++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py
index 74de95d49d..8a55f6775d 100644
--- a/bigframes/core/compile/ibis_types.py
+++ b/bigframes/core/compile/ibis_types.py
@@ -247,7 +247,7 @@ def bigframes_dtype_to_ibis_dtype(
         return _arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype)
 
     else:
-        raise ValueError(f"Datatype has not ibis type mapping: {bigframes_dtype}")
+        raise ValueError(f"Datatype has no ibis type mapping: {bigframes_dtype}")
 
 
 def ibis_dtype_to_bigframes_dtype(
diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py
index 035af5d9f1..3d420de51f 100644
--- a/tests/unit/core/test_dtypes.py
+++ b/tests/unit/core/test_dtypes.py
@@ -16,6 +16,7 @@
 import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
 import bigframes_vendored.ibis.expr.types as ibis_types
 import geopandas as gpd  # type: ignore
+import numpy as np
 import pandas as pd
 import pyarrow as pa  # type: ignore
 import pytest
@@ -196,6 +197,45 @@ def test_bigframes_dtype_converts(ibis_dtype, bigframes_dtype):
     assert result == ibis_dtype
 
 
+@pytest.mark.parametrize(
+    ["bigframes_dtype_str", "ibis_dtype"],
+    [
+        # This test covers all dtypes that BigQuery DataFrames can exactly map to Ibis
+        ("boolean", ibis_dtypes.boolean),
+        ("date32[day][pyarrow]", ibis_dtypes.date),
+        ("timestamp[us][pyarrow]", ibis_dtypes.Timestamp()),
+        ("Float64", ibis_dtypes.float64),
+        ("Int64", ibis_dtypes.int64),
+        ("string[pyarrow]", ibis_dtypes.string),
+        ("time64[us][pyarrow]", ibis_dtypes.time),
+        (
+            "timestamp[us, tz=UTC][pyarrow]",
+            ibis_dtypes.Timestamp(timezone="UTC"),
+        ),
+        # Special case - "string" is acceptable for "string[pyarrow]"
+        ("string", ibis_dtypes.string),
+    ],
+)
+def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str):
+    """Test dtype strings convert to Ibis dtypes after bigframes_type normalization"""
+    result = bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(
+        bigframes.dtypes.bigframes_type(bigframes_dtype_str)
+    )
+    assert result == ibis_dtype
+
+
+def test_unsupported_dtype_raises_unexpected_datatype():
+    """Passing a numpy type directly should raise, as it has no Ibis type mapping"""
+    with pytest.raises(ValueError, match="Datatype has no ibis type mapping"):
+        bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(np.float32)
+
+
+def test_unsupported_dtype_str_raises_unexpected_datatype():
+    """Passing a raw dtype string should raise, as it has no Ibis type mapping"""
+    with pytest.raises(ValueError, match="Datatype has no ibis type mapping"):
+        bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype("int64")
+
+
 @pytest.mark.parametrize(
     ["literal", "ibis_scalar"],
     [