Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

feat: Support python type as astype arg #1316

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jan 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions 10 bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,7 @@ def split(
# Create an ordering col and convert to string
block, ordering_col = block.promote_offsets()
block, string_ordering_col = block.apply_unary_op(
ordering_col, ops.AsTypeOp(to_type="string[pyarrow]")
ordering_col, ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE)
)

# Apply hash method to sum col and order by it.
Expand Down Expand Up @@ -1479,7 +1479,9 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block:
expr, new_col = expr.project_to_id(
expression=ops.add_op.as_expr(
ex.const(prefix),
ops.AsTypeOp(to_type="string").as_expr(index_col),
ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr(
index_col
),
),
)
new_index_cols.append(new_col)
Expand All @@ -1502,7 +1504,9 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block:
for index_col in self._index_columns:
expr, new_col = expr.project_to_id(
expression=ops.add_op.as_expr(
ops.AsTypeOp(to_type="string").as_expr(index_col),
ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr(
index_col
),
ex.const(suffix),
),
)
Expand Down
32 changes: 3 additions & 29 deletions 32 bigframes/core/compile/ibis_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@
# limitations under the License.
from __future__ import annotations

import textwrap
import typing
from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union
from typing import cast, Dict, Iterable, Optional, Tuple, Union

import bigframes_vendored.constants as constants
import bigframes_vendored.ibis
Expand All @@ -28,7 +27,6 @@
import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import google.cloud.bigquery as bigquery
import numpy as np
import pandas as pd
import pyarrow as pa

Expand Down Expand Up @@ -228,9 +226,7 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value:


def bigframes_dtype_to_ibis_dtype(
bigframes_dtype: Union[
bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[Any]
]
bigframes_dtype: bigframes.dtypes.Dtype,
) -> ibis_dtypes.DataType:
"""Converts a BigQuery DataFrames supported dtype to an Ibis dtype.

Expand All @@ -244,36 +240,14 @@ def bigframes_dtype_to_ibis_dtype(
Raises:
ValueError: If passed a dtype not supported by BigQuery DataFrames.
"""
if str(bigframes_dtype) in bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES:
bigframes_dtype = bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[
cast(bigframes.dtypes.DtypeString, str(bigframes_dtype))
]

if bigframes_dtype in BIGFRAMES_TO_IBIS.keys():
return BIGFRAMES_TO_IBIS[bigframes_dtype]

elif isinstance(bigframes_dtype, pd.ArrowDtype) and bigframes_dtype.pyarrow_dtype:
return _arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype)

else:
raise ValueError(
textwrap.dedent(
f"""
Unexpected data type {bigframes_dtype}. The following
str dtypes are supppted: 'boolean','Float64','Int64',
'int64[pyarrow]','string','string[pyarrow]',
'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
'date32[day][pyarrow]','time64[us][pyarrow]'.
The following pandas.ExtensionDtype are supported:
pandas.BooleanDtype(), pandas.Float64Dtype(),
pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
pd.ArrowDtype(pa.timestamp("us")),
pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
{constants.FEEDBACK_LINK}
"""
)
)
raise ValueError(f"Datatype has no ibis type mapping: {bigframes_dtype}")


def ibis_dtype_to_bigframes_dtype(
Expand Down
6 changes: 4 additions & 2 deletions 6 bigframes/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ def __new__(
if name is not None:
index.name = name
if dtype is not None:
index = index.astype(dtype)
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
index = index.astype(bf_dtype)
block = index._block
elif isinstance(data, pandas.Index):
pd_df = pandas.DataFrame(index=data)
Expand Down Expand Up @@ -310,14 +311,15 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"):

def astype(
self,
dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype],
dtype,
*,
errors: Literal["raise", "null"] = "raise",
) -> Index:
if errors not in ["raise", "null"]:
raise ValueError("Argument 'errors' must be one of 'raise' or 'null'")
if self.nlevels > 1:
raise TypeError("Multiindex does not support 'astype'")
dtype = bigframes.dtypes.bigframes_type(dtype)
return self._apply_unary_expr(
ops.AsTypeOp(to_type=dtype, safe=(errors == "null")).as_expr(
ex.free_var("arg")
Expand Down
6 changes: 6 additions & 0 deletions 6 bigframes/core/local_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType:
if pa.types.is_time64(type):
# This is potentially lossy, but BigFrames doesn't support ns
return pa.time64("us")
if pa.types.is_decimal128(type):
return pa.decimal128(38, 9)
if pa.types.is_decimal256(type):
return pa.decimal256(76, 38)
if pa.types.is_dictionary(type):
return arrow_type_replacements(type.value_type)
if pa.types.is_large_string(type):
# simple string type can handle the largest strings needed
return pa.string()
Expand Down
18 changes: 6 additions & 12 deletions 18 bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,8 @@ def __init__(
if columns:
block = block.select_columns(list(columns)) # type:ignore
if dtype:
block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=dtype))
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
self._block = block

else:
Expand Down Expand Up @@ -368,6 +369,7 @@ def astype(
dtype: Union[
bigframes.dtypes.DtypeString,
bigframes.dtypes.Dtype,
type,
dict[str, Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype]],
],
*,
Expand All @@ -378,23 +380,15 @@ def astype(

safe_cast = errors == "null"

# Type strings check
if dtype in bigframes.dtypes.DTYPE_STRINGS:
return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))

# Type instances check
if type(dtype) in bigframes.dtypes.DTYPES:
return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))

if isinstance(dtype, dict):
result = self.copy()
for col, to_type in dtype.items():
result[col] = result[col].astype(to_type)
return result

raise TypeError(
f"Invalid type {type(dtype)} for dtype input. {constants.FEEDBACK_LINK}"
)
dtype = bigframes.dtypes.bigframes_type(dtype)

return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))

def _to_sql_query(
self, include_index: bool, enable_cache: bool = True
Expand Down
114 changes: 91 additions & 23 deletions 114 bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from dataclasses import dataclass
import datetime
import decimal
import textwrap
import typing
from typing import Any, Dict, List, Literal, Union

Expand Down Expand Up @@ -422,7 +423,7 @@ def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype:
return DEFAULT_DTYPE

# No other types matched.
raise ValueError(
raise TypeError(
f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}"
)

Expand All @@ -447,7 +448,7 @@ def bigframes_dtype_to_arrow_dtype(
if pa.types.is_struct(bigframes_dtype.pyarrow_dtype):
return bigframes_dtype.pyarrow_dtype
else:
raise ValueError(
raise TypeError(
f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
)

Expand All @@ -474,7 +475,7 @@ def bigframes_dtype_to_literal(
if isinstance(bigframes_dtype, gpd.array.GeometryDtype):
return shapely.Point((0, 0))

raise ValueError(
raise TypeError(
f"No literal conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
)

Expand Down Expand Up @@ -515,11 +516,91 @@ def arrow_type_to_literal(
if pa.types.is_time(arrow_type):
return datetime.time(1, 1, 1)

raise ValueError(
raise TypeError(
f"No literal conversion for {arrow_type}. {constants.FEEDBACK_LINK}"
)


def bigframes_type(dtype) -> Dtype:
    """Convert type object to canonical bigframes dtype.

    Accepts, in order of checking: an already-canonical bigframes dtype
    (returned unchanged), a dtype string, a python ``type`` object
    (e.g. ``int``, ``str``), or a pyarrow ``DataType``.

    Raises:
        TypeError: If the input cannot be mapped to a supported bigframes dtype.
    """
    if _is_bigframes_dtype(dtype):
        return dtype
    elif isinstance(dtype, str):
        return _dtype_from_string(dtype)
    elif isinstance(dtype, type):
        return _infer_dtype_from_python_type(dtype)
    elif isinstance(dtype, pa.DataType):
        return arrow_dtype_to_bigframes_dtype(dtype)
    else:
        raise TypeError(
            f"Cannot infer supported datatype for: {dtype}. {constants.FEEDBACK_LINK}"
        )


def _is_bigframes_dtype(dtype) -> bool:
    """True iff dtype is a canonical bigframes dtype."""
    # have to be quite strict, as pyarrow dtypes equal their string form, and we don't consider that a canonical form.
    # Comparing (type, value) pairs rather than values alone enforces that strictness.
    if (type(dtype), dtype) in set(
        (type(item.dtype), item.dtype) for item in SIMPLE_TYPES
    ):
        return True
    if isinstance(dtype, pd.ArrowDtype):
        # An ArrowDtype is canonical iff its pyarrow type has a bigframes mapping;
        # probe the converter and treat a TypeError as "not supported".
        try:
            _ = arrow_dtype_to_bigframes_dtype(dtype.pyarrow_dtype)
            return True
        except TypeError:
            return False
    return False


def _infer_dtype_from_python_type(type: type) -> Dtype:
    """Map a python builtin or numpy scalar type to the canonical bigframes dtype.

    Args:
        type: A python type such as ``int``, ``str``, ``datetime.date``, or a
            numpy scalar type such as ``np.int64``.

    Raises:
        TypeError: If the type has no bigframes dtype mapping.
    """
    # bool must be checked before int, as bool is a subclass of int.
    if issubclass(type, (bool, np.bool_)):
        return BOOL_DTYPE
    if issubclass(type, (int, np.integer)):
        return INT_DTYPE
    if issubclass(type, (float, np.floating)):
        return FLOAT_DTYPE
    if issubclass(type, decimal.Decimal):
        return NUMERIC_DTYPE
    if issubclass(type, (str, np.str_)):
        return STRING_DTYPE
    if issubclass(type, (bytes, np.bytes_)):
        return BYTES_DTYPE
    # Make sure to check datetime before date, as datetime is a subclass of
    # date and would otherwise be silently mapped to DATE_DTYPE.
    if issubclass(type, datetime.datetime):
        return DATETIME_DTYPE
    if issubclass(type, datetime.date):
        return DATE_DTYPE
    if issubclass(type, datetime.time):
        return TIME_DTYPE
    raise TypeError(
        f"No matching datatype for python type: {type}. {constants.FEEDBACK_LINK}"
    )


def _dtype_from_string(dtype_string: str) -> Dtype:
    """Convert a dtype string (e.g. "Int64", "string[pyarrow]") to the
    canonical bigframes dtype.

    Raises:
        TypeError: If the string does not name a supported dtype.
    """
    # Note: return type is Dtype (not Optional) — this function either
    # returns a mapping or raises; it never returns None.
    if dtype_string in BIGFRAMES_STRING_TO_BIGFRAMES:
        return BIGFRAMES_STRING_TO_BIGFRAMES[
            typing.cast(DtypeString, dtype_string)
        ]
    raise TypeError(
        textwrap.dedent(
            f"""
            Unexpected data type string {dtype_string}. The following
            dtypes are supported: 'boolean','Float64','Int64',
            'int64[pyarrow]','string','string[pyarrow]',
            'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
            'date32[day][pyarrow]','time64[us][pyarrow]'.
            The following pandas.ExtensionDtype are supported:
            pandas.BooleanDtype(), pandas.Float64Dtype(),
            pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
            pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
            pd.ArrowDtype(pa.timestamp("us")),
            pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
            {constants.FEEDBACK_LINK}
            """
        )
    )


def infer_literal_type(literal) -> typing.Optional[Dtype]:
# Maybe also normalize literal to canonical python representation to remove this burden from compilers?
if pd.api.types.is_list_like(literal):
Expand All @@ -539,30 +620,17 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]:
return pd.ArrowDtype(pa.struct(fields))
if pd.isna(literal):
return None # Null value without a definite type
if isinstance(literal, (bool, np.bool_)):
return BOOL_DTYPE
if isinstance(literal, (int, np.integer)):
return INT_DTYPE
if isinstance(literal, (float, np.floating)):
return FLOAT_DTYPE
if isinstance(literal, decimal.Decimal):
return NUMERIC_DTYPE
if isinstance(literal, (str, np.str_)):
return STRING_DTYPE
if isinstance(literal, (bytes, np.bytes_)):
return BYTES_DTYPE
# Make sure to check datetime before date as datetimes are also dates
if isinstance(literal, (datetime.datetime, pd.Timestamp)):
if literal.tzinfo is not None:
return TIMESTAMP_DTYPE
else:
return DATETIME_DTYPE
if isinstance(literal, datetime.date):
return DATE_DTYPE
if isinstance(literal, datetime.time):
return TIME_DTYPE
from_python_type = _infer_dtype_from_python_type(type(literal))
if from_python_type is not None:
return from_python_type
else:
raise ValueError(f"Unable to infer type for value: {literal}")
raise TypeError(f"Unable to infer type for value: {literal}")


def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]:
Expand Down Expand Up @@ -602,7 +670,7 @@ def convert_schema_field(
return field.name, pd.ArrowDtype(pa_type)
return field.name, _TK_TO_BIGFRAMES[field.field_type]
else:
raise ValueError(f"Cannot handle type: {field.field_type}")
raise TypeError(f"Cannot handle type: {field.field_type}")


def convert_to_schema_field(
Expand Down Expand Up @@ -636,7 +704,7 @@ def convert_to_schema_field(
if bigframes_dtype.pyarrow_dtype == pa.duration("us"):
# Timedeltas are represented as integers in microseconds.
return google.cloud.bigquery.SchemaField(name, "INTEGER")
raise ValueError(
raise TypeError(
f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
)

Expand Down
3 changes: 2 additions & 1 deletion 3 bigframes/operations/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ def __init__(
if name is not None:
data.name = name
if dtype is not None:
data = data.astype(dtype)
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
data = data.astype(bf_dtype)
else: # local dict-like data
data = read_pandas_func(pd.Series(data, name=name, dtype=dtype)) # type: ignore
data_block = data._block
Expand Down
Loading
Morty Proxy This is a proxified and sanitized view of the page, visit original site.