Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

feat: Add support for temporal types in dataframe's describe() method #1189

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 50 additions & 13 deletions 63 bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,17 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
)
return DataFrame(self._block.select_columns(selected_columns))

def _select_exact_dtypes(
    self, dtypes: Sequence[bigframes.dtypes.Dtype]
) -> DataFrame:
    """Returns a DataFrame keeping only columns whose dtype is an exact
    member of ``dtypes`` (no subtype/inheritance matching)."""
    matched_ids = []
    for col_id, col_dtype in zip(self._block.value_columns, self._block.dtypes):
        if col_dtype in dtypes:
            matched_ids.append(col_id)
    return DataFrame(self._block.select_columns(matched_ids))

def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]):
self._query_job = query_job

Expand Down Expand Up @@ -2437,13 +2448,9 @@ def agg(
aggregations = [agg_ops.lookup_agg_func(f) for f in func]

for dtype, agg in itertools.product(self.dtypes, aggregations):
if not bigframes.operations.aggregations.is_agg_op_supported(
dtype, agg
):
raise NotImplementedError(
f"Type {dtype} does not support aggregation {agg}. "
f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}"
)
agg.output_type(
dtype
) # Raises exception if the agg does not support the dtype.

return DataFrame(
self._block.summarize(
Expand Down Expand Up @@ -2512,7 +2519,10 @@ def melt(

def describe(self, include: None | Literal["all"] = None) -> DataFrame:
if include is None:
numeric_df = self._drop_non_numeric(permissive=False)
numeric_df = self._select_exact_dtypes(
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+ bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
)
if len(numeric_df.columns) == 0:
# Describe eligible non-numeric columns
return self._describe_non_numeric()
Expand Down Expand Up @@ -2540,9 +2550,11 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame:
raise ValueError(f"Unsupported include type: {include}")

def _describe_numeric(self) -> DataFrame:
return typing.cast(
number_df_result = typing.cast(
DataFrame,
self._drop_non_numeric(permissive=False).agg(
self._select_exact_dtypes(
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
).agg(
[
"count",
"mean",
Expand All @@ -2555,16 +2567,41 @@ def _describe_numeric(self) -> DataFrame:
]
),
)
temporal_df_result = typing.cast(
DataFrame,
self._select_exact_dtypes(
bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
).agg(["count"]),
)

if len(number_df_result.columns) == 0:
return temporal_df_result
elif len(temporal_df_result.columns) == 0:
return number_df_result
else:
import bigframes.core.reshape.api as rs

original_columns = self._select_exact_dtypes(
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+ bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
).columns

# Use reindex after join to preserve the original column order.
return rs.concat(
[number_df_result, temporal_df_result],
axis=1,
)._reindex_columns(original_columns)

def _describe_non_numeric(self) -> DataFrame:
return typing.cast(
DataFrame,
self.select_dtypes(
include={
self._select_exact_dtypes(
[
bigframes.dtypes.STRING_DTYPE,
bigframes.dtypes.BOOL_DTYPE,
bigframes.dtypes.BYTES_DTYPE,
}
bigframes.dtypes.TIME_DTYPE,
]
).agg(["count", "nunique"]),
)

Expand Down
17 changes: 13 additions & 4 deletions 17 bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import datetime
import decimal
import typing
from typing import Dict, Literal, Union
from typing import Dict, List, Literal, Union

import bigframes_vendored.constants as constants
import geopandas as gpd # type: ignore
Expand Down Expand Up @@ -211,7 +211,7 @@ class SimpleDtypeInfo:

# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation)
# Pandas is inconsistent, so two definitions are provided, each used in different contexts
NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [
NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE: List[Dtype] = [
FLOAT_DTYPE,
INT_DTYPE,
]
Expand All @@ -222,7 +222,16 @@ class SimpleDtypeInfo:
]


## dtype predicates - use these to maintain consistency
# Temporal types that are considered as "numeric" by Pandas
TEMPORAL_NUMERIC_BIGFRAMES_TYPES: List[Dtype] = [
DATE_DTYPE,
TIMESTAMP_DTYPE,
DATETIME_DTYPE,
]
TEMPORAL_BIGFRAMES_TYPES = TEMPORAL_NUMERIC_BIGFRAMES_TYPES + [TIME_DTYPE]


# dtype predicates - use these to maintain consistency
def is_datetime_like(type_: ExpressionType) -> bool:
    """Whether ``type_`` is one of the datetime-like dtypes."""
    return type_ == DATETIME_DTYPE or type_ == TIMESTAMP_DTYPE

Expand Down Expand Up @@ -630,7 +639,7 @@ def can_coerce(source_type: ExpressionType, target_type: ExpressionType) -> bool
return True # None can be coerced to any supported type
else:
return (source_type == STRING_DTYPE) and (
target_type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE, DATE_DTYPE)
target_type in TEMPORAL_BIGFRAMES_TYPES
)


Expand Down
11 changes: 0 additions & 11 deletions 11 bigframes/operations/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,14 +579,3 @@ def lookup_agg_func(key: str) -> typing.Union[UnaryAggregateOp, NullaryAggregate
return _AGGREGATIONS_LOOKUP[key]
else:
raise ValueError(f"Unrecognize aggregate function: {key}")


def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
    """Whether aggregation ``op`` is supported for columns of ``dtype``."""
    # Permissive-numeric columns support every aggregation.
    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
        return True

    # String/bool/bytes columns only support counting-style aggregations.
    counting_only_dtypes = (dtypes.STRING_DTYPE, dtypes.BOOL_DTYPE, dtypes.BYTES_DTYPE)
    if dtype in counting_only_dtypes:
        return isinstance(op, (CountOp, NuniqueOp))

    # Any remaining dtype supports no aggregation at all.
    return False
45 changes: 37 additions & 8 deletions 45 tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2671,11 +2671,11 @@ def test_dataframe_agg_int_multi_string(scalars_dfs):


@skip_legacy_pandas
def test_df_describe(scalars_dfs):
def test_df_describe_non_temporal(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
# pyarrows time columns fail in pandas
# excluding temporal columns here because BigFrames cannot perform percentiles operations on them
unsupported_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"]
bf_result = scalars_df.describe().to_pandas()
bf_result = scalars_df.drop(columns=unsupported_columns).describe().to_pandas()

modified_pd_df = scalars_pandas_df.drop(columns=unsupported_columns)
pd_result = modified_pd_df.describe()
Expand Down Expand Up @@ -2709,12 +2709,14 @@ def test_df_describe(scalars_dfs):
def test_df_describe_non_numeric(scalars_dfs, include):
scalars_df, scalars_pandas_df = scalars_dfs

non_numeric_columns = ["string_col", "bytes_col", "bool_col"]
# Excluding "date_col" here because in BigFrames it is used as PyArrow[date32()], which is
# considered numerical in Pandas
target_columns = ["string_col", "bytes_col", "bool_col", "time_col"]

modified_bf = scalars_df[non_numeric_columns]
modified_bf = scalars_df[target_columns]
bf_result = modified_bf.describe(include=include).to_pandas()

modified_pd_df = scalars_pandas_df[non_numeric_columns]
modified_pd_df = scalars_pandas_df[target_columns]
pd_result = modified_pd_df.describe(include=include)

# Reindex results with the specified keys and their order, because
Expand All @@ -2726,8 +2728,35 @@ def test_df_describe_non_numeric(scalars_dfs, include):
).rename(index={"unique": "nunique"})

pd.testing.assert_frame_equal(
pd_result[non_numeric_columns].astype("Int64"),
bf_result[non_numeric_columns],
pd_result.astype("Int64"),
bf_result,
check_index_type=False,
)


@skip_legacy_pandas
def test_df_describe_temporal(scalars_dfs):
    """describe(include="all") over temporal columns matches pandas on the
    count/nunique statistics."""
    scalars_df, scalars_pandas_df = scalars_dfs

    temporal_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"]

    bf_result = scalars_df[temporal_columns].describe(include="all").to_pandas()
    pd_result = scalars_pandas_df[temporal_columns].describe(include="all")

    # Restrict both frames to the shared statistics; the relative row
    # order is not important so reindex with an explicit key order.
    bf_result = bf_result.reindex(["count", "nunique"])
    # BigFrames' counterpart of pandas' "unique" row is called "nunique".
    pd_result = pd_result.reindex(["count", "unique"]).rename(
        index={"unique": "nunique"}
    )

    pd.testing.assert_frame_equal(
        pd_result.astype("Float64"),
        bf_result.astype("Float64"),
        check_index_type=False,
    )

Expand Down
83 changes: 0 additions & 83 deletions 83 tests/unit/operations/test_aggregations.py

This file was deleted.

Loading
Morty Proxy This is a proxified and sanitized view of the page, visit original site.