diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 4e175ce31c..614a2fb919 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -517,6 +517,17 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: ) return DataFrame(self._block.select_columns(selected_columns)) + def _select_exact_dtypes( + self, dtypes: Sequence[bigframes.dtypes.Dtype] + ) -> DataFrame: + """Selects columns without considering inheritance relationships.""" + columns = [ + col_id + for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) + if dtype in dtypes + ] + return DataFrame(self._block.select_columns(columns)) + def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]): self._query_job = query_job @@ -2437,13 +2448,9 @@ def agg( aggregations = [agg_ops.lookup_agg_func(f) for f in func] for dtype, agg in itertools.product(self.dtypes, aggregations): - if not bigframes.operations.aggregations.is_agg_op_supported( - dtype, agg - ): - raise NotImplementedError( - f"Type {dtype} does not support aggregation {agg}. " - f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}" - ) + agg.output_type( + dtype + ) # Raises exception if the agg does not support the dtype. return DataFrame( self._block.summarize( @@ -2512,7 +2519,10 @@ def melt( def describe(self, include: None | Literal["all"] = None) -> DataFrame: if include is None: - numeric_df = self._drop_non_numeric(permissive=False) + numeric_df = self._select_exact_dtypes( + bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + + bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES + ) if len(numeric_df.columns) == 0: # Describe eligible non-numeric columns return self._describe_non_numeric() @@ -2540,9 +2550,11 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame: raise ValueError(f"Unsupported include type: {include}") def _describe_numeric(self) -> DataFrame: - return typing.cast( + number_df_result = typing.cast( DataFrame, - self._drop_non_numeric(permissive=False).agg( + self._select_exact_dtypes( + bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + ).agg( [ "count", "mean", @@ -2555,16 +2567,41 @@ def _describe_numeric(self) -> DataFrame: ] ), ) + temporal_df_result = typing.cast( + DataFrame, + self._select_exact_dtypes( + bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES + ).agg(["count"]), + ) + + if len(number_df_result.columns) == 0: + return temporal_df_result + elif len(temporal_df_result.columns) == 0: + return number_df_result + else: + import bigframes.core.reshape.api as rs + + original_columns = self._select_exact_dtypes( + bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + + bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES + ).columns + + # Use reindex after join to preserve the original column order. + return rs.concat( + [number_df_result, temporal_df_result], + axis=1, + )._reindex_columns(original_columns) def _describe_non_numeric(self) -> DataFrame: return typing.cast( DataFrame, - self.select_dtypes( - include={ + self._select_exact_dtypes( + [ bigframes.dtypes.STRING_DTYPE, bigframes.dtypes.BOOL_DTYPE, bigframes.dtypes.BYTES_DTYPE, - } + bigframes.dtypes.TIME_DTYPE, + ] ).agg(["count", "nunique"]), ) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index c71531f9f3..69faa056fe 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -18,7 +18,7 @@ import datetime import decimal import typing -from typing import Dict, Literal, Union +from typing import Dict, List, Literal, Union import bigframes_vendored.constants as constants import geopandas as gpd # type: ignore @@ -211,7 +211,7 @@ class SimpleDtypeInfo: # Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation) # Pandas is inconsistent, so two definitions are provided, each used in different contexts -NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [ +NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE: List[Dtype] = [ FLOAT_DTYPE, INT_DTYPE, ] @@ -222,7 +222,16 @@ class SimpleDtypeInfo: ] -## dtype predicates - use these to maintain consistency +# Temporal types that are considered as "numeric" by Pandas +TEMPORAL_NUMERIC_BIGFRAMES_TYPES: List[Dtype] = [ + DATE_DTYPE, + TIMESTAMP_DTYPE, + DATETIME_DTYPE, +] +TEMPORAL_BIGFRAMES_TYPES = TEMPORAL_NUMERIC_BIGFRAMES_TYPES + [TIME_DTYPE] + + +# dtype predicates - use these to maintain consistency def is_datetime_like(type_: ExpressionType) -> bool: return type_ in (DATETIME_DTYPE, TIMESTAMP_DTYPE) @@ -630,7 +639,7 @@ def can_coerce(source_type: ExpressionType, target_type: ExpressionType) -> bool return True # None can be coerced to any supported type else: return (source_type == STRING_DTYPE) and ( - target_type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE, DATE_DTYPE) + target_type in TEMPORAL_BIGFRAMES_TYPES ) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 3e4e9d1df1..6b7f56d708 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -579,14 +579,3 @@ def lookup_agg_func(key: str) -> typing.Union[UnaryAggregateOp, NullaryAggregate return _AGGREGATIONS_LOOKUP[key] else: raise ValueError(f"Unrecognize aggregate function: {key}") - - -def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool: - if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: - return True - - if dtype in (dtypes.STRING_DTYPE, dtypes.BOOL_DTYPE, dtypes.BYTES_DTYPE): - return isinstance(op, (CountOp, NuniqueOp)) - - # For all other types, support no aggregation - return False diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index eacec66b2f..bbcec90ea8 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2671,11 +2671,11 @@ def test_dataframe_agg_int_multi_string(scalars_dfs): @skip_legacy_pandas -def test_df_describe(scalars_dfs): +def test_df_describe_non_temporal(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - # pyarrows time columns fail in pandas + # excluding temporal columns here because BigFrames cannot perform percentiles operations on them unsupported_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"] - bf_result = scalars_df.describe().to_pandas() + bf_result = scalars_df.drop(columns=unsupported_columns).describe().to_pandas() modified_pd_df = scalars_pandas_df.drop(columns=unsupported_columns) pd_result = modified_pd_df.describe() @@ -2709,12 +2709,14 @@ def test_df_describe(scalars_dfs): def test_df_describe_non_numeric(scalars_dfs, include): scalars_df, scalars_pandas_df = scalars_dfs - non_numeric_columns = ["string_col", "bytes_col", "bool_col"] + # Excluding "date_col" here because in BigFrames it is used as PyArrow[date32()], which is + # considered numerical in Pandas + target_columns = ["string_col", "bytes_col", "bool_col", "time_col"] - modified_bf = scalars_df[non_numeric_columns] + modified_bf = scalars_df[target_columns] bf_result = modified_bf.describe(include=include).to_pandas() - modified_pd_df = scalars_pandas_df[non_numeric_columns] + modified_pd_df = scalars_pandas_df[target_columns] pd_result = modified_pd_df.describe(include=include) # Reindex results with the specified keys and their order, because @@ -2726,8 +2728,35 @@ def test_df_describe_non_numeric(scalars_dfs, include): ).rename(index={"unique": "nunique"}) pd.testing.assert_frame_equal( - pd_result[non_numeric_columns].astype("Int64"), - bf_result[non_numeric_columns], + pd_result.astype("Int64"), + bf_result, + check_index_type=False, + ) + + +@skip_legacy_pandas +def test_df_describe_temporal(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + temporal_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"] + + modified_bf = scalars_df[temporal_columns] + bf_result = modified_bf.describe(include="all").to_pandas() + + modified_pd_df = scalars_pandas_df[temporal_columns] + pd_result = modified_pd_df.describe(include="all") + + # Reindex results with the specified keys and their order, because + # the relative order is not important. + bf_result = bf_result.reindex(["count", "nunique"]) + pd_result = pd_result.reindex( + ["count", "unique"] + # BF counter part of "unique" is called "nunique" + ).rename(index={"unique": "nunique"}) + + pd.testing.assert_frame_equal( + pd_result.astype("Float64"), + bf_result.astype("Float64"), check_index_type=False, ) diff --git a/tests/unit/operations/test_aggregations.py b/tests/unit/operations/test_aggregations.py deleted file mode 100644 index 68ad48ac29..0000000000 --- a/tests/unit/operations/test_aggregations.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -import bigframes.dtypes as dtypes -from bigframes.operations.aggregations import ( - all_op, - any_op, - count_op, - dense_rank_op, - first_op, - is_agg_op_supported, - max_op, - mean_op, - median_op, - min_op, - nunique_op, - product_op, - rank_op, - size_op, - std_op, - sum_op, - var_op, -) - -_ALL_OPS = set( - [ - size_op, - sum_op, - mean_op, - median_op, - product_op, - max_op, - min_op, - std_op, - var_op, - count_op, - nunique_op, - rank_op, - dense_rank_op, - all_op, - any_op, - first_op, - ] -) - - -@pytest.mark.parametrize("dtype", dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) -@pytest.mark.parametrize("op", _ALL_OPS) -def test_is_agg_op_supported_numeric_support_all(dtype, op): - assert is_agg_op_supported(dtype, op) is True - - -@pytest.mark.parametrize( - ("dtype", "supported_ops"), - [ - (dtypes.STRING_DTYPE, {count_op, nunique_op}), - (dtypes.BYTES_DTYPE, {count_op, nunique_op}), - (dtypes.DATE_DTYPE, set()), - (dtypes.TIME_DTYPE, set()), - (dtypes.DATETIME_DTYPE, set()), - (dtypes.TIMESTAMP_DTYPE, set()), - (dtypes.GEO_DTYPE, set()), - ], -) -def test_is_agg_op_supported_non_numeric(dtype, supported_ops): - for op in supported_ops: - assert is_agg_op_supported(dtype, op) is True - - for op in _ALL_OPS - supported_ops: - assert is_agg_op_supported(dtype, op) is False