Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

feat: Add support for temporal types in dataframe's describe() method #1189

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 50 additions & 13 deletions 63 bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,17 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
)
return DataFrame(self._block.select_columns(selected_columns))

def _select_exact_dtypes(
    self, dtypes: Sequence[bigframes.dtypes.Dtype]
) -> DataFrame:
    """Returns a DataFrame keeping only columns whose dtype is an exact
    member of ``dtypes`` (no subtype/inheritance matching)."""
    matched_ids = []
    for col_id, col_dtype in zip(self._block.value_columns, self._block.dtypes):
        if col_dtype in dtypes:
            matched_ids.append(col_id)
    return DataFrame(self._block.select_columns(matched_ids))

def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]):
self._query_job = query_job

Expand Down Expand Up @@ -2437,13 +2448,9 @@ def agg(
aggregations = [agg_ops.lookup_agg_func(f) for f in func]

for dtype, agg in itertools.product(self.dtypes, aggregations):
if not bigframes.operations.aggregations.is_agg_op_supported(
dtype, agg
):
raise NotImplementedError(
f"Type {dtype} does not support aggregation {agg}. "
f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}"
)
agg.output_type(
dtype
) # Raises exception if the agg does not support the dtype.

return DataFrame(
self._block.summarize(
Expand Down Expand Up @@ -2512,7 +2519,10 @@ def melt(

def describe(self, include: None | Literal["all"] = None) -> DataFrame:
if include is None:
numeric_df = self._drop_non_numeric(permissive=False)
numeric_df = self._select_exact_dtypes(
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+ bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
)
if len(numeric_df.columns) == 0:
# Describe eligible non-numeric columns
return self._describe_non_numeric()
Expand Down Expand Up @@ -2540,9 +2550,11 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame:
raise ValueError(f"Unsupported include type: {include}")

def _describe_numeric(self) -> DataFrame:
return typing.cast(
number_df_result = typing.cast(
DataFrame,
self._drop_non_numeric(permissive=False).agg(
self._select_exact_dtypes(
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
).agg(
[
"count",
"mean",
Expand All @@ -2555,16 +2567,41 @@ def _describe_numeric(self) -> DataFrame:
]
),
)
temporal_df_result = typing.cast(
DataFrame,
self._select_exact_dtypes(
bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
).agg(["count"]),
)

if len(number_df_result.columns) == 0:
return temporal_df_result
elif len(temporal_df_result.columns) == 0:
return number_df_result
else:
import bigframes.core.reshape.api as rs

original_columns = self._select_exact_dtypes(
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+ bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
).columns

# Use reindex after join to preserve the original column order.
return rs.concat(
[number_df_result, temporal_df_result],
axis=1,
)._reindex_columns(original_columns)

def _describe_non_numeric(self) -> DataFrame:
return typing.cast(
DataFrame,
self.select_dtypes(
include={
self._select_exact_dtypes(
[
bigframes.dtypes.STRING_DTYPE,
bigframes.dtypes.BOOL_DTYPE,
bigframes.dtypes.BYTES_DTYPE,
}
bigframes.dtypes.TIME_DTYPE,
]
).agg(["count", "nunique"]),
)

Expand Down
17 changes: 13 additions & 4 deletions 17 bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import datetime
import decimal
import typing
from typing import Dict, Literal, Union
from typing import Dict, List, Literal, Union

import bigframes_vendored.constants as constants
import geopandas as gpd # type: ignore
Expand Down Expand Up @@ -211,7 +211,7 @@ class SimpleDtypeInfo:

# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation)
# Pandas is inconsistent, so two definitions are provided, each used in different contexts
NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [
NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE: List[Dtype] = [
FLOAT_DTYPE,
INT_DTYPE,
]
Expand All @@ -222,7 +222,16 @@ class SimpleDtypeInfo:
]


## dtype predicates - use these to maintain consistency
# Temporal types that are considered as "numeric" by Pandas
TEMPORAL_NUMERIC_BIGFRAMES_TYPES: List[Dtype] = [
DATE_DTYPE,
TIMESTAMP_DTYPE,
DATETIME_DTYPE,
]
TEMPORAL_BIGFRAMES_TYPES = TEMPORAL_NUMERIC_BIGFRAMES_TYPES + [TIME_DTYPE]


# dtype predicates - use these to maintain consistency
def is_datetime_like(type_: ExpressionType) -> bool:
    """Whether ``type_`` is one of the datetime-like dtypes."""
    return type_ == DATETIME_DTYPE or type_ == TIMESTAMP_DTYPE

Expand Down Expand Up @@ -630,7 +639,7 @@ def can_coerce(source_type: ExpressionType, target_type: ExpressionType) -> bool
return True # None can be coerced to any supported type
else:
return (source_type == STRING_DTYPE) and (
target_type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE, DATE_DTYPE)
target_type in TEMPORAL_BIGFRAMES_TYPES
)


Expand Down
11 changes: 0 additions & 11 deletions 11 bigframes/operations/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,14 +579,3 @@ def lookup_agg_func(key: str) -> typing.Union[UnaryAggregateOp, NullaryAggregate
return _AGGREGATIONS_LOOKUP[key]
else:
raise ValueError(f"Unrecognize aggregate function: {key}")


def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
    """Whether aggregation ``op`` is supported for columns of ``dtype``."""
    # Permissive-numeric columns support every aggregation.
    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
        return True

    # String/bool/bytes columns only support counting-style aggregations.
    counting_only_dtypes = (dtypes.STRING_DTYPE, dtypes.BOOL_DTYPE, dtypes.BYTES_DTYPE)
    if dtype in counting_only_dtypes:
        return isinstance(op, (CountOp, NuniqueOp))

    # Any remaining dtype supports no aggregation at all.
    return False
45 changes: 37 additions & 8 deletions 45 tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2671,11 +2671,11 @@ def test_dataframe_agg_int_multi_string(scalars_dfs):


@skip_legacy_pandas
def test_df_describe(scalars_dfs):
def test_df_describe_non_temporal(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
# pyarrows time columns fail in pandas
# excluding temporal columns here because BigFrames cannot perform percentiles operations on them
unsupported_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"]
bf_result = scalars_df.describe().to_pandas()
bf_result = scalars_df.drop(columns=unsupported_columns).describe().to_pandas()

modified_pd_df = scalars_pandas_df.drop(columns=unsupported_columns)
pd_result = modified_pd_df.describe()
Expand Down Expand Up @@ -2709,12 +2709,14 @@ def test_df_describe(scalars_dfs):
def test_df_describe_non_numeric(scalars_dfs, include):
scalars_df, scalars_pandas_df = scalars_dfs

non_numeric_columns = ["string_col", "bytes_col", "bool_col"]
# Excluding "date_col" here because in BigFrames it is used as PyArrow[date32()], which is
# considered numerical in Pandas
target_columns = ["string_col", "bytes_col", "bool_col", "time_col"]

modified_bf = scalars_df[non_numeric_columns]
modified_bf = scalars_df[target_columns]
bf_result = modified_bf.describe(include=include).to_pandas()

modified_pd_df = scalars_pandas_df[non_numeric_columns]
modified_pd_df = scalars_pandas_df[target_columns]
pd_result = modified_pd_df.describe(include=include)

# Reindex results with the specified keys and their order, because
Expand All @@ -2726,8 +2728,35 @@ def test_df_describe_non_numeric(scalars_dfs, include):
).rename(index={"unique": "nunique"})

pd.testing.assert_frame_equal(
pd_result[non_numeric_columns].astype("Int64"),
bf_result[non_numeric_columns],
pd_result.astype("Int64"),
bf_result,
check_index_type=False,
)


@skip_legacy_pandas
def test_df_describe_temporal(scalars_dfs):
    """describe(include="all") over temporal columns matches pandas on the
    count/nunique statistics."""
    scalars_df, scalars_pandas_df = scalars_dfs

    temporal_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"]

    bf_result = scalars_df[temporal_columns].describe(include="all").to_pandas()
    pd_result = scalars_pandas_df[temporal_columns].describe(include="all")

    # Restrict both frames to the shared statistics; the relative row
    # order is not important so reindex with an explicit key order.
    bf_result = bf_result.reindex(["count", "nunique"])
    # BigFrames' counterpart of pandas' "unique" row is called "nunique".
    pd_result = pd_result.reindex(["count", "unique"]).rename(
        index={"unique": "nunique"}
    )

    pd.testing.assert_frame_equal(
        pd_result.astype("Float64"),
        bf_result.astype("Float64"),
        check_index_type=False,
    )

Expand Down
83 changes: 0 additions & 83 deletions 83 tests/unit/operations/test_aggregations.py

This file was deleted.

Loading
Morty Proxy This is a proxified and sanitized view of the page, visit original site.