Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

feat: Add pivot_table for DataFrame. #473

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We鈥檒l occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions 60 bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2132,6 +2132,66 @@ def pivot(
) -> DataFrame:
return self._pivot(columns=columns, index=index, values=values)

def pivot_table(
self,
values: typing.Optional[
typing.Union[blocks.Label, Sequence[blocks.Label]]
] = None,
index: typing.Optional[
typing.Union[blocks.Label, Sequence[blocks.Label]]
] = None,
columns: typing.Union[blocks.Label, Sequence[blocks.Label]] = None,
aggfunc: str = "mean",
) -> DataFrame:
if isinstance(index, Iterable) and not (
isinstance(index, blocks.Label) and index in self.columns
):
index = list(index)
else:
index = [index]

if isinstance(columns, Iterable) and not (
isinstance(columns, blocks.Label) and columns in self.columns
):
columns = list(columns)
else:
columns = [columns]

if isinstance(values, Iterable) and not (
isinstance(values, blocks.Label) and values in self.columns
):
values = list(values)
else:
values = [values]

# Unlike pivot, pivot_table has values always ordered.
values.sort()

keys = index + columns
agged = self.groupby(keys, dropna=True)[values].agg(aggfunc)

if isinstance(agged, bigframes.series.Series):
agged = agged.to_frame()

agged = agged.dropna(how="all")

if len(values) == 1:
agged = agged.rename(columns={agged.columns[0]: values[0]})

agged = agged.reset_index()

pivoted = agged.pivot(
columns=columns,
index=index,
values=values if len(values) > 1 else None,
).sort_index()

# TODO: Remove the reordering step once the issue is resolved.
# The pivot_table method results in multi-index columns that are always ordered.
# However, the order of the pivoted result columns is not guaranteed to be sorted.
# Sort and reorder.
return pivoted[pivoted.columns.sort_values()]

def stack(self, level: LevelsType = -1):
if not isinstance(self.columns, pandas.MultiIndex):
if level not in [0, -1, self.columns.name]:
Expand Down
28 changes: 28 additions & 0 deletions 28 tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2606,6 +2606,34 @@ def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns):
pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)


@pytest.mark.parametrize(
("values", "index", "columns", "aggfunc"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we also test for the default values of these params?

[
(("culmen_length_mm", "body_mass_g"), "species", "sex", "std"),
(["body_mass_g", "culmen_length_mm"], ("species", "island"), "sex", "sum"),
("body_mass_g", "sex", ["island", "species"], "mean"),
("culmen_depth_mm", "island", "species", "max"),
],
)
def test_df_pivot_table(
penguins_df_default_index,
penguins_pandas_df_default_index,
values,
index,
columns,
aggfunc,
):
bf_result = penguins_df_default_index.pivot_table(
values=values, index=index, columns=columns, aggfunc=aggfunc
).to_pandas()
pd_result = penguins_pandas_df_default_index.pivot_table(
values=values, index=index, columns=columns, aggfunc=aggfunc
)
pd.testing.assert_frame_equal(
bf_result, pd_result, check_dtype=False, check_column_type=False
)


def test_ipython_key_completions_with_drop(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_names = "string_col"
Expand Down
82 changes: 82 additions & 0 deletions 82 third_party/bigframes_vendored/pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4711,6 +4711,88 @@ def pivot(self, *, columns, index=None, values=None):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def pivot_table(self, values=None, index=None, columns=None, aggfunc="mean"):
"""
Create a spreadsheet-style pivot table as a DataFrame.

The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes)
on the index and columns of the result DataFrame.

**Examples:**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None

>>> df = bpd.DataFrame({
... 'Product': ['Product A', 'Product B', 'Product A', 'Product B', 'Product A', 'Product B'],
... 'Region': ['East', 'West', 'East', 'West', 'West', 'East'],
... 'Sales': [100, 200, 150, 100, 200, 150],
... 'Rating': [3, 5, 4, 3, 3, 5]
... })
>>> df
Product Region Sales Rating
0 Product A East 100 3
1 Product B West 200 5
2 Product A East 150 4
3 Product B West 100 3
4 Product A West 200 3
5 Product B East 150 5
<BLANKLINE>
[6 rows x 4 columns]

Using `pivot_table` with default aggfunc "mean":

>>> pivot_table = df.pivot_table(
... values=['Sales', 'Rating'],
... index='Product',
... columns='Region'
... )
>>> pivot_table
Rating Sales
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#[nit] is the indentation alright here, also below?

Region East West East West
Product
Product A 3.5 3.0 125.0 200.0
Product B 5.0 4.0 150.0 150.0
<BLANKLINE>
[2 rows x 4 columns]

Using `pivot_table` with specified aggfunc "max":

>>> pivot_table = df.pivot_table(
... values=['Sales', 'Rating'],
... index='Product',
... columns='Region',
... aggfunc="max"
... )
>>> pivot_table
Rating Sales
Region East West East West
Product
Product A 4 3 150 200
Product B 5 5 150 200
<BLANKLINE>
[2 rows x 4 columns]

Args:
values (str, object or a list of the previous, optional):
Column(s) to use for populating new frame's values. If not
specified, all remaining columns will be used and the result will
have hierarchically indexed columns.

index (str or object or a list of str, optional):
Column to use to make new frame's index. If not given, uses existing index.

columns (str or object or a list of str):
Column to use to make new frame's columns.

aggfunc (str, default "mean"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nit] no newline like after other params

Aggregation function name to compute summary statistics (e.g., 'sum', 'mean').

Returns:
DataFrame: An Excel style pivot table.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def stack(self, level=-1):
"""
Stack the prescribed level(s) from columns to index.
Expand Down
Morty Proxy This is a proxified and sanitized view of the page, visit original site.