Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

feat: add unstack to series, add level param #115

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We'll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions 26 bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@
_MONOTONIC_DECREASING = "monotonic_decreasing"


# A single index-level reference: either the level's name (str) or its position (int).
LevelType = typing.Union[str, int]
# One level reference, or a sequence of them, as accepted by level-aware APIs
# (e.g. unstack, reorder_levels).
LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]


class BlockHolder(typing.Protocol):
"""Interface for mutable objects with state represented by a block value object."""

Expand Down Expand Up @@ -1423,9 +1427,7 @@ def _get_unique_values(
raise ValueError(f"Too many unique values: {pd_values}")

if len(columns) > 1:
return pd.MultiIndex.from_frame(
pd_values.sort_values(by=list(pd_values.columns), na_position="first")
)
return pd.MultiIndex.from_frame(pd_values)
else:
return pd.Index(pd_values.squeeze(axis=1).sort_values(na_position="first"))

Expand Down Expand Up @@ -1611,6 +1613,24 @@ def cached(self) -> Block:
index_labels=self.index_labels,
)

def resolve_index_level(self, level: LevelsType) -> typing.Sequence[str]:
    """Resolve index level reference(s) to internal index column ids.

    Args:
        level: A single level (integer position or hashable name) or a
            sequence of such references.

    Returns:
        The internal column id for each referenced index level, in order.

    Raises:
        ValueError: If a name matches zero or multiple index levels, or a
            reference is neither an int nor hashable.
    """
    refs = list(level) if utils.is_list_like(level) else [level]
    resolved: typing.List[str] = []
    for ref in refs:
        if isinstance(ref, int):
            # Positional reference: look up directly by position.
            resolved.append(self.index_columns[ref])
        elif isinstance(ref, typing.Hashable):
            # Name reference: must map to exactly one index column.
            matches = self.index_name_to_col_id.get(ref, [])
            if len(matches) != 1:
                raise ValueError("level name cannot be found or is ambiguous")
            resolved.append(matches[0])
        else:
            raise ValueError(f"Unexpected level: {ref}")
    return resolved

def _is_monotonic(
self, column_ids: typing.Union[str, Sequence[str]], increasing: bool
) -> bool:
Expand Down
32 changes: 11 additions & 21 deletions 32 bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1038,22 +1038,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0):
raise ValueError("Columns must be a multiindex to reorder levels.")

def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]:
if utils.is_list_like(level):
levels = list(level)
else:
levels = [level]
resolved_level_ids = []
for level_ref in levels:
if isinstance(level_ref, int):
resolved_level_ids.append(self._block.index_columns[level_ref])
elif isinstance(level_ref, typing.Hashable):
matching_ids = self._block.index_name_to_col_id.get(level_ref, [])
if len(matching_ids) != 1:
raise ValueError("level name cannot be found or is ambiguous")
resolved_level_ids.append(matching_ids[0])
else:
raise ValueError(f"Unexpected level: {level_ref}")
return resolved_level_ids
return self._block.resolve_index_level(level)

def rename(self, *, columns: Mapping[blocks.Label, blocks.Label]) -> DataFrame:
block = self._block.rename(columns=columns)
Expand Down Expand Up @@ -1802,20 +1787,25 @@ def _stack_multi(self, level: LevelsType = -1):
block = block.stack(levels=len(level))
return DataFrame(block)

def unstack(self):
def unstack(self, level: LevelsType = -1):
if isinstance(level, int) or isinstance(level, str):
level = [level]

block = self._block
# Special case, unstack with mono-index transpose into a series
if self.index.nlevels == 1:
block = block.stack(how="right", levels=self.columns.nlevels)
return bigframes.series.Series(block)

# Pivot by last level of index
index_ids = block.index_columns
# Pivot by index levels
unstack_ids = self._resolve_levels(level)
block = block.reset_index(drop=False)
block = block.set_index(index_ids[:-1])
block = block.set_index(
[col for col in self._block.index_columns if col not in unstack_ids]
)

pivot_block = block.pivot(
columns=[index_ids[-1]],
columns=unstack_ids,
values=self._block.value_columns,
values_in_index=True,
)
Expand Down
40 changes: 24 additions & 16 deletions 40 bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,22 +352,7 @@ def reorder_levels(self, order: LevelsType, axis: int | str = 0):
return Series(self._block.reorder_levels(resolved_level_ids))

def _resolve_levels(self, level: LevelsType) -> typing.Sequence[str]:
if _is_list_like(level):
levels = list(level)
else:
levels = [level]
resolved_level_ids = []
for level_ref in levels:
if isinstance(level_ref, int):
resolved_level_ids.append(self._block.index_columns[level_ref])
elif isinstance(level_ref, typing.Hashable):
matching_ids = self._block.index_name_to_col_id.get(level_ref, [])
if len(matching_ids) != 1:
raise ValueError("level name cannot be found or is ambiguous")
resolved_level_ids.append(matching_ids[0])
else:
raise ValueError(f"Unexpected level: {level_ref}")
return resolved_level_ids
return self._block.resolve_index_level(level)

def between(self, left, right, inclusive="both"):
if inclusive not in ["both", "neither", "left", "right"]:
Expand Down Expand Up @@ -918,6 +903,29 @@ def argmin(self) -> int:
scalars.Scalar, Series(block.select_column(row_nums)).iloc[0]
)

def unstack(self, level: LevelsType = -1):
    """Pivot the given index level(s) into columns, producing a DataFrame.

    Args:
        level: Level position(s) or name(s) to unstack. Defaults to the
            last index level.

    Returns:
        A DataFrame whose columns come from the unstacked level(s).

    Raises:
        ValueError: If the Series does not have a multi-index.
    """
    if isinstance(level, (int, str)):
        level = [level]

    if self.index.nlevels == 1:
        raise ValueError("Series must have multi-index to unstack")

    # Levels being pivoted into columns; the rest stay as the row index.
    unstack_ids = self._resolve_levels(level)
    remaining_ids = [
        col for col in self._block.index_columns if col not in unstack_ids
    ]

    block = self._block.reset_index(drop=False)
    block = block.set_index(remaining_ids)

    pivot_block = block.pivot(
        columns=unstack_ids,
        values=self._block.value_columns,
        values_in_index=False,
    )
    return bigframes.dataframe.DataFrame(pivot_block)

def idxmax(self) -> blocks.Label:
block = self._block.order_by(
[
Expand Down
8 changes: 6 additions & 2 deletions 8 tests/system/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,11 @@ def hockey_df(
hockey_table_id: str, session: bigframes.Session
) -> bigframes.dataframe.DataFrame:
"""DataFrame pointing at test data."""
return session.read_gbq(hockey_table_id)
return (
session.read_gbq(hockey_table_id)
.set_index(["player_name", "season"])
.sort_index()
)


@pytest.fixture(scope="session")
Expand All @@ -419,7 +423,7 @@ def hockey_pandas_df() -> pd.DataFrame:
"season": pd.Int64Dtype(),
},
)
df.index = df.index.astype("Int64")
df = df.set_index(["player_name", "season"]).sort_index()
return df


Expand Down
10 changes: 8 additions & 2 deletions 10 tests/system/small/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1949,8 +1949,14 @@ def test_df_pivot(scalars_dfs, values, index, columns):
],
)
def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns):
bf_result = hockey_df.pivot(values=values, index=index, columns=columns).to_pandas()
pd_result = hockey_pandas_df.pivot(values=values, index=index, columns=columns)
bf_result = (
hockey_df.reset_index()
.pivot(values=values, index=index, columns=columns)
.to_pandas()
)
pd_result = hockey_pandas_df.reset_index().pivot(
values=values, index=index, columns=columns
)

# Pandas produces NaN, where bq dataframes produces pd.NA
pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
Expand Down
31 changes: 27 additions & 4 deletions 31 tests/system/small/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -909,13 +909,36 @@ def test_column_multi_index_reorder_levels(scalars_df_index, scalars_pandas_df_i
pandas.testing.assert_frame_equal(bf_result, pd_result)


def test_multi_index_unstack(hockey_df, hockey_pandas_df):
@pytest.mark.parametrize(
("level",),
[(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)],
)
def test_df_multi_index_unstack(hockey_df, hockey_pandas_df, level):
bf_result = (
hockey_df.set_index(["team_name", "season", "position"]).unstack().to_pandas()
hockey_df.set_index(["team_name", "position"], append=True)
.unstack(level=level)
.to_pandas()
)
pd_result = hockey_pandas_df.set_index(
["team_name", "season", "position"]
).unstack()
["team_name", "position"], append=True
).unstack(level=level)

pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)


@pytest.mark.parametrize(
    ("level",),
    [(["position", "team_name"],), ([-2, -1],), (["position"],), ("season",), (-3,)],
)
def test_series_multi_index_unstack(hockey_df, hockey_pandas_df, level):
    """Series.unstack(level=...) matches pandas for name, position, and list refs."""
    bf_result = (
        hockey_df.set_index(["team_name", "position"], append=True)["number"]
        .unstack(level=level)
        .to_pandas()
    )
    pd_result = hockey_pandas_df.set_index(["team_name", "position"], append=True)[
        "number"
    ].unstack(level=level)

    # BigFrames yields pd.NA where pandas yields NaN, so skip the dtype check.
    pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)

Expand Down
13 changes: 13 additions & 0 deletions 13 third_party/bigframes_vendored/pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1654,6 +1654,19 @@ def clip(self):
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def unstack(self, level):
    """
    Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.

    Abstract stub: concrete implementations override this; calling it here
    always raises.

    Args:
        level (int, str, or list of these, default last level):
            Level(s) to unstack, can pass level name.

    Returns:
        DataFrame: Unstacked Series.

    Raises:
        NotImplementedError: Always, on this abstract base.
    """
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def argmax(self):
"""
Return int position of the smallest value in the Series.
Expand Down
Morty Proxy This is a proxified and sanitized view of the page, visit original site.