Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings
This repository was archived by the owner on May 7, 2026. It is now read-only.

Commit c62e553

Browse files
feat: Add bigframes.pandas.crosstab (#2231)
1 parent 44e9869 commit c62e553
Copy full SHA for c62e553

7 files changed

+261 −3 — Lines changed: 261 additions & 3 deletions

File tree

Expand file treeCollapse file tree
Open diff view settings
Filter options
Expand file treeCollapse file tree
Open diff view settings
Collapse file

‎bigframes/core/reshape/api.py‎

Copy file name to clipboardExpand all lines: bigframes/core/reshape/api.py
+2-1Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from bigframes.core.reshape.concat import concat
1616
from bigframes.core.reshape.encoding import get_dummies
1717
from bigframes.core.reshape.merge import merge
18+
from bigframes.core.reshape.pivot import crosstab
1819
from bigframes.core.reshape.tile import cut, qcut
1920

20-
__all__ = ["concat", "get_dummies", "merge", "cut", "qcut"]
21+
__all__ = ["concat", "get_dummies", "merge", "cut", "qcut", "crosstab"]
Collapse file

‎bigframes/core/reshape/pivot.py‎

Copy file name to clipboard
+89Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
from __future__ import annotations
15+
16+
from typing import Optional, TYPE_CHECKING
17+
18+
import bigframes_vendored.pandas.core.reshape.pivot as vendored_pandas_pivot
19+
import pandas as pd
20+
21+
import bigframes
22+
from bigframes.core import convert, utils
23+
from bigframes.core.reshape import concat
24+
from bigframes.dataframe import DataFrame
25+
26+
if TYPE_CHECKING:
27+
import bigframes.session
28+
29+
30+
def _crosstab_factors(item, session):
    # Normalise one `index`/`columns` argument into a list of bigframes Series:
    # a list of series-convertible items yields one Series per item, anything
    # else is treated as a single factor.
    factors = item if _is_list_of_lists(item) else [item]
    return [
        convert.to_bf_series(factor, default_index=None, session=session)
        for factor in factors
    ]


def crosstab(
    index,
    columns,
    values=None,
    rownames=None,
    colnames=None,
    aggfunc=None,
    *,
    session: Optional[bigframes.session.Session] = None,
) -> DataFrame:
    # NOTE: the public docstring is assigned from the vendored pandas module
    # at the bottom of this file, so implementation notes live in comments.
    #
    # Raises:
    #     ValueError: if exactly one of `values` / `aggfunc` is supplied.

    # `values` and `aggfunc` only make sense together (this mirrors pandas).
    # Validate before doing any conversion work so misuse fails fast instead
    # of silently producing a count table or aggregating a constant column.
    if values is None and aggfunc is not None:
        raise ValueError("aggfunc cannot be used without values.")
    if values is not None and aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    index = _crosstab_factors(index, session)
    columns = _crosstab_factors(columns, session)

    # Inner join so only labels present in every factor contribute.
    df = concat.concat([*index, *columns], join="inner", axis=1)
    # Temporary, guaranteed-unique column names: the input Series may share
    # names (or have none), which would make the pivot ambiguous.
    tmp_index_names = [f"_crosstab_index_{i}" for i in range(len(index))]
    tmp_col_names = [f"_crosstab_columns_{i}" for i in range(len(columns))]
    df.columns = pd.Index([*tmp_index_names, *tmp_col_names])

    # Without explicit values a constant column is counted per cell, which
    # yields the frequency table.
    values = (
        convert.to_bf_series(values, default_index=df.index, session=session)
        if values is not None
        else 0
    )

    df["_crosstab_values"] = values
    pivot_table = df.pivot_table(
        values="_crosstab_values",
        index=tmp_index_names,
        columns=tmp_col_names,
        aggfunc=aggfunc or "count",
        sort=False,
    )
    # Restore user-facing axis names: explicit rownames/colnames win,
    # otherwise fall back to the names of the input Series.
    pivot_table.index.names = rownames or [i.name for i in index]
    pivot_table.columns.names = colnames or [c.name for c in columns]
    if aggfunc is None:
        # Frequency table: combinations that never occur should read 0, not NA.
        # TODO: Push this into pivot_table itself
        pivot_table = pivot_table.fillna(0)
    return pivot_table
81+
82+
83+
def _is_list_of_lists(item) -> bool:
    """Return True when `item` is list-like and every element can become a Series.

    A non-list-like `item` is never a list of lists. An empty list-like has no
    offending element and therefore yields True.
    """
    if utils.is_list_like(item):
        for candidate in item:
            if not convert.can_convert_to_series(candidate):
                return False
        return True
    return False
87+
88+
89+
# Reuse the docstring of the vendored pandas `crosstab` so the user-facing
# documentation is maintained in a single place.
crosstab.__doc__ = vendored_pandas_pivot.crosstab.__doc__
Collapse file

‎bigframes/dataframe.py‎

Copy file name to clipboardExpand all lines: bigframes/dataframe.py
+30-1Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3479,7 +3479,34 @@ def pivot_table(
34793479
] = None,
34803480
columns: typing.Union[blocks.Label, Sequence[blocks.Label]] = None,
34813481
aggfunc: str = "mean",
3482+
fill_value=None,
3483+
margins: bool = False,
3484+
dropna: bool = True,
3485+
margins_name: Hashable = "All",
3486+
observed: bool = False,
3487+
sort: bool = True,
34823488
) -> DataFrame:
3489+
if fill_value is not None:
3490+
raise NotImplementedError(
3491+
"DataFrame.pivot_table fill_value arg not supported. {constants.FEEDBACK_LINK}"
3492+
)
3493+
if margins:
3494+
raise NotImplementedError(
3495+
"DataFrame.pivot_table margins arg not supported. {constants.FEEDBACK_LINK}"
3496+
)
3497+
if not dropna:
3498+
raise NotImplementedError(
3499+
"DataFrame.pivot_table dropna arg not supported. {constants.FEEDBACK_LINK}"
3500+
)
3501+
if margins_name != "All":
3502+
raise NotImplementedError(
3503+
"DataFrame.pivot_table margins_name arg not supported. {constants.FEEDBACK_LINK}"
3504+
)
3505+
if observed:
3506+
raise NotImplementedError(
3507+
"DataFrame.pivot_table observed arg not supported. {constants.FEEDBACK_LINK}"
3508+
)
3509+
34833510
if isinstance(index, Iterable) and not (
34843511
isinstance(index, blocks.Label) and index in self.columns
34853512
):
@@ -3521,7 +3548,9 @@ def pivot_table(
35213548
columns=columns,
35223549
index=index,
35233550
values=values if len(values) > 1 else None,
3524-
).sort_index()
3551+
)
3552+
if sort:
3553+
pivoted = pivoted.sort_index()
35253554

35263555
# TODO: Remove the reordering step once the issue is resolved.
35273556
# The pivot_table method results in multi-index columns that are always ordered.
Collapse file

‎bigframes/pandas/__init__.py‎

Copy file name to clipboardExpand all lines: bigframes/pandas/__init__.py
+2-1Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
import bigframes.core.blocks
3232
import bigframes.core.global_session as global_session
3333
import bigframes.core.indexes
34-
from bigframes.core.reshape.api import concat, cut, get_dummies, merge, qcut
34+
from bigframes.core.reshape.api import concat, crosstab, cut, get_dummies, merge, qcut
3535
import bigframes.core.tools
3636
import bigframes.dataframe
3737
import bigframes.enums
@@ -372,6 +372,7 @@ def reset_session():
372372
_functions = [
373373
clean_up_by_session_id,
374374
concat,
375+
crosstab,
375376
cut,
376377
deploy_remote_function,
377378
deploy_udf,
Collapse file

‎bigframes/session/__init__.py‎

Copy file name to clipboardExpand all lines: bigframes/session/__init__.py
+15Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2312,6 +2312,21 @@ def cut(self, *args, **kwargs) -> bigframes.series.Series:
23122312
**kwargs,
23132313
)
23142314

2315+
def crosstab(self, *args, **kwargs) -> dataframe.DataFrame:
    """Cross-tabulate two (or more) factors using this session.

    Session-bound counterpart of the ``bpd`` function, provided so both
    entry points offer the same API.

    See :func:`bigframes.pandas.crosstab` for full documentation.
    """
    # Imported lazily inside the method, as the original code does.
    from bigframes.core.reshape import pivot as reshape_pivot

    return reshape_pivot.crosstab(*args, session=self, **kwargs)
2329+
23152330
def DataFrame(self, *args, **kwargs):
23162331
"""Constructs a DataFrame.
23172332
Collapse file

‎tests/system/small/test_pandas.py‎

Copy file name to clipboardExpand all lines: tests/system/small/test_pandas.py
+66Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,72 @@ def test_merge_raises_error_when_left_right_on_set(scalars_dfs):
450450
)
451451

452452

453+
def test_crosstab_aligned_series(scalars_dfs):
    """crosstab of two columns from the same frame matches pandas."""
    bf_df, pd_df = scalars_dfs
    row_key, col_key = "int64_col", "int64_too"

    expected = pd.crosstab(pd_df[row_key], pd_df[col_key])
    actual = bpd.crosstab(bf_df[row_key], bf_df[col_key]).to_pandas()

    assert_pandas_df_equal(actual, expected, check_dtype=False)
464+
465+
466+
def test_crosstab_nondefault_func(scalars_dfs):
    """crosstab with explicit values and aggfunc="mean" matches pandas."""
    bf_df, pd_df = scalars_dfs
    row_key, col_key, value_key = "int64_col", "int64_too", "float64_col"

    expected = pd.crosstab(
        pd_df[row_key],
        pd_df[col_key],
        values=pd_df[value_key],
        aggfunc="mean",
    )
    actual_bf = bpd.crosstab(
        bf_df[row_key],
        bf_df[col_key],
        values=bf_df[value_key],
        aggfunc="mean",
    )
    actual = actual_bf.to_pandas()

    assert_pandas_df_equal(actual, expected, check_dtype=False)
483+
484+
485+
def test_crosstab_multi_cols(scalars_dfs):
    """crosstab with several row/column factors and custom axis names matches pandas."""
    bf_df, pd_df = scalars_dfs
    row_keys = ["int64_col", "bool_col"]
    col_keys = ["int64_too", "string_col"]

    expected = pd.crosstab(
        [pd_df[key] for key in row_keys],
        [pd_df[key] for key in col_keys],
        rownames=["a", "b"],
        colnames=["c", "d"],
    )
    actual = bpd.crosstab(
        [bf_df[key] for key in row_keys],
        [bf_df[key] for key in col_keys],
        rownames=["a", "b"],
        colnames=["c", "d"],
    ).to_pandas()

    assert_pandas_df_equal(actual, expected, check_dtype=False)
502+
503+
504+
def test_crosstab_unaligned_series(scalars_dfs, session):
    """crosstab against a Series with a different index matches pandas."""
    bf_df, pd_df = scalars_dfs

    pd_other = pd.Series(
        [10, 20, 10, 30, 10], index=[5, 4, 1, 2, 3], dtype="Int64", name="nums"
    )
    bf_other = session.Series(
        [10, 20, 10, 30, 10], index=[5, 4, 1, 2, 3], name="nums"
    )

    expected = pd.crosstab(pd_df["int64_col"], pd_other)
    actual = bpd.crosstab(bf_df["int64_col"], bf_other).to_pandas()

    assert_pandas_df_equal(actual, expected, check_dtype=False)
517+
518+
453519
def _convert_pandas_category(pd_s: pd.Series):
454520
"""
455521
Transforms a pandas Series with Categorical dtype into a bigframes-compatible
Collapse file
+57Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/pivot.py
2+
from __future__ import annotations
3+
4+
from bigframes import constants
5+
6+
7+
def crosstab(
    index,
    columns,
    values=None,
    rownames=None,
    colnames=None,
    aggfunc=None,
):
    """
    Compute a simple cross tabulation of two (or more) factors.

    By default, computes a frequency table of the factors unless an
    array of values and an aggregation function are passed.

    **Examples:**
        >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
        ... "bar", "bar", "foo", "foo", "foo"], dtype=object)
        >>> b = np.array(["one", "one", "one", "two", "one", "one",
        ... "one", "two", "two", "two", "one"], dtype=object)
        >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
        ... "shiny", "dull", "shiny", "shiny", "shiny"],
        ... dtype=object)
        >>> bpd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
        b one two
        c dull shiny dull shiny
        a
        bar 1 2 1 0
        foo 2 2 1 2
        <BLANKLINE>
        [2 rows x 4 columns]

    Args:
        index (array-like, Series, or list of arrays/Series):
            Values to group by in the rows.
        columns (array-like, Series, or list of arrays/Series):
            Values to group by in the columns.
        values (array-like, optional):
            Array of values to aggregate according to the factors.
            Requires `aggfunc` be specified.
        rownames (sequence, default None):
            If passed, must match number of row arrays passed.
        colnames (sequence, default None):
            If passed, must match number of column arrays passed.
        aggfunc (function, optional):
            If specified, requires `values` be specified as well.

    Returns:
        DataFrame:
            Cross tabulation of the data.
    """
    # Documentation-only stub: calling it directly always raises.
    # NOTE(review): presumably this is the vendored module whose docstring the
    # bigframes implementation reuses; confirm against the importing module.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.