Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings
This repository was archived by the owner on May 7, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 22 additions & 23 deletions 45 bigframes/core/block_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,40 +67,39 @@ def indicate_duplicates(
if keep not in ["first", "last", False]:
raise ValueError("keep must be one of 'first', 'last', or False'")

rownums = agg_expressions.WindowExpression(
agg_expressions.NullaryAggregation(
agg_ops.RowNumberOp(),
),
window=windows.unbound(grouping_keys=tuple(columns)),
)
count = agg_expressions.WindowExpression(
agg_expressions.NullaryAggregation(
agg_ops.SizeOp(),
),
window=windows.unbound(grouping_keys=tuple(columns)),
)

if keep == "first":
# Count how many copies occur up to current copy of value
# Discard this value if there are copies BEFORE
window_spec = windows.cumulative_rows(
grouping_keys=tuple(columns),
)
predicate = ops.gt_op.as_expr(rownums, ex.const(0))
elif keep == "last":
# Count how many copies occur up to current copy of values
# Discard this value if there are copies AFTER
window_spec = windows.inverse_cumulative_rows(
grouping_keys=tuple(columns),
)
predicate = ops.lt_op.as_expr(rownums, ops.sub_op.as_expr(count, ex.const(1)))
else: # keep == False
# Count how many copies of the value occur in entire series.
# Discard this value if there are copies ANYWHERE
window_spec = windows.unbound(grouping_keys=tuple(columns))
block, dummy = block.create_constant(1)
# use row number as will work even with partial ordering
block, val_count_col_id = block.apply_window_op(
dummy,
agg_ops.sum_op,
window_spec=window_spec,
)
block, duplicate_indicator = block.project_expr(
ops.gt_op.as_expr(val_count_col_id, ex.const(1))
predicate = ops.gt_op.as_expr(count, ex.const(1))

block = block.project_block_exprs(
[predicate],
labels=[None],
)
return (
block.drop_columns(
(
dummy,
val_count_col_id,
)
),
duplicate_indicator,
block,
block.value_columns[-1],
)


Expand Down
3 changes: 3 additions & 0 deletions 3 bigframes/core/compile/polars/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,9 @@ def compile_agg_op(
return pl.col(*inputs).first()
if isinstance(op, agg_ops.LastOp):
return pl.col(*inputs).last()
if isinstance(op, agg_ops.RowNumberOp):
# pl.row_index is not yet stable enough to use here, and it is only available in polars>=1.32
return pl.int_range(pl.len(), dtype=pl.Int64)
if isinstance(op, agg_ops.ShiftOp):
return pl.col(*inputs).shift(op.periods)
if isinstance(op, agg_ops.DiffOp):
Expand Down
2 changes: 0 additions & 2 deletions 2 bigframes/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -626,8 +626,6 @@ def dropna(self, how: typing.Literal["all", "any"] = "any") -> Index:
return Index(result)

def drop_duplicates(self, *, keep: __builtins__.str = "first") -> Index:
if keep is not False:
validations.enforce_ordered(self, "drop_duplicates")
block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep)
return Index(block)

Expand Down
4 changes: 0 additions & 4 deletions 4 bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4989,8 +4989,6 @@ def drop_duplicates(
*,
keep: str = "first",
) -> DataFrame:
if keep is not False:
validations.enforce_ordered(self, "drop_duplicates(keep != False)")
if subset is None:
column_ids = self._block.value_columns
elif utils.is_list_like(subset):
Expand All @@ -5004,8 +5002,6 @@ def drop_duplicates(
return DataFrame(block)

def duplicated(self, subset=None, keep: str = "first") -> bigframes.series.Series:
if keep is not False:
validations.enforce_ordered(self, "duplicated(keep != False)")
if subset is None:
column_ids = self._block.value_columns
else:
Expand Down
4 changes: 0 additions & 4 deletions 4 bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2227,8 +2227,6 @@ def reindex_like(self, other: Series, *, validate: typing.Optional[bool] = None)
return self.reindex(other.index, validate=validate)

def drop_duplicates(self, *, keep: str = "first") -> Series:
if keep is not False:
validations.enforce_ordered(self, "drop_duplicates(keep != False)")
block = block_ops.drop_duplicates(self._block, (self._value_column,), keep)
return Series(block)

Expand All @@ -2249,8 +2247,6 @@ def unique(self, keep_order=True) -> Series:
return Series(block.select_columns(result).reset_index())

def duplicated(self, keep: str = "first") -> Series:
if keep is not False:
validations.enforce_ordered(self, "duplicated(keep != False)")
block, indicator = block_ops.indicate_duplicates(
self._block, (self._value_column,), keep
)
Expand Down
24 changes: 24 additions & 0 deletions 24 tests/system/large/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,27 @@ def test_cov_150_columns(scalars_df_numeric_150_columns_maybe_ordered):
check_index_type=False,
check_column_type=False,
)


@pytest.mark.parametrize(
    "keep",
    ["first", "last", False],
)
def test_drop_duplicates_unordered(
    scalars_df_unordered, scalars_pandas_df_default_index, keep
):
    """Compare drop_duplicates result sizes against pandas for each ``keep`` mode.

    Only the number of surviving rows and the number of ``bool_col`` groups
    are compared; the row contents themselves are not checked.
    """
    # NOTE(review): presumably the fixture is a frame without a total
    # ordering (going by its name) -- confirm against the fixture definition.
    dedup_args = {"subset": "bool_col", "keep": keep}
    bf_deduped = scalars_df_unordered.drop_duplicates(**dedup_args)
    pd_deduped = scalars_pandas_df_default_index.drop_duplicates(**dedup_args)

    # Same number of surviving rows.
    assert len(bf_deduped) == len(pd_deduped)

    # Same number of distinct groups among the surviving rows.
    bf_group_count = len(bf_deduped.groupby("bool_col"))
    pd_group_count = len(pd_deduped.groupby("bool_col"))
    assert bf_group_count == pd_group_count
Morty Proxy This is a proxified and sanitized view of the page, visit original site.