googleapis · TrevorBergeron · Dec 3, 2025 · Dec 2, 2025
@@ -67,40 +67,39 @@ def indicate_duplicates(
    if keep not in ["first", "last", False]:
-        raise ValueError("keep must be one of 'first', 'last', or False'")
+        raise ValueError("keep must be one of 'first', 'last', or False")

+    rownums = agg_expressions.WindowExpression(
+        agg_expressions.NullaryAggregation(
+            agg_ops.RowNumberOp(),
+        ),
+        window=windows.unbound(grouping_keys=tuple(columns)),
+    )
+    count = agg_expressions.WindowExpression(
+        agg_expressions.NullaryAggregation(
+            agg_ops.SizeOp(),
+        ),
+        window=windows.unbound(grouping_keys=tuple(columns)),
+    )
+
    if keep == "first":
-        # Count how many copies occur up to current copy of value
-        # Discard this value if there are copies BEFORE
-        window_spec = windows.cumulative_rows(
-            grouping_keys=tuple(columns),
-        )
+        # rownums is the row's position among the rows sharing its keys.
+        # Any position past the first (> 0) marks a duplicate.
+        predicate = ops.gt_op.as_expr(rownums, ex.const(0))
    elif keep == "last":
-        # Count how many copies occur up to current copy of values
-        # Discard this value if there are copies AFTER
-        window_spec = windows.inverse_cumulative_rows(
-            grouping_keys=tuple(columns),
-        )
+        # count is the number of rows sharing the keys, so count - 1 is the
+        # last position; any earlier position marks a duplicate.
+        predicate = ops.lt_op.as_expr(rownums, ops.sub_op.as_expr(count, ex.const(1)))
    else:  # keep == False
-        # Count how many copies of the value occur in entire series.
-        # Discard this value if there are copies ANYWHERE
-        window_spec = windows.unbound(grouping_keys=tuple(columns))
-    block, dummy = block.create_constant(1)
-    # use row number as will work even with partial ordering
-    block, val_count_col_id = block.apply_window_op(
-        dummy,
-        agg_ops.sum_op,
-        window_spec=window_spec,
-    )
-    block, duplicate_indicator = block.project_expr(
-        ops.gt_op.as_expr(val_count_col_id, ex.const(1))
+        # Every row whose keys occur more than once is a duplicate,
+        # regardless of its position.
+        predicate = ops.gt_op.as_expr(count, ex.const(1))
+
+    block = block.project_block_exprs(
+        [predicate],
+        labels=[None],
    )
    return (
-        block.drop_columns(
-            (
-                dummy,
-                val_count_col_id,
-            )
-        ),
-        duplicate_indicator,
+        block,
+        block.value_columns[-1],
    )



@@ -547,6 +547,9 @@ def compile_agg_op(
                return pl.col(*inputs).first()
            if isinstance(op, agg_ops.LastOp):
                return pl.col(*inputs).last()
+            if isinstance(op, agg_ops.RowNumberOp):
+                # pl.row_index is not yet stable enough to use here and requires polars>=1.32
+                return pl.int_range(pl.len(), dtype=pl.Int64)
            if isinstance(op, agg_ops.ShiftOp):
                return pl.col(*inputs).shift(op.periods)
            if isinstance(op, agg_ops.DiffOp):

@@ -626,8 +626,7 @@ def dropna(self, how: typing.Literal["all", "any"] = "any") -> Index:
        return Index(result)

    def drop_duplicates(self, *, keep: __builtins__.str = "first") -> Index:
-        if keep is not False:
-            validations.enforce_ordered(self, "drop_duplicates")
+        # NOTE(review): `keep` also accepts False despite the `str` annotation.
        block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep)
        return Index(block)


@@ -4989,8 +4989,7 @@ def drop_duplicates(
        *,
        keep: str = "first",
    ) -> DataFrame:
-        if keep is not False:
-            validations.enforce_ordered(self, "drop_duplicates(keep != False)")
+        # NOTE(review): `keep` also accepts False despite the `str` annotation.
        if subset is None:
            column_ids = self._block.value_columns
        elif utils.is_list_like(subset):
@@ -5004,8 +5003,7 @@ def drop_duplicates(
        return DataFrame(block)

    def duplicated(self, subset=None, keep: str = "first") -> bigframes.series.Series:
-        if keep is not False:
-            validations.enforce_ordered(self, "duplicated(keep != False)")
+        # NOTE(review): `keep` also accepts False despite the `str` annotation.
        if subset is None:
            column_ids = self._block.value_columns
        else:

@@ -2227,8 +2227,7 @@ def reindex_like(self, other: Series, *, validate: typing.Optional[bool] = None)
        return self.reindex(other.index, validate=validate)

    def drop_duplicates(self, *, keep: str = "first") -> Series:
-        if keep is not False:
-            validations.enforce_ordered(self, "drop_duplicates(keep != False)")
+        # NOTE(review): `keep` also accepts False despite the `str` annotation.
        block = block_ops.drop_duplicates(self._block, (self._value_column,), keep)
        return Series(block)

@@ -2249,8 +2248,7 @@ def unique(self, keep_order=True) -> Series:
        return Series(block.select_columns(result).reset_index())

    def duplicated(self, keep: str = "first") -> Series:
-        if keep is not False:
-            validations.enforce_ordered(self, "duplicated(keep != False)")
+        # NOTE(review): `keep` also accepts False despite the `str` annotation.
        block, indicator = block_ops.indicate_duplicates(
            self._block, (self._value_column,), keep
        )

@@ -40,3 +40,23 @@ def test_cov_150_columns(scalars_df_numeric_150_columns_maybe_ordered):
        check_index_type=False,
        check_column_type=False,
    )
+
+
+@pytest.mark.parametrize("keep", ["first", "last", False])
+def test_drop_duplicates_unordered(
+    scalars_df_unordered, scalars_pandas_df_default_index, keep
+):
+    """Check drop_duplicates on the unordered frame against the pandas frame.
+
+    Only the row count and the number of ``bool_col`` groups are compared.
+    NOTE(review): presumably which duplicate survives is arbitrary -- confirm.
+    """
+    dedup_kwargs = {"subset": "bool_col", "keep": keep}
+    unordered_result = scalars_df_unordered.drop_duplicates(**dedup_kwargs)
+    pandas_result = scalars_pandas_df_default_index.drop_duplicates(**dedup_kwargs)
+
+    assert len(unordered_result) == len(pandas_result)
+
+    unordered_group_count = len(unordered_result.groupby("bool_col"))
+    pandas_group_count = len(pandas_result.groupby("bool_col"))
+    assert unordered_group_count == pandas_group_count