diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index f12c346776..f78dee1642 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -1305,6 +1305,40 @@ def nsmallest(
         column_ids = self._sql_names(columns)
         return DataFrame(block_ops.nsmallest(self._block, n, column_ids, keep=keep))
 
+    def insert(
+        self,
+        loc: int,
+        column: blocks.Label,
+        value: SingleItemValue,
+        allow_duplicates: bool = False,
+    ) -> None:
+        # Mutates this DataFrame in place: the rebuilt block is swapped in at the end.
+        column_count = len(self.columns)
+        # A negative loc would otherwise reach list.insert() below and be treated
+        # as a from-the-end offset, so reject it together with too-large values.
+        if not 0 <= loc <= column_count:
+            raise IndexError(
+                f"Column index {loc} is out of bounds with {column_count} total columns."
+            )
+        if (column in self.columns) and not allow_duplicates:
+            raise ValueError(f"cannot insert {column}, already exists")
+
+        # Assign under a generated temporary label so the final rename targets
+        # only the new column, even when `column` duplicates an existing label.
+        temp_column = bigframes.core.guid.generate_guid(prefix=str(column))
+        df = self._assign_single_item(temp_column, value)
+
+        # The newly assigned column is taken to be the last value column; move it to `loc`.
+        block = df._get_block()
+        value_columns = typing.cast(List, block.value_columns)
+        value_columns, new_column = value_columns[:-1], value_columns[-1]
+        value_columns.insert(loc, new_column)
+
+        block = block.select_columns(value_columns)
+        block = block.rename(columns={temp_column: column})
+
+        self._set_block(block)
+
     def drop(
         self,
         labels: typing.Any = None,
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index ffc09a1a1f..841ff7bbea 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -270,6 +270,53 @@ def test_get_columns_default(scalars_dfs):
     assert result == "default_val"
 
 
+@pytest.mark.parametrize(
+    ("loc", "column", "value", "allow_duplicates"),
+    [
+        (0, 666, 2, False),
+        (5, "float64_col", 2.2, True),
+        (13, "rowindex_2", [8, 7, 6, 5, 4, 3, 2, 1, 0], True),
+        pytest.param(
+            14,
+            "test",
+            2,
+            False,
+            marks=pytest.mark.xfail(
+                raises=IndexError,
+            ),
+        ),
+        pytest.param(
+            -1,
+            "test",
+            2,
+            False,
+            marks=pytest.mark.xfail(
+                raises=IndexError,
+            ),
+        ),
+        pytest.param(
+            12,
+            "int64_col",
+            2,
+            False,
+            marks=pytest.mark.xfail(
+                raises=ValueError,
+            ),
+        ),
+    ],
+)
+def test_insert(scalars_dfs, loc, column, value, allow_duplicates):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    # insert works inplace, so will influence other tests.
+    # make a copy to avoid inplace changes.
+    bf_df = scalars_df.copy()
+    pd_df = scalars_pandas_df.copy()
+    bf_df.insert(loc, column, value, allow_duplicates)
+    pd_df.insert(loc, column, value, allow_duplicates)
+
+    pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df, check_dtype=False)
+
+
 def test_drop_column(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     col_name = "int64_col"
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index d46fa4cfc7..f8088f8060 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1067,6 +1067,57 @@ def reindex_like(self, other):
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def insert(self, loc, column, value, allow_duplicates=False):
+        """Insert column into DataFrame at specified location.
+
+        Raises a ValueError if `column` is already contained in the DataFrame,
+        unless `allow_duplicates` is set to True.
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
+
+        Insert a new column named 'col3' between 'col1' and 'col2' with all entries set to 5.
+
+            >>> df.insert(1, 'col3', 5)
+            >>> df
+               col1  col3  col2
+            0     1     5     3
+            1     2     5     4
+            <BLANKLINE>
+            [2 rows x 3 columns]
+
+        Insert another column named 'col2' at the beginning of the DataFrame with values [5, 6]
+
+            >>> df.insert(0, 'col2', [5, 6], allow_duplicates=True)
+            >>> df
+               col2  col1  col3  col2
+            0     5     1     5     3
+            1     6     2     5     4
+            <BLANKLINE>
+            [2 rows x 4 columns]
+
+        Args:
+            loc (int):
+                Insertion index. Must satisfy 0 <= loc <= len(columns).
+            column (str, number, or hashable object):
+                Label of the inserted column.
+            value (Scalar, Series, or array-like):
+                Content of the inserted column.
+            allow_duplicates (bool, default False):
+                Allow duplicate column labels to be created.
+
+        Raises:
+            IndexError:
+                If ``loc`` is negative or greater than ``len(columns)``.
+            ValueError:
+                If ``column`` already exists and ``allow_duplicates`` is False.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def drop(
         self, labels=None, *, axis=0, index=None, columns=None, level=None
     ) -> DataFrame | None: