diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ab0006ea20..595670b0b6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2759,26 +2759,28 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame: def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: """Create query text representing this dataframe for I/O.""" array_value = self._block.expr + + new_col_labels, new_idx_labels = utils.get_standardized_ids( + self._block.column_labels, self.index.names + ) + columns = list(self._block.value_columns) - column_labels = list(self._block.column_labels) + column_labels = new_col_labels # This code drops unnamed indexes to keep consistent with the behavior of # most pandas write APIs. The exception is `pandas.to_csv`, which keeps # unnamed indexes as `Unnamed: 0`. # TODO(chelsealin): check if works for multiple indexes. if index and self.index.name is not None: columns.extend(self._block.index_columns) - column_labels.extend(self.index.names) + column_labels.extend(new_idx_labels) else: array_value = array_value.drop_columns(self._block.index_columns) # Make columns in SQL reflect _labels_ not _ids_. Note: This may use # the arbitrary unicode column labels feature in BigQuery, which is # currently (June 2023) in preview. - # TODO(swast): Handle duplicate and NULL labels. id_overrides = { - col_id: col_label - for col_id, col_label in zip(columns, column_labels) - if col_label and isinstance(col_label, str) + col_id: col_label for col_id, col_label in zip(columns, column_labels) } if ordering_id is not None: diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 59864e483e..6f1b31b48e 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -273,6 +273,50 @@ def test_to_gbq_if_exists( ) +def test_to_gbq_w_duplicate_column_names( + scalars_df_index, scalars_pandas_df_index, dataset_id +): + """Test the `to_gbq` API when dealing with duplicate column names.""" + destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names" + + # Renaming 'int64_too' to 'int64_col', which will result in 'int64_too' + # becoming 'int64_col_1' after deduplication. + scalars_df_index = scalars_df_index.rename(columns={"int64_too": "int64_col"}) + scalars_df_index.to_gbq(destination_table, if_exists="replace") + + bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas() + + pd.testing.assert_series_equal( + scalars_pandas_df_index["int64_col"], bf_result["int64_col"] + ) + pd.testing.assert_series_equal( + scalars_pandas_df_index["int64_too"], + bf_result["int64_col_1"], + check_names=False, + ) + + +def test_to_gbq_w_None_column_names( + scalars_df_index, scalars_pandas_df_index, dataset_id +): + """Test the `to_gbq` API with None as a column name.""" + destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names" + + scalars_df_index = scalars_df_index.rename(columns={"int64_too": None}) + scalars_df_index.to_gbq(destination_table, if_exists="replace") + + bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas() + + pd.testing.assert_series_equal( + scalars_pandas_df_index["int64_col"], bf_result["int64_col"] + ) + pd.testing.assert_series_equal( + scalars_pandas_df_index["int64_too"], + bf_result["bigframes_unnamed_column"], + check_names=False, + ) + + def test_to_gbq_w_invalid_destination_table(scalars_df_index): with pytest.raises(ValueError): scalars_df_index.to_gbq("table_id")