From a0f47fd18cc8169ea729c8ece26c4a9277bb8897 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 6 Feb 2025 20:16:59 +0000 Subject: [PATCH 1/4] fix: translate labels to col ids when copying dataframes --- bigframes/core/blocks.py | 20 ++++++++++++++++++++ bigframes/dataframe.py | 13 +++---------- tests/system/small/test_dataframe.py | 11 +++++++++-- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index b1f4ed35cc..18f6078edf 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -275,6 +275,26 @@ def label_to_col_id(self) -> typing.Mapping[Label, typing.Sequence[str]]: for id, label in self.col_id_to_label.items(): mapping[label] = (*mapping.get(label, ()), id) return mapping + + def resolve_label_exact(self, label:Label) -> Optional[str]: + """Returns the column id matching the label if there is exactly + one such column. If there are multiple columns with the same name, + raises an error. If there is no such column, returns None.""" + matches = self.label_to_col_id.get(label, []) + if len(matches) > 1: + raise ValueError( + f"Multiple columns matching id {label} were found. {constants.FEEDBACK_LINK}" + ) + return matches[0] if len(matches) != 0 else None + + def resolve_label_exact_or_error(self, label:Label) -> str: + """Returns the column id matching the label if there is exactly + one such column. If there are multiple columns with the same name, + raises an error. If there is no such column, raises an error too.""" + col_id = self.resolve_label_exact(label) + if col_id is None: + raise ValueError(f"Label not found: {label}") + return col_id @functools.cached_property def col_id_to_index_name(self) -> typing.Mapping[str, Label]: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 20f636b681..069f99c6e8 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -180,7 +180,8 @@ def __init__( ) block = block.set_index([r_mapping[idx_col] for idx_col in idx_cols]) if columns: - block = block.select_columns(list(columns)) # type:ignore + column_ids = [block.resolve_label_exact_or_error(label) for label in list(columns)] + block = block.select_columns(column_ids) # type:ignore if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) @@ -238,15 +239,7 @@ def _find_indices( return [self._block.value_columns.index(col_id) for col_id in col_ids] def _resolve_label_exact(self, label) -> Optional[str]: - """Returns the column id matching the label if there is exactly - one such column. If there are multiple columns with the same name, - raises an error. If there is no such column, returns None.""" - matches = self._block.label_to_col_id.get(label, []) - if len(matches) > 1: - raise ValueError( - f"Multiple columns matching id {label} were found. {constants.FEEDBACK_LINK}" - ) - return matches[0] if len(matches) != 0 else None + return self._block.resolve_label_exact(label) def _sql_names( self, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e7556043af..1db89a074a 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -44,8 +44,15 @@ def test_df_construct_copy(scalars_dfs): columns = ["int64_col", "string_col", "float64_col"] scalars_df, scalars_pandas_df = scalars_dfs - bf_result = dataframe.DataFrame(scalars_df, columns=columns).to_pandas() - pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) + # Make the mapping from label to col_id non-trivial + bf_df = scalars_df.copy() + bf_df["int64_col"] = bf_df["int64_col"] / 2 + pd_df = scalars_pandas_df.copy() + pd_df["int64_col"] = pd_df["int64_col"] / 2 + + bf_result = dataframe.DataFrame(bf_df, columns=columns).to_pandas() + + pd_result = pd.DataFrame(pd_df, columns=columns) pandas.testing.assert_frame_equal(bf_result, pd_result) From 0ee787ee6c8330fa242b1336eb25b9d99ef3207e Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 6 Feb 2025 20:18:55 +0000 Subject: [PATCH 2/4] polish error message --- bigframes/core/blocks.py | 10 +++++----- bigframes/dataframe.py | 4 +++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 18f6078edf..945bac321b 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -275,8 +275,8 @@ def label_to_col_id(self) -> typing.Mapping[Label, typing.Sequence[str]]: for id, label in self.col_id_to_label.items(): mapping[label] = (*mapping.get(label, ()), id) return mapping - - def resolve_label_exact(self, label:Label) -> Optional[str]: + + def resolve_label_exact(self, label: Label) -> Optional[str]: """Returns the column id matching the label if there is exactly one such column. If there are multiple columns with the same name, raises an error. If there is no such column, returns None.""" @@ -286,14 +286,14 @@ def resolve_label_exact(self, label:Label) -> Optional[str]: f"Multiple columns matching id {label} were found. {constants.FEEDBACK_LINK}" ) return matches[0] if len(matches) != 0 else None - - def resolve_label_exact_or_error(self, label:Label) -> str: + + def resolve_label_exact_or_error(self, label: Label) -> str: """Returns the column id matching the label if there is exactly one such column. If there are multiple columns with the same name, raises an error. If there is no such column, raises an error too.""" col_id = self.resolve_label_exact(label) if col_id is None: - raise ValueError(f"Label not found: {label}") + raise ValueError(f"Label {label} not found. {constants.FEEDBACK_LINK}") return col_id @functools.cached_property diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 069f99c6e8..4ffa56c2e5 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -180,7 +180,9 @@ def __init__( ) block = block.set_index([r_mapping[idx_col] for idx_col in idx_cols]) if columns: - column_ids = [block.resolve_label_exact_or_error(label) for label in list(columns)] + column_ids = [ + block.resolve_label_exact_or_error(label) for label in list(columns) + ] block = block.select_columns(column_ids) # type:ignore if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) From a3f60626422fa492d6173b63f82a051b1bf82e15 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Thu, 6 Feb 2025 20:19:39 +0000 Subject: [PATCH 3/4] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20po?= =?UTF-8?q?st-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- bigframes/core/blocks.py | 8 ++++---- bigframes/dataframe.py | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 18f6078edf..f4bf6ccbdd 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -275,8 +275,8 @@ def label_to_col_id(self) -> typing.Mapping[Label, typing.Sequence[str]]: for id, label in self.col_id_to_label.items(): mapping[label] = (*mapping.get(label, ()), id) return mapping - - def resolve_label_exact(self, label:Label) -> Optional[str]: + + def resolve_label_exact(self, label: Label) -> Optional[str]: """Returns the column id matching the label if there is exactly one such column. If there are multiple columns with the same name, raises an error. If there is no such column, returns None.""" @@ -286,8 +286,8 @@ def resolve_label_exact(self, label:Label) -> Optional[str]: f"Multiple columns matching id {label} were found. {constants.FEEDBACK_LINK}" ) return matches[0] if len(matches) != 0 else None - - def resolve_label_exact_or_error(self, label:Label) -> str: + + def resolve_label_exact_or_error(self, label: Label) -> str: """Returns the column id matching the label if there is exactly one such column. If there are multiple columns with the same name, raises an error. If there is no such column, raises an error too.""" diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 069f99c6e8..4ffa56c2e5 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -180,7 +180,9 @@ def __init__( ) block = block.set_index([r_mapping[idx_col] for idx_col in idx_cols]) if columns: - column_ids = [block.resolve_label_exact_or_error(label) for label in list(columns)] + column_ids = [ + block.resolve_label_exact_or_error(label) for label in list(columns) + ] block = block.select_columns(column_ids) # type:ignore if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) From 8d738b0576afe0df3038f38a888aaf9ae7e5f766 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 6 Feb 2025 20:21:24 +0000 Subject: [PATCH 4/4] polish doc --- bigframes/core/blocks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 945bac321b..c6e3096e51 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -279,7 +279,7 @@ def label_to_col_id(self) -> typing.Mapping[Label, typing.Sequence[str]]: def resolve_label_exact(self, label: Label) -> Optional[str]: """Returns the column id matching the label if there is exactly one such column. If there are multiple columns with the same name, - raises an error. If there is no such column, returns None.""" + raises an error. If there is no such a column, returns None.""" matches = self.label_to_col_id.get(label, []) if len(matches) > 1: raise ValueError( @@ -290,7 +290,7 @@ def resolve_label_exact(self, label: Label) -> Optional[str]: def resolve_label_exact_or_error(self, label: Label) -> str: """Returns the column id matching the label if there is exactly one such column. If there are multiple columns with the same name, - raises an error. If there is no such column, raises an error too.""" + raises an error. If there is no such a column, raises an error too.""" col_id = self.resolve_label_exact(label) if col_id is None: raise ValueError(f"Label {label} not found. {constants.FEEDBACK_LINK}")