fix: read_csv with both index_col and use_cols inconsistent with pandas #1785

Merged · 4 commits · Jun 6, 2025
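For context, pandas allows `read_csv` to take an `index_col` that is also listed in `usecols`: the column is read, promoted to the index, and dropped from the value columns. A minimal sketch of the pandas behavior this PR aligns the BigQuery engine with (the inline data is hypothetical):

```python
from io import StringIO

import pandas as pd

# Hypothetical CSV; the column names mirror the fixtures in the tests below.
csv = StringIO("rowindex,bool_col\n0,True\n1,False\n")

# pandas accepts the overlap between `usecols` and `index_col`.
df = pd.read_csv(csv, usecols=["rowindex", "bool_col"], index_col="rowindex")
print(df.columns.tolist())  # ['bool_col'] -- "rowindex" became the index
```

Before this change, passing the same arguments with `engine="bigquery"` raised a `ValueError`, because `index_col` and `usecols` were required to be disjoint.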
bigframes/session/__init__.py (6 changes: 5 additions & 1 deletion)

@@ -1170,7 +1170,11 @@ def _read_csv_w_bigquery_engine(

         table_id = self._loader.load_file(filepath_or_buffer, job_config=job_config)
         df = self._loader.read_gbq_table(
-            table_id, index_col=index_col, columns=columns, names=names
+            table_id,
+            index_col=index_col,
+            columns=columns,
+            names=names,
+            index_col_in_columns=True,
         )
 
         if dtype is not None:
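With `index_col_in_columns=True` forwarded above, a user-level call like the following should now behave as it does in pandas (a hedged sketch; the GCS path is hypothetical):

```python
import bigframes.pandas as bpd

# "rowindex" appears in both `usecols` and `index_col`; previously the
# BigQuery engine rejected this overlap with a ValueError.
bf_df = bpd.read_csv(
    "gs://my-bucket/scalars.csv",  # hypothetical path
    engine="bigquery",
    usecols=["rowindex", "bool_col"],
    index_col="rowindex",
)
```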
bigframes/session/loader.py (118 changes: 110 additions & 8 deletions)

@@ -96,7 +96,31 @@ def _to_index_cols(
     return index_cols
 
 
-def _check_column_duplicates(index_cols: Iterable[str], columns: Iterable[str]):
+def _check_column_duplicates(
+    index_cols: Iterable[str], columns: Iterable[str], index_col_in_columns: bool
+) -> Iterable[str]:
+    """Validates and processes index and data columns for duplicates and overlap.
+
+    This function performs two main tasks:
+
+    1. Ensures there are no duplicate column names within the `index_cols`
+       list or within the `columns` list.
+    2. Based on the `index_col_in_columns` flag, it validates the
+       relationship between `index_cols` and `columns`.
+
+    Args:
+        index_cols (Iterable[str]):
+            An iterable of column names designated as the index.
+        columns (Iterable[str]):
+            An iterable of column names designated as the data columns.
+        index_col_in_columns (bool):
+            A flag indicating how to handle overlap between `index_cols` and
+            `columns`.
+
+            - If `False`, the two lists must be disjoint (contain no common
+              elements). An error is raised if any overlap is found.
+            - If `True`, `index_cols` is expected to be a subset of
+              `columns`. An error is raised if an index column is not found
+              in the `columns` list.
+    """
     index_cols_list = list(index_cols) if index_cols is not None else []
     columns_list = list(columns) if columns is not None else []
     set_index = set(index_cols_list)
@@ -108,17 +132,29 @@ def _check_column_duplicates(index_cols: Iterable[str], columns: Iterable[str]):
"All column names specified in 'index_col' must be unique."
)

if len(columns_list) == 0:
return columns

if len(columns_list) > len(set_columns):
raise ValueError(
"The 'columns' argument contains duplicate names. "
"All column names specified in 'columns' must be unique."
)

if not set_index.isdisjoint(set_columns):
raise ValueError(
"Found column names that exist in both 'index_col' and 'columns' arguments. "
"These arguments must specify distinct sets of columns."
)
if index_col_in_columns:
if not set_index.issubset(set_columns):
raise ValueError(
f"The specified index column(s) were not found: {set_index - set_columns}. "
f"Available columns are: {set_columns}"
)
return [col for col in columns if col not in set_index]
else:
if not set_index.isdisjoint(set_columns):
raise ValueError(
"Found column names that exist in both 'index_col' and 'columns' arguments. "
"These arguments must specify distinct sets of columns."
)
return columns


@dataclasses.dataclass
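A short sketch of the helper's two contracts (it is a private function, so the import below assumes this module layout; the return values follow the code above):

```python
from bigframes.session.loader import _check_column_duplicates

# Query path (index_col_in_columns=False): the two sets must be disjoint,
# and `columns` is passed through unchanged.
cols = _check_column_duplicates(["idx"], ["a", "b"], index_col_in_columns=False)
assert list(cols) == ["a", "b"]

# read_csv path (index_col_in_columns=True): the index must be a subset of
# `columns` and is stripped from the returned value columns.
cols = _check_column_duplicates(["idx"], ["idx", "a"], index_col_in_columns=True)
assert cols == ["a"]
```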
@@ -391,6 +427,7 @@ def read_gbq_table( # type: ignore[overload-overlap]
         dry_run: Literal[False] = ...,
         force_total_order: Optional[bool] = ...,
         n_rows: Optional[int] = None,
+        index_col_in_columns: bool = False,
     ) -> dataframe.DataFrame:
         ...

@@ -413,6 +450,7 @@ def read_gbq_table(
         dry_run: Literal[True] = ...,
         force_total_order: Optional[bool] = ...,
         n_rows: Optional[int] = None,
+        index_col_in_columns: bool = False,
     ) -> pandas.Series:
         ...

@@ -434,7 +472,67 @@ def read_gbq_table(
         dry_run: bool = False,
         force_total_order: Optional[bool] = None,
         n_rows: Optional[int] = None,
+        index_col_in_columns: bool = False,
     ) -> dataframe.DataFrame | pandas.Series:
+        """Read a BigQuery table into a BigQuery DataFrames DataFrame.
+
+        This method allows you to create a DataFrame from a BigQuery table.
+        You can specify the columns to load, an index column, and apply
+        filters.
+
+        Args:
+            table_id (str):
+                The identifier of the BigQuery table to read.
+            index_col (Iterable[str] | str | Iterable[int] | int | bigframes.enums.DefaultIndexKind, optional):
+                The column(s) to use as the index for the DataFrame. This can
+                be a single column name or a list of column names. If not
+                provided, a default index will be used based on the session's
+                ``default_index_type``.
+            columns (Iterable[str], optional):
+                The columns to read from the table. If not specified, all
+                columns will be read.
+            names (Optional[Iterable[str]], optional):
+                A list of column names to use for the resulting DataFrame. This
+                is useful if you want to rename the columns as you read the
+                data.
+            max_results (Optional[int], optional):
+                The maximum number of rows to retrieve from the table. If not
+                specified, all rows will be loaded.
+            use_cache (bool, optional):
+                Whether to use cached results for the query. Defaults to True.
+                Setting this to False will force a re-execution of the query.
+            filters (third_party_pandas_gbq.FiltersType, optional):
+                A list of filters to apply to the data. Filters are specified
+                as a list of tuples, where each tuple contains a column name,
+                an operator (e.g., '==', '!='), and a value.
+            enable_snapshot (bool, optional):
+                If True, a snapshot of the table is used to ensure that the
+                DataFrame is deterministic, even if the underlying table
+                changes. Defaults to True.
+            dry_run (bool, optional):
+                If True, the function will not actually execute the query but
+                will instead return statistics about the table. Defaults to
+                False.
+            force_total_order (Optional[bool], optional):
+                If True, a total ordering is enforced on the DataFrame, which
+                can be useful for operations that require a stable row order.
+                If None, the session's default behavior is used.
+            n_rows (Optional[int], optional):
+                The number of rows to consider for type inference and other
+                metadata operations. This does not limit the number of rows
+                in the final DataFrame.
+            index_col_in_columns (bool, optional):
+                Specifies if the ``index_col`` is also present in the
+                ``columns`` list. Defaults to ``False``.
+
+                * If ``False``, ``index_col`` and ``columns`` must specify
+                  distinct sets of columns. An error will be raised if any
+                  column is found in both.
+                * If ``True``, the column(s) in ``index_col`` are expected to
+                  also be present in the ``columns`` list. This is useful
+                  when the index is selected from the data columns (e.g., in a
+                  ``read_csv`` scenario). The column will be used as the
+                  DataFrame's index and removed from the list of value columns.
+        """
         import bigframes._tools.strings
         import bigframes.dataframe as dataframe
 
@@ -516,7 +614,9 @@ def read_gbq_table(
             index_col=index_col,
             names=names,
         )
-        _check_column_duplicates(index_cols, columns)
+        columns = list(
+            _check_column_duplicates(index_cols, columns, index_col_in_columns)
+        )
 
         for key in index_cols:
             if key not in table_column_names:
@@ -798,7 +898,9 @@ def read_gbq_query(
         )
 
         index_cols = _to_index_cols(index_col)
-        _check_column_duplicates(index_cols, columns)
+        columns = _check_column_duplicates(
+            index_cols, columns, index_col_in_columns=False
+        )
 
         filters_copy1, filters_copy2 = itertools.tee(filters)
         has_filters = len(list(filters_copy1)) != 0
tests/system/small/test_session.py (76 changes: 49 additions & 27 deletions)

@@ -1320,10 +1320,6 @@ def test_read_csv_for_names_less_than_columns(session, df_and_gcs_csv_for_two_co
     assert bf_df.shape == pd_df.shape
     assert bf_df.columns.tolist() == pd_df.columns.tolist()
 
-    # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs
-    # (b/280889935) or guarantee row ordering.
-    bf_df = bf_df.sort_index()
-
     # Pandas's index name is None, while BigFrames's index name is "rowindex".
     pd_df.index.name = "rowindex"
     pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas())
@@ -1479,41 +1475,70 @@ def test_read_csv_for_gcs_file_w_header(session, df_and_gcs_csv, header):
 def test_read_csv_w_usecols(session, df_and_local_csv):
     # Compares results for pandas and bigframes engines
     scalars_df, path = df_and_local_csv
+    usecols = ["rowindex", "bool_col"]
     with open(path, "rb") as buffer:
         bf_df = session.read_csv(
             buffer,
             engine="bigquery",
-            usecols=["bool_col"],
+            usecols=usecols,
         )
     with open(path, "rb") as buffer:
         # Convert default pandas dtypes to match BigQuery DataFrames dtypes.
         pd_df = session.read_csv(
             buffer,
-            usecols=["bool_col"],
+            usecols=usecols,
             dtype=scalars_df[["bool_col"]].dtypes.to_dict(),
         )
 
-    # Cannot compare two dataframe due to b/408499371.
-    assert len(bf_df.columns) == 1
-    assert len(pd_df.columns) == 1
+    assert bf_df.shape == pd_df.shape
+    assert bf_df.columns.tolist() == pd_df.columns.tolist()
+
+    # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs
+    # (b/280889935) or guarantee row ordering.
+    bf_df = bf_df.set_index("rowindex").sort_index()
+    pd_df = pd_df.set_index("rowindex")
+    pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas())
 
 
-@pytest.mark.parametrize(
-    "engine",
-    [
-        pytest.param("bigquery", id="bq_engine"),
-        pytest.param(None, id="default_engine"),
-    ],
-)
-def test_read_csv_local_w_usecols(session, scalars_pandas_df_index, engine):
-    with tempfile.TemporaryDirectory() as dir:
-        path = dir + "/test_read_csv_local_w_usecols.csv"
-        # Using the pandas to_csv method because the BQ one does not support local write.
-        scalars_pandas_df_index.to_csv(path, index=False)
-
-        # df should only have 1 column which is bool_col.
-        df = session.read_csv(path, usecols=["bool_col"], engine=engine)
-        assert len(df.columns) == 1
+def test_read_csv_w_usecols_and_indexcol(session, df_and_local_csv):
+    # Compares results for pandas and bigframes engines
+    scalars_df, path = df_and_local_csv
+    usecols = ["rowindex", "bool_col"]
+    with open(path, "rb") as buffer:
+        bf_df = session.read_csv(
+            buffer,
+            engine="bigquery",
+            usecols=usecols,
+            index_col="rowindex",
+        )
+    with open(path, "rb") as buffer:
+        # Convert default pandas dtypes to match BigQuery DataFrames dtypes.
+        pd_df = session.read_csv(
+            buffer,
+            usecols=usecols,
+            index_col="rowindex",
+            dtype=scalars_df[["bool_col"]].dtypes.to_dict(),
+        )
+
+    assert bf_df.shape == pd_df.shape
+    assert bf_df.columns.tolist() == pd_df.columns.tolist()
+
+    pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df.to_pandas())
+
+
+def test_read_csv_w_indexcol_not_in_usecols(session, df_and_local_csv):
+    _, path = df_and_local_csv
+    with open(path, "rb") as buffer:
+        with pytest.raises(
+            ValueError,
+            match=re.escape("The specified index column(s) were not found"),
+        ):
+            session.read_csv(
+                buffer,
+                engine="bigquery",
+                usecols=["bool_col"],
+                index_col="rowindex",
+            )
 
 
 @pytest.mark.parametrize(
@@ -1553,9 +1578,6 @@ def test_read_csv_local_w_encoding(session, penguins_pandas_df_default_index):
     bf_df = session.read_csv(
         path, engine="bigquery", index_col="rowindex", encoding="ISO-8859-1"
     )
-    # BigFrames requires `sort_index()` because BigQuery doesn't preserve row IDs
-    # (b/280889935) or guarantee row ordering.
-    bf_df = bf_df.sort_index()
     pd.testing.assert_frame_equal(
         bf_df.to_pandas(), penguins_pandas_df_default_index
     )