From 34ad9d0ff4c4cd1b637b513b1653487f18f5bcb9 Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Wed, 20 Sep 2023 01:01:37 +0000
Subject: [PATCH] fix: Fix header skipping logic in `read_csv`

Change-Id: Ib575e2c2b07f819d1dc499a271fea91107fbb8b4
---
 bigframes/session.py               |  7 +++----
 tests/system/small/test_session.py | 18 ++++++++++++------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/bigframes/session.py b/bigframes/session.py
index 04ae6ba454..7b827c7dcf 100644
--- a/bigframes/session.py
+++ b/bigframes/session.py
@@ -1050,11 +1050,10 @@ def read_csv(
         # We want to match pandas behavior. If header is 0, no rows should be skipped, so we
         # do not need to set `skip_leading_rows`. If header is None, then there is no header.
         # Setting skip_leading_rows to 0 does that. If header=N and N>0, we want to skip N rows.
-        # `skip_leading_rows` skips N-1 rows, so we set it to header+1.
-        if header is not None and header > 0:
-            job_config.skip_leading_rows = header + 1
-        elif header is None:
+        if header is None:
             job_config.skip_leading_rows = 0
+        elif header > 0:
+            job_config.skip_leading_rows = header
 
         return self._read_bigquery_load_job(
             filepath_or_buffer,
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index b7bee16ffd..614c953764 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -578,9 +578,12 @@ def test_read_csv_gcs_bq_engine_w_header(session, scalars_df_index, gcs_folder):
     path = gcs_folder + "test_read_csv_gcs_bq_engine_w_header*.csv"
     scalars_df_index.to_csv(path, index=False)
 
-    # Skip the header and the first 2 data rows. Without provided schema, the column names
-    # would be like `bool_field_0`, `string_field_1` and etc.
-    df = session.read_csv(path, header=2, engine="bigquery")
+    # Skip the header and the first 2 data rows. Note that `to_csv` also
+    # writes one header line, so we have to pass header=3 to `read_csv` to
+    # skip the header plus the two data rows.
+    # Without a provided schema, the column names would look like
+    # `bool_field_0`, `string_field_1`, etc.
+    df = session.read_csv(path, header=3, engine="bigquery")
     assert df.shape[0] == scalars_df_index.shape[0] - 2
     assert len(df.columns) == len(scalars_df_index.columns)
 
@@ -609,9 +612,12 @@ def test_read_csv_local_bq_engine_w_header(session, scalars_pandas_df_index):
     # Using the pandas to_csv method because the BQ one does not support local write.
     scalars_pandas_df_index.to_csv(path, index=False)
 
-    # Skip the header and the first 2 data rows. Without provided schema, the column names
-    # would be like `bool_field_0`, `string_field_1` and etc.
-    df = session.read_csv(path, header=2, engine="bigquery")
+    # Skip the header and the first 2 data rows. Note that `to_csv` also
+    # writes one header line, so we have to pass header=3 to `read_csv` to
+    # skip the header plus the two data rows.
+    # Without a provided schema, the column names would look like
+    # `bool_field_0`, `string_field_1`, etc.
+    df = session.read_csv(path, header=3, engine="bigquery")
     assert df.shape[0] == scalars_pandas_df_index.shape[0] - 2
     assert len(df.columns) == len(scalars_pandas_df_index.columns)
 
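
Reviewer note: a minimal sketch of the corrected header-to-skip_leading_rows
mapping, isolated from the session machinery. The helper name
`resolve_skip_leading_rows` is hypothetical (not part of this patch), and the
`LoadJobConfig` usage assumes the standard `google-cloud-bigquery` client.

    from typing import Optional

    from google.cloud import bigquery


    def resolve_skip_leading_rows(header: Optional[int]) -> Optional[int]:
        """Map a pandas-style `header` argument to BigQuery's `skip_leading_rows`.

        header=None: the file has no header row, so skip nothing (0).
        header=0:    the first row is the header; return None so that
                     `skip_leading_rows` is left unset.
        header=N>0:  skip the first N rows (the old code used N + 1, which
                     skipped one data row too many).
        """
        if header is None:
            return 0
        if header > 0:
            return header
        return None


    # Usage sketch: skip the first 3 lines of a CSV, as in the updated tests
    # (one header line written by `to_csv` plus two data rows).
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    skip = resolve_skip_leading_rows(3)
    if skip is not None:
        job_config.skip_leading_rows = skip
    assert job_config.skip_leading_rows == 3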