From 34ad9d0ff4c4cd1b637b513b1653487f18f5bcb9 Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Wed, 20 Sep 2023 01:01:37 +0000
Subject: [PATCH] fix: Fix header skipping logic in `read_csv`

Change-Id: Ib575e2c2b07f819d1dc499a271fea91107fbb8b4
---
 bigframes/session.py               |  7 +++----
 tests/system/small/test_session.py | 18 ++++++++++++------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/bigframes/session.py b/bigframes/session.py
index 04ae6ba454..7b827c7dcf 100644
--- a/bigframes/session.py
+++ b/bigframes/session.py
@@ -1050,11 +1050,10 @@ def read_csv(
         # We want to match pandas behavior. If header is 0, no rows should be skipped, so we
         # do not need to set `skip_leading_rows`. If header is None, then there is no header.
         # Setting skip_leading_rows to 0 does that. If header=N and N>0, we want to skip N rows.
-        # `skip_leading_rows` skips N-1 rows, so we set it to header+1.
-        if header is not None and header > 0:
-            job_config.skip_leading_rows = header + 1
-        elif header is None:
+        if header is None:
             job_config.skip_leading_rows = 0
+        elif header > 0:
+            job_config.skip_leading_rows = header
 
         return self._read_bigquery_load_job(
             filepath_or_buffer,
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index b7bee16ffd..614c953764 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -578,9 +578,12 @@ def test_read_csv_gcs_bq_engine_w_header(session, scalars_df_index, gcs_folder):
     path = gcs_folder + "test_read_csv_gcs_bq_engine_w_header*.csv"
     scalars_df_index.to_csv(path, index=False)
 
-    # Skip the header and the first 2 data rows. Without provided schema, the column names
-    # would be like `bool_field_0`, `string_field_1` and etc.
-    df = session.read_csv(path, header=2, engine="bigquery")
+    # Skip the header and the first 2 data rows. Note that `to_csv` also
+    # writes one header line, so we have to pass header=3 to `read_csv` to
+    # skip the header plus the two data rows.
+    # Without a provided schema, the column names would look like
+    # `bool_field_0`, `string_field_1`, etc.
+    df = session.read_csv(path, header=3, engine="bigquery")
     assert df.shape[0] == scalars_df_index.shape[0] - 2
     assert len(df.columns) == len(scalars_df_index.columns)
 
@@ -609,9 +612,12 @@ def test_read_csv_local_bq_engine_w_header(session, scalars_pandas_df_index):
     # Using the pandas to_csv method because the BQ one does not support local write.
     scalars_pandas_df_index.to_csv(path, index=False)
 
-    # Skip the header and the first 2 data rows. Without provided schema, the column names
-    # would be like `bool_field_0`, `string_field_1` and etc.
-    df = session.read_csv(path, header=2, engine="bigquery")
+    # Skip the header and the first 2 data rows. Note that `to_csv` also
+    # writes one header line, so we have to pass header=3 to `read_csv` to
+    # skip the header plus the two data rows.
+    # Without a provided schema, the column names would look like
+    # `bool_field_0`, `string_field_1`, etc.
+    df = session.read_csv(path, header=3, engine="bigquery")
     assert df.shape[0] == scalars_pandas_df_index.shape[0] - 2
     assert len(df.columns) == len(scalars_pandas_df_index.columns)
 
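
Reviewer note: a minimal sketch of the corrected header-to-skip_leading_rows
mapping, isolated from the session machinery. The helper name
`resolve_skip_leading_rows` is hypothetical (not part of this patch), and the
`LoadJobConfig` usage assumes the standard `google-cloud-bigquery` client.

    from typing import Optional

    from google.cloud import bigquery


    def resolve_skip_leading_rows(header: Optional[int]) -> Optional[int]:
        """Map a pandas-style `header` argument to BigQuery's `skip_leading_rows`.

        header=None: the file has no header row, so skip nothing (0).
        header=0:    the first row is the header; return None so that
                     `skip_leading_rows` is left unset.
        header=N>0:  skip the first N rows (the old code used N + 1, which
                     skipped one data row too many).
        """
        if header is None:
            return 0
        if header > 0:
            return header
        return None


    # Usage sketch: skip the first 3 lines of a CSV, as in the updated tests
    # (one header line written by `to_csv` plus two data rows).
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    skip = resolve_skip_leading_rows(3)
    if skip is not None:
        job_config.skip_leading_rows = skip
    assert job_config.skip_leading_rows == 3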