From 0ddd86b062d124a754dcb8087d114fbce7901970 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 19 Apr 2024 21:32:08 +0000 Subject: [PATCH 1/2] docs: set `index_cols` in `read_gbq` as a best practice --- third_party/bigframes_vendored/pandas/io/gbq.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index b5feeb13c5..520092204a 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -28,12 +28,14 @@ def read_gbq( """Loads a DataFrame from BigQuery. BigQuery tables are an unordered, unindexed data source. By default, - the DataFrame will have an arbitrary index and ordering. - - Set the `index_col` argument to one or more columns to choose an - index. The resulting DataFrame is sorted by the index columns. For the - best performance, ensure the index columns don't contain duplicate - values. + the DataFrame will have an arbitrary index and ordering. Generating + the default index uses an analytic windowed operation that prevents + many filtering push down operations. As a best practice, set the + ``index_col`` argument to one or more columns, especially on large + tables. + + Duplicate keys in an index are valid, but for the best performance, + ensure the index columns don't contain duplicate values. .. 
note:: By default, even SQL query inputs with an ORDER BY clause create a From b96cba38b29ced8a2814f4b2e1ecad2c72a9d6e0 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Mon, 22 Apr 2024 18:52:33 +0000 Subject: [PATCH 2/2] document behaviors --- .../bigframes_vendored/pandas/io/gbq.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 520092204a..c60a276338 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -27,15 +27,17 @@ def read_gbq( ): """Loads a DataFrame from BigQuery. - BigQuery tables are an unordered, unindexed data source. By default, - the DataFrame will have an arbitrary index and ordering. Generating - the default index uses an analytic windowed operation that prevents - many filtering push down operations. As a best practice, set the - ``index_col`` argument to one or more columns, especially on large - tables. - - Duplicate keys in an index are valid, but for the best performance, - ensure the index columns don't contain duplicate values. + BigQuery tables are an unordered, unindexed data source. To add support for + pandas-compatibility, the following indexing options are supported: + + * (Default behavior) Add an arbitrary sequential index and ordering + using an analytic windowed operation that prevents filtering + push down. + * (Recommended) Set the ``index_col`` argument to one or more columns. + Unique values for the row labels are recommended. Duplicate labels + are possible, but note that joins on a non-unique index can duplicate + rows and operations like ``cumsum()`` that window across a non-unique + index can have some non-determinism. .. note:: By default, even SQL query inputs with an ORDER BY clause create a