From 962e1fd356141a288bc31f3d0790640cc9b4ea61 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 21 Aug 2024 15:28:56 +0000 Subject: [PATCH 1/3] docs: add a code sample using `bpd.options.bigquery.ordering_mode = "partial"` --- .../quickstart_ordering_mode_partial_test.py | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 samples/snippets/quickstart_ordering_mode_partial_test.py diff --git a/samples/snippets/quickstart_ordering_mode_partial_test.py b/samples/snippets/quickstart_ordering_mode_partial_test.py new file mode 100644 index 0000000000..22ce9a333c --- /dev/null +++ b/samples/snippets/quickstart_ordering_mode_partial_test.py @@ -0,0 +1,69 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bigframes +import bigframes.pandas + + +def test_quickstart() -> None: + # We need a fresh session since we're modifying connection options. + bigframes.pandas.close_session() + + # [START bigquery_bigframes_ordering_mode_partial] + import bigframes.pandas as bpd + + bpd.options.bigquery.ordering_mode = "partial" + # [END bigquery_bigframes_ordering_mode_partial] + + # Below is a copy of the main quickstart to check that it also works with + # this ordering mode. + + # Create a DataFrame from a BigQuery table + query_or_table = "bigquery-public-data.ml_datasets.penguins" + df = bpd.read_gbq(query_or_table) + + # Use the DataFrame just as you would a pandas DataFrame, but calculations + # happen in the BigQuery query engine instead of the local system. + average_body_mass = df["body_mass_g"].mean() + print(f"average_body_mass: {average_body_mass}") + + # Create the Linear Regression model + from bigframes.ml.linear_model import LinearRegression + + # Filter down to the data we want to analyze + adelie_data = df[df.species == "Adelie Penguin (Pygoscelis adeliae)"] + + # Drop the columns we don't care about + adelie_data = adelie_data.drop(columns=["species"]) + + # Drop rows with nulls to get our training data + training_data = adelie_data.dropna() + + # Pick feature columns and label column + X = training_data[ + [ + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + ] + y = training_data[["body_mass_g"]] + + model = LinearRegression(fit_intercept=False) + model.fit(X, y) + model.score(X, y) + + assert model is not None From a4255de67da940b822b1f352cffa9e43c9e7d0d6 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 21 Aug 2024 15:35:35 +0000 Subject: [PATCH 2/3] add warning filter too --- .../quickstart_ordering_mode_partial_test.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/samples/snippets/quickstart_ordering_mode_partial_test.py b/samples/snippets/quickstart_ordering_mode_partial_test.py index 22ce9a333c..c7917e4441 100644 --- a/samples/snippets/quickstart_ordering_mode_partial_test.py +++ b/samples/snippets/quickstart_ordering_mode_partial_test.py @@ -12,11 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import bigframes -import bigframes.pandas - def test_quickstart() -> None: + import bigframes.pandas + # We need a fresh session since we're modifying connection options. bigframes.pandas.close_session() @@ -26,6 +25,16 @@ def test_quickstart() -> None: bpd.options.bigquery.ordering_mode = "partial" # [END bigquery_bigframes_ordering_mode_partial] + # [START bigquery_bigframes_ordering_mode_partial_ambiguous_window_warning] + import warnings + + import bigframes.exceptions + + warnings.simplefilter( + "ignore", category=bigframes.exceptions.AmbiguousWindowWarning + ) + # [END bigquery_bigframes_ordering_mode_partial_ambiguous_window_warning] + # Below is a copy of the main quickstart to check that it also works with # this ordering mode. From 6bbd36a6d455692925eaf82d4e753ef0604cedda Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 21 Aug 2024 18:28:03 +0000 Subject: [PATCH 3/3] add drop_duplicates alternative --- .../snippets/ordering_mode_partial_test.py | 45 +++++++++++ .../quickstart_ordering_mode_partial_test.py | 78 ------------------- 2 files changed, 45 insertions(+), 78 deletions(-) create mode 100644 samples/snippets/ordering_mode_partial_test.py delete mode 100644 samples/snippets/quickstart_ordering_mode_partial_test.py diff --git a/samples/snippets/ordering_mode_partial_test.py b/samples/snippets/ordering_mode_partial_test.py new file mode 100644 index 0000000000..15ee4cb4ba --- /dev/null +++ b/samples/snippets/ordering_mode_partial_test.py @@ -0,0 +1,45 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_quickstart() -> None: + import bigframes.pandas + + # We need a fresh session since we're modifying connection options. + bigframes.pandas.close_session() + + # [START bigquery_bigframes_ordering_mode_partial] + import bigframes.pandas as bpd + + bpd.options.bigquery.ordering_mode = "partial" + # [END bigquery_bigframes_ordering_mode_partial] + + # [START bigquery_bigframes_ordering_mode_partial_ambiguous_window_warning] + import warnings + + import bigframes.exceptions + + warnings.simplefilter( + "ignore", category=bigframes.exceptions.AmbiguousWindowWarning + ) + # [END bigquery_bigframes_ordering_mode_partial_ambiguous_window_warning] + + df = bpd.DataFrame({"column": [1, 2, 1, 3, 1, 2, 3]}) + + # [START bigquery_bigframes_ordering_mode_partial_drop_duplicates] + # Avoid order dependency by using groupby instead of drop_duplicates. + unique_col = df.groupby(["column"], as_index=False).size().drop(columns="size") + # [END bigquery_bigframes_ordering_mode_partial_drop_duplicates] + + assert len(unique_col) == 3 diff --git a/samples/snippets/quickstart_ordering_mode_partial_test.py b/samples/snippets/quickstart_ordering_mode_partial_test.py deleted file mode 100644 index c7917e4441..0000000000 --- a/samples/snippets/quickstart_ordering_mode_partial_test.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def test_quickstart() -> None: - import bigframes.pandas - - # We need a fresh session since we're modifying connection options. - bigframes.pandas.close_session() - - # [START bigquery_bigframes_ordering_mode_partial] - import bigframes.pandas as bpd - - bpd.options.bigquery.ordering_mode = "partial" - # [END bigquery_bigframes_ordering_mode_partial] - - # [START bigquery_bigframes_ordering_mode_partial_ambiguous_window_warning] - import warnings - - import bigframes.exceptions - - warnings.simplefilter( - "ignore", category=bigframes.exceptions.AmbiguousWindowWarning - ) - # [END bigquery_bigframes_ordering_mode_partial_ambiguous_window_warning] - - # Below is a copy of the main quickstart to check that it also works with - # this ordering mode. - - # Create a DataFrame from a BigQuery table - query_or_table = "bigquery-public-data.ml_datasets.penguins" - df = bpd.read_gbq(query_or_table) - - # Use the DataFrame just as you would a pandas DataFrame, but calculations - # happen in the BigQuery query engine instead of the local system. - average_body_mass = df["body_mass_g"].mean() - print(f"average_body_mass: {average_body_mass}") - - # Create the Linear Regression model - from bigframes.ml.linear_model import LinearRegression - - # Filter down to the data we want to analyze - adelie_data = df[df.species == "Adelie Penguin (Pygoscelis adeliae)"] - - # Drop the columns we don't care about - adelie_data = adelie_data.drop(columns=["species"]) - - # Drop rows with nulls to get our training data - training_data = adelie_data.dropna() - - # Pick feature columns and label column - X = training_data[ - [ - "island", - "culmen_length_mm", - "culmen_depth_mm", - "flipper_length_mm", - "sex", - ] - ] - y = training_data[["body_mass_g"]] - - model = LinearRegression(fit_intercept=False) - model.fit(X, y) - model.score(X, y) - - assert model is not None