From dcf3b263f8761c7a83be81d6b67183127797dd0a Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Mon, 29 Jan 2024 14:07:21 -0600 Subject: [PATCH 01/10] create_single_timeseries_forecasting_model_test.py code sample --- ...ingle_timeseries_forecasting_model_test.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 samples/snippets/create_single_timeseries_forecasting_model_test.py diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py new file mode 100644 index 0000000000..a6df5f0297 --- /dev/null +++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py @@ -0,0 +1,66 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (t +# you may not use this file except in compliance wi +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in +# distributed under the License is distributed on a +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, eit +# See the License for the specific language governi +# limitations under the License. + + +def test_create_single_timeseries(random_model_id): + your_model_id = random_model_id + + # [START bigquery_dataframes_single_timeseries_forecasting_model_tutorial] + import bigframes.pandas as bpd + + # Start by selecting the data you'll use for training. `read_gbq` accepts + # either a SQL query or a table ID. Since this example selects from multiple + # tables via a wildcard, use SQL to define this data. Watch issue + # https://github.com/googleapis/python-bigquery-dataframes/issues/169 + # for updates to `read_gbq` to support wildcard tables. + + # Read and visualize the time series you want to forecast. 
+ df = bpd.read_gbq(''' + SELECT PARSE_TIMESTAMP("%Y%m%d", date) AS parsed_date, + SUM(totals.visits) AS total_visits + FROM + `bigquery-public-data.google_analytics_sample.ga_sessions_*` + GROUP BY date + ''') + X = df[["parsed_date"]] + y = df[["total_visits"]] + + # Create an Arima-based time series model using the Google Analytics 360 data. + from bigframes.ml.forecasting import ARIMAPlus + + ga_arima_model = ARIMAPlus() + + # Fit the model to your dataframe. + ga_arima_model.fit(X,y) + + # The model.fit() call above created a temporary model. + # Use the to_gbq() method to write to a permanent location. + ga_arima_model.to_gbq( + your_model_id, # For example: "bqml_tutorial.sample_model", + replace=True, + ) + + # Inspect the evaluation metrics of all evaluated models. + # when ruuning this function use same model, dataset, model name (str) + evaluation = ga_arima_model.summary( + f''' + SELECT * + FROM ML.ARIMA_EVALUATE(MODEL `{your_model_id}`) + ''' + ) + + print(evaluation) + # Inspect the coefficients of your model + + \ No newline at end of file From adc22ef8a872227ee49ee0907b35eb8e5775bab9 Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Tue, 20 Feb 2024 09:46:43 -0600 Subject: [PATCH 02/10] fix: forecast method to forecast time series --- ...ingle_timeseries_forecasting_model_test.py | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py index a6df5f0297..6429f613a4 100644 --- a/samples/snippets/create_single_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py @@ -52,15 +52,30 @@ def test_create_single_timeseries(random_model_id): ) # Inspect the evaluation metrics of all evaluated models. 
- # when ruuning this function use same model, dataset, model name (str) + # when running this function use same model, dataset, model name (str) evaluation = ga_arima_model.summary( - f''' - SELECT * - FROM ML.ARIMA_EVALUATE(MODEL `{your_model_id}`) - ''' + show_all_candidate_models = False, ) print(evaluation) + # Inspect the coefficients of your model - - \ No newline at end of file + f''' + SELECT * + FROM ML.ARIMA_COEFFICIENTS(MODEL `{your_model_id}`) + ''' + evaluation.ML.ARIMA_COEFFICIENTS() + + # Use your model to forecast the time series + #standardSQL + your_model_id.forecast() + + # Explain and visualize the forecasting results + f''' + SELECT * + FROM ML.EXPLAIN_FORECAST( + MODEL `{your_model_id}`, + STRUCT( + [horizon AS horizon] + [, confidence_level AS confidence_level])) + ''' \ No newline at end of file From d3ea7c79affca6d1edcf38da84c9525ed61df765 Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Wed, 13 Mar 2024 12:10:10 -0500 Subject: [PATCH 03/10] pair programming PR draft creation --- ...ingle_timeseries_forecasting_model_test.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py index 6429f613a4..a91f6d07b7 100644 --- a/samples/snippets/create_single_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py @@ -26,21 +26,21 @@ def test_create_single_timeseries(random_model_id): # for updates to `read_gbq` to support wildcard tables. # Read and visualize the time series you want to forecast. 
- df = bpd.read_gbq(''' - SELECT PARSE_TIMESTAMP("%Y%m%d", date) AS parsed_date, - SUM(totals.visits) AS total_visits - FROM - `bigquery-public-data.google_analytics_sample.ga_sessions_*` - GROUP BY date - ''') - X = df[["parsed_date"]] - y = df[["total_visits"]] + df = bpd.read_gbq( + 'bigquery-public-data.google_analytics_sample.ga_sessions_*' + ) + parsed_date = bpd.to_datetime(df.date, format= "%Y%m%d", utc = True) + total_visits = df.groupby(["date"])["parsed_date"].sum() + visits = df["totals"].struct.field("visits") - # Create an Arima-based time series model using the Google Analytics 360 data. + # Create an Arima-based time series model using the Google Analytics 360 data. from bigframes.ml.forecasting import ARIMAPlus ga_arima_model = ARIMAPlus() + X = df[["parsed_date"]] + y = df[["total_visits"]] + # Fit the model to your dataframe. ga_arima_model.fit(X,y) From 17cbd68f87a85021fecfd44da2e92e91d4ecca9e Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Fri, 5 Apr 2024 15:15:30 -0500 Subject: [PATCH 04/10] tutorial step 7 & 8 --- ..._single_timeseries_forecasting_model_test.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py index a91f6d07b7..b7ac8d8dc7 100644 --- a/samples/snippets/create_single_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py @@ -78,4 +78,19 @@ def test_create_single_timeseries(random_model_id): STRUCT( [horizon AS horizon] [, confidence_level AS confidence_level])) - ''' \ No newline at end of file + ''' + total_visits.plot.line(x = 'history_timestamp', y = 'history_value') + + # Visualize the forecasting results without having decompose_time_series enabled. 
+ df = bpd.read_gbq( + 'bigquery-public-data.google_analytics_sample.ga_sessions_*' + ) + parsed_date = bpd.to_datetime(df.date, format= "%Y%m%d", utc = True) + visits = df["totals"].struct.field("visits") + df = bpd.DataFrame( + { + 'history_timestamp': parsed_date, + 'history_value': visits, + } + ) + total_visits = df.groupby(["history_timestamp"], as_index = False).sum(numeric_only= True) From 2ecceaeba70b27a88aaf5421db9e8706b37bd216 Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Thu, 18 Apr 2024 14:11:49 -0500 Subject: [PATCH 05/10] concat function for visualizing forecasting results --- .../create_single_timeseries_forecasting_model_test.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py index b7ac8d8dc7..f2ba60fa7d 100644 --- a/samples/snippets/create_single_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py @@ -68,7 +68,7 @@ def test_create_single_timeseries(random_model_id): # Use your model to forecast the time series #standardSQL - your_model_id.forecast() + your_model_id.predict() # Explain and visualize the forecasting results f''' @@ -94,3 +94,10 @@ def test_create_single_timeseries(random_model_id): } ) total_visits = df.groupby(["history_timestamp"], as_index = False).sum(numeric_only= True) + + + history_df = bpd.read_gbq(df) + forecast_df = bpd.read_gbq(total_visits) + + # Concat DataFrame + combined_df = bpd.concat([history_df, forecast_df], ignore_index=True) From 8a2a61275309745f212ea63bda3d831fb388adbd Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Fri, 19 Apr 2024 15:49:58 -0500 Subject: [PATCH 06/10] docs: single time series code sample step 2 --- ...ingle_timeseries_forecasting_model_test.py | 69 ------------------- 1 
file changed, 69 deletions(-) diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py index f2ba60fa7d..509c163e3f 100644 --- a/samples/snippets/create_single_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py @@ -32,72 +32,3 @@ def test_create_single_timeseries(random_model_id): parsed_date = bpd.to_datetime(df.date, format= "%Y%m%d", utc = True) total_visits = df.groupby(["date"])["parsed_date"].sum() visits = df["totals"].struct.field("visits") - - # Create an Arima-based time series model using the Google Analytics 360 data. - from bigframes.ml.forecasting import ARIMAPlus - - ga_arima_model = ARIMAPlus() - - X = df[["parsed_date"]] - y = df[["total_visits"]] - - # Fit the model to your dataframe. - ga_arima_model.fit(X,y) - - # The model.fit() call above created a temporary model. - # Use the to_gbq() method to write to a permanent location. - ga_arima_model.to_gbq( - your_model_id, # For example: "bqml_tutorial.sample_model", - replace=True, - ) - - # Inspect the evaluation metrics of all evaluated models. - # when running this function use same model, dataset, model name (str) - evaluation = ga_arima_model.summary( - show_all_candidate_models = False, - ) - - print(evaluation) - - # Inspect the coefficients of your model - f''' - SELECT * - FROM ML.ARIMA_COEFFICIENTS(MODEL `{your_model_id}`) - ''' - evaluation.ML.ARIMA_COEFFICIENTS() - - # Use your model to forecast the time series - #standardSQL - your_model_id.predict() - - # Explain and visualize the forecasting results - f''' - SELECT * - FROM ML.EXPLAIN_FORECAST( - MODEL `{your_model_id}`, - STRUCT( - [horizon AS horizon] - [, confidence_level AS confidence_level])) - ''' - total_visits.plot.line(x = 'history_timestamp', y = 'history_value') - - # Visualize the forecasting results without having decompose_time_series enabled. 
- df = bpd.read_gbq( - 'bigquery-public-data.google_analytics_sample.ga_sessions_*' - ) - parsed_date = bpd.to_datetime(df.date, format= "%Y%m%d", utc = True) - visits = df["totals"].struct.field("visits") - df = bpd.DataFrame( - { - 'history_timestamp': parsed_date, - 'history_value': visits, - } - ) - total_visits = df.groupby(["history_timestamp"], as_index = False).sum(numeric_only= True) - - - history_df = bpd.read_gbq(df) - forecast_df = bpd.read_gbq(total_visits) - - # Concat DataFrame - combined_df = bpd.concat([history_df, forecast_df], ignore_index=True) From 673c6e380ecc682366fba11c933b810d4d4b0bd5 Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Mon, 22 Apr 2024 12:50:14 -0500 Subject: [PATCH 07/10] suggested changes to step 2 --- ...te_single_timeseries_forecasting_model_test.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py index 509c163e3f..bdc59673c4 100644 --- a/samples/snippets/create_single_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (t # you may not use this file except in compliance wi @@ -19,16 +19,17 @@ def test_create_single_timeseries(random_model_id): # [START bigquery_dataframes_single_timeseries_forecasting_model_tutorial] import bigframes.pandas as bpd - # Start by selecting the data you'll use for training. `read_gbq` accepts - # either a SQL query or a table ID. Since this example selects from multiple - # tables via a wildcard, use SQL to define this data. Watch issue - # https://github.com/googleapis/python-bigquery-dataframes/issues/169 - # for updates to `read_gbq` to support wildcard tables. 
+ # Start by selecting the data that you'll be querying from bigquery-public-data.google_analytics_sample.ga_sessions_* + # The read_gbq function accepts table expressions or SQL + # the clause indicates that you are querying the ga_sessions_* tables in the google_analytics_sample dataset # Read and visualize the time series you want to forecast. df = bpd.read_gbq( 'bigquery-public-data.google_analytics_sample.ga_sessions_*' ) parsed_date = bpd.to_datetime(df.date, format= "%Y%m%d", utc = True) - total_visits = df.groupby(["date"])["parsed_date"].sum() visits = df["totals"].struct.field("visits") + total_visits = visits.groupby(parsed_date).sum() + total_visits.plot.line() + + # [END bigquery_dataframes_single_timeseries_forecasting_model_tutorial] \ No newline at end of file From 647b1c20b061ad84d5e28ccac670810958a9da06 Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Mon, 22 Apr 2024 12:53:00 -0500 Subject: [PATCH 08/10] docs: edit text because wildcard tables are supported --- .../create_single_timeseries_forecasting_model_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py index bdc59673c4..6288346bf5 100644 --- a/samples/snippets/create_single_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py @@ -20,8 +20,8 @@ def test_create_single_timeseries(random_model_id): import bigframes.pandas as bpd # Start by selecting the data that you'll be querying from bigquery-public-data.google_analytics_sample.ga_sessions_* - # The read_gbq function accepts table expressions or SQL - # the clause indicates that you are querying the ga_sessions_* tables in the google_analytics_sample dataset + # The read_gbq function accepts the wildcard table expressions and this clause indicates that + # you are querying 
the ga_sessions_* tables in the google_analytics_sample dataset # Read and visualize the time series you want to forecast. df = bpd.read_gbq( From 310d33d957eb6f2fe90adb02868259b208cfb7ac Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Mon, 22 Apr 2024 15:28:41 -0500 Subject: [PATCH 09/10] code review changes --- ..._single_timeseries_forecasting_model_test.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py index 6288346bf5..23b00fb10e 100644 --- a/samples/snippets/create_single_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py @@ -19,9 +19,8 @@ def test_create_single_timeseries(random_model_id): # [START bigquery_dataframes_single_timeseries_forecasting_model_tutorial] import bigframes.pandas as bpd - # Start by selecting the data that you'll be querying from bigquery-public-data.google_analytics_sample.ga_sessions_* - # The read_gbq function accepts the wildcard table expressions and this clause indicates that - # you are querying the ga_sessions_* tables in the google_analytics_sample dataset + # Start by loading the historical data from BigQuerythat you want to analyze and forecast. + # This clause indicates that you are querying the ga_sessions_* tables in the google_analytics_sample dataset. # Read and visualize the time series you want to forecast. 
 df = bpd.read_gbq( @@ -30,6 +29,16 @@ def test_create_single_timeseries(random_model_id): parsed_date = bpd.to_datetime(df.date, format= "%Y%m%d", utc = True) visits = df["totals"].struct.field("visits") total_visits = visits.groupby(parsed_date).sum() - total_visits.plot.line() + + # Expected output: total_visits.head() + # date + # 2016-08-01 00:00:00+00:00 1711 + # 2016-08-02 00:00:00+00:00 2140 + # 2016-08-03 00:00:00+00:00 2890 + # 2016-08-04 00:00:00+00:00 3161 + # 2016-08-05 00:00:00+00:00 2702 + # Name: visits, dtype: Int64 + total_visits.plot.line() + # [END bigquery_dataframes_single_timeseries_forecasting_model_tutorial] \ No newline at end of file From 88667a93e27b0a9d09c352df490b39f33dc0f77c Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Mon, 22 Apr 2024 15:32:26 -0500 Subject: [PATCH 10/10] remove unused variable --- ...ingle_timeseries_forecasting_model_test.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py index 23b00fb10e..5750933713 100644 --- a/samples/snippets/create_single_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py @@ -13,23 +13,19 @@ # limitations under the License. -def test_create_single_timeseries(random_model_id): - your_model_id = random_model_id +def test_create_single_timeseries(): # [START bigquery_dataframes_single_timeseries_forecasting_model_tutorial] import bigframes.pandas as bpd - - # Start by loading the historical data from BigQuerythat you want to analyze and forecast. + + # Start by loading the historical data from BigQuery that you want to analyze and forecast. # This clause indicates that you are querying the ga_sessions_* tables in the google_analytics_sample dataset. - # Read and visualize the time series you want to forecast. 
- df = bpd.read_gbq( - 'bigquery-public-data.google_analytics_sample.ga_sessions_*' - ) - parsed_date = bpd.to_datetime(df.date, format= "%Y%m%d", utc = True) + df = bpd.read_gbq("bigquery-public-data.google_analytics_sample.ga_sessions_*") + parsed_date = bpd.to_datetime(df.date, format="%Y%m%d", utc=True) visits = df["totals"].struct.field("visits") total_visits = visits.groupby(parsed_date).sum() - + # Expected output: total_visits.head() # date # 2016-08-01 00:00:00+00:00 1711 @@ -40,5 +36,5 @@ def test_create_single_timeseries(random_model_id): # Name: visits, dtype: Int64 total_visits.plot.line() - - # [END bigquery_dataframes_single_timeseries_forecasting_model_tutorial] \ No newline at end of file + + # [END bigquery_dataframes_single_timeseries_forecasting_model_tutorial]