chore: ensure colab sample notebooks are tested #351
Changes from all commits: 3593c7c, f91ce03, a3ec546, 5904960, b5cf2ed, 0e8ece0, ba097bf, dfee838, f72dd2e, 53b0e8a, c92b9da, e5d9191, 4bc0cdb, ecd59c8
noxfile.py
@@ -657,9 +657,23 @@ def system_prerelease(session: nox.sessions.Session):


 @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS)
-def notebook(session):
+def notebook(session: nox.Session):
+    GOOGLE_CLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+    if not GOOGLE_CLOUD_PROJECT:
+        session.error(
+            "Set GOOGLE_CLOUD_PROJECT environment variable to run notebook session."
+        )
+
     session.install("-e", ".[all]")
-    session.install("pytest", "pytest-xdist", "pytest-retry", "nbmake")
+    session.install(
+        "pytest",
+        "pytest-xdist",
+        "pytest-retry",
+        "nbmake",
+        "google-cloud-aiplatform",
+        "matplotlib",
+        "seaborn",
+    )

     notebooks_list = list(Path("notebooks/").glob("*/*.ipynb"))

@@ -669,19 +683,22 @@ def notebook(session):
         # These notebooks contain special colab `param {type:"string"}`
         # comments, which make it easy for customers to fill in their
         # own information.
+        #
+        # With the notebooks_fill_params.py script, we are able to find and
+        # replace the PROJECT_ID parameter, but not the others.
+        #
         # TODO(ashleyxu): Test these notebooks by replacing parameters with
         # appropriate values and omitting cleanup logic that may break
         # our test infrastructure.
-        "notebooks/getting_started/getting_started_bq_dataframes.ipynb",
-        "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb",
-        "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb",
-        "notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb",
-        "notebooks/regression/bq_dataframes_ml_linear_regression.ipynb",
-        "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb",
-        "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb",
-        "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb",
-        "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb",
-        "notebooks/visualization/bq_dataframes_covid_line_graphs.ipynb",
+        "notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb",  # Needs DATASET.
+        "notebooks/regression/bq_dataframes_ml_linear_regression.ipynb",  # Needs DATASET_ID.
+        "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb",  # Needs CONNECTION.
+        # TODO(swast): investigate why we get 404 errors, even though
+        # bq_dataframes_llm_code_generation creates a bucket in the sample.
+        "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb",  # Needs BUCKET_URI.
+        "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb",  # Needs BUCKET_URI.
+        "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb",  # Needs BUCKET_URI.
+        "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb",  # Needs BUCKET_URI.
         # The experimental notebooks imagine features that don't yet
         # exist or only exist as temporary prototypes.
         "notebooks/experimental/longer_ml_demo.ipynb",

@@ -709,9 +726,9 @@ def notebook(session):
         for nb, regions in notebooks_reg.items()
     }

-    # For some reason nbmake exits silently with "no tests ran" message if
+    # The pytest --nbmake exits silently with "no tests ran" message if
     # one of the notebook paths supplied does not exist. Let's make sure that
-    # each path exists
+    # each path exists.
     for nb in notebooks + list(notebooks_reg):
         assert os.path.exists(nb), nb

@@ -723,16 +740,33 @@ def notebook(session):
     pytest_command = [
         "py.test",
         "--nbmake",
-        "--nbmake-timeout=600",
+        "--nbmake-timeout=900",  # 15 minutes
    ]

-    # Run self-contained notebooks in single session.run
-    # achieve parallelization via -n
-    session.run(
-        *pytest_command,
-        "-nauto",
-        *notebooks,
-    )
+    try:
+        # Populate notebook parameters and make a backup so that the notebooks
+        # are runnable.
+        session.run(
+            "python",
+            CURRENT_DIRECTORY / "scripts" / "notebooks_fill_params.py",
+            *notebooks,
+        )
+
+        # Run self-contained notebooks in single session.run
+        # achieve parallelization via -n
+        session.run(
+            *pytest_command,
+            "-nauto",
+            *notebooks,
+        )
+    finally:
+        # Prevent our notebook changes from getting checked in to git
+        # accidentally.
+        session.run(
+            "python",
+            CURRENT_DIRECTORY / "scripts" / "notebooks_restore_from_backup.py",
+            *notebooks,
+        )

     # Run regionalized notebooks in parallel session.run's, since each notebook
     # takes a different region via env param.

Review thread on the `--nbmake-timeout=900` line:

Reviewer: Is there a reason why we increase the timeout limit specifically to 15 minutes? Do we test the boundary?

Author: The default was too slow for the LLM sample when I tested locally.
scripts/notebooks_fill_params.py (new file)
@@ -0,0 +1,65 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import re
import shutil
import sys

GOOGLE_CLOUD_PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"]


def make_backup(notebook_path: str):
    shutil.copy(
        notebook_path,
        f"{notebook_path}.backup",
    )


def replace_project(line):
    """
    Notebooks contain special colab `param {type:"string"}`
    comments, which make it easy for customers to fill in their
    own information.
    """
    # Make sure we're robust to whitespace differences.
    cleaned = re.sub(r"\s", "", line)
    if cleaned == 'PROJECT_ID=""#@param{type:"string"}':
        return f'PROJECT_ID = "{GOOGLE_CLOUD_PROJECT}" # @param {{type:"string"}}\n'
    else:
        return line


def replace_params(notebook_path: str):
    with open(notebook_path, "r", encoding="utf-8") as notebook_file:
        notebook_json = json.load(notebook_file)

    for cell in notebook_json["cells"]:
        lines = cell.get("source", [])
        new_lines = [replace_project(line) for line in lines]
        cell["source"] = new_lines

    with open(notebook_path, "w", encoding="utf-8") as notebook_file:
        json.dump(notebook_json, notebook_file, indent=2, ensure_ascii=False)


def main(notebook_paths):
    for notebook_path in notebook_paths:
        make_backup(notebook_path)
        replace_params(notebook_path)


if __name__ == "__main__":
    main(sys.argv[1:])
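
For illustration, here is the substitution replace_project performs on a typical Colab parameter cell (a sketch; the project ID is an assumed example value):

import os
import re

project = os.environ.get("GOOGLE_CLOUD_PROJECT", "my-test-project")  # assumed example

line = 'PROJECT_ID = ""  # @param {type:"string"}\n'
# The script strips all whitespace before comparing, so spacing variations match.
if re.sub(r"\s", "", line) == 'PROJECT_ID=""#@param{type:"string"}':
    line = f'PROJECT_ID = "{project}" # @param {{type:"string"}}\n'
print(line, end="")  # PROJECT_ID = "my-test-project" # @param {type:"string"}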
scripts/notebooks_restore_from_backup.py (new file)
@@ -0,0 +1,35 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pathlib
import shutil
import sys


def restore_from_backup(notebook_path):
    backup_path = pathlib.Path(f"{notebook_path}.backup")
    if backup_path.exists():
        shutil.move(
            backup_path,
            notebook_path,
        )


def main(notebook_paths):
    for notebook_path in notebook_paths:
        restore_from_backup(notebook_path)


if __name__ == "__main__":
    main(sys.argv[1:])
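
The two scripts form a pair around the `.backup` naming convention, and because restore_from_backup checks that the backup exists, the finally-block restore is safe to run even if the fill step failed early. Roughly (a sketch with a hypothetical notebook path):

import pathlib
import shutil

nb = "notebooks/example/demo.ipynb"  # hypothetical path
shutil.copy(nb, f"{nb}.backup")      # notebooks_fill_params.py: back up, then edit in place
# ... notebook is executed with its parameters filled in ...
if pathlib.Path(f"{nb}.backup").exists():
    shutil.move(f"{nb}.backup", nb)  # notebooks_restore_from_backup.py: undo the edits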
Review thread on the notebook skip list:

Reviewer: Do we still need DATASET when running bf.read_gbq()?

Author: I don't know what you mean. The DATASET variable doesn't appear in https://github.com/googleapis/python-bigquery-dataframes/blob/677f0146acf19def88fddbeb0527a078458948ae/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb

Reviewer: We read it here, right? https://screenshot.googleplex.com/32s3pbtqnWZakiM

Author: That's a read, which doesn't need to be parametrized. DATASET is used as a destination in the other samples.
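
(Editorial illustration of the read/write distinction in BigQuery DataFrames — a sketch, not code from the notebook; the destination table name is hypothetical:)

import bigframes.pandas as bpd

# A read: no DATASET parameter needed, since the source table is fully qualified.
df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

# A write: needs a destination dataset, which is why some notebooks still
# require DATASET/DATASET_ID parameters before they can be tested.
df.to_gbq("my_dataset.penguins_copy", if_exists="replace")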
Reviewer: Thanks, got it!