diff --git a/notebooks/apps/synthetic_data_generation.ipynb b/notebooks/apps/synthetic_data_generation.ipynb new file mode 100644 index 0000000000..a6e8444aac --- /dev/null +++ b/notebooks/apps/synthetic_data_generation.ipynb @@ -0,0 +1,1133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigQuery DataFrames: Synthetic Data Generation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In addition to BigQuery DataFrames (whose installation also pulls in `pandas` as a dependency), we will use\n", + "the `faker` library as a building block for synthetic data generation." 
# %%
# Shell cell (commented out in script form): installs `faker`, the one
# dependency this notebook needs beyond BigQuery DataFrames itself.
# !pip install faker

# %%
import os

import bigframes.pandas as bpd

# The notebook used `PROJECT_ID` without ever assigning it, so a fresh
# "Restart & Run All" failed with NameError. Resolve it explicitly instead:
#   1. honor a value already present in the kernel namespace (e.g. injected
#      by a parameterized run or typed into an earlier cell), otherwise
#   2. fall back to the GOOGLE_CLOUD_PROJECT environment variable.
PROJECT_ID = globals().get("PROJECT_ID") or os.environ.get("GOOGLE_CLOUD_PROJECT")

# Only override the option when we actually have a project; leaving it unset
# keeps whatever default project resolution the library applies on its own.
if PROJECT_ID:
    bpd.options.bigquery.project = PROJECT_ID
from bigframes.ml.llm import GeminiTextGenerator

# Instantiate the text generator with its default settings. `model` is reused
# by the retry loop further down, which calls `model.predict(...)`.
# Per the recorded output of this cell, construction already starts a session
# (with a warning that the location defaulted to US) and runs a query job.
model = GeminiTextGenerator()
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
prompt
0Write python code to generate a pandas datafra...
\n", + "

1 rows × 1 columns

\n", + "
# The instruction text sent to the LLM: the desired schema first, then the
# constraints the generated code must satisfy. Adjacent string literals are
# concatenated by the parser, so `prompt` is one newline-terminated string.
prompt = (
    "Write python code to generate a pandas dataframe based on the requirements:\n"
    "  Column name: Name, type: string, Description: Latin American Names\n"
    "  Column name: Age, type: int\n"
    "  Column name: Gender, type: string, Description: Inclusive\n"
    "\n"
    "Note:\n"
    "  - Return the code only, no additional texts or comments\n"
    "  - Use faker library\n"
    "  - Generate 100 rows\n"
    "  - The final dataframe should be named 'result_df'.\n"
)

# Single-row frame with a "prompt" column; this is what gets passed to
# `model.predict` in the retry loop.
prompt_rows = [prompt]
df_prompt = bpd.DataFrame({"prompt": prompt_rows})
df_prompt
import re


def _extract_python_code(llm_text):
    """Return the Python source contained in an LLM reply.

    The model normally wraps its answer in a markdown fence
    ("```python" ... "```"). Slicing fixed offsets off the reply breaks as
    soon as the reply has a trailing newline, a bare "```" fence or no fence
    at all, so the fence is located with a regular expression instead.

    Args:
        llm_text: Raw text returned by the model.

    Returns:
        The contents of the first fenced block, or the reply unchanged when
        it contains no fence.
    """
    fenced = re.search(
        r"```[ \t]*(?:python|py)?[ \t]*\r?\n(.*?)```",
        llm_text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    return fenced.group(1) if fenced else llm_text


max_tries = 5
last_error = None
for i in range(max_tries):
    # Get LLM generated code
    df_result = model.predict(df_prompt)
    llm_result = df_result['ml_generate_text_llm_result'].iloc[0]

    code = _extract_python_code(llm_result)
    print(code)

    # Check if the generated code is runnable.
    # SECURITY NOTE: this executes model-generated code. It runs in a
    # throwaway namespace so it cannot overwrite notebook variables such as
    # `model` or `df_prompt`, but it is NOT sandboxed - review the printed
    # code before trusting it.
    try:
        trial_namespace = {}
        exec(code, trial_namespace)
        # The prompt requires the frame to be named `result_df`; treat its
        # absence as a failure so the LLM gets a chance to correct it.
        if "result_df" not in trial_namespace:
            raise NameError("the generated code did not define 'result_df'")
        break
    except Exception as ex:
        print(ex)
        last_error = ex
        error_context = f"""
Previous code:
{code}

Had this exception:
{ex}"""

        # Update the prompt to help LLM correct error
        df_prompt["prompt"] += error_context
else:
    # Only reached when no attempt hit `break`, i.e. every try failed.
    # Chain the last failure so the real cause shows up in the traceback.
    raise RuntimeError("Failed to generate runnable code") from last_error
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "GODcPwX2PBEu", + "outputId": "dec4c872-c464-49e4-cd7f-9442fc977d18" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "summary": "{\n \"name\": \"execution_context\",\n \"rows\": 100,\n \"fields\": [\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 100,\n \"samples\": [\n \"Renata Pla Cases\",\n \"Guiomar Carnero-Paz\",\n \"Luciano Garmendia\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13,\n \"min\": 18,\n \"max\": 64,\n \"num_unique_values\": 39,\n \"samples\": [\n 56,\n 31,\n 34\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Gender\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Male\",\n \"Non-binary\",\n \"Female\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeGender
0Pastora Acuña Company21Male
1León Reig-Salom39Non-binary
2Aura Tomás Llobet30Female
3Vicente Correa Palomar64Female
4Benito del Fuster34Female
............
95Eduardo Cabrera27Non-binary
96Nazaret de Izaguirre40Non-binary
97Manuela Agullo Bustamante27Female
98Eugenio Mateo Naranjo Blazquez36Non-binary
99Heriberto Vicens Baeza53Female
\n", + "

100 rows × 3 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
# %%
# Re-run the accepted code in a scratch namespace and inspect what it built.
execution_context = {}
exec(code, execution_context)

# Index rather than .get(): if `result_df` is missing, fail here with a clear
# KeyError instead of silently displaying nothing.
sample_df = execution_context["result_df"]

# The flattening SQL at the end of the notebook reads exactly these JSON keys,
# so catch a schema mismatch now rather than ending up with NULL columns.
missing_columns = {"Name", "Age", "Gender"} - set(sample_df.columns)
assert not missing_columns, f"generated data is missing columns: {sorted(missing_columns)}"
sample_df

# %%
@bpd.remote_function([int], str, packages=['faker', 'pandas'])
def data_generator(id):
    """Run the generated code once and return its rows as a JSON string.

    Args:
        id: Value of the indicator row this call is applied to. It is not
            used by the body; it only exists so the function can be applied
            once per row.

    Returns:
        `result_df` serialized with `to_json(orient="records")`, i.e. a JSON
        array with one object per generated row.

    Raises:
        ValueError: If the generated code does not define `result_df`.
    """
    context = {}
    exec(code, context)
    if "result_df" not in context:
        # Explicit error instead of "'NoneType' object has no attribute
        # 'to_json'" surfacing from inside the deployed function.
        raise ValueError("generated code did not define 'result_df'")
    return context["result_df"].to_json(orient="records")

data_generator.bigframes_cloud_function
# %%
desired_num_rows = 1_000_000  # 1 million rows
batch_size = 100  # used in the prompt

# Integer ceiling division. The previous `int(desired_num_rows/batch_size)`
# went through float division and truncated, so a target that is not a
# multiple of `batch_size` silently produced fewer rows than requested.
# Rounding up guarantees at least `desired_num_rows` rows; for the values
# above the result is unchanged (10_000).
num_batches = -(-desired_num_rows // batch_size)

# One indicator row per batch; each row triggers one remote-function call.
df = bpd.DataFrame({"row_id": range(num_batches)})

# %%
# Each call returns a JSON string holding one batch of generated records.
df["json_data"] = df["row_id"].apply(data_generator)
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 932 + }, + "id": "6p3eM21qvRvy", + "outputId": "333f4e49-a555-4d2f-b527-02142782b3a7" + }, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 3f8d2133-b01d-402d-a731-79592810ca1c is DONE. 63.7 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 4a613aa3-6323-4914-8e34-93323885d458 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 0deb03be-725b-40b4-a7a1-1023b0477f35 is DONE. 40.1 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeGender
0Eloy Santiago-Aragón31Male
1Amanda Mata Abril20Non-binary
2Danilo Velázquez Salcedo58Male
3Leyre Alba España61Female
4Paulina Amores Pastor41Male
5Jorge Cuadrado Mena50Female
6Chucho Catalán36Non-binary
7Vidal Benavente Lerma38Male
8Clementina Álamo32Female
9Petrona Roselló-Valls61Male
10Luís Camilo Sastre Marin45Male
11Gil Baudelio Carbajo Ordóñez58Non-binary
12David del Donoso44Female
13Dolores Arnau Ros21Non-binary
14Febe de León46Non-binary
15Ariadna Almazán34Female
16Blas Serna Aguiló24Non-binary
17Paulino Barreda Almeida59Female
18Eligio Valcárcel Tormo35Non-binary
19Toño Amador Torres Portillo48Female
20Florencia del Bejarano65Non-binary
21Clímaco Andreu Gómez18Male
22Xiomara Dominguez Solana35Female
23Leire Castilla Borrego19Non-binary
24Angelita Garmendia Carpio21Non-binary
\n", + "

25 rows × 3 columns

\n", + "
# SQL text of the indicator frame (including its `json_data` column), used
# below as the innermost sub-query.
batches_query = df.sql

# T0: the batches, one JSON array string per row.
# T1: one row per array element, parsed into a JSON value.
# Final SELECT: pull the three fields out of each element as typed columns.
sql = f"""
WITH T0 AS ({batches_query}),
T1 AS (
    SELECT PARSE_JSON(json_row) AS json_row
    FROM T0, UNNEST(JSON_EXTRACT_ARRAY(json_data)) AS json_row
)
SELECT STRING(json_row.Name) AS Name,
       INT64(json_row.Age) AS Age,
       STRING(json_row.Gender) AS Gender
FROM T1
"""

# Read the flattened rows back as a BigQuery DataFrame and display it.
df_result = bpd.read_gbq(sql)
df_result
+ ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/noxfile.py b/noxfile.py index 9479a7a318..91ad6bc0e6 100644 --- a/noxfile.py +++ b/noxfile.py @@ -723,6 +723,10 @@ def notebook(session: nox.Session): # The experimental notebooks imagine features that don't yet # exist or only exist as temporary prototypes. "notebooks/experimental/longer_ml_demo.ipynb", + # The notebooks that are added for more use cases, such as backing a + # blog post, which may take longer to execute and need not be + # continuously tested. + "notebooks/apps/synthetic_data_generation.ipynb", ] # Convert each Path notebook object to a string using a list comprehension.