googleapis
diff --git a/Collapse file
‎notebooks/multimodal/multimodal_dataframe.ipynb‎
Copy file name to clipboardExpand all lines: notebooks/multimodal/multimodal_dataframe.ipynb
+92-2Lines changed: 92 additions & 2 deletions b/Collapse file
‎notebooks/multimodal/multimodal_dataframe.ipynb‎
Copy file name to clipboardExpand all lines: notebooks/multimodal/multimodal_dataframe.ipynb
+92-2Lines changed: 92 additions & 2 deletions
@@ -61,7 +61,8 @@
         "3. Conduct image transformations\n",
         "4. Use LLM models to ask questions and generate embeddings on images\n",
         "5. PDF chunking function\n",
-        "6. Transcribe audio"
+        "6. Transcribe audio\n",
+        "7. Extract EXIF metadata from images"
       ]
     },
     {
@@ -104,6 +105,11 @@
         "PROJECT = \"bigframes-dev\" # replace with your project. \n",
         "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n",
         "\n",
+        "LOCATION = \"us\" # replace with your location.\n",
+        "\n",
+        "# Dataset where the UDF will be created.\n",
+        "DATASET_ID = \"bigframes_samples\" # replace with your dataset ID.\n",
+        "\n",
         "OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n",
         "# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n",
         "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n",
@@ -112,12 +118,14 @@
         "import bigframes\n",
         "# Setup project\n",
         "bigframes.options.bigquery.project = PROJECT\n",
+        "bigframes.options.bigquery.location = LOCATION\n",
         "\n",
         "# Display options\n",
         "bigframes.options.display.blob_display_width = 300\n",
         "bigframes.options.display.progress_bar = None\n",
         "\n",
-        "import bigframes.pandas as bpd"
+        "import bigframes.pandas as bpd\n",
+        "import bigframes.bigquery as bbq"
       ]
     },
     {
@@ -1546,6 +1554,88 @@
         "transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
         "transcribed_series_verbose"
       ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### 7. Extract EXIF metadata from images"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Construct the canonical connection ID\n",
+        "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
+        "\n",
+        "@bpd.udf(\n",
+        "    input_types=[str],\n",
+        "    output_type=str,\n",
+        "    dataset=DATASET_ID,\n",
+        "    name=\"extract_exif\",\n",
+        "    bigquery_connection=FULL_CONNECTION_ID,\n",
+        "    packages=[\"pillow\", \"requests\"],\n",
+        "    max_batching_rows=8192,\n",
+        "    container_cpu=0.33,\n",
+        "    container_memory=\"512Mi\"\n",
+        ")\n",
+        "def extract_exif(src_obj_ref_rt: str) -> str:\n",
+        "    \"\"\"Download an image and return its EXIF tags as a JSON object string.\n",
+        "\n",
+        "    Args:\n",
+        "        src_obj_ref_rt: JSON string with the blob's runtime information; the\n",
+        "            read URL is taken from its access_urls.read_url entry.\n",
+        "\n",
+        "    Returns:\n",
+        "        A JSON object string mapping each EXIF tag name (or the numeric tag ID\n",
+        "        when Pillow has no name for it) to its value. Values that json.dumps\n",
+        "        cannot encode are converted with str(). An image without EXIF data\n",
+        "        yields an empty JSON object.\n",
+        "\n",
+        "    Raises:\n",
+        "        requests.HTTPError: if the download responds with an HTTP error status.\n",
+        "    \"\"\"\n",
+        "    # Imports are local so the function body is self-contained when deployed as a UDF.\n",
+        "    import io\n",
+        "    import json\n",
+        "    from PIL import ExifTags, Image\n",
+        "    import requests\n",
+        "    from requests import adapters\n",
+        "    src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
+        "    src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
+        "    # The context manager closes the session (and its pooled connections) after the download.\n",
+        "    with requests.Session() as session:\n",
+        "        session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
+        "        response = session.get(src_url, timeout=30)\n",
+        "        # Fail with a clear HTTP error instead of handing an error page to Pillow.\n",
+        "        response.raise_for_status()\n",
+        "        bts = response.content\n",
+        "    image = Image.open(io.BytesIO(bts))\n",
+        "    exif_data = image.getexif()\n",
+        "    exif_dict = {}\n",
+        "    if exif_data:\n",
+        "        for tag, value in exif_data.items():\n",
+        "            tag_name = ExifTags.TAGS.get(tag, tag)\n",
+        "            # EXIF values may be bytes or Pillow rationals, which json.dumps rejects;\n",
+        "            # fall back to their string form so one odd tag cannot fail the whole row.\n",
+        "            try:\n",
+        "                json.dumps(value)\n",
+        "            except (TypeError, ValueError):\n",
+        "                value = str(value)\n",
+        "            exif_dict[tag_name] = value\n",
+        "    return json.dumps(exif_dict)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Create a Multimodal DataFrame from the sample image URIs\n",
+        "# NOTE: the glob path below is hardcoded; point it at your own images if needed.\n",
+        "exif_image_df = bpd.from_glob_path(\n",
+        "    \"gs://bigframes_blob_test/images_exif/*\",\n",
+        "    name=\"blob_col\",\n",
+        ")\n",
+        "\n",
+        "# Generate a JSON string containing the runtime information (including signed read URLs)\n",
+        "# This allows the UDF to download the images from Google Cloud Storage\n",
+        "# (the UDF reads the URL from the access_urls.read_url entry of this JSON).\n",
+        "access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n",
+        "\n",
+        "# Apply the BigQuery Python UDF to the runtime JSON strings\n",
+        "# We cast to string to ensure the input matches the UDF's signature\n",
+        "# (extract_exif is declared with input_types=[str] and output_type=str).\n",
+        "exif_json = access_urls.astype(str).apply(extract_exif)\n",
+        "\n",
+        "# Parse the resulting JSON strings back into a structured JSON type for easier access\n",
+        "exif_data = bbq.parse_json(exif_json)\n",
+        "\n",
+        "# Display the parsed EXIF metadata as the cell output\n",
+        "exif_data"
+      ]
     }
   ],
   "metadata": {