Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings
This repository was archived by the owner on May 7, 2026. It is now read-only.

Commit 84c6f88

Browse files
shuoweil, GarrettWu, chelsea-lin, sycai
authored
docs: Add EXIF metadata extraction example to multimodal notebook (#2429)
This PR updates the notebooks/multimodal/multimodal_dataframe.ipynb notebook to include a comprehensive example of extracting EXIF metadata from images.

Key Changes:
* Added a new section "7. Extract EXIF metadata from images".
* Implemented a custom BigQuery Python UDF (`bpd.udf`) using pillow and requests to retrieve and parse EXIF tags from image URLs.
* Demonstrated how to apply this function efficiently within a BigFrames workflow to analyze image metadata.

This addition provides users with a practical pattern for handling image metadata and using custom libraries within BigQuery DataFrames.

Fixes #<478952827> 🦕

---------

Co-authored-by: Garrett Wu <6505921+GarrettWu@users.noreply.github.com>
Co-authored-by: Chelsea Lin <chelsealin@google.com>
Co-authored-by: Shenyang Cai <sycai@users.noreply.github.com>
1 parent 9f1ba1d commit 84c6f88
Copy full SHA for 84c6f88

1 file changed

+92-2Lines changed: 92 additions & 2 deletions

File tree

Expand file treeCollapse file tree
Open diff view settings
Filter options
Expand file treeCollapse file tree
Open diff view settings
Collapse file

‎notebooks/multimodal/multimodal_dataframe.ipynb‎

Copy file name to clipboardExpand all lines: notebooks/multimodal/multimodal_dataframe.ipynb
+92-2Lines changed: 92 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@
6161
"3. Conduct image transformations\n",
6262
"4. Use LLM models to ask questions and generate embeddings on images\n",
6363
"5. PDF chunking function\n",
64-
"6. Transcribe audio"
64+
"6. Transcribe audio\n",
65+
"7. Extract EXIF metadata from images"
6566
]
6667
},
6768
{
@@ -104,6 +105,11 @@
104105
"PROJECT = \"bigframes-dev\" # replace with your project. \n",
105106
"# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n",
106107
"\n",
108+
"LOCATION = \"us\" # replace with your location.\n",
109+
"\n",
110+
"# Dataset where the UDF will be created.\n",
111+
"DATASET_ID = \"bigframes_samples\" # replace with your dataset ID.\n",
112+
"\n",
107113
"OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n",
108114
"# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n",
109115
"# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n",
@@ -112,12 +118,14 @@
112118
"import bigframes\n",
113119
"# Setup project\n",
114120
"bigframes.options.bigquery.project = PROJECT\n",
121+
"bigframes.options.bigquery.location = LOCATION\n",
115122
"\n",
116123
"# Display options\n",
117124
"bigframes.options.display.blob_display_width = 300\n",
118125
"bigframes.options.display.progress_bar = None\n",
119126
"\n",
120-
"import bigframes.pandas as bpd"
127+
"import bigframes.pandas as bpd\n",
128+
"import bigframes.bigquery as bbq"
121129
]
122130
},
123131
{
@@ -1546,6 +1554,88 @@
15461554
"transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
15471555
"transcribed_series_verbose"
15481556
]
1557+
},
1558+
{
1559+
"cell_type": "markdown",
1560+
"metadata": {},
1561+
"source": [
1562+
"### 7. Extract EXIF metadata from images"
1563+
]
1564+
},
1565+
{
1566+
"cell_type": "markdown",
1567+
"metadata": {},
1568+
"source": [
1569+
"This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library."
1570+
]
1571+
},
1572+
{
1573+
"cell_type": "code",
1574+
"execution_count": null,
1575+
"metadata": {},
1576+
"outputs": [],
1577+
"source": [
1578+
"# Construct the canonical connection ID\n",
1579+
"FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
1580+
"\n",
1581+
"@bpd.udf(\n",
1582+
" input_types=[str],\n",
1583+
" output_type=str,\n",
1584+
" dataset=DATASET_ID,\n",
1585+
" name=\"extract_exif\",\n",
1586+
" bigquery_connection=FULL_CONNECTION_ID,\n",
1587+
" packages=[\"pillow\", \"requests\"],\n",
1588+
" max_batching_rows=8192,\n",
1589+
" container_cpu=0.33,\n",
1590+
" container_memory=\"512Mi\"\n",
1591+
")\n",
1592+
"def extract_exif(src_obj_ref_rt: str) -> str:\n",
1593+
" import io\n",
1594+
" import json\n",
1595+
" from PIL import ExifTags, Image\n",
1596+
" import requests\n",
1597+
" from requests import adapters\n",
1598+
" session = requests.Session()\n",
1599+
" session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
1600+
" src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
1601+
" src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
1602+
" response = session.get(src_url, timeout=30)\n",
1603+
" bts = response.content\n",
1604+
" image = Image.open(io.BytesIO(bts))\n",
1605+
" exif_data = image.getexif()\n",
1606+
" exif_dict = {}\n",
1607+
" if exif_data:\n",
1608+
" for tag, value in exif_data.items():\n",
1609+
" tag_name = ExifTags.TAGS.get(tag, tag)\n",
1610+
" exif_dict[tag_name] = value\n",
1611+
" return json.dumps(exif_dict)"
1612+
]
1613+
},
1614+
{
1615+
"cell_type": "code",
1616+
"execution_count": null,
1617+
"metadata": {},
1618+
"outputs": [],
1619+
"source": [
1620+
"# Create a Multimodal DataFrame from the sample image URIs\n",
1621+
"exif_image_df = bpd.from_glob_path(\n",
1622+
" \"gs://bigframes_blob_test/images_exif/*\",\n",
1623+
" name=\"blob_col\",\n",
1624+
")\n",
1625+
"\n",
1626+
"# Generate a JSON string containing the runtime information (including signed read URLs)\n",
1627+
"# This allows the UDF to download the images from Google Cloud Storage\n",
1628+
"access_urls = exif_image_df[\"blob_col\"].blob.get_runtime_json_str(mode=\"R\")\n",
1629+
"\n",
1630+
"# Apply the BigQuery Python UDF to the runtime JSON strings\n",
1631+
"# We cast to string to ensure the input matches the UDF's signature\n",
1632+
"exif_json = access_urls.astype(str).apply(extract_exif)\n",
1633+
"\n",
1634+
"# Parse the resulting JSON strings back into a structured JSON type for easier access\n",
1635+
"exif_data = bbq.parse_json(exif_json)\n",
1636+
"\n",
1637+
"exif_data"
1638+
]
15491639
}
15501640
],
15511641
"metadata": {

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.