Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings
This repository was archived by the owner on May 7, 2026. It is now read-only.

Commit 543ce52

Browse files
authored
docs: use direct API for pdf chunk and pdf extract (#2452)
This PR updates `notebooks/multimodal/multimodal_dataframe.ipynb` to demonstrate PDF text extraction and chunking using custom BigQuery Python UDFs with the `pypdf` library. Fixes #<478952827> 🦕
1 parent 3409acd commit 543ce52
Copy full SHA for 543ce52

1 file changed

+97-139Lines changed: 97 additions & 139 deletions

File tree

Expand file treeCollapse file tree
Open diff view settings
Filter options
Expand file treeCollapse file tree
Open diff view settings
Collapse file

‎notebooks/multimodal/multimodal_dataframe.ipynb‎

Copy file name to clipboardExpand all lines: notebooks/multimodal/multimodal_dataframe.ipynb
+97-139Lines changed: 97 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -1317,161 +1317,119 @@
13171317
"id": "iRUi8AjG7cIf"
13181318
},
13191319
"source": [
1320-
"### 5. PDF chunking function"
1320+
"### 5. PDF extraction and chunking function\n",
1321+
"\n",
1322+
"This section demonstrates how to extract text and chunk text from PDF files using custom BigQuery Python UDFs and the `pypdf` library."
13211323
]
13221324
},
13231325
{
13241326
"cell_type": "code",
1325-
"execution_count": 3,
1326-
"metadata": {
1327-
"id": "oDDuYtUm5Yiy"
1328-
},
1327+
"execution_count": null,
1328+
"metadata": {},
13291329
"outputs": [],
13301330
"source": [
1331-
"df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")"
1332-
]
1333-
},
1334-
{
1335-
"cell_type": "code",
1336-
"execution_count": 18,
1337-
"metadata": {
1338-
"colab": {
1339-
"base_uri": "https://localhost:8080/"
1340-
},
1341-
"id": "7jLpMYaj7nj8",
1342-
"outputId": "06d5456f-580f-4693-adff-2605104b056c"
1343-
},
1344-
"outputs": [
1345-
{
1346-
"name": "stderr",
1347-
"output_type": "stream",
1348-
"text": [
1349-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
1350-
"instead of using `db_dtypes` in the future when available in pandas\n",
1351-
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
1352-
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
1353-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n",
1354-
" return method(*args, **kwargs)\n",
1355-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
1356-
"future version. Use `json_value_array` instead.\n",
1357-
" warnings.warn(bfe.format_message(msg), category=UserWarning)\n",
1358-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
1359-
"future version. Use `json_value_array` instead.\n",
1360-
" warnings.warn(bfe.format_message(msg), category=UserWarning)\n"
1361-
]
1362-
}
1363-
],
1364-
"source": [
1365-
"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")"
1331+
"# Construct the canonical connection ID\n",
1332+
"FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n",
1333+
"\n",
1334+
"@bpd.udf(\n",
1335+
" input_types=[str],\n",
1336+
" output_type=str,\n",
1337+
" dataset=DATASET_ID,\n",
1338+
" name=\"pdf_extract\",\n",
1339+
" bigquery_connection=FULL_CONNECTION_ID,\n",
1340+
" packages=[\"pypdf\", \"requests\", \"cryptography\"],\n",
1341+
")\n",
1342+
"def pdf_extract(src_obj_ref_rt: str) -> str:\n",
1343+
" import io\n",
1344+
" import json\n",
1345+
" from pypdf import PdfReader\n",
1346+
" import requests\n",
1347+
" from requests import adapters\n",
1348+
" session = requests.Session()\n",
1349+
" session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
1350+
" src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
1351+
" src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
1352+
" response = session.get(src_url, timeout=30, stream=True)\n",
1353+
" response.raise_for_status()\n",
1354+
" pdf_bytes = response.content\n",
1355+
" pdf_file = io.BytesIO(pdf_bytes)\n",
1356+
" reader = PdfReader(pdf_file, strict=False)\n",
1357+
" all_text = \"\"\n",
1358+
" for page in reader.pages:\n",
1359+
" page_extract_text = page.extract_text()\n",
1360+
" if page_extract_text:\n",
1361+
" all_text += page_extract_text\n",
1362+
" return all_text\n",
1363+
"\n",
1364+
"@bpd.udf(\n",
1365+
" input_types=[str, int, int],\n",
1366+
" output_type=list[str],\n",
1367+
" dataset=DATASET_ID,\n",
1368+
" name=\"pdf_chunk\",\n",
1369+
" bigquery_connection=FULL_CONNECTION_ID,\n",
1370+
" packages=[\"pypdf\", \"requests\", \"cryptography\"],\n",
1371+
")\n",
1372+
"def pdf_chunk(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> list[str]:\n",
1373+
" import io\n",
1374+
" import json\n",
1375+
" from pypdf import PdfReader\n",
1376+
" import requests\n",
1377+
" from requests import adapters\n",
1378+
" session = requests.Session()\n",
1379+
" session.mount(\"https://\", adapters.HTTPAdapter(max_retries=3))\n",
1380+
" src_obj_ref_rt_json = json.loads(src_obj_ref_rt)\n",
1381+
" src_url = src_obj_ref_rt_json[\"access_urls\"][\"read_url\"]\n",
1382+
" response = session.get(src_url, timeout=30, stream=True)\n",
1383+
" response.raise_for_status()\n",
1384+
" pdf_bytes = response.content\n",
1385+
" pdf_file = io.BytesIO(pdf_bytes)\n",
1386+
" reader = PdfReader(pdf_file, strict=False)\n",
1387+
" all_text_chunks = []\n",
1388+
" curr_chunk = \"\"\n",
1389+
" for page in reader.pages:\n",
1390+
" page_text = page.extract_text()\n",
1391+
" if page_text:\n",
1392+
" curr_chunk += page_text\n",
1393+
" while len(curr_chunk) >= chunk_size:\n",
1394+
" split_idx = curr_chunk.rfind(\" \", 0, chunk_size)\n",
1395+
" if split_idx == -1:\n",
1396+
" split_idx = chunk_size\n",
1397+
" actual_chunk = curr_chunk[:split_idx]\n",
1398+
" all_text_chunks.append(actual_chunk)\n",
1399+
" overlap = curr_chunk[split_idx + 1 : split_idx + 1 + overlap_size]\n",
1400+
" curr_chunk = overlap + curr_chunk[split_idx + 1 + overlap_size :]\n",
1401+
" if curr_chunk:\n",
1402+
" all_text_chunks.append(curr_chunk)\n",
1403+
" return all_text_chunks"
13661404
]
13671405
},
13681406
{
13691407
"cell_type": "code",
1370-
"execution_count": 19,
1408+
"execution_count": null,
13711409
"metadata": {},
1372-
"outputs": [
1373-
{
1374-
"name": "stderr",
1375-
"output_type": "stream",
1376-
"text": [
1377-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
1378-
"instead of using `db_dtypes` in the future when available in pandas\n",
1379-
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
1380-
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n",
1381-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n",
1382-
" return method(*args, **kwargs)\n",
1383-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n",
1384-
"future version. Use `json_value_array` instead.\n",
1385-
" warnings.warn(bfe.format_message(msg), category=UserWarning)\n"
1386-
]
1387-
},
1388-
{
1389-
"data": {
1390-
"text/html": [
1391-
"<div>\n",
1392-
"<style scoped>\n",
1393-
" .dataframe tbody tr th:only-of-type {\n",
1394-
" vertical-align: middle;\n",
1395-
" }\n",
1396-
"\n",
1397-
" .dataframe tbody tr th {\n",
1398-
" vertical-align: top;\n",
1399-
" }\n",
1400-
"\n",
1401-
" .dataframe thead th {\n",
1402-
" text-align: right;\n",
1403-
" }\n",
1404-
"</style>\n",
1405-
"<table border=\"1\" class=\"dataframe\">\n",
1406-
" <thead>\n",
1407-
" <tr style=\"text-align: right;\">\n",
1408-
" <th></th>\n",
1409-
" <th>chunked_verbose</th>\n",
1410-
" </tr>\n",
1411-
" </thead>\n",
1412-
" <tbody>\n",
1413-
" <tr>\n",
1414-
" <th>0</th>\n",
1415-
" <td>{'status': '', 'content': array([\"CritterCuisi...</td>\n",
1416-
" </tr>\n",
1417-
" </tbody>\n",
1418-
"</table>\n",
1419-
"<p>1 rows × 1 columns</p>\n",
1420-
"</div>[1 rows x 1 columns in total]"
1421-
],
1422-
"text/plain": [
1423-
" chunked_verbose\n",
1424-
"0 {'status': '', 'content': array([\"CritterCuisi...\n",
1425-
"\n",
1426-
"[1 rows x 1 columns]"
1427-
]
1428-
},
1429-
"execution_count": 19,
1430-
"metadata": {},
1431-
"output_type": "execute_result"
1432-
}
1433-
],
1410+
"outputs": [],
14341411
"source": [
1435-
"df_pdf[\"chunked_verbose\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\", verbose=True)\n",
1436-
"df_pdf[[\"chunked_verbose\"]]"
1412+
"df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")\n",
1413+
"\n",
1414+
"# Generate a JSON string containing the runtime information (including signed read URLs)\n",
1415+
"access_urls = get_runtime_json_str(df_pdf[\"pdf\"], mode=\"R\")\n",
1416+
"\n",
1417+
"# Apply PDF extraction\n",
1418+
"df_pdf[\"extracted_text\"] = access_urls.apply(pdf_extract)\n",
1419+
"\n",
1420+
"# Apply PDF chunking\n",
1421+
"df_pdf[\"chunked\"] = access_urls.apply(pdf_chunk, args=(2000, 200))\n",
1422+
"\n",
1423+
"df_pdf[[\"extracted_text\", \"chunked\"]]"
14371424
]
14381425
},
14391426
{
14401427
"cell_type": "code",
1441-
"execution_count": 20,
1442-
"metadata": {
1443-
"id": "kaPvJATN7zlw"
1444-
},
1445-
"outputs": [
1446-
{
1447-
"name": "stderr",
1448-
"output_type": "stream",
1449-
"text": [
1450-
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n",
1451-
"instead of using `db_dtypes` in the future when available in pandas\n",
1452-
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n",
1453-
" warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n"
1454-
]
1455-
},
1456-
{
1457-
"data": {
1458-
"text/plain": [
1459-
"0 CritterCuisine Pro 5000 - Automatic Pet Feeder...\n",
1460-
"0 on a level, stable surface to prevent tipping....\n",
1461-
"0 included)\\nto maintain the schedule during pow...\n",
1462-
"0 digits for Meal 1 will flash.\\n\u0000. Use the UP/D...\n",
1463-
"0 paperclip) for 5\\nseconds. This will reset all...\n",
1464-
"0 unit with a damp cloth. Do not immerse the bas...\n",
1465-
"0 continues,\\ncontact customer support.\\nE2: Foo...\n",
1466-
"Name: chunked, dtype: string"
1467-
]
1468-
},
1469-
"execution_count": 20,
1470-
"metadata": {},
1471-
"output_type": "execute_result"
1472-
}
1473-
],
1428+
"execution_count": null,
1429+
"metadata": {},
1430+
"outputs": [],
14741431
"source": [
1432+
"# Explode the chunks to see each chunk as a separate row\n",
14751433
"chunked = df_pdf[\"chunked\"].explode()\n",
14761434
"chunked"
14771435
]
@@ -1674,7 +1632,7 @@
16741632
"name": "python",
16751633
"nbconvert_exporter": "python",
16761634
"pygments_lexer": "ipython3",
1677-
"version": "3.13.0"
1635+
"version": "3.10.15"
16781636
}
16791637
},
16801638
"nbformat": 4,

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.