Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit d66e3a7

Browse filesBrowse files
dalequarknnegreytelpirion
authored
feat: Document AI code snippets for beta
* first pass v1beta2 analyze_form.py * Update document/cloud-client/analyze_form.py Move region tag to the top, set sensible defaults Co-Authored-By: Noah Negrey <nnegrey@users.noreply.github.com> * updated form code * changed naming to be consistent with node * added parse table files * style updates * added quickstart * added batch samples * added set endpoint * renamed set endpoint fn name * feat: adds AutoML model sample * feat: adds requirements files * fix: linter issues * chore: changes to GCS output * fix: linter issues * fix: changes format for AutoML model * fix: per reviewer * fix: added bounding poly comments * fix: adjusts locations, reviewer feedback * fix: reviewer feedback * fix: linter issues * fix: moved comment * fix: per reviewer * fix: per reviewer * fix: region tag bracket * fix: test assert Co-authored-by: Noah Negrey <nnegrey@users.noreply.github.com> Co-authored-by: Eric Schmidt <erschmid@google.com>
1 parent b221fbf commit d66e3a7
Copy full SHA for d66e3a7
Expand file treeCollapse file tree

16 files changed

+784
-0
lines changed
+96Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
# [START documentai_batch_parse_form_beta]
17+
from google.cloud import documentai_v1beta2 as documentai
18+
from google.cloud import storage
19+
import re
20+
21+
22+
def batch_parse_form(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Parse form fields from a GCS document and list the result files.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: gs:// URI of the document to process.
        destination_uri: gs:// URI (bucket plus object prefix) under which
            the results are written.

    Raises:
        ValueError: If destination_uri does not look like
            gs://<bucket>/<prefix>.
    """

    # Split the destination into bucket and object prefix up front so a
    # malformed URI fails before the batch operation is started, instead
    # of crashing on a None match after the work has already been done.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    if match is None:
        raise ValueError(
            'destination_uri must look like gs://bucket/prefix, got: '
            '{!r}'.format(destination_uri))
    output_bucket, prefix = match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(
            uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve form parsing results by providing key-value pair hints.
    # For each key hint, key is text that is likely to appear in the
    # document as a form field name (e.g. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    key_value_pair_hints = [
        documentai.types.KeyValuePairHint(
            key='Emergency Contact',
            value_types=['NAME']),
        documentai.types.KeyValuePairHint(
            key='Referred By')
    ]

    # Setting enabled=True enables form extraction
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True, key_value_pair_hints=key_value_pair_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        form_extraction_params=form_extraction_params)

    # Add each ProcessDocumentRequest to the batch request
    requests = [request]

    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=requests
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS. List every object stored under the
    # destination prefix parsed above.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    print('Output files:')
    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name)
94+
95+
96+
# [END documentai_batch_parse_form_beta]
+42Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import batch_parse_form_beta
16+
import os
17+
import pytest
18+
import uuid
19+
from google.cloud import storage
20+
21+
BUCKET = 'document-ai-{}'.format(uuid.uuid4())
22+
OUTPUT_PREFIX = 'TEST_OUTPUT_{}'.format(uuid.uuid4())
23+
PROJECT_ID = os.environ['GCLOUD_PROJECT']
24+
INPUT_URI = 'gs://cloud-samples-data/documentai/invoice.pdf'
25+
BATCH_OUTPUT_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)
26+
27+
28+
@pytest.fixture(autouse=True)
def setup_teardown():
    """Provide a throwaway GCS bucket for the batch output of each test.

    The bucket named by BUCKET is created before the test body runs and is
    force-deleted once the test has finished.
    """
    temp_bucket = storage.Client().create_bucket(BUCKET)

    yield

    # force=True is passed so the delete also covers the objects the
    # sample wrote into the bucket.
    temp_bucket.delete(force=True)
37+
38+
39+
def test_batch_parse_form(capsys):
    """Run the batch form sample and check that it prints its file listing."""
    batch_parse_form_beta.batch_parse_form(
        PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)
    out, _ = capsys.readouterr()
    # Match the header exactly as the sample prints it (colon included),
    # consistent with the batch table test.
    assert 'Output files:' in out
+114Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
# [START documentai_batch_parse_table_beta]
17+
from google.cloud import documentai_v1beta2 as documentai
18+
from google.cloud import storage
19+
import re
20+
21+
22+
def batch_parse_table(
        project_id='YOUR_PROJECT_ID',
        input_uri='gs://cloud-samples-data/documentai/form.pdf',
        destination_uri='gs://your-bucket-id/path/to/save/results/'):
    """Parse tables from a GCS document and list the result files.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: gs:// URI of the document to process.
        destination_uri: gs:// URI (bucket plus object prefix) under which
            the results are written.

    Raises:
        ValueError: If destination_uri does not look like
            gs://<bucket>/<prefix>.
    """

    # Split the destination into bucket and object prefix up front so a
    # malformed URI fails before the batch operation is started, instead
    # of crashing on a None match after the work has already been done.
    match = re.match(r'gs://([^/]+)/(.+)', destination_uri)
    if match is None:
        raise ValueError(
            'destination_uri must look like gs://bucket/prefix, got: '
            '{!r}'.format(destination_uri))
    output_bucket, prefix = match.groups()

    client = documentai.DocumentUnderstandingServiceClient()

    gcs_source = documentai.types.GcsSource(uri=input_uri)

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=gcs_source, mime_type='application/pdf')

    # where to write results
    output_config = documentai.types.OutputConfig(
        gcs_destination=documentai.types.GcsDestination(
            uri=destination_uri),
        pages_per_shard=1  # Map one doc page to one output page
    )

    # Improve table parsing results by providing bounding boxes
    # specifying where the table appears in the document (optional)
    table_bound_hints = [
        documentai.types.TableBoundHint(
            page_number=1,
            bounding_box=documentai.types.BoundingPoly(
                # Define a polygon around tables to detect
                # Each vertex coordinate must be a number between 0 and 1
                normalized_vertices=[
                    # Top left
                    documentai.types.geometry.NormalizedVertex(
                        x=0,
                        y=0
                    ),
                    # Top right
                    documentai.types.geometry.NormalizedVertex(
                        x=1,
                        y=0
                    ),
                    # Bottom right
                    documentai.types.geometry.NormalizedVertex(
                        x=1,
                        y=1
                    ),
                    # Bottom left
                    documentai.types.geometry.NormalizedVertex(
                        x=0,
                        y=1
                    )
                ]
            )
        )
    ]

    # Setting enabled=True enables table extraction
    table_extraction_params = documentai.types.TableExtractionParams(
        enabled=True, table_bound_hints=table_bound_hints)

    # Location can be 'us' or 'eu'
    parent = 'projects/{}/locations/us'.format(project_id)
    request = documentai.types.ProcessDocumentRequest(
        input_config=input_config,
        output_config=output_config,
        table_extraction_params=table_extraction_params)

    # Add each ProcessDocumentRequest to the batch request
    requests = [request]

    batch_request = documentai.types.BatchProcessDocumentsRequest(
        parent=parent, requests=requests
    )

    operation = client.batch_process_documents(batch_request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS. List every object stored under the
    # destination prefix parsed above.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    print('Output files:')
    for blob in bucket.list_blobs(prefix=prefix):
        print(blob.name)
113+
114+
# [END documentai_batch_parse_table_beta]
+42Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import batch_parse_table_beta
16+
import os
17+
import pytest
18+
import uuid
19+
from google.cloud import storage
20+
21+
BUCKET = 'document-ai-{}'.format(uuid.uuid4())
22+
OUTPUT_PREFIX = 'TEST_OUTPUT_{}'.format(uuid.uuid4())
23+
PROJECT_ID = os.environ['GCLOUD_PROJECT']
24+
INPUT_URI = 'gs://cloud-samples-data/documentai/invoice.pdf'
25+
BATCH_OUTPUT_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)
26+
27+
28+
@pytest.fixture(autouse=True)
def setup_teardown():
    """Provide a throwaway GCS bucket for the batch output of each test.

    The bucket named by BUCKET is created before the test body runs and is
    force-deleted once the test has finished.
    """
    temp_bucket = storage.Client().create_bucket(BUCKET)

    yield

    # force=True is passed so the delete also covers the objects the
    # sample wrote into the bucket.
    temp_bucket.delete(force=True)
37+
38+
39+
def test_batch_parse_table(capsys):
    """Run the batch table sample and check that it prints its file listing."""
    batch_parse_table_beta.batch_parse_table(
        PROJECT_ID, INPUT_URI, BATCH_OUTPUT_URI)
    captured = capsys.readouterr()
    # The sample prints this header before listing the result objects.
    assert 'Output files:' in captured.out
+82Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# [START documentai_parse_form_beta]
16+
from google.cloud import documentai_v1beta2 as documentai
17+
18+
19+
def parse_form(project_id='YOUR_PROJECT_ID',
               input_uri='gs://cloud-samples-data/documentai/form.pdf'):
    """Run form extraction on one GCS document and print every form field.

    Args:
        project_id: Google Cloud project ID used to build the request parent.
        input_uri: gs:// URI of the document to process.
    """

    client = documentai.DocumentUnderstandingServiceClient()

    # mime_type can be application/pdf, image/tiff,
    # and image/gif, or application/json
    input_config = documentai.types.InputConfig(
        gcs_source=documentai.types.GcsSource(uri=input_uri),
        mime_type='application/pdf')

    # Key-value pair hints improve form parsing results.
    # The key of each hint is text that is likely to appear in the
    # document as a form field name (e.g. "DOB").
    # Value types are optional, but can be one or more of:
    # ADDRESS, LOCATION, ORGANIZATION, PERSON, PHONE_NUMBER, ID,
    # NUMBER, EMAIL, PRICE, TERMS, DATE, NAME
    emergency_contact_hint = documentai.types.KeyValuePairHint(
        key='Emergency Contact', value_types=['NAME'])
    referred_by_hint = documentai.types.KeyValuePairHint(key='Referred By')

    # Form extraction is switched on with enabled=True
    form_extraction_params = documentai.types.FormExtractionParams(
        enabled=True,
        key_value_pair_hints=[emergency_contact_hint, referred_by_hint])

    # Location can be 'us' or 'eu'
    request = documentai.types.ProcessDocumentRequest(
        parent='projects/{}/locations/us'.format(project_id),
        input_config=input_config,
        form_extraction_params=form_extraction_params)

    document = client.process_document(request=request)

    def _get_text(el):
        """Return the document text covered by an element's text anchor.

        Form fields are identified by offsets into the document text. Text
        spanning several lines is stored as several segments, so the
        slices are concatenated in order.
        """
        return ''.join(
            document.text[segment.start_index:segment.end_index]
            for segment in el.text_anchor.text_segments)

    for page in document.pages:
        print('Page number: {}'.format(page.page_number))
        for form_field in page.form_fields:
            name, value = form_field.field_name, form_field.field_value
            print('Field Name: {}\tConfidence: {}'.format(
                _get_text(name), name.confidence))
            print('Field Value: {}\tConfidence: {}'.format(
                _get_text(value), value.confidence))
81+
82+
# [END documentai_parse_form_beta]

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.