Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 7405c00

Browse filesBrowse files
dizcology authored and chenyumic committed
[DO NOT MERGE] Vision API OCR PDF/TIFF sample (GoogleCloudPlatform#1420)
* add docpdf sample
* import order
* list blobs
* filename change
* add the renamed files
* parse json string to AnnotateFileResponse message
* show more of the response
* simplify response processing to better focus on how to make the request
* fix typo
* linter
* linter
* linter
1 parent f427368 commit 7405c00
Copy full SHA for 7405c00

File tree

Expand file treeCollapse file tree

3 files changed

+149
-0
lines changed
Filter options
Expand file treeCollapse file tree

3 files changed

+149
-0
lines changed
+110Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2018 Google Inc. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
18+
"""OCR with PDF/TIFF as source files on GCS
19+
20+
Example:
21+
python detect_pdf.py \
22+
--gcs-source-uri gs://python-docs-samples-tests/HodgeConj.pdf \
23+
--gcs-destination-uri gs://BUCKET_NAME/PREFIX/
24+
"""
25+
26+
import argparse
27+
import re
28+
29+
from google.cloud import storage
30+
from google.cloud import vision_v1p2beta1 as vision
31+
from google.protobuf import json_format
32+
33+
34+
# [START vision_async_detect_document_ocr]
35+
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """Run asynchronous OCR on a PDF/TIFF file in GCS and print page one.

    Submits the file at ``gcs_source_uri`` for document text detection,
    waits for the operation to finish, lists the JSON output files found
    under ``gcs_destination_uri`` and prints the full text detected on
    the first page of the input.

    Args:
        gcs_source_uri: ``gs://`` URI of the source file.
        gcs_destination_uri: ``gs://BUCKET_NAME/PREFIX/`` URI under which
            the JSON output files are written.

    Raises:
        ValueError: If ``gcs_destination_uri`` is not of the form
            ``gs://BUCKET_NAME/PREFIX``.
        RuntimeError: If no output file is found under the destination
            prefix once the operation has completed.
    """
    # Validate the destination up front so that a malformed URI fails
    # with a clear message before any request is sent.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://BUCKET_NAME/PREFIX, '
            'got: {!r}'.format(gcs_destination_uri))
    bucket_name = match.group(1)
    prefix = match.group(2)

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    # With a file of 5 pages and batch_size=2, three output files are
    # expected (pages 1-2, 3-4 and 5).
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.types.Feature(
        type=vision.enums.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.types.GcsSource(uri=gcs_source_uri)
    input_config = vision.types.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.types.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.types.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.types.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=90)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()

    # Passed positionally on purpose.
    # NOTE(review): newer google-cloud-storage releases appear to name
    # this parameter ``bucket_or_name`` -- confirm against the pinned
    # version; a positional argument works with either spelling.
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix, skipping "folder" placeholder
    # objects (names ending in '/'), which are not JSON output files.
    blob_list = [
        blob for blob in bucket.list_blobs(prefix=prefix)
        if not blob.name.endswith('/')]
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    if not blob_list:
        raise RuntimeError(
            'No output files found under {!r}'.format(gcs_destination_uri))

    # Process the first output file from GCS.
    # Since we specified batch_size=2, the first response contains
    # the first two pages of the input file.
    output = blob_list[0]

    json_string = output.download_as_string()
    response = json_format.Parse(
        json_string, vision.types.AnnotateFileResponse())

    # The actual response for the first page of the input file.
    first_page_response = response.responses[0]
    annotation = first_page_response.full_text_annotation

    # Here we print the full text from the first page.
    # The response contains more information:
    # annotation/pages/blocks/paragraphs/words/symbols
    # including confidence scores and bounding boxes
    print(u'Full text:\n{}'.format(
        annotation.text))
101+
# [END vision_async_detect_document_ocr]
102+
103+
104+
if __name__ == '__main__':
    # Command-line entry point: both GCS URIs are mandatory options.
    cli = argparse.ArgumentParser()
    for flag in ('--gcs-source-uri', '--gcs-destination-uri'):
        cli.add_argument(flag, required=True)

    options = cli.parse_args()
    async_detect_document(
        options.gcs_source_uri, options.gcs_destination_uri)
+38Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Copyright 2018 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
17+
from google.cloud import storage
18+
19+
from detect_pdf import async_detect_document
20+
21+
# Name of the bucket used by the test, read from the environment.
BUCKET = os.environ['CLOUD_STORAGE_BUCKET']
# Object-name prefix under which the OCR output is written and later
# deleted by the test.
OUTPUT_PREFIX = 'OCR_PDF_TEST_OUTPUT'
GCS_SOURCE_URI = 'gs://{}/HodgeConj.pdf'.format(BUCKET)
GCS_DESTINATION_URI = 'gs://{}/{}/'.format(BUCKET, OUTPUT_PREFIX)
25+
26+
27+
def test_async_detect_document(capsys):
    """Run the OCR sample end to end and check the printed text.

    The output objects written under ``OUTPUT_PREFIX`` are deleted in a
    ``finally`` clause so that a failing assertion (or an error raised by
    the sample itself) does not leave stale files behind for later runs.
    """
    try:
        async_detect_document(
            gcs_source_uri=GCS_SOURCE_URI,
            gcs_destination_uri=GCS_DESTINATION_URI)
        out, _ = capsys.readouterr()

        assert 'Hodge conjecture' in out
    finally:
        # Clean up whatever the sample wrote, whether or not it passed.
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(BUCKET)
        for blob in bucket.list_blobs(prefix=OUTPUT_PREFIX):
            blob.delete()
+1Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
google-cloud-vision==0.30.1
2+
google-cloud-storage==1.6.0

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.