Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 94ced73

Browse filesBrowse files
ackulengelke
authored and committed
Add inspect table code sample for DLP and some nit fixes (GoogleCloudPlatform#1921)
* Remove claim that redact.py operates on strings. Reflect in the comments that this particular code sample does not support text redaction. * Add code sample for inspecting a table, fix requirements for running tests, refactor the quickstart example * Remove newline, if -> elif * Formatting * More formatting
1 parent c1ec40c commit 94ced73
Copy full SHA for 94ced73

File tree

Expand file treeCollapse file tree

7 files changed

+237
-25
lines changed
Filter options
Expand file treeCollapse file tree

7 files changed

+237
-25
lines changed

‎dlp/README.rst

Copy file name to clipboardExpand all lines: dlp/README.rst
+10-1Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,15 @@ Install Dependencies
5858
.. _pip: https://pip.pypa.io/
5959
.. _virtualenv: https://virtualenv.pypa.io/
6060

61+
#. For running *_test.py files, install test dependencies
62+
63+
.. code-block:: bash
64+
65+
$ pip install -r requirements-test.txt
66+
$ pytest inspect_content_test.py
67+
68+
**Note:** ``*_test.py`` files are demo wrappers and make API calls. You may get rate limited for making a high number of requests.
69+
6170
Samples
6271
-------------------------------------------------------------------------------
6372

@@ -74,7 +83,7 @@ To run this sample:
7483

7584
.. code-block:: bash
7685
77-
$ python quickstart.py
86+
$ python quickstart.py <project-id>
7887
7988
8089
Inspect Content

‎dlp/deid.py

Copy file name to clipboardExpand all lines: dlp/deid.py
+3-3Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ def write_data(data):
402402
write_file.writerow(map(write_data, row.values))
403403
# Print status
404404
print('Successfully saved date-shift output to {}'.format(
405-
output_csv_file))
405+
output_csv_file))
406406
# [END dlp_deidentify_date_shift]
407407

408408

@@ -450,8 +450,8 @@ def write_data(data):
450450
'If unspecified, the three above examples will be used.',
451451
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
452452
fpe_parser.add_argument(
453-
'project',
454-
help='The Google Cloud project id to use as a parent resource.')
453+
'project',
454+
help='The Google Cloud project id to use as a parent resource.')
455455
fpe_parser.add_argument(
456456
'item',
457457
help='The string to deidentify. '

‎dlp/inspect_content.py

Copy file name to clipboardExpand all lines: dlp/inspect_content.py
+177-9Lines changed: 177 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import argparse
2121
import os
22+
import json
2223

2324

2425
# [START dlp_inspect_string]
@@ -77,7 +78,7 @@ def inspect_string(project, content_string, info_types,
7778
'min_likelihood': min_likelihood,
7879
'include_quote': include_quote,
7980
'limits': {'max_findings_per_request': max_findings},
80-
}
81+
}
8182

8283
# Construct the `item`.
8384
item = {'value': content_string}
@@ -102,8 +103,130 @@ def inspect_string(project, content_string, info_types,
102103
print('No findings.')
103104
# [END dlp_inspect_string]
104105

106+
# [START dlp_inspect_table]
107+
108+
109+
def inspect_table(project, data, info_types,
                  custom_dictionaries=None, custom_regexes=None,
                  min_likelihood=None, max_findings=None, include_quote=True):
    r"""Uses the Data Loss Prevention API to analyze a table for protected data.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        data: A dictionary describing the table, with a "header" key holding
            a list of column names and a "rows" key holding a list of rows,
            each row being a list of cell values. (On the command line the
            table is supplied as a JSON string and parsed before this
            function is called.)
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        custom_dictionaries: An optional list of strings; each string is a
            comma-delimited list of words that forms one custom dictionary
            info type.
        custom_regexes: An optional list of regex pattern strings; each
            pattern forms one custom regex info type.
        min_likelihood: A string representing the minimum likelihood threshold
            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
        max_findings: The maximum number of findings to report; 0 = no maximum.
        include_quote: Boolean for whether to display a quote of the detected
            information in the results.
    Returns:
        None; the response from the API is printed to the terminal.
    Example:
        data = {
            "header": [
                "email",
                "phone number"
            ],
            "rows": [
                [
                    "robertfrost@xyz.com",
                    "4232342345"
                ],
                [
                    "johndoe@pqr.com",
                    "4253458383"
                ]
            ]
        }

        >> $ python inspect_content.py table \
        '{"header": ["email", "phone number"],
        "rows": [["robertfrost@xyz.com", "4232342345"],
        ["johndoe@pqr.com", "4253458383"]]}'
        >>  Quote: robertfrost@xyz.com
            Info type: EMAIL_ADDRESS
            Likelihood: 4
            Quote: johndoe@pqr.com
            Info type: EMAIL_ADDRESS
            Likelihood: 4
    """

    # Import the client library.
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp.DlpServiceClient()

    # Prepare info_types by converting the list of strings into a list of
    # dictionaries (protos are also accepted).
    info_types = [{'name': info_type} for info_type in info_types]

    # Prepare custom_info_types by parsing the dictionary word lists and
    # regex patterns. Each entry gets a generated, index-based name.
    if custom_dictionaries is None:
        custom_dictionaries = []
    dictionaries = [{
        'info_type': {'name': 'CUSTOM_DICTIONARY_{}'.format(i)},
        'dictionary': {
            'word_list': {'words': custom_dict.split(',')}
        }
    } for i, custom_dict in enumerate(custom_dictionaries)]
    if custom_regexes is None:
        custom_regexes = []
    regexes = [{
        'info_type': {'name': 'CUSTOM_REGEX_{}'.format(i)},
        'regex': {'pattern': custom_regex}
    } for i, custom_regex in enumerate(custom_regexes)]
    custom_info_types = dictionaries + regexes

    # Construct the configuration dictionary. Keys which are None may
    # optionally be omitted entirely.
    inspect_config = {
        'info_types': info_types,
        'custom_info_types': custom_info_types,
        'min_likelihood': min_likelihood,
        'include_quote': include_quote,
        'limits': {'max_findings_per_request': max_findings},
    }

    # Construct the `table`. For more details on the table schema, please see
    # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
    table = {
        "headers": [{"name": val} for val in data["header"]],
        "rows": [
            {"values": [{"string_value": cell_val} for cell_val in row]}
            for row in data["rows"]
        ],
    }
    item = {"table": table}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.inspect_content(parent, inspect_config, item)

    # Print out the results.
    if response.result.findings:
        for finding in response.result.findings:
            # Skip the quote line when the finding carries no quote.
            quote = getattr(finding, 'quote', None)
            if quote:
                print('Quote: {}'.format(quote))
            print('Info type: {}'.format(finding.info_type.name))
            print('Likelihood: {}'.format(finding.likelihood))
    else:
        print('No findings.')
225+
# [END dlp_inspect_table]
105226

106227
# [START dlp_inspect_file]
228+
229+
107230
def inspect_file(project, filename, info_types, min_likelihood=None,
108231
custom_dictionaries=None, custom_regexes=None,
109232
max_findings=None, include_quote=True, mime_type=None):
@@ -284,8 +407,8 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
284407
storage_config = {
285408
'cloud_storage_options': {
286409
'file_set': {'url': url}
287-
}
288410
}
411+
}
289412

290413
# Convert the project id into a full resource id.
291414
parent = dlp.project_path(project)
@@ -309,7 +432,6 @@ def inspect_gcs_file(project, bucket, filename, topic_id, subscription_id,
309432
subscriber = google.cloud.pubsub.SubscriberClient()
310433
subscription_path = subscriber.subscription_path(
311434
project, subscription_id)
312-
subscription = subscriber.subscribe(subscription_path)
313435

314436
# Set up a callback to acknowledge a message. This closes around an event
315437
# so that it can signal that it is done and the main thread can continue.
@@ -341,8 +463,7 @@ def callback(message):
341463
print(e)
342464
raise
343465

344-
# Register the callback and wait on the event.
345-
subscription.open(callback)
466+
subscriber.subscribe(subscription_path, callback=callback)
346467
finished = job_done.wait(timeout=timeout)
347468
if not finished:
348469
print('No event received before the timeout. Please verify that the '
@@ -460,7 +581,6 @@ def inspect_datastore(project, datastore_project, kind,
460581
subscriber = google.cloud.pubsub.SubscriberClient()
461582
subscription_path = subscriber.subscription_path(
462583
project, subscription_id)
463-
subscription = subscriber.subscribe(subscription_path)
464584

465585
# Set up a callback to acknowledge a message. This closes around an event
466586
# so that it can signal that it is done and the main thread can continue.
@@ -493,7 +613,8 @@ def callback(message):
493613
raise
494614

495615
# Register the callback and wait on the event.
496-
subscription.open(callback)
616+
subscriber.subscribe(subscription_path, callback=callback)
617+
497618
finished = job_done.wait(timeout=timeout)
498619
if not finished:
499620
print('No event received before the timeout. Please verify that the '
@@ -609,7 +730,6 @@ def inspect_bigquery(project, bigquery_project, dataset_id, table_id,
609730
subscriber = google.cloud.pubsub.SubscriberClient()
610731
subscription_path = subscriber.subscription_path(
611732
project, subscription_id)
612-
subscription = subscriber.subscribe(subscription_path)
613733

614734
# Set up a callback to acknowledge a message. This closes around an event
615735
# so that it can signal that it is done and the main thread can continue.
@@ -642,7 +762,7 @@ def callback(message):
642762
raise
643763

644764
# Register the callback and wait on the event.
645-
subscription.open(callback)
765+
subscriber.subscribe(subscription_path, callback=callback)
646766
finished = job_done.wait(timeout=timeout)
647767
if not finished:
648768
print('No event received before the timeout. Please verify that the '
@@ -698,6 +818,46 @@ def callback(message):
698818
'information in the results.',
699819
default=True)
700820

821+
parser_table = subparsers.add_parser('table', help='Inspect a table.')
822+
parser_table.add_argument(
823+
'data', help='Json string representing a table.', type=json.loads)
824+
parser_table.add_argument(
825+
'--project',
826+
help='The Google Cloud project id to use as a parent resource.',
827+
default=default_project)
828+
parser_table.add_argument(
829+
'--info_types', action='append',
830+
help='Strings representing info types to look for. A full list of '
831+
'info categories and types is available from the API. Examples '
832+
'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
833+
'If unspecified, the three above examples will be used.',
834+
default=['FIRST_NAME', 'LAST_NAME', 'EMAIL_ADDRESS'])
835+
parser_table.add_argument(
836+
'--custom_dictionaries', action='append',
837+
help='Strings representing comma-delimited lists of dictionary words'
838+
' to search for as custom info types. Each string is a comma '
839+
'delimited list of words representing a distinct dictionary.',
840+
default=None)
841+
parser_table.add_argument(
842+
'--custom_regexes', action='append',
843+
help='Strings representing regex patterns to search for as custom '
844+
' info types.',
845+
default=None)
846+
parser_table.add_argument(
847+
'--min_likelihood',
848+
choices=['LIKELIHOOD_UNSPECIFIED', 'VERY_UNLIKELY', 'UNLIKELY',
849+
'POSSIBLE', 'LIKELY', 'VERY_LIKELY'],
850+
help='A string representing the minimum likelihood threshold that '
851+
'constitutes a match.')
852+
parser_table.add_argument(
853+
'--max_findings', type=int,
854+
help='The maximum number of findings to report; 0 = no maximum.')
855+
parser_table.add_argument(
856+
'--include_quote', type=bool,
857+
help='A boolean for whether to display a quote of the detected '
858+
'information in the results.',
859+
default=True)
860+
701861
parser_file = subparsers.add_parser('file', help='Inspect a local file.')
702862
parser_file.add_argument(
703863
'filename', help='The path to the file to inspect.')
@@ -923,6 +1083,14 @@ def callback(message):
9231083
min_likelihood=args.min_likelihood,
9241084
max_findings=args.max_findings,
9251085
include_quote=args.include_quote)
1086+
elif args.content == 'table':
1087+
inspect_table(
1088+
args.project, args.data, args.info_types,
1089+
custom_dictionaries=args.custom_dictionaries,
1090+
custom_regexes=args.custom_regexes,
1091+
min_likelihood=args.min_likelihood,
1092+
max_findings=args.max_findings,
1093+
include_quote=args.include_quote)
9261094
elif args.content == 'file':
9271095
inspect_file(
9281096
args.project, args.filename, args.info_types,

‎dlp/inspect_content_test.py

Copy file name to clipboardExpand all lines: dlp/inspect_content_test.py
+29-1Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
import google.cloud.storage
2525

2626
import pytest
27-
2827
import inspect_content
2928

3029

@@ -170,6 +169,35 @@ def test_inspect_string(capsys):
170169
assert 'Info type: EMAIL_ADDRESS' in out
171170

172171

172+
def test_inspect_table(capsys):
    """Inspect a two-column table and expect phone and email findings."""
    column_names = ["email", "phone number"]
    table_rows = [
        ["robertfrost@xyz.com", "4232342345"],
        ["johndoe@pqr.com", "4253458383"],
    ]
    table = {"header": column_names, "rows": table_rows}

    inspect_content.inspect_table(
        GCLOUD_PROJECT,
        table,
        ['PHONE_NUMBER', 'EMAIL_ADDRESS'],
        include_quote=True)

    # Only stdout is checked; stderr is discarded.
    captured, _ = capsys.readouterr()
    assert 'Info type: PHONE_NUMBER' in captured
    assert 'Info type: EMAIL_ADDRESS' in captured
199+
200+
173201
def test_inspect_string_with_custom_info_types(capsys):
174202
test_string = 'My name is Gary Smith and my email is gary@example.com'
175203
dictionaries = ['Gary Smith']

‎dlp/quickstart.py

Copy file name to clipboardExpand all lines: dlp/quickstart.py
+12-5Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,17 @@
1717

1818
from __future__ import print_function
1919

20+
import sys
21+
import argparse
2022

21-
def quickstart():
23+
24+
def quickstart(project_id):
2225
"""Demonstrates use of the Data Loss Prevention API client library."""
2326

2427
# [START dlp_quickstart]
2528
# Import the client library
2629
import google.cloud.dlp
2730

28-
# Edit this with your Google Cloud Project ID.
29-
project = 'your-project'
30-
3131
# Instantiate a client.
3232
dlp_client = google.cloud.dlp.DlpServiceClient()
3333

@@ -84,4 +84,11 @@ def quickstart():
8484

8585

8686
if __name__ == '__main__':
    # Command-line entry point: takes the GCP project id as the single
    # required positional argument and runs the quickstart against it.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "project_id", help="Enter your GCP project id.", type=str)
    # Check for a missing argument *before* parsing: parse_args() exits on
    # its own when the required positional is absent, so a check placed
    # after it can never run.
    if len(sys.argv) == 1:
        parser.print_usage()
        sys.exit(1)
    args = parser.parse_args()
    quickstart(args.project_id)

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.