Commit 699db3d

feat(dlp): Add code samples to de-identify files in Cloud Storage (GoogleCloudPlatform#10511)
* Implemented dlp_deidentify_cloud_storage
* Created separate files (sample and test) for the deid_cloud_storage sample
* Removed unused imports
* Corrected the endless looping condition
* Resolved issue with the loop waiting for the status check
* Checked whether correct arguments were passed or not
1 parent f3857b2 commit 699db3d

File tree: 3 files changed, +314 −0 lines changed

dlp/snippets/deid_cloud_storage.py

+234 lines changed: 234 additions & 0 deletions
@@ -0,0 +1,234 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Uses of the Data Loss Prevention API for de-identifying sensitive data."""

from __future__ import annotations

import argparse
import time
from typing import List

import google.cloud.dlp


# [START dlp_deidentify_cloud_storage]
def deidentify_cloud_storage(
    project: str,
    input_gcs_bucket: str,
    output_gcs_bucket: str,
    info_types: List[str],
    deid_template_id: str,
    structured_deid_template_id: str,
    image_redact_template_id: str,
    dataset_id: str,
    table_id: str,
    timeout: int = 300,
) -> None:
    """
    Uses the Data Loss Prevention API to de-identify files in a Google Cloud
    Storage directory.
    Args:
        project: The Google Cloud project id to use as a parent resource.
        input_gcs_bucket: The name of the Google Cloud Storage bucket to inspect.
        output_gcs_bucket: The name of the Google Cloud Storage bucket where
            the de-identified files will be stored.
        info_types: A list of strings representing info types to look for.
            A full list of info type categories can be fetched from the API.
        deid_template_id: The name of the de-identify template for
            unstructured and structured files.
        structured_deid_template_id: The name of the de-identify template
            for structured files.
        image_redact_template_id: The name of the image redaction template
            for images.
        dataset_id: The identifier of the BigQuery dataset where transformation
            details will be stored.
        table_id: The identifier of the BigQuery table where transformation
            details will be stored.
        timeout: The number of seconds to wait for a response from the API.
    """

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Construct the configuration dictionary.
    # Specify the type of info the inspection will look for.
    # See https://cloud.google.com/dlp/docs/infotypes-reference for a complete list of info types.
    inspect_config = {
        "info_types": [{"name": info_type} for info_type in info_types]
    }

    # Construct a cloud_storage_options dictionary with the bucket's URL.
    storage_config = {
        "cloud_storage_options": {
            "file_set": {
                "url": f"gs://{input_gcs_bucket}"
            }
        }
    }

    # Specify the BigQuery table to store the transformation details.
    big_query_table = {
        "project_id": project,
        "dataset_id": dataset_id,
        "table_id": table_id,
    }

    # Convert the project id into a full resource id.
    parent = f"projects/{project}"

    # Construct the transformation configuration with the de-identify templates
    # used for the transformation.
    transformation_config = {
        "deidentify_template": f"{parent}/deidentifyTemplates/{deid_template_id}",
        "structured_deidentify_template": f"{parent}/deidentifyTemplates/{structured_deid_template_id}",
        "image_redact_template": f"{parent}/deidentifyTemplates/{image_redact_template_id}",
    }

    # Specify the de-identify action: write de-identified copies to the output
    # bucket and store transformation details in BigQuery.
    actions = [
        {
            "deidentify": {
                "cloud_storage_output": f"gs://{output_gcs_bucket}",
                "transformation_config": transformation_config,
                "transformation_details_storage_config": {
                    "table": big_query_table
                },
                "file_types_to_transform": ["IMAGE", "CSV", "TEXT_FILE"],
            }
        }
    ]

    # Construct the job definition.
    inspect_job = {
        "inspect_config": inspect_config,
        "storage_config": storage_config,
        "actions": actions,
    }

    # Call the API.
    response = dlp.create_dlp_job(
        request={
            "parent": parent,
            "inspect_job": inspect_job,
        }
    )

    job_name = response.name
    print(f"Inspection Job started: {job_name}")

    # Wait for the job to complete.
    job = dlp.get_dlp_job(request={"name": job_name})
    # Since the sleep time is 30s, the number of status checks is timeout / 30.
    no_of_attempts = timeout // 30
    while no_of_attempts != 0:
        # Check if the job has completed.
        if job.state == google.cloud.dlp_v2.DlpJob.JobState.DONE:
            break
        if job.state == google.cloud.dlp_v2.DlpJob.JobState.FAILED:
            print("Job failed, please check the configuration.")
            break

        # Sleep for a short duration before checking the job status again.
        time.sleep(30)
        no_of_attempts -= 1

        # Get the DLP job status.
        job = dlp.get_dlp_job(request={"name": job_name})

    if job.state != google.cloud.dlp_v2.DlpJob.JobState.DONE:
        print(f"Job did not complete within {timeout} seconds.")
        return

    # Print out the results.
    print(f"Job name: {job.name}")
    result = job.inspect_details.result
    print(f"Processed Bytes: {result.processed_bytes}")
    if result.info_type_stats:
        for stats in result.info_type_stats:
            print(f"Info type: {stats.info_type.name}")
            print(f"Count: {stats.count}")
    else:
        print("No findings.")


# [END dlp_deidentify_cloud_storage]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "project",
        help="The Google Cloud project id to use as a parent resource.",
    )
    parser.add_argument(
        "--info_types",
        action="append",
        help="Strings representing info types to look for. A full list of "
        "info categories and types is available from the API. Examples "
        'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ',
    )
    parser.add_argument(
        "input_gcs_bucket",
        help="The name of the Google Cloud Storage bucket to inspect.",
    )
    parser.add_argument(
        "output_gcs_bucket",
        help="The name of the Google Cloud Storage bucket where "
        "de-identified files will be stored.",
    )
    parser.add_argument(
        "deid_template_id",
        help="The name of the de-identify template for unstructured "
        "and structured files.",
    )
    parser.add_argument(
        "structured_deid_template_id",
        help="The name of the de-identify template for structured files.",
    )
    parser.add_argument(
        "image_redact_template_id",
        help="The name of the image redaction template for images.",
    )
    parser.add_argument(
        "dataset_id",
        help="The identifier of the BigQuery dataset where transformation "
        "details will be stored.",
    )
    parser.add_argument(
        "table_id",
        help="The identifier of the BigQuery table where transformation "
        "details will be stored.",
    )
    parser.add_argument(
        "timeout",
        type=int,
        help="The number of seconds to wait for a response from the API.",
    )

    args = parser.parse_args()

    deidentify_cloud_storage(
        args.project,
        args.input_gcs_bucket,
        args.output_gcs_bucket,
        args.info_types,
        args.deid_template_id,
        args.structured_deid_template_id,
        args.image_redact_template_id,
        args.dataset_id,
        args.table_id,
        args.timeout,
    )
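
Not part of this commit: a minimal sketch of how the sample might be called directly from Python instead of through the argparse wrapper above. Every resource name below (project, buckets, template IDs, dataset, table) is a placeholder assumption, not a value taken from this change.

# Hypothetical direct invocation of the sample; all resource names are
# placeholders and must be replaced with real project resources.
import deid_cloud_storage

deid_cloud_storage.deidentify_cloud_storage(
    project="my-project",                        # assumed project id
    input_gcs_bucket="my-input-bucket",          # bucket containing files to inspect
    output_gcs_bucket="my-output-bucket",        # bucket receiving de-identified copies
    info_types=["EMAIL_ADDRESS", "PERSON_NAME", "PHONE_NUMBER"],
    deid_template_id="my-deid-template",
    structured_deid_template_id="my-structured-deid-template",
    image_redact_template_id="my-image-redact-template",
    dataset_id="my_dataset",                     # BigQuery dataset for transformation details
    table_id="my_table",                         # BigQuery table for transformation details
    timeout=600,                                 # seconds to poll before giving up
)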

+79 lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

from unittest import mock
from unittest.mock import MagicMock

import google.cloud.dlp_v2
import pytest

import deid_cloud_storage

GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
TXT_FILE = os.path.join(os.path.dirname(__file__), "resources/test.txt")


@mock.patch("google.cloud.dlp_v2.DlpServiceClient")
def test_deidentify_cloud_storage(
    dlp_client: MagicMock,
    capsys: pytest.CaptureFixture,
) -> None:
    # Configure the mock DLP client and its behavior.
    mock_dlp_instance = dlp_client.return_value

    # Configure the mock CreateDlpJob DLP method and its behavior.
    test_job = f"projects/{GCLOUD_PROJECT}/dlpJobs/test_job"
    mock_dlp_instance.create_dlp_job.return_value.name = test_job

    # Configure the mock GetDlpJob DLP method and its behavior.
    mock_job = mock_dlp_instance.get_dlp_job.return_value
    mock_job.name = test_job
    mock_job.state = google.cloud.dlp_v2.DlpJob.JobState.DONE

    # Use the size of a local resource file as the processed byte count,
    # as if this file were present in the GCS bucket.
    with open(TXT_FILE, "r") as file:
        data = file.read()
    number_of_characters = len(data)

    mock_job.inspect_details.result.processed_bytes = number_of_characters
    mock_job.inspect_details.result.info_type_stats.info_type.name = "EMAIL_ADDRESS"
    finding = mock_job.inspect_details.result.info_type_stats.info_type

    mock_job.inspect_details.result.info_type_stats = [
        MagicMock(info_type=finding, count=1),
    ]

    deid_cloud_storage.deidentify_cloud_storage(
        GCLOUD_PROJECT,
        "input_bucket",
        "output_bucket",
        ["EMAIL_ADDRESS", "PERSON_NAME", "PHONE_NUMBER"],
        "deidentify_template_name",
        "structured_deidentify_template_name",
        "image_redaction_template_name",
        "DATASET_ID",
        "TABLE_ID",
        timeout=1,
    )
    out, _ = capsys.readouterr()
    assert test_job in out
    assert "Processed Bytes" in out
    assert "Info type: EMAIL_ADDRESS" in out

    create_job_args = mock_dlp_instance.create_dlp_job.call_args
    mock_dlp_instance.create_dlp_job.assert_called_once_with(
        request=create_job_args.kwargs["request"]
    )
    mock_dlp_instance.get_dlp_job.assert_called_once_with(request={"name": test_job})
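
Not part of this commit: an illustrative sketch, assuming the same mocking pattern as the test above, of how the 30-second polling loop could be exercised by returning a RUNNING job before a DONE one. The test name, job path, and argument values are made up for illustration.

# Hypothetical additional test: simulate one RUNNING poll followed by DONE.
from unittest import mock
from unittest.mock import MagicMock

import google.cloud.dlp_v2

import deid_cloud_storage


def _make_job(state: google.cloud.dlp_v2.DlpJob.JobState) -> MagicMock:
    job = MagicMock()
    job.name = "projects/test-project/dlpJobs/test_job"  # placeholder job name
    job.state = state
    return job


@mock.patch("time.sleep", return_value=None)  # skip the real 30s waits
@mock.patch("google.cloud.dlp_v2.DlpServiceClient")
def test_polls_until_done(dlp_client: MagicMock, _sleep: MagicMock) -> None:
    mock_dlp_instance = dlp_client.return_value
    mock_dlp_instance.create_dlp_job.return_value.name = (
        "projects/test-project/dlpJobs/test_job"
    )
    # First status check reports RUNNING, the second reports DONE.
    mock_dlp_instance.get_dlp_job.side_effect = [
        _make_job(google.cloud.dlp_v2.DlpJob.JobState.RUNNING),
        _make_job(google.cloud.dlp_v2.DlpJob.JobState.DONE),
    ]

    deid_cloud_storage.deidentify_cloud_storage(
        "test-project",
        "input_bucket",
        "output_bucket",
        ["EMAIL_ADDRESS"],
        "deidentify_template_name",
        "structured_deidentify_template_name",
        "image_redaction_template_name",
        "DATASET_ID",
        "TABLE_ID",
        timeout=60,  # 60s / 30s sleep interval allows two status checks
    )

    # One check before the loop plus one inside the first iteration.
    assert mock_dlp_instance.get_dlp_job.call_count == 2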

dlp/snippets/deid_test.py

+1 line changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@
 )
 SURROGATE_TYPE = "SSN_TOKEN"
 CSV_FILE = os.path.join(os.path.dirname(__file__), "resources/dates.csv")
+TXT_FILE = os.path.join(os.path.dirname(__file__), "resources/test.txt")
 DATE_SHIFTED_AMOUNT = 30
 DATE_FIELDS = ["birth_date", "register_date"]
 CSV_CONTEXT_FIELD = "name"
