From 58ee042f3103194b68db8cd379c10f012b21b0a2 Mon Sep 17 00:00:00 2001
From: David Cavazos
Date: Mon, 16 Sep 2019 12:04:27 -0700
Subject: [PATCH 1/2] Add bigquery_kms_key Dataflow sample

---
 dataflow/README.md                           |  90 +++++++++
 dataflow/encryption-keys/README.md           | 197 +++++++++++++++++++
 dataflow/encryption-keys/bigquery_kms_key.py |  88 +++++++++
 dataflow/encryption-keys/requirements.txt    |   1 +
 4 files changed, 376 insertions(+)
 create mode 100644 dataflow/README.md
 create mode 100644 dataflow/encryption-keys/README.md
 create mode 100644 dataflow/encryption-keys/bigquery_kms_key.py
 create mode 100644 dataflow/encryption-keys/requirements.txt

diff --git a/dataflow/README.md b/dataflow/README.md
new file mode 100644
index 00000000000..4d6d9b2ebb0
--- /dev/null
+++ b/dataflow/README.md
@@ -0,0 +1,90 @@
+# Getting started with Google Cloud Dataflow
+
+[![Open in Cloud Shell](http://gstatic.com/cloudssh/images/open-btn.svg)](https://console.cloud.google.com/cloudshell/editor)
+
+[Apache Beam](https://beam.apache.org/)
+is an open source, unified model for defining both batch and streaming data-parallel processing pipelines.
+This guide walks you through all the steps needed to run an Apache Beam pipeline on the
+[Google Cloud Dataflow](https://cloud.google.com/dataflow) runner.
+
+## Setting up your Google Cloud project
+
+The following instructions help you prepare your Google Cloud project.
+
+1. Install the [Cloud SDK](https://cloud.google.com/sdk/docs/).
+    > *Note:* This is not required in
+    > [Cloud Shell](https://console.cloud.google.com/cloudshell/editor)
+    > since it already has the Cloud SDK pre-installed.
+
+1. Create a new Google Cloud project via the
+    [*New Project* page](https://console.cloud.google.com/projectcreate),
+    or via the `gcloud` command line tool.
+
+    ```sh
+    export PROJECT=your-google-cloud-project-id
+    gcloud projects create $PROJECT
+    ```
+
+1. Set up the Cloud SDK to use your GCP project.
+
+    ```sh
+    gcloud init
+    ```
+
+1. [Enable billing](https://cloud.google.com/billing/docs/how-to/modify-project).
+
+1. [Enable the APIs](https://console.cloud.google.com/flows/enableapi?apiid=dataflow,compute_component,storage_component,storage_api,logging,cloudresourcemanager.googleapis.com,iam.googleapis.com):
+    Dataflow, Compute Engine, Cloud Storage, Cloud Storage JSON,
+    Stackdriver Logging, Cloud Resource Manager, and IAM API.
+
+1. Create a service account JSON key via the
+    [*Create service account key* page](https://console.cloud.google.com/apis/credentials/serviceaccountkey),
+    or via the `gcloud` command line tool.
+    Here is how to do it through the *Create service account key* page.
+
+    * From the **Service account** list, select **New service account**.
+    * In the **Service account name** field, enter a name.
+    * From the **Role** list, select **Project > Owner** **(*)**.
+    * Click **Create**. A JSON file that contains your key downloads to your computer.
+
+    Alternatively, you can use `gcloud` through the command line.
+
+    ```sh
+    export PROJECT=$(gcloud config get-value project)
+    export SA_NAME=samples
+    export IAM_ACCOUNT=$SA_NAME@$PROJECT.iam.gserviceaccount.com
+
+    # Create the service account.
+    gcloud iam service-accounts create $SA_NAME --display-name $SA_NAME
+
+    # Set the role to Project Owner (*).
+    gcloud projects add-iam-policy-binding $PROJECT \
+      --member serviceAccount:$IAM_ACCOUNT \
+      --role roles/owner
+
+    # Create a JSON file with the service account credentials.
+    gcloud iam service-accounts keys create path/to/your/credentials.json \
+      --iam-account=$IAM_ACCOUNT
+    ```
+
+    > **(*)** *Note:* The **Role** field authorizes your service account to access resources.
+    > You can view and change this field later by using the
+    > [GCP Console IAM page](https://console.cloud.google.com/iam-admin/iam).
+    > If you are developing a production app, specify more granular permissions than **Project > Owner**.
+    > For more information, see
+    > [Granting roles to service accounts](https://cloud.google.com/iam/docs/granting-roles-to-service-accounts).
+
+    For more information, see
+    [Creating and managing service accounts](https://cloud.google.com/iam/docs/creating-managing-service-accounts).
+
+1. Set your `GOOGLE_APPLICATION_CREDENTIALS` environment variable to point to your service account key file.
+
+    ```sh
+    export GOOGLE_APPLICATION_CREDENTIALS=path/to/your/credentials.json
+    ```
+
+## Setting up a Python development environment
+
+For instructions on how to install Python, virtualenv, and the Cloud SDK, see the
+[Setting up a Python development environment](https://cloud.google.com/python/setup)
+guide.
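+
+The guide above is the authoritative reference. As a quick sketch, a minimal
+setup on Linux or macOS might look like this (assuming Python and `virtualenv`
+are already installed; the environment name `env` is just an example):
+
+```sh
+# Create and activate an isolated Python environment.
+virtualenv env
+source env/bin/activate
+
+# Install Apache Beam with the extra dependencies for Google Cloud Platform.
+pip install apache-beam[gcp]
+```
+
+Run `deactivate` to leave the virtual environment when you are done.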
diff --git a/dataflow/encryption-keys/README.md b/dataflow/encryption-keys/README.md
new file mode 100644
index 00000000000..bbb68acc612
--- /dev/null
+++ b/dataflow/encryption-keys/README.md
@@ -0,0 +1,197 @@
+# Using customer-managed encryption keys
+
+[![Open in Cloud Shell](http://gstatic.com/cloudssh/images/open-btn.svg)](https://console.cloud.google.com/cloudshell/editor)
+
+This sample demonstrates how to use
+[customer-managed encryption keys](https://cloud.google.com/kms/)
+for the I/O connectors in an
+[Apache Beam](https://beam.apache.org) pipeline.
+For more information, see the
+[Using customer-managed encryption keys](https://cloud.google.com/dataflow/docs/guides/customer-managed-encryption-keys)
+docs page.
+
+## Before you begin
+
+Follow the
+[Getting started with Google Cloud Dataflow](../README.md)
+page, and make sure you have a Google Cloud project with billing enabled
+and a *service account JSON key* set up in your `GOOGLE_APPLICATION_CREDENTIALS` environment variable.
+Additionally, for this sample you need the following:
+
+1. [Enable the APIs](https://console.cloud.google.com/flows/enableapi?apiid=bigquery,cloudkms.googleapis.com):
+    BigQuery and Cloud KMS API.
+
+1. Create a Cloud Storage bucket.
+
+    ```sh
+    export BUCKET=your-gcs-bucket
+    gsutil mb gs://$BUCKET
+    ```
+
+1. [Create a symmetric key ring](https://cloud.google.com/kms/docs/creating-keys).
+    For best results, use a [regional location](https://cloud.google.com/kms/docs/locations).
+    This example uses a `global` key for simplicity.
+
+    ```sh
+    export KMS_KEYRING=samples-keyring
+    export KMS_KEY=samples-key
+
+    # Create a key ring.
+    gcloud kms keyrings create $KMS_KEYRING --location global
+
+    # Create a key.
+    gcloud kms keys create $KMS_KEY --location global \
+      --keyring $KMS_KEYRING --purpose encryption
+    ```
+
+    > *Note:* Although you can destroy the
+    > [*key version material*](https://cloud.google.com/kms/docs/destroy-restore),
+    > you [cannot delete keys and key rings](https://cloud.google.com/kms/docs/object-hierarchy#lifetime).
+    > Key rings and keys do not have billable costs or quota limitations,
+    > so their continued existence does not impact costs or production limits.
+
+1. Grant Encrypter/Decrypter permissions to the *Dataflow*, *Compute Engine*, and *BigQuery* accounts.
+
+    ```sh
+    export PROJECT=$(gcloud config get-value project)
+    export PROJECT_NUMBER=$(gcloud projects list --filter $PROJECT --format "value(PROJECT_NUMBER)")
+
+    # Grant Encrypter/Decrypter permissions to the Dataflow service account.
+    gcloud projects add-iam-policy-binding $PROJECT \
+      --member serviceAccount:service-$PROJECT_NUMBER@dataflow-service-producer-prod.iam.gserviceaccount.com \
+      --role roles/cloudkms.cryptoKeyEncrypterDecrypter
+
+    # Grant Encrypter/Decrypter permissions to the Compute Engine service account.
+    gcloud projects add-iam-policy-binding $PROJECT \
+      --member serviceAccount:service-$PROJECT_NUMBER@compute-system.iam.gserviceaccount.com \
+      --role roles/cloudkms.cryptoKeyEncrypterDecrypter
+
+    # Grant Encrypter/Decrypter permissions to the BigQuery service account.
+    gcloud projects add-iam-policy-binding $PROJECT \
+      --member serviceAccount:bq-$PROJECT_NUMBER@bigquery-encryption.iam.gserviceaccount.com \
+      --role roles/cloudkms.cryptoKeyEncrypterDecrypter
+    ```
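+
+    Optionally, you can verify that the three bindings took effect by listing
+    the members that currently hold the Encrypter/Decrypter role. This is only
+    a sanity check and is not required by the sample:
+
+    ```sh
+    # List all members that currently hold the Encrypter/Decrypter role.
+    gcloud projects get-iam-policy $PROJECT \
+      --flatten="bindings[].members" \
+      --filter="bindings.role:roles/cloudkms.cryptoKeyEncrypterDecrypter" \
+      --format="value(bindings.members)"
+    ```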
+
+1. Clone the `python-docs-samples` repository.
+
+    ```sh
+    git clone https://github.com/GoogleCloudPlatform/python-docs-samples.git
+    ```
+
+1. Navigate to the sample code directory.
+
+    ```sh
+    cd python-docs-samples/dataflow/encryption-keys
+    ```
+
+1. Create a virtual environment and activate it.
+
+    ```sh
+    virtualenv env
+    source env/bin/activate
+    ```
+
+    > Once you are done, you can deactivate the virtualenv and go back to your global Python environment by running `deactivate`.
+
+1. Install the sample requirements.
+
+    ```sh
+    pip install -U -r requirements.txt
+    ```
+
+## BigQuery KMS Key example
+
+* [bigquery_kms_key.py](bigquery_kms_key.py)
+
+The following sample reads data from the
+[NASA wildfires public BigQuery dataset](https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=nasa_wildfire&t=past_week&page=table)
+using a customer-managed encryption key, and writes that data into the specified `output_bigquery_table`
+using the same customer-managed encryption key.
+
+Make sure you have the following variables set up:
+
+```sh
+# Set the project ID and GCS bucket.
+export PROJECT=$(gcloud config get-value project)
+export BUCKET=your-gcs-bucket
+
+# Set the region for the Dataflow job.
+# https://cloud.google.com/compute/docs/regions-zones/
+export REGION=us-central1
+
+# Set the KMS key ID.
+export KMS_KEYRING=samples-keyring
+export KMS_KEY=samples-key
+export KMS_KEY_ID=$(gcloud kms keys list --location global --keyring $KMS_KEYRING --filter $KMS_KEY --format "value(NAME)")
+
+# Output BigQuery dataset and table name.
+export DATASET=samples
+export TABLE=dataflow_kms
+```
+
+Create the BigQuery dataset where the output table will reside.
+
+```sh
+# Create the BigQuery dataset.
+bq mk --dataset $PROJECT:$DATASET
+```
+
+To run the sample using the Dataflow runner:
+
+```sh
+python bigquery_kms_key.py \
+  --output_bigquery_table $PROJECT:$DATASET.$TABLE \
+  --kms_key $KMS_KEY_ID \
+  --project $PROJECT \
+  --runner DataflowRunner \
+  --temp_location gs://$BUCKET/samples/dataflow/kms/tmp \
+  --region $REGION
+```
+
+> *Note:* To run locally, you can omit the `--runner` command line argument and it defaults to the `DirectRunner`.
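+
+For example, the same command without the `--runner` argument runs the pipeline
+locally with the `DirectRunner`. This is only a sketch of a local run; the
+`DirectRunner` still reads from and writes to the real BigQuery table, so it
+needs the same credentials and KMS permissions as above:
+
+```sh
+python bigquery_kms_key.py \
+  --output_bigquery_table $PROJECT:$DATASET.$TABLE \
+  --kms_key $KMS_KEY_ID \
+  --project $PROJECT \
+  --temp_location gs://$BUCKET/samples/dataflow/kms/tmp \
+  --region $REGION
+```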
+
+You can check your submitted Cloud Dataflow jobs in the
+[GCP Console Dataflow page](https://console.cloud.google.com/dataflow) or by using `gcloud`.
+
+```sh
+gcloud dataflow jobs list
+```
+
+Finally, check the contents of the BigQuery table.
+
+```sh
+bq query --use_legacy_sql=false "SELECT * FROM \`$PROJECT.$DATASET.$TABLE\`"
+```
+
+## Cleanup
+
+To avoid incurring charges to your GCP account for the resources used:
+
+```sh
+# Remove only the files created by this sample.
+gsutil -m rm -rf "gs://$BUCKET/samples/dataflow/kms"
+
+# [optional] Remove the Cloud Storage bucket.
+gsutil rb gs://$BUCKET
+
+# Remove the BigQuery table.
+bq rm -f -t $PROJECT:$DATASET.$TABLE
+
+# [optional] Remove the BigQuery dataset and all its tables.
+bq rm -r -f -d $PROJECT:$DATASET
+
+# Revoke Encrypter/Decrypter permissions from the Dataflow service account.
+gcloud projects remove-iam-policy-binding $PROJECT \
+  --member serviceAccount:service-$PROJECT_NUMBER@dataflow-service-producer-prod.iam.gserviceaccount.com \
+  --role roles/cloudkms.cryptoKeyEncrypterDecrypter
+
+# Revoke Encrypter/Decrypter permissions from the Compute Engine service account.
+gcloud projects remove-iam-policy-binding $PROJECT \
+  --member serviceAccount:service-$PROJECT_NUMBER@compute-system.iam.gserviceaccount.com \
+  --role roles/cloudkms.cryptoKeyEncrypterDecrypter
+
+# Revoke Encrypter/Decrypter permissions from the BigQuery service account.
+gcloud projects remove-iam-policy-binding $PROJECT \
+  --member serviceAccount:bq-$PROJECT_NUMBER@bigquery-encryption.iam.gserviceaccount.com \
+  --role roles/cloudkms.cryptoKeyEncrypterDecrypter
+```
diff --git a/dataflow/encryption-keys/bigquery_kms_key.py b/dataflow/encryption-keys/bigquery_kms_key.py
new file mode 100644
index 00000000000..bfe84196953
--- /dev/null
+++ b/dataflow/encryption-keys/bigquery_kms_key.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+#
+# Copyright 2019 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+
+def run(output_bigquery_table, kms_key, beam_args):
+    # [START dataflow_cmek]
+    import apache_beam as beam
+
+    # output_bigquery_table = '<project>:<dataset>.<table>'
+    # kms_key = 'projects/<project>/locations/<location>/keyRings/<key-ring>/cryptoKeys/<key>'  # noqa
+    # beam_args = [
+    #     '--project', 'your-project-id',
+    #     '--runner', 'DataflowRunner',
+    #     '--temp_location', 'gs://your-bucket/samples/dataflow/kms/tmp',
+    #     '--region', 'us-central1',
+    # ]
+
+    # Query from the NASA wildfires public dataset:
+    # https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=nasa_wildfire&t=past_week&page=table
+    query = """
+        SELECT latitude,longitude,acq_date,acq_time,bright_ti4,confidence
+        FROM `bigquery-public-data.nasa_wildfire.past_week`
+        LIMIT 10
+    """
+
+    # Schema for the output BigQuery table.
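+    # The field names and types below mirror the columns selected in the query
+    # above. WriteToBigQuery uses this schema to create the output table if it
+    # does not already exist.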
+    schema = {
+        'fields': [
+            {'name': 'latitude', 'type': 'FLOAT'},
+            {'name': 'longitude', 'type': 'FLOAT'},
+            {'name': 'acq_date', 'type': 'DATE'},
+            {'name': 'acq_time', 'type': 'TIME'},
+            {'name': 'bright_ti4', 'type': 'FLOAT'},
+            {'name': 'confidence', 'type': 'STRING'},
+        ],
+    }
+
+    options = beam.options.pipeline_options.PipelineOptions(beam_args)
+    with beam.Pipeline(options=options) as pipeline:
+        (
+            pipeline
+            | 'Read from BigQuery with KMS key' >>
+                beam.io.Read(beam.io.BigQuerySource(
+                    query=query,
+                    use_standard_sql=True,
+                    kms_key=kms_key,
+                ))
+            | 'Write to BigQuery with KMS key' >>
+                beam.io.WriteToBigQuery(
+                    output_bigquery_table,
+                    schema=schema,
+                    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
+                    kms_key=kms_key,
+                )
+        )
+    # [END dataflow_cmek]
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--kms_key',
+        required=True,
+        help='Cloud Key Management Service key name',
+    )
+    parser.add_argument(
+        '--output_bigquery_table',
+        required=True,
+        help="Output BigQuery table in the format 'PROJECT:DATASET.TABLE'",
+    )
+    args, beam_args = parser.parse_known_args()
+
+    run(args.output_bigquery_table, args.kms_key, beam_args)
diff --git a/dataflow/encryption-keys/requirements.txt b/dataflow/encryption-keys/requirements.txt
new file mode 100644
index 00000000000..dd30470c2c1
--- /dev/null
+++ b/dataflow/encryption-keys/requirements.txt
@@ -0,0 +1 @@
+apache-beam[gcp]
\ No newline at end of file

From ddcc2b41c873a5180078611a42cbccd379c98d89 Mon Sep 17 00:00:00 2001
From: David Cavazos
Date: Wed, 18 Sep 2019 15:50:05 -0700
Subject: [PATCH 2/2] Clarified description on service accounts

---
 dataflow/encryption-keys/README.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/dataflow/encryption-keys/README.md b/dataflow/encryption-keys/README.md
index bbb68acc612..953d95b10ec 100644
--- a/dataflow/encryption-keys/README.md
+++ b/dataflow/encryption-keys/README.md
@@ -50,7 +50,12 @@ Additionally, for this sample you need the following:
     > Key rings and keys do not have billable costs or quota limitations,
     > so their continued existence does not impact costs or production limits.

-1. Grant Encrypter/Decrypter permissions to the *Dataflow*, *Compute Engine*, and *BigQuery* accounts.
+1. Grant Encrypter/Decrypter permissions to the *Dataflow*, *Compute Engine*, and *BigQuery*
+    [service accounts](https://cloud.google.com/iam/docs/service-accounts).
+    This grants your Dataflow, Compute Engine and BigQuery service accounts the
+    permission to encrypt and decrypt with the CMEK you specify.
+    The Dataflow workers use these service accounts when running the pipeline,
+    which is different from the *user* service account used to start the pipeline.

     ```sh
     export PROJECT=$(gcloud config get-value project)