diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5e9a8238a86..90b740e9aa3 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -9,3 +9,6 @@ bigquery/transfer/* @tswast # Tim Swast is the primary maintainer of the Composer samples. composer/* @tswast + +# Alix Hamilton is the primary maintainer of the Jupyter notebook samples +notebooks/* @alixhami diff --git a/.gitignore b/.gitignore index 79fbc1fc767..08a370acb39 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ .coverage .tox .pytest_cache +.ipynb_checkpoints +.executed_notebooks coverage.xml python-docs-samples.json service-account.json diff --git a/notebooks/README.md b/notebooks/README.md new file mode 100644 index 00000000000..4ae67189085 --- /dev/null +++ b/notebooks/README.md @@ -0,0 +1,50 @@ +# Notebook Tutorials + +This directory contains Jupyter notebook tutorials for Google Cloud Platform. +The tutorials assume you have performed the following steps: + +1. Install Jupyter notebooks ([instructions](https://jupyter.org/install)) +1. Install the dependencies in the [requirements.txt](./requirements.txt) file ([instructions below](#install-the-dependencies)) +1. Registered the `google-cloud-bigquery` magic commands ([instructions below](#register-magics-and-configure-matplotlib)) +1. Set `matplotlib` to render inline ([instructions below](#register-magics-and-configure-matplotlib)) + +## Install the dependencies + +Install the dependencies with the following command: + + pip install --upgrade -r requirements.txt + +## Register magics and configure matplotlib + +You can either perform these set up steps in a single notebook, or add the +steps to your IPython configuration file to apply to all notebooks. + +### Perform set up steps within a notebook + +To perform the set up steps for a single notebook, run the following commands +in your notebook to register the BigQuery magic commands and set `matplotlib` +to render inline: +```python +%load_ext google.cloud.bigquery +%matplotlib inline +``` + +### Perform set up steps in your IPython configuration file + +To perform the set up steps implicitly for all of your notebooks, add the +following code to your `ipython_config.py` file to register the BigQuery magic +commands and set `matplotlib` to render inline: +```python +c = get_config() + +# Register magic commands +c.InteractiveShellApp.extensions = [ + 'google.cloud.bigquery', +] + +# Enable matplotlib renderings to render inline in the notebook. +c.InteractiveShellApp.matplotlib = 'inline' +``` +See +[IPython documentation](https://ipython.readthedocs.io/en/stable/config/intro.html) +for more information about IPython configuration. diff --git a/notebooks/rendered/bigquery-basics.md b/notebooks/rendered/bigquery-basics.md new file mode 100644 index 00000000000..c72b37f6fc0 --- /dev/null +++ b/notebooks/rendered/bigquery-basics.md @@ -0,0 +1,232 @@ + +# BigQuery basics + +[BigQuery](https://cloud.google.com/bigquery/docs/) is a petabyte-scale analytics data warehouse that you can use to run SQL queries over vast amounts of data in near realtime. This page shows you how to get started with the Google BigQuery API using the Python client library. + +## Import the libraries used in this tutorial + + +```python +from google.cloud import bigquery +import pandas +``` + +## Initialize a client + +To use the BigQuery Python client library, start by initializing a client. The BigQuery client is used to send and receive messages from the BigQuery API. 
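Before creating a client, you can confirm that the client library is available in your notebook environment by checking the installed package version (a minimal sanity check; the version you see depends on your environment, for example the 1.9.0 release pinned in this directory's requirements.txt):

```python
import google.cloud.bigquery

# Print the installed google-cloud-bigquery version.
print(google.cloud.bigquery.__version__)
```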
+ +### Client project +The `bigquery.Client` object uses your default project. Alternatively, you can specify a project in the `Client` constructor. For more information about how the default project is determined, see the [google-auth documentation](https://google-auth.readthedocs.io/en/latest/reference/google.auth.html). + + +### Client location +Locations are required for certain BigQuery operations such as creating a dataset. If a location is provided to the client when it is initialized, it will be the default location for jobs, datasets, and tables. + +Run the following to create a client with your default project: + + +```python +client = bigquery.Client(location="US") +print("Client creating using default project: {}".format(client.project)) +``` + +To explicitly specify a project when constructing the client, set the `project` parameter: + + +```python +# client = bigquery.Client(location="US", project="your-project-id") +``` + +## Run a query on a public dataset + +The following example queries the BigQuery `usa_names` public dataset to find the 10 most popular names. `usa_names` is a Social Security Administration dataset that contains all names from Social Security card applications for births that occurred in the United States after 1879. + +Use the [Client.query](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html#google.cloud.bigquery.client.Client.query) method to run the query, and the [QueryJob.to_dataframe](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJob.html#google.cloud.bigquery.job.QueryJob.to_dataframe) method to return the results as a pandas [`DataFrame`](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html). + + +```python +query = """ + SELECT name, SUM(number) as total + FROM `bigquery-public-data.usa_names.usa_1910_current` + GROUP BY name + ORDER BY total DESC + LIMIT 10 +""" +query_job = client.query( + query, + # Location must match that of the dataset(s) referenced in the query. + location="US", +) # API request - starts the query + +df = query_job.to_dataframe() +df +``` + +## Run a parameterized query + +BigQuery supports query parameters to help prevent [SQL injection](https://en.wikipedia.org/wiki/SQL_injection) when you construct a query with user input. Query parameters are only available with [standard SQL syntax](https://cloud.google.com/bigquery/docs/reference/standard-sql/). Query parameters can be used as substitutes for arbitrary expressions. Parameters cannot be used as substitutes for identifiers, column names, table names, or other parts of the query. + +To specify a parameter, use the `@` character followed by an [identifier](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers), such as `@param_name`. For example, the following query finds all the words in a specific Shakespeare corpus with counts that are at least the specified value. + +For more information, see [Running parameterized queries](https://cloud.google.com/bigquery/docs/parameterized-queries) in the BigQuery documentation. 
+ + +```python +# Define the query +sql = """ + SELECT word, word_count + FROM `bigquery-public-data.samples.shakespeare` + WHERE corpus = @corpus + AND word_count >= @min_word_count + ORDER BY word_count DESC; +""" + +# Define the parameter values in a query job configuration +job_config = bigquery.QueryJobConfig( + query_parameters=[ + bigquery.ScalarQueryParameter("corpus", "STRING", "romeoandjuliet"), + bigquery.ScalarQueryParameter("min_word_count", "INT64", 250), + ] +) + +# Start the query job +query_job = client.query(sql, location="US", job_config=job_config) + +# Return the results as a pandas DataFrame +query_job.to_dataframe() +``` + +## Create a new dataset + +A dataset is contained within a specific [project](https://cloud.google.com/bigquery/docs/projects). Datasets are top-level containers that are used to organize and control access to your [tables](https://cloud.google.com/bigquery/docs/tables) and [views](https://cloud.google.com/bigquery/docs/views). A table or view must belong to a dataset. You need to create at least one dataset before [loading data into BigQuery](https://cloud.google.com/bigquery/loading-data-into-bigquery). + + +```python +# Define a name for the new dataset. +dataset_id = 'your_new_dataset' + +# The project defaults to the Client's project if not specified. +dataset = client.create_dataset(dataset_id) # API request +``` + +## Write query results to a destination table + +For more information, see [Writing query results](https://cloud.google.com/bigquery/docs/writing-results) in the BigQuery documentation. + + +```python +sql = """ + SELECT corpus + FROM `bigquery-public-data.samples.shakespeare` + GROUP BY corpus; +""" +table_ref = dataset.table("your_new_table_id") +job_config = bigquery.QueryJobConfig( + destination=table_ref +) + +# Start the query, passing in the extra configuration. +query_job = client.query(sql, location="US", job_config=job_config) + +query_job.result() # Waits for the query to finish +print("Query results loaded to table {}".format(table_ref.path)) +``` + +## Load data from a pandas DataFrame to a new table + + +```python +records = [ + {"title": "The Meaning of Life", "release_year": 1983}, + {"title": "Monty Python and the Holy Grail", "release_year": 1975}, + {"title": "Life of Brian", "release_year": 1979}, + {"title": "And Now for Something Completely Different", "release_year": 1971}, +] + +# Optionally set explicit indices. +# If indices are not specified, a column will be created for the default +# indices created by pandas. +index = ["Q24980", "Q25043", "Q24953", "Q16403"] +df = pandas.DataFrame(records, index=pandas.Index(index, name="wikidata_id")) + +table_ref = dataset.table("monty_python") +job = client.load_table_from_dataframe(df, table_ref, location="US") + +job.result() # Waits for table load to complete. +print("Loaded dataframe to {}".format(table_ref.path)) +``` + +## Load data from a local file to a table + +The following example demonstrates how to load a local CSV file into a new table. See [SourceFormat](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.SourceFormat.html#google.cloud.bigquery.job.SourceFormat) in the Python client library documentation for a list of available source formats. For more information, see [Loading Data into BigQuery from a local data source](https://cloud.google.com/bigquery/docs/loading-data-local) in the BigQuery documentation. 
+ + +```python +source_filename = 'resources/us-states.csv' + +table_ref = dataset.table('us_states_from_local_file') +job_config = bigquery.LoadJobConfig( + source_format=bigquery.SourceFormat.CSV, + skip_leading_rows=1, + autodetect=True +) + +with open(source_filename, 'rb') as source_file: + job = client.load_table_from_file( + source_file, + table_ref, + location='US', # Must match the destination dataset location. + job_config=job_config) # API request + +job.result() # Waits for table load to complete. + +print('Loaded {} rows into {}:{}.'.format( + job.output_rows, dataset_id, table_ref.path)) +``` + +## Load data from Cloud Storage to a table + +The following example demonstrates how to load a local CSV file into a new table. See [SourceFormat](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.SourceFormat.html#google.cloud.bigquery.job.SourceFormat) in the Python client library documentation for a list of available source formats. For more information, see [Introduction to loading data from Cloud Storage](https://cloud.google.com/bigquery/docs/loading-data-cloud-storage) in the BigQuery documentation. + + +```python +# Configure the load job +job_config = bigquery.LoadJobConfig( + schema=[ + bigquery.SchemaField('name', 'STRING'), + bigquery.SchemaField('post_abbr', 'STRING') + ], + skip_leading_rows=1, + # The source format defaults to CSV. The line below is optional. + source_format=bigquery.SourceFormat.CSV +) +uri = 'gs://cloud-samples-data/bigquery/us-states/us-states.csv' +destination_table_ref = dataset.table('us_states_from_gcs') + +# Start the load job +load_job = client.load_table_from_uri( + uri, destination_table_ref, job_config=job_config) +print('Starting job {}'.format(load_job.job_id)) + +load_job.result() # Waits for table load to complete. +print('Job finished.') + +# Retreive the destination table +destination_table = client.get_table(table_ref) +print('Loaded {} rows.'.format(destination_table.num_rows)) +``` + +## Cleaning Up + +The following code deletes the dataset created for this tutorial, including all tables in the dataset. + + +```python +# Retrieve the dataset from the API +dataset = client.get_dataset(client.dataset(dataset_id)) + +# Delete the dataset and its contents +client.delete_dataset(dataset, delete_contents=True) + +print('Deleted dataset: {}'.format(dataset.path)) +``` diff --git a/notebooks/rendered/bigquery-command-line-tool.md b/notebooks/rendered/bigquery-command-line-tool.md new file mode 100644 index 00000000000..9d824a09065 --- /dev/null +++ b/notebooks/rendered/bigquery-command-line-tool.md @@ -0,0 +1,101 @@ + +# BigQuery command-line tool + +The BigQuery command-line tool is installed as part of the [Cloud SDK](https://cloud-dot-devsite.googleplex.com/sdk/docs/) and can be used to interact with BigQuery. When you use CLI commands in a notebook, the command must be prepended with a `!`. + +## View available commands + +To view the available commands for the BigQuery command-line tool, use the `help` command. + + +```python +!bq help +``` + +## Create a new dataset + +A dataset is contained within a specific [project](https://cloud.google.com/bigquery/docs/projects). Datasets are top-level containers that are used to organize and control access to your [tables](https://cloud.google.com/bigquery/docs/tables) and [views](https://cloud.google.com/bigquery/docs/views). A table or view must belong to a dataset. 
You need to create at least one dataset before [loading data into BigQuery](https://cloud.google.com/bigquery/loading-data-into-bigquery). + +First, name your new dataset: + + +```python +dataset_id = "your_new_dataset" +``` + +The following command creates a new dataset in the US using the ID defined above. + +NOTE: In the examples in this notebook, the `dataset_id` variable is referenced in the commands using both `{}` and `$`. To avoid creating and using variables, replace these interpolated variables with literal values and remove the `{}` and `$` characters. + + +```python +!bq --location=US mk --dataset $dataset_id +``` + +The response should look like the following: + +``` +Dataset 'your-project-id:your_new_dataset' successfully created. +``` + +## List datasets + +The following command lists all datasets in your default project. + + +```python +!bq ls +``` + +The response should look like the following: + +``` + datasetId + ------------------------------ + your_new_dataset +``` + +## Load data from a local file to a table + +The following example demonstrates how to load a local CSV file into a new or existing table. See [SourceFormat](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.SourceFormat.html#google.cloud.bigquery.job.SourceFormat) in the Python client library documentation for a list of available source formats. For more information, see [Loading Data into BigQuery from a local data source](https://cloud.google.com/bigquery/docs/loading-data-local) in the BigQuery documentation. + + +```python +!bq \ + --location=US \ + load \ + --autodetect \ + --skip_leading_rows=1 \ + --source_format=CSV \ + {dataset_id}.us_states_local_file \ + 'resources/us-states.csv' +``` + +## Load data from Cloud Storage to a table + +The following example demonstrates how to load a local CSV file into a new table. See [SourceFormat](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.SourceFormat.html#google.cloud.bigquery.job.SourceFormat) in the Python client library documentation for a list of available source formats. For more information, see [Introduction to loading data from Cloud Storage](https://cloud.google.com/bigquery/docs/loading-data-cloud-storage) in the BigQuery documentation. + + +```python +!bq \ + --location=US \ + load \ + --autodetect \ + --skip_leading_rows=1 \ + --source_format=CSV \ + {dataset_id}.us_states_gcs \ + 'gs://cloud-samples-data/bigquery/us-states/us-states.csv' +``` + +## Run a query + +The BigQuery command-line tool has a `query` command for running queries, but it is recommended to use the [magic command](./BigQuery%20Query%20Magic.ipynb) for this purpose. + +## Cleaning Up + +The following code deletes the dataset created for this tutorial, including all tables in the dataset. + + +```python +!bq rm -r -f --dataset $dataset_id +``` diff --git a/notebooks/rendered/bigquery-query-magic.md b/notebooks/rendered/bigquery-query-magic.md new file mode 100644 index 00000000000..6200ac53084 --- /dev/null +++ b/notebooks/rendered/bigquery-query-magic.md @@ -0,0 +1,91 @@ + +# BigQuery query magic + +Jupyter magics are notebook-specific shortcuts that allow you to run commands with minimal syntax. Jupyter notebooks come with many [built-in commands](https://ipython.readthedocs.io/en/stable/interactive/magics.html). The BigQuery client library, `google-cloud-bigquery`, provides a cell magic, `%%bigquery`. 
The `%%bigquery` magic runs a SQL query and returns the results as a pandas `DataFrame`. + +## Run a query on a public dataset + +The following example queries the BigQuery `usa_names` public dataset. `usa_names` is a Social Security Administration dataset that contains all names from Social Security card applications for births that occurred in the United States after 1879. + +The following example shows how to invoke the magic (`%%bigquery`), and how to pass in a standard SQL query in the body of the code cell. The results are displayed below the input cell as a pandas [`DataFrame`](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html). + + +```python +%%bigquery +SELECT name, SUM(number) as count +FROM `bigquery-public-data.usa_names.usa_1910_current` +GROUP BY name +ORDER BY count DESC +LIMIT 10 +``` + +## Display verbose output + +As the query job is running, status messages below the cell update with the query job ID and the amount of time the query has been running. By default, this output is erased and replaced with the results of the query. If you pass the `--verbose` flag, the output will remain below the cell after query completion. + + +```python +%%bigquery --verbose +SELECT name, SUM(number) as count +FROM `bigquery-public-data.usa_names.usa_1910_current` +GROUP BY name +ORDER BY count DESC +LIMIT 10 +``` + +## Explicitly specify a project + +By default, the `%%bigquery` magic command uses your default project to run the query. You may also explicitly provide a project ID using the `--project` flag. Note that your credentials must have permissions to create query jobs in the project you specify. + + +```python +project_id = 'your-project-id' +``` + + +```python +%%bigquery --project $project_id +SELECT name, SUM(number) as count +FROM `bigquery-public-data.usa_names.usa_1910_current` +GROUP BY name +ORDER BY count DESC +LIMIT 10 +``` + +## Assign the query results to a variable + +To save the results of your query to a variable, provide a variable name as a parameter to `%%bigquery`. The following example saves the results of the query to a variable named `df`. Note that when a variable is provided, the results are not displayed below the cell that invokes the magic command. + + +```python +%%bigquery df +SELECT name, SUM(number) as count +FROM `bigquery-public-data.usa_names.usa_1910_current` +GROUP BY name +ORDER BY count DESC +LIMIT 10 +``` + + +```python +df +``` + +## Run a parameterized query + +Parameterized queries are useful if you need to run a query with certain parameters that are calculated at run time. Note that the value types must be JSON serializable. The following example defines a parameters dictionary and passes it to the `--params` flag. The key of the dictionary is the name of the parameter, and the value of the dictionary is the value of the parameter. + + +```python +params = {"limit": 10} +``` + + +```python +%%bigquery --params $params +SELECT name, SUM(number) as count +FROM `bigquery-public-data.usa_names.usa_1910_current` +GROUP BY name +ORDER BY count DESC +LIMIT @limit +``` diff --git a/notebooks/rendered/cloud-storage-client-library.md b/notebooks/rendered/cloud-storage-client-library.md new file mode 100644 index 00000000000..7fde6e40117 --- /dev/null +++ b/notebooks/rendered/cloud-storage-client-library.md @@ -0,0 +1,158 @@ + +# Cloud Storage client library + +This tutorial shows how to get started with the [Cloud Storage Python client library](https://googleapis.github.io/google-cloud-python/latest/storage/index.html). 
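The client library resolves credentials and a default project for you. If you are unsure which project your notebook will use, you can inspect it with `google.auth`, the library the storage client relies on for authentication (a small sketch, assuming application default credentials are already configured in your environment):

```python
import google.auth

# Resolve the application default credentials and the project they map to.
# The project may be None if it cannot be determined from the environment.
credentials, default_project = google.auth.default()
print("Default project: {}".format(default_project))
```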
+ +## Create a storage bucket + +Buckets are the basic containers that hold your data. Everything that you store in Cloud Storage must be contained in a bucket. You can use buckets to organize your data and control access to your data. + +Start by importing the library: + + +```python +from google.cloud import storage +``` + +The `storage.Client` object uses your default project. Alternatively, you can specify a project in the `Client` constructor. For more information about how the default project is determined, see the [google-auth documentation](https://google-auth.readthedocs.io/en/latest/reference/google.auth.html). + +Run the following to create a client with your default project: + + +```python +client = storage.Client() +print("Client created using default project: {}".format(client.project)) +``` + +To explicitly specify a project when constructing the client, set the `project` parameter: + + +```python +# client = storage.Client(project='your-project-id') +``` + +Finally, create a bucket with a globally unique name. + +For more information about naming buckets, see [Bucket name requirements](https://cloud.google.com/storage/docs/naming#requirements). + + +```python +# Replace the string below with a unique name for the new bucket +bucket_name = "your-new-bucket" + +# Creates the new bucket +bucket = client.create_bucket(bucket_name) + +print("Bucket {} created.".format(bucket.name)) +``` + +## List buckets in a project + + +```python +buckets = client.list_buckets() + +print("Buckets in {}:".format(client.project)) +for item in buckets: + print("\t" + item.name) +``` + +## Get bucket metadata + +The next cell shows how to get information on metadata of your Cloud Storage buckets. + +To learn more about specific bucket properties, see [Bucket locations](https://cloud.google.com/storage/docs/locations) and [Storage classes](https://cloud.google.com/storage/docs/storage-classes). + + +```python +bucket = client.get_bucket(bucket_name) + +print("Bucket name: {}".format(bucket.name)) +print("Bucket location: {}".format(bucket.location)) +print("Bucket storage class: {}".format(bucket.storage_class)) +``` + +## Upload a local file to a bucket + +Objects are the individual pieces of data that you store in Cloud Storage. Objects are referred to as "blobs" in the Python client library. There is no limit on the number of objects that you can create in a bucket. + +An object's name is treated as a piece of object metadata in Cloud Storage. Object names can contain any combination of Unicode characters (UTF-8 encoded) and must be less than 1024 bytes in length. + +For more information, including how to rename an object, see the [Object name requirements](https://cloud.google.com/storage/docs/naming#objectnames). + + +```python +blob_name = "us-states.txt" +blob = bucket.blob(blob_name) + +source_file_name = "resources/us-states.txt" +blob.upload_from_filename(source_file_name) + +print("File uploaded to {}.".format(bucket.name)) +``` + +## List blobs in a bucket + + +```python +blobs = bucket.list_blobs() + +print("Blobs in {}:".format(bucket.name)) +for item in blobs: + print("\t" + item.name) +``` + +## Get a blob and display metadata + +See [documentation](https://cloud.google.com/storage/docs/viewing-editing-metadata) for more information about object metadata. 
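In addition to basic fields such as the blob's name and size, a fetched `Blob` exposes other metadata as plain Python properties, for example its MD5 hash, last update time, and storage class (a brief illustrative sketch, assuming `bucket` and `blob_name` are defined as in the earlier cells):

```python
blob = bucket.get_blob(blob_name)

# A few additional metadata properties exposed by the client library.
print("MD5 hash: {}".format(blob.md5_hash))
print("Last updated: {}".format(blob.updated))
print("Storage class: {}".format(blob.storage_class))
```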
+ + +```python +blob = bucket.get_blob(blob_name) + +print("Name: {}".format(blob.id)) +print("Size: {} bytes".format(blob.size)) +print("Content type: {}".format(blob.content_type)) +print("Public URL: {}".format(blob.public_url)) +``` + +## Download a blob to a local directory + + +```python +output_file_name = "resources/downloaded-us-states.txt" +blob.download_to_filename(output_file_name) + +print("Downloaded blob {} to {}.".format(blob.name, output_file_name)) +``` + +## Cleaning up + +### Delete a blob + + +```python +blob = client.get_bucket(bucket_name).get_blob(blob_name) +blob.delete() + +print("Blob {} deleted.".format(blob.name)) +``` + +### Delete a bucket + +Note that the bucket must be empty before it can be deleted. + + +```python +bucket = client.get_bucket(bucket_name) +bucket.delete() + +print("Bucket {} deleted.".format(bucket.name)) +``` + +## Next Steps + +Read more about Cloud Storage in the documentation: ++ [Storage key terms](https://cloud.google.com/storage/docs/key-terms) ++ [How-to guides](https://cloud.google.com/storage/docs/how-to) ++ [Pricing](https://cloud.google.com/storage/pricing) diff --git a/notebooks/rendered/getting-started-with-bigquery-ml.md b/notebooks/rendered/getting-started-with-bigquery-ml.md new file mode 100644 index 00000000000..be92acf0b8a --- /dev/null +++ b/notebooks/rendered/getting-started-with-bigquery-ml.md @@ -0,0 +1,239 @@ + +# Getting started with BigQuery ML + +BigQuery ML enables users to create and execute machine learning models in BigQuery using SQL queries. The goal is to democratize machine learning by enabling SQL practitioners to build models using their existing tools and to increase development speed by eliminating the need for data movement. + +In this tutorial, you use the sample [Google Analytics sample dataset for BigQuery](https://support.google.com/analytics/answer/7586738?hl=en&ref_topic=3416089) to create a model that predicts whether a website visitor will make a transaction. For information on the schema of the Analytics dataset, see [BigQuery export schema](https://support.google.com/analytics/answer/3437719) in the Google Analytics Help Center. + + +## Objectives +In this tutorial, you use: + ++ BigQuery ML to create a binary logistic regression model using the `CREATE MODEL` statement ++ The `ML.EVALUATE` function to evaluate the ML model ++ The `ML.PREDICT` function to make predictions using the ML model + +## Create your dataset + +Enter the following code to import the BigQuery Python client library and initialize a client. The BigQuery client is used to send and receive messages from the BigQuery API. + + +```python +from google.cloud import bigquery + +client = bigquery.Client(location="US") +``` + +Next, you create a BigQuery dataset to store your ML model. Run the following to create your dataset: + + +```python +dataset = client.create_dataset("bqml_tutorial") +``` + +## Create your model + +Next, you create a logistic regression model using the Google Analytics sample +dataset for BigQuery. The model is used to predict whether a +website visitor will make a transaction. The standard SQL query uses a +`CREATE MODEL` statement to create and train the model. Standard SQL is the +default query syntax for the BigQuery python client library. + +The BigQuery python client library provides a cell magic, +`%%bigquery`, which runs a SQL query and returns the results as a Pandas +`DataFrame`. 
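If the `%%bigquery` magic is not already registered in your environment (for example, because you skipped the setup steps in this directory's README), load the extension in your notebook before running the queries below:

```python
# Register the BigQuery cell magic for this notebook session.
%load_ext google.cloud.bigquery
```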
+ +To run the `CREATE MODEL` query to create and train your model: + + +```python +%%bigquery +CREATE OR REPLACE MODEL `bqml_tutorial.sample_model` +OPTIONS(model_type='logistic_reg') AS +SELECT + IF(totals.transactions IS NULL, 0, 1) AS label, + IFNULL(device.operatingSystem, "") AS os, + device.isMobile AS is_mobile, + IFNULL(geoNetwork.country, "") AS country, + IFNULL(totals.pageviews, 0) AS pageviews +FROM + `bigquery-public-data.google_analytics_sample.ga_sessions_*` +WHERE + _TABLE_SUFFIX BETWEEN '20160801' AND '20170630' +``` + +The query takes several minutes to complete. After the first iteration is +complete, your model (`sample_model`) appears in the navigation panel of the +BigQuery web UI. Because the query uses a `CREATE MODEL` statement to create a +table, you do not see query results. The output is an empty `DataFrame`. + +## Get training statistics + +To see the results of the model training, you can use the +[`ML.TRAINING_INFO`](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-train) +function, or you can view the statistics in the BigQuery web UI. This functionality +is not currently available in the BigQuery Classic web UI. +In this tutorial, you use the `ML.TRAINING_INFO` function. + +A machine learning algorithm builds a model by examining many examples and +attempting to find a model that minimizes loss. This process is called empirical +risk minimization. + +Loss is the penalty for a bad prediction — a number indicating +how bad the model's prediction was on a single example. If the model's +prediction is perfect, the loss is zero; otherwise, the loss is greater. The +goal of training a model is to find a set of weights that have low +loss, on average, across all examples. + +To see the model training statistics that were generated when you ran the +`CREATE MODEL` query: + + +```python +%%bigquery +SELECT + * +FROM + ML.TRAINING_INFO(MODEL `bqml_tutorial.sample_model`) +``` + +Note: Typically, it is not a best practice to use a `SELECT *` query. Because the model output is a small table, this query does not process a large amount of data. As a result, the cost is minimal. + +When the query is complete, the results appear below the query. The results should look like the following: + +![Training statistics table](../tutorials/bigquery/resources/training-statistics.png) + +The `loss` column represents the loss metric calculated after the given iteration +on the training dataset. Since you performed a logistic regression, this column +is the [log loss](https://en.wikipedia.org/wiki/Cross_entropy#Cross-entropy_error_function_and_logistic_regression). +The `eval_loss` column is the same loss metric calculated on +the holdout dataset (data that is held back from training to validate the model). + +For more details on the `ML.TRAINING_INFO` function, see the +[BigQuery ML syntax reference](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-train). + +## Evaluate your model + +After creating your model, you evaluate the performance of the classifier using +the [`ML.EVALUATE`](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) +function. You can also use the [`ML.ROC_CURVE`](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-roc) +function for logistic regression specific metrics. + +A classifier is one of a set of enumerated target values for a label. 
For +example, in this tutorial you are using a binary classification model that +detects transactions. The two classes are the values in the `label` column: +`0` (no transactions) and not `1` (transaction made). + +To run the `ML.EVALUATE` query that evaluates the model: + + +```python +%%bigquery +SELECT + * +FROM ML.EVALUATE(MODEL `bqml_tutorial.sample_model`, ( + SELECT + IF(totals.transactions IS NULL, 0, 1) AS label, + IFNULL(device.operatingSystem, "") AS os, + device.isMobile AS is_mobile, + IFNULL(geoNetwork.country, "") AS country, + IFNULL(totals.pageviews, 0) AS pageviews + FROM + `bigquery-public-data.google_analytics_sample.ga_sessions_*` + WHERE + _TABLE_SUFFIX BETWEEN '20170701' AND '20170801')) +``` + +When the query is complete, the results appear below the query. The +results should look like the following: + +![Model evaluation results table](../tutorials/bigquery/resources/model-evaluation.png) + +Because you performed a logistic regression, the results include the following +columns: + ++ [`precision`](https://developers.google.com/machine-learning/glossary/#precision) ++ [`recall`](https://developers.google.com/machine-learning/glossary/#recall) ++ [`accuracy`](https://developers.google.com/machine-learning/glossary/#accuracy) ++ [`f1_score`](https://en.wikipedia.org/wiki/F1_score) ++ [`log_loss`](https://developers.google.com/machine-learning/glossary/#Log_Loss) ++ [`roc_auc`](https://developers.google.com/machine-learning/glossary/#AUC) + + +## Use your model to predict outcomes + +Now that you have evaluated your model, the next step is to use it to predict +outcomes. You use your model to predict the number of transactions made by +website visitors from each country. And you use it to predict purchases per user. + +To run the query that uses the model to predict the number of transactions: + + +```python +%%bigquery +SELECT + country, + SUM(predicted_label) as total_predicted_purchases +FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, ( + SELECT + IFNULL(device.operatingSystem, "") AS os, + device.isMobile AS is_mobile, + IFNULL(totals.pageviews, 0) AS pageviews, + IFNULL(geoNetwork.country, "") AS country + FROM + `bigquery-public-data.google_analytics_sample.ga_sessions_*` + WHERE + _TABLE_SUFFIX BETWEEN '20170701' AND '20170801')) + GROUP BY country + ORDER BY total_predicted_purchases DESC + LIMIT 10 +``` + +When the query is complete, the results appear below the query. The +results should look like the following. Because model training is not +deterministic, your results may differ. + +![Model predictions table](../tutorials/bigquery/resources/transaction-predictions.png) + +In the next example, you try to predict the number of transactions each website +visitor will make. This query is identical to the previous query except for the +`GROUP BY` clause. Here the `GROUP BY` clause — `GROUP BY fullVisitorId` +— is used to group the results by visitor ID. 
+ +To run the query that predicts purchases per user: + + +```python +%%bigquery +SELECT + fullVisitorId, + SUM(predicted_label) as total_predicted_purchases +FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, ( + SELECT + IFNULL(device.operatingSystem, "") AS os, + device.isMobile AS is_mobile, + IFNULL(totals.pageviews, 0) AS pageviews, + IFNULL(geoNetwork.country, "") AS country, + fullVisitorId + FROM + `bigquery-public-data.google_analytics_sample.ga_sessions_*` + WHERE + _TABLE_SUFFIX BETWEEN '20170701' AND '20170801')) + GROUP BY fullVisitorId + ORDER BY total_predicted_purchases DESC + LIMIT 10 +``` + +When the query is complete, the results appear below the query. The +results should look like the following: + +![Purchase predictions table](../tutorials/bigquery/resources/purchase-predictions.png) + +## Cleaning up + +To delete the resources created by this tutorial, execute the following code to delete the dataset and its contents: + + +```python +client.delete_dataset(dataset, delete_contents=True) +``` diff --git a/notebooks/rendered/storage-command-line-tool.md b/notebooks/rendered/storage-command-line-tool.md new file mode 100644 index 00000000000..d6fffa759de --- /dev/null +++ b/notebooks/rendered/storage-command-line-tool.md @@ -0,0 +1,155 @@ + +# Storage command-line tool + +The [Google Cloud SDK](https://cloud-dot-devsite.googleplex.com/sdk/docs/) provides a set of commands for working with data stored in Cloud Storage. This notebook introduces several `gsutil` commands for interacting with Cloud Storage. Note that shell commands in a notebook must be prepended with a `!`. + +## List available commands + +The `gsutil` command can be used to perform a wide array of tasks. Run the `help` command to view a list of available commands: + + +```python +!gsutil help +``` + +## Create a storage bucket + +Buckets are the basic containers that hold your data. Everything that you store in Cloud Storage must be contained in a bucket. You can use buckets to organize your data and control access to your data. + +Start by defining a globally unique name. + +For more information about naming buckets, see [Bucket name requirements](https://cloud.google.com/storage/docs/naming#requirements). + + +```python +# Replace the string below with a unique name for the new bucket +bucket_name = "your-new-bucket" +``` + +NOTE: In the examples below, the `bucket_name` and `project_id` variables are referenced in the commands using `{}` and `$`. If you want to avoid creating and using variables, replace these interpolated variables with literal values and remove the `{}` and `$` characters. + +Next, create the new bucket with the `gsutil mb` command: + + +```python +!gsutil mb gs://{bucket_name}/ +``` + +## List buckets in a project + +Replace 'your-project-id' in the cell below with your project ID and run the cell to list the storage buckets in your project. + + +```python +# Replace the string below with your project ID +project_id = "your-project-id" +``` + + +```python +!gsutil ls -p $project_id +``` + +The response should look like the following: + +``` +gs://your-new-bucket/ +``` + +## Get bucket metadata + +The next cell shows how to get information on metadata of your Cloud Storage buckets. + +To learn more about specific bucket properties, see [Bucket locations](https://cloud.google.com/storage/docs/locations) and [Storage classes](https://cloud.google.com/storage/docs/storage-classes). 
+ + +```python +!gsutil ls -L -b gs://{bucket_name}/ +``` + +The response should look like the following: +``` +gs://your-new-bucket/ : + Storage class: MULTI_REGIONAL + Location constraint: US + ... +``` + +## Upload a local file to a bucket + +Objects are the individual pieces of data that you store in Cloud Storage. Objects are referred to as "blobs" in the Python client library. There is no limit on the number of objects that you can create in a bucket. + +An object's name is treated as a piece of object metadata in Cloud Storage. Object names can contain any combination of Unicode characters (UTF-8 encoded) and must be less than 1024 bytes in length. + +For more information, including how to rename an object, see the [Object name requirements](https://cloud.google.com/storage/docs/naming#objectnames). + + +```python +!gsutil cp resources/us-states.txt gs://{bucket_name}/ +``` + +## List blobs in a bucket + + +```python +!gsutil ls -r gs://{bucket_name}/** +``` + +The response should look like the following: +``` +gs://your-new-bucket/us-states.txt +``` + +## Get a blob and display metadata + +See [Viewing and editing object metadata](https://cloud.google.com/storage/docs/viewing-editing-metadata) for more information about object metadata. + + +```python +!gsutil ls -L gs://{bucket_name}/us-states.txt +``` + +The response should look like the following: + +``` +gs://your-new-bucket/us-states.txt: + Creation time: Fri, 08 Feb 2019 05:23:28 GMT + Update time: Fri, 08 Feb 2019 05:23:28 GMT + Storage class: STANDARD + Content-Language: en + Content-Length: 637 + Content-Type: text/plain +... +``` + +## Download a blob to a local directory + + +```python +!gsutil cp gs://{bucket_name}/us-states.txt resources/downloaded-us-states.txt +``` + +## Cleaning up + +### Delete a blob + + +```python +!gsutil rm gs://{bucket_name}/us-states.txt +``` + +### Delete a bucket + +The following command deletes all objects in the bucket before deleting the bucket itself. + + +```python +!gsutil rm -r gs://{bucket_name}/ +``` + +## Next Steps + +Read more about Cloud Storage in the documentation: ++ [Storage key terms](https://cloud.google.com/storage/docs/key-terms) ++ [How-to guides](https://cloud.google.com/storage/docs/how-to) ++ [Pricing](https://cloud.google.com/storage/pricing) diff --git a/notebooks/rendered/visualizing-bigquery-public-data.md b/notebooks/rendered/visualizing-bigquery-public-data.md new file mode 100644 index 00000000000..bbd4bb34830 --- /dev/null +++ b/notebooks/rendered/visualizing-bigquery-public-data.md @@ -0,0 +1,146 @@ + +# Vizualizing BigQuery data in a Jupyter notebook + +[BigQuery](https://cloud.google.com/bigquery/docs/) is a petabyte-scale analytics data warehouse that you can use to run SQL queries over vast amounts of data in near realtime. + +Data visualization tools can help you make sense of your BigQuery data and help you analyze the data interactively. You can use visualization tools to help you identify trends, respond to them, and make predictions using your data. In this tutorial, you use the BigQuery Python client library and pandas in a Jupyter notebook to visualize data in the BigQuery natality sample table. + +## Using Jupyter magics to query BigQuery data + +The BigQuery Python client library provides a magic command that allows you to run queries with minimal code. + +The BigQuery client library provides a cell magic, `%%bigquery`. The `%%bigquery` magic runs a SQL query and returns the results as a pandas `DataFrame`. 
The following cell executes a query of the BigQuery natality public dataset and returns the total births by year. + + +```python +%%bigquery +SELECT + source_year AS year, + COUNT(is_male) AS birth_count +FROM `bigquery-public-data.samples.natality` +GROUP BY year +ORDER BY year DESC +LIMIT 15 +``` + +The following command to runs the same query, but this time the results are saved to a variable. The variable name, `total_births`, is given as an argument to the `%%bigquery`. The results can then be used for further analysis and visualization. + + +```python +%%bigquery total_births +SELECT + source_year AS year, + COUNT(is_male) AS birth_count +FROM `bigquery-public-data.samples.natality` +GROUP BY year +ORDER BY year DESC +LIMIT 15 +``` + +The next cell uses the pandas `DataFrame.plot` method to visualize the query results as a bar chart. See the [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/visualization.html) to learn more about data visualization with pandas. + + +```python +total_births.plot(kind='bar', x='year', y='birth_count'); +``` + +Run the following query to retrieve the number of births by weekday. Because the `wday` (weekday) field allows null values, the query excludes records where wday is null. + + +```python +%%bigquery births_by_weekday +SELECT + wday, + SUM(CASE WHEN is_male THEN 1 ELSE 0 END) AS male_births, + SUM(CASE WHEN is_male THEN 0 ELSE 1 END) AS female_births +FROM `bigquery-public-data.samples.natality` +WHERE wday IS NOT NULL +GROUP BY wday +ORDER BY wday ASC +``` + +Visualize the query results using a line chart. + + +```python +births_by_weekday.plot(x='wday'); +``` + +## Using Python to query BigQuery data + +Magic commands allow you to use minimal syntax to interact with BigQuery. Behind the scenes, `%%bigquery` uses the BigQuery Python client library to run the given query, convert the results to a pandas `Dataframe`, optionally save the results to a variable, and finally display the results. Using the BigQuery Python client library directly instead of through magic commands gives you more control over your queries and allows for more complex configurations. The library's integrations with pandas enable you to combine the power of declarative SQL with imperative code (Python) to perform interesting data analysis, visualization, and transformation tasks. + +To use the BigQuery Python client library, start by importing the library and initializing a client. The BigQuery client is used to send and receive messages from the BigQuery API. + + +```python +from google.cloud import bigquery + +client = bigquery.Client() +``` + +Use the [`Client.query`](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html#google.cloud.bigquery.client.Client.query) method to run a query. Execute the following cell to run a query to retrieve the annual count of plural births by plurality (2 for twins, 3 for triplets, etc.). + + +```python +sql = """ +SELECT + plurality, + COUNT(1) AS count, + year +FROM + `bigquery-public-data.samples.natality` +WHERE + NOT IS_NAN(plurality) AND plurality > 1 +GROUP BY + plurality, year +ORDER BY + count DESC +""" +df = client.query(sql).to_dataframe() +df.head() +``` + +To chart the query results in your `DataFrame`, run the following cell to pivot the data and create a stacked bar chart of the count of plural births over time. 
+ + +```python +pivot_table = df.pivot(index='year', columns='plurality', values='count') +pivot_table.plot(kind='bar', stacked=True, figsize=(15, 7)); +``` + +Run the following query to retrieve the count of births by the number of gestation weeks. + + +```python +sql = """ +SELECT + gestation_weeks, + COUNT(1) AS count +FROM + `bigquery-public-data.samples.natality` +WHERE + NOT IS_NAN(gestation_weeks) AND gestation_weeks <> 99 +GROUP BY + gestation_weeks +ORDER BY + gestation_weeks +""" +df = client.query(sql).to_dataframe() +``` + +Finally, chart the query results in your `DataFrame`. + + +```python +ax = df.plot(kind='bar', x='gestation_weeks', y='count', figsize=(15,7)) +ax.set_title('Count of Births by Gestation Weeks') +ax.set_xlabel('Gestation Weeks') +ax.set_ylabel('Count'); +``` + +## What's Next + ++ __Learn more about writing queries for BigQuery__ — [Querying Data](https://cloud.google.com/bigquery/querying-data) in the BigQuery documentation explains how to run queries, create user-defined functions (UDFs), and more. + ++ __Explore BigQuery syntax__ — The preferred dialect for SQL queries in BigQuery is standard SQL. Standard SQL syntax is described in the [SQL Reference](https://cloud.google.com/bigquery/docs/reference/standard-sql/). BigQuery's legacy SQL-like syntax is described in the [Query Reference (legacy SQL)](https://cloud.google.com/bigquery/query-reference). diff --git a/notebooks/requirements.txt b/notebooks/requirements.txt new file mode 100644 index 00000000000..d13f3e9f733 --- /dev/null +++ b/notebooks/requirements.txt @@ -0,0 +1,3 @@ +google-cloud-storage==1.14.0 +google-cloud-bigquery[pandas,pyarrow]==1.9.0 +matplotlib diff --git a/notebooks/tutorials/bigquery/BigQuery basics.ipynb b/notebooks/tutorials/bigquery/BigQuery basics.ipynb new file mode 100644 index 00000000000..2ade591fbfb --- /dev/null +++ b/notebooks/tutorials/bigquery/BigQuery basics.ipynb @@ -0,0 +1,361 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigQuery basics\n", + "\n", + "[BigQuery](https://cloud.google.com/bigquery/docs/) is a petabyte-scale analytics data warehouse that you can use to run SQL queries over vast amounts of data in near realtime. This page shows you how to get started with the Google BigQuery API using the Python client library." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import the libraries used in this tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import bigquery\n", + "import pandas" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize a client\n", + "\n", + "To use the BigQuery Python client library, start by initializing a client. The BigQuery client is used to send and receive messages from the BigQuery API.\n", + "\n", + "### Client project\n", + "The `bigquery.Client` object uses your default project. Alternatively, you can specify a project in the `Client` constructor. For more information about how the default project is determined, see the [google-auth documentation](https://google-auth.readthedocs.io/en/latest/reference/google.auth.html).\n", + "\n", + "\n", + "### Client location\n", + "Locations are required for certain BigQuery operations such as creating a dataset. 
If a location is provided to the client when it is initialized, it will be the default location for jobs, datasets, and tables.\n", + "\n", + "Run the following to create a client with your default project:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = bigquery.Client(location=\"US\")\n", + "print(\"Client creating using default project: {}\".format(client.project))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To explicitly specify a project when constructing the client, set the `project` parameter:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# client = bigquery.Client(location=\"US\", project=\"your-project-id\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run a query on a public dataset\n", + "\n", + "The following example queries the BigQuery `usa_names` public dataset to find the 10 most popular names. `usa_names` is a Social Security Administration dataset that contains all names from Social Security card applications for births that occurred in the United States after 1879.\n", + "\n", + "Use the [Client.query](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html#google.cloud.bigquery.client.Client.query) method to run the query, and the [QueryJob.to_dataframe](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.QueryJob.html#google.cloud.bigquery.job.QueryJob.to_dataframe) method to return the results as a pandas [`DataFrame`](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + " SELECT name, SUM(number) as total\n", + " FROM `bigquery-public-data.usa_names.usa_1910_current`\n", + " GROUP BY name\n", + " ORDER BY total DESC\n", + " LIMIT 10\n", + "\"\"\"\n", + "query_job = client.query(\n", + " query,\n", + " # Location must match that of the dataset(s) referenced in the query.\n", + " location=\"US\",\n", + ") # API request - starts the query\n", + "\n", + "df = query_job.to_dataframe()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run a parameterized query\n", + "\n", + "BigQuery supports query parameters to help prevent [SQL injection](https://en.wikipedia.org/wiki/SQL_injection) when you construct a query with user input. Query parameters are only available with [standard SQL syntax](https://cloud.google.com/bigquery/docs/reference/standard-sql/). Query parameters can be used as substitutes for arbitrary expressions. Parameters cannot be used as substitutes for identifiers, column names, table names, or other parts of the query.\n", + "\n", + "To specify a parameter, use the `@` character followed by an [identifier](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#identifiers), such as `@param_name`. For example, the following query finds all the words in a specific Shakespeare corpus with counts that are at least the specified value.\n", + "\n", + "For more information, see [Running parameterized queries](https://cloud.google.com/bigquery/docs/parameterized-queries) in the BigQuery documentation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the query\n", + "sql = \"\"\"\n", + " SELECT word, word_count\n", + " FROM `bigquery-public-data.samples.shakespeare`\n", + " WHERE corpus = @corpus\n", + " AND word_count >= @min_word_count\n", + " ORDER BY word_count DESC;\n", + "\"\"\"\n", + "\n", + "# Define the parameter values in a query job configuration\n", + "job_config = bigquery.QueryJobConfig(\n", + " query_parameters=[\n", + " bigquery.ScalarQueryParameter(\"corpus\", \"STRING\", \"romeoandjuliet\"),\n", + " bigquery.ScalarQueryParameter(\"min_word_count\", \"INT64\", 250),\n", + " ]\n", + ")\n", + "\n", + "# Start the query job\n", + "query_job = client.query(sql, location=\"US\", job_config=job_config)\n", + "\n", + "# Return the results as a pandas DataFrame\n", + "query_job.to_dataframe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a new dataset\n", + "\n", + "A dataset is contained within a specific [project](https://cloud.google.com/bigquery/docs/projects). Datasets are top-level containers that are used to organize and control access to your [tables](https://cloud.google.com/bigquery/docs/tables) and [views](https://cloud.google.com/bigquery/docs/views). A table or view must belong to a dataset. You need to create at least one dataset before [loading data into BigQuery](https://cloud.google.com/bigquery/loading-data-into-bigquery)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define a name for the new dataset.\n", + "dataset_id = 'your_new_dataset'\n", + "\n", + "# The project defaults to the Client's project if not specified.\n", + "dataset = client.create_dataset(dataset_id) # API request" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Write query results to a destination table\n", + "\n", + "For more information, see [Writing query results](https://cloud.google.com/bigquery/docs/writing-results) in the BigQuery documentation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sql = \"\"\"\n", + " SELECT corpus\n", + " FROM `bigquery-public-data.samples.shakespeare`\n", + " GROUP BY corpus;\n", + "\"\"\"\n", + "table_ref = dataset.table(\"your_new_table_id\")\n", + "job_config = bigquery.QueryJobConfig(\n", + " destination=table_ref\n", + ")\n", + "\n", + "# Start the query, passing in the extra configuration.\n", + "query_job = client.query(sql, location=\"US\", job_config=job_config)\n", + "\n", + "query_job.result() # Waits for the query to finish\n", + "print(\"Query results loaded to table {}\".format(table_ref.path))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data from a pandas DataFrame to a new table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "records = [\n", + " {\"title\": \"The Meaning of Life\", \"release_year\": 1983},\n", + " {\"title\": \"Monty Python and the Holy Grail\", \"release_year\": 1975},\n", + " {\"title\": \"Life of Brian\", \"release_year\": 1979},\n", + " {\"title\": \"And Now for Something Completely Different\", \"release_year\": 1971},\n", + "]\n", + "\n", + "# Optionally set explicit indices.\n", + "# If indices are not specified, a column will be created for the default\n", + "# indices created by pandas.\n", + "index = [\"Q24980\", \"Q25043\", \"Q24953\", \"Q16403\"]\n", + "df = pandas.DataFrame(records, index=pandas.Index(index, name=\"wikidata_id\"))\n", + "\n", + "table_ref = dataset.table(\"monty_python\")\n", + "job = client.load_table_from_dataframe(df, table_ref, location=\"US\")\n", + "\n", + "job.result() # Waits for table load to complete.\n", + "print(\"Loaded dataframe to {}\".format(table_ref.path))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data from a local file to a table\n", + "\n", + "The following example demonstrates how to load a local CSV file into a new table. See [SourceFormat](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.SourceFormat.html#google.cloud.bigquery.job.SourceFormat) in the Python client library documentation for a list of available source formats. For more information, see [Loading Data into BigQuery from a local data source](https://cloud.google.com/bigquery/docs/loading-data-local) in the BigQuery documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_filename = 'resources/us-states.csv'\n", + "\n", + "table_ref = dataset.table('us_states_from_local_file')\n", + "job_config = bigquery.LoadJobConfig(\n", + " source_format=bigquery.SourceFormat.CSV,\n", + " skip_leading_rows=1,\n", + " autodetect=True\n", + ")\n", + "\n", + "with open(source_filename, 'rb') as source_file:\n", + " job = client.load_table_from_file(\n", + " source_file,\n", + " table_ref,\n", + " location='US', # Must match the destination dataset location.\n", + " job_config=job_config) # API request\n", + "\n", + "job.result() # Waits for table load to complete.\n", + "\n", + "print('Loaded {} rows into {}:{}.'.format(\n", + " job.output_rows, dataset_id, table_ref.path))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data from Cloud Storage to a table\n", + "\n", + "The following example demonstrates how to load a local CSV file into a new table. 
See [SourceFormat](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.SourceFormat.html#google.cloud.bigquery.job.SourceFormat) in the Python client library documentation for a list of available source formats. For more information, see [Introduction to loading data from Cloud Storage](https://cloud.google.com/bigquery/docs/loading-data-cloud-storage) in the BigQuery documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure the load job\n", + "job_config = bigquery.LoadJobConfig(\n", + " schema=[\n", + " bigquery.SchemaField('name', 'STRING'),\n", + " bigquery.SchemaField('post_abbr', 'STRING')\n", + " ],\n", + " skip_leading_rows=1,\n", + " # The source format defaults to CSV. The line below is optional.\n", + " source_format=bigquery.SourceFormat.CSV\n", + ")\n", + "uri = 'gs://cloud-samples-data/bigquery/us-states/us-states.csv'\n", + "destination_table_ref = dataset.table('us_states_from_gcs')\n", + "\n", + "# Start the load job\n", + "load_job = client.load_table_from_uri(\n", + " uri, destination_table_ref, job_config=job_config)\n", + "print('Starting job {}'.format(load_job.job_id))\n", + "\n", + "load_job.result() # Waits for table load to complete.\n", + "print('Job finished.')\n", + "\n", + "# Retreive the destination table\n", + "destination_table = client.get_table(table_ref)\n", + "print('Loaded {} rows.'.format(destination_table.num_rows))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning Up\n", + "\n", + "The following code deletes the dataset created for this tutorial, including all tables in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the dataset from the API\n", + "dataset = client.get_dataset(client.dataset(dataset_id))\n", + "\n", + "# Delete the dataset and its contents\n", + "client.delete_dataset(dataset, delete_contents=True)\n", + "\n", + "print('Deleted dataset: {}'.format(dataset.path))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/tutorials/bigquery/BigQuery command-line tool.ipynb b/notebooks/tutorials/bigquery/BigQuery command-line tool.ipynb new file mode 100644 index 00000000000..f9c709e4533 --- /dev/null +++ b/notebooks/tutorials/bigquery/BigQuery command-line tool.ipynb @@ -0,0 +1,209 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigQuery command-line tool\n", + "\n", + "The BigQuery command-line tool is installed as part of the [Cloud SDK](https://cloud-dot-devsite.googleplex.com/sdk/docs/) and can be used to interact with BigQuery. When you use CLI commands in a notebook, the command must be prepended with a `!`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## View available commands\n", + "\n", + "To view the available commands for the BigQuery command-line tool, use the `help` command." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!bq help" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a new dataset\n", + "\n", + "A dataset is contained within a specific [project](https://cloud.google.com/bigquery/docs/projects). Datasets are top-level containers that are used to organize and control access to your [tables](https://cloud.google.com/bigquery/docs/tables) and [views](https://cloud.google.com/bigquery/docs/views). A table or view must belong to a dataset. You need to create at least one dataset before [loading data into BigQuery](https://cloud.google.com/bigquery/loading-data-into-bigquery).\n", + "\n", + "First, name your new dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset_id = \"your_new_dataset\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following command creates a new dataset in the US using the ID defined above.\n", + "\n", + "NOTE: In the examples in this notebook, the `dataset_id` variable is referenced in the commands using both `{}` and `$`. To avoid creating and using variables, replace these interpolated variables with literal values and remove the `{}` and `$` characters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!bq --location=US mk --dataset $dataset_id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The response should look like the following:\n", + "\n", + "```\n", + "Dataset 'your-project-id:your_new_dataset' successfully created.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List datasets\n", + "\n", + "The following command lists all datasets in your default project." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!bq ls" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The response should look like the following:\n", + "\n", + "```\n", + " datasetId \n", + " ------------------------------ \n", + " your_new_dataset \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data from a local file to a table\n", + "\n", + "The following example demonstrates how to load a local CSV file into a new or existing table. See [SourceFormat](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.SourceFormat.html#google.cloud.bigquery.job.SourceFormat) in the Python client library documentation for a list of available source formats. For more information, see [Loading Data into BigQuery from a local data source](https://cloud.google.com/bigquery/docs/loading-data-local) in the BigQuery documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!bq \\\n", + " --location=US \\\n", + " load \\\n", + " --autodetect \\\n", + " --skip_leading_rows=1 \\\n", + " --source_format=CSV \\\n", + " {dataset_id}.us_states_local_file \\\n", + " 'resources/us-states.csv'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load data from Cloud Storage to a table\n", + "\n", + "The following example demonstrates how to load a local CSV file into a new table. 
See [SourceFormat](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.job.SourceFormat.html#google.cloud.bigquery.job.SourceFormat) in the Python client library documentation for a list of available source formats. For more information, see [Introduction to loading data from Cloud Storage](https://cloud.google.com/bigquery/docs/loading-data-cloud-storage) in the BigQuery documentation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!bq \\\n", + " --location=US \\\n", + " load \\\n", + " --autodetect \\\n", + " --skip_leading_rows=1 \\\n", + " --source_format=CSV \\\n", + " {dataset_id}.us_states_gcs \\\n", + " 'gs://cloud-samples-data/bigquery/us-states/us-states.csv'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run a query\n", + "\n", + "The BigQuery command-line tool has a `query` command for running queries, but it is recommended to use the [magic command](./BigQuery%20Query%20Magic.ipynb) for this purpose." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning Up\n", + "\n", + "The following code deletes the dataset created for this tutorial, including all tables in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!bq rm -r -f --dataset $dataset_id" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/tutorials/bigquery/BigQuery query magic.ipynb b/notebooks/tutorials/bigquery/BigQuery query magic.ipynb new file mode 100644 index 00000000000..9c948679a7b --- /dev/null +++ b/notebooks/tutorials/bigquery/BigQuery query magic.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BigQuery query magic\n", + "\n", + "Jupyter magics are notebook-specific shortcuts that allow you to run commands with minimal syntax. Jupyter notebooks come with many [built-in commands](https://ipython.readthedocs.io/en/stable/interactive/magics.html). The BigQuery client library, `google-cloud-bigquery`, provides a cell magic, `%%bigquery`. The `%%bigquery` magic runs a SQL query and returns the results as a pandas `DataFrame`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run a query on a public dataset\n", + "\n", + "The following example queries the BigQuery `usa_names` public dataset. `usa_names` is a Social Security Administration dataset that contains all names from Social Security card applications for births that occurred in the United States after 1879.\n", + "\n", + "The following example shows how to invoke the magic (`%%bigquery`), and how to pass in a standard SQL query in the body of the code cell. The results are displayed below the input cell as a pandas [`DataFrame`](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "%%bigquery\n", + "SELECT name, SUM(number) as count\n", + "FROM `bigquery-public-data.usa_names.usa_1910_current`\n", + "GROUP BY name\n", + "ORDER BY count DESC\n", + "LIMIT 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Display verbose output\n", + "\n", + "As the query job is running, status messages below the cell update with the query job ID and the amount of time the query has been running. By default, this output is erased and replaced with the results of the query. If you pass the `--verbose` flag, the output will remain below the cell after query completion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery --verbose\n", + "SELECT name, SUM(number) as count\n", + "FROM `bigquery-public-data.usa_names.usa_1910_current`\n", + "GROUP BY name\n", + "ORDER BY count DESC\n", + "LIMIT 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explicitly specify a project\n", + "\n", + "By default, the `%%bigquery` magic command uses your default project to run the query. You may also explicitly provide a project ID using the `--project` flag. Note that your credentials must have permissions to create query jobs in the project you specify." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "project_id = 'your-project-id'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery --project $project_id\n", + "SELECT name, SUM(number) as count\n", + "FROM `bigquery-public-data.usa_names.usa_1910_current`\n", + "GROUP BY name\n", + "ORDER BY count DESC\n", + "LIMIT 10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Assign the query results to a variable\n", + "\n", + "To save the results of your query to a variable, provide a variable name as a parameter to `%%bigquery`. The following example saves the results of the query to a variable named `df`. Note that when a variable is provided, the results are not displayed below the cell that invokes the magic command." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery df\n", + "SELECT name, SUM(number) as count\n", + "FROM `bigquery-public-data.usa_names.usa_1910_current`\n", + "GROUP BY name\n", + "ORDER BY count DESC\n", + "LIMIT 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run a parameterized query\n", + "\n", + "Parameterized queries are useful if you need to run a query with certain parameters that are calculated at run time. Note that the value types must be JSON serializable. The following example defines a parameters dictionary and passes it to the `--params` flag. The key of the dictionary is the name of the parameter, and the value of the dictionary is the value of the parameter." 
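+ "\n",
+ "\n",
+ "Depending on the version of the client library you have installed, the `--params` flag may also accept a dictionary literal written directly on the magic line instead of a reference to a variable. A sketch of that form (assuming inline dictionaries are supported in your version):\n",
+ "\n",
+ "```\n",
+ "%%bigquery --params {\"limit\": 10}\n",
+ "SELECT name, SUM(number) as count\n",
+ "FROM `bigquery-public-data.usa_names.usa_1910_current`\n",
+ "GROUP BY name\n",
+ "ORDER BY count DESC\n",
+ "LIMIT @limit\n",
+ "```"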
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "params = {\"limit\": 10}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery --params $params\n", + "SELECT name, SUM(number) as count\n", + "FROM `bigquery-public-data.usa_names.usa_1910_current`\n", + "GROUP BY name\n", + "ORDER BY count DESC\n", + "LIMIT @limit" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/tutorials/bigquery/Getting started with BigQuery ML.ipynb b/notebooks/tutorials/bigquery/Getting started with BigQuery ML.ipynb new file mode 100644 index 00000000000..e3f8625f10e --- /dev/null +++ b/notebooks/tutorials/bigquery/Getting started with BigQuery ML.ipynb @@ -0,0 +1,384 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting started with BigQuery ML\n", + "\n", + "BigQuery ML enables users to create and execute machine learning models in BigQuery using SQL queries. The goal is to democratize machine learning by enabling SQL practitioners to build models using their existing tools and to increase development speed by eliminating the need for data movement.\n", + "\n", + "In this tutorial, you use the sample [Google Analytics sample dataset for BigQuery](https://support.google.com/analytics/answer/7586738?hl=en&ref_topic=3416089) to create a model that predicts whether a website visitor will make a transaction. For information on the schema of the Analytics dataset, see [BigQuery export schema](https://support.google.com/analytics/answer/3437719) in the Google Analytics Help Center.\n", + "\n", + "\n", + "## Objectives\n", + "In this tutorial, you use:\n", + "\n", + "+ BigQuery ML to create a binary logistic regression model using the `CREATE MODEL` statement\n", + "+ The `ML.EVALUATE` function to evaluate the ML model\n", + "+ The `ML.PREDICT` function to make predictions using the ML model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create your dataset\n", + "\n", + "Enter the following code to import the BigQuery Python client library and initialize a client. The BigQuery client is used to send and receive messages from the BigQuery API." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import bigquery\n", + "\n", + "client = bigquery.Client(location=\"US\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, you create a BigQuery dataset to store your ML model. Run the following to create your dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = client.create_dataset(\"bqml_tutorial\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create your model\n", + "\n", + "Next, you create a logistic regression model using the Google Analytics sample\n", + "dataset for BigQuery. The model is used to predict whether a\n", + "website visitor will make a transaction. 
The standard SQL query uses a\n", + "`CREATE MODEL` statement to create and train the model. Standard SQL is the\n", + "default query syntax for the BigQuery python client library.\n", + "\n", + "The BigQuery python client library provides a cell magic,\n", + "`%%bigquery`, which runs a SQL query and returns the results as a Pandas\n", + "`DataFrame`.\n", + "\n", + "To run the `CREATE MODEL` query to create and train your model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery\n", + "CREATE OR REPLACE MODEL `bqml_tutorial.sample_model`\n", + "OPTIONS(model_type='logistic_reg') AS\n", + "SELECT\n", + " IF(totals.transactions IS NULL, 0, 1) AS label,\n", + " IFNULL(device.operatingSystem, \"\") AS os,\n", + " device.isMobile AS is_mobile,\n", + " IFNULL(geoNetwork.country, \"\") AS country,\n", + " IFNULL(totals.pageviews, 0) AS pageviews\n", + "FROM\n", + " `bigquery-public-data.google_analytics_sample.ga_sessions_*`\n", + "WHERE\n", + " _TABLE_SUFFIX BETWEEN '20160801' AND '20170630'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The query takes several minutes to complete. After the first iteration is\n", + "complete, your model (`sample_model`) appears in the navigation panel of the\n", + "BigQuery web UI. Because the query uses a `CREATE MODEL` statement to create a\n", + "table, you do not see query results. The output is an empty `DataFrame`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get training statistics\n", + "\n", + "To see the results of the model training, you can use the\n", + "[`ML.TRAINING_INFO`](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-train)\n", + "function, or you can view the statistics in the BigQuery web UI. This functionality\n", + "is not currently available in the BigQuery Classic web UI.\n", + "In this tutorial, you use the `ML.TRAINING_INFO` function.\n", + "\n", + "A machine learning algorithm builds a model by examining many examples and\n", + "attempting to find a model that minimizes loss. This process is called empirical\n", + "risk minimization.\n", + "\n", + "Loss is the penalty for a bad prediction — a number indicating\n", + "how bad the model's prediction was on a single example. If the model's\n", + "prediction is perfect, the loss is zero; otherwise, the loss is greater. The\n", + "goal of training a model is to find a set of weights that have low\n", + "loss, on average, across all examples.\n", + "\n", + "To see the model training statistics that were generated when you ran the\n", + "`CREATE MODEL` query:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery\n", + "SELECT\n", + " *\n", + "FROM\n", + " ML.TRAINING_INFO(MODEL `bqml_tutorial.sample_model`)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: Typically, it is not a best practice to use a `SELECT *` query. Because the model output is a small table, this query does not process a large amount of data. As a result, the cost is minimal." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When the query is complete, the results appear below the query. 
The results should look like the following:\n", + "\n", + "![Training statistics table](./resources/training-statistics.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `loss` column represents the loss metric calculated after the given iteration\n", + "on the training dataset. Since you performed a logistic regression, this column\n", + "is the [log loss](https://en.wikipedia.org/wiki/Cross_entropy#Cross-entropy_error_function_and_logistic_regression).\n", + "The `eval_loss` column is the same loss metric calculated on\n", + "the holdout dataset (data that is held back from training to validate the model).\n", + "\n", + "For more details on the `ML.TRAINING_INFO` function, see the\n", + "[BigQuery ML syntax reference](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-train)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluate your model\n", + "\n", + "After creating your model, you evaluate the performance of the classifier using\n", + "the [`ML.EVALUATE`](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate)\n", + "function. You can also use the [`ML.ROC_CURVE`](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-roc)\n", + "function for logistic regression specific metrics.\n", + "\n", + "A classifier is one of a set of enumerated target values for a label. For\n", + "example, in this tutorial you are using a binary classification model that\n", + "detects transactions. The two classes are the values in the `label` column:\n", + "`0` (no transactions) and not `1` (transaction made).\n", + "\n", + "To run the `ML.EVALUATE` query that evaluates the model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery\n", + "SELECT\n", + " *\n", + "FROM ML.EVALUATE(MODEL `bqml_tutorial.sample_model`, (\n", + " SELECT\n", + " IF(totals.transactions IS NULL, 0, 1) AS label,\n", + " IFNULL(device.operatingSystem, \"\") AS os,\n", + " device.isMobile AS is_mobile,\n", + " IFNULL(geoNetwork.country, \"\") AS country,\n", + " IFNULL(totals.pageviews, 0) AS pageviews\n", + " FROM\n", + " `bigquery-public-data.google_analytics_sample.ga_sessions_*`\n", + " WHERE\n", + " _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When the query is complete, the results appear below the query. 
The\n", + "results should look like the following:\n", + "\n", + "![Model evaluation results table](./resources/model-evaluation.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because you performed a logistic regression, the results include the following\n", + "columns:\n", + "\n", + "+ [`precision`](https://developers.google.com/machine-learning/glossary/#precision)\n", + "+ [`recall`](https://developers.google.com/machine-learning/glossary/#recall)\n", + "+ [`accuracy`](https://developers.google.com/machine-learning/glossary/#accuracy)\n", + "+ [`f1_score`](https://en.wikipedia.org/wiki/F1_score)\n", + "+ [`log_loss`](https://developers.google.com/machine-learning/glossary/#Log_Loss)\n", + "+ [`roc_auc`](https://developers.google.com/machine-learning/glossary/#AUC)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use your model to predict outcomes\n", + "\n", + "Now that you have evaluated your model, the next step is to use it to predict\n", + "outcomes. You use your model to predict the number of transactions made by\n", + "website visitors from each country. And you use it to predict purchases per user.\n", + "\n", + "To run the query that uses the model to predict the number of transactions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery\n", + "SELECT\n", + " country,\n", + " SUM(predicted_label) as total_predicted_purchases\n", + "FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (\n", + " SELECT\n", + " IFNULL(device.operatingSystem, \"\") AS os,\n", + " device.isMobile AS is_mobile,\n", + " IFNULL(totals.pageviews, 0) AS pageviews,\n", + " IFNULL(geoNetwork.country, \"\") AS country\n", + " FROM\n", + " `bigquery-public-data.google_analytics_sample.ga_sessions_*`\n", + " WHERE\n", + " _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))\n", + " GROUP BY country\n", + " ORDER BY total_predicted_purchases DESC\n", + " LIMIT 10" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When the query is complete, the results appear below the query. The\n", + "results should look like the following. Because model training is not\n", + "deterministic, your results may differ.\n", + "\n", + "![Model predictions table](./resources/transaction-predictions.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next example, you try to predict the number of transactions each website\n", + "visitor will make. This query is identical to the previous query except for the\n", + "`GROUP BY` clause. 
Here the `GROUP BY` clause — `GROUP BY fullVisitorId`\n", + "— is used to group the results by visitor ID.\n", + "\n", + "To run the query that predicts purchases per user:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery\n", + "SELECT\n", + " fullVisitorId,\n", + " SUM(predicted_label) as total_predicted_purchases\n", + "FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (\n", + " SELECT\n", + " IFNULL(device.operatingSystem, \"\") AS os,\n", + " device.isMobile AS is_mobile,\n", + " IFNULL(totals.pageviews, 0) AS pageviews,\n", + " IFNULL(geoNetwork.country, \"\") AS country,\n", + " fullVisitorId\n", + " FROM\n", + " `bigquery-public-data.google_analytics_sample.ga_sessions_*`\n", + " WHERE\n", + " _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))\n", + " GROUP BY fullVisitorId\n", + " ORDER BY total_predicted_purchases DESC\n", + " LIMIT 10" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When the query is complete, the results appear below the query. The\n", + "results should look like the following:\n", + "\n", + "![Purchase predictions table](./resources/purchase-predictions.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning up\n", + "\n", + "To delete the resources created by this tutorial, execute the following code to delete the dataset and its contents:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.delete_dataset(dataset, delete_contents=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/tutorials/bigquery/Visualizing BigQuery public data.ipynb b/notebooks/tutorials/bigquery/Visualizing BigQuery public data.ipynb new file mode 100644 index 00000000000..607938d6fbd --- /dev/null +++ b/notebooks/tutorials/bigquery/Visualizing BigQuery public data.ipynb @@ -0,0 +1,271 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Vizualizing BigQuery data in a Jupyter notebook\n", + "\n", + "[BigQuery](https://cloud.google.com/bigquery/docs/) is a petabyte-scale analytics data warehouse that you can use to run SQL queries over vast amounts of data in near realtime.\n", + "\n", + "Data visualization tools can help you make sense of your BigQuery data and help you analyze the data interactively. You can use visualization tools to help you identify trends, respond to them, and make predictions using your data. In this tutorial, you use the BigQuery Python client library and pandas in a Jupyter notebook to visualize data in the BigQuery natality sample table." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Jupyter magics to query BigQuery data\n", + "\n", + "The BigQuery Python client library provides a magic command that allows you to run queries with minimal code.\n", + "\n", + "The BigQuery client library provides a cell magic, `%%bigquery`. The `%%bigquery` magic runs a SQL query and returns the results as a pandas `DataFrame`. 
The following cell executes a query of the BigQuery natality public dataset and returns the total births by year." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery\n", + "SELECT\n", + " source_year AS year,\n", + " COUNT(is_male) AS birth_count\n", + "FROM `bigquery-public-data.samples.natality`\n", + "GROUP BY year\n", + "ORDER BY year DESC\n", + "LIMIT 15" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following command to runs the same query, but this time the results are saved to a variable. The variable name, `total_births`, is given as an argument to the `%%bigquery`. The results can then be used for further analysis and visualization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery total_births\n", + "SELECT\n", + " source_year AS year,\n", + " COUNT(is_male) AS birth_count\n", + "FROM `bigquery-public-data.samples.natality`\n", + "GROUP BY year\n", + "ORDER BY year DESC\n", + "LIMIT 15" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell uses the pandas `DataFrame.plot` method to visualize the query results as a bar chart. See the [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/visualization.html) to learn more about data visualization with pandas." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "total_births.plot(kind='bar', x='year', y='birth_count');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the following query to retrieve the number of births by weekday. Because the `wday` (weekday) field allows null values, the query excludes records where wday is null." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery births_by_weekday\n", + "SELECT\n", + " wday,\n", + " SUM(CASE WHEN is_male THEN 1 ELSE 0 END) AS male_births,\n", + " SUM(CASE WHEN is_male THEN 0 ELSE 1 END) AS female_births\n", + "FROM `bigquery-public-data.samples.natality`\n", + "WHERE wday IS NOT NULL\n", + "GROUP BY wday\n", + "ORDER BY wday ASC" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Visualize the query results using a line chart." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "births_by_weekday.plot(x='wday');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using Python to query BigQuery data\n", + "\n", + "Magic commands allow you to use minimal syntax to interact with BigQuery. Behind the scenes, `%%bigquery` uses the BigQuery Python client library to run the given query, convert the results to a pandas `Dataframe`, optionally save the results to a variable, and finally display the results. Using the BigQuery Python client library directly instead of through magic commands gives you more control over your queries and allows for more complex configurations. The library's integrations with pandas enable you to combine the power of declarative SQL with imperative code (Python) to perform interesting data analysis, visualization, and transformation tasks.\n", + "\n", + "To use the BigQuery Python client library, start by importing the library and initializing a client. The BigQuery client is used to send and receive messages from the BigQuery API." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import bigquery\n", + "\n", + "client = bigquery.Client()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the [`Client.query`](https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html#google.cloud.bigquery.client.Client.query) method to run a query. Execute the following cell to run a query to retrieve the annual count of plural births by plurality (2 for twins, 3 for triplets, etc.)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sql = \"\"\"\n", + "SELECT\n", + " plurality,\n", + " COUNT(1) AS count,\n", + " year\n", + "FROM\n", + " `bigquery-public-data.samples.natality`\n", + "WHERE\n", + " NOT IS_NAN(plurality) AND plurality > 1\n", + "GROUP BY\n", + " plurality, year\n", + "ORDER BY\n", + " count DESC\n", + "\"\"\"\n", + "df = client.query(sql).to_dataframe()\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To chart the query results in your `DataFrame`, run the following cell to pivot the data and create a stacked bar chart of the count of plural births over time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pivot_table = df.pivot(index='year', columns='plurality', values='count')\n", + "pivot_table.plot(kind='bar', stacked=True, figsize=(15, 7));" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the following query to retrieve the count of births by the number of gestation weeks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sql = \"\"\"\n", + "SELECT\n", + " gestation_weeks,\n", + " COUNT(1) AS count\n", + "FROM\n", + " `bigquery-public-data.samples.natality`\n", + "WHERE\n", + " NOT IS_NAN(gestation_weeks) AND gestation_weeks <> 99\n", + "GROUP BY\n", + " gestation_weeks\n", + "ORDER BY\n", + " gestation_weeks\n", + "\"\"\"\n", + "df = client.query(sql).to_dataframe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, chart the query results in your `DataFrame`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ax = df.plot(kind='bar', x='gestation_weeks', y='count', figsize=(15,7))\n", + "ax.set_title('Count of Births by Gestation Weeks')\n", + "ax.set_xlabel('Gestation Weeks')\n", + "ax.set_ylabel('Count');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What's Next\n", + "\n", + "+ __Learn more about writing queries for BigQuery__ — [Querying Data](https://cloud.google.com/bigquery/querying-data) in the BigQuery documentation explains how to run queries, create user-defined functions (UDFs), and more.\n", + "\n", + "+ __Explore BigQuery syntax__ — The preferred dialect for SQL queries in BigQuery is standard SQL. Standard SQL syntax is described in the [SQL Reference](https://cloud.google.com/bigquery/docs/reference/standard-sql/). BigQuery's legacy SQL-like syntax is described in the [Query Reference (legacy SQL)](https://cloud.google.com/bigquery/query-reference)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/tutorials/bigquery/resources/model-evaluation.png b/notebooks/tutorials/bigquery/resources/model-evaluation.png new file mode 100644 index 00000000000..814e8175f3d Binary files /dev/null and b/notebooks/tutorials/bigquery/resources/model-evaluation.png differ diff --git a/notebooks/tutorials/bigquery/resources/purchase-predictions.png b/notebooks/tutorials/bigquery/resources/purchase-predictions.png new file mode 100644 index 00000000000..b48aae714c8 Binary files /dev/null and b/notebooks/tutorials/bigquery/resources/purchase-predictions.png differ diff --git a/notebooks/tutorials/bigquery/resources/training-statistics.png b/notebooks/tutorials/bigquery/resources/training-statistics.png new file mode 100644 index 00000000000..6bb8176446b Binary files /dev/null and b/notebooks/tutorials/bigquery/resources/training-statistics.png differ diff --git a/notebooks/tutorials/bigquery/resources/transaction-predictions.png b/notebooks/tutorials/bigquery/resources/transaction-predictions.png new file mode 100644 index 00000000000..877500ed830 Binary files /dev/null and b/notebooks/tutorials/bigquery/resources/transaction-predictions.png differ diff --git a/notebooks/tutorials/bigquery/resources/us-states.csv b/notebooks/tutorials/bigquery/resources/us-states.csv new file mode 100644 index 00000000000..54a60e29de9 --- /dev/null +++ b/notebooks/tutorials/bigquery/resources/us-states.csv @@ -0,0 +1,51 @@ +name,post_abbr +Alabama,AL +Alaska,AK +Arizona,AZ +Arkansas,AR +California,CA +Colorado,CO +Connecticut,CT +Delaware,DE +Florida,FL +Georgia,GA +Hawaii,HI +Idaho,ID +Illinois,IL +Indiana,IN +Iowa,IA +Kansas,KS +Kentucky,KY +Louisiana,LA +Maine,ME +Maryland,MD +Massachusetts,MA +Michigan,MI +Minnesota,MN +Mississippi,MS +Missouri,MO +Montana,MT +Nebraska,NE +Nevada,NV +New Hampshire,NH +New Jersey,NJ +New Mexico,NM +New York,NY +North Carolina,NC +North Dakota,ND +Ohio,OH +Oklahoma,OK +Oregon,OR +Pennsylvania,PA +Rhode Island,RI +South Carolina,SC +South Dakota,SD +Tennessee,TN +Texas,TX +Utah,UT +Vermont,VT +Virginia,VA +Washington,WA +West Virginia,WV +Wisconsin,WI +Wyoming,WY diff --git a/notebooks/tutorials/storage/Cloud Storage client library.ipynb b/notebooks/tutorials/storage/Cloud Storage client library.ipynb new file mode 100644 index 00000000000..34f747b5786 --- /dev/null +++ b/notebooks/tutorials/storage/Cloud Storage client library.ipynb @@ -0,0 +1,312 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cloud Storage client library\n", + "\n", + "This tutorial shows how to get started with the [Cloud Storage Python client library](https://googleapis.github.io/google-cloud-python/latest/storage/index.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a storage bucket\n", + "\n", + "Buckets are the basic containers that hold your data. Everything that you store in Cloud Storage must be contained in a bucket. 
You can use buckets to organize your data and control access to your data.\n", + "\n", + "Start by importing the library:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from google.cloud import storage" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `storage.Client` object uses your default project. Alternatively, you can specify a project in the `Client` constructor. For more information about how the default project is determined, see the [google-auth documentation](https://google-auth.readthedocs.io/en/latest/reference/google.auth.html).\n", + "\n", + "Run the following to create a client with your default project:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = storage.Client()\n", + "print(\"Client created using default project: {}\".format(client.project))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To explicitly specify a project when constructing the client, set the `project` parameter:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# client = storage.Client(project='your-project-id')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, create a bucket with a globally unique name.\n", + "\n", + "For more information about naming buckets, see [Bucket name requirements](https://cloud.google.com/storage/docs/naming#requirements)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace the string below with a unique name for the new bucket\n", + "bucket_name = \"your-new-bucket\"\n", + "\n", + "# Creates the new bucket\n", + "bucket = client.create_bucket(bucket_name)\n", + "\n", + "print(\"Bucket {} created.\".format(bucket.name))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List buckets in a project" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "buckets = client.list_buckets()\n", + "\n", + "print(\"Buckets in {}:\".format(client.project))\n", + "for item in buckets:\n", + " print(\"\\t\" + item.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get bucket metadata\n", + "\n", + "The next cell shows how to get information on metadata of your Cloud Storage buckets.\n", + "\n", + "To learn more about specific bucket properties, see [Bucket locations](https://cloud.google.com/storage/docs/locations) and [Storage classes](https://cloud.google.com/storage/docs/storage-classes)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bucket = client.get_bucket(bucket_name)\n", + "\n", + "print(\"Bucket name: {}\".format(bucket.name))\n", + "print(\"Bucket location: {}\".format(bucket.location))\n", + "print(\"Bucket storage class: {}\".format(bucket.storage_class))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upload a local file to a bucket\n", + "\n", + "Objects are the individual pieces of data that you store in Cloud Storage. Objects are referred to as \"blobs\" in the Python client library. There is no limit on the number of objects that you can create in a bucket.\n", + "\n", + "An object's name is treated as a piece of object metadata in Cloud Storage. 
Object names can contain any combination of Unicode characters (UTF-8 encoded) and must be less than 1024 bytes in length.\n", + "\n", + "For more information, including how to rename an object, see the [Object name requirements](https://cloud.google.com/storage/docs/naming#objectnames)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "blob_name = \"us-states.txt\"\n", + "blob = bucket.blob(blob_name)\n", + "\n", + "source_file_name = \"resources/us-states.txt\"\n", + "blob.upload_from_filename(source_file_name)\n", + "\n", + "print(\"File uploaded to {}.\".format(bucket.name))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List blobs in a bucket" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "blobs = bucket.list_blobs()\n", + "\n", + "print(\"Blobs in {}:\".format(bucket.name))\n", + "for item in blobs:\n", + " print(\"\\t\" + item.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get a blob and display metadata\n", + "\n", + "See [documentation](https://cloud.google.com/storage/docs/viewing-editing-metadata) for more information about object metadata." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "blob = bucket.get_blob(blob_name)\n", + "\n", + "print(\"Name: {}\".format(blob.id))\n", + "print(\"Size: {} bytes\".format(blob.size))\n", + "print(\"Content type: {}\".format(blob.content_type))\n", + "print(\"Public URL: {}\".format(blob.public_url))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download a blob to a local directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "output_file_name = \"resources/downloaded-us-states.txt\"\n", + "blob.download_to_filename(output_file_name)\n", + "\n", + "print(\"Downloaded blob {} to {}.\".format(blob.name, output_file_name))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete a blob" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "blob = client.get_bucket(bucket_name).get_blob(blob_name)\n", + "blob.delete()\n", + "\n", + "print(\"Blob {} deleted.\".format(blob.name))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete a bucket\n", + "\n", + "Note that the bucket must be empty before it can be deleted." 
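+ "\n",
+ "\n",
+ "If the bucket still contains objects, one option is to delete the remaining blobs together with the bucket by passing `force=True` to `Bucket.delete`. The commented-out lines below are a sketch, not part of this tutorial's cleanup flow, and force-deleting is only intended for buckets that contain a relatively small number of objects:\n",
+ "\n",
+ "```python\n",
+ "# bucket = client.get_bucket(bucket_name)\n",
+ "# Deletes any remaining blobs first, then deletes the bucket itself.\n",
+ "# bucket.delete(force=True)\n",
+ "```"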
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bucket = client.get_bucket(bucket_name)\n", + "bucket.delete()\n", + "\n", + "print(\"Bucket {} deleted.\".format(bucket.name))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "Read more about Cloud Storage in the documentation:\n", + "+ [Storage key terms](https://cloud.google.com/storage/docs/key-terms)\n", + "+ [How-to guides](https://cloud.google.com/storage/docs/how-to)\n", + "+ [Pricing](https://cloud.google.com/storage/pricing)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/notebooks/tutorials/storage/Storage command-line tool.ipynb b/notebooks/tutorials/storage/Storage command-line tool.ipynb new file mode 100644 index 00000000000..21e62ae8236 --- /dev/null +++ b/notebooks/tutorials/storage/Storage command-line tool.ipynb @@ -0,0 +1,328 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Storage command-line tool\n", + "\n", + "The [Google Cloud SDK](https://cloud-dot-devsite.googleplex.com/sdk/docs/) provides a set of commands for working with data stored in Cloud Storage. This notebook introduces several `gsutil` commands for interacting with Cloud Storage. Note that shell commands in a notebook must be prepended with a `!`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List available commands\n", + "\n", + "The `gsutil` command can be used to perform a wide array of tasks. Run the `help` command to view a list of available commands:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!gsutil help" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a storage bucket\n", + "\n", + "Buckets are the basic containers that hold your data. Everything that you store in Cloud Storage must be contained in a bucket. You can use buckets to organize your data and control access to your data.\n", + "\n", + "Start by defining a globally unique name.\n", + "\n", + "For more information about naming buckets, see [Bucket name requirements](https://cloud.google.com/storage/docs/naming#requirements)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace the string below with a unique name for the new bucket\n", + "bucket_name = \"your-new-bucket\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NOTE: In the examples below, the `bucket_name` and `project_id` variables are referenced in the commands using `{}` and `$`. 
If you want to avoid creating and using variables, replace these interpolated variables with literal values and remove the `{}` and `$` characters.\n", + "\n", + "Next, create the new bucket with the `gsutil mb` command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!gsutil mb gs://{bucket_name}/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List buckets in a project\n", + "\n", + "Replace 'your-project-id' in the cell below with your project ID and run the cell to list the storage buckets in your project." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace the string below with your project ID\n", + "project_id = \"your-project-id\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!gsutil ls -p $project_id" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The response should look like the following:\n", + "\n", + "```\n", + "gs://your-new-bucket/\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get bucket metadata\n", + "\n", + "The next cell shows how to get information on metadata of your Cloud Storage buckets.\n", + "\n", + "To learn more about specific bucket properties, see [Bucket locations](https://cloud.google.com/storage/docs/locations) and [Storage classes](https://cloud.google.com/storage/docs/storage-classes)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!gsutil ls -L -b gs://{bucket_name}/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The response should look like the following:\n", + "```\n", + "gs://your-new-bucket/ :\n", + " Storage class: MULTI_REGIONAL\n", + " Location constraint: US\n", + " ...\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Upload a local file to a bucket\n", + "\n", + "Objects are the individual pieces of data that you store in Cloud Storage. Objects are referred to as \"blobs\" in the Python client library. There is no limit on the number of objects that you can create in a bucket.\n", + "\n", + "An object's name is treated as a piece of object metadata in Cloud Storage. Object names can contain any combination of Unicode characters (UTF-8 encoded) and must be less than 1024 bytes in length.\n", + "\n", + "For more information, including how to rename an object, see the [Object name requirements](https://cloud.google.com/storage/docs/naming#objectnames)." 
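+ "\n",
+ "\n",
+ "As an aside, if you need to rename an object after uploading it, `gsutil mv` copies the object to the new name and removes the original. The command below is a sketch that uses a hypothetical new object name:\n",
+ "\n",
+ "```\n",
+ "gsutil mv gs://your-new-bucket/us-states.txt gs://your-new-bucket/us-states-renamed.txt\n",
+ "```"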
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!gsutil cp resources/us-states.txt gs://{bucket_name}/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List blobs in a bucket" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!gsutil ls -r gs://{bucket_name}/**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The response should look like the following:\n", + "```\n", + "gs://your-new-bucket/us-states.txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get a blob and display metadata\n", + "\n", + "See [Viewing and editing object metadata](https://cloud.google.com/storage/docs/viewing-editing-metadata) for more information about object metadata." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!gsutil ls -L gs://{bucket_name}/us-states.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The response should look like the following:\n", + "\n", + "```\n", + "gs://your-new-bucket/us-states.txt:\n", + " Creation time: Fri, 08 Feb 2019 05:23:28 GMT\n", + " Update time: Fri, 08 Feb 2019 05:23:28 GMT\n", + " Storage class: STANDARD\n", + " Content-Language: en\n", + " Content-Length: 637\n", + " Content-Type: text/plain\n", + "...\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download a blob to a local directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!gsutil cp gs://{bucket_name}/us-states.txt resources/downloaded-us-states.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleaning up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete a blob" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "!gsutil rm gs://{bucket_name}/us-states.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete a bucket\n", + "\n", + "The following command deletes all objects in the bucket before deleting the bucket itself." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!gsutil rm -r gs://{bucket_name}/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "Read more about Cloud Storage in the documentation:\n", + "+ [Storage key terms](https://cloud.google.com/storage/docs/key-terms)\n", + "+ [How-to guides](https://cloud.google.com/storage/docs/how-to)\n", + "+ [Pricing](https://cloud.google.com/storage/pricing)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/notebooks/tutorials/storage/resources/downloaded-us-states.txt b/notebooks/tutorials/storage/resources/downloaded-us-states.txt new file mode 100644 index 00000000000..54a60e29de9 --- /dev/null +++ b/notebooks/tutorials/storage/resources/downloaded-us-states.txt @@ -0,0 +1,51 @@ +name,post_abbr +Alabama,AL +Alaska,AK +Arizona,AZ +Arkansas,AR +California,CA +Colorado,CO +Connecticut,CT +Delaware,DE +Florida,FL +Georgia,GA +Hawaii,HI +Idaho,ID +Illinois,IL +Indiana,IN +Iowa,IA +Kansas,KS +Kentucky,KY +Louisiana,LA +Maine,ME +Maryland,MD +Massachusetts,MA +Michigan,MI +Minnesota,MN +Mississippi,MS +Missouri,MO +Montana,MT +Nebraska,NE +Nevada,NV +New Hampshire,NH +New Jersey,NJ +New Mexico,NM +New York,NY +North Carolina,NC +North Dakota,ND +Ohio,OH +Oklahoma,OK +Oregon,OR +Pennsylvania,PA +Rhode Island,RI +South Carolina,SC +South Dakota,SD +Tennessee,TN +Texas,TX +Utah,UT +Vermont,VT +Virginia,VA +Washington,WA +West Virginia,WV +Wisconsin,WI +Wyoming,WY diff --git a/notebooks/tutorials/storage/resources/us-states.txt b/notebooks/tutorials/storage/resources/us-states.txt new file mode 100644 index 00000000000..54a60e29de9 --- /dev/null +++ b/notebooks/tutorials/storage/resources/us-states.txt @@ -0,0 +1,51 @@ +name,post_abbr +Alabama,AL +Alaska,AK +Arizona,AZ +Arkansas,AR +California,CA +Colorado,CO +Connecticut,CT +Delaware,DE +Florida,FL +Georgia,GA +Hawaii,HI +Idaho,ID +Illinois,IL +Indiana,IN +Iowa,IA +Kansas,KS +Kentucky,KY +Louisiana,LA +Maine,ME +Maryland,MD +Massachusetts,MA +Michigan,MI +Minnesota,MN +Mississippi,MS +Missouri,MO +Montana,MT +Nebraska,NE +Nevada,NV +New Hampshire,NH +New Jersey,NJ +New Mexico,NM +New York,NY +North Carolina,NC +North Dakota,ND +Ohio,OH +Oklahoma,OK +Oregon,OR +Pennsylvania,PA +Rhode Island,RI +South Carolina,SC +South Dakota,SD +Tennessee,TN +Texas,TX +Utah,UT +Vermont,VT +Virginia,VA +Washington,WA +West Virginia,WV +Wisconsin,WI +Wyoming,WY