BigQuery: Update pandas/bqstorage samples to latest library changes. #2413


Merged: 1 commit, merged Sep 24, 2019
bigquery/pandas-gbq-migration/requirements.txt (7 changes: 5 additions & 2 deletions)

@@ -1,2 +1,5 @@
-google-cloud-bigquery[pandas,pyarrow]==1.9.0
-pandas-gbq==0.9.0
+google-cloud-bigquery==1.20.0
+google-cloud-bigquery-storage==0.7.0
+pandas==0.25.1
+pandas-gbq==0.11.0
+pyarrow==0.14.1
Contributor:

The pyarrow req still feels odd to me. Should we add a comment if there's an alternate option, or start expressing a harder dependency via one of the other libraries?

Contributor Author:

I think at some point we'll make pyarrow a hard dependency in google-cloud-bigquery-storage (in fact, it already is in the conda package), but probably not until after filtering support is launched for the arrow wire format.
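For illustration only, one way to express that harder dependency today would be to reuse the extras syntax the old pin already used, so that google-cloud-bigquery itself pulls in a compatible pyarrow rather than this file pinning it directly. This is a sketch of an alternative, not what the PR does, and it assumes the pandas/pyarrow extras are still published for 1.20.0:

    # Hypothetical alternative requirements.txt: delegate the pyarrow
    # dependency to google-cloud-bigquery's extras instead of pinning it.
    google-cloud-bigquery[pandas,pyarrow]==1.20.0
    google-cloud-bigquery-storage==0.7.0
    pandas-gbq==0.11.0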

bigquery/pandas-gbq-migration/samples_test.py (71 changes: 59 additions & 12 deletions)

@@ -81,6 +81,41 @@ def test_pandas_gbq_query():
     assert len(df) > 0
 
 
+def test_client_library_query_bqstorage():
+    # [START bigquery_migration_client_library_query_bqstorage]
+    import google.auth
+    from google.cloud import bigquery
+    from google.cloud import bigquery_storage_v1beta1
+
+    # Create a BigQuery client and a BigQuery Storage API client with the same
+    # credentials to avoid authenticating twice.
+    credentials, project_id = google.auth.default(
+        scopes=["https://www.googleapis.com/auth/cloud-platform"]
+    )
+    client = bigquery.Client(credentials=credentials, project=project_id)
+    bqstorage_client = bigquery_storage_v1beta1.BigQueryStorageClient(
+        credentials=credentials
+    )
+    sql = "SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`"
+
+    # Use a BigQuery Storage API client to download results more quickly.
+    df = client.query(sql).to_dataframe(bqstorage_client=bqstorage_client)
+    # [END bigquery_migration_client_library_query_bqstorage]
+    assert len(df) > 0
+
+
+def test_pandas_gbq_query_bqstorage():
+    # [START bigquery_migration_pandas_gbq_query_bqstorage]
+    import pandas
+
+    sql = "SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`"
+
+    # Use the BigQuery Storage API to download results more quickly.
+    df = pandas.read_gbq(sql, dialect='standard', use_bqstorage_api=True)
+    # [END bigquery_migration_pandas_gbq_query_bqstorage]
+    assert len(df) > 0
+
+
 def test_client_library_legacy_query():
     # [START bigquery_migration_client_library_query_legacy]
     from google.cloud import bigquery
@@ -184,16 +219,28 @@ def test_client_library_upload_from_dataframe(temp_dataset):
         }
     )
     client = bigquery.Client()
-    dataset_ref = client.dataset('my_dataset')
+    table_id = 'my_dataset.new_table'
     # [END bigquery_migration_client_library_upload_from_dataframe]
-    dataset_ref = client.dataset(temp_dataset.dataset_id)
+    table_id = (
+        temp_dataset.dataset_id
+        + ".test_client_library_upload_from_dataframe"
+    )
     # [START bigquery_migration_client_library_upload_from_dataframe]
-    table_ref = dataset_ref.table('new_table')
-    client.load_table_from_dataframe(df, table_ref).result()
+    # Since string columns use the "object" dtype, pass in a (partial) schema
+    # to ensure the correct BigQuery data type.
+    job_config = bigquery.LoadJobConfig(schema=[
+        bigquery.SchemaField("my_string", "STRING"),
+    ])
+
+    job = client.load_table_from_dataframe(
+        df, table_id, job_config=job_config
+    )
+
+    # Wait for the load job to complete.
+    job.result()
     # [END bigquery_migration_client_library_upload_from_dataframe]
-    client = bigquery.Client()
-    table = client.get_table(table_ref)
+    table = client.get_table(table_id)
     assert table.num_rows == 3
 
 
@@ -209,16 +256,16 @@ def test_pandas_gbq_upload_from_dataframe(temp_dataset):
             'my_float64': [4.0, 5.0, 6.0],
         }
     )
-    full_table_id = 'my_dataset.new_table'
-    project_id = 'my-project-id'
+    table_id = 'my_dataset.new_table'
     # [END bigquery_migration_pandas_gbq_upload_from_dataframe]
-    table_id = 'new_table'
-    full_table_id = '{}.{}'.format(temp_dataset.dataset_id, table_id)
-    project_id = os.environ['GCLOUD_PROJECT']
+    table_id = (
+        temp_dataset.dataset_id
+        + ".test_pandas_gbq_upload_from_dataframe"
+    )
     # [START bigquery_migration_pandas_gbq_upload_from_dataframe]
 
-    df.to_gbq(full_table_id, project_id=project_id)
+    df.to_gbq(table_id)
     # [END bigquery_migration_pandas_gbq_upload_from_dataframe]
     client = bigquery.Client()
-    table = client.get_table(temp_dataset.table(table_id))
+    table = client.get_table(table_id)
     assert table.num_rows == 3
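As an aside, pandas-gbq can express the same partial-schema hint that the client-library sample above passes via LoadJobConfig. A minimal sketch, assuming the table_schema parameter of to_gbq as shipped in pandas-gbq 0.11.0:

    import pandas

    df = pandas.DataFrame({'my_string': ['a', 'b', 'c']})

    # Pass a partial schema so the "object"-dtype column maps to STRING
    # rather than relying on type inference.
    df.to_gbq(
        'my_dataset.new_table',
        table_schema=[{'name': 'my_string', 'type': 'STRING'}],
    )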
bigquery_storage/to_dataframe/jupyter_test.py (49 changes: 8 additions & 41 deletions)

@@ -75,9 +75,6 @@ def test_jupyter_small_query(ipython):
assert "stackoverflow" in ip.user_ns # verify that variable exists


@pytest.mark.skipif(
"TRAVIS" in os.environ, reason="Not running long-running queries on Travis"
)
def test_jupyter_tutorial(ipython):
ip = IPython.get_ipython()
ip.extension_manager.load_extension("google.cloud.bigquery")
@@ -86,33 +83,18 @@ def test_jupyter_tutorial(ipython):
     # speed-up of using the BigQuery Storage API to download the results.
     sample = """
     # [START bigquerystorage_jupyter_tutorial_query]
-    %%bigquery nodejs_deps --use_bqstorage_api
-    SELECT
-        dependency_name,
-        dependency_platform,
-        project_name,
-        project_id,
-        version_number,
-        version_id,
-        dependency_kind,
-        optional_dependency,
-        dependency_requirements,
-        dependency_project_id
-    FROM
-        `bigquery-public-data.libraries_io.dependencies`
-    WHERE
-        LOWER(dependency_platform) = 'npm'
-    LIMIT 2500000
+    %%bigquery tax_forms --use_bqstorage_api
+    SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`
     # [END bigquerystorage_jupyter_tutorial_query]
     """
     result = ip.run_cell(_strip_region_tags(sample))
     result.raise_error()  # Throws an exception if the cell failed.
 
-    assert "nodejs_deps" in ip.user_ns  # verify that variable exists
-    nodejs_deps = ip.user_ns["nodejs_deps"]
+    assert "tax_forms" in ip.user_ns  # verify that variable exists
+    tax_forms = ip.user_ns["tax_forms"]
 
     # [START bigquerystorage_jupyter_tutorial_results]
-    nodejs_deps.head()
+    tax_forms.head()
     # [END bigquerystorage_jupyter_tutorial_results]
 
     # [START bigquerystorage_jupyter_tutorial_context]
@@ -123,26 +105,11 @@ def test_jupyter_tutorial(ipython):

     sample = """
     # [START bigquerystorage_jupyter_tutorial_query_default]
-    %%bigquery java_deps
-    SELECT
-        dependency_name,
-        dependency_platform,
-        project_name,
-        project_id,
-        version_number,
-        version_id,
-        dependency_kind,
-        optional_dependency,
-        dependency_requirements,
-        dependency_project_id
-    FROM
-        `bigquery-public-data.libraries_io.dependencies`
-    WHERE
-        LOWER(dependency_platform) = 'maven'
-    LIMIT 2500000
+    %%bigquery tax_forms
+    SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`
     # [END bigquerystorage_jupyter_tutorial_query_default]
     """
     result = ip.run_cell(_strip_region_tags(sample))
     result.raise_error()  # Throws an exception if the cell failed.
 
-    assert "java_deps" in ip.user_ns  # verify that variable exists
+    assert "tax_forms" in ip.user_ns  # verify that variable exists
bigquery_storage/to_dataframe/requirements.txt (8 changes: 4 additions & 4 deletions)

@@ -1,6 +1,6 @@
 google-auth==1.6.2
-google-cloud-bigquery-storage==0.6.0
-google-cloud-bigquery==1.17.0
-pyarrow==0.13.0
+google-cloud-bigquery-storage==0.7.0
+google-cloud-bigquery==1.20.0
+pyarrow==0.14.1
 ipython==7.2.0
-pandas==0.24.2
+pandas==0.25.1
noxfile.py (3 changes: 2 additions & 1 deletion)

@@ -167,7 +167,8 @@ def _setup_appengine_sdk(session):
 PY3_ONLY_SAMPLES = [
     sample for sample in ALL_TESTED_SAMPLES
     if (sample.startswith('./appengine/standard_python37')
-        or sample.startswith('./functions/'))]
+        or sample.startswith('./functions/')
+        or sample.startswith('./bigquery/pandas-gbq-migration'))]
 NON_GAE_STANDARD_SAMPLES_PY2 = sorted(list((
     set(ALL_TESTED_SAMPLES) -
     set(GAE_STANDARD_SAMPLES)) -