From 61634be9bf9e3df7589fc1bfdbda87288859bb13 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 9 Oct 2020 11:57:40 -0500 Subject: [PATCH 01/18] docs: update snippets samples to support version 2.0 (#309) * docs: update snippets samples to support version 2.0 For some reason, old versions of the google-cloud-bigquery-storage library were still getting used. This pins those dependencies directly, instead. Also, updates the samples to remove warnings about `client.dataset`. * blacken --- samples/snippets/authorized_view_tutorial.py | 8 ++++++-- samples/snippets/authorized_view_tutorial_test.py | 10 +++++++--- samples/snippets/natality_tutorial.py | 4 +++- samples/snippets/natality_tutorial_test.py | 4 ++-- samples/snippets/quickstart.py | 4 ++-- samples/snippets/requirements.txt | 6 +++++- 6 files changed, 25 insertions(+), 11 deletions(-) diff --git a/samples/snippets/authorized_view_tutorial.py b/samples/snippets/authorized_view_tutorial.py index 6b5cc378f..b6a20c6ec 100644 --- a/samples/snippets/authorized_view_tutorial.py +++ b/samples/snippets/authorized_view_tutorial.py @@ -27,16 +27,18 @@ def run_authorized_view_tutorial(override_values={}): client = bigquery.Client() source_dataset_id = "github_source_data" + source_dataset_id_full = "{}.{}".format(client.project, source_dataset_id) # [END bigquery_authorized_view_tutorial] # [END bigquery_avt_create_source_dataset] # To facilitate testing, we replace values with alternatives # provided by the testing harness. source_dataset_id = override_values.get("source_dataset_id", source_dataset_id) + source_dataset_id_full = "{}.{}".format(client.project, source_dataset_id) # [START bigquery_authorized_view_tutorial] # [START bigquery_avt_create_source_dataset] - source_dataset = bigquery.Dataset(client.dataset(source_dataset_id)) + source_dataset = bigquery.Dataset(source_dataset_id_full) # Specify the geographic location where the dataset should reside. source_dataset.location = "US" source_dataset = client.create_dataset(source_dataset) # API request @@ -66,16 +68,18 @@ def run_authorized_view_tutorial(override_values={}): # Create a separate dataset to store your view # [START bigquery_avt_create_shared_dataset] shared_dataset_id = "shared_views" + shared_dataset_id_full = "{}.{}".format(client.project, shared_dataset_id) # [END bigquery_authorized_view_tutorial] # [END bigquery_avt_create_shared_dataset] # To facilitate testing, we replace values with alternatives # provided by the testing harness. 
shared_dataset_id = override_values.get("shared_dataset_id", shared_dataset_id) + shared_dataset_id_full = "{}.{}".format(client.project, shared_dataset_id) # [START bigquery_authorized_view_tutorial] # [START bigquery_avt_create_shared_dataset] - shared_dataset = bigquery.Dataset(client.dataset(shared_dataset_id)) + shared_dataset = bigquery.Dataset(shared_dataset_id_full) shared_dataset.location = "US" shared_dataset = client.create_dataset(shared_dataset) # API request # [END bigquery_avt_create_shared_dataset] diff --git a/samples/snippets/authorized_view_tutorial_test.py b/samples/snippets/authorized_view_tutorial_test.py index 4c74020bd..eb247c5eb 100644 --- a/samples/snippets/authorized_view_tutorial_test.py +++ b/samples/snippets/authorized_view_tutorial_test.py @@ -30,7 +30,7 @@ def datasets_to_delete(client): doomed = [] yield doomed for item in doomed: - client.delete_dataset(item, delete_contents=True) + client.delete_dataset(item, delete_contents=True, not_found_ok=True) def test_authorized_view_tutorial(client, datasets_to_delete): @@ -42,8 +42,12 @@ def test_authorized_view_tutorial(client, datasets_to_delete): str(uuid.uuid4()).replace("-", "_") ), } - source_dataset_ref = client.dataset(override_values["source_dataset_id"]) - shared_dataset_ref = client.dataset(override_values["shared_dataset_id"]) + source_dataset_ref = "{}.{}".format( + client.project, override_values["source_dataset_id"] + ) + shared_dataset_ref = "{}.{}".format( + client.project, override_values["shared_dataset_id"] + ) datasets_to_delete.extend( [override_values["source_dataset_id"], override_values["shared_dataset_id"]] ) diff --git a/samples/snippets/natality_tutorial.py b/samples/snippets/natality_tutorial.py index b2b607b0d..a8d90501a 100644 --- a/samples/snippets/natality_tutorial.py +++ b/samples/snippets/natality_tutorial.py @@ -38,13 +38,15 @@ def run_natality_tutorial(override_values={}): # Prepare a reference to a new dataset for storing the query results. dataset_id = "natality_regression" + dataset_id_full = "{}.{}".format(client.project, dataset_id) # [END bigquery_query_natality_tutorial] # To facilitate testing, we replace values with alternatives # provided by the testing harness. dataset_id = override_values.get("dataset_id", dataset_id) + dataset_id_full = "{}.{}".format(client.project, dataset_id) # [START bigquery_query_natality_tutorial] - dataset = bigquery.Dataset(client.dataset(dataset_id)) + dataset = bigquery.Dataset(dataset_id_full) # Create the new BigQuery dataset. 
dataset = client.create_dataset(dataset) diff --git a/samples/snippets/natality_tutorial_test.py b/samples/snippets/natality_tutorial_test.py index fae72fa46..d9c89bef2 100644 --- a/samples/snippets/natality_tutorial_test.py +++ b/samples/snippets/natality_tutorial_test.py @@ -43,8 +43,8 @@ def test_natality_tutorial(client, datasets_to_delete): natality_tutorial.run_natality_tutorial(override_values) - table_ref = bigquery.Dataset(client.dataset(override_values["dataset_id"])).table( - "regression_input" + table_ref = "{}.{}.{}".format( + client.project, override_values["dataset_id"], "regression_input" ) table = client.get_table(table_ref) assert table.num_rows > 0 diff --git a/samples/snippets/quickstart.py b/samples/snippets/quickstart.py index 56d6fd843..1b0ef5b3a 100644 --- a/samples/snippets/quickstart.py +++ b/samples/snippets/quickstart.py @@ -33,8 +33,8 @@ def run_quickstart(override_values={}): # [START bigquery_quickstart] # Prepares a reference to the new dataset - dataset_ref = bigquery_client.dataset(dataset_id) - dataset = bigquery.Dataset(dataset_ref) + dataset_id_full = "{}.{}".format(bigquery_client.project, dataset_id) + dataset = bigquery.Dataset(dataset_id_full) # Creates the new dataset dataset = bigquery_client.create_dataset(dataset) diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index 6edca4f10..76c333b46 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,6 +1,10 @@ -google-cloud-bigquery[pandas,bqstorage,pyarrow]==2.0.0 +google-cloud-bigquery==2.0.0 +google-cloud-bigquery-storage==2.0.0 google-auth-oauthlib==0.4.1 +grpcio==1.32.0 ipython==7.16.1; python_version < '3.7' ipython==7.17.0; python_version >= '3.7' matplotlib==3.3.1 +pandas==1.1.3 +pyarrow==1.0.1 pytz==2020.1 From 06830814e3328edadd7728e4271ced52233be0b1 Mon Sep 17 00:00:00 2001 From: WhiteSource Renovate Date: Fri, 9 Oct 2020 20:10:31 +0200 Subject: [PATCH 02/18] chore(deps): update dependency google-cloud-bigquery to v2.1.0 (#312) --- samples/snippets/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index 76c333b46..daabdf745 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,4 +1,4 @@ -google-cloud-bigquery==2.0.0 +google-cloud-bigquery==2.1.0 google-cloud-bigquery-storage==2.0.0 google-auth-oauthlib==0.4.1 grpcio==1.32.0 From 35627d145a41d57768f19d4392ef235928e00f72 Mon Sep 17 00:00:00 2001 From: Yoshi Automation Bot Date: Fri, 9 Oct 2020 13:04:02 -0700 Subject: [PATCH 03/18] chore: start tracking obsolete files (#310) This PR was generated using Autosynth. :rainbow: Synth log will be available here: https://source.cloud.google.com/results/invocations/4563ab42-f1d6-4a7f-8e48-cc92dfba56b5/targets - [ ] To automatically regenerate this PR, check this box. 
PiperOrigin-RevId: 334645418 Source-Link: https://github.com/googleapis/googleapis/commit/c941026e5e3d600817a20e9ab4d4be03dff21a68 --- .kokoro/presubmit/presubmit.cfg | 8 ++- .kokoro/samples/python3.6/common.cfg | 6 -- .kokoro/samples/python3.7/common.cfg | 6 -- .kokoro/samples/python3.8/common.cfg | 6 -- mypy.ini | 3 + synth.metadata | 94 +++++++++++++++++++++++++++- 6 files changed, 101 insertions(+), 22 deletions(-) create mode 100644 mypy.ini diff --git a/.kokoro/presubmit/presubmit.cfg b/.kokoro/presubmit/presubmit.cfg index 8f43917d9..b158096f0 100644 --- a/.kokoro/presubmit/presubmit.cfg +++ b/.kokoro/presubmit/presubmit.cfg @@ -1 +1,7 @@ -# Format: //devtools/kokoro/config/proto/build.proto \ No newline at end of file +# Format: //devtools/kokoro/config/proto/build.proto + +# Disable system tests. +env_vars: { + key: "RUN_SYSTEM_TESTS" + value: "false" +} diff --git a/.kokoro/samples/python3.6/common.cfg b/.kokoro/samples/python3.6/common.cfg index f3b930960..a56768eae 100644 --- a/.kokoro/samples/python3.6/common.cfg +++ b/.kokoro/samples/python3.6/common.cfg @@ -13,12 +13,6 @@ env_vars: { value: "py-3.6" } -# Declare build specific Cloud project. -env_vars: { - key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py36" -} - env_vars: { key: "TRAMPOLINE_BUILD_FILE" value: "github/python-bigquery/.kokoro/test-samples.sh" diff --git a/.kokoro/samples/python3.7/common.cfg b/.kokoro/samples/python3.7/common.cfg index fc0654565..c93747180 100644 --- a/.kokoro/samples/python3.7/common.cfg +++ b/.kokoro/samples/python3.7/common.cfg @@ -13,12 +13,6 @@ env_vars: { value: "py-3.7" } -# Declare build specific Cloud project. -env_vars: { - key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py37" -} - env_vars: { key: "TRAMPOLINE_BUILD_FILE" value: "github/python-bigquery/.kokoro/test-samples.sh" diff --git a/.kokoro/samples/python3.8/common.cfg b/.kokoro/samples/python3.8/common.cfg index 2b0bf59b3..9808f15e3 100644 --- a/.kokoro/samples/python3.8/common.cfg +++ b/.kokoro/samples/python3.8/common.cfg @@ -13,12 +13,6 @@ env_vars: { value: "py-3.8" } -# Declare build specific Cloud project. 
-env_vars: { - key: "BUILD_SPECIFIC_GCLOUD_PROJECT" - value: "python-docs-samples-tests-py38" -} - env_vars: { key: "TRAMPOLINE_BUILD_FILE" value: "github/python-bigquery/.kokoro/test-samples.sh" diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 000000000..4505b4854 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,3 @@ +[mypy] +python_version = 3.6 +namespace_packages = True diff --git a/synth.metadata b/synth.metadata index d40e66dac..008810630 100644 --- a/synth.metadata +++ b/synth.metadata @@ -4,15 +4,22 @@ "git": { "name": ".", "remote": "https://github.com/googleapis/python-bigquery.git", - "sha": "fbbe0cb0ea22161d81f1e5504bb89b55e4198634" + "sha": "31644d380b35a76a9147801a4b6b0271c246fd0c" } }, { "git": { "name": "googleapis", "remote": "https://github.com/googleapis/googleapis.git", - "sha": "0dc0a6c0f1a9f979bc0690f0caa5fbafa3000c2c", - "internalRef": "327026955" + "sha": "c941026e5e3d600817a20e9ab4d4be03dff21a68", + "internalRef": "334645418" + } + }, + { + "git": { + "name": "synthtool", + "remote": "https://github.com/googleapis/synthtool.git", + "sha": "f3c04883d6c43261ff13db1f52d03a283be06871" } }, { @@ -33,5 +40,86 @@ "generator": "bazel" } } + ], + "generatedFiles": [ + ".flake8", + ".github/CONTRIBUTING.md", + ".github/ISSUE_TEMPLATE/bug_report.md", + ".github/ISSUE_TEMPLATE/feature_request.md", + ".github/ISSUE_TEMPLATE/support_request.md", + ".github/PULL_REQUEST_TEMPLATE.md", + ".github/release-please.yml", + ".github/snippet-bot.yml", + ".gitignore", + ".kokoro/build.sh", + ".kokoro/continuous/common.cfg", + ".kokoro/continuous/continuous.cfg", + ".kokoro/docker/docs/Dockerfile", + ".kokoro/docker/docs/fetch_gpg_keys.sh", + ".kokoro/docs/common.cfg", + ".kokoro/docs/docs-presubmit.cfg", + ".kokoro/docs/docs.cfg", + ".kokoro/populate-secrets.sh", + ".kokoro/presubmit/common.cfg", + ".kokoro/presubmit/presubmit.cfg", + ".kokoro/presubmit/system-3.8.cfg", + ".kokoro/publish-docs.sh", + ".kokoro/release.sh", + ".kokoro/release/common.cfg", + ".kokoro/release/release.cfg", + ".kokoro/samples/lint/common.cfg", + ".kokoro/samples/lint/continuous.cfg", + ".kokoro/samples/lint/periodic.cfg", + ".kokoro/samples/lint/presubmit.cfg", + ".kokoro/samples/python3.6/common.cfg", + ".kokoro/samples/python3.6/continuous.cfg", + ".kokoro/samples/python3.6/periodic.cfg", + ".kokoro/samples/python3.6/presubmit.cfg", + ".kokoro/samples/python3.7/common.cfg", + ".kokoro/samples/python3.7/continuous.cfg", + ".kokoro/samples/python3.7/periodic.cfg", + ".kokoro/samples/python3.7/presubmit.cfg", + ".kokoro/samples/python3.8/common.cfg", + ".kokoro/samples/python3.8/continuous.cfg", + ".kokoro/samples/python3.8/periodic.cfg", + ".kokoro/samples/python3.8/presubmit.cfg", + ".kokoro/test-samples.sh", + ".kokoro/trampoline.sh", + ".kokoro/trampoline_v2.sh", + ".trampolinerc", + "CODE_OF_CONDUCT.md", + "CONTRIBUTING.rst", + "LICENSE", + "MANIFEST.in", + "docs/_static/custom.css", + "docs/_templates/layout.html", + "docs/bigquery_v2/services.rst", + "docs/bigquery_v2/types.rst", + "docs/conf.py", + "google/cloud/bigquery_v2/__init__.py", + "google/cloud/bigquery_v2/proto/encryption_config.proto", + "google/cloud/bigquery_v2/proto/model.proto", + "google/cloud/bigquery_v2/proto/model_reference.proto", + "google/cloud/bigquery_v2/proto/standard_sql.proto", + "google/cloud/bigquery_v2/py.typed", + "google/cloud/bigquery_v2/types/__init__.py", + "google/cloud/bigquery_v2/types/encryption_config.py", + "google/cloud/bigquery_v2/types/model.py", + "google/cloud/bigquery_v2/types/model_reference.py", + 
"google/cloud/bigquery_v2/types/standard_sql.py", + "mypy.ini", + "renovate.json", + "samples/AUTHORING_GUIDE.md", + "samples/CONTRIBUTING.md", + "scripts/decrypt-secrets.sh", + "scripts/fixup_bigquery_v2_keywords.py", + "scripts/readme-gen/readme_gen.py", + "scripts/readme-gen/templates/README.tmpl.rst", + "scripts/readme-gen/templates/auth.tmpl.rst", + "scripts/readme-gen/templates/auth_api_key.tmpl.rst", + "scripts/readme-gen/templates/install_deps.tmpl.rst", + "scripts/readme-gen/templates/install_portaudio.tmpl.rst", + "setup.cfg", + "testing/.gitignore" ] } \ No newline at end of file From b8f502b14f21d1815697e4d57cf1225dfb4a7c5e Mon Sep 17 00:00:00 2001 From: HemangChothani <50404902+HemangChothani@users.noreply.github.com> Date: Mon, 12 Oct 2020 10:24:16 -0400 Subject: [PATCH 04/18] fix: use version.py instead of pkg_resources.get_distribution (#307) * fix: use version.py instead of pkg_resources.get_distribution * fix: nit --- google/cloud/bigquery/__init__.py | 4 ++-- google/cloud/bigquery/version.py | 15 +++++++++++++++ setup.py | 7 ++++++- 3 files changed, 23 insertions(+), 3 deletions(-) create mode 100644 google/cloud/bigquery/version.py diff --git a/google/cloud/bigquery/__init__.py b/google/cloud/bigquery/__init__.py index 89c5a3624..e83e70084 100644 --- a/google/cloud/bigquery/__init__.py +++ b/google/cloud/bigquery/__init__.py @@ -28,9 +28,9 @@ """ -from pkg_resources import get_distribution +from google.cloud.bigquery import version as bigquery_version -__version__ = get_distribution("google-cloud-bigquery").version +__version__ = bigquery_version.__version__ from google.cloud.bigquery.client import Client from google.cloud.bigquery.dataset import AccessEntry diff --git a/google/cloud/bigquery/version.py b/google/cloud/bigquery/version.py new file mode 100644 index 000000000..8b5d3328c --- /dev/null +++ b/google/cloud/bigquery/version.py @@ -0,0 +1,15 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "2.1.0" diff --git a/setup.py b/setup.py index 14b38b63e..be7296081 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ name = "google-cloud-bigquery" description = "Google BigQuery API client library" -version = "2.1.0" + # Should be one of: # 'Development Status :: 3 - Alpha' # 'Development Status :: 4 - Beta' @@ -83,6 +83,11 @@ with io.open(readme_filename, encoding="utf-8") as readme_file: readme = readme_file.read() +version = {} +with open(os.path.join(package_root, "google/cloud/bigquery/version.py")) as fp: + exec(fp.read(), version) +version = version["__version__"] + # Only include packages under the 'google' namespace. Do not include tests, # benchmarks, etc. packages = [ From 801e4c0574b7e421aa3a28cafec6fd6bcce940dd Mon Sep 17 00:00:00 2001 From: Carlos de la Guardia Date: Mon, 12 Oct 2020 17:10:07 -0500 Subject: [PATCH 05/18] deps: require pyarrow for pandas support (#314) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [X] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [X] Ensure the tests and linter pass - [X] Code coverage does not decrease (if any source code was changed) - [X] Appropriate docs were updated (if necessary) Fixes #265 🦕 --- docs/snippets.py | 4 - google/cloud/bigquery/__init__.py | 3 - google/cloud/bigquery/client.py | 43 ++---- google/cloud/bigquery/exceptions.py | 17 -- google/cloud/bigquery/table.py | 92 ++++------- noxfile.py | 5 +- setup.py | 12 +- testing/constraints-3.6.txt | 1 - tests/unit/test__pandas_helpers.py | 8 + tests/unit/test_client.py | 144 +---------------- tests/unit/test_job.py | 39 ----- tests/unit/test_table.py | 232 ++++++---------------------- 12 files changed, 97 insertions(+), 503 deletions(-) delete mode 100644 google/cloud/bigquery/exceptions.py diff --git a/docs/snippets.py b/docs/snippets.py index bc6b58020..8c106e63d 100644 --- a/docs/snippets.py +++ b/docs/snippets.py @@ -26,10 +26,6 @@ import pytest -try: - import fastparquet -except (ImportError, AttributeError): - fastparquet = None try: import pandas except (ImportError, AttributeError): diff --git a/google/cloud/bigquery/__init__.py b/google/cloud/bigquery/__init__.py index e83e70084..b8d1cc4d7 100644 --- a/google/cloud/bigquery/__init__.py +++ b/google/cloud/bigquery/__init__.py @@ -38,7 +38,6 @@ from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery import enums from google.cloud.bigquery.enums import StandardSqlDataTypes -from google.cloud.bigquery.exceptions import PyarrowMissingWarning from google.cloud.bigquery.external_config import ExternalConfig from google.cloud.bigquery.external_config import BigtableOptions from google.cloud.bigquery.external_config import BigtableColumnFamily @@ -143,8 +142,6 @@ "WriteDisposition", # EncryptionConfiguration "EncryptionConfiguration", - # Errors and warnings - "PyarrowMissingWarning", ] diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index fcb18385d..2afffab80 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -58,7 +58,6 @@ from google.cloud.bigquery.dataset import Dataset from google.cloud.bigquery.dataset import DatasetListItem from google.cloud.bigquery.dataset import DatasetReference -from google.cloud.bigquery.exceptions import PyarrowMissingWarning from google.cloud.bigquery.opentelemetry_tracing import create_span from google.cloud.bigquery import job from google.cloud.bigquery.model import Model @@ -2135,29 +2134,31 @@ def load_table_from_dataframe( [Beta] The compression method to use if intermittently serializing ``dataframe`` to a parquet file. - If ``pyarrow`` and job config schema are used, the argument - is directly passed as the ``compression`` argument to the - underlying ``pyarrow.parquet.write_table()`` method (the - default value "snappy" gets converted to uppercase). + The argument is directly passed as the ``compression`` + argument to the underlying ``pyarrow.parquet.write_table()`` + method (the default value "snappy" gets converted to uppercase). 
https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table - If either ``pyarrow`` or job config schema are missing, the - argument is directly passed as the ``compression`` argument - to the underlying ``DataFrame.to_parquet()`` method. + If the job config schema is missing, the argument is directly + passed as the ``compression`` argument to the underlying + ``DataFrame.to_parquet()`` method. https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet Returns: google.cloud.bigquery.job.LoadJob: A new load job. Raises: - ImportError: + ValueError: If a usable parquet engine cannot be found. This method - requires :mod:`pyarrow` or :mod:`fastparquet` to be - installed. + requires :mod:`pyarrow` to be installed. TypeError: If ``job_config`` is not an instance of :class:`~google.cloud.bigquery.job.LoadJobConfig` class. """ + if pyarrow is None: + # pyarrow is now the only supported parquet engine. + raise ValueError("This method requires pyarrow to be installed") + job_id = _make_job_id(job_id, job_id_prefix) if job_config: @@ -2222,7 +2223,7 @@ def load_table_from_dataframe( os.close(tmpfd) try: - if pyarrow and job_config.schema: + if job_config.schema: if parquet_compression == "snappy": # adjust the default value parquet_compression = parquet_compression.upper() @@ -2233,24 +2234,6 @@ def load_table_from_dataframe( parquet_compression=parquet_compression, ) else: - if not pyarrow: - warnings.warn( - "Loading dataframe data without pyarrow installed is " - "deprecated and will become unsupported in the future. " - "Please install the pyarrow package.", - PyarrowMissingWarning, - stacklevel=2, - ) - - if job_config.schema: - warnings.warn( - "job_config.schema is set, but not used to assist in " - "identifying correct types for data serialization. " - "Please install the pyarrow package.", - PendingDeprecationWarning, - stacklevel=2, - ) - dataframe.to_parquet(tmppath, compression=parquet_compression) with open(tmppath, "rb") as parquet_file: diff --git a/google/cloud/bigquery/exceptions.py b/google/cloud/bigquery/exceptions.py deleted file mode 100644 index 93490ef97..000000000 --- a/google/cloud/bigquery/exceptions.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright 2020 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -class PyarrowMissingWarning(DeprecationWarning): - pass diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index a72bacb74..01e8815da 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -50,7 +50,6 @@ from google.cloud.bigquery.schema import _build_schema_resource from google.cloud.bigquery.schema import _parse_schema_resource from google.cloud.bigquery.schema import _to_schema_fields -from google.cloud.bigquery.exceptions import PyarrowMissingWarning from google.cloud.bigquery.external_config import ExternalConfig from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration @@ -1679,75 +1678,38 @@ def to_dataframe( create_bqstorage_client = False bqstorage_client = None - if pyarrow is not None: - # If pyarrow is available, calling to_arrow, then converting to a - # pandas dataframe is about 2x faster. This is because pandas.concat is - # rarely no-copy, whereas pyarrow.Table.from_batches + to_pandas is - # usually no-copy. - record_batch = self.to_arrow( - progress_bar_type=progress_bar_type, - bqstorage_client=bqstorage_client, - create_bqstorage_client=create_bqstorage_client, - ) + record_batch = self.to_arrow( + progress_bar_type=progress_bar_type, + bqstorage_client=bqstorage_client, + create_bqstorage_client=create_bqstorage_client, + ) + + # When converting timestamp values to nanosecond precision, the result + # can be out of pyarrow bounds. To avoid the error when converting to + # Pandas, we set the timestamp_as_object parameter to True, if necessary. + types_to_check = { + pyarrow.timestamp("us"), + pyarrow.timestamp("us", tz=pytz.UTC), + } - # When converting timestamp values to nanosecond precision, the result - # can be out of pyarrow bounds. To avoid the error when converting to - # Pandas, we set the timestamp_as_object parameter to True, if necessary. - types_to_check = { - pyarrow.timestamp("us"), - pyarrow.timestamp("us", tz=pytz.UTC), - } - - for column in record_batch: - if column.type in types_to_check: - try: - column.cast("timestamp[ns]") - except pyarrow.lib.ArrowInvalid: - timestamp_as_object = True - break - else: - timestamp_as_object = False - - extra_kwargs = {"timestamp_as_object": timestamp_as_object} - - df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs) - - for column in dtypes: - df[column] = pandas.Series(df[column], dtype=dtypes[column]) - return df + for column in record_batch: + if column.type in types_to_check: + try: + column.cast("timestamp[ns]") + except pyarrow.lib.ArrowInvalid: + timestamp_as_object = True + break else: - warnings.warn( - "Converting to a dataframe without pyarrow installed is " - "often slower and will become unsupported in the future. " - "Please install the pyarrow package.", - PyarrowMissingWarning, - stacklevel=2, - ) + timestamp_as_object = False - # The bqstorage_client is only used if pyarrow is available, so the - # rest of this method only needs to account for tabledata.list. - progress_bar = self._get_progress_bar(progress_bar_type) + extra_kwargs = {"timestamp_as_object": timestamp_as_object} - frames = [] - for frame in self.to_dataframe_iterable(dtypes=dtypes): - frames.append(frame) + df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs) - if progress_bar is not None: - # In some cases, the number of total rows is not populated - # until the first page of rows is fetched. Update the - # progress bar's total to keep an accurate count. 
- progress_bar.total = progress_bar.total or self.total_rows - progress_bar.update(len(frame)) - - if progress_bar is not None: - # Indicate that the download has finished. - progress_bar.close() - - # Avoid concatting an empty list. - if not frames: - column_names = [field.name for field in self._schema] - return pandas.DataFrame(columns=column_names) - return pandas.concat(frames, ignore_index=True) + for column in dtypes: + df[column] = pandas.Series(df[column], dtype=dtypes[column]) + + return df class _EmptyRowIterator(object): diff --git a/noxfile.py b/noxfile.py index 42d8f9356..db1dcffde 100644 --- a/noxfile.py +++ b/noxfile.py @@ -49,10 +49,7 @@ def default(session): constraints_path, ) - # fastparquet is not included in .[all] because, in general, it's - # redundant with pyarrow. We still want to run some unit tests with - # fastparquet serialization, though. - session.install("-e", ".[all,fastparquet]", "-c", constraints_path) + session.install("-e", ".[all]", "-c", constraints_path) session.install("ipython", "-c", constraints_path) diff --git a/setup.py b/setup.py index be7296081..abd5cef95 100644 --- a/setup.py +++ b/setup.py @@ -47,13 +47,12 @@ "grpcio >= 1.32.0, < 2.0dev", "pyarrow >= 1.0.0, < 2.0dev", ], - "pandas": ["pandas>=0.23.0"], - "pyarrow": [ + "pandas": [ + "pandas>=0.23.0", # pyarrow 1.0.0 is required for the use of timestamp_as_object keyword. "pyarrow >= 1.0.0, < 2.0dev", ], "tqdm": ["tqdm >= 4.7.4, <5.0.0dev"], - "fastparquet": ["fastparquet", "python-snappy", "llvmlite>=0.34.0"], "opentelemetry": [ "opentelemetry-api==0.9b0", "opentelemetry-sdk==0.9b0", @@ -64,13 +63,6 @@ all_extras = [] for extra in extras: - if extra in ( - # Skip fastparquet from "all" because it is redundant with pyarrow and - # creates a dependency on pre-release versions of numpy. 
See: - # https://github.com/googleapis/google-cloud-python/issues/8549 - "fastparquet", - ): - continue all_extras.extend(extras[extra]) extras["all"] = all_extras diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt index a9f4faa92..798804941 100644 --- a/testing/constraints-3.6.txt +++ b/testing/constraints-3.6.txt @@ -1,4 +1,3 @@ -fastparquet==0.4.1 google-api-core==1.22.2 google-cloud-bigquery-storage==2.0.0 google-cloud-core==1.4.1 diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index c1073066d..bdb1c56ea 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -1329,3 +1329,11 @@ def test_download_dataframe_tabledata_list_dict_sequence_schema(module_under_tes ) ) assert result.equals(expected_result) + + with pytest.raises(StopIteration): + result = next(results_gen) + + +def test_table_data_listpage_to_dataframe_skips_stop_iteration(module_under_test): + dataframe = module_under_test._tabledata_list_page_to_dataframe([], [], {}) + assert isinstance(dataframe, pandas.DataFrame) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index f44201ab8..737c1aef7 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -32,10 +32,6 @@ import pytz import pkg_resources -try: - import fastparquet -except (ImportError, AttributeError): # pragma: NO COVER - fastparquet = None try: import pandas except (ImportError, AttributeError): # pragma: NO COVER @@ -7838,80 +7834,6 @@ def test_load_table_from_dataframe_unknown_table(self): job_config=mock.ANY, ) - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(fastparquet is None, "Requires `fastparquet`") - def test_load_table_from_dataframe_no_pyarrow_warning(self): - from google.cloud.bigquery.client import PyarrowMissingWarning - - client = self._make_client() - - # Pick at least one column type that translates to Pandas dtype - # "object". A string column matches that. - records = [{"name": "Monty", "age": 100}, {"name": "Python", "age": 60}] - dataframe = pandas.DataFrame(records) - - get_table_patch = mock.patch( - "google.cloud.bigquery.client.Client.get_table", - autospec=True, - side_effect=google.api_core.exceptions.NotFound("Table not found"), - ) - load_patch = mock.patch( - "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True - ) - pyarrow_patch = mock.patch("google.cloud.bigquery.client.pyarrow", None) - pyarrow_patch_helpers = mock.patch( - "google.cloud.bigquery._pandas_helpers.pyarrow", None - ) - catch_warnings = warnings.catch_warnings(record=True) - - with get_table_patch, load_patch, pyarrow_patch, pyarrow_patch_helpers, catch_warnings as warned: - client.load_table_from_dataframe( - dataframe, self.TABLE_REF, location=self.LOCATION - ) - - matches = [ - warning for warning in warned if warning.category is PyarrowMissingWarning - ] - assert matches, "A missing pyarrow deprecation warning was not raised." - - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(fastparquet is None, "Requires `fastparquet`") - def test_load_table_from_dataframe_no_schema_warning_wo_pyarrow(self): - client = self._make_client() - - # Pick at least one column type that translates to Pandas dtype - # "object". A string column matches that. 
- records = [{"name": "Monty", "age": 100}, {"name": "Python", "age": 60}] - dataframe = pandas.DataFrame(records) - - get_table_patch = mock.patch( - "google.cloud.bigquery.client.Client.get_table", - autospec=True, - side_effect=google.api_core.exceptions.NotFound("Table not found"), - ) - load_patch = mock.patch( - "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True - ) - pyarrow_patch = mock.patch("google.cloud.bigquery.client.pyarrow", None) - pyarrow_patch_helpers = mock.patch( - "google.cloud.bigquery._pandas_helpers.pyarrow", None - ) - catch_warnings = warnings.catch_warnings(record=True) - - with get_table_patch, load_patch, pyarrow_patch, pyarrow_patch_helpers, catch_warnings as warned: - client.load_table_from_dataframe( - dataframe, self.TABLE_REF, location=self.LOCATION - ) - - matches = [ - warning - for warning in warned - if warning.category in (DeprecationWarning, PendingDeprecationWarning) - and "could not be detected" in str(warning) - and "please provide a schema" in str(warning) - ] - assert matches, "A missing schema deprecation warning was not raised." - @unittest.skipIf( pandas is None or PANDAS_INSTALLED_VERSION < PANDAS_MINIUM_VERSION, "Only `pandas version >=1.0.0` supported", @@ -8182,7 +8104,6 @@ def test_load_table_from_dataframe_w_partial_schema_extra_types(self): assert "unknown_col" in message @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(fastparquet is None, "Requires `fastparquet`") def test_load_table_from_dataframe_w_partial_schema_missing_types(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -8236,55 +8157,6 @@ def test_load_table_from_dataframe_w_partial_schema_missing_types(self): assert sent_config.source_format == job.SourceFormat.PARQUET assert sent_config.schema is None - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_load_table_from_dataframe_w_schema_wo_pyarrow(self): - from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES - from google.cloud.bigquery import job - from google.cloud.bigquery.schema import SchemaField - - client = self._make_client() - records = [{"name": u"Monty", "age": 100}, {"name": u"Python", "age": 60}] - dataframe = pandas.DataFrame(records, columns=["name", "age"]) - schema = (SchemaField("name", "STRING"), SchemaField("age", "INTEGER")) - job_config = job.LoadJobConfig(schema=schema) - - load_patch = mock.patch( - "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True - ) - pyarrow_patch = mock.patch("google.cloud.bigquery.client.pyarrow", None) - - with load_patch as load_table_from_file, pyarrow_patch, warnings.catch_warnings( - record=True - ) as warned: - client.load_table_from_dataframe( - dataframe, self.TABLE_REF, job_config=job_config, location=self.LOCATION - ) - - assert warned # there should be at least one warning - for warning in warned: - assert "pyarrow" in str(warning) - assert issubclass( - warning.category, (DeprecationWarning, PendingDeprecationWarning) - ) - - load_table_from_file.assert_called_once_with( - client, - mock.ANY, - self.TABLE_REF, - num_retries=_DEFAULT_NUM_RETRIES, - rewind=True, - job_id=mock.ANY, - job_id_prefix=None, - location=self.LOCATION, - project=None, - job_config=mock.ANY, - ) - - sent_config = load_table_from_file.mock_calls[0][2]["job_config"] - assert sent_config.source_format == job.SourceFormat.PARQUET - assert tuple(sent_config.schema) == schema - 
@unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_schema_arrow_custom_compression(self): @@ -8320,7 +8192,7 @@ def test_load_table_from_dataframe_w_schema_arrow_custom_compression(self): @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") - def test_load_table_from_dataframe_wo_pyarrow_custom_compression(self): + def test_load_table_from_dataframe_wo_pyarrow_raises_error(self): client = self._make_client() records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}] dataframe = pandas.DataFrame(records) @@ -8338,8 +8210,8 @@ def test_load_table_from_dataframe_wo_pyarrow_custom_compression(self): dataframe, "to_parquet", wraps=dataframe.to_parquet ) - with load_patch, get_table_patch, pyarrow_patch, to_parquet_patch as to_parquet_spy: - with warnings.catch_warnings(record=True) as warned: + with load_patch, get_table_patch, pyarrow_patch, to_parquet_patch: + with pytest.raises(ValueError): client.load_table_from_dataframe( dataframe, self.TABLE_REF, @@ -8347,16 +8219,6 @@ def test_load_table_from_dataframe_wo_pyarrow_custom_compression(self): parquet_compression="gzip", ) - call_args = to_parquet_spy.call_args - assert call_args is not None - assert call_args.kwargs.get("compression") == "gzip" - - assert len(warned) == 2 - warning = warned[0] - assert "Loading dataframe data without pyarrow" in str(warning) - warning = warned[1] - assert "Please install the pyarrow package" in str(warning) - @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_nulls(self): diff --git a/tests/unit/test_job.py b/tests/unit/test_job.py index fb042e18c..d21489616 100644 --- a/tests/unit/test_job.py +++ b/tests/unit/test_job.py @@ -5802,45 +5802,6 @@ def test_to_dataframe_column_date_dtypes(self): self.assertEqual(df.date.dtype.name, "datetime64[ns]") - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_column_date_dtypes_wo_pyarrow(self): - begun_resource = self._make_resource() - query_resource = { - "jobComplete": True, - "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, - "totalRows": "1", - "schema": {"fields": [{"name": "date", "type": "DATE"}]}, - } - row_data = [ - ["1999-12-01"], - ] - rows = [{"f": [{"v": field} for field in row]} for row in row_data] - query_resource["rows"] = rows - done_resource = copy.deepcopy(begun_resource) - done_resource["status"] = {"state": "DONE"} - connection = _make_connection( - begun_resource, query_resource, done_resource, query_resource - ) - client = _make_client(project=self.PROJECT, connection=connection) - job = self._make_one(self.JOB_ID, self.QUERY, client) - - with mock.patch("google.cloud.bigquery.table.pyarrow", None): - with warnings.catch_warnings(record=True) as warned: - df = job.to_dataframe( - date_as_object=False, create_bqstorage_client=False - ) - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 1) # verify the number of rows - exp_columns = [field["name"] for field in query_resource["schema"]["fields"]] - self.assertEqual(list(df), exp_columns) # verify the column names - - self.assertEqual(df.date.dtype.name, "object") - - assert len(warned) == 1 - warning = warned[0] - assert "without pyarrow" in str(warning) - @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(tqdm is None, "Requires `tqdm`") @mock.patch("tqdm.tqdm") 
diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 12169658e..fe17d2852 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -2148,6 +2148,49 @@ def test_to_dataframe_iterable(self): self.assertEqual(df_2["name"][0], "Sven") self.assertEqual(df_2["age"][0], 33) + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_to_dataframe_iterable_with_dtypes(self): + from google.cloud.bigquery.schema import SchemaField + import types + + schema = [ + SchemaField("name", "STRING", mode="REQUIRED"), + SchemaField("age", "INTEGER", mode="REQUIRED"), + ] + + path = "/foo" + api_request = mock.Mock( + side_effect=[ + { + "rows": [{"f": [{"v": "Bengt"}, {"v": "32"}]}], + "pageToken": "NEXTPAGE", + }, + {"rows": [{"f": [{"v": "Sven"}, {"v": "33"}]}]}, + ] + ) + + row_iterator = self._make_one( + _mock_client(), api_request, path, schema, page_size=1, max_results=5 + ) + dfs = row_iterator.to_dataframe_iterable(dtypes={"age": "int32"}) + + self.assertIsInstance(dfs, types.GeneratorType) + + df_1 = next(dfs) + self.assertIsInstance(df_1, pandas.DataFrame) + self.assertEqual(df_1.name.dtype.name, "object") + self.assertEqual(df_1.age.dtype.name, "int32") + self.assertEqual(len(df_1), 1) # verify the number of rows + self.assertEqual( + df_1["name"][0], "Bengt" + ) # verify the first value of 'name' column + self.assertEqual(df_1["age"][0], 32) # verify the first value of 'age' column + + df_2 = next(dfs) + self.assertEqual(len(df_2), 1) # verify the number of rows + self.assertEqual(df_2["name"][0], "Sven") + self.assertEqual(df_2["age"][0], 33) + @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf( bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" @@ -2327,38 +2370,6 @@ def test_to_dataframe_datetime_out_of_pyarrow_bounds(self): [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)], ) - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_warning_wo_pyarrow(self): - from google.cloud.bigquery.client import PyarrowMissingWarning - from google.cloud.bigquery.schema import SchemaField - - schema = [ - SchemaField("name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), - ] - rows = [ - {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, - {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, - ] - path = "/foo" - api_request = mock.Mock(return_value={"rows": rows}) - row_iterator = self._make_one(_mock_client(), api_request, path, schema) - - no_pyarrow_patch = mock.patch("google.cloud.bigquery.table.pyarrow", new=None) - catch_warnings = warnings.catch_warnings(record=True) - - with no_pyarrow_patch, catch_warnings as warned: - df = row_iterator.to_dataframe() - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 2) - matches = [ - warning for warning in warned if warning.category is PyarrowMissingWarning - ] - self.assertTrue( - matches, msg="A missing pyarrow deprecation warning was not raised." 
- ) - @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(tqdm is None, "Requires `tqdm`") @mock.patch("tqdm.tqdm_gui") @@ -2399,50 +2410,6 @@ def test_to_dataframe_progress_bar( progress_bar_mock().close.assert_called_once() self.assertEqual(len(df), 4) - @unittest.skipIf(pandas is None, "Requires `pandas`") - @unittest.skipIf(tqdm is None, "Requires `tqdm`") - @mock.patch("tqdm.tqdm_gui") - @mock.patch("tqdm.tqdm_notebook") - @mock.patch("tqdm.tqdm") - def test_to_dataframe_progress_bar_wo_pyarrow( - self, tqdm_mock, tqdm_notebook_mock, tqdm_gui_mock - ): - from google.cloud.bigquery.schema import SchemaField - - schema = [ - SchemaField("name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), - ] - rows = [ - {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]}, - {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]}, - {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]}, - {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]}, - ] - path = "/foo" - api_request = mock.Mock(return_value={"rows": rows}) - - progress_bars = ( - ("tqdm", tqdm_mock), - ("tqdm_notebook", tqdm_notebook_mock), - ("tqdm_gui", tqdm_gui_mock), - ) - - for progress_bar_type, progress_bar_mock in progress_bars: - row_iterator = self._make_one(_mock_client(), api_request, path, schema) - with mock.patch("google.cloud.bigquery.table.pyarrow", None): - with warnings.catch_warnings(record=True) as warned: - df = row_iterator.to_dataframe(progress_bar_type=progress_bar_type) - - progress_bar_mock.assert_called() - progress_bar_mock().update.assert_called() - progress_bar_mock().close.assert_called_once() - self.assertEqual(len(df), 4) - - self.assertEqual(len(warned), 1) - warning = warned[0] - self.assertTrue("without pyarrow" in str(warning)) - @unittest.skipIf(pandas is None, "Requires `pandas`") @mock.patch("google.cloud.bigquery.table.tqdm", new=None) def test_to_dataframe_no_tqdm_no_progress_bar(self): @@ -2557,57 +2524,6 @@ def test_to_dataframe_w_empty_results(self): self.assertEqual(len(df), 0) # verify the number of rows self.assertEqual(list(df), ["name", "age"]) # verify the column names - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_w_empty_results_wo_pyarrow(self): - from google.cloud.bigquery.schema import SchemaField - - with mock.patch("google.cloud.bigquery.table.pyarrow", None): - schema = [ - SchemaField("name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), - ] - api_request = mock.Mock(return_value={"rows": []}) - row_iterator = self._make_one(_mock_client(), api_request, schema=schema) - - with warnings.catch_warnings(record=True) as warned: - df = row_iterator.to_dataframe() - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 0) # verify the number of rows - self.assertEqual(list(df), ["name", "age"]) # verify the column names - - self.assertEqual(len(warned), 1) - warning = warned[0] - self.assertTrue("without pyarrow" in str(warning)) - - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_w_no_results_wo_pyarrow(self): - from google.cloud.bigquery.schema import SchemaField - - with mock.patch("google.cloud.bigquery.table.pyarrow", None): - schema = [ - SchemaField("name", "STRING", mode="REQUIRED"), - SchemaField("age", "INTEGER", mode="REQUIRED"), - ] - api_request = mock.Mock(return_value={"rows": []}) - row_iterator = self._make_one(_mock_client(), api_request, schema=schema) - - def empty_iterable(dtypes=None): - return [] - - 
row_iterator.to_dataframe_iterable = empty_iterable - - with warnings.catch_warnings(record=True) as warned: - df = row_iterator.to_dataframe() - - self.assertIsInstance(df, pandas.DataFrame) - self.assertEqual(len(df), 0) # verify the number of rows - self.assertEqual(list(df), ["name", "age"]) # verify the column names - - self.assertEqual(len(warned), 1) - warning = warned[0] - self.assertTrue("without pyarrow" in str(warning)) - @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_w_various_types_nullable(self): import datetime @@ -3424,68 +3340,6 @@ def test_to_dataframe_concat_categorical_dtype_w_pyarrow(self): # Don't close the client if it was passed in. bqstorage_client._transport.grpc_channel.close.assert_not_called() - @unittest.skipIf(pandas is None, "Requires `pandas`") - def test_to_dataframe_concat_categorical_dtype_wo_pyarrow(self): - from google.cloud.bigquery.schema import SchemaField - - schema = [ - SchemaField("col_str", "STRING"), - SchemaField("col_category", "STRING"), - ] - row_data = [ - [u"foo", u"low"], - [u"bar", u"medium"], - [u"baz", u"low"], - [u"foo_page2", u"medium"], - [u"bar_page2", u"high"], - [u"baz_page2", u"low"], - ] - path = "/foo" - - rows = [{"f": [{"v": field} for field in row]} for row in row_data[:3]] - rows_page2 = [{"f": [{"v": field} for field in row]} for row in row_data[3:]] - api_request = mock.Mock( - side_effect=[{"rows": rows, "pageToken": "NEXTPAGE"}, {"rows": rows_page2}] - ) - - row_iterator = self._make_one(_mock_client(), api_request, path, schema) - - mock_pyarrow = mock.patch("google.cloud.bigquery.table.pyarrow", None) - catch_warnings = warnings.catch_warnings(record=True) - - with mock_pyarrow, catch_warnings as warned: - got = row_iterator.to_dataframe( - dtypes={ - "col_category": pandas.core.dtypes.dtypes.CategoricalDtype( - categories=["low", "medium", "high"], ordered=False, - ), - }, - ) - - self.assertIsInstance(got, pandas.DataFrame) - self.assertEqual(len(got), 6) # verify the number of rows - expected_columns = [field.name for field in schema] - self.assertEqual(list(got), expected_columns) # verify the column names - - # Are column types correct? - expected_dtypes = [ - pandas.core.dtypes.dtypes.np.dtype("O"), # the default for string data - pandas.core.dtypes.dtypes.CategoricalDtype( - categories=["low", "medium", "high"], ordered=False, - ), - ] - self.assertEqual(list(got.dtypes), expected_dtypes) - - # And the data in the categorical column? 
- self.assertEqual( - list(got["col_category"]), - ["low", "medium", "low", "medium", "high", "low"], - ) - - self.assertEqual(len(warned), 1) - warning = warned[0] - self.assertTrue("without pyarrow" in str(warning)) - class TestPartitionRange(unittest.TestCase): def _get_target_class(self): From 07c70f0292f9212f0c968cd5c9206e8b0409c0da Mon Sep 17 00:00:00 2001 From: HemangChothani <50404902+HemangChothani@users.noreply.github.com> Date: Tue, 13 Oct 2020 02:53:19 -0400 Subject: [PATCH 06/18] feat: add method api_repr for table list item (#299) --- google/cloud/bigquery/table.py | 8 ++++++++ tests/unit/test_table.py | 11 +++++++++++ 2 files changed, 19 insertions(+) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 01e8815da..2214d0172 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1071,6 +1071,14 @@ def to_bqstorage(self): """ return self.reference.to_bqstorage() + def to_api_repr(self): + """Constructs the API resource of this table + + Returns: + Dict[str, object]: Table represented as an API resource + """ + return copy.deepcopy(self._properties) + def _row_from_mapping(mapping, schema): """Convert a mapping to a row tuple using the schema. diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index fe17d2852..376605521 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -1433,6 +1433,17 @@ def test_labels_update_in_place(self): labels["foo"] = "bar" # update in place self.assertEqual(table.labels, {"foo": "bar"}) + def test_to_api_repr(self): + resource = { + "tableReference": { + "projectId": "testproject", + "datasetId": "testdataset", + "tableId": "testtable", + } + } + table = self._make_one(resource) + self.assertEqual(table.to_api_repr(), resource) + class TestRow(unittest.TestCase): def test_row(self): From 5ea1ece2d911cdd1f3d9549ee01559ce8ed8269a Mon Sep 17 00:00:00 2001 From: Ryan Yuan Date: Wed, 14 Oct 2020 08:02:02 +1100 Subject: [PATCH 07/18] docs: update clustering field docstrings (#286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [x] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [x] Ensure the tests and linter pass - [x] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) Fixes #285 🦕 --- google/cloud/bigquery/job.py | 8 ++++---- google/cloud/bigquery/table.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/google/cloud/bigquery/job.py b/google/cloud/bigquery/job.py index 20bce597a..70db69e71 100644 --- a/google/cloud/bigquery/job.py +++ b/google/cloud/bigquery/job.py @@ -1073,8 +1073,8 @@ def clustering_fields(self): .. note:: - As of 2018-06-29, clustering fields cannot be set on a table - which does not also have time partioning defined. + BigQuery supports clustering for both partitioned and + non-partitioned tables. """ prop = self._get_sub_prop("clustering") if prop is not None: @@ -2554,8 +2554,8 @@ def clustering_fields(self): .. note:: - As of 2018-06-29, clustering fields cannot be set on a table - which does not also have time partioning defined. 
+ BigQuery supports clustering for both partitioned and + non-partitioned tables. """ prop = self._get_sub_prop("clustering") if prop is not None: diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 2214d0172..5474f643e 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -625,8 +625,8 @@ def clustering_fields(self): .. note:: - As of 2018-06-29, clustering fields cannot be set on a table - which does not also have time partioning defined. + BigQuery supports clustering for both partitioned and + non-partitioned tables. """ prop = self._properties.get("clustering") if prop is not None: @@ -1030,8 +1030,8 @@ def clustering_fields(self): .. note:: - As of 2018-06-29, clustering fields cannot be set on a table - which does not also have time partioning defined. + BigQuery supports clustering for both partitioned and + non-partitioned tables. """ prop = self._properties.get("clustering") if prop is not None: From 20f473bfff5ae98377f5d9cdf18bfe5554d86ff4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20BAPTISTE?= Date: Tue, 13 Oct 2020 23:26:05 +0200 Subject: [PATCH 08/18] fix: make TimePartitioning repr evaluable (#110) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #109 🦕 --- google/cloud/bigquery/table.py | 15 ++++++++++++++- tests/unit/test_table.py | 4 ++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 5474f643e..d6d966eee 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -2114,7 +2114,20 @@ def to_api_repr(self): return self._properties def _key(self): - return tuple(sorted(self._properties.items())) + # because we are only "renaming" top level keys shallow copy is sufficient here. + properties = self._properties.copy() + # calling repr for non built-in type objects. + properties["type_"] = repr(properties.pop("type")) + if "field" in properties: + # calling repr for non built-in type objects. 
+ properties["field"] = repr(properties["field"]) + if "requirePartitionFilter" in properties: + properties["require_partition_filter"] = properties.pop( + "requirePartitionFilter" + ) + if "expirationMs" in properties: + properties["expiration_ms"] = properties.pop("expirationMs") + return tuple(sorted(properties.items())) def __eq__(self, other): if not isinstance(other, TimePartitioning): diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 376605521..e21453b9f 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -3711,7 +3711,7 @@ def test___hash__not_equals(self): def test___repr___minimal(self): time_partitioning = self._make_one() - expected = "TimePartitioning(type=DAY)" + expected = "TimePartitioning(type_='DAY')" self.assertEqual(repr(time_partitioning), expected) def test___repr___explicit(self): @@ -3720,7 +3720,7 @@ def test___repr___explicit(self): time_partitioning = self._make_one( type_=TimePartitioningType.DAY, field="name", expiration_ms=10000 ) - expected = "TimePartitioning(" "expirationMs=10000," "field=name," "type=DAY)" + expected = "TimePartitioning(expiration_ms=10000,field='name',type_='DAY')" self.assertEqual(repr(time_partitioning), expected) def test_set_expiration_w_none(self): From c69cd50914c0676645b04d44ede9392a3d6dd5b1 Mon Sep 17 00:00:00 2001 From: WhiteSource Renovate Date: Tue, 13 Oct 2020 23:41:46 +0200 Subject: [PATCH 09/18] chore(deps): update dependency matplotlib to v3.3.2 (#260) Co-authored-by: Tim Swast --- samples/snippets/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index daabdf745..7d001fa2f 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -4,7 +4,7 @@ google-auth-oauthlib==0.4.1 grpcio==1.32.0 ipython==7.16.1; python_version < '3.7' ipython==7.17.0; python_version >= '3.7' -matplotlib==3.3.1 +matplotlib==3.3.2 pandas==1.1.3 pyarrow==1.0.1 pytz==2020.1 From 3be78b737add7111e24e912cd02fc6df75a07de6 Mon Sep 17 00:00:00 2001 From: HemangChothani <50404902+HemangChothani@users.noreply.github.com> Date: Wed, 14 Oct 2020 01:42:40 -0400 Subject: [PATCH 10/18] perf: add size parameter for load table from dataframe and json methods (#280) * feat: add size parameter for load from dataframe and json * pref: calculate length of encoded string --- google/cloud/bigquery/client.py | 7 +++++-- tests/unit/test_client.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index 2afffab80..b7e082daa 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -2237,11 +2237,13 @@ def load_table_from_dataframe( dataframe.to_parquet(tmppath, compression=parquet_compression) with open(tmppath, "rb") as parquet_file: + file_size = os.path.getsize(tmppath) return self.load_table_from_file( parquet_file, destination, num_retries=num_retries, rewind=True, + size=file_size, job_id=job_id, job_id_prefix=job_id_prefix, location=location, @@ -2343,11 +2345,12 @@ def load_table_from_json( destination = _table_arg_to_table_ref(destination, default_project=self.project) data_str = u"\n".join(json.dumps(item) for item in json_rows) - data_file = io.BytesIO(data_str.encode()) - + encoded_str = data_str.encode() + data_file = io.BytesIO(encoded_str) return self.load_table_from_file( data_file, destination, + size=len(encoded_str), num_retries=num_retries, job_id=job_id, 
job_id_prefix=job_id_prefix, diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 737c1aef7..52e00d7c7 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -7482,6 +7482,7 @@ def test_load_table_from_dataframe(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=None, @@ -7525,6 +7526,7 @@ def test_load_table_from_dataframe_w_client_location(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, @@ -7577,6 +7579,7 @@ def test_load_table_from_dataframe_w_custom_job_config_wihtout_source_format(sel self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, @@ -7631,6 +7634,7 @@ def test_load_table_from_dataframe_w_custom_job_config_w_source_format(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, @@ -7723,6 +7727,7 @@ def test_load_table_from_dataframe_w_automatic_schema(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, @@ -7782,6 +7787,7 @@ def test_load_table_from_dataframe_w_index_and_auto_schema(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, @@ -7827,6 +7833,7 @@ def test_load_table_from_dataframe_unknown_table(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=None, @@ -7867,6 +7874,7 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, @@ -7913,6 +7921,7 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(se self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, @@ -7973,6 +7982,7 @@ def test_load_table_from_dataframe_struct_fields(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, @@ -8046,6 +8056,7 @@ def test_load_table_from_dataframe_w_partial_schema(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, @@ -8139,6 +8150,7 @@ def test_load_table_from_dataframe_w_partial_schema_missing_types(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, @@ -8251,6 +8263,7 @@ def test_load_table_from_dataframe_w_nulls(self): self.TABLE_REF, num_retries=_DEFAULT_NUM_RETRIES, rewind=True, + size=mock.ANY, job_id=mock.ANY, job_id_prefix=None, location=self.LOCATION, @@ -8302,6 +8315,7 @@ def test_load_table_from_json_basic_use(self): client, mock.ANY, self.TABLE_REF, + size=mock.ANY, num_retries=_DEFAULT_NUM_RETRIES, job_id=mock.ANY, job_id_prefix=None, @@ -8353,6 +8367,7 @@ def test_load_table_from_json_non_default_args(self): client, mock.ANY, self.TABLE_REF, + size=mock.ANY, num_retries=_DEFAULT_NUM_RETRIES, job_id=mock.ANY, job_id_prefix=None, From 
fb401bd94477323bba68cf252dd88166495daf54 Mon Sep 17 00:00:00 2001 From: HemangChothani <50404902+HemangChothani@users.noreply.github.com> Date: Wed, 14 Oct 2020 10:05:18 -0400 Subject: [PATCH 11/18] feat: add to_api_repr method to Model (#326) --- google/cloud/bigquery/model.py | 8 +++++++ tests/unit/model/test_model.py | 44 ++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/google/cloud/bigquery/model.py b/google/cloud/bigquery/model.py index 092d98c2e..1143b71f9 100644 --- a/google/cloud/bigquery/model.py +++ b/google/cloud/bigquery/model.py @@ -317,6 +317,14 @@ def _build_resource(self, filter_fields): def __repr__(self): return "Model(reference={})".format(repr(self.reference)) + def to_api_repr(self): + """Construct the API resource representation of this model. + + Returns: + Dict[str, object]: Model reference represented as an API resource + """ + return json_format.MessageToDict(self._proto) + class ModelReference(object): """ModelReferences are pointers to models. diff --git a/tests/unit/model/test_model.py b/tests/unit/model/test_model.py index 2c0079429..9fa29a496 100644 --- a/tests/unit/model/test_model.py +++ b/tests/unit/model/test_model.py @@ -318,3 +318,47 @@ def test_repr(target_class): "Model(reference=ModelReference(" "project_id='my-proj', dataset_id='my_dset', model_id='my_model'))" ) + + +def test_to_api_repr(target_class): + from google.protobuf import json_format + + model = target_class("my-proj.my_dset.my_model") + resource = { + "etag": "abcdefg", + "modelReference": { + "projectId": "my-project", + "datasetId": "my_dataset", + "modelId": "my_model", + }, + "creationTime": "1274284800000", + "lastModifiedTime": "1317484800000", + "modelType": "LOGISTIC_REGRESSION", + "trainingRuns": [ + { + "trainingOptions": {"initialLearnRate": 1.0}, + "startTime": "2010-05-19T16:00:00Z", + }, + { + "trainingOptions": {"initialLearnRate": 0.5}, + "startTime": "2011-10-01T16:00:00Z", + }, + { + "trainingOptions": {"initialLearnRate": 0.25}, + "startTime": "2012-12-21T16:00:00Z", + }, + ], + "description": "A friendly description.", + "location": "US", + "friendlyName": "A friendly name.", + "labels": {"greeting": "こんにちは"}, + "expirationTime": "1356105600000", + "encryptionConfiguration": { + "kmsKeyName": "projects/1/locations/us/keyRings/1/cryptoKeys/1" + }, + } + model._proto = json_format.ParseDict( + resource, types.Model()._pb, ignore_unknown_fields=True + ) + got = model.to_api_repr() + assert got == resource From 5178b55682f5e264bfc082cde26acb1fdc953a18 Mon Sep 17 00:00:00 2001 From: Carlos de la Guardia Date: Wed, 14 Oct 2020 13:42:39 -0500 Subject: [PATCH 12/18] feat: allow client options to be set in magics context (#322) * feat: allow client options to be set in magics context * add separate client options for storage client --- google/cloud/bigquery/magics/magics.py | 98 ++++++++++++++++++++++++-- tests/unit/test_magics.py | 98 ++++++++++++++++++++++++-- 2 files changed, 188 insertions(+), 8 deletions(-) diff --git a/google/cloud/bigquery/magics/magics.py b/google/cloud/bigquery/magics/magics.py index 22175ee45..5645a84a5 100644 --- a/google/cloud/bigquery/magics/magics.py +++ b/google/cloud/bigquery/magics/magics.py @@ -139,6 +139,7 @@ import re import ast +import copy import functools import sys import time @@ -155,6 +156,7 @@ import six from google.api_core import client_info +from google.api_core import client_options from google.api_core.exceptions import NotFound import google.auth from google.cloud import bigquery @@ -178,11 
+180,13 @@ def __init__(self): self._project = None self._connection = None self._default_query_job_config = bigquery.QueryJobConfig() + self._bigquery_client_options = client_options.ClientOptions() + self._bqstorage_client_options = client_options.ClientOptions() @property def credentials(self): """google.auth.credentials.Credentials: Credentials to use for queries - performed through IPython magics + performed through IPython magics. Note: These credentials do not need to be explicitly defined if you are @@ -217,7 +221,7 @@ def credentials(self, value): @property def project(self): """str: Default project to use for queries performed through IPython - magics + magics. Note: The project does not need to be explicitly defined if you have an @@ -239,6 +243,54 @@ def project(self): def project(self, value): self._project = value + @property + def bigquery_client_options(self): + """google.api_core.client_options.ClientOptions: client options to be + used through IPython magics. + + Note:: + The client options do not need to be explicitly defined if no + special network connections are required. Normally you would be + using the https://bigquery.googleapis.com/ end point. + + Example: + Manually setting the endpoint: + + >>> from google.cloud.bigquery import magics + >>> client_options = {} + >>> client_options['api_endpoint'] = "https://some.special.url" + >>> magics.context.bigquery_client_options = client_options + """ + return self._bigquery_client_options + + @bigquery_client_options.setter + def bigquery_client_options(self, value): + self._bigquery_client_options = value + + @property + def bqstorage_client_options(self): + """google.api_core.client_options.ClientOptions: client options to be + used through IPython magics for the storage client. + + Note:: + The client options do not need to be explicitly defined if no + special network connections are required. Normally you would be + using the https://bigquerystorage.googleapis.com/ end point. + + Example: + Manually setting the endpoint: + + >>> from google.cloud.bigquery import magics + >>> client_options = {} + >>> client_options['api_endpoint'] = "https://some.special.url" + >>> magics.context.bqstorage_client_options = client_options + """ + return self._bqstorage_client_options + + @bqstorage_client_options.setter + def bqstorage_client_options(self, value): + self._bqstorage_client_options = value + @property def default_query_job_config(self): """google.cloud.bigquery.job.QueryJobConfig: Default job @@ -410,6 +462,24 @@ def _create_dataset_if_necessary(client, dataset_id): "Standard SQL if this argument is not used." ), ) +@magic_arguments.argument( + "--bigquery_api_endpoint", + type=str, + default=None, + help=( + "The desired API endpoint, e.g., bigquery.googlepis.com. Defaults to this " + "option's value in the context bigquery_client_options." + ), +) +@magic_arguments.argument( + "--bqstorage_api_endpoint", + type=str, + default=None, + help=( + "The desired API endpoint, e.g., bigquerystorage.googlepis.com. Defaults to " + "this option's value in the context bqstorage_client_options." 
+ ), +) @magic_arguments.argument( "--use_bqstorage_api", action="store_true", @@ -511,15 +581,34 @@ def _cell_magic(line, query): params = _helpers.to_query_parameters(ast.literal_eval(params_option_value)) project = args.project or context.project + + bigquery_client_options = copy.deepcopy(context.bigquery_client_options) + if args.bigquery_api_endpoint: + if isinstance(bigquery_client_options, dict): + bigquery_client_options["api_endpoint"] = args.bigquery_api_endpoint + else: + bigquery_client_options.api_endpoint = args.bigquery_api_endpoint + client = bigquery.Client( project=project, credentials=context.credentials, default_query_job_config=context.default_query_job_config, client_info=client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT), + client_options=bigquery_client_options, ) if context._connection: client._connection = context._connection - bqstorage_client = _make_bqstorage_client(use_bqstorage_api, context.credentials) + + bqstorage_client_options = copy.deepcopy(context.bqstorage_client_options) + if args.bqstorage_api_endpoint: + if isinstance(bqstorage_client_options, dict): + bqstorage_client_options["api_endpoint"] = args.bqstorage_api_endpoint + else: + bqstorage_client_options.api_endpoint = args.bqstorage_api_endpoint + + bqstorage_client = _make_bqstorage_client( + use_bqstorage_api, context.credentials, bqstorage_client_options, + ) close_transports = functools.partial(_close_transports, client, bqstorage_client) @@ -632,7 +721,7 @@ def _split_args_line(line): return params_option_value, rest_of_args -def _make_bqstorage_client(use_bqstorage_api, credentials): +def _make_bqstorage_client(use_bqstorage_api, credentials, client_options): if not use_bqstorage_api: return None @@ -658,6 +747,7 @@ def _make_bqstorage_client(use_bqstorage_api, credentials): return bigquery_storage.BigQueryReadClient( credentials=credentials, client_info=gapic_client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT), + client_options=client_options, ) diff --git a/tests/unit/test_magics.py b/tests/unit/test_magics.py index 20be6b755..30ca4d70c 100644 --- a/tests/unit/test_magics.py +++ b/tests/unit/test_magics.py @@ -309,7 +309,7 @@ def test__make_bqstorage_client_false(): credentials_mock = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) - got = magics._make_bqstorage_client(False, credentials_mock) + got = magics._make_bqstorage_client(False, credentials_mock, {}) assert got is None @@ -320,7 +320,7 @@ def test__make_bqstorage_client_true(): credentials_mock = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) - got = magics._make_bqstorage_client(True, credentials_mock) + got = magics._make_bqstorage_client(True, credentials_mock, {}) assert isinstance(got, bigquery_storage.BigQueryReadClient) @@ -330,7 +330,7 @@ def test__make_bqstorage_client_true_raises_import_error(missing_bq_storage): ) with pytest.raises(ImportError) as exc_context, missing_bq_storage: - magics._make_bqstorage_client(True, credentials_mock) + magics._make_bqstorage_client(True, credentials_mock, {}) error_msg = str(exc_context.value) assert "google-cloud-bigquery-storage" in error_msg @@ -347,7 +347,7 @@ def test__make_bqstorage_client_true_missing_gapic(missing_grpcio_lib): ) with pytest.raises(ImportError) as exc_context, missing_grpcio_lib: - magics._make_bqstorage_client(True, credentials_mock) + magics._make_bqstorage_client(True, credentials_mock, {}) assert "grpcio" in str(exc_context.value) @@ -1180,6 +1180,96 @@ def test_bigquery_magic_with_project(): 
assert magics.context.project == "general-project" +@pytest.mark.usefixtures("ipython_interactive") +def test_bigquery_magic_with_bigquery_api_endpoint(ipython_ns_cleanup): + ip = IPython.get_ipython() + ip.extension_manager.load_extension("google.cloud.bigquery") + magics.context._connection = None + + run_query_patch = mock.patch( + "google.cloud.bigquery.magics.magics._run_query", autospec=True + ) + with run_query_patch as run_query_mock: + ip.run_cell_magic( + "bigquery", + "--bigquery_api_endpoint=https://bigquery_api.endpoint.com", + "SELECT 17 as num", + ) + + connection_used = run_query_mock.call_args_list[0][0][0]._connection + assert connection_used.API_BASE_URL == "https://bigquery_api.endpoint.com" + # context client options should not change + assert magics.context.bigquery_client_options.api_endpoint is None + + +@pytest.mark.usefixtures("ipython_interactive") +def test_bigquery_magic_with_bigquery_api_endpoint_context_dict(): + ip = IPython.get_ipython() + ip.extension_manager.load_extension("google.cloud.bigquery") + magics.context._connection = None + magics.context.bigquery_client_options = {} + + run_query_patch = mock.patch( + "google.cloud.bigquery.magics.magics._run_query", autospec=True + ) + with run_query_patch as run_query_mock: + ip.run_cell_magic( + "bigquery", + "--bigquery_api_endpoint=https://bigquery_api.endpoint.com", + "SELECT 17 as num", + ) + + connection_used = run_query_mock.call_args_list[0][0][0]._connection + assert connection_used.API_BASE_URL == "https://bigquery_api.endpoint.com" + # context client options should not change + assert magics.context.bigquery_client_options == {} + + +@pytest.mark.usefixtures("ipython_interactive") +def test_bigquery_magic_with_bqstorage_api_endpoint(ipython_ns_cleanup): + ip = IPython.get_ipython() + ip.extension_manager.load_extension("google.cloud.bigquery") + magics.context._connection = None + + run_query_patch = mock.patch( + "google.cloud.bigquery.magics.magics._run_query", autospec=True + ) + with run_query_patch as run_query_mock: + ip.run_cell_magic( + "bigquery", + "--bqstorage_api_endpoint=https://bqstorage_api.endpoint.com", + "SELECT 17 as num", + ) + + client_used = run_query_mock.mock_calls[1][2]["bqstorage_client"] + assert client_used._transport._host == "https://bqstorage_api.endpoint.com" + # context client options should not change + assert magics.context.bqstorage_client_options.api_endpoint is None + + +@pytest.mark.usefixtures("ipython_interactive") +def test_bigquery_magic_with_bqstorage_api_endpoint_context_dict(): + ip = IPython.get_ipython() + ip.extension_manager.load_extension("google.cloud.bigquery") + magics.context._connection = None + magics.context.bqstorage_client_options = {} + + run_query_patch = mock.patch( + "google.cloud.bigquery.magics.magics._run_query", autospec=True + ) + with run_query_patch as run_query_mock: + ip.run_cell_magic( + "bigquery", + "--bqstorage_api_endpoint=https://bqstorage_api.endpoint.com", + "SELECT 17 as num", + ) + + client_used = run_query_mock.mock_calls[1][2]["bqstorage_client"] + assert client_used._transport._host == "https://bqstorage_api.endpoint.com" + # context client options should not change + assert magics.context.bqstorage_client_options == {} + + @pytest.mark.usefixtures("ipython_interactive") def test_bigquery_magic_with_multiple_options(): ip = IPython.get_ipython() From d093cd4ee255bbcbe5a6a198e819b876b4aa51f9 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 16 Oct 2020 04:55:34 -0500 Subject: [PATCH 13/18] refactor: update Job 
classes to use common _properties pattern (#323) Instead of mixing _properties and plain-old Python objects, always use _properties as the source of truth. This has the side-effect of properly reloading the whole job resource. Previously some properties were not reloaded. --- google/cloud/bigquery/job.py | 364 ++++++++++++++++++----------------- tests/unit/test_client.py | 69 ++++++- tests/unit/test_job.py | 62 ++---- 3 files changed, 259 insertions(+), 236 deletions(-) diff --git a/google/cloud/bigquery/job.py b/google/cloud/bigquery/job.py index 70db69e71..766db1d42 100644 --- a/google/cloud/bigquery/job.py +++ b/google/cloud/bigquery/job.py @@ -35,6 +35,7 @@ from google.cloud.bigquery.external_config import ExternalConfig from google.cloud.bigquery.external_config import HivePartitioningOptions from google.cloud.bigquery import _helpers +from google.cloud.bigquery.model import ModelReference from google.cloud.bigquery.query import _query_param_from_api_repr from google.cloud.bigquery.query import ArrayQueryParameter from google.cloud.bigquery.query import ScalarQueryParameter @@ -47,8 +48,9 @@ from google.cloud.bigquery.table import _EmptyRowIterator from google.cloud.bigquery.table import RangePartitioning from google.cloud.bigquery.table import _table_arg_to_table_ref -from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.table import Table +from google.cloud.bigquery.table import TableListItem +from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.table import TimePartitioning _DONE_STATE = "DONE" @@ -461,11 +463,11 @@ def created(self): Optional[datetime.datetime]: the creation time (None until set from the server). """ - statistics = self._properties.get("statistics") - if statistics is not None: - millis = statistics.get("creationTime") - if millis is not None: - return _helpers._datetime_from_microseconds(millis * 1000.0) + millis = _helpers._get_sub_prop( + self._properties, ["statistics", "creationTime"] + ) + if millis is not None: + return _helpers._datetime_from_microseconds(millis * 1000.0) @property def started(self): @@ -475,11 +477,9 @@ def started(self): Optional[datetime.datetime]: the start time (None until set from the server). """ - statistics = self._properties.get("statistics") - if statistics is not None: - millis = statistics.get("startTime") - if millis is not None: - return _helpers._datetime_from_microseconds(millis * 1000.0) + millis = _helpers._get_sub_prop(self._properties, ["statistics", "startTime"]) + if millis is not None: + return _helpers._datetime_from_microseconds(millis * 1000.0) @property def ended(self): @@ -489,11 +489,9 @@ def ended(self): Optional[datetime.datetime]: the end time (None until set from the server). 
""" - statistics = self._properties.get("statistics") - if statistics is not None: - millis = statistics.get("endTime") - if millis is not None: - return _helpers._datetime_from_microseconds(millis * 1000.0) + millis = _helpers._get_sub_prop(self._properties, ["statistics", "endTime"]) + if millis is not None: + return _helpers._datetime_from_microseconds(millis * 1000.0) def _job_statistics(self): """Helper for job-type specific statistics-based properties.""" @@ -535,14 +533,6 @@ def state(self): if status is not None: return status.get("state") - def _scrub_local_properties(self, cleaned): - """Helper: handle subclass properties in cleaned.""" - pass - - def _copy_configuration_properties(self, configuration): - """Helper: assign subclass configuration properties in cleaned.""" - raise NotImplementedError("Abstract") - def _set_properties(self, api_response): """Update properties from resource in body of ``api_response`` @@ -550,7 +540,6 @@ def _set_properties(self, api_response): api_response (Dict): response returned from an API call. """ cleaned = api_response.copy() - self._scrub_local_properties(cleaned) statistics = cleaned.get("statistics", {}) if "creationTime" in statistics: @@ -560,25 +549,24 @@ def _set_properties(self, api_response): if "endTime" in statistics: statistics["endTime"] = float(statistics["endTime"]) + # Save configuration to keep reference same in self._configuration. + cleaned_config = cleaned.pop("configuration", {}) + configuration = self._properties.pop("configuration", {}) self._properties.clear() self._properties.update(cleaned) - self._copy_configuration_properties(cleaned.get("configuration", {})) + self._properties["configuration"] = configuration + self._properties["configuration"].update(cleaned_config) # For Future interface self._set_future_result() @classmethod - def _get_resource_config(cls, resource): + def _check_resource_config(cls, resource): """Helper for :meth:`from_api_repr` Args: resource (Dict): resource for the job. - Returns: - (str, Dict): - tuple (string, dict), where the first element is the - job ID and the second contains job-specific configuration. - Raises: KeyError: If the resource has no identifier, or @@ -589,7 +577,6 @@ def _get_resource_config(cls, resource): "Resource lacks required identity information: " '["jobReference"]["jobId"]' ) - job_id = resource["jobReference"]["jobId"] if ( "configuration" not in resource or cls._JOB_TYPE not in resource["configuration"] @@ -598,7 +585,6 @@ def _get_resource_config(cls, resource): "Resource lacks required configuration: " '["configuration"]["%s"]' % cls._JOB_TYPE ) - return job_id, resource["configuration"] def to_api_repr(self): """Generate a resource for the job.""" @@ -1002,15 +988,15 @@ def from_api_repr(cls, resource): Args: resource (Dict): - An extract job configuration in the same representation as is - returned from the API. + A job configuration in the same representation as is returned + from the API. Returns: google.cloud.bigquery.job._JobConfig: Configuration parsed from ``resource``. 
""" - config = cls() - config._properties = copy.deepcopy(resource) - return config + job_config = cls() + job_config._properties = resource + return job_config class LoadJobConfig(_JobConfig): @@ -1450,12 +1436,23 @@ class LoadJob(_AsyncJob): def __init__(self, job_id, source_uris, destination, client, job_config=None): super(LoadJob, self).__init__(job_id, client) - if job_config is None: + if not job_config: job_config = LoadJobConfig() - self.source_uris = source_uris - self._destination = destination self._configuration = job_config + self._properties["configuration"] = job_config._properties + + if source_uris is not None: + _helpers._set_sub_prop( + self._properties, ["configuration", "load", "sourceUris"], source_uris + ) + + if destination is not None: + _helpers._set_sub_prop( + self._properties, + ["configuration", "load", "destinationTable"], + destination.to_api_repr(), + ) @property def destination(self): @@ -1464,7 +1461,20 @@ def destination(self): See: https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.destination_table """ - return self._destination + dest_config = _helpers._get_sub_prop( + self._properties, ["configuration", "load", "destinationTable"] + ) + return TableReference.from_api_repr(dest_config) + + @property + def source_uris(self): + """Optional[Sequence[str]]: URIs of data files to be loaded. See + https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_uris + for supported URI formats. None for jobs that load from a file. + """ + return _helpers._get_sub_prop( + self._properties, ["configuration", "load", "sourceUris"] + ) @property def allow_jagged_rows(self): @@ -1687,24 +1697,12 @@ def output_rows(self): def to_api_repr(self): """Generate a resource for :meth:`_begin`.""" - configuration = self._configuration.to_api_repr() - if self.source_uris is not None: - _helpers._set_sub_prop( - configuration, ["load", "sourceUris"], self.source_uris - ) - _helpers._set_sub_prop( - configuration, ["load", "destinationTable"], self.destination.to_api_repr() - ) - + # Exclude statistics, if set. return { "jobReference": self._properties["jobReference"], - "configuration": configuration, + "configuration": self._properties["configuration"], } - def _copy_configuration_properties(self, configuration): - """Helper: assign subclass configuration properties in cleaned.""" - self._configuration._properties = copy.deepcopy(configuration) - @classmethod def from_api_repr(cls, resource, client): """Factory: construct a job given its API representation @@ -1724,16 +1722,9 @@ def from_api_repr(cls, resource, client): Returns: google.cloud.bigquery.job.LoadJob: Job parsed from ``resource``. """ - config_resource = resource.get("configuration", {}) - config = LoadJobConfig.from_api_repr(config_resource) - # A load job requires a destination table. - dest_config = config_resource["load"]["destinationTable"] - ds_ref = DatasetReference(dest_config["projectId"], dest_config["datasetId"]) - destination = TableReference(ds_ref, dest_config["tableId"]) - # sourceUris will be absent if this is a file upload. 
- source_uris = _helpers._get_sub_prop(config_resource, ["load", "sourceUris"]) + cls._check_resource_config(resource) job_ref = _JobReference._from_api_repr(resource["jobReference"]) - job = cls(job_ref, source_uris, destination, client, config) + job = cls(job_ref, None, None, client) job._set_properties(resource) return job @@ -1824,12 +1815,59 @@ class CopyJob(_AsyncJob): def __init__(self, job_id, sources, destination, client, job_config=None): super(CopyJob, self).__init__(job_id, client) - if job_config is None: + if not job_config: job_config = CopyJobConfig() - self.destination = destination - self.sources = sources self._configuration = job_config + self._properties["configuration"] = job_config._properties + + if destination: + _helpers._set_sub_prop( + self._properties, + ["configuration", "copy", "destinationTable"], + destination.to_api_repr(), + ) + + if sources: + source_resources = [source.to_api_repr() for source in sources] + _helpers._set_sub_prop( + self._properties, + ["configuration", "copy", "sourceTables"], + source_resources, + ) + + @property + def destination(self): + """google.cloud.bigquery.table.TableReference: Table into which data + is to be loaded. + """ + return TableReference.from_api_repr( + _helpers._get_sub_prop( + self._properties, ["configuration", "copy", "destinationTable"], + ) + ) + + @property + def sources(self): + """List[google.cloud.bigquery.table.TableReference]): Table(s) from + which data is to be loaded. + """ + source_configs = _helpers._get_sub_prop( + self._properties, ["configuration", "copy", "sourceTables"] + ) + if source_configs is None: + single = _helpers._get_sub_prop( + self._properties, ["configuration", "copy", "sourceTable"] + ) + if single is None: + raise KeyError("Resource missing 'sourceTables' / 'sourceTable'") + source_configs = [single] + + sources = [] + for source_config in source_configs: + table_ref = TableReference.from_api_repr(source_config) + sources.append(table_ref) + return sources @property def create_disposition(self): @@ -1860,40 +1898,15 @@ def destination_encryption_configuration(self): def to_api_repr(self): """Generate a resource for :meth:`_begin`.""" - - source_refs = [ - { - "projectId": table.project, - "datasetId": table.dataset_id, - "tableId": table.table_id, - } - for table in self.sources - ] - - configuration = self._configuration.to_api_repr() - _helpers._set_sub_prop(configuration, ["copy", "sourceTables"], source_refs) - _helpers._set_sub_prop( - configuration, - ["copy", "destinationTable"], - { - "projectId": self.destination.project, - "datasetId": self.destination.dataset_id, - "tableId": self.destination.table_id, - }, - ) - + # Exclude statistics, if set. return { "jobReference": self._properties["jobReference"], - "configuration": configuration, + "configuration": self._properties["configuration"], } - def _copy_configuration_properties(self, configuration): - """Helper: assign subclass configuration properties in cleaned.""" - self._configuration._properties = copy.deepcopy(configuration) - @classmethod def from_api_repr(cls, resource, client): - """Factory: construct a job given its API representation + """Factory: construct a job given its API representation .. note: @@ -1902,7 +1915,6 @@ def from_api_repr(cls, resource, client): Args: resource (Dict): dataset job representation returned from the API - client (google.cloud.bigquery.client.Client): Client which holds credentials and project configuration for the dataset. 
@@ -1910,22 +1922,9 @@ def from_api_repr(cls, resource, client): Returns: google.cloud.bigquery.job.CopyJob: Job parsed from ``resource``. """ - job_id, config_resource = cls._get_resource_config(resource) - config = CopyJobConfig.from_api_repr(config_resource) - # Copy required fields to the job. - copy_resource = config_resource["copy"] - destination = TableReference.from_api_repr(copy_resource["destinationTable"]) - sources = [] - source_configs = copy_resource.get("sourceTables") - if source_configs is None: - single = copy_resource.get("sourceTable") - if single is None: - raise KeyError("Resource missing 'sourceTables' / 'sourceTable'") - source_configs = [single] - for source_config in source_configs: - table_ref = TableReference.from_api_repr(source_config) - sources.append(table_ref) - job = cls(job_id, sources, destination, client=client, job_config=config) + cls._check_resource_config(resource) + job_ref = _JobReference._from_api_repr(resource["jobReference"]) + job = cls(job_ref, None, None, client=client) job._set_properties(resource) return job @@ -2038,10 +2037,61 @@ def __init__(self, job_id, source, destination_uris, client, job_config=None): if job_config is None: job_config = ExtractJobConfig() - self.source = source - self.destination_uris = destination_uris + self._properties["configuration"] = job_config._properties self._configuration = job_config + if source: + source_ref = { + "projectId": source.project, + "datasetId": source.dataset_id, + } + + if isinstance(source, (Table, TableListItem, TableReference)): + source_ref["tableId"] = source.table_id + source_key = "sourceTable" + else: + source_ref["modelId"] = source.model_id + source_key = "sourceModel" + + _helpers._set_sub_prop( + self._properties, ["configuration", "extract", source_key], source_ref + ) + + if destination_uris: + _helpers._set_sub_prop( + self._properties, + ["configuration", "extract", "destinationUris"], + destination_uris, + ) + + @property + def source(self): + """Union[ \ + google.cloud.bigquery.table.TableReference, \ + google.cloud.bigquery.model.ModelReference \ + ]: Table or Model from which data is to be loaded or extracted. + """ + source_config = _helpers._get_sub_prop( + self._properties, ["configuration", "extract", "sourceTable"] + ) + if source_config: + return TableReference.from_api_repr(source_config) + else: + source_config = _helpers._get_sub_prop( + self._properties, ["configuration", "extract", "sourceModel"] + ) + return ModelReference.from_api_repr(source_config) + + @property + def destination_uris(self): + """List[str]: URIs describing where the extracted data will be + written in Cloud Storage, using the format + ``gs:///``. + """ + return _helpers._get_sub_prop( + self._properties, ["configuration", "extract", "destinationUris"] + ) + @property def compression(self): """See @@ -2092,34 +2142,12 @@ def destination_uri_file_counts(self): def to_api_repr(self): """Generate a resource for :meth:`_begin`.""" - - configuration = self._configuration.to_api_repr() - source_ref = { - "projectId": self.source.project, - "datasetId": self.source.dataset_id, - } - - source = "sourceTable" - if isinstance(self.source, TableReference): - source_ref["tableId"] = self.source.table_id - else: - source_ref["modelId"] = self.source.model_id - source = "sourceModel" - - _helpers._set_sub_prop(configuration, ["extract", source], source_ref) - _helpers._set_sub_prop( - configuration, ["extract", "destinationUris"], self.destination_uris - ) - + # Exclude statistics, if set. 
return { "jobReference": self._properties["jobReference"], - "configuration": configuration, + "configuration": self._properties["configuration"], } - def _copy_configuration_properties(self, configuration): - """Helper: assign subclass configuration properties in cleaned.""" - self._configuration._properties = copy.deepcopy(configuration) - @classmethod def from_api_repr(cls, resource, client): """Factory: construct a job given its API representation @@ -2139,30 +2167,9 @@ def from_api_repr(cls, resource, client): Returns: google.cloud.bigquery.job.ExtractJob: Job parsed from ``resource``. """ - job_id, config_resource = cls._get_resource_config(resource) - config = ExtractJobConfig.from_api_repr(config_resource) - source_config = _helpers._get_sub_prop( - config_resource, ["extract", "sourceTable"] - ) - if source_config: - dataset = DatasetReference( - source_config["projectId"], source_config["datasetId"] - ) - source = dataset.table(source_config["tableId"]) - else: - source_config = _helpers._get_sub_prop( - config_resource, ["extract", "sourceModel"] - ) - dataset = DatasetReference( - source_config["projectId"], source_config["datasetId"] - ) - source = dataset.model(source_config["modelId"]) - - destination_uris = _helpers._get_sub_prop( - config_resource, ["extract", "destinationUris"] - ) - - job = cls(job_id, source, destination_uris, client=client, job_config=config) + cls._check_resource_config(resource) + job_ref = _JobReference._from_api_repr(resource["jobReference"]) + job = cls(job_ref, None, None, client=client) job._set_properties(resource) return job @@ -2631,11 +2638,14 @@ def __init__(self, job_id, query, client, job_config=None): if job_config.use_legacy_sql is None: job_config.use_legacy_sql = False - _helpers._set_sub_prop( - self._properties, ["configuration", "query", "query"], query - ) - + self._properties["configuration"] = job_config._properties self._configuration = job_config + + if query: + _helpers._set_sub_prop( + self._properties, ["configuration", "query", "query"], query + ) + self._query_results = None self._done_timeout = None self._transport_timeout = None @@ -2799,19 +2809,13 @@ def schema_update_options(self): def to_api_repr(self): """Generate a resource for :meth:`_begin`.""" + # Use to_api_repr to allow for some configuration properties to be set + # automatically. configuration = self._configuration.to_api_repr() - - resource = { + return { "jobReference": self._properties["jobReference"], "configuration": configuration, } - configuration["query"]["query"] = self.query - - return resource - - def _copy_configuration_properties(self, configuration): - """Helper: assign subclass configuration properties in cleaned.""" - self._configuration._properties = copy.deepcopy(configuration) @classmethod def from_api_repr(cls, resource, client): @@ -2827,9 +2831,9 @@ def from_api_repr(cls, resource, client): Returns: google.cloud.bigquery.job.QueryJob: Job parsed from ``resource``. 
""" - job_id, config = cls._get_resource_config(resource) - query = _helpers._get_sub_prop(config, ["query", "query"]) - job = cls(job_id, query, client=client) + cls._check_resource_config(resource) + job_ref = _JobReference._from_api_repr(resource["jobReference"]) + job = cls(job_ref, None, client=client) job._set_properties(resource) return job diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 52e00d7c7..bc2658961 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -4266,7 +4266,7 @@ def test_load_table_from_uri(self): self.assertIs(job._client, client) self.assertEqual(job.job_id, JOB) self.assertEqual(list(job.source_uris), [SOURCE_URI]) - self.assertIs(job.destination, destination) + self.assertEqual(job.destination, destination) conn = client._connection = make_connection(RESOURCE) @@ -4275,7 +4275,7 @@ def test_load_table_from_uri(self): self.assertIs(job._client, client) self.assertEqual(job.job_id, JOB) self.assertEqual(list(job.source_uris), [SOURCE_URI]) - self.assertIs(job.destination, destination) + self.assertEqual(job.destination, destination) def test_load_table_from_uri_w_explicit_project(self): job_id = "this-is-a-job-id" @@ -4576,16 +4576,67 @@ def test_copy_table(self): self.assertIs(job._client, client) self.assertEqual(job.job_id, JOB) self.assertEqual(list(job.sources), [source]) - self.assertIs(job.destination, destination) + self.assertEqual(job.destination, destination) - conn = client._connection = make_connection(RESOURCE) - source2 = dataset.table(SOURCE + "2") - job = client.copy_table([source, source2], destination, job_id=JOB) + def test_copy_table_w_multiple_sources(self): + from google.cloud.bigquery.job import CopyJob + from google.cloud.bigquery.table import TableReference + + job_id = "job_name" + source_id = "my-project.my_dataset.source_table" + source_id2 = "my-project.my_dataset.source_table2" + destination_id = "my-other-project.another_dataset.destination_table" + expected_resource = { + "jobReference": {"projectId": self.PROJECT, "jobId": job_id}, + "configuration": { + "copy": { + "sourceTables": [ + { + "projectId": "my-project", + "datasetId": "my_dataset", + "tableId": "source_table", + }, + { + "projectId": "my-project", + "datasetId": "my_dataset", + "tableId": "source_table2", + }, + ], + "destinationTable": { + "projectId": "my-other-project", + "datasetId": "another_dataset", + "tableId": "destination_table", + }, + } + }, + } + returned_resource = expected_resource.copy() + returned_resource["statistics"] = {} + creds = _make_credentials() + http = object() + client = self._make_one(project=self.PROJECT, credentials=creds, _http=http) + conn = client._connection = make_connection(returned_resource) + + job = client.copy_table([source_id, source_id2], destination_id, job_id=job_id) + + # Check that copy_table actually starts the job. 
+ conn.api_request.assert_called_once_with( + method="POST", + path="/projects/%s/jobs" % self.PROJECT, + data=expected_resource, + timeout=None, + ) self.assertIsInstance(job, CopyJob) self.assertIs(job._client, client) - self.assertEqual(job.job_id, JOB) - self.assertEqual(list(job.sources), [source, source2]) - self.assertIs(job.destination, destination) + self.assertEqual(job.job_id, job_id) + self.assertEqual( + list(sorted(job.sources, key=lambda tbl: tbl.table_id)), + [ + TableReference.from_string(source_id), + TableReference.from_string(source_id2), + ], + ) + self.assertEqual(job.destination, TableReference.from_string(destination_id)) def test_copy_table_w_explicit_project(self): job_id = "this-is-a-job-id" diff --git a/tests/unit/test_job.py b/tests/unit/test_job.py index d21489616..75212ae95 100644 --- a/tests/unit/test_job.py +++ b/tests/unit/test_job.py @@ -455,28 +455,9 @@ def test_state(self): status["state"] = state self.assertEqual(job.state, state) - def test__scrub_local_properties(self): - before = {"foo": "bar"} - resource = before.copy() - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - job._scrub_local_properties(resource) # no raise - self.assertEqual(resource, before) - - def test__copy_configuration_properties(self): - before = {"foo": "bar"} - resource = before.copy() - client = _make_client(project=self.PROJECT) - job = self._make_one(self.JOB_ID, client) - with self.assertRaises(NotImplementedError): - job._copy_configuration_properties(resource) - self.assertEqual(resource, before) - def _set_properties_job(self): client = _make_client(project=self.PROJECT) job = self._make_one(self.JOB_ID, client) - job._scrub_local_properties = mock.Mock() - job._copy_configuration_properties = mock.Mock() job._set_future_result = mock.Mock() job._properties = { "jobReference": job._properties["jobReference"], @@ -493,9 +474,6 @@ def test__set_properties_no_stats(self): self.assertEqual(job._properties, resource) - job._scrub_local_properties.assert_called_once_with(resource) - job._copy_configuration_properties.assert_called_once_with(config) - def test__set_properties_w_creation_time(self): now, millis = self._datetime_and_millis() config = {"test": True} @@ -509,9 +487,6 @@ def test__set_properties_w_creation_time(self): cleaned["statistics"]["creationTime"] = float(millis) self.assertEqual(job._properties, cleaned) - job._scrub_local_properties.assert_called_once_with(resource) - job._copy_configuration_properties.assert_called_once_with(config) - def test__set_properties_w_start_time(self): now, millis = self._datetime_and_millis() config = {"test": True} @@ -525,9 +500,6 @@ def test__set_properties_w_start_time(self): cleaned["statistics"]["startTime"] = float(millis) self.assertEqual(job._properties, cleaned) - job._scrub_local_properties.assert_called_once_with(resource) - job._copy_configuration_properties.assert_called_once_with(config) - def test__set_properties_w_end_time(self): now, millis = self._datetime_and_millis() config = {"test": True} @@ -541,38 +513,35 @@ def test__set_properties_w_end_time(self): cleaned["statistics"]["endTime"] = float(millis) self.assertEqual(job._properties, cleaned) - job._scrub_local_properties.assert_called_once_with(resource) - job._copy_configuration_properties.assert_called_once_with(config) - - def test__get_resource_config_missing_job_ref(self): + def test__check_resource_config_missing_job_ref(self): resource = {} klass = self._make_derived_class() with 
self.assertRaises(KeyError): - klass._get_resource_config(resource) + klass._check_resource_config(resource) - def test__get_resource_config_missing_job_id(self): + def test__check_resource_config_missing_job_id(self): resource = {"jobReference": {}} klass = self._make_derived_class() with self.assertRaises(KeyError): - klass._get_resource_config(resource) + klass._check_resource_config(resource) - def test__get_resource_config_missing_configuration(self): + def test__check_resource_config_missing_configuration(self): resource = {"jobReference": {"jobId": self.JOB_ID}} klass = self._make_derived_class() with self.assertRaises(KeyError): - klass._get_resource_config(resource) + klass._check_resource_config(resource) - def test__get_resource_config_missing_config_type(self): + def test__check_resource_config_missing_config_type(self): resource = {"jobReference": {"jobId": self.JOB_ID}, "configuration": {}} klass = self._make_derived_class() with self.assertRaises(KeyError): - klass._get_resource_config(resource) + klass._check_resource_config(resource) - def test__get_resource_config_ok(self): + def test__check_resource_config_ok(self): derived_config = {"foo": "bar"} resource = { "jobReference": {"jobId": self.JOB_ID}, @@ -580,10 +549,8 @@ def test__get_resource_config_ok(self): } klass = self._make_derived_class() - job_id, config = klass._get_resource_config(resource) - - self.assertEqual(job_id, self.JOB_ID) - self.assertEqual(config, {"derived": derived_config}) + # Should not throw. + klass._check_resource_config(resource) def test__build_resource(self): client = _make_client(project=self.PROJECT) @@ -2093,7 +2060,7 @@ def _verifyResourceProperties(self, job, resource): def test_ctor(self): client = _make_client(project=self.PROJECT) job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client) - self.assertIs(job.destination, self.TABLE_REF) + self.assertEqual(job.destination, self.TABLE_REF) self.assertEqual(list(job.source_uris), [self.SOURCE1]) self.assertIs(job._client, client) self.assertEqual(job.job_type, self.JOB_TYPE) @@ -2907,7 +2874,7 @@ def test_ctor(self): source = self._table_ref(self.SOURCE_TABLE) destination = self._table_ref(self.DESTINATION_TABLE) job = self._make_one(self.JOB_ID, [source], destination, client) - self.assertIs(job.destination, destination) + self.assertEqual(job.destination, destination) self.assertEqual(job.sources, [source]) self.assertIs(job._client, client) self.assertEqual(job.job_type, self.JOB_TYPE) @@ -3041,8 +3008,9 @@ def test_from_api_repr_wo_sources(self): }, } klass = self._get_target_class() + job = klass.from_api_repr(RESOURCE, client=client) with self.assertRaises(KeyError): - klass.from_api_repr(RESOURCE, client=client) + _ = job.sources def test_from_api_repr_w_properties(self): from google.cloud.bigquery.job import CreateDisposition From cebb5e0e911e8c9059bc8c9e7fce4440e518bff3 Mon Sep 17 00:00:00 2001 From: Kumar Anirudha <5357586+anistark@users.noreply.github.com> Date: Fri, 16 Oct 2020 20:02:03 +0530 Subject: [PATCH 14/18] deps: add protobuf dependency (#306) Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [x] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [x] Ensure the tests and linter pass - [x] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) Fixes #305 --- setup.py | 1 + testing/constraints-3.6.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/setup.py b/setup.py index abd5cef95..c7410601e 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ "google-cloud-core >= 1.4.1, < 2.0dev", "google-resumable-media >= 0.6.0, < 2.0dev", "six >=1.13.0,< 2.0.0dev", + "protobuf >= 3.12.0", ] extras = { "bqstorage": [ diff --git a/testing/constraints-3.6.txt b/testing/constraints-3.6.txt index 798804941..cea0ed84e 100644 --- a/testing/constraints-3.6.txt +++ b/testing/constraints-3.6.txt @@ -8,6 +8,7 @@ libcst==0.2.5 llvmlite==0.34.0 # pandas 0.23.0 is the first version to work with pyarrow to_pandas. pandas==0.23.0 +protobuf == 3.12.0 proto-plus==1.10.0 pyarrow==1.0.0 python-snappy==0.5.4 From 29dd573729102606b6fb3119602faafeb6aa81e7 Mon Sep 17 00:00:00 2001 From: Tres Seaver Date: Fri, 16 Oct 2020 19:04:02 -0400 Subject: [PATCH 15/18] tests: split out snippets builds (#219) @tmatsuo Emulating PR #207. I don't know if I'm missing anything: e.g., I don't quite understand what the `split_system_tests=True` does in the `synth.py` there. Toward #191 --- .kokoro/presubmit/presubmit.cfg | 4 ++++ .kokoro/presubmit/snippets-2.7.cfg | 7 +++++++ .kokoro/presubmit/snippets-3.8.cfg | 7 +++++++ noxfile.py | 10 +++++++--- 4 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 .kokoro/presubmit/snippets-2.7.cfg create mode 100644 .kokoro/presubmit/snippets-3.8.cfg diff --git a/.kokoro/presubmit/presubmit.cfg b/.kokoro/presubmit/presubmit.cfg index b158096f0..17d071cae 100644 --- a/.kokoro/presubmit/presubmit.cfg +++ b/.kokoro/presubmit/presubmit.cfg @@ -5,3 +5,7 @@ env_vars: { key: "RUN_SYSTEM_TESTS" value: "false" } +env_vars: { + key: "RUN_SNIPPETS_TESTS" + value: "false" +} diff --git a/.kokoro/presubmit/snippets-2.7.cfg b/.kokoro/presubmit/snippets-2.7.cfg new file mode 100644 index 000000000..3bd6134d2 --- /dev/null +++ b/.kokoro/presubmit/snippets-2.7.cfg @@ -0,0 +1,7 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Only run this nox session. +env_vars: { + key: "NOX_SESSION" + value: "snippets-2.7" +} diff --git a/.kokoro/presubmit/snippets-3.8.cfg b/.kokoro/presubmit/snippets-3.8.cfg new file mode 100644 index 000000000..840d9e716 --- /dev/null +++ b/.kokoro/presubmit/snippets-3.8.cfg @@ -0,0 +1,7 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Only run this nox session. +env_vars: { + key: "NOX_SESSION" + value: "snippets-3.8" +} diff --git a/noxfile.py b/noxfile.py index db1dcffde..441782583 100644 --- a/noxfile.py +++ b/noxfile.py @@ -112,14 +112,18 @@ def system(session): def snippets(session): """Run the snippets test suite.""" - constraints_path = str( - CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" - ) + # Check the value of `RUN_SNIPPETS_TESTS` env var. It defaults to true. + if os.environ.get("RUN_SNIPPETS_TESTS", "true") == "false": + session.skip("RUN_SNIPPETS_TESTS is set to false, skipping") # Sanity check: Only run snippets tests if the environment variable is set. 
if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", ""): session.skip("Credentials must be set via environment variable.") + constraints_path = str( + CURRENT_DIRECTORY / "testing" / f"constraints-{session.python}.txt" + ) + # Install all test dependencies, then install local packages in place. session.install("mock", "pytest", "google-cloud-testutils", "-c", constraints_path) session.install("google-cloud-storage", "-c", constraints_path) From b0dd892176e31ac25fddd15554b5bfa054299d4d Mon Sep 17 00:00:00 2001 From: HemangChothani <50404902+HemangChothani@users.noreply.github.com> Date: Mon, 19 Oct 2020 09:45:12 -0400 Subject: [PATCH 16/18] feat: add timeout paramter to load_table_from_file and it dependent methods (#327) --- google/cloud/bigquery/client.py | 71 +++++++++++++++++++++++++++------ tests/unit/test_client.py | 45 +++++++++++++++------ 2 files changed, 92 insertions(+), 24 deletions(-) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index b7e082daa..cce393d6c 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -1591,7 +1591,7 @@ def job_from_resource(self, resource): return job.QueryJob.from_api_repr(resource, self) return job.UnknownJob.from_api_repr(resource, self) - def create_job(self, job_config, retry=DEFAULT_RETRY): + def create_job(self, job_config, retry=DEFAULT_RETRY, timeout=None): """Create a new job. Args: job_config (dict): configuration job representation returned from the API. @@ -1599,6 +1599,9 @@ def create_job(self, job_config, retry=DEFAULT_RETRY): Keyword Arguments: retry (Optional[google.api_core.retry.Retry]): How to retry the RPC. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. Returns: Union[ \ @@ -1617,7 +1620,11 @@ def create_job(self, job_config, retry=DEFAULT_RETRY): destination = _get_sub_prop(job_config, ["load", "destinationTable"]) source_uris = _get_sub_prop(job_config, ["load", "sourceUris"]) return self.load_table_from_uri( - source_uris, destination, job_config=load_job_config, retry=retry + source_uris, + destination, + job_config=load_job_config, + retry=retry, + timeout=timeout, ) elif "copy" in job_config: copy_job_config = google.cloud.bigquery.job.CopyJobConfig.from_api_repr( @@ -1633,7 +1640,11 @@ def create_job(self, job_config, retry=DEFAULT_RETRY): table_ref = TableReference.from_api_repr(source_config) sources.append(table_ref) return self.copy_table( - sources, destination, job_config=copy_job_config, retry=retry + sources, + destination, + job_config=copy_job_config, + retry=retry, + timeout=timeout, ) elif "extract" in job_config: extract_job_config = google.cloud.bigquery.job.ExtractJobConfig.from_api_repr( @@ -1650,6 +1661,7 @@ def create_job(self, job_config, retry=DEFAULT_RETRY): destination_uris, job_config=extract_job_config, retry=retry, + timeout=timeout, source_type=source_type, ) elif "query" in job_config: @@ -1659,7 +1671,9 @@ def create_job(self, job_config, retry=DEFAULT_RETRY): copy_config ) query = _get_sub_prop(copy_config, ["query", "query"]) - return self.query(query, job_config=query_job_config, retry=retry) + return self.query( + query, job_config=query_job_config, retry=retry, timeout=timeout + ) else: raise TypeError("Invalid job configuration received.") @@ -1981,6 +1995,7 @@ def load_table_from_file( location=None, project=None, job_config=None, + timeout=None, ): """Upload the contents of this table from a file-like object. 
@@ -2020,6 +2035,9 @@ def load_table_from_file( to the client's project. job_config (Optional[google.cloud.bigquery.job.LoadJobConfig]): Extra configuration options for the job. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. Returns: google.cloud.bigquery.job.LoadJob: A new load job. @@ -2058,11 +2076,11 @@ def load_table_from_file( try: if size is None or size >= _MAX_MULTIPART_SIZE: response = self._do_resumable_upload( - file_obj, job_resource, num_retries + file_obj, job_resource, num_retries, timeout ) else: response = self._do_multipart_upload( - file_obj, job_resource, size, num_retries + file_obj, job_resource, size, num_retries, timeout ) except resumable_media.InvalidResponse as exc: raise exceptions.from_http_response(exc.response) @@ -2080,6 +2098,7 @@ def load_table_from_dataframe( project=None, job_config=None, parquet_compression="snappy", + timeout=None, ): """Upload the contents of a table from a pandas DataFrame. @@ -2143,6 +2162,9 @@ def load_table_from_dataframe( passed as the ``compression`` argument to the underlying ``DataFrame.to_parquet()`` method. https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_parquet.html#pandas.DataFrame.to_parquet + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. Returns: google.cloud.bigquery.job.LoadJob: A new load job. @@ -2249,6 +2271,7 @@ def load_table_from_dataframe( location=location, project=project, job_config=job_config, + timeout=timeout, ) finally: @@ -2264,6 +2287,7 @@ def load_table_from_json( location=None, project=None, job_config=None, + timeout=None, ): """Upload the contents of a table from a JSON string or dict. @@ -2313,6 +2337,9 @@ def load_table_from_json( Extra configuration options for the job. The ``source_format`` setting is always set to :attr:`~google.cloud.bigquery.job.SourceFormat.NEWLINE_DELIMITED_JSON`. + timeout (Optional[float]): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. Returns: google.cloud.bigquery.job.LoadJob: A new load job. @@ -2357,9 +2384,10 @@ def load_table_from_json( location=location, project=project, job_config=job_config, + timeout=timeout, ) - def _do_resumable_upload(self, stream, metadata, num_retries): + def _do_resumable_upload(self, stream, metadata, num_retries, timeout): """Perform a resumable upload. Args: @@ -2371,13 +2399,17 @@ def _do_resumable_upload(self, stream, metadata, num_retries): Number of upload retries. (Deprecated: This argument will be removed in a future release.) + timeout (float): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + Returns: requests.Response: The "200 OK" response object returned after the final chunk is uploaded. """ upload, transport = self._initiate_resumable_upload( - stream, metadata, num_retries + stream, metadata, num_retries, timeout ) while not upload.finished: @@ -2385,7 +2417,7 @@ def _do_resumable_upload(self, stream, metadata, num_retries): return response - def _initiate_resumable_upload(self, stream, metadata, num_retries): + def _initiate_resumable_upload(self, stream, metadata, num_retries, timeout): """Initiate a resumable upload. Args: @@ -2397,6 +2429,10 @@ def _initiate_resumable_upload(self, stream, metadata, num_retries): Number of upload retries. (Deprecated: This argument will be removed in a future release.) 
+ timeout (float): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + Returns: Tuple: Pair of @@ -2419,12 +2455,17 @@ def _initiate_resumable_upload(self, stream, metadata, num_retries): ) upload.initiate( - transport, stream, metadata, _GENERIC_CONTENT_TYPE, stream_final=False + transport, + stream, + metadata, + _GENERIC_CONTENT_TYPE, + stream_final=False, + timeout=timeout, ) return upload, transport - def _do_multipart_upload(self, stream, metadata, size, num_retries): + def _do_multipart_upload(self, stream, metadata, size, num_retries, timeout): """Perform a multipart upload. Args: @@ -2441,6 +2482,10 @@ def _do_multipart_upload(self, stream, metadata, size, num_retries): Number of upload retries. (Deprecated: This argument will be removed in a future release.) + timeout (float): + The number of seconds to wait for the underlying HTTP transport + before using ``retry``. + Returns: requests.Response: The "200 OK" response object returned after the multipart @@ -2466,7 +2511,9 @@ def _do_multipart_upload(self, stream, metadata, size, num_retries): max_retries=num_retries ) - response = upload.transmit(self._http, data, metadata, _GENERIC_CONTENT_TYPE) + response = upload.transmit( + self._http, data, metadata, _GENERIC_CONTENT_TYPE, timeout=timeout + ) return response diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index bc2658961..2001ad42b 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -4425,7 +4425,7 @@ def _initiate_resumable_upload_helper(self, num_retries=None): job = LoadJob(None, None, self.TABLE_REF, client, job_config=config) metadata = job.to_api_repr() upload, transport = client._initiate_resumable_upload( - stream, metadata, num_retries + stream, metadata, num_retries, None ) # Check the returned values. @@ -4492,7 +4492,9 @@ def _do_multipart_upload_success_helper(self, get_boundary, num_retries=None): job = LoadJob(None, None, self.TABLE_REF, client, job_config=config) metadata = job.to_api_repr() size = len(data) - response = client._do_multipart_upload(stream, metadata, size, num_retries) + response = client._do_multipart_upload( + stream, metadata, size, num_retries, None + ) # Check the mocks and the returned value. 
self.assertIs(response, fake_transport.request.return_value) @@ -7251,7 +7253,7 @@ def test_load_table_from_file_resumable(self): ) do_upload.assert_called_once_with( - file_obj, self.EXPECTED_CONFIGURATION, _DEFAULT_NUM_RETRIES + file_obj, self.EXPECTED_CONFIGURATION, _DEFAULT_NUM_RETRIES, None ) # the original config object should not have been modified @@ -7280,7 +7282,7 @@ def test_load_table_from_file_w_explicit_project(self): expected_resource["jobReference"]["location"] = self.LOCATION expected_resource["jobReference"]["projectId"] = "other-project" do_upload.assert_called_once_with( - file_obj, expected_resource, _DEFAULT_NUM_RETRIES + file_obj, expected_resource, _DEFAULT_NUM_RETRIES, None ) def test_load_table_from_file_w_client_location(self): @@ -7310,7 +7312,7 @@ def test_load_table_from_file_w_client_location(self): expected_resource["jobReference"]["location"] = self.LOCATION expected_resource["jobReference"]["projectId"] = "other-project" do_upload.assert_called_once_with( - file_obj, expected_resource, _DEFAULT_NUM_RETRIES + file_obj, expected_resource, _DEFAULT_NUM_RETRIES, None ) def test_load_table_from_file_resumable_metadata(self): @@ -7368,7 +7370,7 @@ def test_load_table_from_file_resumable_metadata(self): ) do_upload.assert_called_once_with( - file_obj, expected_config, _DEFAULT_NUM_RETRIES + file_obj, expected_config, _DEFAULT_NUM_RETRIES, None ) def test_load_table_from_file_multipart(self): @@ -7392,7 +7394,11 @@ def test_load_table_from_file_multipart(self): ) do_upload.assert_called_once_with( - file_obj, self.EXPECTED_CONFIGURATION, file_obj_size, _DEFAULT_NUM_RETRIES + file_obj, + self.EXPECTED_CONFIGURATION, + file_obj_size, + _DEFAULT_NUM_RETRIES, + None, ) def test_load_table_from_file_with_retries(self): @@ -7413,7 +7419,7 @@ def test_load_table_from_file_with_retries(self): ) do_upload.assert_called_once_with( - file_obj, self.EXPECTED_CONFIGURATION, num_retries + file_obj, self.EXPECTED_CONFIGURATION, num_retries, None ) def test_load_table_from_file_with_rewind(self): @@ -7446,7 +7452,7 @@ def test_load_table_from_file_with_readable_gzip(self): ) do_upload.assert_called_once_with( - gzip_file, self.EXPECTED_CONFIGURATION, _DEFAULT_NUM_RETRIES + gzip_file, self.EXPECTED_CONFIGURATION, _DEFAULT_NUM_RETRIES, None ) def test_load_table_from_file_with_writable_gzip(self): @@ -7539,6 +7545,7 @@ def test_load_table_from_dataframe(self): location=None, project=None, job_config=mock.ANY, + timeout=None, ) sent_file = load_table_from_file.mock_calls[0][1][1] @@ -7583,6 +7590,7 @@ def test_load_table_from_dataframe_w_client_location(self): location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=None, ) sent_file = load_table_from_file.mock_calls[0][1][1] @@ -7636,6 +7644,7 @@ def test_load_table_from_dataframe_w_custom_job_config_wihtout_source_format(sel location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=None, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -7691,6 +7700,7 @@ def test_load_table_from_dataframe_w_custom_job_config_w_source_format(self): location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=None, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -7784,6 +7794,7 @@ def test_load_table_from_dataframe_w_automatic_schema(self): location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=None, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -7844,6 +7855,7 @@ def test_load_table_from_dataframe_w_index_and_auto_schema(self): 
location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=None, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -7890,6 +7902,7 @@ def test_load_table_from_dataframe_unknown_table(self): location=None, project=None, job_config=mock.ANY, + timeout=None, ) @unittest.skipIf( @@ -7931,6 +7944,7 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype(self): location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=None, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -7978,6 +7992,7 @@ def test_load_table_from_dataframe_w_nullable_int64_datatype_automatic_schema(se location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=None, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -8039,6 +8054,7 @@ def test_load_table_from_dataframe_struct_fields(self): location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=None, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -8113,6 +8129,7 @@ def test_load_table_from_dataframe_w_partial_schema(self): location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=None, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -8207,6 +8224,7 @@ def test_load_table_from_dataframe_w_partial_schema_missing_types(self): location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=None, ) assert warned # there should be at least one warning @@ -8320,6 +8338,7 @@ def test_load_table_from_dataframe_w_nulls(self): location=self.LOCATION, project=None, job_config=mock.ANY, + timeout=None, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -8373,6 +8392,7 @@ def test_load_table_from_json_basic_use(self): location=client.location, project=client.project, job_config=mock.ANY, + timeout=None, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -8425,6 +8445,7 @@ def test_load_table_from_json_non_default_args(self): location="EU", project="project-x", job_config=mock.ANY, + timeout=None, ) sent_config = load_table_from_file.mock_calls[0][2]["job_config"] @@ -8499,7 +8520,7 @@ def test__do_resumable_upload(self): client = self._make_client(transport) result = client._do_resumable_upload( - file_obj, self.EXPECTED_CONFIGURATION, None + file_obj, self.EXPECTED_CONFIGURATION, None, None ) content = result.content.decode("utf-8") @@ -8522,7 +8543,7 @@ def test__do_multipart_upload(self): file_obj_len = len(file_obj.getvalue()) client._do_multipart_upload( - file_obj, self.EXPECTED_CONFIGURATION, file_obj_len, None + file_obj, self.EXPECTED_CONFIGURATION, file_obj_len, None, None ) # Verify that configuration data was passed in with the initial @@ -8550,7 +8571,7 @@ def test__do_multipart_upload_wrong_size(self): file_obj_len = len(file_obj.getvalue()) with pytest.raises(ValueError): - client._do_multipart_upload(file_obj, {}, file_obj_len + 1, None) + client._do_multipart_upload(file_obj, {}, file_obj_len + 1, None, None) def test_schema_from_json_with_file_path(self): from google.cloud.bigquery.schema import SchemaField From 502a0926018abf058cb84bd18043c25eba15a2cc Mon Sep 17 00:00:00 2001 From: Yoshi Automation Bot Date: Mon, 19 Oct 2020 07:51:53 -0700 Subject: [PATCH 17/18] feat: add support for listing arima, automl, boosted tree, DNN, and matrix factorization models (#328) * changes without context autosynth cannot find the source of changes triggered by earlier changes in this repository, or by version upgrades to tools such as linters. 
* chore: update proto definitions for bigquery/v2 to support BQML statistics PiperOrigin-RevId: 337113354 Source-Author: Google APIs Source-Date: Wed Oct 14 10:04:20 2020 -0700 Source-Repo: googleapis/googleapis Source-Sha: 215c12ade72d9d9616457d9b8b2f8a37f38e79f3 Source-Link: https://github.com/googleapis/googleapis/commit/215c12ade72d9d9616457d9b8b2f8a37f38e79f3 * fix: manually revert `type` to `type_` breaking change This should allow us to merge the fixes for `list_models` and avoid a breaking change until `proto-plus` becomes acceptable for our use. * feat: add BIGNUMERIC to data type enums Co-authored-by: Tim Swast --- google/cloud/bigquery/enums.py | 2 + google/cloud/bigquery_v2/__init__.py | 2 + .../bigquery_v2/proto/encryption_config.proto | 3 +- google/cloud/bigquery_v2/proto/model.proto | 576 ++++++++++++++++- .../bigquery_v2/proto/model_reference.proto | 3 +- .../bigquery_v2/proto/standard_sql.proto | 6 +- .../bigquery_v2/proto/table_reference.proto | 39 ++ google/cloud/bigquery_v2/types/__init__.py | 2 + google/cloud/bigquery_v2/types/model.py | 603 +++++++++++++++++- .../cloud/bigquery_v2/types/standard_sql.py | 1 + .../bigquery_v2/types/table_reference.py | 51 ++ synth.metadata | 9 +- 12 files changed, 1282 insertions(+), 15 deletions(-) create mode 100644 google/cloud/bigquery_v2/proto/table_reference.proto create mode 100644 google/cloud/bigquery_v2/types/table_reference.py diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py index 3247372e3..eb33e4276 100644 --- a/google/cloud/bigquery/enums.py +++ b/google/cloud/bigquery/enums.py @@ -33,6 +33,7 @@ "DATETIME", "GEOGRAPHY", "NUMERIC", + "BIGNUMERIC", ) ) @@ -81,6 +82,7 @@ class SqlTypeNames(str, enum.Enum): FLOAT = "FLOAT" FLOAT64 = "FLOAT" NUMERIC = "NUMERIC" + BIGNUMERIC = "BIGNUMERIC" BOOLEAN = "BOOLEAN" BOOL = "BOOLEAN" GEOGRAPHY = "GEOGRAPHY" # NOTE: not available in legacy types diff --git a/google/cloud/bigquery_v2/__init__.py b/google/cloud/bigquery_v2/__init__.py index c1989c3b0..ebcc26bef 100644 --- a/google/cloud/bigquery_v2/__init__.py +++ b/google/cloud/bigquery_v2/__init__.py @@ -27,6 +27,7 @@ from .types.standard_sql import StandardSqlDataType from .types.standard_sql import StandardSqlField from .types.standard_sql import StandardSqlStructType +from .types.table_reference import TableReference __all__ = ( @@ -41,4 +42,5 @@ "StandardSqlDataType", "StandardSqlField", "StandardSqlStructType", + "TableReference", ) diff --git a/google/cloud/bigquery_v2/proto/encryption_config.proto b/google/cloud/bigquery_v2/proto/encryption_config.proto index 54445f0fa..1c0512a17 100644 --- a/google/cloud/bigquery_v2/proto/encryption_config.proto +++ b/google/cloud/bigquery_v2/proto/encryption_config.proto @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC. +// Copyright 2020 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,7 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -// syntax = "proto3"; diff --git a/google/cloud/bigquery_v2/proto/model.proto b/google/cloud/bigquery_v2/proto/model.proto index 13d980774..2d400dddd 100644 --- a/google/cloud/bigquery_v2/proto/model.proto +++ b/google/cloud/bigquery_v2/proto/model.proto @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC. 
+// Copyright 2020 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,7 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -// syntax = "proto3"; @@ -22,6 +21,7 @@ import "google/api/field_behavior.proto"; import "google/cloud/bigquery/v2/encryption_config.proto"; import "google/cloud/bigquery/v2/model_reference.proto"; import "google/cloud/bigquery/v2/standard_sql.proto"; +import "google/cloud/bigquery/v2/table_reference.proto"; import "google/protobuf/empty.proto"; import "google/protobuf/timestamp.proto"; import "google/protobuf/wrappers.proto"; @@ -62,6 +62,32 @@ service ModelService { } message Model { + message SeasonalPeriod { + enum SeasonalPeriodType { + SEASONAL_PERIOD_TYPE_UNSPECIFIED = 0; + + // No seasonality + NO_SEASONALITY = 1; + + // Daily period, 24 hours. + DAILY = 2; + + // Weekly period, 7 days. + WEEKLY = 3; + + // Monthly period, 30 days or irregular. + MONTHLY = 4; + + // Quarterly period, 90 days or irregular. + QUARTERLY = 5; + + // Yearly period, 365 days or irregular. + YEARLY = 6; + } + + + } + message KmeansEnums { // Indicates the method used to initialize the centroids for KMeans // clustering algorithm. @@ -74,6 +100,9 @@ message Model { // Initializes the centroids using data specified in // kmeans_initialization_column. CUSTOM = 2; + + // Initializes with kmeans++. + KMEANS_PLUS_PLUS = 3; } @@ -280,6 +309,73 @@ message Model { repeated Cluster clusters = 3; } + // Evaluation metrics used by weighted-ALS models specified by + // feedback_type=implicit. + message RankingMetrics { + // Calculates a precision per user for all the items by ranking them and + // then averages all the precisions across all the users. + google.protobuf.DoubleValue mean_average_precision = 1; + + // Similar to the mean squared error computed in regression and explicit + // recommendation models except instead of computing the rating directly, + // the output from evaluate is computed against a preference which is 1 or 0 + // depending on if the rating exists or not. + google.protobuf.DoubleValue mean_squared_error = 2; + + // A metric to determine the goodness of a ranking calculated from the + // predicted confidence by comparing it to an ideal rank measured by the + // original ratings. + google.protobuf.DoubleValue normalized_discounted_cumulative_gain = 3; + + // Determines the goodness of a ranking by computing the percentile rank + // from the predicted confidence and dividing it by the original rank. + google.protobuf.DoubleValue average_rank = 4; + } + + // Model evaluation metrics for ARIMA forecasting models. + message ArimaForecastingMetrics { + // Model evaluation metrics for a single ARIMA forecasting model. + message ArimaSingleModelForecastingMetrics { + // Non-seasonal order. + ArimaOrder non_seasonal_order = 1; + + // Arima fitting metrics. + ArimaFittingMetrics arima_fitting_metrics = 2; + + // Is arima model fitted with drift or not. It is always false when d + // is not 1. + bool has_drift = 3; + + // The id to indicate different time series. + string time_series_id = 4; + + // Seasonal periods. Repeated because multiple periods are supported + // for one time series. + repeated SeasonalPeriod.SeasonalPeriodType seasonal_periods = 5; + } + + // Non-seasonal order. 
+ repeated ArimaOrder non_seasonal_order = 1; + + // Arima model fitting metrics. + repeated ArimaFittingMetrics arima_fitting_metrics = 2; + + // Seasonal periods. Repeated because multiple periods are supported for one + // time series. + repeated SeasonalPeriod.SeasonalPeriodType seasonal_periods = 3; + + // Whether Arima model fitted with drift or not. It is always false when d + // is not 1. + repeated bool has_drift = 4; + + // Id to differentiate different time series for the large-scale case. + repeated string time_series_id = 5; + + // Repeated as there can be many metric sets (one for each model) in + // auto-arima and the large-scale case. + repeated ArimaSingleModelForecastingMetrics arima_single_model_forecasting_metrics = 6; + } + // Evaluation metrics of a model. These are either computed on all training // data or just the eval data based on whether eval data was used during // training. These are not present for imported models. @@ -297,7 +393,71 @@ message Model { // Populated for clustering models. ClusteringMetrics clustering_metrics = 4; + + // Populated for implicit feedback type matrix factorization models. + RankingMetrics ranking_metrics = 5; + + // Populated for ARIMA models. + ArimaForecastingMetrics arima_forecasting_metrics = 6; + } + } + + // Data split result. This contains references to the training and evaluation + // data tables that were used to train the model. + message DataSplitResult { + // Table reference of the training data after split. + TableReference training_table = 1; + + // Table reference of the evaluation data after split. + TableReference evaluation_table = 2; + } + + // Arima order, can be used for both non-seasonal and seasonal parts. + message ArimaOrder { + // Order of the autoregressive part. + int64 p = 1; + + // Order of the differencing part. + int64 d = 2; + + // Order of the moving-average part. + int64 q = 3; + } + + // ARIMA model fitting metrics. + message ArimaFittingMetrics { + // Log-likelihood. + double log_likelihood = 1; + + // AIC. + double aic = 2; + + // Variance. + double variance = 3; + } + + // Global explanations containing the top most important features + // after training. + message GlobalExplanation { + // Explanation for a single feature. + message Explanation { + // Full name of the feature. For non-numerical features, will be + // formatted like .. Overall size of + // feature name will always be truncated to first 120 characters. + string feature_name = 1; + + // Attribution of feature. + google.protobuf.DoubleValue attribution = 2; } + + // A list of the top global explanations. Sorted by absolute value of + // attribution in descending order. + repeated Explanation explanations = 1; + + // Class label for this set of global explanations. Will be empty/null for + // binary logistic and linear regression models. Sorted alphabetically in + // descending order. + string class_label = 2; } // Information about a single training query run for the model. @@ -367,6 +527,12 @@ message Model { // training data. Only applicable for classification models. map label_class_weights = 17; + // User column specified for matrix factorization models. + string user_column = 18; + + // Item column specified for matrix factorization models. + string item_column = 19; + // Distance type for clustering models. DistanceType distance_type = 20; @@ -380,12 +546,83 @@ message Model { // Optimization strategy for training linear regression models. OptimizationStrategy optimization_strategy = 23; + // Hidden units for dnn models. 
+ repeated int64 hidden_units = 24; + + // Batch size for dnn models. + int64 batch_size = 25; + + // Dropout probability for dnn models. + google.protobuf.DoubleValue dropout = 26; + + // Maximum depth of a tree for boosted tree models. + int64 max_tree_depth = 27; + + // Subsample fraction of the training data to grow tree to prevent + // overfitting for boosted tree models. + double subsample = 28; + + // Minimum split loss for boosted tree models. + google.protobuf.DoubleValue min_split_loss = 29; + + // Num factors specified for matrix factorization models. + int64 num_factors = 30; + + // Feedback type that specifies which algorithm to run for matrix + // factorization. + FeedbackType feedback_type = 31; + + // Hyperparameter for matrix factoration when implicit feedback type is + // specified. + google.protobuf.DoubleValue wals_alpha = 32; + // The method used to initialize the centroids for kmeans algorithm. KmeansEnums.KmeansInitializationMethod kmeans_initialization_method = 33; // The column used to provide the initial centroids for kmeans algorithm // when kmeans_initialization_method is CUSTOM. string kmeans_initialization_column = 34; + + // Column to be designated as time series timestamp for ARIMA model. + string time_series_timestamp_column = 35; + + // Column to be designated as time series data for ARIMA model. + string time_series_data_column = 36; + + // Whether to enable auto ARIMA or not. + bool auto_arima = 37; + + // A specification of the non-seasonal part of the ARIMA model: the three + // components (p, d, q) are the AR order, the degree of differencing, and + // the MA order. + ArimaOrder non_seasonal_order = 38; + + // The data frequency of a time series. + DataFrequency data_frequency = 39; + + // Include drift when fitting an ARIMA model. + bool include_drift = 41; + + // The geographical region based on which the holidays are considered in + // time series modeling. If a valid value is specified, then holiday + // effects modeling is enabled. + HolidayRegion holiday_region = 42; + + // The id column that will be used to indicate different time series to + // forecast in parallel. + string time_series_id_column = 43; + + // The number of periods ahead that need to be forecasted. + int64 horizon = 44; + + // Whether to preserve the input structs in output feature names. + // Suppose there is a struct A with field b. + // When false (default), the output feature name is A_b. + // When true, the output feature name is A.b. + bool preserve_input_structs = 45; + + // The max value of non-seasonal p and q. + int64 auto_arima_max_order = 46; } // Information about a single iteration of the training run. @@ -403,6 +640,53 @@ message Model { google.protobuf.Int64Value cluster_size = 3; } + // (Auto-)arima fitting result. Wrap everything in ArimaResult for easier + // refactoring if we want to use model-specific iteration results. + message ArimaResult { + // Arima coefficients. + message ArimaCoefficients { + // Auto-regressive coefficients, an array of double. + repeated double auto_regressive_coefficients = 1; + + // Moving-average coefficients, an array of double. + repeated double moving_average_coefficients = 2; + + // Intercept coefficient, just a double not an array. + double intercept_coefficient = 3; + } + + // Arima model information. + message ArimaModelInfo { + // Non-seasonal order. + ArimaOrder non_seasonal_order = 1; + + // Arima coefficients. + ArimaCoefficients arima_coefficients = 2; + + // Arima fitting metrics. 
+ ArimaFittingMetrics arima_fitting_metrics = 3; + + // Whether Arima model fitted with drift or not. It is always false + // when d is not 1. + bool has_drift = 4; + + // The id to indicate different time series. + string time_series_id = 5; + + // Seasonal periods. Repeated because multiple periods are supported + // for one time series. + repeated SeasonalPeriod.SeasonalPeriodType seasonal_periods = 6; + } + + // This message is repeated because there are multiple arima models + // fitted in auto-arima. For non-auto-arima model, its size is one. + repeated ArimaModelInfo arima_model_info = 1; + + // Seasonal periods. Repeated because multiple periods are supported for + // one time series. + repeated SeasonalPeriod.SeasonalPeriodType seasonal_periods = 2; + } + // Index of the iteration, 0 based. google.protobuf.Int32Value index = 1; @@ -420,6 +704,8 @@ message Model { // Information about top clusters for clustering models. repeated ClusterInfo cluster_infos = 8; + + ArimaResult arima_result = 9; } // Options that were used for this training run, includes @@ -435,6 +721,15 @@ message Model { // The evaluation metrics over training/eval data that were computed at the // end of training. EvaluationMetrics evaluation_metrics = 7; + + // Data split result of the training run. Only set when the input data is + // actually split. + DataSplitResult data_split_result = 9; + + // Global explanations for important features of the model. For multi-class + // models, there is one entry for each label class. For other models, there + // is only one entry in the list. + repeated GlobalExplanation global_explanations = 10; } // Indicates the type of the Model. @@ -450,8 +745,32 @@ message Model { // K-means clustering model. KMEANS = 3; + // Matrix factorization model. + MATRIX_FACTORIZATION = 4; + + // [Beta] DNN classifier model. + DNN_CLASSIFIER = 5; + // [Beta] An imported TensorFlow model. TENSORFLOW = 6; + + // [Beta] DNN regressor model. + DNN_REGRESSOR = 7; + + // [Beta] Boosted tree regressor model. + BOOSTED_TREE_REGRESSOR = 9; + + // [Beta] Boosted tree classifier model. + BOOSTED_TREE_CLASSIFIER = 10; + + // [Beta] ARIMA model. + ARIMA = 11; + + // [Beta] AutoML Tables regression model. + AUTOML_REGRESSOR = 12; + + // [Beta] AutoML Tables classification model. + AUTOML_CLASSIFIER = 13; } // Loss metric to evaluate model training performance. @@ -497,6 +816,243 @@ message Model { AUTO_SPLIT = 5; } + // Type of supported data frequency for time series forecasting models. + enum DataFrequency { + DATA_FREQUENCY_UNSPECIFIED = 0; + + // Automatically inferred from timestamps. + AUTO_FREQUENCY = 1; + + // Yearly data. + YEARLY = 2; + + // Quarterly data. + QUARTERLY = 3; + + // Monthly data. + MONTHLY = 4; + + // Weekly data. + WEEKLY = 5; + + // Daily data. + DAILY = 6; + + // Hourly data. + HOURLY = 7; + } + + // Type of supported holiday regions for time series forecasting models. + enum HolidayRegion { + // Holiday region unspecified. + HOLIDAY_REGION_UNSPECIFIED = 0; + + // Global. + GLOBAL = 1; + + // North America. + NA = 2; + + // Japan and Asia Pacific: Korea, Greater China, India, Australia, and New + // Zealand. + JAPAC = 3; + + // Europe, the Middle East and Africa. + EMEA = 4; + + // Latin America and the Caribbean. 
+ LAC = 5; + + // United Arab Emirates + AE = 6; + + // Argentina + AR = 7; + + // Austria + AT = 8; + + // Australia + AU = 9; + + // Belgium + BE = 10; + + // Brazil + BR = 11; + + // Canada + CA = 12; + + // Switzerland + CH = 13; + + // Chile + CL = 14; + + // China + CN = 15; + + // Colombia + CO = 16; + + // Czechoslovakia + CS = 17; + + // Czech Republic + CZ = 18; + + // Germany + DE = 19; + + // Denmark + DK = 20; + + // Algeria + DZ = 21; + + // Ecuador + EC = 22; + + // Estonia + EE = 23; + + // Egypt + EG = 24; + + // Spain + ES = 25; + + // Finland + FI = 26; + + // France + FR = 27; + + // Great Britain (United Kingdom) + GB = 28; + + // Greece + GR = 29; + + // Hong Kong + HK = 30; + + // Hungary + HU = 31; + + // Indonesia + ID = 32; + + // Ireland + IE = 33; + + // Israel + IL = 34; + + // India + IN = 35; + + // Iran + IR = 36; + + // Italy + IT = 37; + + // Japan + JP = 38; + + // Korea (South) + KR = 39; + + // Latvia + LV = 40; + + // Morocco + MA = 41; + + // Mexico + MX = 42; + + // Malaysia + MY = 43; + + // Nigeria + NG = 44; + + // Netherlands + NL = 45; + + // Norway + NO = 46; + + // New Zealand + NZ = 47; + + // Peru + PE = 48; + + // Philippines + PH = 49; + + // Pakistan + PK = 50; + + // Poland + PL = 51; + + // Portugal + PT = 52; + + // Romania + RO = 53; + + // Serbia + RS = 54; + + // Russian Federation + RU = 55; + + // Saudi Arabia + SA = 56; + + // Sweden + SE = 57; + + // Singapore + SG = 58; + + // Slovenia + SI = 59; + + // Slovakia + SK = 60; + + // Thailand + TH = 61; + + // Turkey + TR = 62; + + // Taiwan + TW = 63; + + // Ukraine + UA = 64; + + // United States + US = 65; + + // Venezuela + VE = 66; + + // Viet Nam + VN = 67; + + // South Africa + ZA = 68; + } + // Indicates the learning rate optimization strategy to use. enum LearnRateStrategy { LEARN_RATE_STRATEGY_UNSPECIFIED = 0; @@ -519,6 +1075,17 @@ message Model { NORMAL_EQUATION = 2; } + // Indicates the training algorithm to use for matrix factorization models. + enum FeedbackType { + FEEDBACK_TYPE_UNSPECIFIED = 0; + + // Use weighted-als for implicit feedback problems. + IMPLICIT = 1; + + // Use nonweighted-als for explicit feedback problems. + EXPLICIT = 2; + } + // Output only. A hash of this resource. string etag = 1 [(google.api.field_behavior) = OUTPUT_ONLY]; @@ -558,8 +1125,9 @@ message Model { // Custom encryption configuration (e.g., Cloud KMS keys). This shows the // encryption configuration of the model data while stored in BigQuery - // storage. - google.cloud.bigquery.v2.EncryptionConfiguration encryption_configuration = 17; + // storage. This field can be used with PatchModel to update encryption key + // for an already encrypted model. + EncryptionConfiguration encryption_configuration = 17; // Output only. Type of the model resource. ModelType model_type = 7 [(google.api.field_behavior) = OUTPUT_ONLY]; diff --git a/google/cloud/bigquery_v2/proto/model_reference.proto b/google/cloud/bigquery_v2/proto/model_reference.proto index fadd17514..c3d1a49a8 100644 --- a/google/cloud/bigquery_v2/proto/model_reference.proto +++ b/google/cloud/bigquery_v2/proto/model_reference.proto @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC. +// Copyright 2020 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,7 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. -// syntax = "proto3"; diff --git a/google/cloud/bigquery_v2/proto/standard_sql.proto b/google/cloud/bigquery_v2/proto/standard_sql.proto index ff69dfc4e..1514eccbb 100644 --- a/google/cloud/bigquery_v2/proto/standard_sql.proto +++ b/google/cloud/bigquery_v2/proto/standard_sql.proto @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC. +// Copyright 2020 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,7 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -// syntax = "proto3"; @@ -73,6 +72,9 @@ message StandardSqlDataType { // Encoded as a decimal string. NUMERIC = 23; + // Encoded as a decimal string. + BIGNUMERIC = 24; + // Encoded as a list with types matching Type.array_type. ARRAY = 16; diff --git a/google/cloud/bigquery_v2/proto/table_reference.proto b/google/cloud/bigquery_v2/proto/table_reference.proto new file mode 100644 index 000000000..ba02f80c4 --- /dev/null +++ b/google/cloud/bigquery_v2/proto/table_reference.proto @@ -0,0 +1,39 @@ +// Copyright 2020 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package google.cloud.bigquery.v2; + +import "google/api/field_behavior.proto"; +import "google/api/annotations.proto"; + +option go_package = "google.golang.org/genproto/googleapis/cloud/bigquery/v2;bigquery"; +option java_outer_classname = "TableReferenceProto"; +option java_package = "com.google.cloud.bigquery.v2"; + +message TableReference { + // Required. The ID of the project containing this table. + string project_id = 1 [(google.api.field_behavior) = REQUIRED]; + + // Required. The ID of the dataset containing this table. + string dataset_id = 2 [(google.api.field_behavior) = REQUIRED]; + + // Required. The ID of the table. The ID must contain only + // letters (a-z, A-Z), numbers (0-9), or underscores (_). The maximum + // length is 1,024 characters. Certain operations allow + // suffixing of the table ID with a partition decorator, such as + // `sample_table$20190123`. 
+ string table_id = 3 [(google.api.field_behavior) = REQUIRED]; +} diff --git a/google/cloud/bigquery_v2/types/__init__.py b/google/cloud/bigquery_v2/types/__init__.py index a8839c74e..1e354641a 100644 --- a/google/cloud/bigquery_v2/types/__init__.py +++ b/google/cloud/bigquery_v2/types/__init__.py @@ -22,6 +22,7 @@ StandardSqlField, StandardSqlStructType, ) +from .table_reference import TableReference from .model import ( Model, GetModelRequest, @@ -38,6 +39,7 @@ "StandardSqlDataType", "StandardSqlField", "StandardSqlStructType", + "TableReference", "Model", "GetModelRequest", "PatchModelRequest", diff --git a/google/cloud/bigquery_v2/types/model.py b/google/cloud/bigquery_v2/types/model.py index a00720d48..3a7bbf43b 100644 --- a/google/cloud/bigquery_v2/types/model.py +++ b/google/cloud/bigquery_v2/types/model.py @@ -21,6 +21,7 @@ from google.cloud.bigquery_v2.types import encryption_config from google.cloud.bigquery_v2.types import model_reference as gcb_model_reference from google.cloud.bigquery_v2.types import standard_sql +from google.cloud.bigquery_v2.types import table_reference from google.protobuf import timestamp_pb2 as timestamp # type: ignore from google.protobuf import wrappers_pb2 as wrappers # type: ignore @@ -84,7 +85,9 @@ class Model(proto.Message): Custom encryption configuration (e.g., Cloud KMS keys). This shows the encryption configuration of the model data while stored in - BigQuery storage. + BigQuery storage. This field can be used with + PatchModel to update encryption key for an + already encrypted model. model_type (~.gcb_model.Model.ModelType): Output only. Type of the model resource. training_runs (Sequence[~.gcb_model.Model.TrainingRun]): @@ -105,7 +108,15 @@ class ModelType(proto.Enum): LINEAR_REGRESSION = 1 LOGISTIC_REGRESSION = 2 KMEANS = 3 + MATRIX_FACTORIZATION = 4 + DNN_CLASSIFIER = 5 TENSORFLOW = 6 + DNN_REGRESSOR = 7 + BOOSTED_TREE_REGRESSOR = 9 + BOOSTED_TREE_CLASSIFIER = 10 + ARIMA = 11 + AUTOML_REGRESSOR = 12 + AUTOML_CLASSIFIER = 13 class LossType(proto.Enum): r"""Loss metric to evaluate model training performance.""" @@ -132,6 +143,93 @@ class DataSplitMethod(proto.Enum): NO_SPLIT = 4 AUTO_SPLIT = 5 + class DataFrequency(proto.Enum): + r"""Type of supported data frequency for time series forecasting + models. + """ + DATA_FREQUENCY_UNSPECIFIED = 0 + AUTO_FREQUENCY = 1 + YEARLY = 2 + QUARTERLY = 3 + MONTHLY = 4 + WEEKLY = 5 + DAILY = 6 + HOURLY = 7 + + class HolidayRegion(proto.Enum): + r"""Type of supported holiday regions for time series forecasting + models. 
+ """ + HOLIDAY_REGION_UNSPECIFIED = 0 + GLOBAL = 1 + NA = 2 + JAPAC = 3 + EMEA = 4 + LAC = 5 + AE = 6 + AR = 7 + AT = 8 + AU = 9 + BE = 10 + BR = 11 + CA = 12 + CH = 13 + CL = 14 + CN = 15 + CO = 16 + CS = 17 + CZ = 18 + DE = 19 + DK = 20 + DZ = 21 + EC = 22 + EE = 23 + EG = 24 + ES = 25 + FI = 26 + FR = 27 + GB = 28 + GR = 29 + HK = 30 + HU = 31 + ID = 32 + IE = 33 + IL = 34 + IN = 35 + IR = 36 + IT = 37 + JP = 38 + KR = 39 + LV = 40 + MA = 41 + MX = 42 + MY = 43 + NG = 44 + NL = 45 + NO = 46 + NZ = 47 + PE = 48 + PH = 49 + PK = 50 + PL = 51 + PT = 52 + RO = 53 + RS = 54 + RU = 55 + SA = 56 + SE = 57 + SG = 58 + SI = 59 + SK = 60 + TH = 61 + TR = 62 + TW = 63 + UA = 64 + US = 65 + VE = 66 + VN = 67 + ZA = 68 + class LearnRateStrategy(proto.Enum): r"""Indicates the learning rate optimization strategy to use.""" LEARN_RATE_STRATEGY_UNSPECIFIED = 0 @@ -144,6 +242,27 @@ class OptimizationStrategy(proto.Enum): BATCH_GRADIENT_DESCENT = 1 NORMAL_EQUATION = 2 + class FeedbackType(proto.Enum): + r"""Indicates the training algorithm to use for matrix + factorization models. + """ + FEEDBACK_TYPE_UNSPECIFIED = 0 + IMPLICIT = 1 + EXPLICIT = 2 + + class SeasonalPeriod(proto.Message): + r"""""" + + class SeasonalPeriodType(proto.Enum): + r"""""" + SEASONAL_PERIOD_TYPE_UNSPECIFIED = 0 + NO_SEASONALITY = 1 + DAILY = 2 + WEEKLY = 3 + MONTHLY = 4 + QUARTERLY = 5 + YEARLY = 6 + class KmeansEnums(proto.Message): r"""""" @@ -154,6 +273,7 @@ class KmeansInitializationMethod(proto.Enum): KMEANS_INITIALIZATION_METHOD_UNSPECIFIED = 0 RANDOM = 1 CUSTOM = 2 + KMEANS_PLUS_PLUS = 3 class RegressionMetrics(proto.Message): r"""Evaluation metrics for regression and explicit feedback type @@ -529,6 +649,129 @@ class CategoryCount(proto.Message): proto.MESSAGE, number=3, message="Model.ClusteringMetrics.Cluster", ) + class RankingMetrics(proto.Message): + r"""Evaluation metrics used by weighted-ALS models specified by + feedback_type=implicit. + + Attributes: + mean_average_precision (~.wrappers.DoubleValue): + Calculates a precision per user for all the + items by ranking them and then averages all the + precisions across all the users. + mean_squared_error (~.wrappers.DoubleValue): + Similar to the mean squared error computed in + regression and explicit recommendation models + except instead of computing the rating directly, + the output from evaluate is computed against a + preference which is 1 or 0 depending on if the + rating exists or not. + normalized_discounted_cumulative_gain (~.wrappers.DoubleValue): + A metric to determine the goodness of a + ranking calculated from the predicted confidence + by comparing it to an ideal rank measured by the + original ratings. + average_rank (~.wrappers.DoubleValue): + Determines the goodness of a ranking by + computing the percentile rank from the predicted + confidence and dividing it by the original rank. + """ + + mean_average_precision = proto.Field( + proto.MESSAGE, number=1, message=wrappers.DoubleValue, + ) + + mean_squared_error = proto.Field( + proto.MESSAGE, number=2, message=wrappers.DoubleValue, + ) + + normalized_discounted_cumulative_gain = proto.Field( + proto.MESSAGE, number=3, message=wrappers.DoubleValue, + ) + + average_rank = proto.Field( + proto.MESSAGE, number=4, message=wrappers.DoubleValue, + ) + + class ArimaForecastingMetrics(proto.Message): + r"""Model evaluation metrics for ARIMA forecasting models. + + Attributes: + non_seasonal_order (Sequence[~.gcb_model.Model.ArimaOrder]): + Non-seasonal order. 
+ arima_fitting_metrics (Sequence[~.gcb_model.Model.ArimaFittingMetrics]): + Arima model fitting metrics. + seasonal_periods (Sequence[~.gcb_model.Model.SeasonalPeriod.SeasonalPeriodType]): + Seasonal periods. Repeated because multiple + periods are supported for one time series. + has_drift (Sequence[bool]): + Whether Arima model fitted with drift or not. + It is always false when d is not 1. + time_series_id (Sequence[str]): + Id to differentiate different time series for + the large-scale case. + arima_single_model_forecasting_metrics (Sequence[~.gcb_model.Model.ArimaForecastingMetrics.ArimaSingleModelForecastingMetrics]): + Repeated as there can be many metric sets + (one for each model) in auto-arima and the + large-scale case. + """ + + class ArimaSingleModelForecastingMetrics(proto.Message): + r"""Model evaluation metrics for a single ARIMA forecasting + model. + + Attributes: + non_seasonal_order (~.gcb_model.Model.ArimaOrder): + Non-seasonal order. + arima_fitting_metrics (~.gcb_model.Model.ArimaFittingMetrics): + Arima fitting metrics. + has_drift (bool): + Is arima model fitted with drift or not. It + is always false when d is not 1. + time_series_id (str): + The id to indicate different time series. + seasonal_periods (Sequence[~.gcb_model.Model.SeasonalPeriod.SeasonalPeriodType]): + Seasonal periods. Repeated because multiple + periods are supported for one time series. + """ + + non_seasonal_order = proto.Field( + proto.MESSAGE, number=1, message="Model.ArimaOrder", + ) + + arima_fitting_metrics = proto.Field( + proto.MESSAGE, number=2, message="Model.ArimaFittingMetrics", + ) + + has_drift = proto.Field(proto.BOOL, number=3) + + time_series_id = proto.Field(proto.STRING, number=4) + + seasonal_periods = proto.RepeatedField( + proto.ENUM, number=5, enum="Model.SeasonalPeriod.SeasonalPeriodType", + ) + + non_seasonal_order = proto.RepeatedField( + proto.MESSAGE, number=1, message="Model.ArimaOrder", + ) + + arima_fitting_metrics = proto.RepeatedField( + proto.MESSAGE, number=2, message="Model.ArimaFittingMetrics", + ) + + seasonal_periods = proto.RepeatedField( + proto.ENUM, number=3, enum="Model.SeasonalPeriod.SeasonalPeriodType", + ) + + has_drift = proto.RepeatedField(proto.BOOL, number=4) + + time_series_id = proto.RepeatedField(proto.STRING, number=5) + + arima_single_model_forecasting_metrics = proto.RepeatedField( + proto.MESSAGE, + number=6, + message="Model.ArimaForecastingMetrics.ArimaSingleModelForecastingMetrics", + ) + class EvaluationMetrics(proto.Message): r"""Evaluation metrics of a model. These are either computed on all training data or just the eval data based on whether eval @@ -547,6 +790,11 @@ class EvaluationMetrics(proto.Message): classification/classifier models. clustering_metrics (~.gcb_model.Model.ClusteringMetrics): Populated for clustering models. + ranking_metrics (~.gcb_model.Model.RankingMetrics): + Populated for implicit feedback type matrix + factorization models. + arima_forecasting_metrics (~.gcb_model.Model.ArimaForecastingMetrics): + Populated for ARIMA models. 
""" regression_metrics = proto.Field( @@ -571,6 +819,116 @@ class EvaluationMetrics(proto.Message): proto.MESSAGE, number=4, oneof="metrics", message="Model.ClusteringMetrics", ) + ranking_metrics = proto.Field( + proto.MESSAGE, number=5, oneof="metrics", message="Model.RankingMetrics", + ) + + arima_forecasting_metrics = proto.Field( + proto.MESSAGE, + number=6, + oneof="metrics", + message="Model.ArimaForecastingMetrics", + ) + + class DataSplitResult(proto.Message): + r"""Data split result. This contains references to the training + and evaluation data tables that were used to train the model. + + Attributes: + training_table (~.table_reference.TableReference): + Table reference of the training data after + split. + evaluation_table (~.table_reference.TableReference): + Table reference of the evaluation data after + split. + """ + + training_table = proto.Field( + proto.MESSAGE, number=1, message=table_reference.TableReference, + ) + + evaluation_table = proto.Field( + proto.MESSAGE, number=2, message=table_reference.TableReference, + ) + + class ArimaOrder(proto.Message): + r"""Arima order, can be used for both non-seasonal and seasonal + parts. + + Attributes: + p (int): + Order of the autoregressive part. + d (int): + Order of the differencing part. + q (int): + Order of the moving-average part. + """ + + p = proto.Field(proto.INT64, number=1) + + d = proto.Field(proto.INT64, number=2) + + q = proto.Field(proto.INT64, number=3) + + class ArimaFittingMetrics(proto.Message): + r"""ARIMA model fitting metrics. + + Attributes: + log_likelihood (float): + Log-likelihood. + aic (float): + AIC. + variance (float): + Variance. + """ + + log_likelihood = proto.Field(proto.DOUBLE, number=1) + + aic = proto.Field(proto.DOUBLE, number=2) + + variance = proto.Field(proto.DOUBLE, number=3) + + class GlobalExplanation(proto.Message): + r"""Global explanations containing the top most important + features after training. + + Attributes: + explanations (Sequence[~.gcb_model.Model.GlobalExplanation.Explanation]): + A list of the top global explanations. Sorted + by absolute value of attribution in descending + order. + class_label (str): + Class label for this set of global + explanations. Will be empty/null for binary + logistic and linear regression models. Sorted + alphabetically in descending order. + """ + + class Explanation(proto.Message): + r"""Explanation for a single feature. + + Attributes: + feature_name (str): + Full name of the feature. For non-numerical features, will + be formatted like .. + Overall size of feature name will always be truncated to + first 120 characters. + attribution (~.wrappers.DoubleValue): + Attribution of feature. + """ + + feature_name = proto.Field(proto.STRING, number=1) + + attribution = proto.Field( + proto.MESSAGE, number=2, message=wrappers.DoubleValue, + ) + + explanations = proto.RepeatedField( + proto.MESSAGE, number=1, message="Model.GlobalExplanation.Explanation", + ) + + class_label = proto.Field(proto.STRING, number=2) + class TrainingRun(proto.Message): r"""Information about a single training query run for the model. @@ -587,6 +945,14 @@ class TrainingRun(proto.Message): evaluation_metrics (~.gcb_model.Model.EvaluationMetrics): The evaluation metrics over training/eval data that were computed at the end of training. + data_split_result (~.gcb_model.Model.DataSplitResult): + Data split result of the training run. Only + set when the input data is actually split. 
+ global_explanations (Sequence[~.gcb_model.Model.GlobalExplanation]): + Global explanations for important features of + the model. For multi-class models, there is one + entry for each label class. For other models, + there is only one entry in the list. """ class TrainingOptions(proto.Message): @@ -651,6 +1017,12 @@ class TrainingOptions(proto.Message): Weights associated with each label class, for rebalancing the training data. Only applicable for classification models. + user_column (str): + User column specified for matrix + factorization models. + item_column (str): + Item column specified for matrix + factorization models. distance_type (~.gcb_model.Model.DistanceType): Distance type for clustering models. num_clusters (int): @@ -661,12 +1033,71 @@ class TrainingOptions(proto.Message): optimization_strategy (~.gcb_model.Model.OptimizationStrategy): Optimization strategy for training linear regression models. + hidden_units (Sequence[int]): + Hidden units for dnn models. + batch_size (int): + Batch size for dnn models. + dropout (~.wrappers.DoubleValue): + Dropout probability for dnn models. + max_tree_depth (int): + Maximum depth of a tree for boosted tree + models. + subsample (float): + Subsample fraction of the training data to + grow tree to prevent overfitting for boosted + tree models. + min_split_loss (~.wrappers.DoubleValue): + Minimum split loss for boosted tree models. + num_factors (int): + Num factors specified for matrix + factorization models. + feedback_type (~.gcb_model.Model.FeedbackType): + Feedback type that specifies which algorithm + to run for matrix factorization. + wals_alpha (~.wrappers.DoubleValue): + Hyperparameter for matrix factoration when + implicit feedback type is specified. kmeans_initialization_method (~.gcb_model.Model.KmeansEnums.KmeansInitializationMethod): The method used to initialize the centroids for kmeans algorithm. kmeans_initialization_column (str): The column used to provide the initial centroids for kmeans algorithm when kmeans_initialization_method is CUSTOM. + time_series_timestamp_column (str): + Column to be designated as time series + timestamp for ARIMA model. + time_series_data_column (str): + Column to be designated as time series data + for ARIMA model. + auto_arima (bool): + Whether to enable auto ARIMA or not. + non_seasonal_order (~.gcb_model.Model.ArimaOrder): + A specification of the non-seasonal part of + the ARIMA model: the three components (p, d, q) + are the AR order, the degree of differencing, + and the MA order. + data_frequency (~.gcb_model.Model.DataFrequency): + The data frequency of a time series. + include_drift (bool): + Include drift when fitting an ARIMA model. + holiday_region (~.gcb_model.Model.HolidayRegion): + The geographical region based on which the + holidays are considered in time series modeling. + If a valid value is specified, then holiday + effects modeling is enabled. + time_series_id_column (str): + The id column that will be used to indicate + different time series to forecast in parallel. + horizon (int): + The number of periods ahead that need to be + forecasted. + preserve_input_structs (bool): + Whether to preserve the input structs in output feature + names. Suppose there is a struct A with field b. When false + (default), the output feature name is A_b. When true, the + output feature name is A.b. + auto_arima_max_order (int): + The max value of non-seasonal p and q. 
""" max_iterations = proto.Field(proto.INT64, number=1) @@ -713,6 +1144,10 @@ class TrainingOptions(proto.Message): label_class_weights = proto.MapField(proto.STRING, proto.DOUBLE, number=17) + user_column = proto.Field(proto.STRING, number=18) + + item_column = proto.Field(proto.STRING, number=19) + distance_type = proto.Field( proto.ENUM, number=20, enum="Model.DistanceType", ) @@ -725,6 +1160,32 @@ class TrainingOptions(proto.Message): proto.ENUM, number=23, enum="Model.OptimizationStrategy", ) + hidden_units = proto.RepeatedField(proto.INT64, number=24) + + batch_size = proto.Field(proto.INT64, number=25) + + dropout = proto.Field( + proto.MESSAGE, number=26, message=wrappers.DoubleValue, + ) + + max_tree_depth = proto.Field(proto.INT64, number=27) + + subsample = proto.Field(proto.DOUBLE, number=28) + + min_split_loss = proto.Field( + proto.MESSAGE, number=29, message=wrappers.DoubleValue, + ) + + num_factors = proto.Field(proto.INT64, number=30) + + feedback_type = proto.Field( + proto.ENUM, number=31, enum="Model.FeedbackType", + ) + + wals_alpha = proto.Field( + proto.MESSAGE, number=32, message=wrappers.DoubleValue, + ) + kmeans_initialization_method = proto.Field( proto.ENUM, number=33, @@ -733,6 +1194,34 @@ class TrainingOptions(proto.Message): kmeans_initialization_column = proto.Field(proto.STRING, number=34) + time_series_timestamp_column = proto.Field(proto.STRING, number=35) + + time_series_data_column = proto.Field(proto.STRING, number=36) + + auto_arima = proto.Field(proto.BOOL, number=37) + + non_seasonal_order = proto.Field( + proto.MESSAGE, number=38, message="Model.ArimaOrder", + ) + + data_frequency = proto.Field( + proto.ENUM, number=39, enum="Model.DataFrequency", + ) + + include_drift = proto.Field(proto.BOOL, number=41) + + holiday_region = proto.Field( + proto.ENUM, number=42, enum="Model.HolidayRegion", + ) + + time_series_id_column = proto.Field(proto.STRING, number=43) + + horizon = proto.Field(proto.INT64, number=44) + + preserve_input_structs = proto.Field(proto.BOOL, number=45) + + auto_arima_max_order = proto.Field(proto.INT64, number=46) + class IterationResult(proto.Message): r"""Information about a single iteration of the training run. @@ -753,6 +1242,8 @@ class IterationResult(proto.Message): cluster_infos (Sequence[~.gcb_model.Model.TrainingRun.IterationResult.ClusterInfo]): Information about top clusters for clustering models. + arima_result (~.gcb_model.Model.TrainingRun.IterationResult.ArimaResult): + """ class ClusterInfo(proto.Message): @@ -779,6 +1270,102 @@ class ClusterInfo(proto.Message): proto.MESSAGE, number=3, message=wrappers.Int64Value, ) + class ArimaResult(proto.Message): + r"""(Auto-)arima fitting result. Wrap everything in ArimaResult + for easier refactoring if we want to use model-specific + iteration results. + + Attributes: + arima_model_info (Sequence[~.gcb_model.Model.TrainingRun.IterationResult.ArimaResult.ArimaModelInfo]): + This message is repeated because there are + multiple arima models fitted in auto-arima. For + non-auto-arima model, its size is one. + seasonal_periods (Sequence[~.gcb_model.Model.SeasonalPeriod.SeasonalPeriodType]): + Seasonal periods. Repeated because multiple + periods are supported for one time series. + """ + + class ArimaCoefficients(proto.Message): + r"""Arima coefficients. + + Attributes: + auto_regressive_coefficients (Sequence[float]): + Auto-regressive coefficients, an array of + double. + moving_average_coefficients (Sequence[float]): + Moving-average coefficients, an array of + double. 
+ intercept_coefficient (float): + Intercept coefficient, just a double not an + array. + """ + + auto_regressive_coefficients = proto.RepeatedField( + proto.DOUBLE, number=1 + ) + + moving_average_coefficients = proto.RepeatedField( + proto.DOUBLE, number=2 + ) + + intercept_coefficient = proto.Field(proto.DOUBLE, number=3) + + class ArimaModelInfo(proto.Message): + r"""Arima model information. + + Attributes: + non_seasonal_order (~.gcb_model.Model.ArimaOrder): + Non-seasonal order. + arima_coefficients (~.gcb_model.Model.TrainingRun.IterationResult.ArimaResult.ArimaCoefficients): + Arima coefficients. + arima_fitting_metrics (~.gcb_model.Model.ArimaFittingMetrics): + Arima fitting metrics. + has_drift (bool): + Whether Arima model fitted with drift or not. + It is always false when d is not 1. + time_series_id (str): + The id to indicate different time series. + seasonal_periods (Sequence[~.gcb_model.Model.SeasonalPeriod.SeasonalPeriodType]): + Seasonal periods. Repeated because multiple + periods are supported for one time series. + """ + + non_seasonal_order = proto.Field( + proto.MESSAGE, number=1, message="Model.ArimaOrder", + ) + + arima_coefficients = proto.Field( + proto.MESSAGE, + number=2, + message="Model.TrainingRun.IterationResult.ArimaResult.ArimaCoefficients", + ) + + arima_fitting_metrics = proto.Field( + proto.MESSAGE, number=3, message="Model.ArimaFittingMetrics", + ) + + has_drift = proto.Field(proto.BOOL, number=4) + + time_series_id = proto.Field(proto.STRING, number=5) + + seasonal_periods = proto.RepeatedField( + proto.ENUM, + number=6, + enum="Model.SeasonalPeriod.SeasonalPeriodType", + ) + + arima_model_info = proto.RepeatedField( + proto.MESSAGE, + number=1, + message="Model.TrainingRun.IterationResult.ArimaResult.ArimaModelInfo", + ) + + seasonal_periods = proto.RepeatedField( + proto.ENUM, + number=2, + enum="Model.SeasonalPeriod.SeasonalPeriodType", + ) + index = proto.Field(proto.MESSAGE, number=1, message=wrappers.Int32Value,) duration_ms = proto.Field( @@ -801,6 +1388,12 @@ class ClusterInfo(proto.Message): message="Model.TrainingRun.IterationResult.ClusterInfo", ) + arima_result = proto.Field( + proto.MESSAGE, + number=9, + message="Model.TrainingRun.IterationResult.ArimaResult", + ) + training_options = proto.Field( proto.MESSAGE, number=1, message="Model.TrainingRun.TrainingOptions", ) @@ -815,6 +1408,14 @@ class ClusterInfo(proto.Message): proto.MESSAGE, number=7, message="Model.EvaluationMetrics", ) + data_split_result = proto.Field( + proto.MESSAGE, number=9, message="Model.DataSplitResult", + ) + + global_explanations = proto.RepeatedField( + proto.MESSAGE, number=10, message="Model.GlobalExplanation", + ) + etag = proto.Field(proto.STRING, number=1) model_reference = proto.Field( diff --git a/google/cloud/bigquery_v2/types/standard_sql.py b/google/cloud/bigquery_v2/types/standard_sql.py index 72f12f284..1a32a3c75 100644 --- a/google/cloud/bigquery_v2/types/standard_sql.py +++ b/google/cloud/bigquery_v2/types/standard_sql.py @@ -58,6 +58,7 @@ class TypeKind(proto.Enum): DATETIME = 21 GEOGRAPHY = 22 NUMERIC = 23 + BIGNUMERIC = 24 ARRAY = 16 STRUCT = 17 diff --git a/google/cloud/bigquery_v2/types/table_reference.py b/google/cloud/bigquery_v2/types/table_reference.py new file mode 100644 index 000000000..d213e8bb6 --- /dev/null +++ b/google/cloud/bigquery_v2/types/table_reference.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import proto # type: ignore + + +__protobuf__ = proto.module( + package="google.cloud.bigquery.v2", manifest={"TableReference",}, +) + + +class TableReference(proto.Message): + r""" + + Attributes: + project_id (str): + Required. The ID of the project containing + this table. + dataset_id (str): + Required. The ID of the dataset containing + this table. + table_id (str): + Required. The ID of the table. The ID must contain only + letters (a-z, A-Z), numbers (0-9), or underscores (_). The + maximum length is 1,024 characters. Certain operations allow + suffixing of the table ID with a partition decorator, such + as ``sample_table$20190123``. + """ + + project_id = proto.Field(proto.STRING, number=1) + + dataset_id = proto.Field(proto.STRING, number=2) + + table_id = proto.Field(proto.STRING, number=3) + + +__all__ = tuple(sorted(__protobuf__.manifest)) diff --git a/synth.metadata b/synth.metadata index 008810630..db77e463d 100644 --- a/synth.metadata +++ b/synth.metadata @@ -4,15 +4,15 @@ "git": { "name": ".", "remote": "https://github.com/googleapis/python-bigquery.git", - "sha": "31644d380b35a76a9147801a4b6b0271c246fd0c" + "sha": "5178b55682f5e264bfc082cde26acb1fdc953a18" } }, { "git": { "name": "googleapis", "remote": "https://github.com/googleapis/googleapis.git", - "sha": "c941026e5e3d600817a20e9ab4d4be03dff21a68", - "internalRef": "334645418" + "sha": "215c12ade72d9d9616457d9b8b2f8a37f38e79f3", + "internalRef": "337113354" } }, { @@ -101,18 +101,19 @@ "google/cloud/bigquery_v2/proto/model.proto", "google/cloud/bigquery_v2/proto/model_reference.proto", "google/cloud/bigquery_v2/proto/standard_sql.proto", + "google/cloud/bigquery_v2/proto/table_reference.proto", "google/cloud/bigquery_v2/py.typed", "google/cloud/bigquery_v2/types/__init__.py", "google/cloud/bigquery_v2/types/encryption_config.py", "google/cloud/bigquery_v2/types/model.py", "google/cloud/bigquery_v2/types/model_reference.py", "google/cloud/bigquery_v2/types/standard_sql.py", + "google/cloud/bigquery_v2/types/table_reference.py", "mypy.ini", "renovate.json", "samples/AUTHORING_GUIDE.md", "samples/CONTRIBUTING.md", "scripts/decrypt-secrets.sh", - "scripts/fixup_bigquery_v2_keywords.py", "scripts/readme-gen/readme_gen.py", "scripts/readme-gen/templates/README.tmpl.rst", "scripts/readme-gen/templates/auth.tmpl.rst", From 82290c365e6b18e9d5c3a94a312f0326df9354bc Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Mon, 19 Oct 2020 15:04:03 +0000 Subject: [PATCH 18/18] chore: release 2.2.0 (#321) :robot: I have created a release \*beep\* \*boop\* --- ## [2.2.0](https://www.github.com/googleapis/python-bigquery/compare/v2.1.0...v2.2.0) (2020-10-19) ### Features * add method api_repr for table list item ([#299](https://www.github.com/googleapis/python-bigquery/issues/299)) ([07c70f0](https://www.github.com/googleapis/python-bigquery/commit/07c70f0292f9212f0c968cd5c9206e8b0409c0da)) * add support for listing arima, automl, boosted tree, DNN, and matrix factorization models 
([#328](https://www.github.com/googleapis/python-bigquery/issues/328)) ([502a092](https://www.github.com/googleapis/python-bigquery/commit/502a0926018abf058cb84bd18043c25eba15a2cc))
* add timeout parameter to load_table_from_file and its dependent methods (see the sketch below) ([#327](https://www.github.com/googleapis/python-bigquery/issues/327)) ([b0dd892](https://www.github.com/googleapis/python-bigquery/commit/b0dd892176e31ac25fddd15554b5bfa054299d4d))
* add to_api_repr method to Model ([#326](https://www.github.com/googleapis/python-bigquery/issues/326)) ([fb401bd](https://www.github.com/googleapis/python-bigquery/commit/fb401bd94477323bba68cf252dd88166495daf54))
* allow client options to be set in magics context ([#322](https://www.github.com/googleapis/python-bigquery/issues/322)) ([5178b55](https://www.github.com/googleapis/python-bigquery/commit/5178b55682f5e264bfc082cde26acb1fdc953a18))


### Bug Fixes

* make TimePartitioning repr evaluable ([#110](https://www.github.com/googleapis/python-bigquery/issues/110)) ([20f473b](https://www.github.com/googleapis/python-bigquery/commit/20f473bfff5ae98377f5d9cdf18bfe5554d86ff4)), closes [#109](https://www.github.com/googleapis/python-bigquery/issues/109)
* use version.py instead of pkg_resources.get_distribution ([#307](https://www.github.com/googleapis/python-bigquery/issues/307)) ([b8f502b](https://www.github.com/googleapis/python-bigquery/commit/b8f502b14f21d1815697e4d57cf1225dfb4a7c5e))


### Performance Improvements

* add size parameter for load table from dataframe and json methods ([#280](https://www.github.com/googleapis/python-bigquery/issues/280)) ([3be78b7](https://www.github.com/googleapis/python-bigquery/commit/3be78b737add7111e24e912cd02fc6df75a07de6))


### Documentation

* update clustering field docstrings ([#286](https://www.github.com/googleapis/python-bigquery/issues/286)) ([5ea1ece](https://www.github.com/googleapis/python-bigquery/commit/5ea1ece2d911cdd1f3d9549ee01559ce8ed8269a)), closes [#285](https://www.github.com/googleapis/python-bigquery/issues/285)
* update snippets samples to support version 2.0 ([#309](https://www.github.com/googleapis/python-bigquery/issues/309)) ([61634be](https://www.github.com/googleapis/python-bigquery/commit/61634be9bf9e3df7589fc1bfdbda87288859bb13))


### Dependencies

* add protobuf dependency ([#306](https://www.github.com/googleapis/python-bigquery/issues/306)) ([cebb5e0](https://www.github.com/googleapis/python-bigquery/commit/cebb5e0e911e8c9059bc8c9e7fce4440e518bff3)), closes [#305](https://www.github.com/googleapis/python-bigquery/issues/305)
* require pyarrow for pandas support ([#314](https://www.github.com/googleapis/python-bigquery/issues/314)) ([801e4c0](https://www.github.com/googleapis/python-bigquery/commit/801e4c0574b7e421aa3a28cafec6fd6bcce940dd)), closes [#265](https://www.github.com/googleapis/python-bigquery/issues/265)

---

This PR was generated with [Release Please](https://github.com/googleapis/release-please).
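For readers picking up the new `timeout` argument from the feature list above, here is a minimal sketch (not part of the released patch). The CSV file name and destination table ID are hypothetical placeholders; the surrounding calls follow the public `google-cloud-bigquery` client API as of 2.2.0.

```python
# Hedged sketch: pass the new ``timeout`` argument (in seconds) through
# ``load_table_from_file``. The file and table names are hypothetical.
from google.cloud import bigquery

client = bigquery.Client()

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    autodetect=True,  # infer the schema from the CSV contents
)

with open("data.csv", "rb") as source_file:  # hypothetical local file
    load_job = client.load_table_from_file(
        source_file,
        "my-project.my_dataset.my_table",  # hypothetical destination table
        job_config=job_config,
        timeout=60.0,  # new in 2.2.0: seconds to wait for each API request
    )

load_job.result()  # block until the load job finishes
print("Loaded {} rows.".format(load_job.output_rows))
```

The same `timeout` keyword is forwarded by the methods that build on `load_table_from_file` (for example the dataframe and JSON loaders mentioned above), so the pattern carries over unchanged.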
---
 CHANGELOG.md                     | 34 ++++++++++++++++++++++++++++++++
 google/cloud/bigquery/version.py |  2 +-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ad6c9551f..384704bbf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,40 @@

 [1]: https://pypi.org/project/google-cloud-bigquery/#history

+## [2.2.0](https://www.github.com/googleapis/python-bigquery/compare/v2.1.0...v2.2.0) (2020-10-19)
+
+
+### Features
+
+* add method api_repr for table list item ([#299](https://www.github.com/googleapis/python-bigquery/issues/299)) ([07c70f0](https://www.github.com/googleapis/python-bigquery/commit/07c70f0292f9212f0c968cd5c9206e8b0409c0da))
+* add support for listing arima, automl, boosted tree, DNN, and matrix factorization models ([#328](https://www.github.com/googleapis/python-bigquery/issues/328)) ([502a092](https://www.github.com/googleapis/python-bigquery/commit/502a0926018abf058cb84bd18043c25eba15a2cc))
+* add timeout parameter to load_table_from_file and its dependent methods ([#327](https://www.github.com/googleapis/python-bigquery/issues/327)) ([b0dd892](https://www.github.com/googleapis/python-bigquery/commit/b0dd892176e31ac25fddd15554b5bfa054299d4d))
+* add to_api_repr method to Model ([#326](https://www.github.com/googleapis/python-bigquery/issues/326)) ([fb401bd](https://www.github.com/googleapis/python-bigquery/commit/fb401bd94477323bba68cf252dd88166495daf54))
+* allow client options to be set in magics context ([#322](https://www.github.com/googleapis/python-bigquery/issues/322)) ([5178b55](https://www.github.com/googleapis/python-bigquery/commit/5178b55682f5e264bfc082cde26acb1fdc953a18))
+
+
+### Bug Fixes
+
+* make TimePartitioning repr evaluable ([#110](https://www.github.com/googleapis/python-bigquery/issues/110)) ([20f473b](https://www.github.com/googleapis/python-bigquery/commit/20f473bfff5ae98377f5d9cdf18bfe5554d86ff4)), closes [#109](https://www.github.com/googleapis/python-bigquery/issues/109)
+* use version.py instead of pkg_resources.get_distribution ([#307](https://www.github.com/googleapis/python-bigquery/issues/307)) ([b8f502b](https://www.github.com/googleapis/python-bigquery/commit/b8f502b14f21d1815697e4d57cf1225dfb4a7c5e))
+
+
+### Performance Improvements
+
+* add size parameter for load table from dataframe and json methods ([#280](https://www.github.com/googleapis/python-bigquery/issues/280)) ([3be78b7](https://www.github.com/googleapis/python-bigquery/commit/3be78b737add7111e24e912cd02fc6df75a07de6))
+
+
+### Documentation
+
+* update clustering field docstrings ([#286](https://www.github.com/googleapis/python-bigquery/issues/286)) ([5ea1ece](https://www.github.com/googleapis/python-bigquery/commit/5ea1ece2d911cdd1f3d9549ee01559ce8ed8269a)), closes [#285](https://www.github.com/googleapis/python-bigquery/issues/285)
+* update snippets samples to support version 2.0 ([#309](https://www.github.com/googleapis/python-bigquery/issues/309)) ([61634be](https://www.github.com/googleapis/python-bigquery/commit/61634be9bf9e3df7589fc1bfdbda87288859bb13))
+
+
+### Dependencies
+
+* add protobuf dependency ([#306](https://www.github.com/googleapis/python-bigquery/issues/306)) ([cebb5e0](https://www.github.com/googleapis/python-bigquery/commit/cebb5e0e911e8c9059bc8c9e7fce4440e518bff3)), closes [#305](https://www.github.com/googleapis/python-bigquery/issues/305)
+* require pyarrow for pandas support ([#314](https://www.github.com/googleapis/python-bigquery/issues/314)) ([801e4c0](https://www.github.com/googleapis/python-bigquery/commit/801e4c0574b7e421aa3a28cafec6fd6bcce940dd)), closes [#265](https://www.github.com/googleapis/python-bigquery/issues/265)
+
 ## [2.1.0](https://www.github.com/googleapis/python-bigquery/compare/v2.0.0...v2.1.0) (2020-10-08)

diff --git a/google/cloud/bigquery/version.py b/google/cloud/bigquery/version.py
index 8b5d3328c..bd0f8e5c7 100644
--- a/google/cloud/bigquery/version.py
+++ b/google/cloud/bigquery/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "2.1.0"
+__version__ = "2.2.0"
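As a closing illustration, the `TableReference` message added earlier in this series (`google/cloud/bigquery_v2/types/table_reference.py`) is an ordinary proto-plus type and can be constructed directly with keyword arguments. A minimal sketch, assuming the 2.2.0 package layout; the project, dataset, and table IDs are hypothetical placeholders, not values from these patches.

```python
# Hedged sketch: construct the generated TableReference message directly.
# The IDs below are hypothetical placeholders.
from google.cloud.bigquery_v2.types import TableReference

ref = TableReference(
    project_id="my-project",   # hypothetical project
    dataset_id="my_dataset",   # hypothetical dataset
    table_id="sample_table",   # letters, numbers, and underscores only
)

# proto-plus messages expose plain attribute access.
print(ref.project_id, ref.dataset_id, ref.table_id)
```

This mirrors the REST API's `tableReference` resource shape, which is why the field names and docstrings in the generated message match the BigQuery v2 documentation.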