From aa4b65596d770306812dc4567db36108020f397b Mon Sep 17 00:00:00 2001
From: Shenyang Cai
Date: Fri, 13 Sep 2024 12:11:07 -0700
Subject: [PATCH 1/6] refactor: use dataclass for Context to reduce
 boilerplate (#53)

* refactor: use dataclass for Context to reduce boilerplate

* Fix test cover workflow. Ref: https://github.com/googleapis/python-bigquery-dataframes/pull/948

* Fix field name

* fix docstring indentation
---
 .github/workflows/unittest.yml |   1 +
 bigquery_magics/config.py      | 170 ++++++++++++++------------------
 owlbot.py                      |   1 +
 3 files changed, 74 insertions(+), 98 deletions(-)

diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 495a7ef..2f025c8 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -30,6 +30,7 @@ jobs:
         with:
           name: coverage-artifact-${{ matrix.python }}
           path: .coverage-${{ matrix.python }}
+          include-hidden-files: true

   cover:
     runs-on: ubuntu-latest

diff --git a/bigquery_magics/config.py b/bigquery_magics/config.py
index 3971fdd..0cffb44 100644
--- a/bigquery_magics/config.py
+++ b/bigquery_magics/config.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from dataclasses import dataclass
+
 import google.api_core.client_options as client_options
 import google.cloud.bigquery as bigquery
 import pydata_google_auth
@@ -23,6 +25,7 @@ def _get_default_credentials_with_project():
     return pydata_google_auth.default(scopes=_SCOPES, use_local_webserver=False)


+@dataclass
 class Context(object):
     """Storage for objects to be used throughout an IPython notebook session.

@@ -30,14 +33,75 @@ class Context(object):
     and can be found at ``bigquery_magics.context``.
     """

-    def __init__(self):
-        self._credentials = None
-        self._project = None
-        self._connection = None
-        self._default_query_job_config = bigquery.QueryJobConfig()
-        self._bigquery_client_options = client_options.ClientOptions()
-        self._bqstorage_client_options = client_options.ClientOptions()
-        self._progress_bar_type = "tqdm_notebook"
+    _credentials = None
+    _project = None
+    _connection = None
+
+    default_query_job_config = bigquery.QueryJobConfig()
+    """google.cloud.bigquery.job.QueryJobConfig: Default job
+    configuration for queries.
+
+    The context's :class:`~google.cloud.bigquery.job.QueryJobConfig` is
+    used for queries. Some properties can be overridden with arguments to
+    the magics.
+
+    Example:
+        Manually setting the default value for ``maximum_bytes_billed``
+        to 100 MB:
+
+        >>> from google.cloud.bigquery import magics
+        >>> bigquery_magics.context.default_query_job_config.maximum_bytes_billed = 100000000
+    """
+
+    bigquery_client_options = client_options.ClientOptions()
+    """google.api_core.client_options.ClientOptions: client options to be
+    used through IPython magics.
+
+    Note::
+        The client options do not need to be explicitly defined if no
+        special network connections are required. Normally you would be
+        using the https://bigquery.googleapis.com/ end point.
+
+    Example:
+        Manually setting the endpoint:
+
+        >>> from google.cloud.bigquery import magics
+        >>> client_options = {}
+        >>> client_options['api_endpoint'] = "https://some.special.url"
+        >>> bigquery_magics.context.bigquery_client_options = client_options
+    """
+
+    bqstorage_client_options = client_options.ClientOptions()
+    """google.api_core.client_options.ClientOptions: client options to be
+    used through IPython magics for the storage client.
+
+    Note::
+        The client options do not need to be explicitly defined if no
+        special network connections are required. Normally you would be
+        using the https://bigquerystorage.googleapis.com/ end point.
+
+    Example:
+        Manually setting the endpoint:
+
+        >>> from google.cloud.bigquery import magics
+        >>> client_options = {}
+        >>> client_options['api_endpoint'] = "https://some.special.url"
+        >>> bigquery_magics.context.bqstorage_client_options = client_options
+    """
+
+    progress_bar_type = "tqdm_notebook"
+    """str: Default progress bar type to use to display progress bar while
+    executing queries through IPython magics.
+
+    Note::
+        Install the ``tqdm`` package to use this feature.
+
+    Example:
+        Manually setting the progress_bar_type:
+
+        >>> from google.cloud.bigquery import magics
+        >>> bigquery_magics.context.progress_bar_type = "tqdm_notebook"
+    """

     @property
     def credentials(self):
@@ -99,95 +163,5 @@ def project(self):
     def project(self, value):
         self._project = value

-    @property
-    def bigquery_client_options(self):
-        """google.api_core.client_options.ClientOptions: client options to be
-        used through IPython magics.
-
-        Note::
-            The client options do not need to be explicitly defined if no
-            special network connections are required. Normally you would be
-            using the https://bigquery.googleapis.com/ end point.
-
-        Example:
-            Manually setting the endpoint:
-
-            >>> from google.cloud.bigquery import magics
-            >>> client_options = {}
-            >>> client_options['api_endpoint'] = "https://some.special.url"
-            >>> bigquery_magics.context.bigquery_client_options = client_options
-        """
-        return self._bigquery_client_options
-
-    @bigquery_client_options.setter
-    def bigquery_client_options(self, value):
-        self._bigquery_client_options = value
-
-    @property
-    def bqstorage_client_options(self):
-        """google.api_core.client_options.ClientOptions: client options to be
-        used through IPython magics for the storage client.
-
-        Note::
-            The client options do not need to be explicitly defined if no
-            special network connections are required. Normally you would be
-            using the https://bigquerystorage.googleapis.com/ end point.
-
-        Example:
-            Manually setting the endpoint:
-
-            >>> from google.cloud.bigquery import magics
-            >>> client_options = {}
-            >>> client_options['api_endpoint'] = "https://some.special.url"
-            >>> bigquery_magics.context.bqstorage_client_options = client_options
-        """
-        return self._bqstorage_client_options
-
-    @bqstorage_client_options.setter
-    def bqstorage_client_options(self, value):
-        self._bqstorage_client_options = value
-
-    @property
-    def default_query_job_config(self):
-        """google.cloud.bigquery.job.QueryJobConfig: Default job
-        configuration for queries.
-
-        The context's :class:`~google.cloud.bigquery.job.QueryJobConfig` is
-        used for queries. Some properties can be overridden with arguments to
-        the magics.
-
-        Example:
-            Manually setting the default value for ``maximum_bytes_billed``
-            to 100 MB:
-
-            >>> from google.cloud.bigquery import magics
-            >>> bigquery_magics.context.default_query_job_config.maximum_bytes_billed = 100000000
-        """
-        return self._default_query_job_config
-
-    @default_query_job_config.setter
-    def default_query_job_config(self, value):
-        self._default_query_job_config = value
-
-    @property
-    def progress_bar_type(self):
-        """str: Default progress bar type to use to display progress bar while
-        executing queries through IPython magics.
-
-        Note::
-            Install the ``tqdm`` package to use this feature.
-
-        Example:
-            Manually setting the progress_bar_type:
-
-            >>> from google.cloud.bigquery import magics
-            >>> bigquery_magics.context.progress_bar_type = "tqdm_notebook"
-        """
-        return self._progress_bar_type
-
-    @progress_bar_type.setter
-    def progress_bar_type(self, value):
-        self._progress_bar_type = value
-

 context = Context()

diff --git a/owlbot.py b/owlbot.py
index ca0d20e..a05ee95 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -57,6 +57,7 @@
         # creating clients, not the end user.
         "docs/multiprocessing.rst",
         "README.rst",
+        ".github/workflows/unittest.yml",
     ],
 )
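A minimal sketch of the notebook-facing configuration that this refactor preserves, assembled from the docstrings above (the endpoint and byte-limit values are illustrative, not defaults):

    >>> import bigquery_magics
    >>> # Cap costs: fail queries that would process more than 100 MB.
    >>> bigquery_magics.context.default_query_job_config.maximum_bytes_billed = 100000000
    >>> # Route the BigQuery client to a specific endpoint (rarely needed).
    >>> bigquery_magics.context.bigquery_client_options = {"api_endpoint": "https://bigquery.googleapis.com"}
    >>> # Progress bar implementation shown while queries run (requires tqdm).
    >>> bigquery_magics.context.progress_bar_type = "tqdm_notebook"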
""" + + params, args = _parse_magic_args(line) + + query = query.strip() + if not query: + error = ValueError("Query is missing.") + _handle_error(error, args.destination_var) + return + query = _validate_and_resolve_query(query, args) + + bq_client, bqstorage_client = _create_clients(args) + + try: + return _make_bq_query( + query, + args=args, + params=params, + bq_client=bq_client, + bqstorage_client=bqstorage_client, + ) + finally: + _close_transports(bq_client, bqstorage_client) + + +def _parse_magic_args(line: str) -> Tuple[List[Any], Any]: # The built-in parser does not recognize Python structures such as dicts, thus # we extract the "--params" option and inteprpret it separately. try: params_option_value, rest_of_args = _split_args_line(line) - except lap.exceptions.QueryParamsParseError as exc: - rebranded_error = SyntaxError( + + except lap.QueryParamsParseError as exc: + raise SyntaxError( "--params is not a correctly formatted JSON string or a JSON " "serializable dictionary" - ) - raise rebranded_error from exc - except lap.exceptions.DuplicateQueryParamsError as exc: - rebranded_error = ValueError("Duplicate --params option.") - raise rebranded_error from exc - except lap.exceptions.ParseError as exc: - rebranded_error = ValueError( - "Unrecognized input, are option values correct? " - "Error details: {}".format(exc.args[0]) - ) - raise rebranded_error from exc + ) from exc - args = magic_arguments.parse_argstring(_cell_magic, rest_of_args) + except lap.DuplicateQueryParamsError as exc: + raise ValueError("Duplicate --params option.") from exc - if args.use_bqstorage_api is not None: - warnings.warn( - "Deprecated option --use_bqstorage_api, the BigQuery " - "Storage API is already used by default.", - category=DeprecationWarning, - ) - use_bqstorage_api = not args.use_rest_api and (bigquery_storage is not None) - location = args.location + except lap.ParseError as exc: + raise ValueError( + "Unrecognized input, are option values correct? " + "Error details: {}".format(exc.args[0]) + ) from exc params = [] if params_option_value: @@ -406,8 +423,28 @@ def _cell_magic(line, query): params = _helpers.to_query_parameters(ast.literal_eval(params_option_value), {}) - project = args.project or context.project + return params, magic_arguments.parse_argstring(_cell_magic, rest_of_args) + + +def _split_args_line(line: str) -> Tuple[str, str]: + """Split out the --params option value from the input line arguments. + + Args: + line: The line arguments passed to the cell magic. + + Returns: + A tuple of two strings. The first is param option value and + the second is the rest of the arguments. 
+ """ + tree = lap.Parser(lap.Lexer(line)).input_line() + + extractor = lap.QueryParamsExtractor() + params_option_value, rest_of_args = extractor.visit(tree) + return params_option_value, rest_of_args + + +def _create_clients(args: Any) -> Tuple[bigquery.Client, Any]: bigquery_client_options = copy.deepcopy(context.bigquery_client_options) if args.bigquery_api_endpoint: if isinstance(bigquery_client_options, dict): @@ -415,16 +452,28 @@ def _cell_magic(line, query): else: bigquery_client_options.api_endpoint = args.bigquery_api_endpoint - client = bigquery.Client( - project=project, + bq_client = bigquery.Client( + project=args.project or context.project, credentials=context.credentials, default_query_job_config=context.default_query_job_config, client_info=client_info.ClientInfo(user_agent=USER_AGENT), client_options=bigquery_client_options, - location=location, + location=args.location, ) if context._connection: - client._connection = context._connection + bq_client._connection = context._connection + + # Check and instantiate bq storage client + if args.use_bqstorage_api is not None: + warnings.warn( + "Deprecated option --use_bqstorage_api, the BigQuery " + "Storage API is already used by default.", + category=DeprecationWarning, + ) + use_bqstorage_api = not args.use_rest_api and (bigquery_storage is not None) + + if not use_bqstorage_api: + return bq_client, None bqstorage_client_options = copy.deepcopy(context.bqstorage_client_options) if args.bqstorage_api_endpoint: @@ -434,111 +483,71 @@ def _cell_magic(line, query): bqstorage_client_options.api_endpoint = args.bqstorage_api_endpoint bqstorage_client = _make_bqstorage_client( - client, - use_bqstorage_api, + bq_client, bqstorage_client_options, ) - close_transports = functools.partial(_close_transports, client, bqstorage_client) - - try: - if args.max_results: - max_results = int(args.max_results) - else: - max_results = None - - query = query.strip() + return bq_client, bqstorage_client - if not query: - error = ValueError("Query is missing.") - _handle_error(error, args.destination_var) - return - # Check if query is given as a reference to a variable. - if query.startswith("$"): - query_var_name = query[1:] - - if not query_var_name: - missing_msg = 'Missing query variable name, empty "$" is not allowed.' - raise NameError(missing_msg) - - if query_var_name.isidentifier(): - ip = IPython.get_ipython() - query = ip.user_ns.get(query_var_name, ip) # ip serves as a sentinel - - if query is ip: - raise NameError( - f"Unknown query, variable {query_var_name} does not exist." - ) - else: - if not isinstance(query, (str, bytes)): - raise TypeError( - f"Query variable {query_var_name} must be a string " - "or a bytes-like value." - ) - - # Any query that does not contain whitespace (aside from leading and trailing whitespace) - # is assumed to be a table id - if not re.search(r"\s", query): - try: - rows = client.list_rows(query, max_results=max_results) - except Exception as ex: - _handle_error(ex, args.destination_var) - return - - result = rows.to_dataframe( - bqstorage_client=bqstorage_client, - create_bqstorage_client=False, - ) - if args.destination_var: - IPython.get_ipython().push({args.destination_var: result}) - return - else: - return result - - job_config = bigquery.job.QueryJobConfig() - job_config.query_parameters = params - job_config.use_legacy_sql = args.use_legacy_sql - job_config.dry_run = args.dry_run - - # Don't override context job config unless --no_query_cache is explicitly set. 
-        if args.no_query_cache:
-            job_config.use_query_cache = False
-
-        if args.destination_table:
-            split = args.destination_table.split(".")
-            if len(split) != 2:
-                raise ValueError(
-                    "--destination_table should be in a <dataset_id>.<table_id> format."
-                )
-            dataset_id, table_id = split
-            job_config.allow_large_results = True
-            dataset_ref = bigquery.dataset.DatasetReference(client.project, dataset_id)
-            destination_table_ref = dataset_ref.table(table_id)
-            job_config.destination = destination_table_ref
-            job_config.create_disposition = "CREATE_IF_NEEDED"
-            job_config.write_disposition = "WRITE_TRUNCATE"
-            _create_dataset_if_necessary(client, dataset_id)
-
-        if args.maximum_bytes_billed == "None":
-            job_config.maximum_bytes_billed = 0
-        elif args.maximum_bytes_billed is not None:
-            value = int(args.maximum_bytes_billed)
-            job_config.maximum_bytes_billed = value
+def _make_bq_query(
+    query: str,
+    args: Any,
+    params: List[Any],
+    bq_client: bigquery.Client,
+    bqstorage_client: Any,
+):
+    max_results = int(args.max_results) if args.max_results else None

+    # Any query that does not contain whitespace (aside from leading and trailing whitespace)
+    # is assumed to be a table id
+    if not re.search(r"\s", query):
         try:
-            query_job = _run_query(client, query, job_config=job_config)
+            rows = bq_client.list_rows(query, max_results=max_results)
         except Exception as ex:
             _handle_error(ex, args.destination_var)
             return

-        if not args.verbose:
-            display.clear_output()
+        result = rows.to_dataframe(
+            bqstorage_client=bqstorage_client,
+            create_bqstorage_client=False,
+        )
+        if args.destination_var:
+            get_ipython().push({args.destination_var: result})
+            return
+        else:
+            return result
+
+    job_config = _create_job_config(args, params)
+    if args.destination_table:
+        split = args.destination_table.split(".")
+        if len(split) != 2:
+            raise ValueError(
+                "--destination_table should be in a <dataset_id>.<table_id> format."
+            )
+        dataset_id, table_id = split
+        job_config.allow_large_results = True
+        dataset_ref = DatasetReference(bq_client.project, dataset_id)
+        destination_table_ref = dataset_ref.table(table_id)
+        job_config.destination = destination_table_ref
+        job_config.create_disposition = "CREATE_IF_NEEDED"
+        job_config.write_disposition = "WRITE_TRUNCATE"
+        _create_dataset_if_necessary(bq_client, dataset_id)

-        if args.dry_run and args.destination_var:
-            IPython.get_ipython().push({args.destination_var: query_job})
+    try:
+        query_job = _run_query(bq_client, query, job_config=job_config)
+    except Exception as ex:
+        _handle_error(ex, args.destination_var)
+        return
+
+    if not args.verbose:
+        display.clear_output()
+
+    if args.dry_run:
+        if args.destination_var:
+            get_ipython().push({args.destination_var: query_job})
             return
-        elif args.dry_run:
+        else:
             print(
                 "Query validated. This query will process {} bytes.".format(
                     query_job.total_bytes_processed
                 )
             )
             return query_job

-        progress_bar = context.progress_bar_type or args.progress_bar_type
+    progress_bar = context.progress_bar_type or args.progress_bar_type

-        if max_results:
-            result = query_job.result(max_results=max_results).to_dataframe(
-                bqstorage_client=None,
-                create_bqstorage_client=False,
-                progress_bar_type=progress_bar,
-            )
-        else:
-            result = query_job.to_dataframe(
-                bqstorage_client=bqstorage_client,
-                create_bqstorage_client=False,
-                progress_bar_type=progress_bar,
-            )
+    if max_results:
+        result = query_job.result(max_results=max_results).to_dataframe(
+            bqstorage_client=None,
+            create_bqstorage_client=False,
+            progress_bar_type=progress_bar,
+        )
+    else:
+        result = query_job.to_dataframe(
+            bqstorage_client=bqstorage_client,
+            create_bqstorage_client=False,
+            progress_bar_type=progress_bar,
+        )

-        if args.destination_var:
-            IPython.get_ipython().push({args.destination_var: result})
-        else:
-            return result
-    finally:
-        close_transports()
+    if args.destination_var:
+        get_ipython().push({args.destination_var: result})
+    else:
+        return result


-def _split_args_line(line):
-    """Split out the --params option value from the input line arguments.
+def _validate_and_resolve_query(query: str, args: Any) -> str:
+    # Check if query is given as a reference to a variable.
+    if not query.startswith("$"):
+        return query

-    Args:
-        line (str): The line arguments passed to the cell magic.
+    query_var_name = query[1:]

-    Returns:
-        Tuple[str, str]
-    """
-    lexer = lap.Lexer(line)
-    scanner = lap.Parser(lexer)
-    tree = scanner.input_line()
+    if not query_var_name:
+        missing_msg = 'Missing query variable name, empty "$" is not allowed.'
+        raise NameError(missing_msg)

-    extractor = lap.QueryParamsExtractor()
-    params_option_value, rest_of_args = extractor.visit(tree)
+    if query_var_name.isidentifier():
+        ip = get_ipython()
+        query = ip.user_ns.get(query_var_name, ip)  # ip serves as a sentinel

-    return params_option_value, rest_of_args
+        if query is ip:
+            raise NameError(f"Unknown query, variable {query_var_name} does not exist.")
+        elif not isinstance(query, (str, bytes)):
+            raise TypeError(
+                f"Query variable {query_var_name} must be a string "
+                "or a bytes-like value."
+            )
+    return query
+
+
+def _create_job_config(args: Any, params: List[Any]) -> QueryJobConfig:
+    job_config = QueryJobConfig()
+    job_config.query_parameters = params
+    job_config.use_legacy_sql = args.use_legacy_sql
+    job_config.dry_run = args.dry_run

+    # Don't override context job config unless --no_query_cache is explicitly set.
+    if args.no_query_cache:
+        job_config.use_query_cache = False

-def _make_bqstorage_client(client, use_bqstorage_api, client_options):
+    if args.maximum_bytes_billed == "None":
+        job_config.maximum_bytes_billed = 0
+    elif args.maximum_bytes_billed is not None:
+        value = int(args.maximum_bytes_billed)
+        job_config.maximum_bytes_billed = value
+
+    return job_config
+
+
+def _make_bqstorage_client(client, client_options):
     """Creates a BigQuery Storage client.

     Args:
         client (:class:`~google.cloud.bigquery.client.Client`): BigQuery client.
-        use_bqstorage_api (bool): whether BigQuery Storage API is used or not.
         client_options (:class:`google.api_core.client_options.ClientOptions`):
             Custom options used with a new BigQuery Storage client instance if one
             is created.
@@ -608,9 +639,6 @@
             is outdated.
         BigQuery Storage Client:
     """
-    if not use_bqstorage_api:
-        return None
-
     try:
         bigquery_magics._versions_helpers.BQ_STORAGE_VERSIONS.try_import(
             raise_if_error=True

diff --git a/tests/system/__init__.py b/tests/system/__init__.py
new file mode 100644
index 0000000..d6b0308
--- /dev/null
+++ b/tests/system/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 0000000..d6b0308
--- /dev/null
+++ b/tests/unit/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

diff --git a/tests/unit/test_bigquery.py b/tests/unit/test_bigquery.py
index 6a0a8c6..721bb48 100644
--- a/tests/unit/test_bigquery.py
+++ b/tests/unit/test_bigquery.py
@@ -289,28 +289,17 @@ def test__run_query_dry_run_without_errors_is_silent():
         assert len(captured.stdout) == 0


-def test__make_bqstorage_client_false():
-    credentials_mock = mock.create_autospec(
-        google.auth.credentials.Credentials, instance=True
-    )
-    test_client = bigquery.Client(
-        project="test_project", credentials=credentials_mock, location="test_location"
-    )
-    got = magics._make_bqstorage_client(test_client, False, {})
-    assert got is None
-
-
 @pytest.mark.skipif(
     bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`"
 )
-def test__make_bqstorage_client_true():
+def test__make_bqstorage_client():
     credentials_mock = mock.create_autospec(
         google.auth.credentials.Credentials, instance=True
     )
     test_client = bigquery.Client(
         project="test_project", credentials=credentials_mock, location="test_location"
     )
-    got = magics._make_bqstorage_client(test_client, True, {})
+    got = magics._make_bqstorage_client(test_client, {})
     assert isinstance(got, bigquery_storage.BigQueryReadClient)
@@ -326,7 +315,7 @@ def test__make_bqstorage_client_true_raises_import_error(missing_bq_storage):
     )

     with pytest.raises(ImportError) as exc_context, missing_bq_storage:
-        magics._make_bqstorage_client(test_client, True, {})
+        magics._make_bqstorage_client(test_client, {})

     error_msg = str(exc_context.value)
     assert "google-cloud-bigquery-storage" in error_msg
@@ -356,19 +345,15 @@ def test__make_bqstorage_client_true_obsolete_dependency():
     with patcher, pytest.raises(
         google.cloud.bigquery.exceptions.LegacyBigQueryStorageError
     ):
-        magics._make_bqstorage_client(test_client, True, {})
+        magics._make_bqstorage_client(test_client, {})


 @pytest.mark.skipif(
     bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`"
 )
 def test__make_bqstorage_client_true_missing_gapic(missing_grpcio_lib):
-    credentials_mock = mock.create_autospec(
-        google.auth.credentials.Credentials, instance=True
-    )
-
     with pytest.raises(ImportError) as exc_context, missing_grpcio_lib:
-        magics._make_bqstorage_client(True, credentials_mock, {})
+        magics._make_bqstorage_client(True, {})

     assert "grpcio" in str(exc_context.value)
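The helper split above does not change the magic's surface: line arguments still go through the --params extraction, and the query body may be SQL, a table ID, or a $variable reference. A representative invocation exercising the --params path together with a destination variable (the public dataset and parameter values are illustrative):

    %%bigquery df --params {"corpus_name":"hamlet","limit":10}
    SELECT word, SUM(word_count) AS count
    FROM `bigquery-public-data.samples.shakespeare`
    WHERE corpus = @corpus_name
    GROUP BY word
    ORDER BY count DESC
    LIMIT @limit

After the cell runs, the result is pushed into the ``df`` variable in the notebook namespace rather than displayed.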
From e720063f8d3edf1e477f56709f8cace9343b1f82 Mon Sep 17 00:00:00 2001
From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com>
Date: Tue, 17 Sep 2024 11:18:36 -0400
Subject: [PATCH 3/6] build(python): release script update (#55)

Source-Link: https://github.com/googleapis/synthtool/commit/71a72973dddbc66ea64073b53eda49f0d22e0942
Post-Processor: gcr.io/cloud-devrel-public-resources/owlbot-python:latest@sha256:e8dcfd7cbfd8beac3a3ff8d3f3185287ea0625d859168cc80faccfc9a7a00455
Co-authored-by: Owl Bot
Co-authored-by: Anthonios Partheniou
---
 .github/.OwlBot.lock.yaml  | 4 ++--
 .kokoro/release.sh         | 2 +-
 .kokoro/release/common.cfg | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml
index 6d064dd..597e0c3 100644
--- a/.github/.OwlBot.lock.yaml
+++ b/.github/.OwlBot.lock.yaml
@@ -13,5 +13,5 @@
 # limitations under the License.
 docker:
   image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest
-  digest: sha256:94bb690db96e6242b2567a4860a94d48fa48696d092e51b0884a1a2c0a79a407
-# created: 2024-07-31T14:52:44.926548819Z
+  digest: sha256:e8dcfd7cbfd8beac3a3ff8d3f3185287ea0625d859168cc80faccfc9a7a00455
+# created: 2024-09-16T21:04:09.091105552Z

diff --git a/.kokoro/release.sh b/.kokoro/release.sh
index 2401edd..03e3c31 100755
--- a/.kokoro/release.sh
+++ b/.kokoro/release.sh
@@ -23,7 +23,7 @@ python3 -m releasetool publish-reporter-script > /tmp/publisher-script; source /tmp/publisher-script
 export PYTHONUNBUFFERED=1

 # Move into the package, build the distribution and upload.
-TWINE_PASSWORD=$(cat "${KOKORO_KEYSTORE_DIR}/73713_google-cloud-pypi-token-keystore-1")
+TWINE_PASSWORD=$(cat "${KOKORO_KEYSTORE_DIR}/73713_google-cloud-pypi-token-keystore-2")
 cd github/python-bigquery-magics
 python3 setup.py sdist bdist_wheel
 twine upload --username __token__ --password "${TWINE_PASSWORD}" dist/*

diff --git a/.kokoro/release/common.cfg b/.kokoro/release/common.cfg
index 58691a1..3640ce5 100644
--- a/.kokoro/release/common.cfg
+++ b/.kokoro/release/common.cfg
@@ -28,7 +28,7 @@ before_action {
   fetch_keystore {
     keystore_resource {
       keystore_config_id: 73713
-      keyname: "google-cloud-pypi-token-keystore-1"
+      keyname: "google-cloud-pypi-token-keystore-2"
     }
   }
 }

From b0b7bec9c0e2d651b1c3b414abbe54ca2dccc4a7 Mon Sep 17 00:00:00 2001
From: Mend Renovate
Date: Tue, 17 Sep 2024 23:36:54 +0200
Subject: [PATCH 4/6] chore(deps): update all dependencies (#56)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* chore(deps): update all dependencies

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Owl Bot
Co-authored-by: Anthonios Partheniou
---
 samples/snippets/requirements-test.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt
index 68f9039..1640e1a 100644
--- a/samples/snippets/requirements-test.txt
+++ b/samples/snippets/requirements-test.txt
@@ -1,4 +1,4 @@
 google-cloud-testutils==1.4.0
 pytest===7.4.4; python_version == '3.7'
-pytest==8.3.2; python_version >= '3.8'
+pytest==8.3.3; python_version >= '3.8'
 mock==5.1.0

From 90ba05f3d918979788e01b0cd3201ac8f01741a9 Mon Sep 17 00:00:00 2001
From: Shenyang Cai
Date: Fri, 20 Sep 2024 14:36:12 -0700
Subject: [PATCH 5/6] feat: add support for BigQuery DataFrames. Set
 `context.engine` to 'bigframes' to support query results larger than 10 GB
 (#58)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* define 'engine' field in the context

* use bigframes API when the context.engine says so

* remove unnecessary deps

* relax pip deps

* undo pip deps

* make bigframes an optional dependency

* fix format

* use 'is' for None checks

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* parametrize bigframes installation

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* define unit_bf with bf installation to make owlbot happy

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* parametrize bf installation and disable owlbot enforcement on the noxfile

* change owlbot file to test optional bf deps

* 🦉 Updates from OwlBot post-processor

  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Owl Bot
---
 CONTRIBUTING.rst            |   6 +-
 bigquery_magics/bigquery.py |  61 +++++++++++---
 bigquery_magics/config.py   |  29 +++++++-
 noxfile.py                  |  10 ++-
 owlbot.py                   |  17 ++---
 setup.py                    |   1 +
 testing/constraints-3.9.txt |   1 +
 tests/unit/test_bigquery.py | 136 ++++++++++++++++++++++++++++++++++++
 tests/unit/test_context.py  |  13 ++++
 9 files changed, 249 insertions(+), 25 deletions(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 8ccc1af..637c5a2 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -22,7 +22,7 @@ In order to add a feature:
   documentation.

 - The feature must work fully on the following CPython versions:
-  3.7, 3.8, 3.11 and 3.12 on both UNIX and Windows.
+  3.7, 3.8, 3.9, 3.11 and 3.12 on both UNIX and Windows.

 - The feature must not add unnecessary dependencies (where
   "unnecessary" is of course subjective, but new dependencies should
@@ -148,7 +148,7 @@ Running System Tests

 .. note::

-    System tests are only configured to run under Python 3.8, 3.11 and 3.12.
+    System tests are only configured to run under Python 3.8, 3.9, 3.11 and 3.12.
     For expediency, we do not run them in older versions of Python 3.

 This alone will not run the tests. You'll need to change some local
@@ -223,11 +223,13 @@ We support:

 -  `Python 3.7`_
 -  `Python 3.8`_
+-  `Python 3.9`_
 -  `Python 3.11`_
 -  `Python 3.12`_

 .. _Python 3.7: https://docs.python.org/3.7/
 .. _Python 3.8: https://docs.python.org/3.8/
+.. _Python 3.9: https://docs.python.org/3.9/
 .. _Python 3.11: https://docs.python.org/3.11/
 .. _Python 3.12: https://docs.python.org/3.12/

diff --git a/bigquery_magics/bigquery.py b/bigquery_magics/bigquery.py
index 736745e..4cd0c5e 100644
--- a/bigquery_magics/bigquery.py
+++ b/bigquery_magics/bigquery.py
@@ -114,6 +114,11 @@
 except ImportError:
     bigquery_storage = None

+try:
+    import bigframes.pandas as bpd
+except ImportError:
+    bpd = None
+
 USER_AGENT = f"ipython-{IPython.__version__} bigquery-magics/{bigquery_magics.version.__version__}"

 context = bigquery_magics.config.context
@@ -255,6 +260,7 @@ def _create_dataset_if_necessary(client, dataset_id):
     help=(
         "Sets query to be a dry run to estimate costs. "
         "Defaults to executing the query instead of dry run if this argument is not used."
+        "Does not work with engine 'bigframes'."
" ), ) @magic_arguments.argument( @@ -319,6 +325,7 @@ def _create_dataset_if_necessary(client, dataset_id): "amount of time for the query to finish. By default, this " "information will be displayed as the query runs, but will be " "cleared after the query is finished." + "This flag is ignored when the engine is 'bigframes'." ), ) @magic_arguments.argument( @@ -350,6 +357,7 @@ def _create_dataset_if_necessary(client, dataset_id): help=( "Set the location to execute query." "Defaults to location set in query setting in console." + "This flag is ignored when the engine is 'bigframes'." ), ) def _cell_magic(line, query): @@ -376,18 +384,10 @@ def _cell_magic(line, query): return query = _validate_and_resolve_query(query, args) - bq_client, bqstorage_client = _create_clients(args) + if context.engine == "bigframes": + return _query_with_bigframes(query, params, args) - try: - return _make_bq_query( - query, - args=args, - params=params, - bq_client=bq_client, - bqstorage_client=bqstorage_client, - ) - finally: - _close_transports(bq_client, bqstorage_client) + return _query_with_pandas(query, params, args) def _parse_magic_args(line: str) -> Tuple[List[Any], Any]: @@ -444,6 +444,45 @@ def _split_args_line(line: str) -> Tuple[str, str]: return params_option_value, rest_of_args +def _query_with_bigframes(query: str, params: List[Any], args: Any): + if args.dry_run: + raise ValueError("Dry run is not supported by bigframes engine.") + + if bpd is None: + raise ValueError("Bigframes package is not installed.") + + bpd.options.bigquery.project = context.project + bpd.options.bigquery.credentials = context.credentials + + max_results = int(args.max_results) if args.max_results else None + + result = bpd.read_gbq_query( + query, + max_results=max_results, + configuration=_create_job_config(args, params).to_api_repr(), + ) + + if args.destination_var: + get_ipython().push({args.destination_var: result}) + else: + return result + + +def _query_with_pandas(query: str, params: List[Any], args: Any): + bq_client, bqstorage_client = _create_clients(args) + + try: + return _make_bq_query( + query, + args=args, + params=params, + bq_client=bq_client, + bqstorage_client=bqstorage_client, + ) + finally: + _close_transports(bq_client, bqstorage_client) + + def _create_clients(args: Any) -> Tuple[bigquery.Client, Any]: bigquery_client_options = copy.deepcopy(context.bigquery_client_options) if args.bigquery_api_endpoint: diff --git a/bigquery_magics/config.py b/bigquery_magics/config.py index 0cffb44..8e5f6ec 100644 --- a/bigquery_magics/config.py +++ b/bigquery_magics/config.py @@ -33,8 +33,6 @@ class Context(object): and can be found at ``bigquery_magics.context``. """ - _credentials = None - _project = None _connection = None default_query_job_config = bigquery.QueryJobConfig() @@ -103,6 +101,8 @@ class Context(object): >>> bigquery_magics.context.progress_bar_type = "tqdm_notebook" """ + _credentials = None + @property def credentials(self): """google.auth.credentials.Credentials: Credentials to use for queries @@ -138,6 +138,8 @@ def credentials(self): def credentials(self, value): self._credentials = value + _project = None + @property def project(self): """str: Default project to use for queries performed through IPython @@ -163,5 +165,28 @@ def project(self): def project(self, value): self._project = value + _engine = "pandas" + + @property + def engine(self) -> str: + """Engine to run the query. Could either be "pandas" or "bigframes". 
+
+        If using "pandas", the query result will be stored in a Pandas dataframe.
+        If using "bigframes", the query result will be stored in a bigframes dataframe instead.
+
+        Example:
+            Manually setting the engine:
+
+            >>> from google.cloud.bigquery import magics
+            >>> bigquery_magics.context.engine = 'bigframes'
+        """
+        return self._engine
+
+    @engine.setter
+    def engine(self, value):
+        if value != "pandas" and value != "bigframes":
+            raise ValueError("engine must be either 'pandas' or 'bigframes'")
+        self._engine = value
+

 context = Context()

diff --git a/noxfile.py b/noxfile.py
index 22877d5..a448407 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -34,7 +34,7 @@

 DEFAULT_PYTHON_VERSION = "3.8"

-UNIT_TEST_PYTHON_VERSIONS: List[str] = ["3.7", "3.8", "3.11", "3.12"]
+UNIT_TEST_PYTHON_VERSIONS: List[str] = ["3.7", "3.8", "3.9", "3.11", "3.12"]
 UNIT_TEST_STANDARD_DEPENDENCIES = [
     "mock",
     "asyncmock",
@@ -57,17 +57,20 @@
     ],
     "3.9": [
         "bqstorage",
+        "bigframes",
     ],
     "3.10": [
         "bqstorage",
+        "bigframes",
     ],
     "3.11": [],
     "3.12": [
         "bqstorage",
+        "bigframes",
     ],
 }

-SYSTEM_TEST_PYTHON_VERSIONS: List[str] = ["3.8", "3.11", "3.12"]
+SYSTEM_TEST_PYTHON_VERSIONS: List[str] = ["3.8", "3.9", "3.11", "3.12"]
 SYSTEM_TEST_STANDARD_DEPENDENCIES: List[str] = [
     "mock",
     "pytest",
@@ -86,13 +89,16 @@
     ],
     "3.9": [
         "bqstorage",
+        "bigframes",
     ],
     "3.10": [
         "bqstorage",
+        "bigframes",
     ],
     "3.11": [],
     "3.12": [
         "bqstorage",
+        "bigframes",
     ],
 }

diff --git a/owlbot.py b/owlbot.py
index a05ee95..ee6146c 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -28,19 +28,20 @@
 # Add templated files
 # ----------------------------------------------------------------------------

-extras = ["bqstorage"]
+extras_storage = ["bqstorage"]
+extras_bf = ["bqstorage", "bigframes"]
 extras_by_python = {
-    "3.7": extras,
-    "3.8": extras,
-    "3.9": extras,
-    "3.10": extras,
+    "3.7": extras_storage,
+    "3.8": extras_storage,
+    "3.9": extras_bf,
+    "3.10": extras_bf,
     # Use a middle version of Python to test when no extras are installed.
     "3.11": [],
-    "3.12": extras,
+    "3.12": extras_bf,
 }
 templated_files = common.py_library(
-    unit_test_python_versions=["3.7", "3.8", "3.11", "3.12"],
-    system_test_python_versions=["3.8", "3.11", "3.12"],
+    unit_test_python_versions=["3.7", "3.8", "3.9", "3.11", "3.12"],
+    system_test_python_versions=["3.8", "3.9", "3.11", "3.12"],
     cov_level=100,
     unit_test_extras_by_python=extras_by_python,
     unit_test_external_dependencies=["google-cloud-testutils"],

diff --git a/setup.py b/setup.py
index 5e4ad45..2b67aa6 100644
--- a/setup.py
+++ b/setup.py
@@ -54,6 +54,7 @@
         "grpcio >= 1.47.0, < 2.0dev",
         "grpcio >= 1.49.1, < 2.0dev; python_version>='3.11'",
     ],
+    "bigframes": ["bigframes >= 1.17.0"],
 }

 all_extras = []

diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt
index 66cf6ee..653bc82 100644
--- a/testing/constraints-3.9.txt
+++ b/testing/constraints-3.9.txt
@@ -4,4 +4,5 @@
 # We try to test across major versions of our dependencies.

 # This is the last pandas 2.0.x release.
 pandas==2.0.3
+bigframes==1.17.0

diff --git a/tests/unit/test_bigquery.py b/tests/unit/test_bigquery.py
index 721bb48..7079bea 100644
--- a/tests/unit/test_bigquery.py
+++ b/tests/unit/test_bigquery.py
@@ -43,6 +43,11 @@
 except ImportError:
     bigquery_storage = None

+try:
+    import bigframes.pandas as bpd
+except ImportError:
+    bpd = None
+

 def make_connection(*args):
     # TODO(tswast): Remove this in favor of a mock google.cloud.bigquery.Client
@@ -121,6 +126,11 @@ def mock_credentials(monkeypatch):
     monkeypatch.setattr(bigquery_magics.context, "_credentials", credentials)


+@pytest.fixture
+def bigframes_engine(monkeypatch):
+    monkeypatch.setattr(bigquery_magics.context, "engine", "bigframes")
+
+
 PROJECT_ID = "its-a-project-eh"
 JOB_ID = "some-random-id"
 JOB_REFERENCE_RESOURCE = {"projectId": PROJECT_ID, "jobId": JOB_ID}
@@ -1884,3 +1894,129 @@ def test_bigquery_magic_with_location():

         client_options_used = run_query_mock.call_args_list[0][0][0]
         assert client_options_used.location == "us-east1"
+
+
+@pytest.mark.usefixtures("ipython_interactive", "mock_credentials", "bigframes_engine")
+def test_big_query_magic_bigframes():
+    if bpd is None:
+        pytest.skip("BigFrames not installed")
+
+    ip = IPython.get_ipython()
+    ip.extension_manager.load_extension("bigquery_magics")
+    sql = "SELECT 0 AS something"
+    expected_configuration = {
+        "query": {"queryParameters": [], "useLegacySql": False},
+        "dryRun": False,
+    }
+    bf_patch = mock.patch("bigframes.pandas.read_gbq_query", autospec=True)
+
+    with bf_patch as bf_mock:
+        ip.run_cell_magic("bigquery", "", sql)
+
+    bf_mock.assert_called_once_with(
+        sql, max_results=None, configuration=expected_configuration
+    )
+    assert bpd.options.bigquery.credentials is bigquery_magics.context.credentials
+    assert bpd.options.bigquery.project == bigquery_magics.context.project
+
+
+@pytest.mark.usefixtures("ipython_interactive", "mock_credentials", "bigframes_engine")
+def test_big_query_magic_bigframes__bigframes_is_not_installed__should_raise_error():
+    if bpd is not None:
+        pytest.skip("BigFrames is installed")
+
+    ip = IPython.get_ipython()
+    ip.extension_manager.load_extension("bigquery_magics")
+    sql = "SELECT 0 AS something"
+
+    with pytest.raises(ValueError, match="Bigframes package is not installed."):
+        ip.run_cell_magic("bigquery", "", sql)
+
+
+@pytest.mark.usefixtures("ipython_interactive", "mock_credentials", "bigframes_engine")
+def test_big_query_magic_bigframes_with_params():
+    if bpd is None:
+        pytest.skip("BigFrames not installed")
+
+    ip = IPython.get_ipython()
+    ip.extension_manager.load_extension("bigquery_magics")
+    sql = "SELECT 0 AS @p"
+    expected_configuration = {
+        "query": {
+            "queryParameters": [
+                {
+                    "name": "p",
+                    "parameterType": {"type": "STRING"},
+                    "parameterValue": {"value": "num"},
+                },
+            ],
+            "useLegacySql": False,
+            "parameterMode": "NAMED",
+        },
+        "dryRun": False,
+    }
+    bf_patch = mock.patch("bigframes.pandas.read_gbq_query", autospec=True)
+
+    with bf_patch as bf_mock:
+        ip.run_cell_magic("bigquery", '--params {"p":"num"}', sql)
+
+    bf_mock.assert_called_once_with(
+        sql, max_results=None, configuration=expected_configuration
+    )
+
+
+@pytest.mark.usefixtures("ipython_interactive", "mock_credentials", "bigframes_engine")
+def test_big_query_magic_bigframes_with_max_results():
+    if bpd is None:
+        pytest.skip("BigFrames not installed")
+
+    ip = IPython.get_ipython()
+    ip.extension_manager.load_extension("bigquery_magics")
+    sql = "SELECT 0 AS something"
+    expected_configuration = {
+        "query": {"queryParameters": [], "useLegacySql": False},
+        "dryRun": False,
+    }
+    bf_patch = mock.patch("bigframes.pandas.read_gbq_query", autospec=True)
+
+    with bf_patch as bf_mock:
+        ip.run_cell_magic("bigquery", "--max_results 10", sql)
+
+    bf_mock.assert_called_once_with(
+        sql, max_results=10, configuration=expected_configuration
+    )
+
+
+@pytest.mark.usefixtures("ipython_interactive", "mock_credentials", "bigframes_engine")
+def test_big_query_magic_bigframes_with_destination_var(ipython_ns_cleanup):
+    if bpd is None:
+        pytest.skip("BigFrames not installed")
+
+    ip = IPython.get_ipython()
+    ip.extension_manager.load_extension("bigquery_magics")
+    sql = "SELECT 0 AS something"
+
+    bf_patch = mock.patch("bigframes.pandas.read_gbq_query", autospec=True)
+    ipython_ns_cleanup.append((ip, "df"))
+
+    with bf_patch as bf_mock:
+        ip.run_cell_magic("bigquery", "df", sql)
+
+    assert "df" in ip.user_ns
+    df = ip.user_ns["df"]
+    assert df is bf_mock.return_value
+
+
+@pytest.mark.usefixtures("ipython_interactive", "mock_credentials", "bigframes_engine")
+def test_big_query_magic_bigframes_with_dry_run__should_fail():
+    if bpd is None:
+        pytest.skip("BigFrames not installed")
+
+    ip = IPython.get_ipython()
+    ip.extension_manager.load_extension("bigquery_magics")
+    sql = "SELECT 0 AS @p"
+
+    bf_patch = mock.patch("bigframes.pandas.read_gbq_query", autospec=True)
+
+    with bf_patch, pytest.raises(ValueError):
+        ip.run_cell_magic("bigquery", "--dry_run", sql)

diff --git a/tests/unit/test_context.py b/tests/unit/test_context.py
index d69fe41..2909f2c 100644
--- a/tests/unit/test_context.py
+++ b/tests/unit/test_context.py
@@ -16,6 +16,7 @@

 import google.auth.credentials
 import pydata_google_auth
+import pytest

 import bigquery_magics

@@ -64,3 +65,15 @@ def test_context_credentials_and_project_can_be_set_explicitly():
     assert bigquery_magics.context.credentials is credentials_mock
     # default should not be called if credentials & project are explicitly set
     assert default_mock.call_count == 0
+
+
+@pytest.mark.parametrize("engine", ["pandas", "bigframes"])
+def test_context_set_engine(engine):
+    bigquery_magics.context.engine = engine
+
+    assert bigquery_magics.context.engine == engine
+
+
+def test_context_set_invalid_engine():
+    with pytest.raises(ValueError):
+        bigquery_magics.context.engine = "whatever"
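With the new engine field, opting a notebook session into BigQuery DataFrames is a one-line change; a minimal sketch grounded in the docstring and tests above (requires the optional ``bigframes`` extra):

    >>> import bigquery_magics
    >>> bigquery_magics.context.engine = "bigframes"
    >>> # Later %%bigquery cells now return bigframes DataFrames, so results
    >>> # larger than 10 GB no longer need to fit in local memory; any value
    >>> # other than "pandas" or "bigframes" raises ValueError.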
From ef2becf0d919cac4d905cfc66cd964ce68e198f6 Mon Sep 17 00:00:00 2001
From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com>
Date: Mon, 23 Sep 2024 10:43:05 -0700
Subject: [PATCH 6/6] chore(main): release 0.3.0 (#59)

Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com>
---
 CHANGELOG.md               | 7 +++++++
 bigquery_magics/version.py | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 948ddb1..e9caa48 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog

+## [0.3.0](https://github.com/googleapis/python-bigquery-magics/compare/v0.2.0...v0.3.0) (2024-09-20)
+
+
+### Features
+
+* Add support for BigQuery DataFrames. Set `context.engine` to 'bigframes' to support query results larger than 10 GB ([#58](https://github.com/googleapis/python-bigquery-magics/issues/58)) ([90ba05f](https://github.com/googleapis/python-bigquery-magics/commit/90ba05f3d918979788e01b0cd3201ac8f01741a9))
+
 ## [0.2.0](https://github.com/googleapis/python-bigquery-magics/compare/v0.1.1...v0.2.0) (2024-08-27)

diff --git a/bigquery_magics/version.py b/bigquery_magics/version.py
index beaff60..b447a74 100644
--- a/bigquery_magics/version.py
+++ b/bigquery_magics/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "0.2.0"
+__version__ = "0.3.0"