diff --git a/google/cloud/bigquery/_helpers.py b/google/cloud/bigquery/_helpers.py index 014a721a8..488766853 100644 --- a/google/cloud/bigquery/_helpers.py +++ b/google/cloud/bigquery/_helpers.py @@ -20,7 +20,7 @@ import math import re import os -from typing import Any, Optional, Union +from typing import Optional, Union from dateutil import relativedelta from google.cloud._helpers import UTC # type: ignore @@ -32,10 +32,7 @@ import packaging.version -from google.cloud.bigquery.exceptions import ( - LegacyBigQueryStorageError, - LegacyPyarrowError, -) +from google.cloud.bigquery import exceptions _RFC3339_MICROS_NO_ZULU = "%Y-%m-%dT%H:%M:%S.%f" _TIMEONLY_WO_MICROS = "%H:%M:%S" @@ -57,8 +54,6 @@ _MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0") -_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0") - _BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0") BIGQUERY_EMULATOR_HOST = "BIGQUERY_EMULATOR_HOST" @@ -115,7 +110,7 @@ def verify_version(self): verify the version compatibility at runtime. Raises: - LegacyBigQueryStorageError: + exceptions.LegacyBigQueryStorageError: If the google-cloud-bigquery-storage package is outdated. """ if self.installed_version < _MIN_BQ_STORAGE_VERSION: @@ -123,76 +118,10 @@ def verify_version(self): "Dependency google-cloud-bigquery-storage is outdated, please upgrade " f"it to version >= {_MIN_BQ_STORAGE_VERSION} (version found: {self.installed_version})." ) - raise LegacyBigQueryStorageError(msg) - - -class PyarrowVersions: - """Version comparisons for pyarrow package.""" - - def __init__(self): - self._installed_version = None - - @property - def installed_version(self) -> packaging.version.Version: - """Return the parsed version of pyarrow.""" - if self._installed_version is None: - import pyarrow # type: ignore - - self._installed_version = packaging.version.parse( - # Use 0.0.0, since it is earlier than any released version. - # Legacy versions also have the same property, but - # creating a LegacyVersion has been deprecated. - # https://github.com/pypa/packaging/issues/321 - getattr(pyarrow, "__version__", "0.0.0") - ) - - return self._installed_version - - @property - def use_compliant_nested_type(self) -> bool: - return self.installed_version.major >= 4 - - def try_import(self, raise_if_error: bool = False) -> Any: - """Verify that a recent enough version of pyarrow extra is - installed. - - The function assumes that pyarrow extra is installed, and should thus - be used in places where this assumption holds. - - Because `pip` can install an outdated version of this extra despite the - constraints in `setup.py`, the calling code can use this helper to - verify the version compatibility at runtime. - - Returns: - The ``pyarrow`` module or ``None``. - - Raises: - LegacyPyarrowError: - If the pyarrow package is outdated and ``raise_if_error`` is ``True``. - """ - try: - import pyarrow - except ImportError as exc: # pragma: NO COVER - if raise_if_error: - raise LegacyPyarrowError( - f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}." - ) from exc - return None - - if self.installed_version < _MIN_PYARROW_VERSION: - if raise_if_error: - msg = ( - "Dependency pyarrow is outdated, please upgrade " - f"it to version >= {_MIN_PYARROW_VERSION} (version found: {self.installed_version})." 
-                )
-                raise LegacyPyarrowError(msg)
-            return None
-
-        return pyarrow
+            raise exceptions.LegacyBigQueryStorageError(msg)
 
 
 BQ_STORAGE_VERSIONS = BQStorageVersions()
-PYARROW_VERSIONS = PyarrowVersions()
 
 
 def _not_null(value, field):
diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py
index a14dbec9b..ea790d6c9 100644
--- a/google/cloud/bigquery/_pandas_helpers.py
+++ b/google/cloud/bigquery/_pandas_helpers.py
@@ -23,9 +23,9 @@
 import warnings
 from typing import Any, Union
 
-from packaging import version
-
 from google.cloud.bigquery import _helpers
+from google.cloud.bigquery import _pyarrow_helpers
+from google.cloud.bigquery import _versions_helpers
 from google.cloud.bigquery import schema
 
 try:
@@ -49,7 +49,11 @@
     db_dtypes_import_exception = exc
     date_dtype_name = time_dtype_name = ""  # Use '' rather than None because pytype
 
-pyarrow = _helpers.PYARROW_VERSIONS.try_import()
+pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()
+
+# try_import() returns None unless pyarrow >= 3.0.0 is installed, and
+# decimal256 (the BIGNUMERIC representation) is available from 3.0.0 on.
+_BIGNUMERIC_SUPPORT = pyarrow is not None
 
 try:
     # _BaseGeometry is used to detect shapely objects in `bq_to_arrow_array`
@@ -119,87 +123,6 @@ def __init__(self):
         self.done = False
 
 
-def pyarrow_datetime():
-    return pyarrow.timestamp("us", tz=None)
-
-
-def pyarrow_numeric():
-    return pyarrow.decimal128(38, 9)
-
-
-def pyarrow_bignumeric():
-    # 77th digit is partial.
-    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types
-    return pyarrow.decimal256(76, 38)
-
-
-def pyarrow_time():
-    return pyarrow.time64("us")
-
-
-def pyarrow_timestamp():
-    return pyarrow.timestamp("us", tz="UTC")
-
-
-if pyarrow:
-    # This dictionary is duplicated in bigquery_storage/test/unite/test_reader.py
-    # When modifying it be sure to update it there as well.
-    BQ_TO_ARROW_SCALARS = {
-        "BOOL": pyarrow.bool_,
-        "BOOLEAN": pyarrow.bool_,
-        "BYTES": pyarrow.binary,
-        "DATE": pyarrow.date32,
-        "DATETIME": pyarrow_datetime,
-        "FLOAT": pyarrow.float64,
-        "FLOAT64": pyarrow.float64,
-        "GEOGRAPHY": pyarrow.string,
-        "INT64": pyarrow.int64,
-        "INTEGER": pyarrow.int64,
-        "NUMERIC": pyarrow_numeric,
-        "STRING": pyarrow.string,
-        "TIME": pyarrow_time,
-        "TIMESTAMP": pyarrow_timestamp,
-    }
-    ARROW_SCALAR_IDS_TO_BQ = {
-        # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
-        pyarrow.bool_().id: "BOOL",
-        pyarrow.int8().id: "INT64",
-        pyarrow.int16().id: "INT64",
-        pyarrow.int32().id: "INT64",
-        pyarrow.int64().id: "INT64",
-        pyarrow.uint8().id: "INT64",
-        pyarrow.uint16().id: "INT64",
-        pyarrow.uint32().id: "INT64",
-        pyarrow.uint64().id: "INT64",
-        pyarrow.float16().id: "FLOAT64",
-        pyarrow.float32().id: "FLOAT64",
-        pyarrow.float64().id: "FLOAT64",
-        pyarrow.time32("ms").id: "TIME",
-        pyarrow.time64("ns").id: "TIME",
-        pyarrow.timestamp("ns").id: "TIMESTAMP",
-        pyarrow.date32().id: "DATE",
-        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
-        pyarrow.binary().id: "BYTES",
-        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
-        # The exact scale and precision don't matter, see below.
-        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
-    }
-
-    if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
-        BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
-        # The exact decimal's scale and precision are not important, as only
-        # the type ID matters, and it's the same for all decimal256 instances.
- ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC" - _BIGNUMERIC_SUPPORT = True - else: - _BIGNUMERIC_SUPPORT = False # pragma: NO COVER - -else: # pragma: NO COVER - BQ_TO_ARROW_SCALARS = {} # pragma: NO COVER - ARROW_SCALAR_IDS_TO_BQ = {} # pragma: NO_COVER - _BIGNUMERIC_SUPPORT = False # pragma: NO COVER - - BQ_FIELD_TYPE_TO_ARROW_FIELD_METADATA = { "GEOGRAPHY": { b"ARROW:extension:name": b"google:sqlType:geography", @@ -240,7 +163,7 @@ def bq_to_arrow_data_type(field): if field_type_upper in schema._STRUCT_TYPES: return bq_to_arrow_struct_data_type(field) - data_type_constructor = BQ_TO_ARROW_SCALARS.get(field_type_upper) + data_type_constructor = _pyarrow_helpers.bq_to_arrow_scalars(field_type_upper) if data_type_constructor is None: return None return data_type_constructor() @@ -568,7 +491,9 @@ def augment_schema(dataframe, current_bq_schema): if pyarrow.types.is_list(arrow_table.type): # `pyarrow.ListType` detected_mode = "REPEATED" - detected_type = ARROW_SCALAR_IDS_TO_BQ.get(arrow_table.values.type.id) + detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq( + arrow_table.values.type.id + ) # For timezone-naive datetimes, pyarrow assumes the UTC timezone and adds # it to such datetimes, causing them to be recognized as TIMESTAMP type. @@ -584,7 +509,7 @@ def augment_schema(dataframe, current_bq_schema): detected_type = "DATETIME" else: detected_mode = field.mode - detected_type = ARROW_SCALAR_IDS_TO_BQ.get(arrow_table.type.id) + detected_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(arrow_table.type.id) if detected_type is None: unknown_type_fields.append(field) @@ -705,13 +630,13 @@ def dataframe_to_parquet( This argument is ignored for ``pyarrow`` versions earlier than ``4.0.0``. """ - pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True) + pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import(raise_if_error=True) import pyarrow.parquet # type: ignore kwargs = ( {"use_compliant_nested_type": parquet_use_compliant_nested_type} - if _helpers.PYARROW_VERSIONS.use_compliant_nested_type + if _versions_helpers.PYARROW_VERSIONS.use_compliant_nested_type else {} ) diff --git a/google/cloud/bigquery/_pyarrow_helpers.py b/google/cloud/bigquery/_pyarrow_helpers.py new file mode 100644 index 000000000..7266e5e02 --- /dev/null +++ b/google/cloud/bigquery/_pyarrow_helpers.py @@ -0,0 +1,123 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared helper functions for connecting BigQuery and pyarrow.""" + +from typing import Any + +from packaging import version + +try: + import pyarrow # type: ignore +except ImportError: # pragma: NO COVER + pyarrow = None + + +def pyarrow_datetime(): + return pyarrow.timestamp("us", tz=None) + + +def pyarrow_numeric(): + return pyarrow.decimal128(38, 9) + + +def pyarrow_bignumeric(): + # 77th digit is partial. 
+    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#decimal_types
+    return pyarrow.decimal256(76, 38)
+
+
+def pyarrow_time():
+    return pyarrow.time64("us")
+
+
+def pyarrow_timestamp():
+    return pyarrow.timestamp("us", tz="UTC")
+
+
+_BQ_TO_ARROW_SCALARS = {}
+_ARROW_SCALAR_IDS_TO_BQ = {}
+
+if pyarrow:
+    # This dictionary is duplicated in bigquery_storage/test/unit/test_reader.py
+    # When modifying it be sure to update it there as well.
+    # Note: the "BIGNUMERIC" mapping is added conditionally below, since it requires pyarrow >= 3.0.0.
+    _BQ_TO_ARROW_SCALARS = {
+        "BOOL": pyarrow.bool_,
+        "BOOLEAN": pyarrow.bool_,
+        "BYTES": pyarrow.binary,
+        "DATE": pyarrow.date32,
+        "DATETIME": pyarrow_datetime,
+        "FLOAT": pyarrow.float64,
+        "FLOAT64": pyarrow.float64,
+        "GEOGRAPHY": pyarrow.string,
+        "INT64": pyarrow.int64,
+        "INTEGER": pyarrow.int64,
+        "NUMERIC": pyarrow_numeric,
+        "STRING": pyarrow.string,
+        "TIME": pyarrow_time,
+        "TIMESTAMP": pyarrow_timestamp,
+    }
+
+    _ARROW_SCALAR_IDS_TO_BQ = {
+        # https://arrow.apache.org/docs/python/api/datatypes.html#type-classes
+        pyarrow.bool_().id: "BOOL",
+        pyarrow.int8().id: "INT64",
+        pyarrow.int16().id: "INT64",
+        pyarrow.int32().id: "INT64",
+        pyarrow.int64().id: "INT64",
+        pyarrow.uint8().id: "INT64",
+        pyarrow.uint16().id: "INT64",
+        pyarrow.uint32().id: "INT64",
+        pyarrow.uint64().id: "INT64",
+        pyarrow.float16().id: "FLOAT64",
+        pyarrow.float32().id: "FLOAT64",
+        pyarrow.float64().id: "FLOAT64",
+        pyarrow.time32("ms").id: "TIME",
+        pyarrow.time64("ns").id: "TIME",
+        pyarrow.timestamp("ns").id: "TIMESTAMP",
+        pyarrow.date32().id: "DATE",
+        pyarrow.date64().id: "DATETIME",  # because millisecond resolution
+        pyarrow.binary().id: "BYTES",
+        pyarrow.string().id: "STRING",  # also alias for pyarrow.utf8()
+        # The exact scale and precision don't matter, see below.
+        pyarrow.decimal128(38, scale=9).id: "NUMERIC",
+    }
+
+    # Add BIGNUMERIC support only for pyarrow >= 3.0.0, since Decimal256
+    # support was first added to Arrow in release 3.0.0.
+    # https://arrow.apache.org/blog/2021/01/25/3.0.0-release/
+    if version.parse(pyarrow.__version__) >= version.parse("3.0.0"):
+        _BQ_TO_ARROW_SCALARS["BIGNUMERIC"] = pyarrow_bignumeric
+        # The exact decimal's scale and precision are not important, as only
+        # the type ID matters, and it's the same for all decimal256 instances.
+        _ARROW_SCALAR_IDS_TO_BQ[pyarrow.decimal256(76, scale=38).id] = "BIGNUMERIC"
+
+
+def bq_to_arrow_scalars(bq_scalar: str):
+    """
+    Returns:
+        The Arrow scalar type constructor that the input BigQuery scalar type
+        maps to, or None if the BigQuery scalar type is unknown.
+    """
+    return _BQ_TO_ARROW_SCALARS.get(bq_scalar)
+
+
+def arrow_scalar_ids_to_bq(arrow_scalar: Any):
+    """
+    Returns:
+        The BigQuery scalar type name that the input Arrow type ID maps to,
+        or None if the Arrow type ID is unknown.
+    """
+    return _ARROW_SCALAR_IDS_TO_BQ.get(arrow_scalar)
diff --git a/google/cloud/bigquery/_versions_helpers.py b/google/cloud/bigquery/_versions_helpers.py
new file mode 100644
index 000000000..1f04c74e0
--- /dev/null
+++ b/google/cloud/bigquery/_versions_helpers.py
@@ -0,0 +1,94 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared helper functions for verifying versions of installed modules.""" + +from typing import Any + +import packaging.version + +from google.cloud.bigquery import exceptions + + +_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0") + + +class PyarrowVersions: + """Version comparisons for pyarrow package.""" + + def __init__(self): + self._installed_version = None + + @property + def installed_version(self) -> packaging.version.Version: + """Return the parsed version of pyarrow.""" + if self._installed_version is None: + import pyarrow # type: ignore + + self._installed_version = packaging.version.parse( + # Use 0.0.0, since it is earlier than any released version. + # Legacy versions also have the same property, but + # creating a LegacyVersion has been deprecated. + # https://github.com/pypa/packaging/issues/321 + getattr(pyarrow, "__version__", "0.0.0") + ) + + return self._installed_version + + @property + def use_compliant_nested_type(self) -> bool: + return self.installed_version.major >= 4 + + def try_import(self, raise_if_error: bool = False) -> Any: + """Verify that a recent enough version of pyarrow extra is installed. + + The function assumes that pyarrow extra is installed, and should thus + be used in places where this assumption holds. + + Because `pip` can install an outdated version of this extra despite + the constraints in `setup.py`, the calling code can use this helper + to verify the version compatibility at runtime. + + Returns: + The ``pyarrow`` module or ``None``. + + Raises: + exceptions.LegacyPyarrowError: + If the pyarrow package is outdated and ``raise_if_error`` is + ``True``. + """ + try: + import pyarrow + except ImportError as exc: # pragma: NO COVER + if raise_if_error: + raise exceptions.LegacyPyarrowError( + "pyarrow package not found. Install pyarrow version >=" + f" {_MIN_PYARROW_VERSION}." + ) from exc + return None + + if self.installed_version < _MIN_PYARROW_VERSION: + if raise_if_error: + msg = ( + "Dependency pyarrow is outdated, please upgrade" + f" it to version >= {_MIN_PYARROW_VERSION}" + f" (version found: {self.installed_version})." 
+ ) + raise exceptions.LegacyPyarrowError(msg) + return None + + return pyarrow + + +PYARROW_VERSIONS = PyarrowVersions() diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index f7c7864a1..ed75215b6 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -27,7 +27,6 @@ import json import math import os -import packaging.version import tempfile import typing from typing import ( @@ -45,13 +44,6 @@ import uuid import warnings -try: - import pyarrow # type: ignore - - _PYARROW_VERSION = packaging.version.parse(pyarrow.__version__) -except ImportError: # pragma: NO COVER - pyarrow = None - from google import resumable_media # type: ignore from google.resumable_media.requests import MultipartUpload # type: ignore from google.resumable_media.requests import ResumableUpload @@ -84,12 +76,13 @@ from google.cloud.bigquery._helpers import _DEFAULT_HOST from google.cloud.bigquery._http import Connection from google.cloud.bigquery import _pandas_helpers +from google.cloud.bigquery import _versions_helpers from google.cloud.bigquery.dataset import Dataset from google.cloud.bigquery.dataset import DatasetListItem from google.cloud.bigquery.dataset import DatasetReference from google.cloud.bigquery import enums from google.cloud.bigquery.enums import AutoRowIDs -from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError +from google.cloud.bigquery import exceptions as bq_exceptions from google.cloud.bigquery.opentelemetry_tracing import create_span from google.cloud.bigquery import job from google.cloud.bigquery.job import ( @@ -121,7 +114,8 @@ from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.table import RowIterator from google.cloud.bigquery.format_options import ParquetOptions -from google.cloud.bigquery import _helpers + +pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import() TimeoutType = Union[float, None] ResumableTimeoutType = Union[ @@ -159,9 +153,6 @@ TIMEOUT_HEADER = "X-Server-Timeout" -# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414 -_PYARROW_BAD_VERSIONS = frozenset([packaging.version.Version("2.0.0")]) - class Project(object): """Wrapper for resource describing a BigQuery project. @@ -574,7 +565,7 @@ def _ensure_bqstorage_client( try: BQ_STORAGE_VERSIONS.verify_version() - except LegacyBigQueryStorageError as exc: + except bq_exceptions.LegacyBigQueryStorageError as exc: warnings.warn(str(exc)) return None if bqstorage_client is None: @@ -2686,16 +2677,6 @@ def load_table_from_dataframe( try: if new_job_config.source_format == job.SourceFormat.PARQUET: - if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS: - msg = ( - "Loading dataframe data in PARQUET format with pyarrow " - f"{_PYARROW_VERSION} can result in data corruption. It is " - "therefore *strongly* advised to use a different pyarrow " - "version or a different source format. 
" - "See: https://github.com/googleapis/python-bigquery/issues/781" - ) - warnings.warn(msg, category=RuntimeWarning) - if new_job_config.schema: if parquet_compression == "snappy": # adjust the default value parquet_compression = parquet_compression.upper() @@ -2714,7 +2695,7 @@ def load_table_from_dataframe( compression=parquet_compression, **( {"use_compliant_nested_type": True} - if _helpers.PYARROW_VERSIONS.use_compliant_nested_type + if _versions_helpers.PYARROW_VERSIONS.use_compliant_nested_type else {} ), ) diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 462447d51..a967a1795 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -61,7 +61,7 @@ from google.cloud.bigquery import _helpers from google.cloud.bigquery import _pandas_helpers from google.cloud.bigquery.enums import DefaultPandasDTypes -from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError +from google.cloud.bigquery import exceptions from google.cloud.bigquery.schema import _build_schema_resource from google.cloud.bigquery.schema import _parse_schema_resource from google.cloud.bigquery.schema import _to_schema_fields @@ -1616,7 +1616,7 @@ def _validate_bqstorage(self, bqstorage_client, create_bqstorage_client): try: _helpers.BQ_STORAGE_VERSIONS.verify_version() - except LegacyBigQueryStorageError as exc: + except exceptions.LegacyBigQueryStorageError as exc: warnings.warn(str(exc)) return False diff --git a/tests/unit/test__helpers.py b/tests/unit/test__helpers.py index 4fb86f665..40223f041 100644 --- a/tests/unit/test__helpers.py +++ b/tests/unit/test__helpers.py @@ -19,16 +19,13 @@ import mock +from google.cloud.bigquery import exceptions + try: from google.cloud import bigquery_storage # type: ignore except ImportError: # pragma: NO COVER bigquery_storage = None -try: - import pyarrow -except ImportError: # pragma: NO COVER - pyarrow = None - @unittest.skipIf(bigquery_storage is None, "Requires `google-cloud-bigquery-storage`") class TestBQStorageVersions(unittest.TestCase): @@ -50,28 +47,24 @@ def _call_fut(self): return _helpers.BQ_STORAGE_VERSIONS.verify_version() def test_raises_no_error_w_recent_bqstorage(self): - from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError - with mock.patch("google.cloud.bigquery_storage.__version__", new="2.0.0"): try: self._call_fut() - except LegacyBigQueryStorageError: # pragma: NO COVER + except exceptions.LegacyBigQueryStorageError: # pragma: NO COVER self.fail("Legacy error raised with a non-legacy dependency version.") def test_raises_error_w_legacy_bqstorage(self): - from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError - with mock.patch("google.cloud.bigquery_storage.__version__", new="1.9.9"): - with self.assertRaises(LegacyBigQueryStorageError): + with self.assertRaises(exceptions.LegacyBigQueryStorageError): self._call_fut() def test_raises_error_w_unknown_bqstorage_version(self): - from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError - with mock.patch("google.cloud.bigquery_storage", autospec=True) as fake_module: del fake_module.__version__ error_pattern = r"version found: 0.0.0" - with self.assertRaisesRegex(LegacyBigQueryStorageError, error_pattern): + with self.assertRaisesRegex( + exceptions.LegacyBigQueryStorageError, error_pattern + ): self._call_fut() def test_installed_version_returns_cached(self): @@ -100,63 +93,6 @@ def test_is_read_session_optional_false(self): assert not versions.is_read_session_optional 
-@unittest.skipIf(pyarrow is None, "Requires `pyarrow`") -class TestPyarrowVersions(unittest.TestCase): - def tearDown(self): - from google.cloud.bigquery import _helpers - - # Reset any cached versions since it may not match reality. - _helpers.PYARROW_VERSIONS._installed_version = None - - def _object_under_test(self): - from google.cloud.bigquery import _helpers - - return _helpers.PyarrowVersions() - - def _call_try_import(self, **kwargs): - from google.cloud.bigquery import _helpers - - _helpers.PYARROW_VERSIONS._installed_version = None - return _helpers.PYARROW_VERSIONS.try_import(**kwargs) - - def test_try_import_raises_no_error_w_recent_pyarrow(self): - from google.cloud.bigquery.exceptions import LegacyPyarrowError - - with mock.patch("pyarrow.__version__", new="5.0.0"): - try: - pyarrow = self._call_try_import(raise_if_error=True) - self.assertIsNotNone(pyarrow) - except LegacyPyarrowError: # pragma: NO COVER - self.fail("Legacy error raised with a non-legacy dependency version.") - - def test_try_import_returns_none_w_legacy_pyarrow(self): - with mock.patch("pyarrow.__version__", new="2.0.0"): - pyarrow = self._call_try_import() - self.assertIsNone(pyarrow) - - def test_try_import_raises_error_w_legacy_pyarrow(self): - from google.cloud.bigquery.exceptions import LegacyPyarrowError - - with mock.patch("pyarrow.__version__", new="2.0.0"): - with self.assertRaises(LegacyPyarrowError): - self._call_try_import(raise_if_error=True) - - def test_installed_version_returns_cached(self): - versions = self._object_under_test() - versions._installed_version = object() - assert versions.installed_version is versions._installed_version - - def test_installed_version_returns_parsed_version(self): - versions = self._object_under_test() - - with mock.patch("pyarrow.__version__", new="1.2.3"): - version = versions.installed_version - - assert version.major == 1 - assert version.minor == 2 - assert version.micro == 3 - - class Test_not_null(unittest.TestCase): def _call_fut(self, value, field): from google.cloud.bigquery._helpers import _not_null diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index a4cc1fefb..7724f308b 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -41,10 +41,12 @@ from google.cloud.bigquery import exceptions from google.cloud.bigquery import _helpers +from google.cloud.bigquery import _pyarrow_helpers +from google.cloud.bigquery import _versions_helpers from google.cloud.bigquery import schema from google.cloud.bigquery._pandas_helpers import _BIGNUMERIC_SUPPORT -pyarrow = _helpers.PYARROW_VERSIONS.try_import() +pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import() if pyarrow: import pyarrow.parquet @@ -346,14 +348,14 @@ def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type): pyarrow.field("field04", pyarrow.int64()), pyarrow.field("field05", pyarrow.float64()), pyarrow.field("field06", pyarrow.float64()), - pyarrow.field("field07", module_under_test.pyarrow_numeric()), - pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), + pyarrow.field("field07", _pyarrow_helpers.pyarrow_numeric()), + pyarrow.field("field08", _pyarrow_helpers.pyarrow_bignumeric()), pyarrow.field("field09", pyarrow.bool_()), pyarrow.field("field10", pyarrow.bool_()), - pyarrow.field("field11", module_under_test.pyarrow_timestamp()), + pyarrow.field("field11", _pyarrow_helpers.pyarrow_timestamp()), pyarrow.field("field12", pyarrow.date32()), - pyarrow.field("field13", 
module_under_test.pyarrow_time()), - pyarrow.field("field14", module_under_test.pyarrow_datetime()), + pyarrow.field("field13", _pyarrow_helpers.pyarrow_time()), + pyarrow.field("field14", _pyarrow_helpers.pyarrow_datetime()), pyarrow.field("field15", pyarrow.string()), ) expected = pyarrow.struct(expected) @@ -394,14 +396,14 @@ def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type): pyarrow.field("field04", pyarrow.int64()), pyarrow.field("field05", pyarrow.float64()), pyarrow.field("field06", pyarrow.float64()), - pyarrow.field("field07", module_under_test.pyarrow_numeric()), - pyarrow.field("field08", module_under_test.pyarrow_bignumeric()), + pyarrow.field("field07", _pyarrow_helpers.pyarrow_numeric()), + pyarrow.field("field08", _pyarrow_helpers.pyarrow_bignumeric()), pyarrow.field("field09", pyarrow.bool_()), pyarrow.field("field10", pyarrow.bool_()), - pyarrow.field("field11", module_under_test.pyarrow_timestamp()), + pyarrow.field("field11", _pyarrow_helpers.pyarrow_timestamp()), pyarrow.field("field12", pyarrow.date32()), - pyarrow.field("field13", module_under_test.pyarrow_time()), - pyarrow.field("field14", module_under_test.pyarrow_datetime()), + pyarrow.field("field13", _pyarrow_helpers.pyarrow_time()), + pyarrow.field("field14", _pyarrow_helpers.pyarrow_datetime()), pyarrow.field("field15", pyarrow.string()), ) expected_value_type = pyarrow.struct(expected) @@ -1117,7 +1119,9 @@ def test_dataframe_to_parquet_without_pyarrow(module_under_test, monkeypatch): mock_pyarrow_import.side_effect = exceptions.LegacyPyarrowError( "pyarrow not installed" ) - monkeypatch.setattr(_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import) + monkeypatch.setattr( + _versions_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import + ) with pytest.raises(exceptions.LegacyPyarrowError): module_under_test.dataframe_to_parquet(pandas.DataFrame(), (), None) diff --git a/tests/unit/test__pyarrow_helpers.py b/tests/unit/test__pyarrow_helpers.py new file mode 100644 index 000000000..f0a872c88 --- /dev/null +++ b/tests/unit/test__pyarrow_helpers.py @@ -0,0 +1,38 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import pytest
+
+
+pyarrow = pytest.importorskip("pyarrow", minversion="3.0.0")
+
+
+@pytest.fixture
+def module_under_test():
+    from google.cloud.bigquery import _pyarrow_helpers
+
+    return _pyarrow_helpers
+
+
+def test_bq_to_arrow_scalars(module_under_test):
+    assert (
+        module_under_test.bq_to_arrow_scalars("BIGNUMERIC")
+        == module_under_test.pyarrow_bignumeric
+    )
+    assert module_under_test.bq_to_arrow_scalars("UNKNOWN_TYPE") is None
+
+
+def test_arrow_scalar_ids_to_bq(module_under_test):
+    assert module_under_test.arrow_scalar_ids_to_bq(pyarrow.bool_().id) == "BOOL"
+    assert module_under_test.arrow_scalar_ids_to_bq("UNKNOWN_TYPE") is None
diff --git a/tests/unit/test__versions_helpers.py b/tests/unit/test__versions_helpers.py
new file mode 100644
index 000000000..21386610b
--- /dev/null
+++ b/tests/unit/test__versions_helpers.py
@@ -0,0 +1,62 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import mock
+
+from google.cloud.bigquery import _versions_helpers
+from google.cloud.bigquery import exceptions
+
+pyarrow = pytest.importorskip("pyarrow")
+
+
+def test_try_import_raises_no_error_w_recent_pyarrow():
+    versions = _versions_helpers.PyarrowVersions()
+    with mock.patch("pyarrow.__version__", new="5.0.0"):
+        try:
+            pyarrow = versions.try_import(raise_if_error=True)
+            assert pyarrow is not None
+        except exceptions.LegacyPyarrowError:  # pragma: NO COVER
+            pytest.fail("Legacy error raised with a non-legacy dependency version.")
+
+
+def test_try_import_returns_none_w_legacy_pyarrow():
+    versions = _versions_helpers.PyarrowVersions()
+    with mock.patch("pyarrow.__version__", new="2.0.0"):
+        pyarrow = versions.try_import()
+        assert pyarrow is None
+
+
+def test_try_import_raises_error_w_legacy_pyarrow():
+    versions = _versions_helpers.PyarrowVersions()
+    with mock.patch("pyarrow.__version__", new="2.0.0"):
+        with pytest.raises(exceptions.LegacyPyarrowError):
+            versions.try_import(raise_if_error=True)
+
+
+def test_installed_version_returns_cached():
+    versions = _versions_helpers.PyarrowVersions()
+    versions._installed_version = object()
+    assert versions.installed_version is versions._installed_version
+
+
+def test_installed_version_returns_parsed_version():
+    versions = _versions_helpers.PyarrowVersions()
+    with mock.patch("pyarrow.__version__", new="1.2.3"):
+        version = versions.installed_version
+
+    assert version.major == 1
+    assert version.minor == 2
+    assert version.micro == 3
diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py
index faa065116..3143f2123 100644
--- a/tests/unit/test_client.py
+++ b/tests/unit/test_client.py
@@ -27,8 +27,8 @@
 import warnings
 
 import mock
-import packaging
 import requests
+import packaging
 import pytest
 import pkg_resources
 
@@ -65,6 +65,7 @@
 from google.cloud import bigquery
 from google.cloud.bigquery.dataset import DatasetReference
+from google.cloud.bigquery import exceptions
 from google.cloud.bigquery.retry import DEFAULT_TIMEOUT
 from google.cloud.bigquery import ParquetOptions
 
@@ 
-821,14 +822,12 @@ def fail_bqstorage_import(name, globals, locals, fromlist, level): bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_ensure_bqstorage_client_obsolete_dependency(self): - from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError - creds = _make_credentials() client = self._make_one(project=self.PROJECT, credentials=creds) patcher = mock.patch( "google.cloud.bigquery.client.BQ_STORAGE_VERSIONS.verify_version", - side_effect=LegacyBigQueryStorageError("BQ Storage too old"), + side_effect=exceptions.LegacyBigQueryStorageError("BQ Storage too old"), ) with patcher, warnings.catch_warnings(record=True) as warned: bqstorage_client = client._ensure_bqstorage_client() @@ -857,15 +856,13 @@ def test_ensure_bqstorage_client_existing_client_check_passes(self): bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def test_ensure_bqstorage_client_existing_client_check_fails(self): - from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError - creds = _make_credentials() client = self._make_one(project=self.PROJECT, credentials=creds) mock_storage_client = mock.sentinel.mock_storage_client patcher = mock.patch( "google.cloud.bigquery.client.BQ_STORAGE_VERSIONS.verify_version", - side_effect=LegacyBigQueryStorageError("BQ Storage too old"), + side_effect=exceptions.LegacyBigQueryStorageError("BQ Storage too old"), ) with patcher, warnings.catch_warnings(record=True) as warned: bqstorage_client = client._ensure_bqstorage_client(mock_storage_client) @@ -8615,7 +8612,7 @@ def test_load_table_from_dataframe_w_bad_pyarrow_issues_warning(self): dataframe = pandas.DataFrame(records) pyarrow_version_patch = mock.patch( - "google.cloud.bigquery.client._PYARROW_VERSION", + "google.cloud.bigquery._versions_helpers.PYARROW_VERSIONS._installed_version", packaging.version.parse("2.0.0"), # A known bad version of pyarrow. 
) get_table_patch = mock.patch( @@ -8628,22 +8625,13 @@ def test_load_table_from_dataframe_w_bad_pyarrow_issues_warning(self): ) with load_patch, get_table_patch, pyarrow_version_patch: - with warnings.catch_warnings(record=True) as warned: + with pytest.raises(exceptions.LegacyPyarrowError): client.load_table_from_dataframe( dataframe, self.TABLE_REF, location=self.LOCATION, ) - expected_warnings = [ - warning for warning in warned if "pyarrow" in str(warning).lower() - ] - assert len(expected_warnings) == 1 - assert issubclass(expected_warnings[0].category, RuntimeWarning) - msg = str(expected_warnings[0].message) - assert "pyarrow 2.0.0" in msg - assert "data corruption" in msg - @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") def test_load_table_from_dataframe_w_nulls(self): diff --git a/tests/unit/test_magics.py b/tests/unit/test_magics.py index 70bfc4d0c..0cab943f7 100644 --- a/tests/unit/test_magics.py +++ b/tests/unit/test_magics.py @@ -25,6 +25,7 @@ from test_utils.imports import maybe_fail_import from google.cloud import bigquery +from google.cloud.bigquery import exceptions as bq_exceptions from google.cloud.bigquery import job from google.cloud.bigquery import table from google.cloud.bigquery.retry import DEFAULT_TIMEOUT @@ -357,8 +358,6 @@ def test__make_bqstorage_client_true_raises_import_error(missing_bq_storage): bigquery_storage is None, reason="Requires `google-cloud-bigquery-storage`" ) def test__make_bqstorage_client_true_obsolete_dependency(): - from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError - credentials_mock = mock.create_autospec( google.auth.credentials.Credentials, instance=True ) @@ -368,7 +367,7 @@ def test__make_bqstorage_client_true_obsolete_dependency(): patcher = mock.patch( "google.cloud.bigquery.client.BQ_STORAGE_VERSIONS.verify_version", - side_effect=LegacyBigQueryStorageError("BQ Storage too old"), + side_effect=bq_exceptions.LegacyBigQueryStorageError("BQ Storage too old"), ) with patcher, warnings.catch_warnings(record=True) as warned: got = magics._make_bqstorage_client(test_client, True, {}) diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index a2c82c0a8..65eb659bf 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -28,6 +28,8 @@ import google.api_core.exceptions from test_utils.imports import maybe_fail_import +from google.cloud.bigquery import _versions_helpers +from google.cloud.bigquery import exceptions from google.cloud.bigquery.table import TableReference from google.cloud.bigquery.dataset import DatasetReference @@ -40,17 +42,12 @@ bigquery_storage = None big_query_read_grpc_transport = None -from google.cloud.bigquery import _helpers -pyarrow = _helpers.PYARROW_VERSIONS.try_import() -PYARROW_VERSION = pkg_resources.parse_version("0.0.1") +pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import() if pyarrow: - import pyarrow import pyarrow.types - PYARROW_VERSION = pkg_resources.parse_version(pyarrow.__version__) - try: import pandas except (ImportError, AttributeError): # pragma: NO COVER @@ -73,8 +70,6 @@ except (ImportError, AttributeError): # pragma: NO COVER tqdm = None -PYARROW_TIMESTAMP_VERSION = pkg_resources.parse_version("2.0.0") - if pandas is not None: PANDAS_INSTALLED_VERSION = pkg_resources.get_distribution("pandas").parsed_version else: @@ -2262,13 +2257,11 @@ def fail_bqstorage_import(name, globals, locals, fromlist, level): bigquery_storage is None, "Requires `google-cloud-bigquery-storage`" ) def 
test__validate_bqstorage_returns_false_w_warning_if_obsolete_version(self): - from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError - iterator = self._make_one(first_page_response=None) # not cached patcher = mock.patch( "google.cloud.bigquery.table._helpers.BQ_STORAGE_VERSIONS.verify_version", - side_effect=LegacyBigQueryStorageError("BQ Storage too old"), + side_effect=exceptions.LegacyBigQueryStorageError("BQ Storage too old"), ) with patcher, warnings.catch_warnings(record=True) as warned: result = iterator._validate_bqstorage( @@ -2874,7 +2867,7 @@ def test_to_arrow_ensure_bqstorage_client_wo_bqstorage(self): row_iterator = self._make_one(mock_client, api_request, path, schema) def mock_verify_version(): - raise _helpers.LegacyBigQueryStorageError("no bqstorage") + raise exceptions.LegacyBigQueryStorageError("no bqstorage") with mock.patch( "google.cloud.bigquery._helpers.BQ_STORAGE_VERSIONS.verify_version",
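
For reference, a minimal sketch of how calling code consumes the relocated helpers after this refactor. Both modules are private to the library, and the sketch assumes pyarrow >= 3.0.0 is installed (the minimum that try_import enforces):

    from google.cloud.bigquery import _pyarrow_helpers
    from google.cloud.bigquery import _versions_helpers

    # Returns the pyarrow module, or None when pyarrow is missing or older than
    # 3.0.0; with raise_if_error=True it raises exceptions.LegacyPyarrowError.
    pyarrow = _versions_helpers.PYARROW_VERSIONS.try_import()

    if pyarrow is not None:
        # Scalar type mappings now sit behind accessor functions instead of
        # the module-level dicts formerly defined in _pandas_helpers.
        numeric = _pyarrow_helpers.bq_to_arrow_scalars("NUMERIC")()  # decimal128(38, 9)
        bq_type = _pyarrow_helpers.arrow_scalar_ids_to_bq(pyarrow.int64().id)  # "INT64"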