diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 52b01d3ef7..0223811ebc 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -53,7 +53,7 @@ def json_set( >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) 0 {"a":100,"b":"hi"} - Name: data, dtype: large_string[pyarrow] + Name: data, dtype: dbjson Args: input (bigframes.series.Series): @@ -253,7 +253,7 @@ def parse_json( dtype: string >>> bbq.parse_json(s) 0 {"class":{"students":[{"id":5},{"id":12}]}} - dtype: large_string[pyarrow] + dtype: dbjson Args: input (bigframes.series.Series): diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 0bae094777..d9bba9bdb0 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -108,8 +108,8 @@ def from_table( raise ValueError("must set at most one of 'offests', 'primary_key'") if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names): msg = ( - "Interpreting JSON column(s) as pyarrow.large_string. " - "This behavior may change in future versions." + "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is " + "in preview; this behavior may change in future versions." 
) warnings.warn(msg, bfe.PreviewWarning) # define data source only for needed columns, this makes row-hashing cheaper diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 18f0834903..a0afa29a15 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -16,7 +16,6 @@ import textwrap import typing from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union -import warnings import bigframes_vendored.constants as constants import bigframes_vendored.ibis @@ -26,6 +25,7 @@ dtype as python_type_to_ibis_type, ) import bigframes_vendored.ibis.expr.types as ibis_types +import db_dtypes # type: ignore import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery import numpy as np @@ -33,7 +33,6 @@ import pyarrow as pa import bigframes.dtypes -import bigframes.exceptions as bfe # Type hints for Ibis data types supported by BigQuery DataFrame IbisDtype = Union[ @@ -76,7 +75,7 @@ ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True), gpd.array.GeometryDtype(), ), - (ibis_dtypes.json, pd.ArrowDtype(pa.large_string())), + (ibis_dtypes.json, db_dtypes.JSONDtype()), ) BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = { @@ -305,13 +304,7 @@ def ibis_dtype_to_bigframes_dtype( if isinstance(ibis_dtype, ibis_dtypes.Integer): return pd.Int64Dtype() - # Temporary: Will eventually support an explicit json type instead of casting to string. if isinstance(ibis_dtype, ibis_dtypes.JSON): - msg = ( - "Interpreting JSON column(s) as pyarrow.large_string. This behavior may change " - "in future versions." 
- ) - warnings.warn(msg, category=bfe.PreviewWarning) return bigframes.dtypes.JSON_DTYPE if ibis_dtype in IBIS_TO_BIGFRAMES: diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 4f670b51ca..2ab10e025d 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1188,34 +1188,33 @@ def array_slice_op_impl(x: ibis_types.Value, op: ops.ArraySliceOp): # JSON Ops @scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True) def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet): - if x.type().is_json(): - return json_set( - json_obj=x, - json_path=op.json_path, - json_value=y, - ) - else: - # Enabling JSON type eliminates the need for less efficient string conversions. - return to_json_string( - json_set( # type: ignore - json_obj=parse_json(json_str=x), - json_path=op.json_path, - json_value=y, - ) - ) + return json_set(json_obj=x, json_path=op.json_path, json_value=y) @scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True) def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract): - if x.type().is_json(): - return json_extract(json_obj=x, json_path=op.json_path) - # json string - return json_extract_string(json_obj=x, json_path=op.json_path) + # Define a user-defined function whose returned type is dynamically matching the input. + def json_extract(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore + """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" + ... 
+ + return_type = x.type() + json_extract.__annotations__["return"] = return_type + json_extract_op = ibis_udf.scalar.builtin(json_extract) + return json_extract_op(json_or_json_string=x, json_path=op.json_path) @scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True) def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray): - return json_extract_array(json_obj=x, json_path=op.json_path) + # Define a user-defined function whose returned type is dynamically matching the input. + def json_extract_array(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore + """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" + ... + + return_type = x.type() + json_extract_array.__annotations__["return"] = ibis_dtypes.Array[return_type] # type: ignore + json_extract_op = ibis_udf.scalar.builtin(json_extract_array) + return json_extract_op(json_or_json_string=x, json_path=op.json_path) @scalar_op_compiler.register_unary_op(ops.JSONExtractStringArray, pass_op=True) @@ -1937,27 +1936,6 @@ def json_set( # type: ignore[empty-body] """Produces a new SQL JSON value with the specified JSON data inserted or replaced.""" -@ibis_udf.scalar.builtin(name="json_extract") -def json_extract( # type: ignore[empty-body] - json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String -) -> ibis_dtypes.JSON: - """Extracts a JSON value and converts it to a JSON value.""" - - -@ibis_udf.scalar.builtin(name="json_extract") -def json_extract_string( # type: ignore[empty-body] - json_obj: ibis_dtypes.String, json_path: ibis_dtypes.String -) -> ibis_dtypes.String: - """Extracts a JSON SRING value and converts it to a SQL JSON-formatted STRING.""" - - -@ibis_udf.scalar.builtin(name="json_extract_array") -def json_extract_array( # type: ignore[empty-body] - json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String -) -> ibis_dtypes.Array[ibis_dtypes.String]: - """Extracts a JSON array and converts it to a SQL ARRAY of 
JSON-formatted STRINGs or JSON values.""" - - @ibis_udf.scalar.builtin(name="json_extract_string_array") def json_extract_string_array( # type: ignore[empty-body] json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 3da3fa24f3..863615118a 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -21,6 +21,7 @@ from typing import Any, Dict, List, Literal, Union import bigframes_vendored.constants as constants +import db_dtypes # type: ignore import geopandas as gpd # type: ignore import google.cloud.bigquery import numpy as np @@ -59,7 +60,7 @@ # No arrow equivalent GEO_DTYPE = gpd.array.GeometryDtype() # JSON -JSON_DTYPE = pd.ArrowDtype(pa.large_string()) +JSON_DTYPE = db_dtypes.JSONDtype() OBJ_REF_DTYPE = pd.ArrowDtype( pa.struct( ( @@ -161,7 +162,7 @@ class SimpleDtypeInfo: ), SimpleDtypeInfo( dtype=JSON_DTYPE, - arrow_dtype=pa.large_string(), + arrow_dtype=db_dtypes.JSONArrowType(), type_kind=("JSON",), orderable=False, clusterable=False, @@ -320,7 +321,6 @@ def is_struct_like(type_: ExpressionType) -> bool: def is_json_like(type_: ExpressionType) -> bool: - # TODO: Add JSON type support return type_ == JSON_DTYPE or type_ == STRING_DTYPE # Including JSON string diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index 86c5a19ba7..1daacf4e6b 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -50,7 +50,7 @@ def output_type(self, *input_types): + f" Received type: {input_type}" ) return pd.ArrowDtype( - pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE)) + pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_type)) ) @@ -118,8 +118,7 @@ def output_type(self, *input_types): + f"Received type: {right_type}" ) - # After JSON type implementation, ONLY return JSON data. 
- return left_type + return dtypes.JSON_DTYPE @dataclasses.dataclass(frozen=True) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 2f6aade0e5..301e1c4ebb 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -17,6 +17,7 @@ from typing import Collection, Union import bigframes_vendored.constants as constants +import db_dtypes # type: ignore import geopandas # type: ignore import numpy as np import pandas @@ -122,6 +123,8 @@ def arrow_to_pandas( ) elif isinstance(dtype, pandas.ArrowDtype): series = _arrow_to_pandas_arrowdtype(column, dtype) + elif isinstance(dtype, db_dtypes.JSONDtype): + series = db_dtypes.JSONArray(column) else: series = column.to_pandas(types_mapper=lambda _: dtype) diff --git a/setup.py b/setup.py index 74a0d5475c..047da2348c 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,7 @@ "ipywidgets >=7.7.1", "humanize >=4.6.0", "matplotlib >=3.7.1", + "db-dtypes >=1.4.0", # For vendored ibis-framework. "atpublic>=2.3,<6", "parsy>=2,<3", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 015153cb01..8b7ad892c0 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -26,6 +26,7 @@ tabulate==0.9 ipywidgets==7.7.1 humanize==4.6.0 matplotlib==3.7.1 +db-dtypes==1.4.0 # For vendored ibis-framework. atpublic==2.3 parsy==2.0 diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index b01ac3aaf2..aa490749ae 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -118,7 +118,6 @@ def test_json_set_w_invalid_series_type(): def test_json_extract_from_json(): s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}]) actual = bbq.json_extract(s, "$.a.b").to_pandas() - # After the introduction of the JSON type, the output should be a JSON-formatted series. 
expected = _get_series_from_json([[1, 2], None, 0]).to_pandas() pd.testing.assert_series_equal( actual, @@ -129,12 +128,10 @@ def test_json_extract_from_string(): s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}']) actual = bbq.json_extract(s, "$.a.b") - expected = _get_series_from_json([[1, 2], None, 0]) + expected = bpd.Series(["[1,2]", None, "0"]) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), - check_names=False, - check_dtype=False, # json_extract returns string type. While _get_series_from_json gives a JSON series (pa.large_string). ) @@ -143,20 +140,58 @@ def test_json_extract_w_invalid_series_type(): bbq.json_extract(bpd.Series([1, 2]), "$.a") +def test_json_extract_array_from_json(): + s = _get_series_from_json( + [{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4", "5"]}, {}] + ) + actual = bbq.json_extract_array(s, "$.a") + + # This code provides a workaround for issue https://github.com/apache/arrow/issues/45262, + # which currently prevents constructing a series using the pa.list_(db_dtypes.JSONArrowType()) + sql = """ + SELECT 0 AS id, [JSON '"ab"', JSON '"2"', JSON '"3 xy"'] AS data, + UNION ALL + SELECT 1, [], + UNION ALL + SELECT 2, [JSON '"4"', JSON '"5"'], + UNION ALL + SELECT 3, null, + """ + df = bpd.read_gbq(sql).set_index("id").sort_index() + expected = df["data"] + + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + def test_json_extract_array_from_json_strings(): - s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']) + s = bpd.Series( + ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"], + dtype=pd.StringDtype(storage="pyarrow"), + ) actual = bbq.json_extract_array(s, "$.a") - expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"']]) + expected = bpd.Series( + [['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None], + dtype=pd.StringDtype(storage="pyarrow"), + ) 
pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), ) -def test_json_extract_array_from_array_strings(): - s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"]) +def test_json_extract_array_from_json_array_strings(): + s = bpd.Series( + ["[1, 2, 3]", "[]", "[4,5]"], + dtype=pd.StringDtype(storage="pyarrow"), + ) actual = bbq.json_extract_array(s) - expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]]) + expected = bpd.Series( + [["1", "2", "3"], [], ["4", "5"]], + dtype=pd.StringDtype(storage="pyarrow"), + ) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -164,8 +199,9 @@ def test_json_extract_array_from_array_strings(): def test_json_extract_array_w_invalid_series_type(): + s = bpd.Series([1, 2]) with pytest.raises(TypeError): - bbq.json_extract_array(bpd.Series([1, 2])) + bbq.json_extract_array(s) def test_json_extract_string_array_from_json_strings(): @@ -203,14 +239,6 @@ def test_json_extract_string_array_w_invalid_series_type(): bbq.json_extract_string_array(bpd.Series([1, 2])) -# b/381148539 -def test_json_in_struct(): - df = bpd.read_gbq( - "SELECT STRUCT(JSON '{\\\"a\\\": 1}' AS data, 1 AS number) as struct_col" - ) - assert df["struct_col"].struct.field("data")[0] == '{"a":1}' - - def test_parse_json_w_invalid_series_type(): with pytest.raises(TypeError): bbq.parse_json(bpd.Series([1, 2])) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 848e21f6bd..10637b2395 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import math from typing import Tuple +import db_dtypes # type:ignore import google.api_core.exceptions import pandas as pd import pandas.testing @@ -247,23 +249,146 @@ def test_to_pandas_array_struct_correct_result(session): ) -def test_load_json(session): - df = session.read_gbq( - """SELECT - JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_column - """ - ) - +def test_load_json_w_unboxed_py_value(session): + sql = """ + SELECT 0 AS id, JSON_OBJECT('boolean', True) AS json_col, + UNION ALL + SELECT 1, JSON_OBJECT('int', 100), + UNION ALL + SELECT 2, JSON_OBJECT('float', 0.98), + UNION ALL + SELECT 3, JSON_OBJECT('string', 'hello world'), + UNION ALL + SELECT 4, JSON_OBJECT('array', [8, 9, 10]), + UNION ALL + SELECT 5, JSON_OBJECT('null', null), + UNION ALL + SELECT + 6, + JSON_OBJECT( + 'dict', + JSON_OBJECT( + 'int', 1, + 'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)] + ) + ), + """ + df = session.read_gbq(sql, index_col="id") + + assert df.dtypes["json_col"] == db_dtypes.JSONDtype() + assert isinstance(df["json_col"][0], dict) + + assert df["json_col"][0]["boolean"] + assert df["json_col"][1]["int"] == 100 + assert math.isclose(df["json_col"][2]["float"], 0.98) + assert df["json_col"][3]["string"] == "hello world" + assert df["json_col"][4]["array"] == [8, 9, 10] + assert df["json_col"][5]["null"] is None + assert df["json_col"][6]["dict"] == { + "int": 1, + "array": [{"bar": "hello"}, {"foo": 1}], + } + + +def test_load_json_to_pandas_has_correct_result(session): + df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col") + assert df.dtypes["json_col"] == db_dtypes.JSONDtype() result = df.to_pandas() - expected = pd.DataFrame( - { - "json_column": ['{"bar":true,"foo":10}'], - }, - dtype=pd.ArrowDtype(pa.large_string()), - ) - expected.index = expected.index.astype("Int64") - pd.testing.assert_series_equal(result.dtypes, expected.dtypes) - pd.testing.assert_series_equal(result["json_column"], expected["json_column"]) + + # The 
order of keys within the JSON object shouldn't matter for equality checks. + pd_df = pd.DataFrame( + {"json_col": [{"bar": True, "foo": 10}]}, + dtype=db_dtypes.JSONDtype(), + ) + pd_df.index = pd_df.index.astype("Int64") + pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes) + pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"]) + + +def test_load_json_in_struct(session): + """Avoid regressions for internal issue 381148539.""" + sql = """ + SELECT 0 AS id, STRUCT(JSON_OBJECT('boolean', True) AS data, 1 AS number) AS struct_col + UNION ALL + SELECT 1, STRUCT(JSON_OBJECT('int', 100), 2), + UNION ALL + SELECT 2, STRUCT(JSON_OBJECT('float', 0.98), 3), + UNION ALL + SELECT 3, STRUCT(JSON_OBJECT('string', 'hello world'), 4), + UNION ALL + SELECT 4, STRUCT(JSON_OBJECT('array', [8, 9, 10]), 5), + UNION ALL + SELECT 5, STRUCT(JSON_OBJECT('null', null), 6), + UNION ALL + SELECT + 6, + STRUCT(JSON_OBJECT( + 'dict', + JSON_OBJECT( + 'int', 1, + 'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)] + ) + ), 7), + """ + df = session.read_gbq(sql, index_col="id") + + assert isinstance(df.dtypes["struct_col"], pd.ArrowDtype) + assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType) + + data = df["struct_col"].struct.field("data") + assert data.dtype == db_dtypes.JSONDtype() + + assert data[0]["boolean"] + assert data[1]["int"] == 100 + assert math.isclose(data[2]["float"], 0.98) + assert data[3]["string"] == "hello world" + assert data[4]["array"] == [8, 9, 10] + assert data[5]["null"] is None + assert data[6]["dict"] == { + "int": 1, + "array": [{"bar": "hello"}, {"foo": 1}], + } + + +def test_load_json_in_array(session): + sql = """ + SELECT + 0 AS id, + [ + JSON_OBJECT('boolean', True), + JSON_OBJECT('int', 100), + JSON_OBJECT('float', 0.98), + JSON_OBJECT('string', 'hello world'), + JSON_OBJECT('array', [8, 9, 10]), + JSON_OBJECT('null', null), + JSON_OBJECT( + 'dict', + JSON_OBJECT( + 'int', 1, + 'array', [JSON_OBJECT('bar', 
'hello'), JSON_OBJECT('foo', 1)] + ) + ) + ] AS array_col, + """ + df = session.read_gbq(sql, index_col="id") + + assert isinstance(df.dtypes["array_col"], pd.ArrowDtype) + assert isinstance(df.dtypes["array_col"].pyarrow_dtype, pa.ListType) + + data = df["array_col"].list + assert data.len()[0] == 7 + assert data[0].dtype == db_dtypes.JSONDtype() + + assert data[0][0]["boolean"] + assert data[1][0]["int"] == 100 + assert math.isclose(data[2][0]["float"], 0.98) + assert data[3][0]["string"] == "hello world" + assert data[4][0]["array"] == [8, 9, 10] + assert data[5][0]["null"] is None + assert data[6][0]["dict"] == { + "int": 1, + "array": [{"bar": "hello"}, {"foo": 1}], + } def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 670828f616..3d76122e9d 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -17,6 +17,7 @@ import re import tempfile +import db_dtypes # type: ignore import geopandas as gpd # type: ignore import numpy from packaging.version import Version @@ -281,7 +282,7 @@ def test_get_column(scalars_dfs, col_name, expected_dtype): def test_get_column_w_json(json_df, json_pandas_df): series = json_df["json_col"] series_pandas = series.to_pandas() - assert series.dtype == pd.ArrowDtype(pa.large_string()) + assert series.dtype == db_dtypes.JSONDtype() assert series_pandas.shape[0] == json_pandas_df.shape[0]