From e055046fe08fe259373c8095576997243dbbe27d Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:00:19 +0000 Subject: [PATCH 1/2] feat: add bbq.json_query_array and warn bbq.json_extract_array deprecated --- bigframes/bigquery/_operations/json.py | 59 ++++++++++++++++++++++++++ bigframes/operations/json_ops.py | 17 ++++++++ tests/unit/bigquery/test_json.py | 40 +++++++++++++++++ 3 files changed, 116 insertions(+) diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 561fb57348..00d230d684 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -133,6 +133,10 @@ def json_extract_array( `STRING` or `JSON` values. This function uses single quotes and brackets to escape invalid JSONPath characters in JSON keys. + .. deprecated:: 2.5.0 + The ``json_extract_array`` is deprecated and will be removed in a future version. + Use ``json_query_array`` instead. + **Examples:** >>> import bigframes.pandas as bpd @@ -172,6 +176,11 @@ def json_extract_array( Returns: bigframes.series.Series: A new Series with the parsed arrays from the input. """ + msg = ( + "The `json_extract_array` is deprecated and will be removed in a future version. " + "Use `json_query_array` instead." + ) + warnings.warn(bfe.format_message(msg), category=UserWarning) return input._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) @@ -273,6 +282,56 @@ def json_query( return input._apply_unary_op(ops.JSONQuery(json_path=json_path)) +def json_query_array( + input: series.Series, + json_path: str = "$", +) -> series.Series: + """Extracts a JSON array and converts it to a SQL array of JSON-formatted + `STRING` or `JSON` values. This function uses double quotes to escape invalid + JSONPath characters in JSON keys. For example: `"a.b"`. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) + >>> bbq.json_query_array(s) + 0 ['1' '2' '3'] + 1 ['4' '5'] + dtype: list[pyarrow] + + >>> s = bpd.Series([ + ... '{"fruits": [{"name": "apple"}, {"name": "cherry"}]}', + ... '{"fruits": [{"name": "guava"}, {"name": "grapes"}]}' + ... ]) + >>> bbq.json_query_array(s, "$.fruits") + 0 ['{"name":"apple"}' '{"name":"cherry"}'] + 1 ['{"name":"guava"}' '{"name":"grapes"}'] + dtype: list[pyarrow] + + >>> s = bpd.Series([ + ... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}', + ... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}' + ... ]) + >>> bbq.json_query_array(s, "$.fruits.names") + 0 ['"apple"' '"cherry"'] + 1 ['"guava"' '"grapes"'] + dtype: list[pyarrow] + + Args: + input (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the parsed arrays from the input. + """ + return input._apply_unary_op(ops.JSONQueryArray(json_path=json_path)) + + def json_value( input: series.Series, json_path: str, diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index b083035d38..95a47dcadb 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -37,6 +37,23 @@ def output_type(self, *input_types): return input_type +@dataclasses.dataclass(frozen=True) +class JSONQueryArray(base_ops.UnaryOp): + name: typing.ClassVar[str] = "json_query_array" + json_path: str + + def output_type(self, *input_types): + input_type = input_types[0] + if not dtypes.is_json_like(input_type): + raise TypeError( + "Input type must be a valid JSON object or JSON-formatted string type." 
+ + f" Received type: {input_type}" + ) + return pd.ArrowDtype( + pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_type)) + ) + + @dataclasses.dataclass(frozen=True) class JSONExtractArray(base_ops.UnaryOp): name: typing.ClassVar[str] = "json_extract_array" diff --git a/tests/unit/bigquery/test_json.py b/tests/unit/bigquery/test_json.py index d9beea26db..c6dfd50e38 100644 --- a/tests/unit/bigquery/test_json.py +++ b/tests/unit/bigquery/test_json.py @@ -18,9 +18,49 @@ import bigframes.bigquery as bbq import bigframes.pandas as bpd +import bigframes.operations as ops +import bigframes.dtypes as dtypes def test_json_set_w_invalid_json_path_value_pairs(): mock_series = mock.create_autospec(bpd.pandas.Series, instance=True) with pytest.raises(ValueError, match="Incorrect format"): bbq.json_set(mock_series, json_path_value_pairs=[("$.a", 1, 100)]) # type: ignore + + +def test_json_query_array_specific_path(): + mock_input_series = mock.create_autospec(bpd.Series, instance=True) + # Ensure the mock series has a dtype that is_json_like + mock_input_series.dtype = dtypes.STRING_DTYPE + + bbq.json_query_array(mock_input_series, json_path="$.items") + + mock_input_series._apply_unary_op.assert_called_once_with( + ops.JSONQueryArray(json_path="$.items") + ) + +def test_json_query_array_default_path(): + mock_input_series = mock.create_autospec(bpd.Series, instance=True) + # Ensure the mock series has a dtype that is_json_like + mock_input_series.dtype = dtypes.JSON_DTYPE + + bbq.json_query_array(mock_input_series) # Default path "$" + + mock_input_series._apply_unary_op.assert_called_once_with( + ops.JSONQueryArray(json_path="$") + ) + +def test_json_query_array_input_type_validation_passes_with_json_like(): + # This test is more about the op itself, but we can ensure the function doesn't break it. + # Assumes the op's output_type method will be invoked during series operation. + # This kind of test might be more suitable for operation tests if they exist. 
+ # For now, just ensure the call goes through. + mock_input_series = mock.create_autospec(bpd.Series, instance=True) + mock_input_series.dtype = dtypes.STRING_DTYPE + bbq.json_query_array(mock_input_series) + mock_input_series._apply_unary_op.assert_called_once() + + mock_input_series_json = mock.create_autospec(bpd.Series, instance=True) + mock_input_series_json.dtype = dtypes.JSON_DTYPE + bbq.json_query_array(mock_input_series_json) + mock_input_series_json._apply_unary_op.assert_called_once() From c24dab64aa59b2f78469535f032682bffc7e1d73 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 11 Jun 2025 23:15:51 +0000 Subject: [PATCH 2/2] complete features --- bigframes/bigquery/__init__.py | 2 + bigframes/core/compile/scalar_op_compiler.py | 13 ++++ bigframes/operations/__init__.py | 2 + tests/system/small/bigquery/test_json.py | 63 +++++++++++++++++++- tests/unit/bigquery/test_json.py | 40 ------------- 5 files changed, 79 insertions(+), 41 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 22bcfb1407..cdc3718893 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -40,6 +40,7 @@ json_extract_array, json_extract_string_array, json_query, + json_query_array, json_set, json_value, parse_json, @@ -67,6 +68,7 @@ "json_extract_array", "json_extract_string_array", "json_query", + "json_query_array", "json_set", "json_value", "parse_json", diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index a1fc995159..908f3082c3 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1379,6 +1379,19 @@ def json_query(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore return json_query_op(json_or_json_string=x, json_path=op.json_path) +@scalar_op_compiler.register_unary_op(ops.JSONQueryArray, pass_op=True) +def json_query_array_op_impl(x: ibis_types.Value, op: ops.JSONQueryArray): 
+ # Define a user-defined function whose returned type is dynamically matching the input. + def json_query_array(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore + """Extracts a JSON array and converts it to a SQL array of JSON-formatted STRING or JSON values.""" + ... + + return_type = x.type() + json_query_array.__annotations__["return"] = ibis_dtypes.Array[return_type] # type: ignore + json_query_op = ibis_udf.scalar.builtin(json_query_array) + return json_query_op(json_or_json_string=x, json_path=op.json_path) + + @scalar_op_compiler.register_unary_op(ops.ParseJSON, pass_op=True) def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON): return parse_json(json_str=x) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index faf4e18d5e..291bf17fa5 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -109,6 +109,7 @@ JSONExtractArray, JSONExtractStringArray, JSONQuery, + JSONQueryArray, JSONSet, JSONValue, ParseJSON, @@ -359,6 +360,7 @@ "JSONExtractArray", "JSONExtractStringArray", "JSONQuery", + "JSONQueryArray", "JSONSet", "JSONValue", "ParseJSON", diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 3d155b5f16..4ad16d6cc8 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -128,7 +128,8 @@ def test_json_extract_array_from_json(): ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"], dtype=dtypes.JSON_DTYPE, ) - actual = bbq.json_extract_array(s, "$.a") + with pytest.warns(UserWarning, match="The `json_extract_array` is deprecated"): + actual = bbq.json_extract_array(s, "$.a") # This code provides a workaround for issue https://github.com/apache/arrow/issues/45262, # which currently prevents constructing a series using the pa.list_(db_types.JSONArrrowType()) @@ -241,6 +242,66 @@ def test_json_query_w_invalid_series_type(): bbq.json_query(s, "$.a") +def 
test_json_query_array_from_json(): + s = bpd.Series( + ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"], + dtype=dtypes.JSON_DTYPE, + ) + actual = bbq.json_query_array(s, "$.a") + + # This code provides a workaround for issue https://github.com/apache/arrow/issues/45262, + # which currently prevents constructing a series using the pa.list_(db_dtypes.JSONArrowType()) + sql = """ + SELECT 0 AS id, [JSON '"ab"', JSON '"2"', JSON '"3 xy"'] AS data, + UNION ALL + SELECT 1, [], + UNION ALL + SELECT 2, [JSON '"4"', JSON '"5"'], + UNION ALL + SELECT 3, null, + """ + df = bpd.read_gbq(sql).set_index("id").sort_index() + expected = df["data"] + expected.index.name = None + expected.name = None + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_array_from_json_strings(): + s = bpd.Series( + ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"], + dtype=pd.StringDtype(storage="pyarrow"), + ) + actual = bbq.json_query_array(s, "$.a") + expected = bpd.Series( + [['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None], + dtype=pd.ArrowDtype(pa.list_(pa.string())), + ) + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_array_from_json_array_strings(): + s = bpd.Series( + ["[1, 2, 3]", "[]", "[4,5]"], + dtype=pd.StringDtype(storage="pyarrow"), + ) + actual = bbq.json_query_array(s) + expected = bpd.Series( + [["1", "2", "3"], [], ["4", "5"]], + dtype=pd.ArrowDtype(pa.list_(pa.string())), + ) + + pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas()) + + +def test_json_query_array_w_invalid_series_type(): + s = bpd.Series([1, 2]) + with pytest.raises(TypeError): + bbq.json_query_array(s) + + def test_json_value_from_json(): s = bpd.Series( ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'], diff --git a/tests/unit/bigquery/test_json.py b/tests/unit/bigquery/test_json.py index c6dfd50e38..d9beea26db 100644 --- 
a/tests/unit/bigquery/test_json.py +++ b/tests/unit/bigquery/test_json.py @@ -18,49 +18,9 @@ import bigframes.bigquery as bbq import bigframes.pandas as bpd -import bigframes.operations as ops -import bigframes.dtypes as dtypes def test_json_set_w_invalid_json_path_value_pairs(): mock_series = mock.create_autospec(bpd.pandas.Series, instance=True) with pytest.raises(ValueError, match="Incorrect format"): bbq.json_set(mock_series, json_path_value_pairs=[("$.a", 1, 100)]) # type: ignore - - -def test_json_query_array_specific_path(): - mock_input_series = mock.create_autospec(bpd.Series, instance=True) - # Ensure the mock series has a dtype that is_json_like - mock_input_series.dtype = dtypes.STRING_DTYPE - - bbq.json_query_array(mock_input_series, json_path="$.items") - - mock_input_series._apply_unary_op.assert_called_once_with( - ops.JSONQueryArray(json_path="$.items") - ) - -def test_json_query_array_default_path(): - mock_input_series = mock.create_autospec(bpd.Series, instance=True) - # Ensure the mock series has a dtype that is_json_like - mock_input_series.dtype = dtypes.JSON_DTYPE - - bbq.json_query_array(mock_input_series) # Default path "$" - - mock_input_series._apply_unary_op.assert_called_once_with( - ops.JSONQueryArray(json_path="$") - ) - -def test_json_query_array_input_type_validation_passes_with_json_like(): - # This test is more about the op itself, but we can ensure the function doesn't break it. - # Assumes the op's output_type method will be invoked during series operation. - # This kind of test might be more suitable for operation tests if they exist. - # For now, just ensure the call goes through. 
- mock_input_series = mock.create_autospec(bpd.Series, instance=True) - mock_input_series.dtype = dtypes.STRING_DTYPE - bbq.json_query_array(mock_input_series) - mock_input_series._apply_unary_op.assert_called_once() - - mock_input_series_json = mock.create_autospec(bpd.Series, instance=True) - mock_input_series_json.dtype = dtypes.JSON_DTYPE - bbq.json_query_array(mock_input_series_json) - mock_input_series_json._apply_unary_op.assert_called_once()