diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index 22bcfb1407..cdc3718893 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -40,6 +40,7 @@
     json_extract_array,
     json_extract_string_array,
     json_query,
+    json_query_array,
     json_set,
     json_value,
     parse_json,
@@ -67,6 +68,7 @@
     "json_extract_array",
     "json_extract_string_array",
     "json_query",
+    "json_query_array",
     "json_set",
     "json_value",
     "parse_json",
diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py
index 561fb57348..00d230d684 100644
--- a/bigframes/bigquery/_operations/json.py
+++ b/bigframes/bigquery/_operations/json.py
@@ -133,6 +133,10 @@ def json_extract_array(
     `STRING` or `JSON` values. This function uses single quotes and brackets to
     escape invalid JSONPath characters in JSON keys.
 
+    .. deprecated:: 2.5.0
+        The ``json_extract_array`` is deprecated and will be removed in a future version.
+        Use ``json_query_array`` instead.
+
     **Examples:**
 
         >>> import bigframes.pandas as bpd
@@ -172,6 +176,11 @@ def json_extract_array(
     Returns:
         bigframes.series.Series: A new Series with the parsed arrays from the input.
     """
+    msg = (
+        "The `json_extract_array` is deprecated and will be removed in a future version. "
+        "Use `json_query_array` instead."
+    )
+    warnings.warn(bfe.format_message(msg), category=UserWarning)
     return input._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
 
 
@@ -273,6 +282,56 @@ def json_query(
     return input._apply_unary_op(ops.JSONQuery(json_path=json_path))
 
 
+def json_query_array(
+    input: series.Series,
+    json_path: str = "$",
+) -> series.Series:
+    """Extracts a JSON array and converts it to a SQL array of JSON-formatted
+    `STRING` or `JSON` values. This function uses double quotes to escape invalid
+    JSONPath characters in JSON keys. For example: `"a.b"`.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
+        >>> bbq.json_query_array(s)
+        0    ['1' '2' '3']
+        1        ['4' '5']
+        dtype: list<item: string>[pyarrow]
+
+        >>> s = bpd.Series([
+        ...     '{"fruits": [{"name": "apple"}, {"name": "cherry"}]}',
+        ...     '{"fruits": [{"name": "guava"}, {"name": "grapes"}]}'
+        ... ])
+        >>> bbq.json_query_array(s, "$.fruits")
+        0    ['{"name":"apple"}' '{"name":"cherry"}']
+        1    ['{"name":"guava"}' '{"name":"grapes"}']
+        dtype: list<item: string>[pyarrow]
+
+        >>> s = bpd.Series([
+        ...     '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
+        ...     '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
+        ... ])
+        >>> bbq.json_query_array(s, "$.fruits.names")
+        0    ['"apple"' '"cherry"']
+        1    ['"guava"' '"grapes"']
+        dtype: list<item: string>[pyarrow]
+
+    Args:
+        input (bigframes.series.Series):
+            The Series containing JSON data (as native JSON objects or JSON-formatted strings).
+        json_path (str):
+            The JSON path identifying the data that you want to obtain from the input.
+
+    Returns:
+        bigframes.series.Series: A new Series with the parsed arrays from the input.
+    """
+    return input._apply_unary_op(ops.JSONQueryArray(json_path=json_path))
+
+
 def json_value(
     input: series.Series,
     json_path: str,
diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
index a1fc995159..908f3082c3 100644
--- a/bigframes/core/compile/scalar_op_compiler.py
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -1379,6 +1379,20 @@ def json_query(json_or_json_string, json_path: ibis_dtypes.str):  # type: ignore
     return json_query_op(json_or_json_string=x, json_path=op.json_path)
 
 
+@scalar_op_compiler.register_unary_op(ops.JSONQueryArray, pass_op=True)
+def json_query_array_op_impl(x: ibis_types.Value, op: ops.JSONQueryArray):
+    """Compile ``JSONQueryArray`` into a call to the ``json_query_array`` builtin."""
+    # Stub for the builtin; its return type is filled in below as an array of the input type.
+    def json_query_array(json_or_json_string, json_path: ibis_dtypes.str):  # type: ignore
+        """Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted STRING or JSON values."""
+        ...
+
+    element_type = x.type()
+    json_query_array.__annotations__["return"] = ibis_dtypes.Array[element_type]  # type: ignore
+    json_query_array_op = ibis_udf.scalar.builtin(json_query_array)
+    return json_query_array_op(json_or_json_string=x, json_path=op.json_path)
+
+
 @scalar_op_compiler.register_unary_op(ops.ParseJSON, pass_op=True)
 def parse_json_op_impl(x: ibis_types.Value, op: ops.ParseJSON):
     return parse_json(json_str=x)
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
index faf4e18d5e..291bf17fa5 100644
--- a/bigframes/operations/__init__.py
+++ b/bigframes/operations/__init__.py
@@ -109,6 +109,7 @@
     JSONExtractArray,
     JSONExtractStringArray,
     JSONQuery,
+    JSONQueryArray,
     JSONSet,
     JSONValue,
     ParseJSON,
@@ -359,6 +360,7 @@
     "JSONExtractArray",
     "JSONExtractStringArray",
     "JSONQuery",
+    "JSONQueryArray",
     "JSONSet",
     "JSONValue",
     "ParseJSON",
diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py
index b083035d38..95a47dcadb 100644
--- a/bigframes/operations/json_ops.py
+++ b/bigframes/operations/json_ops.py
@@ -37,6 +37,26 @@ def output_type(self, *input_types):
         return input_type
 
 
+@dataclasses.dataclass(frozen=True)
+class JSONQueryArray(base_ops.UnaryOp):
+    """Extracts the JSON array at ``json_path`` as a list of JSON-like values."""
+
+    name: typing.ClassVar[str] = "json_query_array"
+    json_path: str
+
+    def output_type(self, *input_types):
+        input_type = input_types[0]
+        if not dtypes.is_json_like(input_type):
+            raise TypeError(
+                "Input type must be a valid JSON object or JSON-formatted string type."
+                + f" Received type: {input_type}"
+            )
+        # Each list element keeps the same (JSON-like) type as the input.
+        return pd.ArrowDtype(
+            pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_type))
+        )
+
+
 @dataclasses.dataclass(frozen=True)
 class JSONExtractArray(base_ops.UnaryOp):
     name: typing.ClassVar[str] = "json_extract_array"
diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py
index 3d155b5f16..4ad16d6cc8 100644
--- a/tests/system/small/bigquery/test_json.py
+++ b/tests/system/small/bigquery/test_json.py
@@ -128,7 +128,8 @@ def test_json_extract_array_from_json():
         ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"],
         dtype=dtypes.JSON_DTYPE,
     )
-    actual = bbq.json_extract_array(s, "$.a")
+    with pytest.warns(UserWarning, match="The `json_extract_array` is deprecated"):
+        actual = bbq.json_extract_array(s, "$.a")
 
     # This code provides a workaround for issue https://github.com/apache/arrow/issues/45262,
     # which currently prevents constructing a series using the pa.list_(db_types.JSONArrrowType())
@@ -241,6 +242,66 @@ def test_json_query_w_invalid_series_type():
         bbq.json_query(s, "$.a")
 
 
+def test_json_query_array_from_json():
+    s = bpd.Series(
+        ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4", "5"]}', "{}"],
+        dtype=dtypes.JSON_DTYPE,
+    )
+    actual = bbq.json_query_array(s, "$.a")
+
+    # This code provides a workaround for issue https://github.com/apache/arrow/issues/45262,
+    # which currently prevents constructing a series using the pa.list_(db_dtypes.JSONArrowType())
+    sql = """
+        SELECT 0 AS id, [JSON '"ab"', JSON '"2"', JSON '"3 xy"'] AS data,
+        UNION ALL
+        SELECT 1, [],
+        UNION ALL
+        SELECT 2, [JSON '"4"', JSON '"5"'],
+        UNION ALL
+        SELECT 3, null,
+    """
+    df = bpd.read_gbq(sql).set_index("id").sort_index()
+    expected = df["data"]
+    expected.index.name = None
+    expected.name = None
+
+    pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas())
+
+
+def test_json_query_array_from_json_strings():
+    s = bpd.Series(
+        ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"],
+        dtype=pd.StringDtype(storage="pyarrow"),
+    )
+    actual = bbq.json_query_array(s, "$.a")
+    expected = bpd.Series(
+        [['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None],
+        dtype=pd.ArrowDtype(pa.list_(pa.string())),
+    )
+
+    pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas())
+
+
+def test_json_query_array_from_json_array_strings():
+    s = bpd.Series(
+        ["[1, 2, 3]", "[]", "[4,5]"],
+        dtype=pd.StringDtype(storage="pyarrow"),
+    )
+    actual = bbq.json_query_array(s)
+    expected = bpd.Series(
+        [["1", "2", "3"], [], ["4", "5"]],
+        dtype=pd.ArrowDtype(pa.list_(pa.string())),
+    )
+
+    pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas())
+
+
+def test_json_query_array_w_invalid_series_type():
+    s = bpd.Series([1, 2])
+    with pytest.raises(TypeError):
+        bbq.json_query_array(s)
+
+
 def test_json_value_from_json():
     s = bpd.Series(
         ['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'],