diff --git a/bigframes/operations/_op_converters.py b/bigframes/operations/_op_converters.py new file mode 100644 index 0000000000..3ebf22bcb6 --- /dev/null +++ b/bigframes/operations/_op_converters.py @@ -0,0 +1,37 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bigframes.operations as ops + + +def convert_index(key: int) -> ops.ArrayIndexOp: + if key < 0: + raise NotImplementedError("Negative indexing is not supported.") + return ops.ArrayIndexOp(index=key) + + +def convert_slice(key: slice) -> ops.ArraySliceOp: + if key.step is not None and key.step != 1: + raise NotImplementedError(f"Only a step of 1 is allowed, got {key.step}") + + if (key.start is not None and key.start < 0) or ( + key.stop is not None and key.stop < 0 + ): + raise NotImplementedError("Slicing with negative numbers is not allowed.") + + return ops.ArraySliceOp( + start=key.start if key.start is not None else 0, + stop=key.stop, + step=key.step, + ) diff --git a/bigframes/operations/lists.py b/bigframes/operations/lists.py new file mode 100644 index 0000000000..16c22dfb2a --- /dev/null +++ b/bigframes/operations/lists.py @@ -0,0 +1,46 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import inspect +from typing import Union + +import bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors + +from bigframes.core import log_adapter +import bigframes.operations as ops +from bigframes.operations._op_converters import convert_index, convert_slice +import bigframes.operations.base +import bigframes.series as series + + +@log_adapter.class_logger +class ListAccessor( + bigframes.operations.base.SeriesMethods, vendoracessors.ListAccessor +): + __doc__ = vendoracessors.ListAccessor.__doc__ + + def len(self): + return self._apply_unary_op(ops.len_op) + + def __getitem__(self, key: Union[int, slice]) -> series.Series: + if isinstance(key, int): + return self._apply_unary_op(convert_index(key)) + elif isinstance(key, slice): + return self._apply_unary_op(convert_slice(key)) + else: + raise ValueError(f"key must be an int or slice, got {type(key).__name__}") + + __getitem__.__doc__ = inspect.getdoc(vendoracessors.ListAccessor.__getitem__) diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index d3e9c7edc6..4af142e0d5 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -23,6 +23,7 @@ from bigframes.core import log_adapter import bigframes.dataframe as df import bigframes.operations as ops +from bigframes.operations._op_converters import convert_index, convert_slice import bigframes.operations.base import bigframes.series as series @@ -40,28 +41,9 @@ class StringMethods(bigframes.operations.base.SeriesMethods, vendorstr.StringMet def 
__getitem__(self, key: Union[int, slice]) -> series.Series: if isinstance(key, int): - if key < 0: - raise NotImplementedError("Negative indexing is not supported.") - return self._apply_unary_op(ops.ArrayIndexOp(index=key)) + return self._apply_unary_op(convert_index(key)) elif isinstance(key, slice): - if key.step is not None and key.step != 1: - raise NotImplementedError( - f"Only a step of 1 is allowed, got {key.step}" - ) - if (key.start is not None and key.start < 0) or ( - key.stop is not None and key.stop < 0 - ): - raise NotImplementedError( - "Slicing with negative numbers is not allowed." - ) - - return self._apply_unary_op( - ops.ArraySliceOp( - start=key.start if key.start is not None else 0, - stop=key.stop, - step=key.step, - ) - ) + return self._apply_unary_op(convert_slice(key)) else: raise ValueError(f"key must be an int or slice, got {type(key).__name__}") diff --git a/bigframes/series.py b/bigframes/series.py index a166680f85..5192a9cf49 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -53,6 +53,7 @@ import bigframes.operations.aggregations as agg_ops import bigframes.operations.base import bigframes.operations.datetimes as dt +import bigframes.operations.lists as lists import bigframes.operations.plotting as plotting import bigframes.operations.strings as strings import bigframes.operations.structs as structs @@ -66,6 +67,8 @@ " Try converting it to a remote function." 
) +_list = list # Type alias to escape Series.list property + @log_adapter.class_logger class Series(bigframes.operations.base.SeriesMethods, vendored_pandas_series.Series): @@ -161,6 +164,10 @@ def query_job(self) -> Optional[bigquery.QueryJob]: def struct(self) -> structs.StructAccessor: return structs.StructAccessor(self._block) + @property + def list(self) -> lists.ListAccessor: + return lists.ListAccessor(self._block) + @property @validations.requires_ordering() def T(self) -> Series: @@ -1708,7 +1715,7 @@ def to_latex( buf, columns=columns, header=header, index=index, **kwargs ) - def tolist(self) -> list: + def tolist(self) -> _list: return self.to_pandas().to_list() to_list = tolist diff --git a/docs/reference/bigframes.pandas/series.rst b/docs/reference/bigframes.pandas/series.rst index f14eb8e862..30cf851de7 100644 --- a/docs/reference/bigframes.pandas/series.rst +++ b/docs/reference/bigframes.pandas/series.rst @@ -35,6 +35,14 @@ String handling :inherited-members: :undoc-members: +List handling +^^^^^^^^^^^^^ + +.. 
automodule:: bigframes.operations.lists + :members: + :inherited-members: + :undoc-members: + Struct handling ^^^^^^^^^^^^^^^ diff --git a/notebooks/dataframes/struct_and_array_dtypes.ipynb b/notebooks/dataframes/struct_and_array_dtypes.ipynb index 3bcdaf40f7..def65ee6ca 100644 --- a/notebooks/dataframes/struct_and_array_dtypes.ipynb +++ b/notebooks/dataframes/struct_and_array_dtypes.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Copyright 2023 Google LLC\n", + "# Copyright 2024 Google LLC\n", "#\n", "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", "# you may not use this file except in compliance with the License.\n", @@ -212,6 +212,54 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 3\n", + "1 2\n", + "2 4\n", + "Name: Scores, dtype: Int64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find the length of each array with list accessor\n", + "df['Scores'].list.len()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 88\n", + "1 81\n", + "2 89\n", + "Name: Scores, dtype: Int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find the second element in each array with list accessor\n", + "df['Scores'].list[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, "outputs": [ { "data": { @@ -228,7 +276,7 @@ "Name: Scores, dtype: Int64" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -243,7 +291,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -261,7 +309,7 @@ "Name: Scores, dtype: Float64" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -274,7 
+322,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -286,7 +334,7 @@ "Name: Scores, dtype: list[pyarrow]" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -299,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -361,7 +409,7 @@ "[3 rows x 3 columns]" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -394,14 +442,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/chelsealin/src/bigframes/venv/lib/python3.12/site-packages/google/cloud/bigquery/_pandas_helpers.py:570: UserWarning: Pyarrow could not determine the type of columns: bigframes_unnamed_index.\n", + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/venv/lib/python3.11/site-packages/google/cloud/bigquery/_pandas_helpers.py:570: UserWarning: Pyarrow could not determine the type of columns: bigframes_unnamed_index.\n", " warnings.warn(\n" ] }, @@ -460,7 +508,7 @@ "[3 rows x 2 columns]" ] }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -483,7 +531,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -494,7 +542,7 @@ "dtype: object" ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -514,7 +562,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -525,7 +573,7 @@ "dtype: object" ] }, - "execution_count": 13, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -537,7 +585,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, 
"metadata": {}, "outputs": [ { @@ -549,7 +597,7 @@ "Name: City, dtype: string" ] }, - "execution_count": 14, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -562,7 +610,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -620,7 +668,7 @@ "[3 rows x 2 columns]" ] }, - "execution_count": 15, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -648,7 +696,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/tests/system/small/operations/test_lists.py b/tests/system/small/operations/test_lists.py new file mode 100644 index 0000000000..7ecf79dc6a --- /dev/null +++ b/tests/system/small/operations/test_lists.py @@ -0,0 +1,83 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import packaging.version +import pandas as pd +import pyarrow as pa +import pytest + +import bigframes.pandas as bpd + +from ...utils import assert_series_equal + + +@pytest.mark.parametrize( + ("key"), + [ + pytest.param(0, id="int"), + pytest.param(slice(None, None, None), id="default_start_slice"), + pytest.param(slice(0, None, 1), id="default_stop_slice"), + pytest.param(slice(0, 2, None), id="default_step_slice"), + ], +) +def test_getitem(key): + if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"): + pytest.skip( + "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data" + ) + data = [[1], [2, 3], [4, 5, 6]] + s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + + bf_result = s.list[key].to_pandas() + pd_result = pd_s.list[key] + + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + + +@pytest.mark.parametrize( + ("key", "expectation"), + [ + # Negative index + (-1, pytest.raises(NotImplementedError)), + # Slice with negative start + (slice(-1, None, None), pytest.raises(NotImplementedError)), + # Slice with negative end + (slice(0, -1, None), pytest.raises(NotImplementedError)), + # Slice with step not equal to 1 + (slice(0, 2, 2), pytest.raises(NotImplementedError)), + ], +) +def test_getitem_notsupported(key, expectation): + data = [[1], [2, 3], [4, 5, 6]] + s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + + with expectation as e: + assert s.list[key] == e + + +def test_len(): + if packaging.version.Version(pd.__version__) < packaging.version.Version("2.2.0"): + pytest.skip( + "https://pandas.pydata.org/docs/whatsnew/v2.2.0.html#series-list-accessor-for-pyarrow-list-data" + ) + data = [[], [1], [1, 2], [1, 2, 3]] + s = bpd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) + pd_s = pd.Series(data, dtype=pd.ArrowDtype(pa.list_(pa.int64()))) +
bf_result = s.list.len().to_pandas() + pd_result = pd_s.list.len() + + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index ab199d53bd..771146250a 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -6,6 +6,71 @@ from bigframes import constants +class ListAccessor: + """Accessor object for list data properties of the Series values.""" + + def len(self): + """Compute the length of each list in the Series. + + **See Also:** + + - :func:`StringMethods.len` : Compute the length of each element in the Series/Index. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=bpd.ArrowDtype(pa.list_(pa.int64())), + ... ) + >>> s.list.len() + 0 3 + 1 1 + dtype: Int64 + + Returns: + bigframes.series.Series: A Series or Index of integer values indicating + the length of each element in the Series or Index. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def __getitem__(self, key: int | slice): + """Index or slice lists in the Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import pyarrow as pa + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=bpd.ArrowDtype(pa.list_(pa.int64())), + ... ) + >>> s.list[0] + 0 1 + 1 3 + dtype: Int64 + + Args: + key (int | slice): Index or slice of indices to access from each list. + For integer indices, only non-negative values are accepted. For + slices, you must use a non-negative start, a non-negative end, and + a step of 1. 
+ + Returns: + bigframes.series.Series: The element or sub-list at the requested index. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + class StructAccessor: """ Accessor object for structured data properties of the Series values.