From 96c582ee55d1d74e3e9ace1c280440a2e58b199e Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Tue, 19 Sep 2023 22:10:11 +0000 Subject: [PATCH 1/5] bug: change return type of `Series.loc[scalar]` Change-Id: Id60a7da3021972da5c8a28fb8f3620e10643c0ed --- bigframes/core/indexers.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index a538c80711..b4e95f25ed 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -37,12 +37,7 @@ def __init__(self, series: bigframes.series.Series): self._series = series def __getitem__(self, key) -> bigframes.series.Series: - """ - Only indexing by a boolean bigframes.series.Series or list of index entries is currently supported - """ - return typing.cast( - bigframes.series.Series, _loc_getitem_series_or_dataframe(self._series, key) - ) + return _loc_getitem_series_or_dataframe(self._series, key) def __setitem__(self, key, value) -> None: # TODO(swast): support MultiIndex @@ -261,8 +256,8 @@ def _loc_getitem_series_or_dataframe( return _perform_loc_list_join(series_or_dataframe, keys_df) else: raise TypeError( - "Invalid argument type. loc currently only supports indexing with a " - "boolean bigframes Series, a list of index entries or a single index entry. " + "Invalid argument type. Expected bigframes.Series, bigframes.Index, " + "list, : (empty slice), or scalar. " f"{constants.FEEDBACK_LINK}" ) From 4284d8f3686a6ffada40f00e96f5e4f66c5e30f3 Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Thu, 21 Sep 2023 16:51:31 +0000 Subject: [PATCH 2/5] add scalar case and update return types --- bigframes/core/indexers.py | 67 +++++++++++++++++++--------- bigframes/ml/model_selection.py | 8 ++-- tests/system/small/test_dataframe.py | 2 +- tests/system/small/test_series.py | 2 +- 4 files changed, 54 insertions(+), 25 deletions(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index b4e95f25ed..220474cd6d 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -15,7 +15,7 @@ from __future__ import annotations import typing -from typing import Tuple +from typing import Tuple, Union import ibis import pandas as pd @@ -29,14 +29,18 @@ import bigframes.series if typing.TYPE_CHECKING: - LocSingleKey = typing.Union[bigframes.series.Series, indexes.Index, slice] + LocSingleKey = Union[ + bigframes.series.Series, indexes.Index, slice, bigframes.core.scalar.Scalar + ] class LocSeriesIndexer: def __init__(self, series: bigframes.series.Series): self._series = series - def __getitem__(self, key) -> bigframes.series.Series: + def __getitem__( + self, key + ) -> Union[bigframes.core.scalar.Scalar, bigframes.series.Series]: return _loc_getitem_series_or_dataframe(self._series, key) def __setitem__(self, key, value) -> None: @@ -79,7 +83,7 @@ def __init__(self, series: bigframes.series.Series): def __getitem__( self, key - ) -> bigframes.core.scalar.Scalar | bigframes.series.Series: + ) -> Union[bigframes.core.scalar.Scalar, bigframes.series.Series]: """ Index series using integer offsets. Currently supports index by key type: @@ -98,13 +102,17 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): self._dataframe = dataframe @typing.overload - def __getitem__(self, key: LocSingleKey) -> bigframes.dataframe.DataFrame: + def __getitem__( + self, key: LocSingleKey + ) -> Union[bigframes.dataframe.DataFrame, pd.Series]: ... # Technically this is wrong since we can have duplicate column labels, but # this is expected to be rare. @typing.overload - def __getitem__(self, key: Tuple[LocSingleKey, str]) -> bigframes.series.Series: + def __getitem__( + self, key: Tuple[LocSingleKey, str] + ) -> Union[bigframes.series.Series, bigframes.core.scalar.Scalar]: ... def __getitem__(self, key): @@ -168,7 +176,7 @@ class ILocDataFrameIndexer: def __init__(self, dataframe: bigframes.dataframe.DataFrame): self._dataframe = dataframe - def __getitem__(self, key) -> bigframes.dataframe.DataFrame | pd.Series: + def __getitem__(self, key) -> Union[bigframes.dataframe.DataFrame, pd.Series]: """ Index dataframe using integer offsets. Currently supports index by key type: @@ -183,21 +191,26 @@ def __getitem__(self, key) -> bigframes.dataframe.DataFrame | pd.Series: @typing.overload def _loc_getitem_series_or_dataframe( series_or_dataframe: bigframes.series.Series, key -) -> bigframes.series.Series: +) -> Union[bigframes.core.scalar.Scalar, bigframes.series.Series]: ... @typing.overload def _loc_getitem_series_or_dataframe( series_or_dataframe: bigframes.dataframe.DataFrame, key -) -> bigframes.dataframe.DataFrame: +) -> Union[bigframes.dataframe.DataFrame, pd.Series]: ... def _loc_getitem_series_or_dataframe( - series_or_dataframe: bigframes.dataframe.DataFrame | bigframes.series.Series, + series_or_dataframe: Union[bigframes.dataframe.DataFrame, bigframes.series.Series], key: LocSingleKey, -) -> bigframes.dataframe.DataFrame | bigframes.series.Series: +) -> Union[ + bigframes.dataframe.DataFrame, + bigframes.series.Series, + pd.Series, + bigframes.core.scalar.Scalar, +]: if isinstance(key, bigframes.series.Series) and key.dtype == "boolean": return series_or_dataframe[key] elif isinstance(key, bigframes.series.Series): @@ -217,7 +230,7 @@ def _loc_getitem_series_or_dataframe( # TODO(henryjsolberg): support MultiIndex if len(key) == 0: # type: ignore return typing.cast( - typing.Union[bigframes.dataframe.DataFrame, bigframes.series.Series], + Union[bigframes.dataframe.DataFrame, bigframes.series.Series], series_or_dataframe.iloc[0:0], ) @@ -253,7 +266,15 @@ def _loc_getitem_series_or_dataframe( ) keys_df = keys_df.set_index(index_name, drop=True) keys_df.index.name = None - return _perform_loc_list_join(series_or_dataframe, keys_df) + result = _perform_loc_list_join(series_or_dataframe, keys_df) + if len(result) == 1: + # although loc[scalar_key] returns multiple results when scalar_key + # is not unique, we run a query and return the computed individual + # result here (as a scalar or pandas series) when the key is unique, + # since we expect unique index keys to be more common. loc[[scalar_key]] + # can be used to retrieve one-item DataFrames or Series. + return result.iloc[0] + return result else: raise TypeError( "Invalid argument type. Expected bigframes.Series, bigframes.Index, " @@ -279,9 +300,9 @@ def _perform_loc_list_join( def _perform_loc_list_join( - series_or_dataframe: bigframes.dataframe.DataFrame | bigframes.series.Series, + series_or_dataframe: Union[bigframes.dataframe.DataFrame, bigframes.series.Series], keys_df: bigframes.dataframe.DataFrame, -) -> bigframes.series.Series | bigframes.dataframe.DataFrame: +) -> Union[bigframes.series.Series, bigframes.dataframe.DataFrame]: # right join based on the old index so that the matching rows from the user's # original dataframe will be duplicated and reordered appropriately original_index_names = series_or_dataframe.index.names @@ -304,20 +325,26 @@ def _perform_loc_list_join( @typing.overload def _iloc_getitem_series_or_dataframe( series_or_dataframe: bigframes.series.Series, key -) -> bigframes.series.Series | bigframes.core.scalar.Scalar: +) -> Union[bigframes.series.Series, bigframes.core.scalar.Scalar]: ... @typing.overload def _iloc_getitem_series_or_dataframe( series_or_dataframe: bigframes.dataframe.DataFrame, key -) -> bigframes.dataframe.DataFrame | pd.Series: +) -> Union[bigframes.dataframe.DataFrame, pd.Series]: ... def _iloc_getitem_series_or_dataframe( - series_or_dataframe: bigframes.dataframe.DataFrame | bigframes.series.Series, key -) -> bigframes.dataframe.DataFrame | bigframes.series.Series | bigframes.core.scalar.Scalar | pd.Series: + series_or_dataframe: Union[bigframes.dataframe.DataFrame, bigframes.series.Series], + key, +) -> Union[ + bigframes.dataframe.DataFrame, + bigframes.series.Series, + bigframes.core.scalar.Scalar, + pd.Series, +]: if isinstance(key, int): internal_slice_result = series_or_dataframe._slice(key, key + 1, 1) result_pd_df = internal_slice_result.to_pandas() @@ -331,7 +358,7 @@ def _iloc_getitem_series_or_dataframe( if len(key) == 0: return typing.cast( - typing.Union[bigframes.dataframe.DataFrame, bigframes.series.Series], + Union[bigframes.dataframe.DataFrame, bigframes.series.Series], series_or_dataframe.iloc[0:0], ) df = series_or_dataframe diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index 110cbcf493..443b9e7be6 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -17,6 +17,7 @@ https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection.""" +import typing from typing import List, Union from bigframes.ml import utils @@ -79,9 +80,10 @@ def train_test_split( train_index = split_dfs[0].index test_index = split_dfs[1].index - split_dfs += [ - df.loc[index] for df in dfs[1:] for index in (train_index, test_index) - ] + split_dfs += typing.cast( + List[bpd.DataFrame], + [df.loc[index] for df in dfs[1:] for index in (train_index, test_index)], + ) # convert back to Series. results: List[Union[bpd.DataFrame, bpd.Series]] = [] diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 6c96387e97..6fe450a337 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1871,7 +1871,7 @@ def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index bf_result = scalars_df_index.loc[index] pd_result = scalars_pandas_df_index.loc[index] pd.testing.assert_series_equal( - bf_result.to_pandas().iloc[0, :], + bf_result, pd_result, ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index d702049e68..3bd78bfec2 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2617,7 +2617,7 @@ def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index index = -2345 bf_result = scalars_df_index.date_col.loc[index] pd_result = scalars_pandas_df_index.date_col.loc[index] - assert bf_result.to_pandas().iloc[0] == pd_result + assert bf_result == pd_result def test_series_bool_interpretation_error(scalars_df_index): From f0e178f386b2a156396afcdf88a5b0406210ca90 Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Fri, 22 Sep 2023 16:50:12 +0000 Subject: [PATCH 3/5] remove unneeded iloc in series getitem test --- tests/system/small/test_series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 3bd78bfec2..18cdcaab7b 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -157,7 +157,7 @@ def test_series___getitem___with_default_index(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df[col_name][key] pd_result = scalars_pandas_df[col_name][key] - assert bf_result.to_pandas().iloc[0] == pd_result + assert bf_result == pd_result @pytest.mark.parametrize( From 3a48f5e5c15b0d4833b3934709cb53668c214593 Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Fri, 22 Sep 2023 17:00:36 +0000 Subject: [PATCH 4/5] fix test_series_get_with_default_index --- tests/system/small/test_series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 18cdcaab7b..3cd644c80c 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -118,7 +118,7 @@ def test_series_get_with_default_index(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df[col_name].get(key) pd_result = scalars_pandas_df[col_name].get(key) - assert bf_result.to_pandas().iloc[0] == pd_result + assert bf_result == pd_result @pytest.mark.parametrize( From e318e1c69353bd25825b1d52c24e1624dfde9dce Mon Sep 17 00:00:00 2001 From: Henry J Solberg Date: Mon, 25 Sep 2023 20:50:15 +0000 Subject: [PATCH 5/5] Run query manual for clarity/redundance --- bigframes/core/indexers.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 220474cd6d..5bf4e10f36 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -267,13 +267,16 @@ def _loc_getitem_series_or_dataframe( keys_df = keys_df.set_index(index_name, drop=True) keys_df.index.name = None result = _perform_loc_list_join(series_or_dataframe, keys_df) - if len(result) == 1: - # although loc[scalar_key] returns multiple results when scalar_key - # is not unique, we run a query and return the computed individual - # result here (as a scalar or pandas series) when the key is unique, - # since we expect unique index keys to be more common. loc[[scalar_key]] - # can be used to retrieve one-item DataFrames or Series. - return result.iloc[0] + pandas_result = result.to_pandas() + # although loc[scalar_key] returns multiple results when scalar_key + # is not unique, we download the results here and return the computed + # individual result (as a scalar or pandas series) when the key is unique, + # since we expect unique index keys to be more common. loc[[scalar_key]] + # can be used to retrieve one-item DataFrames or Series. + if len(pandas_result) == 1: + return pandas_result.iloc[0] + # when the key is not unique, we return a bigframes data type + # as usual for methods that return dataframes/series return result else: raise TypeError(