From 0fd3bf447be62c8d92bb64c670d94bddcc0f77c8 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 18 Jan 2024 01:26:01 +0000 Subject: [PATCH 1/6] feat: Add Index constructor, copy, get_level_values, to_series fix mypy error --- bigframes/core/blocks.py | 6 +- bigframes/core/indexes/index.py | 88 ++++++++++++++++--- bigframes/dataframe.py | 23 +++-- bigframes/operations/base.py | 45 +++++++--- bigframes/series.py | 4 - tests/system/small/test_index.py | 72 +++++++++++++++ tests/system/small/test_series.py | 34 +++++++ .../pandas/core/indexes/base.py | 51 +++++++++++ 8 files changed, 290 insertions(+), 33 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 8c59f8106b..3586de92f7 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -287,7 +287,6 @@ def reset_index(self, drop: bool = True) -> Block: A new Block because dropping index columns can break references from Index classes that point to this block. """ - block = self new_index_col_id = guid.generate_guid() expr = self._expr.promote_offsets(new_index_col_id) if drop: @@ -295,7 +294,7 @@ def reset_index(self, drop: bool = True) -> Block: # ordering expression as reset_index shouldn't change the row # order. expr = expr.drop_columns(self.index_columns) - block = Block( + return Block( expr, index_columns=[new_index_col_id], column_labels=self.column_labels, @@ -321,13 +320,12 @@ def reset_index(self, drop: bool = True) -> Block: # See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html column_labels_modified = column_labels_modified.insert(level, label) - block = Block( + return Block( expr, index_columns=[new_index_col_id], column_labels=column_labels_modified, index_labels=[None], ) - return block def set_index( self, diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 4ec11cb163..90402aafba 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -17,7 +17,7 @@ from __future__ import annotations import typing -from typing import Mapping, Sequence, Tuple, Union +from typing import Hashable, Mapping, Optional, Sequence, Tuple, Union import numpy as np import pandas @@ -36,12 +36,57 @@ import bigframes.operations.aggregations as agg_ops import third_party.bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index +if typing.TYPE_CHECKING: + import bigframes.series + class Index(vendored_pandas_index.Index): __doc__ = vendored_pandas_index.Index.__doc__ - def __init__(self, data: blocks.BlockHolder): - self._data = data + def __init__( + self, + data=None, + dtype=None, + *, + name=None, + frame: Optional[blocks.BlockHolder] = None, + ): + import bigframes.dataframe as df + import bigframes.series as series + + if frame is not None: + self._data = frame + elif isinstance(data, df.DataFrame): + raise ValueError("Cannot construct index from dataframe.") + elif isinstance(data, series.Series) or isinstance(data, Index): + if isinstance(data, series.Series): + block = data._block + block = block.set_index( + col_ids=[data._value_column], + ) + elif isinstance(data, Index): + block = data._block + index = Index._from_block(block) + name = data.name if name is None else name + if name is not None: + index.name = name + if dtype is not None: + index = index.astype(dtype) + self._data = df.DataFrame(index._block) + else: + pd_index = pandas.Index(data=data, dtype=dtype, name=name) + pd_df = pandas.DataFrame(index=pd_index) + self._data = df.DataFrame(pd_df) + + @classmethod + def from_frame(self, frame: blocks.BlockHolder) -> Index: + return Index(frame=frame) + + @classmethod + def _from_block(cls, block: blocks.Block) -> Index: + import bigframes.dataframe as df + + return Index.from_frame(df.DataFrame(block)) @property def name(self) -> blocks.Label: @@ -150,12 +195,41 @@ def has_duplicates(self) -> bool: @property def _block(self) -> blocks.Block: - return self._data._get_block() + return self._data._get_block().select_columns([]) @property def T(self) -> Index: return self.transpose() + def copy(self, name: Optional[Hashable] = None): + copy_index = Index._from_block(self._block) + if name is not None: + copy_index.name = name + return copy_index + + def to_series( + self, index: Optional[Index] = None, name: Optional[Hashable] = None + ) -> bigframes.series.Series: + if self.nlevels != 1: + NotImplementedError( + f"Converting multi-index to series is not yet supported. {constants.FEEDBACK_LINK}" + ) + + import bigframes.series + + name = self.name if name is None else name + if index is None: + return bigframes.series.Series(data=self, index=self, name=name) + else: + return bigframes.series.Series(data=self, index=Index(index), name=name) + + def get_level_values(self, level) -> Index: + level_n = level if isinstance(level, int) else self.names.index(level) + block = self._block.drop_levels( + [self._block.index_columns[i] for i in range(self.nlevels) if i != level_n] + ) + return Index._from_block(block) + def _memory_usage(self) -> int: (n_rows,) = self.shape return sum( @@ -366,12 +440,6 @@ def to_numpy(self, dtype=None, **kwargs) -> np.ndarray: def __len__(self): return self.shape[0] - @classmethod - def _from_block(cls, block: blocks.Block) -> Index: - import bigframes.dataframe as df - - return Index(df.DataFrame(block)) - class IndexValue: """An immutable index.""" diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1288117395..970cc4c3d0 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -146,10 +146,15 @@ def __init__( block = result_index._block if block: - if index: - raise NotImplementedError( - "DataFrame 'index' constructor parameter not supported " - f"when passing BigQuery-backed objects. {constants.FEEDBACK_LINK}" + if index is not None: + bf_index = indexes.Index(index) + idx_block = bf_index._block + idx_cols = idx_block.index_columns + join_idx, (_, r_mapping) = block.reset_index().index.join( + bf_index._block.reset_index().index, how="inner" + ) + block = join_idx._block.set_index( + [r_mapping[idx_col] for idx_col in idx_cols] ) if columns: block = block.select_columns(list(columns)) # type:ignore @@ -250,7 +255,7 @@ def _sql_names( def index( self, ) -> indexes.Index: - return indexes.Index(self) + return indexes.Index.from_frame(self) @index.setter def index(self, value): @@ -661,6 +666,14 @@ def _apply_binop( ): if isinstance(other, (float, int)): return self._apply_scalar_binop(other, op, reverse=reverse) + elif isinstance(other, indexes.Index): + return self._apply_series_binop( + other.to_series(index=self.index), + op, + axis=axis, + how=how, + reverse=reverse, + ) elif isinstance(other, bigframes.series.Series): return self._apply_series_binop( other, op, axis=axis, how=how, reverse=reverse diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 077815a9d6..35c941363e 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -21,6 +21,7 @@ import bigframes.constants as constants import bigframes.core.blocks as blocks import bigframes.core.expression as ex +import bigframes.core.indexes as indexes import bigframes.core.scalar as scalars import bigframes.dtypes import bigframes.operations as ops @@ -54,10 +55,35 @@ def __init__( if isinstance(data, blocks.Block): assert len(data.value_columns) == 1 assert len(data.column_labels) == 1 + assert index is None block = data elif isinstance(data, SeriesMethods): block = data._get_block() + if index is not None: + # reindex + bf_index = indexes.Index(index) + idx_block = bf_index._block + idx_cols = idx_block.value_columns + block_idx, (_, _) = idx_block.index.join(block.index, how="left") + block = block_idx._block + block = block.with_index_labels(bf_index.names) + + elif isinstance(data, indexes.Index): + if data.nlevels != 1: + raise NotImplementedError("Cannot interpret multi-index as Series.") + # Reset index to promote index columns to value columns, set default index + block = data._block.reset_index(drop=False) + if index is not None: + # Align by offset + bf_index = indexes.Index(index) + idx_block = bf_index._block.reset_index(drop=False) + idx_cols = idx_block.value_columns + block_idx, (l_mapping, _) = idx_block.index.join( + block.index, how="left" + ) + block = block_idx._block.set_index([l_mapping[col] for col in idx_cols]) + block = block.with_index_labels(bf_index.names) if block: if name: @@ -66,16 +92,10 @@ def __init__( f"BigQuery DataFrames only supports hashable series names. {constants.FEEDBACK_LINK}" ) block = block.with_column_labels([name]) - if index: - raise NotImplementedError( - f"Series 'index' constructor parameter not supported when passing BigQuery-backed objects. {constants.FEEDBACK_LINK}" - ) if dtype: block = block.multi_apply_unary_op( block.value_columns, ops.AsTypeOp(to_type=dtype) ) - self._block = block - else: import bigframes.pandas @@ -95,14 +115,15 @@ def __init__( if isinstance(dt, pd.ArrowDtype) ) ): - self._block = blocks.block_from_local(pd_dataframe) + block = blocks.block_from_local(pd_dataframe) elif session: - self._block = session.read_pandas(pd_dataframe)._get_block() + block = session.read_pandas(pd_dataframe)._get_block() else: # Uses default global session - self._block = bigframes.pandas.read_pandas(pd_dataframe)._get_block() + block = bigframes.pandas.read_pandas(pd_dataframe)._get_block() if pd_series.name is None: - self._block = self._block.with_column_labels([None]) + block = self._block.with_column_labels([None]) + self._block: blocks.Block = block @property def _value_column(self) -> str: @@ -116,6 +137,10 @@ def _name(self) -> blocks.Label: def _dtype(self): return self._block.dtypes[0] + @property + def index(self) -> indexes.Index: + return indexes.Index.from_frame(self) + def _set_block(self, block: blocks.Block): self._block = block diff --git a/bigframes/series.py b/bigframes/series.py index 2371aad780..bb014cd5c1 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -76,10 +76,6 @@ def dtype(self): def dtypes(self): return self._dtype - @property - def index(self) -> indexes.Index: - return indexes.Index(self) - @property def loc(self) -> bigframes.core.indexers.LocSeriesIndexer: return bigframes.core.indexers.LocSeriesIndexer(self) diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index f7fa0f0855..2961884ebf 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -16,9 +16,44 @@ import pandas as pd import pytest +import bigframes.pandas as bpd from tests.system.utils import assert_pandas_index_equal_ignore_index_type +def test_index_construct_from_list(): + bf_result = bpd.Index( + [3, 14, 159], dtype=pd.Int64Dtype(), name="my_index" + ).to_pandas() + pd_result = pd.Index([3, 14, 159], dtype=pd.Int64Dtype(), name="my_index") + pd.testing.assert_index_equal(bf_result, pd_result) + + +def test_index_construct_from_series(): + bf_result = bpd.Index( + bpd.Series([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name"), + name="index_name", + dtype=pd.Int64Dtype(), + ).to_pandas() + pd_result = pd.Index( + pd.Series([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name"), + name="index_name", + dtype=pd.Int64Dtype(), + ) + pd.testing.assert_index_equal(bf_result, pd_result) + + +def test_index_construct_from_index(): + bf_index_input = bpd.Index( + [3, 14, 159], dtype=pd.Float64Dtype(), name="series_name" + ) + bf_result = bpd.Index( + bf_index_input, dtype=pd.Int64Dtype(), name="index_name" + ).to_pandas() + pd_index_input = pd.Index([3, 14, 159], dtype=pd.Float64Dtype(), name="series_name") + pd_result = pd.Index(pd_index_input, dtype=pd.Int64Dtype(), name="index_name") + pd.testing.assert_index_equal(bf_result, pd_result) + + def test_get_index(scalars_df_index, scalars_pandas_df_index): index = scalars_df_index.index bf_result = index.to_pandas() @@ -240,6 +275,43 @@ def test_index_value_counts(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) +@pytest.mark.parametrize( + ("level",), + [ + ("int64_too",), + ("rowindex_2",), + (1,), + ], +) +def test_index_get_level_values(scalars_df_index, scalars_pandas_df_index, level): + bf_result = ( + scalars_df_index.set_index(["int64_too", "rowindex_2"]) + .index.get_level_values(level) + .to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index( + ["int64_too", "rowindex_2"] + ).index.get_level_values(level) + + pd.testing.assert_index_equal(bf_result, pd_result) + + +def test_index_to_series( + scalars_df_index, + scalars_pandas_df_index, +): + bf_result = ( + scalars_df_index.set_index(["int64_too"]) + .index.to_series(index=scalars_df_index["float64_col"], name="new_name") + .to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index(["int64_too"]).index.to_series( + index=scalars_pandas_df_index["float64_col"], name="new_name" + ) + + pd.testing.assert_series_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ("how",), [ diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 6f919f740f..e533f631b3 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -42,6 +42,40 @@ def test_series_construct_copy(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result) +def test_series_construct_copy_with_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = series.Series( + scalars_df["int64_col"], + name="test_series", + dtype="Float64", + index=scalars_df["int64_too"], + ).to_pandas() + pd_result = pd.Series( + scalars_pandas_df["int64_col"], + name="test_series", + dtype="Float64", + index=scalars_pandas_df["int64_too"], + ) + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_series_construct_copy_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = series.Series( + scalars_df.index, + name="test_series", + dtype="Float64", + index=scalars_df["int64_too"], + ).to_pandas() + pd_result = pd.Series( + scalars_pandas_df.index, + name="test_series", + dtype="Float64", + index=scalars_pandas_df["int64_too"], + ) + pd.testing.assert_series_equal(bf_result, pd_result) + + def test_series_construct_pandas(scalars_dfs): _, scalars_pandas_df = scalars_dfs bf_result = series.Series( diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index e8737341a3..3ad8729271 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -58,6 +58,23 @@ def T(self) -> Index: """Return the transpose, which is by definition self.""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def copy( + self, + name=None, + ) -> Index: + """ + Make a copy of this object. + + Name is set on the new object. + + Args: + name (Label, optional): + Set name for new object. + Returns: + Index: Index refer to new object which is a copy of this object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def transpose(self) -> Index: """ Return the transpose, which is by definition self. @@ -81,6 +98,40 @@ def astype(self, dtype): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def get_level_values(self, level) -> Index: + """ + Return an Index of values for requested level. + + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatibility. + + Args: + level (int or str): + It is either the integer position or the name of the level. + + Returns: + Index: Calling object, as there is only one level in the Index. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def to_series(self): + """ + Create a Series with both index and values equal to the index keys. + + Useful with map for returning an indexer based on an index. + + Args: + index (Index, optional): + Index of resulting Series. If None, defaults to original index. + name (str, optional): + Name of resulting Series. If None, defaults to name of original + index. + + Returns: + Series: The dtype will be based on the type of the Index values. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def isin(self, values): """ Return a boolean array where the index values are in `values`. From 755f91c0fcf1b1295ce60cd87493153b46e8b126 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 19 Jan 2024 23:10:20 +0000 Subject: [PATCH 2/6] fix constructor bug --- bigframes/operations/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 35c941363e..588624e5fb 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -122,7 +122,7 @@ def __init__( # Uses default global session block = bigframes.pandas.read_pandas(pd_dataframe)._get_block() if pd_series.name is None: - block = self._block.with_column_labels([None]) + block = block.with_column_labels([None]) self._block: blocks.Block = block @property From a3e400e6c88cbec51e5bea9fe91d68e672af08ec Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Mon, 22 Jan 2024 21:55:48 +0000 Subject: [PATCH 3/6] fix error with index name mutation --- bigframes/core/indexes/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 5f6c9fe059..aaf309a487 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -104,7 +104,7 @@ def names(self) -> typing.Sequence[blocks.Label]: @names.setter def names(self, values: typing.Sequence[blocks.Label]): - return self._data._set_block(self._block.with_index_labels(values)) + return self._data._set_block(self._data._get_block().with_index_labels(values)) @property def nlevels(self) -> int: From 2060ea19283fbc910d14d71f9af38fdebbe9618e Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 23 Jan 2024 00:33:57 +0000 Subject: [PATCH 4/6] refactor index to make mutation clearer --- bigframes/core/indexers.py | 2 +- bigframes/core/indexes/index.py | 93 ++++++++++++++++++++------------- bigframes/dataframe.py | 8 ++- bigframes/operations/base.py | 11 ++-- bigframes/series.py | 12 +++-- 5 files changed, 71 insertions(+), 55 deletions(-) diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 6998d0e974..0a47c3a78e 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -294,7 +294,7 @@ def _loc_getitem_series_or_dataframe( keys_df = keys_df.set_index(temp_name, drop=True) return _perform_loc_list_join(series_or_dataframe, keys_df) elif isinstance(key, bigframes.core.indexes.Index): - block = key._data._get_block() + block = key._block block = block.select_columns(()) keys_df = bigframes.dataframe.DataFrame(block) return _perform_loc_list_join(series_or_dataframe, keys_df) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index aaf309a487..ccd88ec3e9 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -38,6 +38,7 @@ import third_party.bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index if typing.TYPE_CHECKING: + import bigframes.dataframe import bigframes.series @@ -50,13 +51,12 @@ def __init__( dtype=None, *, name=None, - frame: Optional[blocks.BlockHolder] = None, ): import bigframes.dataframe as df import bigframes.series as series - if frame is not None: - self._data = frame + if isinstance(data, blocks.Block): + block = data.select_columns([]) elif isinstance(data, df.DataFrame): raise ValueError("Cannot construct index from dataframe.") elif isinstance(data, series.Series) or isinstance(data, Index): @@ -67,27 +67,24 @@ def __init__( ) elif isinstance(data, Index): block = data._block - index = Index._from_block(block) + index = Index(data=block) name = data.name if name is None else name if name is not None: index.name = name if dtype is not None: index = index.astype(dtype) - self._data = df.DataFrame(index._block) + block = index._block else: pd_index = pandas.Index(data=data, dtype=dtype, name=name) pd_df = pandas.DataFrame(index=pd_index) - self._data = df.DataFrame(pd_df) + block = df.DataFrame(pd_df)._block + self._block: blocks.Block = block @classmethod - def from_frame(self, frame: blocks.BlockHolder) -> Index: - return Index(frame=frame) - - @classmethod - def _from_block(cls, block: blocks.Block) -> Index: - import bigframes.dataframe as df - - return Index.from_frame(df.DataFrame(block)) + def from_frame( + self, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] + ) -> Index: + return ViewIndex(frame) @property def name(self) -> blocks.Label: @@ -100,15 +97,16 @@ def name(self, value: blocks.Label): @property def names(self) -> typing.Sequence[blocks.Label]: """Returns the names of the Index.""" - return self._data._get_block()._index_labels + return self._block._index_labels @names.setter def names(self, values: typing.Sequence[blocks.Label]): - return self._data._set_block(self._data._get_block().with_index_labels(values)) + new_block = self._block.with_index_labels(values) + self._block = new_block @property def nlevels(self) -> int: - return len(self._data._get_block().index_columns) + return len(self._block.index_columns) @property def values(self) -> np.ndarray: @@ -120,7 +118,7 @@ def ndim(self) -> int: @property def shape(self) -> typing.Tuple[int]: - return (self._data._get_block().shape[0],) + return (self._block.shape[0],) @property def dtype(self): @@ -152,9 +150,7 @@ def is_monotonic_increasing(self) -> bool: """ return typing.cast( bool, - self._data._get_block().is_monotonic_increasing( - self._data._get_block().index_columns - ), + self._block.is_monotonic_increasing(self._block.index_columns), ) @property @@ -167,9 +163,7 @@ def is_monotonic_decreasing(self) -> bool: """ return typing.cast( bool, - self._data._get_block().is_monotonic_decreasing( - self._data._get_block().index_columns - ), + self._block.is_monotonic_decreasing(self._block.index_columns), ) @property @@ -194,16 +188,12 @@ def has_duplicates(self) -> bool: duplicates_df = df.DataFrame(duplicates_block) return duplicates_df["is_duplicate"].any() - @property - def _block(self) -> blocks.Block: - return self._data._get_block().select_columns([]) - @property def T(self) -> Index: return self.transpose() def copy(self, name: Optional[Hashable] = None): - copy_index = Index._from_block(self._block) + copy_index = Index(self._block) if name is not None: copy_index.name = name return copy_index @@ -229,7 +219,7 @@ def get_level_values(self, level) -> Index: block = self._block.drop_levels( [self._block.index_columns[i] for i in range(self.nlevels) if i != level_n] ) - return Index._from_block(block) + return Index(block) def _memory_usage(self) -> int: (n_rows,) = self.shape @@ -254,7 +244,7 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"): order.OrderingColumnReference(column, direction=direction, na_last=na_last) for column in index_columns ] - return Index._from_block(self._block.order_by(ordering)) + return Index(self._block.order_by(ordering)) def astype( self, @@ -343,7 +333,7 @@ def rename(self, name: Union[str, Sequence[str]]) -> Index: names = [name] if isinstance(name, str) else list(name) if len(names) != self.nlevels: raise ValueError("'name' must be same length as levels") - return Index._from_block(self._block.with_index_labels(names)) + return Index(self._block.with_index_labels(names)) def drop( self, @@ -365,17 +355,17 @@ def drop( ) block = block.filter(condition_id, keep_null=True) block = block.drop_columns([condition_id]) - return Index._from_block(block) + return Index(block) def dropna(self, how: str = "any") -> Index: if how not in ("any", "all"): raise ValueError("'how' must be one of 'any', 'all'") result = block_ops.dropna(self._block, self._block.index_columns, how=how) # type: ignore - return Index._from_block(result) + return Index(result) def drop_duplicates(self, *, keep: str = "first") -> Index: block = block_ops.drop_duplicates(self._block, self._block.index_columns, keep) - return Index._from_block(block) + return Index(block) def isin(self, values) -> Index: if not utils.is_list_like(values): @@ -404,7 +394,7 @@ def _apply_unary_expr( result_ids.append(result_id) block = block.set_index(result_ids, index_labels=self._block.index_labels) - return Index._from_block(block) + return Index(block) def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any: if self.nlevels > 1: @@ -442,6 +432,37 @@ def __len__(self): return self.shape[0] +# Index that mutates the originating dataframe/series +class ViewIndex(Index): + def __init__( + self, + series_or_dataframe: typing.Union[ + bigframes.series.Series, bigframes.dataframe.DataFrame + ], + ): + super().__init__(self, series_or_dataframe._block) + self._whole_frame = series_or_dataframe + + @property + def name(self) -> blocks.Label: + return self.names[0] + + @name.setter + def name(self, value: blocks.Label): + self.names = [value] + + @property + def names(self) -> typing.Sequence[blocks.Label]: + """Returns the names of the Index.""" + return self._block._index_labels + + @names.setter + def names(self, values: typing.Sequence[blocks.Label]): + new_block = self._whole_frame._get_block().with_index_labels(values) + self._whole_frame._set_block(new_block) + self._block = new_block + + class IndexValue: """An immutable index.""" diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 970cc4c3d0..4c15458a87 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1165,7 +1165,7 @@ def drop( return DataFrame(block) def _drop_by_index(self, index: indexes.Index) -> DataFrame: - block = index._data._get_block() + block = index._block block, ordering_col = block.promote_offsets() joined_index, (get_column_left, get_column_right) = self._block.index.join( block.index @@ -1301,9 +1301,7 @@ def _assign_single_item_listlike(self, k: str, v: Sequence) -> DataFrame: f"Length of values ({given_rows}) does not match length of index ({actual_rows})" ) - local_df = bigframes.dataframe.DataFrame( - {k: v}, session=self._get_block().expr.session - ) + local_df = DataFrame({k: v}, session=self._get_block().expr.session) # local_df is likely (but not guaranteed) to be cached locally # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE @@ -1604,7 +1602,7 @@ def _reindex_rows( raise ValueError("Original index must be unique to reindex") keep_original_names = False if isinstance(index, indexes.Index): - new_indexer = DataFrame(data=index._data._get_block())[[]] + new_indexer = DataFrame(data=index._block)[[]] else: if not isinstance(index, pandas.Index): keep_original_names = True diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 588624e5fb..6829d3faab 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -59,15 +59,14 @@ def __init__( block = data elif isinstance(data, SeriesMethods): - block = data._get_block() + block = data._block if index is not None: # reindex bf_index = indexes.Index(index) idx_block = bf_index._block idx_cols = idx_block.value_columns - block_idx, (_, _) = idx_block.index.join(block.index, how="left") - block = block_idx._block - block = block.with_index_labels(bf_index.names) + block_idx, _ = idx_block.index.join(block.index, how="left") + block = block_idx._block.with_index_labels(bf_index.names) elif isinstance(data, indexes.Index): if data.nlevels != 1: @@ -137,10 +136,6 @@ def _name(self) -> blocks.Label: def _dtype(self): return self._block.dtypes[0] - @property - def index(self) -> indexes.Index: - return indexes.Index.from_frame(self) - def _set_block(self, block: blocks.Block): self._block = block diff --git a/bigframes/series.py b/bigframes/series.py index bb014cd5c1..63aa209b03 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -116,6 +116,10 @@ def empty(self) -> bool: def values(self) -> numpy.ndarray: return self.to_numpy() + @property + def index(self) -> indexes.Index: + return indexes.Index.from_frame(self) + @property def query_job(self) -> Optional[bigquery.QueryJob]: """BigQuery job metadata for the most recent query. @@ -974,7 +978,7 @@ def idxmax(self) -> blocks.Label: ] ) block = block.slice(0, 1) - return indexes.Index._from_block(block).to_pandas()[0] + return indexes.Index(block).to_pandas()[0] def idxmin(self) -> blocks.Label: block = self._block.order_by( @@ -987,7 +991,7 @@ def idxmin(self) -> blocks.Label: ] ) block = block.slice(0, 1) - return indexes.Index._from_block(block).to_pandas()[0] + return indexes.Index(block).to_pandas()[0] @property def is_monotonic_increasing(self) -> bool: @@ -1275,9 +1279,7 @@ def reindex(self, index=None, *, validate: typing.Optional[bool] = None): raise ValueError("Original index must be unique to reindex") keep_original_names = False if isinstance(index, indexes.Index): - new_indexer = bigframes.dataframe.DataFrame(data=index._data._get_block())[ - [] - ] + new_indexer = bigframes.dataframe.DataFrame(data=index._block)[[]] else: if not isinstance(index, pandas.Index): keep_original_names = True From 3556784a25e5411543275ed9d320e0a165648de6 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 23 Jan 2024 01:35:16 +0000 Subject: [PATCH 5/6] fix index bugs --- bigframes/core/indexes/index.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index ccd88ec3e9..6fb572d8e0 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -82,7 +82,7 @@ def __init__( @classmethod def from_frame( - self, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] + cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] ) -> Index: return ViewIndex(frame) @@ -408,7 +408,7 @@ def __getitem__(self, key: int) -> typing.Any: result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas() else: # special case, want [-1:] instead of [-1:0] result_pd_df, _ = self._block.slice(key).to_pandas() - if result_pd_df.empty: + if result_pd_df.index.empty: raise IndexError("single positional indexer is out-of-bounds") return result_pd_df.index[0] else: @@ -440,7 +440,7 @@ def __init__( bigframes.series.Series, bigframes.dataframe.DataFrame ], ): - super().__init__(self, series_or_dataframe._block) + super().__init__(series_or_dataframe._block) self._whole_frame = series_or_dataframe @property From 0008d4988f59d5590a68ec4040059a70930abd21 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 26 Jan 2024 22:57:30 +0000 Subject: [PATCH 6/6] give index custom repr --- bigframes/core/indexes/index.py | 42 ++++++++++++++----- .../bigframes_vendored/pandas/core/frame.py | 7 +++- .../bigframes_vendored/pandas/core/series.py | 7 +++- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 6fb572d8e0..78a4fc6f0b 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -19,6 +19,7 @@ import typing from typing import Hashable, Mapping, Optional, Sequence, Tuple, Union +import google.cloud.bigquery as bigquery import numpy as np import pandas @@ -33,6 +34,7 @@ import bigframes.core.utils as utils import bigframes.dtypes import bigframes.dtypes as bf_dtypes +import bigframes.formatting_helpers as formatter import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import third_party.bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index @@ -78,13 +80,14 @@ def __init__( pd_index = pandas.Index(data=data, dtype=dtype, name=name) pd_df = pandas.DataFrame(index=pd_index) block = df.DataFrame(pd_df)._block + self._query_job = None self._block: blocks.Block = block @classmethod def from_frame( cls, frame: Union[bigframes.series.Series, bigframes.dataframe.DataFrame] ) -> Index: - return ViewIndex(frame) + return FrameIndex(frame) @property def name(self) -> blocks.Label: @@ -192,6 +195,32 @@ def has_duplicates(self) -> bool: def T(self) -> Index: return self.transpose() + @property + def query_job(self) -> Optional[bigquery.QueryJob]: + """BigQuery job metadata for the most recent query. + + Returns: + The most recent `QueryJob + `_. + """ + if self._query_job is None: + self._query_job = self._block._compute_dry_run() + return self._query_job + + def __repr__(self) -> str: + # TODO(swast): Add a timeout here? If the query is taking a long time, + # maybe we just print the job metadata that we have so far? + # TODO(swast): Avoid downloading the whole series by using job + # metadata, like we do with DataFrame. + opts = bigframes.options.display + max_results = opts.max_rows + if opts.repr_mode == "deferred": + return formatter.repr_query_job(self.query_job) + + pandas_df, _, query_job = self._block.retrieve_repr_request_results(max_results) + self._query_job = query_job + return repr(pandas_df.index) + def copy(self, name: Optional[Hashable] = None): copy_index = Index(self._block) if name is not None: @@ -433,7 +462,7 @@ def __len__(self): # Index that mutates the originating dataframe/series -class ViewIndex(Index): +class FrameIndex(Index): def __init__( self, series_or_dataframe: typing.Union[ @@ -495,15 +524,6 @@ def dtypes( def session(self) -> core.Session: return self._expr.session - def __repr__(self) -> str: - """Converts an Index to a string.""" - # TODO(swast): Add a timeout here? If the query is taking a long time, - # maybe we just print the job metadata that we have so far? - # TODO(swast): Avoid downloading the whole index by using job - # metadata, like we do with DataFrame. - preview = self.to_pandas() - return repr(preview) - def to_pandas(self) -> pandas.Index: """Executes deferred operations and downloads the results.""" # Project down to only the index column. So the query can be cached to visualize other data. diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 10cdbf8f7c..93fba9f3aa 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4714,7 +4714,7 @@ def index(self): [3 rows x 3 columns] >>> df.index # doctest: +ELLIPSIS - + Index([10, 20, 30], dtype='Int64') >>> df.index.values array([10, 20, 30], dtype=object) @@ -4731,7 +4731,10 @@ def index(self): [3 rows x 1 columns] >>> df1.index # doctest: +ELLIPSIS - + MultiIndex([( 'Alice', 'Seattle'), + ( 'Bob', 'New York'), + ('Aritra', 'Kona')], + name='Name') >>> df1.index.values array([('Alice', 'Seattle'), ('Bob', 'New York'), ('Aritra', 'Kona')], dtype=object) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 1aa4ffffbb..33f03572f1 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -69,7 +69,7 @@ def index(self): 30 35 Name: Age, dtype: Int64 >>> s.index # doctest: +ELLIPSIS - + Index([10, 20, 30], dtype='Int64') >>> s.index.values array([10, 20, 30], dtype=object) @@ -84,7 +84,10 @@ def index(self): Aritra Kona 35 Name: Age, dtype: Int64 >>> s1.index # doctest: +ELLIPSIS - + MultiIndex([( 'Alice', 'Seattle'), + ( 'Bob', 'New York'), + ('Aritra', 'Kona')], + name='Name') >>> s1.index.values array([('Alice', 'Seattle'), ('Bob', 'New York'), ('Aritra', 'Kona')], dtype=object)