-
Notifications
You must be signed in to change notification settings - Fork 50
feat: Add Series.peek to preview data efficiently #727
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We鈥檒l occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b3771b8
e865395
f227476
68fc1e1
a17e027
1764106
5ff4661
936e73d
41f6083
ffbc518
a9b16c4
83fc8fb
ec1d973
c307625
b917c71
848d0a4
79d05b5
06c9866
2ed1520
1ff4f68
81e5a02
e91dbb5
108e449
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
# Copyright 2024 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import bigframes.core.expression as ex | ||
import bigframes.core.schema as schemata | ||
import bigframes.dtypes | ||
import bigframes.operations as ops | ||
|
||
LOW_CARDINALITY_TYPES = [bigframes.dtypes.BOOL_DTYPE] | ||
|
||
COMPARISON_OP_TYPES = tuple( | ||
type(i) | ||
for i in ( | ||
ops.eq_op, | ||
ops.eq_null_match_op, | ||
ops.ne_op, | ||
ops.gt_op, | ||
ops.ge_op, | ||
ops.lt_op, | ||
ops.le_op, | ||
) | ||
) | ||
|
||
|
||
def cluster_cols_for_predicate( | ||
predicate: ex.Expression, schema: schemata.ArraySchema | ||
) -> list[str]: | ||
"""Try to determine cluster col candidates that work with given predicates.""" | ||
# TODO: Prioritize based on predicted selectivity (eg. equality conditions are probably very selective) | ||
if isinstance(predicate, ex.UnboundVariableExpression): | ||
cols = [predicate.id] | ||
elif isinstance(predicate, ex.OpExpression): | ||
op = predicate.op | ||
# TODO: Support geo predicates, which support pruning if clustered (other than st_disjoint) | ||
# https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions | ||
if isinstance(op, COMPARISON_OP_TYPES): | ||
cols = cluster_cols_for_comparison(predicate.inputs[0], predicate.inputs[1]) | ||
elif isinstance(op, (type(ops.invert_op))): | ||
cols = cluster_cols_for_predicate(predicate.inputs[0], schema) | ||
elif isinstance(op, (type(ops.and_op), type(ops.or_op))): | ||
left_cols = cluster_cols_for_predicate(predicate.inputs[0], schema) | ||
right_cols = cluster_cols_for_predicate(predicate.inputs[1], schema) | ||
cols = [*left_cols, *[col for col in right_cols if col not in left_cols]] | ||
else: | ||
cols = [] | ||
else: | ||
# Constant | ||
cols = [] | ||
return [ | ||
col for col in cols if bigframes.dtypes.is_clusterable(schema.get_type(col)) | ||
] | ||
|
||
|
||
def cluster_cols_for_comparison( | ||
left_ex: ex.Expression, right_ex: ex.Expression | ||
) -> list[str]: | ||
# TODO: Try to normalize expressions such that one side is a single variable. | ||
# eg. Convert -cola>=3 to cola<-3 and colb+3 < 4 to colb < 1 | ||
if left_ex.is_const: | ||
# There are some invertible ops that would also be ok | ||
tswast marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if isinstance(right_ex, ex.UnboundVariableExpression): | ||
return [right_ex.id] | ||
elif right_ex.is_const: | ||
if isinstance(left_ex, ex.UnboundVariableExpression): | ||
return [left_ex.id] | ||
return [] |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -623,6 +623,40 @@ def head(self, n: int = 5) -> Series: | |
def tail(self, n: int = 5) -> Series: | ||
return typing.cast(Series, self.iloc[-n:]) | ||
|
||
def peek(self, n: int = 5, *, force: bool = True) -> pandas.DataFrame: | ||
""" | ||
Preview n arbitrary elements from the series without guarantees about row selection or ordering. | ||
|
||
``Series.peek(force=False)`` will always be very fast, but will not succeed if data requires | ||
full data scanning. Using ``force=True`` will always succeed, but may be perform queries. | ||
Query results will be cached so that future steps will benefit from these queries. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need a caveat here that caching is session-aware and will attempt to cache the optimal subtree? (Not sure exactly how to phrase that in a friendlier way.) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, not sure how/if we should communicate this to users. I also don't want to lock in any specific execution strategy other than "we might cache if force=True, but we will make that cache as useful as possible using some unspecified approach". |
||
|
||
Args: | ||
n (int, default 5): | ||
The number of rows to select from the series. Which N rows are returned is non-deterministic. | ||
force (bool, default True): | ||
If the data cannot be peeked efficiently, the series will instead be fully materialized as part | ||
of the operation if ``force=True``. If ``force=False``, the operation will throw a ValueError. | ||
Returns: | ||
pandas.Series: A pandas Series with n rows. | ||
|
||
Raises: | ||
ValueError: If force=False and data cannot be efficiently peeked. | ||
""" | ||
maybe_result = self._block.try_peek(n) | ||
if maybe_result is None: | ||
if force: | ||
self._cached() | ||
maybe_result = self._block.try_peek(n, force=True) | ||
assert maybe_result is not None | ||
else: | ||
raise ValueError( | ||
"Cannot peek efficiently when data has aggregates, joins or window functions applied. Use force=True to fully compute dataframe." | ||
) | ||
as_series = maybe_result.squeeze(axis=1) | ||
as_series.name = self.name | ||
tswast marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return as_series | ||
|
||
def nlargest(self, n: int = 5, keep: str = "first") -> Series: | ||
if keep not in ("first", "last", "all"): | ||
raise ValueError("'keep must be one of 'first', 'last', or 'all'") | ||
|
@@ -1419,7 +1453,7 @@ def apply( | |
|
||
# return Series with materialized result so that any error in the remote | ||
# function is caught early | ||
materialized_series = result_series._cached() | ||
materialized_series = result_series._cached(session_aware=False) | ||
return materialized_series | ||
|
||
def combine( | ||
|
@@ -1794,10 +1828,11 @@ def cache(self): | |
Returns: | ||
Series: Self | ||
""" | ||
return self._cached(force=True) | ||
# Do not use session-aware cashing if user-requested | ||
return self._cached(force=True, session_aware=False) | ||
|
||
def _cached(self, *, force: bool = True) -> Series: | ||
self._block.cached(force=force) | ||
def _cached(self, *, force: bool = True, session_aware: bool = True) -> Series: | ||
self._block.cached(force=force, session_aware=session_aware) | ||
return self | ||
|
||
def _optimize_query_complexity(self): | ||
|
Uh oh!
There was an error while loading. Please reload this page.