feat: Add Series.peek to preview data efficiently #727

Status: Merged (23 commits, Jun 26, 2024)

Commits (23):
b3771b8  feat: Add Series.peek to preview data efficiently (TrevorBergeron, May 28, 2024)
e865395  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, May 30, 2024)
f227476  add another test (TrevorBergeron, May 30, 2024)
68fc1e1  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, May 31, 2024)
a17e027  cleanup comments (TrevorBergeron, May 31, 2024)
1764106  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, Jun 4, 2024)
5ff4661  more comments, up to 4 cluster cols for session-based caching (TrevorBergeron, Jun 4, 2024)
936e73d  add another session caching test (TrevorBergeron, Jun 4, 2024)
41f6083  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, Jun 4, 2024)
ffbc518  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, Jun 5, 2024)
a9b16c4  add todo for geo predicate detection (TrevorBergeron, Jun 5, 2024)
83fc8fb  Merge branch 'main' into series_cache (tswast, Jun 12, 2024)
ec1d973  add dtype clusterable and orderable property (TrevorBergeron, Jun 12, 2024)
c307625  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, Jun 13, 2024)
b917c71  fix session aware caching unit tests (TrevorBergeron, Jun 13, 2024)
848d0a4  mock session for planner test (TrevorBergeron, Jun 13, 2024)
79d05b5  fix offsets column name collision (TrevorBergeron, Jun 25, 2024)
06c9866  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, Jun 25, 2024)
2ed1520  Update bigframes/dtypes.py (TrevorBergeron, Jun 25, 2024)
1ff4f68  Update bigframes/dtypes.py (TrevorBergeron, Jun 25, 2024)
81e5a02  add another series peek test (TrevorBergeron, Jun 25, 2024)
e91dbb5  remove partial comment (TrevorBergeron, Jun 26, 2024)
108e449  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, Jun 26, 2024)
bigframes/core/blocks.py (3 additions, 3 deletions)
@@ -2286,13 +2286,13 @@ def to_sql_query(
             idx_labels,
         )
 
-    def cached(self, *, optimize_offsets=False, force: bool = False) -> None:
+    def cached(self, *, force: bool = False, session_aware: bool = False) -> None:
         """Write the block to a session table."""
         # use a heuristic for whether something needs to be cached
         if (not force) and self.session._is_trivially_executable(self.expr):
             return
-        if optimize_offsets:
-            self.session._cache_with_offsets(self.expr)
+        elif session_aware:
+            self.session._cache_with_session_awareness(self.expr)
         else:
             self.session._cache_with_cluster_cols(
                 self.expr, cluster_cols=self.index_columns
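For orientation, a minimal sketch of the three paths this dispatch now covers. Block.cached is internal API; block below is an assumed, already-constructed bigframes.core.blocks.Block, so this is illustrative rather than a public example:

    # Hypothetical calls against an existing Block instance (internal API).
    block.cached()                    # default: cluster the cached table by index columns
    block.cached(session_aware=True)  # let the session choose cluster columns from recent usage
    block.cached(force=True)          # cache even when the expression is trivially executable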
bigframes/core/pruning.py (new file, 77 additions)
@@ -0,0 +1,77 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import bigframes.core.expression as ex
import bigframes.core.schema as schemata
import bigframes.dtypes
import bigframes.operations as ops

LOW_CARDINALITY_TYPES = [bigframes.dtypes.BOOL_DTYPE]

COMPARISON_OP_TYPES = tuple(
    type(i)
    for i in (
        ops.eq_op,
        ops.eq_null_match_op,
        ops.ne_op,
        ops.gt_op,
        ops.ge_op,
        ops.lt_op,
        ops.le_op,
    )
)


def cluster_cols_for_predicate(
    predicate: ex.Expression, schema: schemata.ArraySchema
) -> list[str]:
    """Try to determine cluster col candidates that work with given predicates."""
    # TODO: Prioritize based on predicted selectivity (eg. equality conditions are probably very selective)
    if isinstance(predicate, ex.UnboundVariableExpression):
        cols = [predicate.id]
    elif isinstance(predicate, ex.OpExpression):
        op = predicate.op
        # TODO: Support geo predicates, which support pruning if clustered (other than st_disjoint)
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions
        if isinstance(op, COMPARISON_OP_TYPES):
            cols = cluster_cols_for_comparison(predicate.inputs[0], predicate.inputs[1])
        elif isinstance(op, (type(ops.invert_op))):
            cols = cluster_cols_for_predicate(predicate.inputs[0], schema)
        elif isinstance(op, (type(ops.and_op), type(ops.or_op))):
            left_cols = cluster_cols_for_predicate(predicate.inputs[0], schema)
            right_cols = cluster_cols_for_predicate(predicate.inputs[1], schema)
            cols = [*left_cols, *[col for col in right_cols if col not in left_cols]]
        else:
            cols = []
    else:
        # Constant
        cols = []
    return [
        col for col in cols if bigframes.dtypes.is_clusterable(schema.get_type(col))
    ]


def cluster_cols_for_comparison(
    left_ex: ex.Expression, right_ex: ex.Expression
) -> list[str]:
    # TODO: Try to normalize expressions such that one side is a single variable.
    # eg. Convert -cola>=3 to cola<-3 and colb+3 < 4 to colb < 1
    if left_ex.is_const:
        # There are some invertible ops that would also be ok
        if isinstance(right_ex, ex.UnboundVariableExpression):
            return [right_ex.id]
    elif right_ex.is_const:
        if isinstance(left_ex, ex.UnboundVariableExpression):
            return [left_ex.id]
    return []
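To make the recursion in cluster_cols_for_predicate concrete, here is a self-contained toy sketch of the same technique. It is not bigframes code: Col, Const, and Op are hypothetical stand-ins for ex.UnboundVariableExpression, constant expressions, and ex.OpExpression, and the final dtype filter (is_clusterable) is omitted:

    from dataclasses import dataclass
    from typing import Tuple, Union

    @dataclass(frozen=True)
    class Col:
        name: str

    @dataclass(frozen=True)
    class Const:
        value: object

    @dataclass(frozen=True)
    class Op:
        op: str                        # "eq", "lt", "and", "or", "not", ...
        inputs: Tuple["Expr", ...]

    Expr = Union[Col, Const, Op]
    COMPARISONS = {"eq", "ne", "lt", "le", "gt", "ge"}

    def cluster_cols(pred: Expr) -> list:
        """Collect columns usable as cluster keys for pruning under `pred`."""
        if isinstance(pred, Col):                 # bare boolean column predicate
            return [pred.name]
        if isinstance(pred, Op):
            if pred.op in COMPARISONS:            # col <op> const, on either side
                left, right = pred.inputs
                if isinstance(left, Const) and isinstance(right, Col):
                    return [right.name]
                if isinstance(right, Const) and isinstance(left, Col):
                    return [left.name]
                return []
            if pred.op == "not":                  # negation prunes on the same columns
                return cluster_cols(pred.inputs[0])
            if pred.op in ("and", "or"):          # union both branches, preserving order
                left = cluster_cols(pred.inputs[0])
                right = cluster_cols(pred.inputs[1])
                return [*left, *[c for c in right if c not in left]]
        return []                                 # constants, unsupported ops

    # (col_a == 5) AND (col_b > 3)  ->  ['col_a', 'col_b']
    pred = Op("and", (Op("eq", (Col("col_a"), Const(5))),
                      Op("gt", (Col("col_b"), Const(3)))))
    print(cluster_cols(pred))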
bigframes/core/tree_properties.py (38 additions, 1 deletion)
@@ -15,7 +15,7 @@
 
 import functools
 import itertools
-from typing import Callable, Dict, Optional
+from typing import Callable, Dict, Optional, Sequence
 
 import bigframes.core.nodes as nodes

@@ -91,6 +91,43 @@ def _node_counts_inner(
)


def count_nodes(forest: Sequence[nodes.BigFrameNode]) -> dict[nodes.BigFrameNode, int]:
    """
    Counts the number of instances of each subtree present within a forest.

    Memoizes internally to accelerate execution, but the cache is not persisted (not reused between invocations).

    Args:
        forest (Sequence of BigFrameNode):
            The roots of each tree in the forest

    Returns:
        dict[BigFrameNode, int]: The number of occurrences of each subtree.
    """

    def _combine_counts(
        left: Dict[nodes.BigFrameNode, int], right: Dict[nodes.BigFrameNode, int]
    ) -> Dict[nodes.BigFrameNode, int]:
        return {
            key: left.get(key, 0) + right.get(key, 0)
            for key in itertools.chain(left.keys(), right.keys())
        }

    empty_counts: Dict[nodes.BigFrameNode, int] = {}

    @functools.cache
    def _node_counts_inner(
        subtree: nodes.BigFrameNode,
    ) -> Dict[nodes.BigFrameNode, int]:
        """Helper function to count occurrences of duplicate nodes in a subtree."""
        child_counts = [_node_counts_inner(child) for child in subtree.child_nodes]
        node_counts = functools.reduce(_combine_counts, child_counts, empty_counts)
        return _combine_counts(node_counts, {subtree: 1})

    counts = [_node_counts_inner(root) for root in forest]
    return functools.reduce(_combine_counts, counts, empty_counts)


def replace_nodes(
    root: nodes.BigFrameNode,
    replacements: dict[nodes.BigFrameNode, nodes.BigFrameNode],
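The memoization matters because bigframes expression trees can share identical subtrees: each distinct subtree is traversed once and its count dictionary is reused wherever it reappears. A standalone sketch of the same counting scheme on a toy node type (hypothetical classes, not the bigframes ones):

    import functools
    import itertools
    from dataclasses import dataclass
    from typing import Dict, Sequence, Tuple

    # Toy stand-in for nodes.BigFrameNode: hashable, with child_nodes.
    @dataclass(frozen=True)
    class Node:
        name: str
        child_nodes: Tuple["Node", ...] = ()

    def count_nodes(forest: Sequence[Node]) -> Dict[Node, int]:
        def combine(left: Dict[Node, int], right: Dict[Node, int]) -> Dict[Node, int]:
            return {
                key: left.get(key, 0) + right.get(key, 0)
                for key in itertools.chain(left.keys(), right.keys())
            }

        @functools.cache  # each distinct subtree is only traversed once
        def counts(subtree: Node) -> Dict[Node, int]:
            child = [counts(c) for c in subtree.child_nodes]
            merged = functools.reduce(combine, child, {})
            return combine(merged, {subtree: 1})

        return functools.reduce(combine, [counts(root) for root in forest], {})

    # A shared subtree appears under both roots, so it is counted twice:
    shared = Node("scan")
    a = Node("filter", (shared,))
    b = Node("project", (shared,))
    print(count_nodes([a, b])[shared])  # 2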
bigframes/dtypes.py (68 additions, 9 deletions)
@@ -74,52 +74,95 @@ class SimpleDtypeInfo:
     logical_bytes: int = (
         8  # this is approximate only, some types are variably sized, also, compression
     )
+    orderable: bool = False
+    clusterable: bool = False


# TODO: Missing BQ types: INTERVAL, JSON, RANGE
# TODO: Add mappings to python types
 SIMPLE_TYPES = (
     SimpleDtypeInfo(
-        dtype=INT_DTYPE, arrow_dtype=pa.int64(), type_kind=("INT64", "INTEGER")
+        dtype=INT_DTYPE,
+        arrow_dtype=pa.int64(),
+        type_kind=("INT64", "INTEGER"),
+        orderable=True,
+        clusterable=True,
     ),
     SimpleDtypeInfo(
-        dtype=FLOAT_DTYPE, arrow_dtype=pa.float64(), type_kind=("FLOAT64", "FLOAT")
+        dtype=FLOAT_DTYPE,
+        arrow_dtype=pa.float64(),
+        type_kind=("FLOAT64", "FLOAT"),
+        orderable=True,
     ),
     SimpleDtypeInfo(
         dtype=BOOL_DTYPE,
         arrow_dtype=pa.bool_(),
         type_kind=("BOOL", "BOOLEAN"),
         logical_bytes=1,
+        orderable=True,
+        clusterable=True,
     ),
-    SimpleDtypeInfo(dtype=STRING_DTYPE, arrow_dtype=pa.string(), type_kind=("STRING",)),
     SimpleDtypeInfo(
-        dtype=DATE_DTYPE, arrow_dtype=pa.date32(), type_kind=("DATE",), logical_bytes=4
+        dtype=STRING_DTYPE,
+        arrow_dtype=pa.string(),
+        type_kind=("STRING",),
+        orderable=True,
+        clusterable=True,
     ),
-    SimpleDtypeInfo(dtype=TIME_DTYPE, arrow_dtype=pa.time64("us"), type_kind=("TIME",)),
     SimpleDtypeInfo(
-        dtype=DATETIME_DTYPE, arrow_dtype=pa.timestamp("us"), type_kind=("DATETIME",)
+        dtype=DATE_DTYPE,
+        arrow_dtype=pa.date32(),
+        type_kind=("DATE",),
+        logical_bytes=4,
+        orderable=True,
+        clusterable=True,
+    ),
+    SimpleDtypeInfo(
+        dtype=TIME_DTYPE,
+        arrow_dtype=pa.time64("us"),
+        type_kind=("TIME",),
+        orderable=True,
+    ),
+    SimpleDtypeInfo(
+        dtype=DATETIME_DTYPE,
+        arrow_dtype=pa.timestamp("us"),
+        type_kind=("DATETIME",),
+        orderable=True,
+        clusterable=True,
     ),
     SimpleDtypeInfo(
         dtype=TIMESTAMP_DTYPE,
         arrow_dtype=pa.timestamp("us", tz="UTC"),
         type_kind=("TIMESTAMP",),
+        orderable=True,
+        clusterable=True,
     ),
-    SimpleDtypeInfo(dtype=BYTES_DTYPE, arrow_dtype=pa.binary(), type_kind=("BYTES",)),
+    SimpleDtypeInfo(
+        dtype=BYTES_DTYPE, arrow_dtype=pa.binary(), type_kind=("BYTES",), orderable=True
+    ),
     SimpleDtypeInfo(
         dtype=NUMERIC_DTYPE,
         arrow_dtype=pa.decimal128(38, 9),
         type_kind=("NUMERIC",),
         logical_bytes=16,
+        orderable=True,
+        clusterable=True,
     ),
     SimpleDtypeInfo(
         dtype=BIGNUMERIC_DTYPE,
         arrow_dtype=pa.decimal256(76, 38),
         type_kind=("BIGNUMERIC",),
         logical_bytes=32,
+        orderable=True,
+        clusterable=True,
     ),
     # Geo has no corresponding arrow dtype
     SimpleDtypeInfo(
-        dtype=GEO_DTYPE, arrow_dtype=None, type_kind=("GEOGRAPHY",), logical_bytes=40
+        dtype=GEO_DTYPE,
+        arrow_dtype=None,
+        type_kind=("GEOGRAPHY",),
+        logical_bytes=40,
+        clusterable=True,
     ),
 )

@@ -209,9 +252,25 @@ def is_comparable(type: ExpressionType) -> bool:
     return (type is not None) and is_orderable(type)
 
 
+_ORDERABLE_SIMPLE_TYPES = set(
+    mapping.dtype for mapping in SIMPLE_TYPES if mapping.orderable
+)
+
+
 def is_orderable(type: ExpressionType) -> bool:
-    # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable
-    return not is_array_like(type) and not is_struct_like(type) and (type != GEO_DTYPE)
+    return type in _ORDERABLE_SIMPLE_TYPES
+
+
+_CLUSTERABLE_SIMPLE_TYPES = set(
+    mapping.dtype for mapping in SIMPLE_TYPES if mapping.clusterable
+)
+
+
+def is_clusterable(type: ExpressionType) -> bool:
+    # https://cloud.google.com/bigquery/docs/clustered-tables#cluster_column_types
+    # This is based on the default database type mapping; in theory, a value could be
+    # represented as a non-default BQ type to make it clusterable.
+    return type in _CLUSTERABLE_SIMPLE_TYPES


def is_bool_coercable(type: ExpressionType) -> bool:
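A small illustrative check of the asymmetry these two properties encode, using only names that appear in this diff: GEOGRAPHY can be a clustering key but is not orderable, while FLOAT64 is orderable but cannot be clustered on.

    import bigframes.dtypes as dtypes

    # GEOGRAPHY: clusterable per the BigQuery docs linked above, but not orderable.
    assert dtypes.is_clusterable(dtypes.GEO_DTYPE)
    assert not dtypes.is_orderable(dtypes.GEO_DTYPE)

    # FLOAT64: orderable, but BigQuery does not accept it as a cluster column.
    assert dtypes.is_orderable(dtypes.FLOAT_DTYPE)
    assert not dtypes.is_clusterable(dtypes.FLOAT_DTYPE)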
bigframes/series.py (39 additions, 4 deletions)
@@ -623,6 +623,40 @@ def head(self, n: int = 5) -> Series:
     def tail(self, n: int = 5) -> Series:
         return typing.cast(Series, self.iloc[-n:])

    def peek(self, n: int = 5, *, force: bool = True) -> pandas.Series:
        """
        Preview n arbitrary elements from the series without guarantees about row selection or ordering.

        ``Series.peek(force=False)`` will always be very fast, but will not succeed if data requires
        a full data scan. Using ``force=True`` will always succeed, but may perform queries.
        Query results will be cached so that future steps will benefit from these queries.

[Inline review thread on the docstring above]
tswast (Collaborator): Do we need a caveat here that caching is session-aware and will attempt to cache the optimal subtree? (Not sure exactly how to phrase that in a friendlier way.)
TrevorBergeron (Author): Yeah, not sure how/if we should communicate this to users. I also don't want to lock in any specific execution strategy other than "we might cache if force=True, but we will make that cache as useful as possible using some unspecified approach".

        Args:
            n (int, default 5):
                The number of rows to select from the series. Which N rows are returned is non-deterministic.
            force (bool, default True):
                If the data cannot be peeked efficiently, the series will instead be fully materialized as part
                of the operation if ``force=True``. If ``force=False``, the operation will throw a ValueError.
        Returns:
            pandas.Series: A pandas Series with n rows.

        Raises:
            ValueError: If force=False and data cannot be efficiently peeked.
        """
        maybe_result = self._block.try_peek(n)
        if maybe_result is None:
            if force:
                self._cached()
                maybe_result = self._block.try_peek(n, force=True)
                assert maybe_result is not None
            else:
                raise ValueError(
                    "Cannot peek efficiently when data has aggregates, joins or window functions applied. Use force=True to fully compute the series."
                )
        as_series = maybe_result.squeeze(axis=1)
        as_series.name = self.name
        return as_series

    def nlargest(self, n: int = 5, keep: str = "first") -> Series:
        if keep not in ("first", "last", "all"):
            raise ValueError("'keep' must be one of 'first', 'last', or 'all'")
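A hedged usage sketch of the new public API. The table path is only illustrative; note that for a plain table scan even force=False typically succeeds, so the ValueError branch below fires only when the plan contains aggregates, joins, or window functions:

    import bigframes.pandas as bpd

    # Any Series works; this public dataset path is just an example.
    names = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")["name"]

    # Fast path when the plan allows it; with force=True (the default), peek may
    # instead run and cache a query, then retry the peek.
    preview = names.peek(5)  # pandas.Series with 5 arbitrary rows

    # Strict mode: raise instead of running a potentially expensive query.
    try:
        names.peek(5, force=False)
    except ValueError:
        print("peek needs full materialization; retry with force=True")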
@@ -1419,7 +1453,7 @@ def apply(

         # return Series with materialized result so that any error in the remote
         # function is caught early
-        materialized_series = result_series._cached()
+        materialized_series = result_series._cached(session_aware=False)
         return materialized_series

     def combine(

@@ -1794,10 +1828,11 @@ def cache(self):
         Returns:
             Series: Self
         """
-        return self._cached(force=True)
+        # Do not use session-aware caching if user-requested
+        return self._cached(force=True, session_aware=False)
 
-    def _cached(self, *, force: bool = True) -> Series:
-        self._block.cached(force=force)
+    def _cached(self, *, force: bool = True, session_aware: bool = True) -> Series:
+        self._block.cached(force=force, session_aware=session_aware)
         return self
 
     def _optimize_query_complexity(self):
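The net effect of this split, sketched below with an assumed bigframes Series s (_cached is internal API): the public cache() keeps the old deterministic behavior, clustering by index columns, while internal callers such as peek's force path now default to session-aware caching.

    s.cache()                        # user-requested: force=True, cluster by index columns
    s._cached()                      # internal default: session-aware cluster column choice
    s._cached(session_aware=False)   # internal opt-out, back to index clustering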