feat: Add Series.peek to preview data efficiently #727

Status: Merged (23 commits, Jun 26, 2024)

Commits (23):
b3771b8  feat: Add Series.peek to preview data efficiently (TrevorBergeron, May 28, 2024)
e865395  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, May 30, 2024)
f227476  add another test (TrevorBergeron, May 30, 2024)
68fc1e1  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, May 31, 2024)
a17e027  cleanup comments (TrevorBergeron, May 31, 2024)
1764106  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, Jun 4, 2024)
5ff4661  more comments, up to 4 cluster cols for session-based caching (TrevorBergeron, Jun 4, 2024)
936e73d  add another session caching test (TrevorBergeron, Jun 4, 2024)
41f6083  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, Jun 4, 2024)
ffbc518  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, Jun 5, 2024)
a9b16c4  add todo for geo predicate detection (TrevorBergeron, Jun 5, 2024)
83fc8fb  Merge branch 'main' into series_cache (tswast, Jun 12, 2024)
ec1d973  add dtype clusterable and orderable property (TrevorBergeron, Jun 12, 2024)
c307625  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, Jun 13, 2024)
b917c71  fix session aware caching unit tests (TrevorBergeron, Jun 13, 2024)
848d0a4  mock session for planner test (TrevorBergeron, Jun 13, 2024)
79d05b5  fix offsets column name collision (TrevorBergeron, Jun 25, 2024)
06c9866  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, Jun 25, 2024)
2ed1520  Update bigframes/dtypes.py (TrevorBergeron, Jun 25, 2024)
1ff4f68  Update bigframes/dtypes.py (TrevorBergeron, Jun 25, 2024)
81e5a02  add another series peek test (TrevorBergeron, Jun 25, 2024)
e91dbb5  remove partial comment (TrevorBergeron, Jun 26, 2024)
108e449  Merge remote-tracking branch 'github/main' into series_cache (TrevorBergeron, Jun 26, 2024)
bigframes/core/blocks.py (3 additions, 3 deletions)
@@ -2286,13 +2286,13 @@ def to_sql_query(
             idx_labels,
         )
 
-    def cached(self, *, optimize_offsets=False, force: bool = False) -> None:
+    def cached(self, *, force: bool = False, session_aware: bool = False) -> None:
         """Write the block to a session table."""
         # use a heuristic for whether something needs to be cached
         if (not force) and self.session._is_trivially_executable(self.expr):
             return
-        if optimize_offsets:
-            self.session._cache_with_offsets(self.expr)
+        elif session_aware:
+            self.session._cache_with_session_awareness(self.expr)
         else:
             self.session._cache_with_cluster_cols(
                 self.expr, cluster_cols=self.index_columns
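For orientation, a minimal sketch of the three paths this dispatch now covers. Block.cached is internal API; block below is an assumed, already-constructed bigframes.core.blocks.Block, so this is illustrative rather than a public example:

    # Hypothetical calls against an existing Block instance (internal API).
    block.cached()                    # default: cluster the cached table by index columns
    block.cached(session_aware=True)  # let the session choose cluster columns from recent usage
    block.cached(force=True)          # cache even when the expression is trivially executable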
bigframes/core/pruning.py (new file, 77 additions)
@@ -0,0 +1,77 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import bigframes.core.expression as ex
import bigframes.core.schema as schemata
import bigframes.dtypes
import bigframes.operations as ops

LOW_CARDINALITY_TYPES = [bigframes.dtypes.BOOL_DTYPE]

COMPARISON_OP_TYPES = tuple(
    type(i)
    for i in (
        ops.eq_op,
        ops.eq_null_match_op,
        ops.ne_op,
        ops.gt_op,
        ops.ge_op,
        ops.lt_op,
        ops.le_op,
    )
)


def cluster_cols_for_predicate(
    predicate: ex.Expression, schema: schemata.ArraySchema
) -> list[str]:
    """Try to determine cluster col candidates that work with given predicates."""
    # TODO: Prioritize based on predicted selectivity (eg. equality conditions are probably very selective)
    if isinstance(predicate, ex.UnboundVariableExpression):
        cols = [predicate.id]
    elif isinstance(predicate, ex.OpExpression):
        op = predicate.op
        # TODO: Support geo predicates, which support pruning if clustered (other than st_disjoint)
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions
        if isinstance(op, COMPARISON_OP_TYPES):
            cols = cluster_cols_for_comparison(predicate.inputs[0], predicate.inputs[1])
        elif isinstance(op, (type(ops.invert_op))):
            cols = cluster_cols_for_predicate(predicate.inputs[0], schema)
        elif isinstance(op, (type(ops.and_op), type(ops.or_op))):
            left_cols = cluster_cols_for_predicate(predicate.inputs[0], schema)
            right_cols = cluster_cols_for_predicate(predicate.inputs[1], schema)
            cols = [*left_cols, *[col for col in right_cols if col not in left_cols]]
        else:
            cols = []
    else:
        # Constant
        cols = []
    return [
        col for col in cols if bigframes.dtypes.is_clusterable(schema.get_type(col))
    ]


def cluster_cols_for_comparison(
    left_ex: ex.Expression, right_ex: ex.Expression
) -> list[str]:
    # TODO: Try to normalize expressions such that one side is a single variable.
    # eg. Convert -cola>=3 to cola<-3 and colb+3 < 4 to colb < 1
    if left_ex.is_const:
        # There are some invertible ops that would also be ok
        if isinstance(right_ex, ex.UnboundVariableExpression):
            return [right_ex.id]
    elif right_ex.is_const:
        if isinstance(left_ex, ex.UnboundVariableExpression):
            return [left_ex.id]
    return []
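To make the recursion in cluster_cols_for_predicate concrete, here is a self-contained toy sketch of the same technique. It is not bigframes code: Col, Const, and Op are hypothetical stand-ins for ex.UnboundVariableExpression, constant expressions, and ex.OpExpression, and the final dtype filter (is_clusterable) is omitted:

    from dataclasses import dataclass
    from typing import Tuple, Union

    @dataclass(frozen=True)
    class Col:
        name: str

    @dataclass(frozen=True)
    class Const:
        value: object

    @dataclass(frozen=True)
    class Op:
        op: str                        # "eq", "lt", "and", "or", "not", ...
        inputs: Tuple["Expr", ...]

    Expr = Union[Col, Const, Op]
    COMPARISONS = {"eq", "ne", "lt", "le", "gt", "ge"}

    def cluster_cols(pred: Expr) -> list:
        """Collect columns usable as cluster keys for pruning under `pred`."""
        if isinstance(pred, Col):                 # bare boolean column predicate
            return [pred.name]
        if isinstance(pred, Op):
            if pred.op in COMPARISONS:            # col <op> const, on either side
                left, right = pred.inputs
                if isinstance(left, Const) and isinstance(right, Col):
                    return [right.name]
                if isinstance(right, Const) and isinstance(left, Col):
                    return [left.name]
                return []
            if pred.op == "not":                  # negation prunes on the same columns
                return cluster_cols(pred.inputs[0])
            if pred.op in ("and", "or"):          # union both branches, preserving order
                left = cluster_cols(pred.inputs[0])
                right = cluster_cols(pred.inputs[1])
                return [*left, *[c for c in right if c not in left]]
        return []                                 # constants, unsupported ops

    # (col_a == 5) AND (col_b > 3)  ->  ['col_a', 'col_b']
    pred = Op("and", (Op("eq", (Col("col_a"), Const(5))),
                      Op("gt", (Col("col_b"), Const(3)))))
    print(cluster_cols(pred))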
bigframes/core/tree_properties.py (38 additions, 1 deletion)
@@ -15,7 +15,7 @@
 
 import functools
 import itertools
-from typing import Callable, Dict, Optional
+from typing import Callable, Dict, Optional, Sequence
 
 import bigframes.core.nodes as nodes

@@ -91,6 +91,43 @@ def _node_counts_inner(
)


def count_nodes(forest: Sequence[nodes.BigFrameNode]) -> dict[nodes.BigFrameNode, int]:
    """
    Counts the number of instances of each subtree present within a forest.

    Memoizes internally to accelerate execution, but the cache is not persisted (not reused between invocations).

    Args:
        forest (Sequence of BigFrameNode):
            The roots of each tree in the forest

    Returns:
        dict[BigFrameNode, int]: The number of occurrences of each subtree.
    """

    def _combine_counts(
        left: Dict[nodes.BigFrameNode, int], right: Dict[nodes.BigFrameNode, int]
    ) -> Dict[nodes.BigFrameNode, int]:
        return {
            key: left.get(key, 0) + right.get(key, 0)
            for key in itertools.chain(left.keys(), right.keys())
        }

    empty_counts: Dict[nodes.BigFrameNode, int] = {}

    @functools.cache
    def _node_counts_inner(
        subtree: nodes.BigFrameNode,
    ) -> Dict[nodes.BigFrameNode, int]:
        """Helper function to count occurrences of duplicate nodes in a subtree."""
        child_counts = [_node_counts_inner(child) for child in subtree.child_nodes]
        node_counts = functools.reduce(_combine_counts, child_counts, empty_counts)
        return _combine_counts(node_counts, {subtree: 1})

    counts = [_node_counts_inner(root) for root in forest]
    return functools.reduce(_combine_counts, counts, empty_counts)


def replace_nodes(
    root: nodes.BigFrameNode,
    replacements: dict[nodes.BigFrameNode, nodes.BigFrameNode],
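The memoization matters because bigframes expression trees can share identical subtrees: each distinct subtree is traversed once and its count dictionary is reused wherever it reappears. A standalone sketch of the same counting scheme on a toy node type (hypothetical classes, not the bigframes ones):

    import functools
    import itertools
    from dataclasses import dataclass
    from typing import Dict, Sequence, Tuple

    # Toy stand-in for nodes.BigFrameNode: hashable, with child_nodes.
    @dataclass(frozen=True)
    class Node:
        name: str
        child_nodes: Tuple["Node", ...] = ()

    def count_nodes(forest: Sequence[Node]) -> Dict[Node, int]:
        def combine(left: Dict[Node, int], right: Dict[Node, int]) -> Dict[Node, int]:
            return {
                key: left.get(key, 0) + right.get(key, 0)
                for key in itertools.chain(left.keys(), right.keys())
            }

        @functools.cache  # each distinct subtree is only traversed once
        def counts(subtree: Node) -> Dict[Node, int]:
            child = [counts(c) for c in subtree.child_nodes]
            merged = functools.reduce(combine, child, {})
            return combine(merged, {subtree: 1})

        return functools.reduce(combine, [counts(root) for root in forest], {})

    # A shared subtree appears under both roots, so it is counted twice:
    shared = Node("scan")
    a = Node("filter", (shared,))
    b = Node("project", (shared,))
    print(count_nodes([a, b])[shared])  # 2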
bigframes/dtypes.py (68 additions, 9 deletions)
@@ -74,52 +74,95 @@ class SimpleDtypeInfo:
     logical_bytes: int = (
         8  # this is approximate only, some types are variably sized, also, compression
     )
+    orderable: bool = False
+    clusterable: bool = False


# TODO: Missing BQ types: INTERVAL, JSON, RANGE
# TODO: Add mappings to python types
 SIMPLE_TYPES = (
     SimpleDtypeInfo(
-        dtype=INT_DTYPE, arrow_dtype=pa.int64(), type_kind=("INT64", "INTEGER")
+        dtype=INT_DTYPE,
+        arrow_dtype=pa.int64(),
+        type_kind=("INT64", "INTEGER"),
+        orderable=True,
+        clusterable=True,
     ),
     SimpleDtypeInfo(
-        dtype=FLOAT_DTYPE, arrow_dtype=pa.float64(), type_kind=("FLOAT64", "FLOAT")
+        dtype=FLOAT_DTYPE,
+        arrow_dtype=pa.float64(),
+        type_kind=("FLOAT64", "FLOAT"),
+        orderable=True,
     ),
     SimpleDtypeInfo(
         dtype=BOOL_DTYPE,
         arrow_dtype=pa.bool_(),
         type_kind=("BOOL", "BOOLEAN"),
         logical_bytes=1,
+        orderable=True,
+        clusterable=True,
     ),
-    SimpleDtypeInfo(dtype=STRING_DTYPE, arrow_dtype=pa.string(), type_kind=("STRING",)),
     SimpleDtypeInfo(
-        dtype=DATE_DTYPE, arrow_dtype=pa.date32(), type_kind=("DATE",), logical_bytes=4
+        dtype=STRING_DTYPE,
+        arrow_dtype=pa.string(),
+        type_kind=("STRING",),
+        orderable=True,
+        clusterable=True,
     ),
-    SimpleDtypeInfo(dtype=TIME_DTYPE, arrow_dtype=pa.time64("us"), type_kind=("TIME",)),
     SimpleDtypeInfo(
-        dtype=DATETIME_DTYPE, arrow_dtype=pa.timestamp("us"), type_kind=("DATETIME",)
+        dtype=DATE_DTYPE,
+        arrow_dtype=pa.date32(),
+        type_kind=("DATE",),
+        logical_bytes=4,
+        orderable=True,
+        clusterable=True,
+    ),
+    SimpleDtypeInfo(
+        dtype=TIME_DTYPE,
+        arrow_dtype=pa.time64("us"),
+        type_kind=("TIME",),
+        orderable=True,
+    ),
+    SimpleDtypeInfo(
+        dtype=DATETIME_DTYPE,
+        arrow_dtype=pa.timestamp("us"),
+        type_kind=("DATETIME",),
+        orderable=True,
+        clusterable=True,
     ),
     SimpleDtypeInfo(
         dtype=TIMESTAMP_DTYPE,
         arrow_dtype=pa.timestamp("us", tz="UTC"),
         type_kind=("TIMESTAMP",),
+        orderable=True,
+        clusterable=True,
     ),
-    SimpleDtypeInfo(dtype=BYTES_DTYPE, arrow_dtype=pa.binary(), type_kind=("BYTES",)),
+    SimpleDtypeInfo(
+        dtype=BYTES_DTYPE, arrow_dtype=pa.binary(), type_kind=("BYTES",), orderable=True
+    ),
     SimpleDtypeInfo(
         dtype=NUMERIC_DTYPE,
         arrow_dtype=pa.decimal128(38, 9),
         type_kind=("NUMERIC",),
         logical_bytes=16,
+        orderable=True,
+        clusterable=True,
     ),
     SimpleDtypeInfo(
         dtype=BIGNUMERIC_DTYPE,
         arrow_dtype=pa.decimal256(76, 38),
         type_kind=("BIGNUMERIC",),
         logical_bytes=32,
+        orderable=True,
+        clusterable=True,
     ),
     # Geo has no corresponding arrow dtype
     SimpleDtypeInfo(
-        dtype=GEO_DTYPE, arrow_dtype=None, type_kind=("GEOGRAPHY",), logical_bytes=40
+        dtype=GEO_DTYPE,
+        arrow_dtype=None,
+        type_kind=("GEOGRAPHY",),
+        logical_bytes=40,
+        clusterable=True,
     ),
 )

@@ -209,9 +252,25 @@ def is_comparable(type: ExpressionType) -> bool:
     return (type is not None) and is_orderable(type)
 
 
+_ORDERABLE_SIMPLE_TYPES = set(
+    mapping.dtype for mapping in SIMPLE_TYPES if mapping.orderable
+)
+
+
 def is_orderable(type: ExpressionType) -> bool:
-    # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable
-    return not is_array_like(type) and not is_struct_like(type) and (type != GEO_DTYPE)
+    return type in _ORDERABLE_SIMPLE_TYPES
+
+
+_CLUSTERABLE_SIMPLE_TYPES = set(
+    mapping.dtype for mapping in SIMPLE_TYPES if mapping.clusterable
+)
+
+
+def is_clusterable(type: ExpressionType) -> bool:
+    # https://cloud.google.com/bigquery/docs/clustered-tables#cluster_column_types
+    # This is based on the default database type mapping; in theory, a value could be
+    # represented as a non-default BQ type to make it clusterable.
+    return type in _CLUSTERABLE_SIMPLE_TYPES


def is_bool_coercable(type: ExpressionType) -> bool:
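A small illustrative check of the asymmetry these two properties encode, using only names that appear in this diff: GEOGRAPHY can be a clustering key but is not orderable, while FLOAT64 is orderable but cannot be clustered on.

    import bigframes.dtypes as dtypes

    # GEOGRAPHY: clusterable per the BigQuery docs linked above, but not orderable.
    assert dtypes.is_clusterable(dtypes.GEO_DTYPE)
    assert not dtypes.is_orderable(dtypes.GEO_DTYPE)

    # FLOAT64: orderable, but BigQuery does not accept it as a cluster column.
    assert dtypes.is_orderable(dtypes.FLOAT_DTYPE)
    assert not dtypes.is_clusterable(dtypes.FLOAT_DTYPE)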
bigframes/series.py (39 additions, 4 deletions)
@@ -623,6 +623,40 @@ def head(self, n: int = 5) -> Series:
     def tail(self, n: int = 5) -> Series:
         return typing.cast(Series, self.iloc[-n:])

    def peek(self, n: int = 5, *, force: bool = True) -> pandas.Series:
        """
        Preview n arbitrary elements from the series without guarantees about row selection or ordering.

        ``Series.peek(force=False)`` will always be very fast, but will not succeed if data requires
        a full data scan. Using ``force=True`` will always succeed, but may perform queries.
        Query results will be cached so that future steps will benefit from these queries.

[Inline review thread on the docstring above]
tswast (Collaborator): Do we need a caveat here that caching is session-aware and will attempt to cache the optimal subtree? (Not sure exactly how to phrase that in a friendlier way.)
TrevorBergeron (Author): Yeah, not sure how/if we should communicate this to users. I also don't want to lock in any specific execution strategy other than "we might cache if force=True, but we will make that cache as useful as possible using some unspecified approach".

        Args:
            n (int, default 5):
                The number of rows to select from the series. Which N rows are returned is non-deterministic.
            force (bool, default True):
                If the data cannot be peeked efficiently, the series will instead be fully materialized as part
                of the operation if ``force=True``. If ``force=False``, the operation will throw a ValueError.
        Returns:
            pandas.Series: A pandas Series with n rows.

        Raises:
            ValueError: If force=False and data cannot be efficiently peeked.
        """
        maybe_result = self._block.try_peek(n)
        if maybe_result is None:
            if force:
                self._cached()
                maybe_result = self._block.try_peek(n, force=True)
                assert maybe_result is not None
            else:
                raise ValueError(
                    "Cannot peek efficiently when data has aggregates, joins or window functions applied. Use force=True to fully compute the series."
                )
        as_series = maybe_result.squeeze(axis=1)
        as_series.name = self.name
        return as_series

    def nlargest(self, n: int = 5, keep: str = "first") -> Series:
        if keep not in ("first", "last", "all"):
            raise ValueError("'keep' must be one of 'first', 'last', or 'all'")
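A hedged usage sketch of the new public API. The table path is only illustrative; note that for a plain table scan even force=False typically succeeds, so the ValueError branch below fires only when the plan contains aggregates, joins, or window functions:

    import bigframes.pandas as bpd

    # Any Series works; this public dataset path is just an example.
    names = bpd.read_gbq("bigquery-public-data.usa_names.usa_1910_2013")["name"]

    # Fast path when the plan allows it; with force=True (the default), peek may
    # instead run and cache a query, then retry the peek.
    preview = names.peek(5)  # pandas.Series with 5 arbitrary rows

    # Strict mode: raise instead of running a potentially expensive query.
    try:
        names.peek(5, force=False)
    except ValueError:
        print("peek needs full materialization; retry with force=True")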
@@ -1419,7 +1453,7 @@ def apply(

         # return Series with materialized result so that any error in the remote
         # function is caught early
-        materialized_series = result_series._cached()
+        materialized_series = result_series._cached(session_aware=False)
         return materialized_series

     def combine(

@@ -1794,10 +1828,11 @@ def cache(self):
         Returns:
             Series: Self
         """
-        return self._cached(force=True)
+        # Do not use session-aware caching if user-requested
+        return self._cached(force=True, session_aware=False)
 
-    def _cached(self, *, force: bool = True) -> Series:
-        self._block.cached(force=force)
+    def _cached(self, *, force: bool = True, session_aware: bool = True) -> Series:
+        self._block.cached(force=force, session_aware=session_aware)
         return self
 
     def _optimize_query_complexity(self):
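The net effect of this split, sketched below with an assumed bigframes Series s (_cached is internal API): the public cache() keeps the old deterministic behavior, clustering by index columns, while internal callers such as peek's force path now default to session-aware caching.

    s.cache()                        # user-requested: force=True, cluster by index columns
    s._cached()                      # internal default: session-aware cluster column choice
    s._cached(session_aware=False)   # internal opt-out, back to index clustering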