From 4ae6fcc01652d50137ed7ca06fed6044827aec1d Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Wed, 8 Nov 2023 00:50:41 +0000
Subject: [PATCH 1/3] fix: increase recursion limit, cache compilation tree hashes

---
 bigframes/core/nodes.py              | 72 ++++++++++++++++++++++++++--
 bigframes/pandas/__init__.py         |  4 ++
 tests/system/small/test_dataframe.py |  7 +++
 3 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py
index 7b252b164f..6694e55aac 100644
--- a/bigframes/core/nodes.py
+++ b/bigframes/core/nodes.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from __future__ import annotations
 
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, fields
 import functools
 import typing
 from typing import Optional, Tuple
@@ -65,6 +65,10 @@ def session(self):
             return sessions[0]
         return None
 
+    @functools.cached_property
+    def _node_hash(self):
+        return hash(tuple(hash(getattr(self, field.name)) for field in fields(self)))
+
 
 @dataclass(frozen=True)
 class UnaryNode(BigFrameNode):
@@ -93,6 +97,9 @@ class JoinNode(BigFrameNode):
     def child_nodes(self) -> typing.Sequence[BigFrameNode]:
         return (self.left_child, self.right_child)
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class ConcatNode(BigFrameNode):
@@ -102,6 +109,9 @@ class ConcatNode(BigFrameNode):
     def child_nodes(self) -> typing.Sequence[BigFrameNode]:
         return self.children
 
+    def __hash__(self):
+        return self._node_hash
+
 
 # Input Nodex
 @dataclass(frozen=True)
@@ -109,6 +119,9 @@ class ReadLocalNode(BigFrameNode):
     feather_bytes: bytes
     column_ids: typing.Tuple[str, ...]
 
+    def __hash__(self):
+        return self._node_hash
+
 
 # TODO: Refactor to take raw gbq object reference
 @dataclass(frozen=True)
@@ -123,39 +136,61 @@ class ReadGbqNode(BigFrameNode):
     def session(self):
         return (self.table_session,)
 
+    def __hash__(self):
+        return self._node_hash
+
 
 # Unary nodes
 @dataclass(frozen=True)
 class DropColumnsNode(UnaryNode):
     columns: Tuple[str, ...]
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class PromoteOffsetsNode(UnaryNode):
     col_id: str
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class FilterNode(UnaryNode):
     predicate_id: str
     keep_null: bool = False
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class OrderByNode(UnaryNode):
     by: Tuple[OrderingColumnReference, ...]
     stable: bool = False
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class ReversedNode(UnaryNode):
-    pass
+    # dummy field to ensure this node gets a distinct hash
+    reversed: bool = True
+
+    def __hash__(self):
+        return self._node_hash
 
 
 @dataclass(frozen=True)
 class SelectNode(UnaryNode):
     column_ids: typing.Tuple[str, ...]
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class ProjectUnaryOpNode(UnaryNode):
@@ -163,6 +198,9 @@ class ProjectUnaryOpNode(UnaryNode):
     op: ops.UnaryOp
     output_id: Optional[str] = None
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class ProjectBinaryOpNode(UnaryNode):
@@ -171,6 +209,9 @@ class ProjectBinaryOpNode(UnaryNode):
     op: ops.BinaryOp
     output_id: str
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class ProjectTernaryOpNode(UnaryNode):
@@ -180,6 +221,9 @@ class ProjectTernaryOpNode(UnaryNode):
     op: ops.TernaryOp
     output_id: str
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class AggregateNode(UnaryNode):
@@ -187,12 +231,18 @@ class AggregateNode(UnaryNode):
     by_column_ids: typing.Tuple[str, ...] = tuple([])
     dropna: bool = True
 
+    def __hash__(self):
+        return self._node_hash
+
 
 # TODO: Unify into aggregate
 @dataclass(frozen=True)
 class CorrNode(UnaryNode):
     corr_aggregations: typing.Tuple[typing.Tuple[str, str, str], ...]
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class WindowOpNode(UnaryNode):
@@ -203,10 +253,14 @@ class WindowOpNode(UnaryNode):
     never_skip_nulls: bool = False
     skip_reproject_unsafe: bool = False
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class ReprojectOpNode(UnaryNode):
-    pass
+    def __hash__(self):
+        return self._node_hash
 
 
 @dataclass(frozen=True)
@@ -222,12 +276,18 @@ class UnpivotNode(UnaryNode):
     ] = (pandas.Float64Dtype(),)
     how: typing.Literal["left", "right"] = "left"
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class AssignNode(UnaryNode):
     source_id: str
     destination_id: str
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class AssignConstantNode(UnaryNode):
@@ -235,6 +295,9 @@ class AssignConstantNode(UnaryNode):
     value: typing.Hashable
     dtype: typing.Optional[bigframes.dtypes.Dtype]
 
+    def __hash__(self):
+        return self._node_hash
+
 
 @dataclass(frozen=True)
 class RandomSampleNode(UnaryNode):
@@ -243,3 +306,6 @@ class RandomSampleNode(UnaryNode):
     @property
     def deterministic(self) -> bool:
         return False
+
+    def __hash__(self):
+        return self._node_hash
diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py
index 1c52b103fb..1fc1f5db79 100644
--- a/bigframes/pandas/__init__.py
+++ b/bigframes/pandas/__init__.py
@@ -18,6 +18,7 @@
 
 from collections import namedtuple
 import inspect
+import sys
 import typing
 from typing import (
     Any,
@@ -647,6 +648,9 @@ def read_gbq_function(function_name: str):
 close_session = global_session.close_session
 reset_session = global_session.close_session
 
+# SQL compilation uses recursive algorithms on deep trees
+# A 10M tree depth should be sufficient to generate any SQL that is under the BigQuery length limit
+sys.setrecursionlimit(10000000)
 
 # Use __all__ to let type checkers know what is part of the public API.
 __all___ = [
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index bd5930e508..f9d9a1b075 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -3376,3 +3376,10 @@ def test_df_dot_operator_series(
         bf_result,
         pd_result,
     )
+
+
+def test_recursion_limit(scalars_df_index):
+    scalars_df_index = scalars_df_index[["int64_too", "int64_col", "float64_col"]]
+    for i in range(400):
+        scalars_df_index = scalars_df_index + 4
+    scalars_df_index.to_pandas()

From 2fc4e8e69d5bf6be69470f3014a8231907de9122 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Fri, 10 Nov 2023 04:47:43 +0000
Subject: [PATCH 2/3] don't decrease recursion limit

---
 bigframes/pandas/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py
index 1fc1f5db79..d209e85a98 100644
--- a/bigframes/pandas/__init__.py
+++ b/bigframes/pandas/__init__.py
@@ -650,7 +650,7 @@ def read_gbq_function(function_name: str):
 
 # SQL compilation uses recursive algorithms on deep trees
 # A 10M tree depth should be sufficient to generate any SQL that is under the BigQuery length limit
-sys.setrecursionlimit(10000000)
+sys.setrecursionlimit(max(10000000, sys.getrecursionlimit()))
 
 # Use __all__ to let type checkers know what is part of the public API.
 __all___ = [

From 66d9b02480e0f903933539060f5de6fb15aea4a6 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Fri, 10 Nov 2023 22:22:32 +0000
Subject: [PATCH 3/3] add comment explaining _node_hash method

---
 bigframes/core/nodes.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py
index ae4c0c2203..a8f89703f2 100644
--- a/bigframes/core/nodes.py
+++ b/bigframes/core/nodes.py
@@ -65,6 +65,9 @@ def session(self):
             return sessions[0]
         return None
 
+    # BigFrameNode trees can be very deep, so it's important to avoid recalculating the hash from scratch.
+    # Each subclass of BigFrameNode should use this property to implement __hash__,
+    # as the default dataclass-generated __hash__ method is not cached.
     @functools.cached_property
     def _node_hash(self):
         return hash(tuple(hash(getattr(self, field.name)) for field in fields(self)))
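
The series relies on two Python details that are easy to miss, so here is a minimal standalone sketch of the same pattern. It is not bigframes code: Node, LeafNode, UnaryWrapperNode, the 500-node depth, and the 100000 limit are illustrative stand-ins. The two mechanisms are the ones the patches use: functools.cached_property works on a frozen dataclass because it writes the computed value directly into the instance __dict__ rather than going through the __setattr__ that frozen dataclasses block, and an explicit __hash__ in each subclass body keeps @dataclass(frozen=True) from generating its own uncached hash for that subclass.

from __future__ import annotations

import functools
import sys
from dataclasses import dataclass, fields


@dataclass(frozen=True)
class Node:
    # Computed at most once per instance: cached_property stores the result
    # in the instance __dict__, bypassing the frozen-dataclass __setattr__.
    @functools.cached_property
    def _node_hash(self) -> int:
        return hash(tuple(hash(getattr(self, f.name)) for f in fields(self)))


@dataclass(frozen=True)
class LeafNode(Node):
    name: str

    # An explicit __hash__ in the class body stops @dataclass(frozen=True)
    # from installing its own generated (uncached) hash for this subclass.
    def __hash__(self) -> int:
        return self._node_hash


@dataclass(frozen=True)
class UnaryWrapperNode(Node):
    child: Node
    label: str

    def __hash__(self) -> int:
        return self._node_hash


# The first hash of a deep tree still recurses once per level, hence the
# recursion-limit bump; as in patch 2, never lower the limit below its
# current value.
sys.setrecursionlimit(max(100000, sys.getrecursionlimit()))

tree: Node = LeafNode("source")
for i in range(500):
    tree = UnaryWrapperNode(tree, f"op{i}")

first = hash(tree)   # walks the chain once, filling each node's cache
second = hash(tree)  # served from the root's cached value, no recursion
assert first == second

Without the per-subclass __hash__, every frozen dataclass subclass would regenerate a __hash__ that rehashes all fields, and therefore the whole subtree, on each call; that recomputation is what makes repeated hashing of deep trees expensive during SQL compilation.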