From fb7eb1d184fb68ee9c5fdb38ab3cdea2c638480a Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 18 Jan 2024 01:26:01 +0000 Subject: [PATCH 1/9] feat: add DataFrame.eval, DataFrame.query --- bigframes/dataframe.py | 11 ++++ bigframes/eval.py | 56 +++++++++++++++++++ tests/system/small/test_dataframe.py | 36 ++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 44 +++++++++++++++ 4 files changed, 147 insertions(+) create mode 100644 bigframes/eval.py diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 2a20a4aabb..838ad46577 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1483,6 +1483,17 @@ def sort_values( ) return DataFrame(self._block.order_by(ordering)) + def eval(self, expr: str) -> DataFrame: + import bigframes.eval as bf_eval + + return bf_eval.eval(self, expr, target=self) + + def query(self, expr: str) -> DataFrame: + import bigframes.eval as bf_eval + + eval_result = bf_eval.eval(self, expr, target=None) + return self[eval_result] + def value_counts( self, subset: typing.Union[blocks.Label, typing.Sequence[blocks.Label]] = None, diff --git a/bigframes/eval.py b/bigframes/eval.py new file mode 100644 index 0000000000..581375ce67 --- /dev/null +++ b/bigframes/eval.py @@ -0,0 +1,56 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
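+
+# Implementation note: expression parsing is delegated to ``pandas.eval``.
+# Columns and index levels are wrapped in ``EvalSeries`` (defined below), whose
+# ``values`` property returns a ``FakeNumpyArray`` carrying only a dtype, so
+# pandas can infer the result schema without materializing any data.
+#
+# Intended usage (illustrative):
+#   df.eval("c = a + b")    # add column "c" computed from columns "a" and "b"
+#   df.query("a > @limit")  # filter rows using a local variable ``limit``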
+
+import dataclasses
+from typing import Optional
+
+import pandas
+import pandas.core.computation.parsing as pandas_eval_parsing
+
+import bigframes.dataframe as dataframe
+import bigframes.dtypes
+import bigframes.series as series
+
+
+def eval(df: dataframe.DataFrame, expr: str, target: Optional[dataframe.DataFrame]):
+    index_resolver = {
+        pandas_eval_parsing.clean_column_name(str(name)): EvalSeries(
+            df.index.get_level_values(level).to_series()
+        )
+        for level, name in enumerate(df.index.names)
+    }
+    column_resolver = {
+        pandas_eval_parsing.clean_column_name(str(name)): EvalSeries(series)
+        for name, series in df.items()
+    }
+    return pandas.eval(
+        expr=expr, level=3, target=target, resolvers=(index_resolver, column_resolver)  # type: ignore
+    )
+
+
+@dataclasses.dataclass
+class FakeNumpyArray:
+    dtype: bigframes.dtypes.Dtype
+
+
+class EvalSeries(series.Series):
+    """Slightly modified Series that works better with pandas.eval."""
+
+    def __init__(self, underlying: series.Series):
+        super().__init__(data=underlying._block)
+
+    @property
+    def values(self):
+        """Returns a fake numpy array with only a dtype property, so that eval can determine the schema without actually downloading the data."""
+        return FakeNumpyArray(self.dtype)
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 4ae31fa4a0..e3974454d1 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -3592,6 +3592,42 @@ def test_df_to_orc(scalars_df_index, scalars_pandas_df_index):
     assert bf_result == pd_result
 
 
+@pytest.mark.parametrize(
+    ("expr",),
+    [
+        ("new_col = int64_col + int64_too",),
+        ("new_col = (rowindex > 3) | bool_col",),
+        ("int64_too = bool_col\nnew_col2 = rowindex",),
+    ],
+)
+def test_df_eval(scalars_dfs, expr):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bf_result = scalars_df.eval(expr).to_pandas()
+    pd_result = scalars_pandas_df.eval(expr)
+
+    pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+    ("expr",),
+    [
+        ("int64_col > int64_too",),
+        ("bool_col",),
+        ("((int64_col - int64_too) % @local_var) == 0",),
+    ],
+)
+def test_df_query(scalars_dfs, expr):
+    # local_var is referenced in expressions
+    local_var = 3  # NOQA
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bf_result = scalars_df.query(expr).to_pandas()
+    pd_result = scalars_pandas_df.query(expr)
+
+    pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
 @pytest.mark.parametrize(
     ("subset", "normalize", "ascending", "dropna"),
     [
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 93fba9f3aa..e5b8b624d9 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -4869,6 +4869,50 @@ def value_counts(
         """
         raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
 
+    def eval(self, expr: str) -> DataFrame:
+        """
+        Evaluate a string describing operations on DataFrame columns.
+
+        Operates on columns only, not specific rows or elements. This allows
+        `eval` to run arbitrary code, which can make you vulnerable to code
+        injection if you pass user input to this function.
+
+        Args:
+            expr (str):
+                The expression string to evaluate.
+
+        Returns:
+            DataFrame
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def query(self, expr: str) -> DataFrame | None:
+        """
+        Query the columns of a DataFrame with a boolean expression.
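+
+        ``df.query(expr)`` is equivalent to ``df[df.eval(expr)]``: the
+        expression is evaluated to a boolean Series, which is then used to
+        filter the rows of the DataFrame.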
+ + Args: + expr (str): + The query string to evaluate. + + You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. + + You can refer to column names that are not valid Python variable names + by surrounding them in backticks. Thus, column names containing spaces + or punctuations (besides underscores) or starting with digits must be + surrounded by backticks. (For example, a column named "Area (cm^2)" would + be referenced as ```Area (cm^2)```). Column names which are Python keywords + (like "list", "for", "import", etc) cannot be used. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. + + Returns: + DataFrame + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def interpolate(self, method: str = "linear"): """ Fill NaN values using an interpolation method. From 7e5d26617a2a670f46b7cff48179b7f7c373d279 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Tue, 30 Jan 2024 18:54:45 +0000 Subject: [PATCH 2/9] address pr comments --- bigframes/{ => core}/eval.py | 7 +- bigframes/dataframe.py | 4 +- .../pandas/core/computation/parsing.py | 196 ++++++++++++++++++ 3 files changed, 202 insertions(+), 5 deletions(-) rename bigframes/{ => core}/eval.py (83%) create mode 100644 third_party/bigframes_vendored/pandas/core/computation/parsing.py diff --git a/bigframes/eval.py b/bigframes/core/eval.py similarity index 83% rename from bigframes/eval.py rename to bigframes/core/eval.py index 581375ce67..53a1483099 100644 --- a/bigframes/eval.py +++ b/bigframes/core/eval.py @@ -16,24 +16,25 @@ from typing import Optional import pandas -import pandas.core.computation.parsing as pandas_eval_parsing import bigframes.dataframe as dataframe import bigframes.dtypes import bigframes.series as series +import third_party.bigframes_vendored.pandas.core.computation.parsing as vendored_pandas_eval_parsing def eval(df: dataframe.DataFrame, expr: str, target: Optional[dataframe.DataFrame]): index_resolver = { - pandas_eval_parsing.clean_column_name(str(name)): EvalSeries( + vendored_pandas_eval_parsing.clean_column_name(str(name)): EvalSeries( df.index.get_level_values(level).to_series() ) for level, name in enumerate(df.index.names) } column_resolver = { - pandas_eval_parsing.clean_column_name(str(name)): EvalSeries(series) + vendored_pandas_eval_parsing.clean_column_name(str(name)): EvalSeries(series) for name, series in df.items() } + # 3 Levels: user -> logging wrapper -> dataframe -> eval helper (this) return pandas.eval( expr=expr, level=3, target=target, resolvers=(index_resolver, column_resolver) # type: ignore ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 838ad46577..7e9b407bc3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1484,12 +1484,12 @@ def sort_values( return DataFrame(self._block.order_by(ordering)) def eval(self, expr: str) -> DataFrame: - import bigframes.eval as bf_eval + import bigframes.core.eval as bf_eval return bf_eval.eval(self, expr, target=self) def query(self, expr: str) -> DataFrame: - import bigframes.eval as bf_eval + import bigframes.core.eval as bf_eval eval_result = bf_eval.eval(self, expr, target=None) return self[eval_result] diff --git a/third_party/bigframes_vendored/pandas/core/computation/parsing.py b/third_party/bigframes_vendored/pandas/core/computation/parsing.py new file mode 100644 index 0000000000..e54f459735 --- /dev/null +++ 
b/third_party/bigframes_vendored/pandas/core/computation/parsing.py
@@ -0,0 +1,196 @@
+# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/parsing.py
+"""
+:func:`~pandas.eval` source string parsing functions
+"""
+from __future__ import annotations
+
+from io import StringIO
+from keyword import iskeyword
+import token
+import tokenize
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Hashable, Iterator
+
+# A token value Python's tokenizer probably will never use.
+BACKTICK_QUOTED_STRING = 100
+
+
+def create_valid_python_identifier(name: str) -> str:
+    """
+    Create a valid Python identifier from any string.
+
+    Check if the name contains any special characters. If it does, the
+    special characters are replaced by a special string and a prefix is
+    added.
+
+    Raises
+    ------
+    SyntaxError
+        If the returned name is not a valid Python identifier, raise an exception.
+        This can happen if there is a hashtag in the name, as the tokenizer will
+        then terminate and not find the backtick.
+        But also for characters that fall out of the range of (U+0001..U+007F).
+    """
+    if name.isidentifier() and not iskeyword(name):
+        return name
+
+    # Create a dict with the special characters and their replacement string.
+    # EXACT_TOKEN_TYPES contains these special characters
+    # token.tok_name contains a readable description of the replacement string.
+    special_characters_replacements = {
+        char: f"_{token.tok_name[tokval]}_"
+        for char, tokval in (tokenize.EXACT_TOKEN_TYPES.items())
+    }
+    special_characters_replacements.update(
+        {
+            " ": "_",
+            "?": "_QUESTIONMARK_",
+            "!": "_EXCLAMATIONMARK_",
+            "$": "_DOLLARSIGN_",
+            "€": "_EUROSIGN_",
+            "°": "_DEGREESIGN_",
+            # Including quotes works, but there are exceptions.
+            "'": "_SINGLEQUOTE_",
+            '"': "_DOUBLEQUOTE_",
+            # Currently not possible. Terminates parser and won't find backtick.
+            # "#": "_HASH_",
+        }
+    )
+
+    name = "".join([special_characters_replacements.get(char, char) for char in name])
+    name = f"BACKTICK_QUOTED_STRING_{name}"
+
+    if not name.isidentifier():
+        raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")
+
+    return name
+
+
+def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]:
+    """
+    Clean up a column name if surrounded by backticks.
+
+    Backtick quoted strings are indicated by a certain tokval value. If a
+    string is a backtick quoted token it will be processed by
+    :func:`create_valid_python_identifier` so that the parser can find this
+    string when the query is executed.
+    In this case the tok will get the NAME tokval.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    tok : Tuple[int, str]
+        Either the input token or the replacement values
+    """
+    toknum, tokval = tok
+    if toknum == BACKTICK_QUOTED_STRING:
+        return tokenize.NAME, create_valid_python_identifier(tokval)
+    return toknum, tokval
+
+
+def clean_column_name(name: Hashable) -> Hashable:
+    """
+    Function to emulate the cleaning of a backtick quoted name.
+
+    The purpose of this function is to see what happens to the name of an
+    identifier if it goes through the process of being parsed as Python code
+    inside a backtick quoted string and then being cleaned
+    (removed of any special characters).
+
+    Parameters
+    ----------
+    name : hashable
+        Name to be cleaned.
+
+    Returns
+    -------
+    name : hashable
+        Returns the name after tokenizing and cleaning.
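+
+        For example, ``Area (cm^2)`` is cleaned to
+        ``BACKTICK_QUOTED_STRING_Area__LPAR_cm_CIRCUMFLEX_2_RPAR_``.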
+
+    Notes
+    -----
+    For some cases, a name cannot be converted to a valid Python identifier.
+    In that case :func:`tokenize_string` raises a SyntaxError, and we just
+    return the name unmodified.
+
+    If this name was used in the query string (making the query call
+    impossible), an error will be raised by
+    :func:`tokenize_backtick_quoted_string` instead, which is not caught and
+    propagates to the user level.
+    """
+    try:
+        tokenized = tokenize_string(f"`{name}`")
+        tokval = next(tokenized)[1]
+        return create_valid_python_identifier(tokval)
+    except SyntaxError:
+        return name
+
+
+def tokenize_backtick_quoted_string(
+    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
+) -> tuple[int, str]:
+    """
+    Creates a token from a backtick quoted string.
+
+    Moves the token_generator forward until right after the next backtick.
+
+    Parameters
+    ----------
+    token_generator : Iterator[tokenize.TokenInfo]
+        The generator that yields the tokens of the source string (Tuple[int, str]).
+        The generator is at the first token after the backtick (`)
+
+    source : str
+        The Python source code string.
+
+    string_start : int
+        This is the start of the backtick quoted string inside the source string.
+
+    Returns
+    -------
+    tok : Tuple[int, str]
+        The token that represents the backtick quoted string.
+        The integer is equal to BACKTICK_QUOTED_STRING (100).
+    """
+    for _, tokval, start, _, _ in token_generator:
+        if tokval == "`":
+            string_end = start[1]
+            break
+
+    return BACKTICK_QUOTED_STRING, source[string_start:string_end]
+
+
+def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
+    """
+    Tokenize a Python source code string.
+
+    Parameters
+    ----------
+    source : str
+        The Python source code string.
+
+    Returns
+    -------
+    tok_generator : Iterator[Tuple[int, str]]
+        An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
+    """
+    line_reader = StringIO(source).readline
+    token_generator = tokenize.generate_tokens(line_reader)
+
+    # Loop over all tokens until a backtick (`) is found.
+    # Then, take all tokens until the next backtick to form a backtick quoted string.
+    for toknum, tokval, start, _, _ in token_generator:
+        if tokval == "`":
+            try:
+                yield tokenize_backtick_quoted_string(
+                    token_generator, source, string_start=start[1] + 1
+                )
+            except Exception as err:
+                raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
+        else:
+            yield toknum, tokval

From 4d85e0ea22c06bc3ca6c5051d162db5379d59712 Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Wed, 6 Mar 2024 21:03:07 +0000
Subject: [PATCH 3/9] add docstring, disable new tests for legacy pandas

---
 bigframes/core/eval.py               | 14 ++++++++++++++
 tests/system/small/test_dataframe.py |  2 ++
 2 files changed, 16 insertions(+)

diff --git a/bigframes/core/eval.py b/bigframes/core/eval.py
index 53a1483099..0afaa0c11f 100644
--- a/bigframes/core/eval.py
+++ b/bigframes/core/eval.py
@@ -24,6 +24,20 @@


 def eval(df: dataframe.DataFrame, expr: str, target: Optional[dataframe.DataFrame]):
+    """
+    Evaluate the given Python expression.
+
+    Args:
+        df (DataFrame):
+            Columns of this dataframe will be used to resolve variables in the expression.
+        expr (str):
+            One or more Python expressions to evaluate.
+        target (DataFrame or None):
+            The evaluation result will be written to the target if provided.
+
+    Returns:
+        Result of evaluation.
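+
+    Note:
+        Index levels and columns are exposed to ``pandas.eval`` through the
+        resolver mappings built below, keyed by ``clean_column_name`` so that
+        backtick-quoted names in the expression resolve correctly.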
+ """ index_resolver = { vendored_pandas_eval_parsing.clean_column_name(str(name)): EvalSeries( df.index.get_level_values(level).to_series() diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index cb993e1325..ba0c9d9205 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3663,6 +3663,7 @@ def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result +@skip_legacy_pandas @pytest.mark.parametrize( ("expr",), [ @@ -3680,6 +3681,7 @@ def test_df_eval(scalars_dfs, expr): pd.testing.assert_frame_equal(bf_result, pd_result) +@skip_legacy_pandas @pytest.mark.parametrize( ("expr",), [ From 2b0d902a2e6d1b7b4d9bb215b886f03024306b80 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 21 Mar 2024 20:09:06 +0000 Subject: [PATCH 4/9] vendor the pandas eval implementation --- bigframes/core/eval.py | 6 +- .../bigframes_vendored/pandas/core/common.py | 26 + .../pandas/core/computation/align.py | 226 +++++ .../pandas/core/computation/common.py | 48 + .../pandas/core/computation/engines.py | 121 +++ .../pandas/core/computation/eval.py | 412 +++++++++ .../pandas/core/computation/expr.py | 831 ++++++++++++++++++ .../pandas/core/computation/ops.py | 605 +++++++++++++ .../pandas/core/computation/scope.py | 355 ++++++++ .../pandas/core/dtypes/inference.py | 31 + .../pandas/util/_exceptions.py | 29 + .../pandas/util/_validators.py | 58 ++ 12 files changed, 2745 insertions(+), 3 deletions(-) create mode 100644 third_party/bigframes_vendored/pandas/core/computation/align.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/common.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/engines.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/eval.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/expr.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/ops.py create mode 100644 third_party/bigframes_vendored/pandas/core/computation/scope.py create mode 100644 third_party/bigframes_vendored/pandas/core/dtypes/inference.py create mode 100644 third_party/bigframes_vendored/pandas/util/_exceptions.py create mode 100644 third_party/bigframes_vendored/pandas/util/_validators.py diff --git a/bigframes/core/eval.py b/bigframes/core/eval.py index 0afaa0c11f..692ca1c7bb 100644 --- a/bigframes/core/eval.py +++ b/bigframes/core/eval.py @@ -15,12 +15,12 @@ import dataclasses from typing import Optional -import pandas +import bigframes_vendored.pandas.core.computation.eval as vendored_pandas_eval +import bigframes_vendored.pandas.core.computation.parsing as vendored_pandas_eval_parsing import bigframes.dataframe as dataframe import bigframes.dtypes import bigframes.series as series -import third_party.bigframes_vendored.pandas.core.computation.parsing as vendored_pandas_eval_parsing def eval(df: dataframe.DataFrame, expr: str, target: Optional[dataframe.DataFrame]): @@ -49,7 +49,7 @@ def eval(df: dataframe.DataFrame, expr: str, target: Optional[dataframe.DataFram for name, series in df.items() } # 3 Levels: user -> logging wrapper -> dataframe -> eval helper (this) - return pandas.eval( + return vendored_pandas_eval.eval( expr=expr, level=3, target=target, resolvers=(index_resolver, column_resolver) # type: ignore ) diff --git a/third_party/bigframes_vendored/pandas/core/common.py b/third_party/bigframes_vendored/pandas/core/common.py index ded5a22b8f..872a64db6c 100644 --- 
a/third_party/bigframes_vendored/pandas/core/common.py +++ b/third_party/bigframes_vendored/pandas/core/common.py @@ -3,6 +3,8 @@ from typing import Callable, TYPE_CHECKING +from bigframes_vendored.pandas.core.dtypes.inference import iterable_not_string + if TYPE_CHECKING: from bigframes_vendored.pandas.pandas._typing import T @@ -40,3 +42,27 @@ def pipe( return func(*args, **kwargs) else: return func(obj, *args, **kwargs) + + +def flatten(line): + """ + Flatten an arbitrarily nested sequence. + + Parameters + ---------- + line : sequence + The non string sequence to flatten + + Notes + ----- + This doesn't consider strings sequences. + + Returns + ------- + flattened : generator + """ + for element in line: + if iterable_not_string(element): + yield from flatten(element) + else: + yield element diff --git a/third_party/bigframes_vendored/pandas/core/computation/align.py b/third_party/bigframes_vendored/pandas/core/computation/align.py new file mode 100644 index 0000000000..2608dabe7a --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/align.py @@ -0,0 +1,226 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/align.py +""" +Core eval alignment algorithms. +""" +from __future__ import annotations + +from functools import partial, wraps +from typing import Callable, TYPE_CHECKING +import warnings + +import bigframes_vendored.pandas.core.common as com +from bigframes_vendored.pandas.core.computation.common import result_type_many +from bigframes_vendored.pandas.util._exceptions import find_stack_level +import numpy as np +from pandas.errors import PerformanceWarning + +if TYPE_CHECKING: + from collections.abc import Sequence + + from bigframes_vendored.pandas.core.generic import NDFrame + from bigframes_vendored.pandas.core.indexes.base import Index + from pandas._typing import F + + +def _align_core_single_unary_op( + term, +) -> tuple[partial | type[NDFrame], dict[str, Index] | None]: + typ: partial | type[NDFrame] + axes: dict[str, Index] | None = None + + if isinstance(term.value, np.ndarray): + typ = partial(np.asanyarray, dtype=term.value.dtype) + else: + typ = type(term.value) + if hasattr(term.value, "axes"): + axes = _zip_axes_from_type(typ, term.value.axes) + + return typ, axes + + +def _zip_axes_from_type( + typ: type[NDFrame], new_axes: Sequence[Index] +) -> dict[str, Index]: + return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} + + +def _any_pandas_objects(terms) -> bool: + """ + Check a sequence of terms for instances of PandasObject. 
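+
+    In this vendored copy, "PandasObject" means the vendored ``NDFrame`` and
+    ``Index`` base classes checked by ``is_pandas_object`` below, so BigFrames
+    objects count as pandas objects here.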
+    """
+    return any(is_pandas_object(term.value) for term in terms)
+
+
+def _filter_special_cases(f) -> Callable[[F], F]:
+    @wraps(f)
+    def wrapper(terms):
+        # single unary operand
+        if len(terms) == 1:
+            return _align_core_single_unary_op(terms[0])
+
+        term_values = (term.value for term in terms)
+
+        # we don't have any pandas objects
+        if not _any_pandas_objects(terms):
+            return result_type_many(*term_values), None
+
+        return f(terms)
+
+    return wrapper
+
+
+@_filter_special_cases
+def _align_core(terms):
+    term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")]
+    term_dims = [terms[i].value.ndim for i in term_index]
+
+    from pandas import Series
+
+    ndims = Series(dict(zip(term_index, term_dims)))
+
+    # initial axes are the axes of the largest-axis'd term
+    biggest = terms[ndims.idxmax()].value
+    typ = biggest._constructor
+    axes = biggest.axes
+    naxes = len(axes)
+    gt_than_one_axis = naxes > 1
+
+    for value in (terms[i].value for i in term_index):
+        value_is_series = is_series(value)
+        is_series_and_gt_one_axis = value_is_series and gt_than_one_axis
+
+        for axis, items in enumerate(value.axes):
+            if is_series_and_gt_one_axis:
+                ax, itm = naxes - 1, value.index
+            else:
+                ax, itm = axis, items
+
+            if not axes[ax].is_(itm):
+                axes[ax] = axes[ax].join(itm, how="outer")
+
+    for i, ndim in ndims.items():
+        for axis, items in zip(range(ndim), axes):
+            ti = terms[i].value
+
+            if hasattr(ti, "reindex"):
+                transpose = is_series(ti) and naxes > 1
+                reindexer = axes[naxes - 1] if transpose else items
+
+                term_axis_size = len(ti.axes[axis])
+                reindexer_size = len(reindexer)
+
+                ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
+                if ordm >= 1 and reindexer_size >= 10000:
+                    w = (
+                        f"Alignment difference on axis {axis} is larger "
+                        f"than an order of magnitude on term {repr(terms[i].name)}, "
+                        f"by more than {ordm:.4g}; performance may suffer."
+                    )
+                    warnings.warn(
+                        w, category=PerformanceWarning, stacklevel=find_stack_level()
+                    )
+
+                obj = ti.reindex(reindexer, axis=axis, copy=False)
+                terms[i].update(obj)
+
+        terms[i].update(terms[i].value.values)
+
+    return typ, _zip_axes_from_type(typ, axes)
+
+
+def align_terms(terms):
+    """
+    Align a set of terms.
+    """
+    try:
+        # flatten the parse tree (a nested list, really)
+        terms = list(com.flatten(terms))
+    except TypeError:
+        # can't iterate so it must just be a constant or single variable
+        if is_series_or_dataframe(terms.value):
+            typ = type(terms.value)
+            return typ, _zip_axes_from_type(typ, terms.value.axes)
+        return np.result_type(terms.type), None
+
+    # if all resolved variables are numeric scalars
+    if all(term.is_scalar for term in terms):
+        return result_type_many(*(term.value for term in terms)).type, None
+
+    # perform the main alignment
+    typ, axes = _align_core(terms)
+    return typ, axes
+
+
+def reconstruct_object(typ, obj, axes, dtype):
+    """
+    Reconstruct an object given its type, raw value, and possibly empty
+    (None) axes.
+
+    Parameters
+    ----------
+    typ : object
+        A type
+    obj : object
+        The value to use in the type constructor
+    axes : dict
+        The axes to use to construct the resulting pandas object
+
+    Returns
+    -------
+    ret : typ
+        An object of type ``typ`` with the value `obj` and possible axes
+        `axes`.
+ """ + try: + typ = typ.type + except AttributeError: + pass + + res_t = np.result_type(obj.dtype, dtype) + + if not isinstance(typ, partial) and is_pandas_type(typ): + return typ(obj, dtype=res_t, **axes) + + # special case for pathological things like ~True/~False + if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: + ret_value = res_t.type(obj) + else: + ret_value = typ(obj).astype(res_t) + # The condition is to distinguish 0-dim array (returned in case of + # scalar) and 1 element array + # e.g. np.array(0) and np.array([0]) + if ( + len(obj.shape) == 1 + and len(obj) == 1 + and not isinstance(ret_value, np.ndarray) + ): + ret_value = np.array([ret_value]).astype(res_t) + + return ret_value + + +# Custom to recognize BigFrames types +def is_series(obj) -> bool: + from bigframes_vendored.pandas.core.series import Series + + return isinstance(obj, Series) + + +def is_series_or_dataframe(obj) -> bool: + from bigframes_vendored.pandas.core.frame import NDFrame + + return isinstance(obj, NDFrame) + + +def is_pandas_object(obj) -> bool: + from bigframes_vendored.pandas.core.frame import NDFrame + from bigframes_vendored.pandas.core.indexes.base import Index + + return isinstance(obj, NDFrame) or isinstance(obj, Index) + + +def is_pandas_type(type) -> bool: + from bigframes_vendored.pandas.core.frame import NDFrame + from bigframes_vendored.pandas.core.indexes.base import Index + + return issubclass(type, NDFrame) or issubclass(type, Index) diff --git a/third_party/bigframes_vendored/pandas/core/computation/common.py b/third_party/bigframes_vendored/pandas/core/computation/common.py new file mode 100644 index 0000000000..7775489d0d --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/common.py @@ -0,0 +1,48 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/common.py +from __future__ import annotations + +from functools import reduce + +import numpy as np +from pandas._config import get_option + + +def ensure_decoded(s) -> str: + """ + If we have bytes, decode them to unicode. + """ + if isinstance(s, (np.bytes_, bytes)): + s = s.decode(get_option("display.encoding")) + return s + + +def result_type_many(*arrays_and_dtypes): + """ + Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32) + argument limit. 
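+
+    Falls back to a pairwise ``functools.reduce`` when more than 32 arguments
+    are passed, and to pandas' common-type machinery when extension array
+    dtypes are involved.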
+ """ + try: + return np.result_type(*arrays_and_dtypes) + except ValueError: + # we have > NPY_MAXARGS terms in our expression + return reduce(np.result_type, arrays_and_dtypes) + except TypeError: + from pandas.core.dtypes.cast import find_common_type + from pandas.core.dtypes.common import is_extension_array_dtype + + arr_and_dtypes = list(arrays_and_dtypes) + ea_dtypes, non_ea_dtypes = [], [] + for arr_or_dtype in arr_and_dtypes: + if is_extension_array_dtype(arr_or_dtype): + ea_dtypes.append(arr_or_dtype) + else: + non_ea_dtypes.append(arr_or_dtype) + + if non_ea_dtypes: + try: + np_dtype = np.result_type(*non_ea_dtypes) + except ValueError: + np_dtype = reduce(np.result_type, arrays_and_dtypes) + return find_common_type(ea_dtypes + [np_dtype]) + + return find_common_type(ea_dtypes) diff --git a/third_party/bigframes_vendored/pandas/core/computation/engines.py b/third_party/bigframes_vendored/pandas/core/computation/engines.py new file mode 100644 index 0000000000..4713565502 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/engines.py @@ -0,0 +1,121 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/engines.py +""" +Engine classes for :func:`~pandas.eval` +""" +from __future__ import annotations + +import abc +from typing import TYPE_CHECKING + +from bigframes_vendored.pandas.core.computation.align import ( + align_terms, + reconstruct_object, +) +from bigframes_vendored.pandas.core.computation.ops import MATHOPS, REDUCTIONS +from pandas.errors import NumExprClobberingError +from pandas.io.formats import printing + +if TYPE_CHECKING: + from bigframes_vendored.pandas.core.computation.expr import Expr + +_ne_builtins = frozenset(MATHOPS + REDUCTIONS) + + +def _check_ne_builtin_clash(expr: Expr) -> None: + """ + Attempt to prevent foot-shooting in a helpful way. + + Parameters + ---------- + expr : Expr + Terms can contain + """ + names = expr.names + overlap = names & _ne_builtins + + if overlap: + s = ", ".join([repr(x) for x in overlap]) + raise NumExprClobberingError( + f'Variables in expression "{expr}" overlap with builtins: ({s})' + ) + + +class AbstractEngine(metaclass=abc.ABCMeta): + """Object serving as a base class for all engines.""" + + has_neg_frac = False + + def __init__(self, expr) -> None: + self.expr = expr + self.aligned_axes = None + self.result_type = None + + def convert(self) -> str: + """ + Convert an expression for evaluation. + + Defaults to return the expression as a string. + """ + return printing.pprint_thing(self.expr) + + def evaluate(self) -> object: + """ + Run the engine on the expression. + + This method performs alignment which is necessary no matter what engine + is being used, thus its implementation is in the base class. + + Returns + ------- + object + The result of the passed expression. + """ + if not self._is_aligned: + self.result_type, self.aligned_axes = align_terms(self.expr.terms) + + # make sure no names in resolvers and locals/globals clash + res = self._evaluate() + return reconstruct_object( + self.result_type, res, self.aligned_axes, self.expr.terms.return_type + ) + + @property + def _is_aligned(self) -> bool: + return self.aligned_axes is not None and self.result_type is not None + + @abc.abstractmethod + def _evaluate(self): + """ + Return an evaluated expression. + + Parameters + ---------- + env : Scope + The local and global environment in which to evaluate an + expression. + + Notes + ----- + Must be implemented by subclasses. 
+ """ + + +class PythonEngine(AbstractEngine): + """ + Evaluate an expression in Python space. + + Mostly for testing purposes. + """ + + has_neg_frac = False + + def evaluate(self): + return self.expr() + + def _evaluate(self) -> None: + pass + + +ENGINES: dict[str, type[AbstractEngine]] = { + "python": PythonEngine, +} diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py new file mode 100644 index 0000000000..cbc41101df --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -0,0 +1,412 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/eval.py +""" +Top level ``eval`` module. +""" +from __future__ import annotations + +import tokenize +from typing import TYPE_CHECKING +import warnings + +from bigframes_vendored.pandas.core.computation.engines import ENGINES +from bigframes_vendored.pandas.core.computation.expr import Expr, PARSERS +from bigframes_vendored.pandas.core.computation.parsing import tokenize_string +from bigframes_vendored.pandas.core.computation.scope import ensure_scope +from bigframes_vendored.pandas.core.generic import NDFrame +from bigframes_vendored.pandas.util._exceptions import find_stack_level +from bigframes_vendored.pandas.util._validators import validate_bool_kwarg +from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.io.formats.printing import pprint_thing + +if TYPE_CHECKING: + from pandas.core.computation.ops import BinOp + + +def _check_engine(engine: str | None) -> str: + """ + Make sure a valid engine is passed. + + Parameters + ---------- + engine : str + String to validate. + + Raises + ------ + KeyError + * If an invalid engine is passed. + ImportError + * If numexpr was requested but doesn't exist. + + Returns + ------- + str + Engine name. + """ + from pandas.core.computation.check import NUMEXPR_INSTALLED + from pandas.core.computation.expressions import USE_NUMEXPR + + if engine is None: + engine = "numexpr" if USE_NUMEXPR else "python" + + if engine not in ENGINES: + valid_engines = list(ENGINES.keys()) + raise KeyError( + f"Invalid engine '{engine}' passed, valid engines are {valid_engines}" + ) + + # TODO: validate this in a more general way (thinking of future engines + # that won't necessarily be import-able) + # Could potentially be done on engine instantiation + if engine == "numexpr" and not NUMEXPR_INSTALLED: + raise ImportError( + "'numexpr' is not installed or an unsupported version. Cannot use " + "engine='numexpr' for query/eval if 'numexpr' is not installed" + ) + + return engine + + +def _check_parser(parser: str): + """ + Make sure a valid parser is passed. 
+ + Parameters + ---------- + parser : str + + Raises + ------ + KeyError + * If an invalid parser is passed + """ + if parser not in PARSERS: + raise KeyError( + f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}" + ) + + +def _check_resolvers(resolvers): + if resolvers is not None: + for resolver in resolvers: + if not hasattr(resolver, "__getitem__"): + name = type(resolver).__name__ + raise TypeError( + f"Resolver of type '{name}' does not " + "implement the __getitem__ method" + ) + + +def _check_expression(expr): + """ + Make sure an expression is not an empty string + + Parameters + ---------- + expr : object + An object that can be converted to a string + + Raises + ------ + ValueError + * If expr is an empty string + """ + if not expr: + raise ValueError("expr cannot be an empty string") + + +def _convert_expression(expr) -> str: + """ + Convert an object to an expression. + + This function converts an object to an expression (a unicode string) and + checks to make sure it isn't empty after conversion. This is used to + convert operators to their string representation for recursive calls to + :func:`~pandas.eval`. + + Parameters + ---------- + expr : object + The object to be converted to a string. + + Returns + ------- + str + The string representation of an object. + + Raises + ------ + ValueError + * If the expression is empty. + """ + s = pprint_thing(expr) + _check_expression(s) + return s + + +def _check_for_locals(expr: str, stack_level: int, parser: str): + at_top_of_stack = stack_level == 0 + not_pandas_parser = parser != "pandas" + + if not_pandas_parser: + msg = "The '@' prefix is only supported by the pandas parser" + elif at_top_of_stack: + msg = ( + "The '@' prefix is not allowed in top-level eval calls.\n" + "please refer to your variables by name without the '@' prefix." + ) + + if at_top_of_stack or not_pandas_parser: + for toknum, tokval in tokenize_string(expr): + if toknum == tokenize.OP and tokval == "@": + raise SyntaxError(msg) + + +def eval( + expr: str | BinOp, # we leave BinOp out of the docstr bc it isn't for users + parser: str = "pandas", + engine: str | None = None, + local_dict=None, + global_dict=None, + resolvers=(), + level: int = 0, + target=None, + inplace: bool = False, +): + """ + Evaluate a Python expression as a string using various backends. + + The following arithmetic operations are supported: ``+``, ``-``, ``*``, + ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following + boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). + Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, + :keyword:`or`, and :keyword:`not` with the same semantics as the + corresponding bitwise operators. :class:`~pandas.Series` and + :class:`~pandas.DataFrame` objects are supported and behave as they would + with plain ol' Python evaluation. + + Parameters + ---------- + expr : str + The expression to evaluate. This string cannot contain any Python + `statements + `__, + only Python `expressions + `__. + parser : {'pandas', 'python'}, default 'pandas' + The parser to use to construct the syntax tree from the expression. The + default of ``'pandas'`` parses code slightly different than standard + Python. Alternatively, you can parse an expression using the + ``'python'`` parser to retain strict Python semantics. See the + :ref:`enhancing performance ` documentation for + more details. + engine : {'python', 'numexpr'}, default 'numexpr' + + The engine used to evaluate the expression. 
Supported engines are + + - None : tries to use ``numexpr``, falls back to ``python`` + - ``'numexpr'`` : This default engine evaluates pandas objects using + numexpr for large speed ups in complex expressions with large frames. + - ``'python'`` : Performs operations as if you had ``eval``'d in top + level python. This engine is generally not that useful. + + More backends may be available in the future. + local_dict : dict or None, optional + A dictionary of local variables, taken from locals() by default. + global_dict : dict or None, optional + A dictionary of global variables, taken from globals() by default. + resolvers : list of dict-like or None, optional + A list of objects implementing the ``__getitem__`` special method that + you can use to inject an additional collection of namespaces to use for + variable lookup. For example, this is used in the + :meth:`~DataFrame.query` method to inject the + ``DataFrame.index`` and ``DataFrame.columns`` + variables that refer to their respective :class:`~pandas.DataFrame` + instance attributes. + level : int, optional + The number of prior stack frames to traverse and add to the current + scope. Most users will **not** need to change this parameter. + target : object, optional, default None + This is the target object for assignment. It is used when there is + variable assignment in the expression. If so, then `target` must + support item assignment with string keys, and if a copy is being + returned, it must also support `.copy()`. + inplace : bool, default False + If `target` is provided, and the expression mutates `target`, whether + to modify `target` inplace. Otherwise, return a copy of `target` with + the mutation. + + Returns + ------- + ndarray, numeric scalar, DataFrame, Series, or None + The completion value of evaluating the given code or None if ``inplace=True``. + + Raises + ------ + ValueError + There are many instances where such an error can be raised: + + - `target=None`, but the expression is multiline. + - The expression is multiline, but not all them have item assignment. + An example of such an arrangement is this: + + a = b + 1 + a + 2 + + Here, there are expressions on different lines, making it multiline, + but the last line has no variable assigned to the output of `a + 2`. + - `inplace=True`, but the expression is missing item assignment. + - Item assignment is provided, but the `target` does not support + string item assignment. + - Item assignment is provided and `inplace=False`, but the `target` + does not support the `.copy()` method + + See Also + -------- + DataFrame.query : Evaluates a boolean expression to query the columns + of a frame. + DataFrame.eval : Evaluate a string describing operations on + DataFrame columns. + + Notes + ----- + The ``dtype`` of any objects involved in an arithmetic ``%`` operation are + recursively cast to ``float64``. + + See the :ref:`enhancing performance ` documentation for + more details. 
+ + Examples + -------- + >>> df = pd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) + >>> df + animal age + 0 dog 10 + 1 pig 20 + + We can add a new column using ``pd.eval``: + + >>> pd.eval("double_age = df.age * 2", target=df) + animal age double_age + 0 dog 10 20 + 1 pig 20 40 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + exprs: list[str | BinOp] + if isinstance(expr, str): + _check_expression(expr) + exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""] + else: + # ops.BinOp; for internal compat, not intended to be passed by users + exprs = [expr] + multi_line = len(exprs) > 1 + + if multi_line and target is None: + raise ValueError( + "multi-line expressions are only valid in the " + "context of data, use DataFrame.eval" + ) + engine = _check_engine(engine) + _check_parser(parser) + _check_resolvers(resolvers) + + ret = None + first_expr = True + target_modified = False + + for expr in exprs: + expr = _convert_expression(expr) + _check_for_locals(expr, level, parser) + + # get our (possibly passed-in) scope + env = ensure_scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) + + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) + + if engine == "numexpr" and ( + is_extension_array_dtype(parsed_expr.terms.return_type) + or getattr(parsed_expr.terms, "operand_types", None) is not None + and any( + is_extension_array_dtype(elem) + for elem in parsed_expr.terms.operand_types + ) + ): + warnings.warn( + "Engine has switched to 'python' because numexpr does not support " + "extension array dtypes. Please set your engine to python manually.", + RuntimeWarning, + stacklevel=find_stack_level(), + ) + engine = "python" + + # construct the engine and evaluate the parsed expression + eng = ENGINES[engine] + eng_inst = eng(parsed_expr) + ret = eng_inst.evaluate() + + if parsed_expr.assigner is None: + if multi_line: + raise ValueError( + "Multi-line expressions are only valid " + "if all expressions contain an assignment" + ) + if inplace: + raise ValueError("Cannot operate inplace if there is no assignment") + + # assign if needed + assigner = parsed_expr.assigner + if env.target is not None and assigner is not None: + target_modified = True + + # if returning a copy, copy only on the first assignment + if not inplace and first_expr: + try: + target = env.target + if isinstance(target, NDFrame): + target = target.copy() + except AttributeError as err: + raise ValueError("Cannot return a copy of the target") from err + else: + target = env.target + + # TypeError is most commonly raised (e.g. int, list), but you + # get IndexError if you try to do this assignment on np.ndarray. + # we will ignore numpy warnings here; e.g. if trying + # to use a non-numeric indexer + try: + with warnings.catch_warnings(record=True): + # TODO: Filter the warnings we actually care about here. 
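+                # Use ``.loc`` so an inplace NDFrame mutation stays a column
+                # assignment; any other target falls back to ``__setitem__``.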
+ if inplace and isinstance(target, NDFrame): + target.loc[:, assigner] = ret + else: + target[ # pyright: ignore[reportGeneralTypeIssues] + assigner + ] = ret + except (TypeError, IndexError) as err: + raise ValueError("Cannot assign expression output to target") from err + + if not resolvers: + resolvers = ({assigner: ret},) + else: + # existing resolver needs updated to handle + # case of mutating existing column in copy + for resolver in resolvers: + if assigner in resolver: + resolver[assigner] = ret + break + else: + resolvers += ({assigner: ret},) + + ret = None + first_expr = False + + # We want to exclude `inplace=None` as being False. + if inplace is False: + return target if target_modified else ret diff --git a/third_party/bigframes_vendored/pandas/core/computation/expr.py b/third_party/bigframes_vendored/pandas/core/computation/expr.py new file mode 100644 index 0000000000..6a01ff845c --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/expr.py @@ -0,0 +1,831 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/expr.py +""" +:func:`~pandas.eval` parsers. +""" +from __future__ import annotations + +import ast +from functools import partial, reduce +from keyword import iskeyword +import tokenize +from typing import Callable, TypeVar + +import bigframes_vendored.pandas.core.common as com +from bigframes_vendored.pandas.core.computation.ops import ( + ARITH_OPS_SYMS, + BinOp, + BOOL_OPS_SYMS, + CMP_OPS_SYMS, + Constant, + Div, + FuncNode, + is_term, + LOCAL_TAG, + MATHOPS, + Op, + REDUCTIONS, + Term, + UNARY_OPS_SYMS, + UnaryOp, +) +from bigframes_vendored.pandas.core.computation.parsing import ( + clean_backtick_quoted_toks, + tokenize_string, +) +from bigframes_vendored.pandas.core.computation.scope import Scope +import numpy as np +from pandas.errors import UndefinedVariableError +from pandas.io.formats import printing + + +def _rewrite_assign(tok: tuple[int, str]) -> tuple[int, str]: + """ + Rewrite the assignment operator for PyTables expressions that use ``=`` + as a substitute for ``==``. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + return toknum, "==" if tokval == "=" else tokval + + +def _replace_booleans(tok: tuple[int, str]) -> tuple[int, str]: + """ + Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise + precedence is changed to boolean precedence. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tuple of int, str + Either the input or token or the replacement values + """ + toknum, tokval = tok + if toknum == tokenize.OP: + if tokval == "&": + return tokenize.NAME, "and" + elif tokval == "|": + return tokenize.NAME, "or" + return toknum, tokval + return toknum, tokval + + +def _replace_locals(tok: tuple[int, str]) -> tuple[int, str]: + """ + Replace local variables with a syntactically valid name. 
+ + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + tuple of int, str + Either the input or token or the replacement values + + Notes + ----- + This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as + ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_`` + is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it. + """ + toknum, tokval = tok + if toknum == tokenize.OP and tokval == "@": + return tokenize.OP, LOCAL_TAG + return toknum, tokval + + +def _compose2(f, g): + """ + Compose 2 callables. + """ + return lambda *args, **kwargs: f(g(*args, **kwargs)) + + +def _compose(*funcs): + """ + Compose 2 or more callables. + """ + assert len(funcs) > 1, "At least 2 callables must be passed to compose" + return reduce(_compose2, funcs) + + +def _preparse( + source: str, + f=_compose( + _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks + ), +) -> str: + """ + Compose a collection of tokenization functions. + + Parameters + ---------- + source : str + A Python source code string + f : callable + This takes a tuple of (toknum, tokval) as its argument and returns a + tuple with the same structure but possibly different elements. Defaults + to the composition of ``_rewrite_assign``, ``_replace_booleans``, and + ``_replace_locals``. + + Returns + ------- + str + Valid Python source code + + Notes + ----- + The `f` parameter can be any callable that takes *and* returns input of the + form ``(toknum, tokval)``, where ``toknum`` is one of the constants from + the ``tokenize`` module and ``tokval`` is a string. + """ + assert callable(f), "f must be callable" + return tokenize.untokenize(f(x) for x in tokenize_string(source)) + + +def _is_type(t): + """ + Factory for a type checking function of type ``t`` or tuple of types. + """ + return lambda x: isinstance(x.value, t) + + +_is_list = _is_type(list) +_is_str = _is_type(str) + + +# partition all AST nodes +_all_nodes = frozenset( + node + for node in (getattr(ast, name) for name in dir(ast)) + if isinstance(node, type) and issubclass(node, ast.AST) +) + + +def _filter_nodes(superclass, all_nodes=_all_nodes): + """ + Filter out AST nodes that are subclasses of ``superclass``. 
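+
+    Despite the name, this returns the names of the nodes that *are*
+    subclasses of ``superclass``; callers combine these sets to build the
+    supported and unsupported node lists.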
+ """ + node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass)) + return frozenset(node_names) + + +_all_node_names = frozenset(x.__name__ for x in _all_nodes) +_mod_nodes = _filter_nodes(ast.mod) +_stmt_nodes = _filter_nodes(ast.stmt) +_expr_nodes = _filter_nodes(ast.expr) +_expr_context_nodes = _filter_nodes(ast.expr_context) +_boolop_nodes = _filter_nodes(ast.boolop) +_operator_nodes = _filter_nodes(ast.operator) +_unary_op_nodes = _filter_nodes(ast.unaryop) +_cmp_op_nodes = _filter_nodes(ast.cmpop) +_comprehension_nodes = _filter_nodes(ast.comprehension) +_handler_nodes = _filter_nodes(ast.excepthandler) +_arguments_nodes = _filter_nodes(ast.arguments) +_keyword_nodes = _filter_nodes(ast.keyword) +_alias_nodes = _filter_nodes(ast.alias) + + +# nodes that we don't support directly but are needed for parsing +_hacked_nodes = frozenset(["Assign", "Module", "Expr"]) + + +_unsupported_expr_nodes = frozenset( + [ + "Yield", + "GeneratorExp", + "IfExp", + "DictComp", + "SetComp", + "Repr", + "Lambda", + "Set", + "AST", + "Is", + "IsNot", + ] +) + +# these nodes are low priority or won't ever be supported (e.g., AST) +_unsupported_nodes = ( + _stmt_nodes + | _mod_nodes + | _handler_nodes + | _arguments_nodes + | _keyword_nodes + | _alias_nodes + | _expr_context_nodes + | _unsupported_expr_nodes +) - _hacked_nodes + +# we're adding a different assignment in some cases to be equality comparison +# and we don't want `stmt` and friends in their so get only the class whose +# names are capitalized +_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes +intersection = _unsupported_nodes & _base_supported_nodes +_msg = f"cannot both support and not support {intersection}" +assert not intersection, _msg + + +def _node_not_implemented(node_name: str) -> Callable[..., None]: + """ + Return a function that raises a NotImplementedError with a passed node name. + """ + + def f(self, *args, **kwargs): + raise NotImplementedError(f"'{node_name}' nodes are not implemented") + + return f + + +# should be bound by BaseExprVisitor but that creates a circular dependency: +# _T is used in disallow, but disallow is used to define BaseExprVisitor +# https://github.com/microsoft/pyright/issues/2315 +_T = TypeVar("_T") + + +def disallow(nodes: set[str]) -> Callable[[type[_T]], type[_T]]: + """ + Decorator to disallow certain nodes from parsing. Raises a + NotImplementedError instead. + + Returns + ------- + callable + """ + + def disallowed(cls: type[_T]) -> type[_T]: + # error: "Type[_T]" has no attribute "unsupported_nodes" + cls.unsupported_nodes = () # type: ignore[attr-defined] + for node in nodes: + new_method = _node_not_implemented(node) + name = f"visit_{node}" + # error: "Type[_T]" has no attribute "unsupported_nodes" + cls.unsupported_nodes += (name,) # type: ignore[attr-defined] + setattr(cls, name, new_method) + return cls + + return disallowed + + +def _op_maker(op_class, op_symbol): + """ + Return a function to create an op class with its symbol already passed. + + Returns + ------- + callable + """ + + def f(self, node, *args, **kwargs): + """ + Return a partial function with an Op subclass with an operator already passed. + + Returns + ------- + callable + """ + return partial(op_class, op_symbol, *args, **kwargs) + + return f + + +_op_classes = {"binary": BinOp, "unary": UnaryOp} + + +def add_ops(op_classes): + """ + Decorator to add default implementation of ops. 
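+
+    For each symbol in the class's ``binary_ops``/``unary_ops`` tuples, this
+    attaches a ``visit_<Node>`` method that partially applies the matching
+    ``BinOp`` or ``UnaryOp`` constructor.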
+ """ + + def f(cls): + for op_attr_name, op_class in op_classes.items(): + ops = getattr(cls, f"{op_attr_name}_ops") + ops_map = getattr(cls, f"{op_attr_name}_op_nodes_map") + for op in ops: + op_node = ops_map[op] + if op_node is not None: + made_op = _op_maker(op_class, op) + setattr(cls, f"visit_{op_node}", made_op) + return cls + + return f + + +@disallow(_unsupported_nodes) +@add_ops(_op_classes) +class BaseExprVisitor(ast.NodeVisitor): + """ + Custom ast walker. Parsers of other engines should subclass this class + if necessary. + + Parameters + ---------- + env : Scope + engine : str + parser : str + preparser : callable + """ + + const_type: type[Term] = Constant + term_type = Term + + binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS + binary_op_nodes = ( + "Gt", + "Lt", + "GtE", + "LtE", + "Eq", + "NotEq", + "In", + "NotIn", + "BitAnd", + "BitOr", + "And", + "Or", + "Add", + "Sub", + "Mult", + None, + "Pow", + "FloorDiv", + "Mod", + ) + binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) + + unary_ops = UNARY_OPS_SYMS + unary_op_nodes = "UAdd", "USub", "Invert", "Not" + unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + + rewrite_map = { + ast.Eq: ast.In, + ast.NotEq: ast.NotIn, + ast.In: ast.In, + ast.NotIn: ast.NotIn, + } + + unsupported_nodes: tuple[str, ...] + + def __init__(self, env, engine, parser, preparser=_preparse) -> None: + self.env = env + self.engine = engine + self.parser = parser + self.preparser = preparser + self.assigner = None + + def visit(self, node, **kwargs): + if isinstance(node, str): + clean = self.preparser(node) + try: + node = ast.fix_missing_locations(ast.parse(clean)) + except SyntaxError as e: + if any(iskeyword(x) for x in clean.split()): + e.msg = "Python keyword not valid identifier in numexpr query" + raise e + + method = f"visit_{type(node).__name__}" + visitor = getattr(self, method) + return visitor(node, **kwargs) + + def visit_Module(self, node, **kwargs): + if len(node.body) != 1: + raise SyntaxError("only a single expression is allowed") + expr = node.body[0] + return self.visit(expr, **kwargs) + + def visit_Expr(self, node, **kwargs): + return self.visit(node.value, **kwargs) + + def _rewrite_membership_op(self, node, left, right): + # the kind of the operator (is actually an instance) + op_instance = node.op + op_type = type(op_instance) + + # must be two terms and the comparison operator must be ==/!=/in/not in + if is_term(left) and is_term(right) and op_type in self.rewrite_map: + left_list, right_list = map(_is_list, (left, right)) + left_str, right_str = map(_is_str, (left, right)) + + # if there are any strings or lists in the expression + if left_list or right_list or left_str or right_str: + op_instance = self.rewrite_map[op_type]() + + # pop the string variable out of locals and replace it with a list + # of one string, kind of a hack + if right_str: + name = self.env.add_tmp([right.value]) + right = self.term_type(name, self.env) + + if left_str: + name = self.env.add_tmp([left.value]) + left = self.term_type(name, self.env) + + op = self.visit(op_instance) + return op, op_instance, left, right + + def _maybe_transform_eq_ne(self, node, left=None, right=None): + if left is None: + left = self.visit(node.left, side="left") + if right is None: + right = self.visit(node.right, side="right") + op, op_class, left, right = self._rewrite_membership_op(node, left, right) + return op, op_class, left, right + + def _maybe_downcast_constants(self, left, right): + f32 = np.dtype(np.float32) + if ( + 
left.is_scalar + and hasattr(left, "value") + and not right.is_scalar + and right.return_type == f32 + ): + # right is a float32 array, left is a scalar + name = self.env.add_tmp(np.float32(left.value)) + left = self.term_type(name, self.env) + if ( + right.is_scalar + and hasattr(right, "value") + and not left.is_scalar + and left.return_type == f32 + ): + # left is a float32 array, right is a scalar + name = self.env.add_tmp(np.float32(right.value)) + right = self.term_type(name, self.env) + + return left, right + + def _maybe_eval(self, binop, eval_in_python): + # eval `in` and `not in` (for now) in "partial" python space + # things that can be evaluated in "eval" space will be turned into + # temporary variables. for example, + # [1,2] in a + 2 * b + # in that case a + 2 * b will be evaluated using numexpr, and the "in" + # call will be evaluated using isin (in python space) + return binop.evaluate( + self.env, self.engine, self.parser, self.term_type, eval_in_python + ) + + def _maybe_evaluate_binop( + self, + op, + op_class, + lhs, + rhs, + eval_in_python=("in", "not in"), + maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="), + ): + res = op(lhs, rhs) + + if res.has_invalid_return_type: + raise TypeError( + f"unsupported operand type(s) for {res.op}: " + f"'{lhs.type}' and '{rhs.type}'" + ) + + if self.engine != "pytables" and ( + res.op in CMP_OPS_SYMS + and getattr(lhs, "is_datetime", False) + or getattr(rhs, "is_datetime", False) + ): + # all date ops must be done in python bc numexpr doesn't work + # well with NaT + return self._maybe_eval(res, self.binary_ops) + + if res.op in eval_in_python: + # "in"/"not in" ops are always evaluated in python + return self._maybe_eval(res, eval_in_python) + elif self.engine != "pytables": + if ( + getattr(lhs, "return_type", None) == object + or getattr(rhs, "return_type", None) == object + ): + # evaluate "==" and "!=" in python if either of our operands + # has an object return type + return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) + return res + + def visit_BinOp(self, node, **kwargs): + op, op_class, left, right = self._maybe_transform_eq_ne(node) + left, right = self._maybe_downcast_constants(left, right) + return self._maybe_evaluate_binop(op, op_class, left, right) + + def visit_Div(self, node, **kwargs): + return lambda lhs, rhs: Div(lhs, rhs) + + def visit_UnaryOp(self, node, **kwargs): + op = self.visit(node.op) + operand = self.visit(node.operand) + return op(operand) + + def visit_Name(self, node, **kwargs): + return self.term_type(node.id, self.env, **kwargs) + + # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min + def visit_NameConstant(self, node, **kwargs) -> Term: + return self.const_type(node.value, self.env) + + # TODO(py314): deprecated since Python 3.8. Remove after Python 3.14 is min + def visit_Num(self, node, **kwargs) -> Term: + return self.const_type(node.value, self.env) + + def visit_Constant(self, node, **kwargs) -> Term: + return self.const_type(node.value, self.env) + + # TODO(py314): deprecated since Python 3.8. 
Remove after Python 3.14 is min + def visit_Str(self, node, **kwargs): + name = self.env.add_tmp(node.s) + return self.term_type(name, self.env) + + def visit_List(self, node, **kwargs): + name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts]) + return self.term_type(name, self.env) + + visit_Tuple = visit_List + + def visit_Index(self, node, **kwargs): + """df.index[4]""" + return self.visit(node.value) + + def visit_Subscript(self, node, **kwargs): + from pandas import eval as pd_eval + + value = self.visit(node.value) + slobj = self.visit(node.slice) + result = pd_eval( + slobj, local_dict=self.env, engine=self.engine, parser=self.parser + ) + try: + # a Term instance + v = value.value[result] + except AttributeError: + # an Op instance + lhs = pd_eval( + value, local_dict=self.env, engine=self.engine, parser=self.parser + ) + v = lhs[result] + name = self.env.add_tmp(v) + return self.term_type(name, env=self.env) + + def visit_Slice(self, node, **kwargs): + """df.index[slice(4,6)]""" + lower = node.lower + if lower is not None: + lower = self.visit(lower).value + upper = node.upper + if upper is not None: + upper = self.visit(upper).value + step = node.step + if step is not None: + step = self.visit(step).value + + return slice(lower, upper, step) + + def visit_Assign(self, node, **kwargs): + """ + support a single assignment node, like + + c = a + b + + set the assigner at the top level, must be a Name node which + might or might not exist in the resolvers + + """ + if len(node.targets) != 1: + raise SyntaxError("can only assign a single expression") + if not isinstance(node.targets[0], ast.Name): + raise SyntaxError("left hand side of an assignment must be a single name") + if self.env.target is None: + raise ValueError("cannot assign without a target object") + + try: + assigner = self.visit(node.targets[0], **kwargs) + except UndefinedVariableError: + assigner = node.targets[0].id + + self.assigner = getattr(assigner, "name", assigner) + if self.assigner is None: + raise SyntaxError( + "left hand side of an assignment must be a single resolvable name" + ) + + return self.visit(node.value, **kwargs) + + def visit_Attribute(self, node, **kwargs): + attr = node.attr + value = node.value + + ctx = node.ctx + if isinstance(ctx, ast.Load): + # resolve the value + resolved = self.visit(value).value + try: + v = getattr(resolved, attr) + name = self.env.add_tmp(v) + return self.term_type(name, self.env) + except AttributeError: + # something like datetime.datetime where scope is overridden + if isinstance(value, ast.Name) and value.id == attr: + return resolved + raise + + raise ValueError(f"Invalid Attribute context {type(ctx).__name__}") + + def visit_Call(self, node, side=None, **kwargs): + if isinstance(node.func, ast.Attribute) and node.func.attr != "__call__": + res = self.visit_Attribute(node.func) + elif not isinstance(node.func, ast.Name): + raise TypeError("Only named functions are supported") + else: + try: + res = self.visit(node.func) + except UndefinedVariableError: + # Check if this is a supported function name + try: + res = FuncNode(node.func.id) + except ValueError: + # Raise original error + raise + + if res is None: + # error: "expr" has no attribute "id" + raise ValueError( + f"Invalid function call {node.func.id}" # type: ignore[attr-defined] + ) + if hasattr(res, "value"): + res = res.value + + if isinstance(res, FuncNode): + new_args = [self.visit(arg) for arg in node.args] + + if node.keywords: + raise TypeError( + f'Function "{res.name}" does not 
support keyword arguments' + ) + + return res(*new_args) + + else: + new_args = [self.visit(arg)(self.env) for arg in node.args] + + for key in node.keywords: + if not isinstance(key, ast.keyword): + # error: "expr" has no attribute "id" + raise ValueError( + "keyword error in function call " # type: ignore[attr-defined] + f"'{node.func.id}'" + ) + + if key.arg: + kwargs[key.arg] = self.visit(key.value)(self.env) + + name = self.env.add_tmp(res(*new_args, **kwargs)) + return self.term_type(name=name, env=self.env) + + def translate_In(self, op): + return op + + def visit_Compare(self, node, **kwargs): + ops = node.ops + comps = node.comparators + + # base case: we have something like a CMP b + if len(comps) == 1: + op = self.translate_In(ops[0]) + binop = ast.BinOp(op=op, left=node.left, right=comps[0]) + return self.visit(binop) + + # recursive case: we have a chained comparison, a CMP b CMP c, etc. + left = node.left + values = [] + for op, comp in zip(ops, comps): + new_node = self.visit( + ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)]) + ) + left = comp + values.append(new_node) + return self.visit(ast.BoolOp(op=ast.And(), values=values)) + + def _try_visit_binop(self, bop): + if isinstance(bop, (Op, Term)): + return bop + return self.visit(bop) + + def visit_BoolOp(self, node, **kwargs): + def visitor(x, y): + lhs = self._try_visit_binop(x) + rhs = self._try_visit_binop(y) + + op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs) + return self._maybe_evaluate_binop(op, node.op, lhs, rhs) + + operands = node.values + return reduce(visitor, operands) + + +_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"]) +_numexpr_supported_calls = frozenset(REDUCTIONS + MATHOPS) + + +@disallow( + (_unsupported_nodes | _python_not_supported) + - (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"])) +) +class PandasExprVisitor(BaseExprVisitor): + def __init__( + self, + env, + engine, + parser, + preparser=partial( + _preparse, + f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks), + ), + ) -> None: + super().__init__(env, engine, parser, preparser) + + +@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"])) +class PythonExprVisitor(BaseExprVisitor): + def __init__( + self, env, engine, parser, preparser=lambda source, f=None: source + ) -> None: + super().__init__(env, engine, parser, preparser=preparser) + + +class Expr: + """ + Object encapsulating an expression. + + Parameters + ---------- + expr : str + engine : str, optional, default 'numexpr' + parser : str, optional, default 'pandas' + env : Scope, optional, default None + level : int, optional, default 2 + """ + + env: Scope + engine: str + parser: str + + def __init__( + self, + expr, + engine: str = "numexpr", + parser: str = "pandas", + env: Scope | None = None, + level: int = 0, + ) -> None: + self.expr = expr + self.env = env or Scope(level=level + 1) + self.engine = engine + self.parser = parser + self._visitor = PARSERS[parser](self.env, self.engine, self.parser) + self.terms = self.parse() + + @property + def assigner(self): + return getattr(self._visitor, "assigner", None) + + def __call__(self): + return self.terms(self.env) + + def __repr__(self) -> str: + return printing.pprint_thing(self.terms) + + def __len__(self) -> int: + return len(self.expr) + + def parse(self): + """ + Parse an expression. 
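+
+        Returns
+        -------
+        Term or Op
+            The root node of the parsed expression tree.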
+ """ + return self._visitor.visit(self.expr) + + @property + def names(self): + """ + Get the names in an expression. + """ + if is_term(self.terms): + return frozenset([self.terms.name]) + return frozenset(term.name for term in com.flatten(self.terms)) + + +PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} diff --git a/third_party/bigframes_vendored/pandas/core/computation/ops.py b/third_party/bigframes_vendored/pandas/core/computation/ops.py new file mode 100644 index 0000000000..75b914c876 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/ops.py @@ -0,0 +1,605 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/ops.py +""" +Operator classes for eval. +""" + +from __future__ import annotations + +from datetime import datetime +from functools import partial +import operator +from typing import Callable, Literal, TYPE_CHECKING + +import bigframes_vendored.pandas.core.common as com +from bigframes_vendored.pandas.core.computation.common import ( + ensure_decoded, + result_type_many, +) +from bigframes_vendored.pandas.core.computation.scope import DEFAULT_GLOBALS +import numpy as np +from pandas._libs.tslibs import Timestamp +from pandas.core.dtypes.common import is_list_like, is_scalar +from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator + +REDUCTIONS = ("sum", "prod", "min", "max") + +_unary_math_ops = ( + "sin", + "cos", + "exp", + "log", + "expm1", + "log1p", + "sqrt", + "sinh", + "cosh", + "tanh", + "arcsin", + "arccos", + "arctan", + "arccosh", + "arcsinh", + "arctanh", + "abs", + "log10", + "floor", + "ceil", +) +_binary_math_ops = ("arctan2",) + +MATHOPS = _unary_math_ops + _binary_math_ops + + +LOCAL_TAG = "__pd_eval_local_" + + +class Term: + def __new__(cls, name, env, side=None, encoding=None): + klass = Constant if not isinstance(name, str) else cls + # error: Argument 2 for "super" not an instance of argument 1 + supr_new = super(Term, klass).__new__ # type: ignore[misc] + return supr_new(klass) + + is_local: bool + + def __init__(self, name, env, side=None, encoding=None) -> None: + # name is a str for Term, but may be something else for subclasses + self._name = name + self.env = env + self.side = side + tname = str(name) + self.is_local = tname.startswith(LOCAL_TAG) or tname in DEFAULT_GLOBALS + self._value = self._resolve_name() + self.encoding = encoding + + @property + def local_name(self) -> str: + return self.name.replace(LOCAL_TAG, "") + + def __repr__(self) -> str: + return pprint_thing(self.name) + + def __call__(self, *args, **kwargs): + return self.value + + def evaluate(self, *args, **kwargs) -> Term: + return self + + def _resolve_name(self): + local_name = str(self.local_name) + is_local = self.is_local + if local_name in self.env.scope and isinstance( + self.env.scope[local_name], type + ): + is_local = False + + res = self.env.resolve(local_name, is_local=is_local) + self.update(res) + + if hasattr(res, "ndim") and res.ndim > 2: + raise NotImplementedError( + "N-dimensional objects, where N > 2, are not supported with eval" + ) + return res + + def update(self, value) -> None: + """ + search order for local (i.e., @variable) variables: + + scope, key_variable + [('locals', 'local_name'), + ('globals', 'local_name'), + ('locals', 'key'), + ('globals', 'key')] + """ + key = self.name + + # if it's a variable name (otherwise a constant) + if isinstance(key, str): + 
self.env.swapkey(self.local_name, key, new_value=value) + + self.value = value + + @property + def is_scalar(self) -> bool: + return is_scalar(self._value) + + @property + def type(self): + try: + # potentially very slow for large, mixed dtype frames + return self._value.values.dtype + except AttributeError: + try: + # ndarray + return self._value.dtype + except AttributeError: + # scalar + return type(self._value) + + return_type = type + + @property + def raw(self) -> str: + return f"{type(self).__name__}(name={repr(self.name)}, type={self.type})" + + @property + def is_datetime(self) -> bool: + try: + t = self.type.type + except AttributeError: + t = self.type + + return issubclass(t, (datetime, np.datetime64)) + + @property + def value(self): + return self._value + + @value.setter + def value(self, new_value) -> None: + self._value = new_value + + @property + def name(self): + return self._name + + @property + def ndim(self) -> int: + return self._value.ndim + + +class Constant(Term): + def _resolve_name(self): + return self._name + + @property + def name(self): + return self.value + + def __repr__(self) -> str: + # in python 2 str() of float + # can truncate shorter than repr() + return repr(self.name) + + +_bool_op_map = {"not": "~", "and": "&", "or": "|"} + + +class Op: + """ + Hold an operator of arbitrary arity. + """ + + op: str + + def __init__(self, op: str, operands: Iterable[Term | Op], encoding=None) -> None: + self.op = _bool_op_map.get(op, op) + self.operands = operands + self.encoding = encoding + + def __iter__(self) -> Iterator: + return iter(self.operands) + + def __repr__(self) -> str: + """ + Print a generic n-ary operator and its operands using infix notation. + """ + # recurse over the operands + parened = (f"({pprint_thing(opr)})" for opr in self.operands) + return pprint_thing(f" {self.op} ".join(parened)) + + @property + def return_type(self): + # clobber types to bool if the op is a boolean operator + if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS): + return np.bool_ + return result_type_many(*(term.type for term in com.flatten(self))) + + @property + def has_invalid_return_type(self) -> bool: + types = self.operand_types + obj_dtype_set = frozenset([np.dtype("object")]) + return self.return_type == object and types - obj_dtype_set + + @property + def operand_types(self): + return frozenset(term.type for term in com.flatten(self)) + + @property + def is_scalar(self) -> bool: + return all(operand.is_scalar for operand in self.operands) + + @property + def is_datetime(self) -> bool: + try: + t = self.return_type.type + except AttributeError: + t = self.return_type + + return issubclass(t, (datetime, np.datetime64)) + + +def _in(x, y): + """ + Compute the vectorized membership of ``x in y`` if possible, otherwise + use Python. + """ + try: + return x.isin(y) + except AttributeError: + if is_list_like(x): + try: + return y.isin(x) + except AttributeError: + pass + return x in y + + +def _not_in(x, y): + """ + Compute the vectorized membership of ``x not in y`` if possible, + otherwise use Python. 
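+
+    Examples
+    --------
+    >>> _not_in(3, [1, 2])
+    True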
+ """ + try: + return ~x.isin(y) + except AttributeError: + if is_list_like(x): + try: + return ~y.isin(x) + except AttributeError: + pass + return x not in y + + +CMP_OPS_SYMS = (">", "<", ">=", "<=", "==", "!=", "in", "not in") +_cmp_ops_funcs = ( + operator.gt, + operator.lt, + operator.ge, + operator.le, + operator.eq, + operator.ne, + _in, + _not_in, +) +_cmp_ops_dict = dict(zip(CMP_OPS_SYMS, _cmp_ops_funcs)) + +BOOL_OPS_SYMS = ("&", "|", "and", "or") +_bool_ops_funcs = (operator.and_, operator.or_, operator.and_, operator.or_) +_bool_ops_dict = dict(zip(BOOL_OPS_SYMS, _bool_ops_funcs)) + +ARITH_OPS_SYMS = ("+", "-", "*", "/", "**", "//", "%") +_arith_ops_funcs = ( + operator.add, + operator.sub, + operator.mul, + operator.truediv, + operator.pow, + operator.floordiv, + operator.mod, +) +_arith_ops_dict = dict(zip(ARITH_OPS_SYMS, _arith_ops_funcs)) + +SPECIAL_CASE_ARITH_OPS_SYMS = ("**", "//", "%") +_special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod) +_special_case_arith_ops_dict = dict( + zip(SPECIAL_CASE_ARITH_OPS_SYMS, _special_case_arith_ops_funcs) +) + +_binary_ops_dict = {} + +for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict): + _binary_ops_dict.update(d) + + +def _cast_inplace(terms, acceptable_dtypes, dtype) -> None: + """ + Cast an expression inplace. + + Parameters + ---------- + terms : Op + The expression that should cast. + acceptable_dtypes : list of acceptable numpy.dtype + Will not cast if term's dtype in this list. + dtype : str or numpy.dtype + The dtype to cast to. + """ + dt = np.dtype(dtype) + for term in terms: + if term.type in acceptable_dtypes: + continue + + try: + new_value = term.value.astype(dt) + except AttributeError: + new_value = dt.type(term.value) + term.update(new_value) + + +def is_term(obj) -> bool: + return isinstance(obj, Term) + + +class BinOp(Op): + """ + Hold a binary operator and its operands. + + Parameters + ---------- + op : str + lhs : Term or Op + rhs : Term or Op + """ + + def __init__(self, op: str, lhs, rhs) -> None: + super().__init__(op, (lhs, rhs)) + self.lhs = lhs + self.rhs = rhs + + self._disallow_scalar_only_bool_ops() + + self.convert_values() + + try: + self.func = _binary_ops_dict[op] + except KeyError as err: + # has to be made a list for python3 + keys = list(_binary_ops_dict.keys()) + raise ValueError( + f"Invalid binary operator {repr(op)}, valid operators are {keys}" + ) from err + + def __call__(self, env): + """ + Recursively evaluate an expression in Python space. + + Parameters + ---------- + env : Scope + + Returns + ------- + object + The result of an evaluated expression. + """ + # recurse over the left/right nodes + left = self.lhs(env) + right = self.rhs(env) + + return self.func(left, right) + + def evaluate(self, env, engine: str, parser, term_type, eval_in_python): + """ + Evaluate a binary operation *before* being passed to the engine. 
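+
+        With the ``python`` engine the expression is evaluated directly in
+        Python; otherwise the operators named in ``eval_in_python`` (``in``
+        and ``not in``) are computed eagerly and substituted as temporary
+        terms before the rest is handed to the engine.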
+ + Parameters + ---------- + env : Scope + engine : str + parser : str + term_type : type + eval_in_python : list + + Returns + ------- + term_type + The "pre-evaluated" expression as an instance of ``term_type`` + """ + if engine == "python": + res = self(env) + else: + # recurse over the left/right nodes + + left = self.lhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) + + right = self.rhs.evaluate( + env, + engine=engine, + parser=parser, + term_type=term_type, + eval_in_python=eval_in_python, + ) + + # base cases + if self.op in eval_in_python: + res = self.func(left.value, right.value) + else: + from pandas.core.computation.eval import eval + + res = eval(self, local_dict=env, engine=engine, parser=parser) + + name = env.add_tmp(res) + return term_type(name, env=env) + + def convert_values(self) -> None: + """ + Convert datetimes to a comparable value in an expression. + """ + + def stringify(value): + encoder: Callable + if self.encoding is not None: + encoder = partial(pprint_thing_encoded, encoding=self.encoding) + else: + encoder = pprint_thing + return encoder(value) + + lhs, rhs = self.lhs, self.rhs + + if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.is_scalar: + v = rhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = Timestamp(ensure_decoded(v)) + if v.tz is not None: + v = v.tz_convert("UTC") + self.rhs.update(v) + + if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.is_scalar: + v = lhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = Timestamp(ensure_decoded(v)) + if v.tz is not None: + v = v.tz_convert("UTC") + self.lhs.update(v) + + def _disallow_scalar_only_bool_ops(self): + rhs = self.rhs + lhs = self.lhs + + # GH#24883 unwrap dtype if necessary to ensure we have a type object + rhs_rt = rhs.return_type + rhs_rt = getattr(rhs_rt, "type", rhs_rt) + lhs_rt = lhs.return_type + lhs_rt = getattr(lhs_rt, "type", lhs_rt) + if ( + (lhs.is_scalar or rhs.is_scalar) + and self.op in _bool_ops_dict + and ( + not ( + issubclass(rhs_rt, (bool, np.bool_)) + and issubclass(lhs_rt, (bool, np.bool_)) + ) + ) + ): + raise NotImplementedError("cannot evaluate scalar only bool ops") + + +def isnumeric(dtype) -> bool: + return issubclass(np.dtype(dtype).type, np.number) + + +class Div(BinOp): + """ + Div operator to special case casting. + + Parameters + ---------- + lhs, rhs : Term or Op + The Terms or Ops in the ``/`` expression. + """ + + def __init__(self, lhs, rhs) -> None: + super().__init__("/", lhs, rhs) + + if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type): + raise TypeError( + f"unsupported operand type(s) for {self.op}: " + f"'{lhs.return_type}' and '{rhs.return_type}'" + ) + + # do not upcast float32s to float64 un-necessarily + acceptable_dtypes = [np.float32, np.float64] + _cast_inplace(com.flatten(self), acceptable_dtypes, np.float64) + + +UNARY_OPS_SYMS = ("+", "-", "~", "not") +_unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) +_unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) + + +class UnaryOp(Op): + """ + Hold a unary operator and its operands. + + Parameters + ---------- + op : str + The token used to represent the operator. + operand : Term or Op + The Term or Op operand to the operator. + + Raises + ------ + ValueError + * If no function associated with the passed operator token is found. 
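+
+    Notes
+    -----
+    ``not`` is mapped to ``operator.invert`` (the same function as ``~``)
+    so that boolean negation can be applied elementwise to array operands.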
+ """ + + def __init__(self, op: Literal["+", "-", "~", "not"], operand) -> None: + super().__init__(op, (operand,)) + self.operand = operand + + try: + self.func = _unary_ops_dict[op] + except KeyError as err: + raise ValueError( + f"Invalid unary operator {repr(op)}, " + f"valid operators are {UNARY_OPS_SYMS}" + ) from err + + def __call__(self, env) -> MathCall: + operand = self.operand(env) + # error: Cannot call function of unknown type + return self.func(operand) # type: ignore[operator] + + def __repr__(self) -> str: + return pprint_thing(f"{self.op}({self.operand})") + + @property + def return_type(self) -> np.dtype: + operand = self.operand + if operand.return_type == np.dtype("bool"): + return np.dtype("bool") + if isinstance(operand, Op) and ( + operand.op in _cmp_ops_dict or operand.op in _bool_ops_dict + ): + return np.dtype("bool") + return np.dtype("int") + + +class MathCall(Op): + def __init__(self, func, args) -> None: + super().__init__(func.name, args) + self.func = func + + def __call__(self, env): + # error: "Op" not callable + operands = [op(env) for op in self.operands] # type: ignore[operator] + return self.func.func(*operands) + + def __repr__(self) -> str: + operands = map(str, self.operands) + return pprint_thing(f"{self.op}({','.join(operands)})") + + +class FuncNode: + def __init__(self, name: str) -> None: + if name not in MATHOPS: + raise ValueError(f'"{name}" is not a supported function') + self.name = name + self.func = getattr(np, name) + + def __call__(self, *args): + return MathCall(self, args) diff --git a/third_party/bigframes_vendored/pandas/core/computation/scope.py b/third_party/bigframes_vendored/pandas/core/computation/scope.py new file mode 100644 index 0000000000..bfd7eb1d12 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/computation/scope.py @@ -0,0 +1,355 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/computation/scope.py +""" +Module for scope operations +""" +from __future__ import annotations + +from collections import ChainMap +import datetime +import inspect +from io import StringIO +import itertools +import pprint +import struct +import sys +from typing import TypeVar + +import numpy as np +from pandas._libs.tslibs import Timestamp +from pandas.errors import UndefinedVariableError + +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") + + +# https://docs.python.org/3/library/collections.html#chainmap-examples-and-recipes +class DeepChainMap(ChainMap[_KT, _VT]): + """ + Variant of ChainMap that allows direct updates to inner scopes. + + Only works when all passed mapping are mutable. + """ + + def __setitem__(self, key: _KT, value: _VT) -> None: + for mapping in self.maps: + if key in mapping: + mapping[key] = value + return + self.maps[0][key] = value + + def __delitem__(self, key: _KT) -> None: + """ + Raises + ------ + KeyError + If `key` doesn't exist. + """ + for mapping in self.maps: + if key in mapping: + del mapping[key] + return + raise KeyError(key) + + +def ensure_scope( + level: int, global_dict=None, local_dict=None, resolvers=(), target=None +) -> Scope: + """Ensure that we are grabbing the correct scope.""" + return Scope( + level + 1, + global_dict=global_dict, + local_dict=local_dict, + resolvers=resolvers, + target=target, + ) + + +def _replacer(x) -> str: + """ + Replace a number with its hexadecimal representation. Used to tag + temporary variables with their calling scope's id. 
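+
+    Examples
+    --------
+    >>> _replacer(255)
+    '0xff'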
+    """
+    # get the hex repr of the byte value (iterating a bytes object yields
+    # ints in py3, so ord() is only needed for str input)
+    try:
+        hexin = ord(x)
+    except TypeError:
+        # bytes literals masquerade as ints when iterating in py3
+        hexin = x
+
+    return hex(hexin)
+
+
+def _raw_hex_id(obj) -> str:
+    """Return the padded hexadecimal id of ``obj``."""
+    # interpret as a pointer since that's really what id returns
+    packed = struct.pack("@P", id(obj))
+    return "".join([_replacer(x) for x in packed])
+
+
+DEFAULT_GLOBALS = {
+    "Timestamp": Timestamp,
+    "datetime": datetime.datetime,
+    "True": True,
+    "False": False,
+    "list": list,
+    "tuple": tuple,
+    "inf": np.inf,
+    "Inf": np.inf,
+}
+
+
+def _get_pretty_string(obj) -> str:
+    """
+    Return a prettier version of obj.
+
+    Parameters
+    ----------
+    obj : object
+        Object to pretty print
+
+    Returns
+    -------
+    str
+        Pretty print object repr
+    """
+    sio = StringIO()
+    pprint.pprint(obj, stream=sio)
+    return sio.getvalue()
+
+
+class Scope:
+    """
+    Object to hold scope, with a few bells to deal with some custom syntax
+    and contexts added by pandas.
+
+    Parameters
+    ----------
+    level : int
+    global_dict : dict or None, optional, default None
+    local_dict : dict or Scope or None, optional, default None
+    resolvers : list-like or None, optional, default None
+    target : object
+
+    Attributes
+    ----------
+    level : int
+    scope : DeepChainMap
+    target : object
+    temps : dict
+    """
+
+    __slots__ = ["level", "scope", "target", "resolvers", "temps"]
+    level: int
+    scope: DeepChainMap
+    resolvers: DeepChainMap
+    temps: dict
+
+    def __init__(
+        self, level: int, global_dict=None, local_dict=None, resolvers=(), target=None
+    ) -> None:
+        self.level = level + 1
+
+        # shallow copy because we don't want to keep filling this up with what
+        # was there before if there are multiple calls to Scope/_ensure_scope
+        self.scope = DeepChainMap(DEFAULT_GLOBALS.copy())
+        self.target = target
+
+        if isinstance(local_dict, Scope):
+            self.scope.update(local_dict.scope)
+            if local_dict.target is not None:
+                self.target = local_dict.target
+            self._update(local_dict.level)
+
+        frame = sys._getframe(self.level)
+
+        try:
+            # shallow copy here because we don't want to replace what's in
+            # scope when we align terms (alignment accesses the underlying
+            # numpy array of pandas objects)
+            scope_global = self.scope.new_child(
+                (global_dict if global_dict is not None else frame.f_globals).copy()
+            )
+            self.scope = DeepChainMap(scope_global)
+            if not isinstance(local_dict, Scope):
+                scope_local = self.scope.new_child(
+                    (local_dict if local_dict is not None else frame.f_locals).copy()
+                )
+                self.scope = DeepChainMap(scope_local)
+        finally:
+            del frame
+
+        # assumes that resolvers are going from outermost scope to inner
+        if isinstance(local_dict, Scope):
+            resolvers += tuple(local_dict.resolvers.maps)
+        self.resolvers = DeepChainMap(*resolvers)
+        self.temps = {}
+
+    def __repr__(self) -> str:
+        scope_keys = _get_pretty_string(list(self.scope.keys()))
+        res_keys = _get_pretty_string(list(self.resolvers.keys()))
+        return f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})"
+
+    @property
+    def has_resolvers(self) -> bool:
+        """
+        Return whether we have any extra scope.
+
+        For example, DataFrames pass their columns as resolvers during calls to
+        ``DataFrame.eval()`` and ``DataFrame.query()``.
+
+        Returns
+        -------
+        hr : bool
+        """
+        return bool(len(self.resolvers))
+
+    def resolve(self, key: str, is_local: bool):
+        """
+        Resolve a variable name in a possibly local context.
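+
+        A ``@``-prefixed local is looked up in the scope chain; any other
+        name is looked up in the resolvers (e.g. DataFrame columns) when
+        present, otherwise in the scope chain, with temporary variables as
+        a last resort.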
+ + Parameters + ---------- + key : str + A variable name + is_local : bool + Flag indicating whether the variable is local or not (prefixed with + the '@' symbol) + + Returns + ------- + value : object + The value of a particular variable + """ + try: + # only look for locals in outer scope + if is_local: + return self.scope[key] + + # not a local variable so check in resolvers if we have them + if self.has_resolvers: + return self.resolvers[key] + + # if we're here that means that we have no locals and we also have + # no resolvers + assert not is_local and not self.has_resolvers + return self.scope[key] + except KeyError: + try: + # last ditch effort we look in temporaries + # these are created when parsing indexing expressions + # e.g., df[df > 0] + return self.temps[key] + except KeyError as err: + raise UndefinedVariableError(key, is_local) from err + + def swapkey(self, old_key: str, new_key: str, new_value=None) -> None: + """ + Replace a variable name, with a potentially new value. + + Parameters + ---------- + old_key : str + Current variable name to replace + new_key : str + New variable name to replace `old_key` with + new_value : object + Value to be replaced along with the possible renaming + """ + if self.has_resolvers: + maps = self.resolvers.maps + self.scope.maps + else: + maps = self.scope.maps + + maps.append(self.temps) + + for mapping in maps: + if old_key in mapping: + mapping[new_key] = new_value + return + + def _get_vars(self, stack, scopes: list[str]) -> None: + """ + Get specifically scoped variables from a list of stack frames. + + Parameters + ---------- + stack : list + A list of stack frames as returned by ``inspect.stack()`` + scopes : sequence of strings + A sequence containing valid stack frame attribute names that + evaluate to a dictionary. For example, ('locals', 'globals') + """ + variables = itertools.product(scopes, stack) + for scope, (frame, _, _, _, _, _) in variables: + try: + d = getattr(frame, f"f_{scope}") + self.scope = DeepChainMap(self.scope.new_child(d)) + finally: + # won't remove it, but DECREF it + # in Py3 this probably isn't necessary since frame won't be + # scope after the loop + del frame + + def _update(self, level: int) -> None: + """ + Update the current scope by going back `level` levels. + + Parameters + ---------- + level : int + """ + sl = level + 1 + + # add sl frames to the scope starting with the + # most distant and overwriting with more current + # makes sure that we can capture variable scope + stack = inspect.stack() + + try: + self._get_vars(stack[:sl], scopes=["locals"]) + finally: + del stack[:], stack + + def add_tmp(self, value) -> str: + """ + Add a temporary variable to the scope. + + Parameters + ---------- + value : object + An arbitrary object to be assigned to a temporary variable. + + Returns + ------- + str + The name of the temporary variable created. + """ + name = f"{type(value).__name__}_{self.ntemps}_{_raw_hex_id(self)}" + + # add to inner most scope + assert name not in self.temps + self.temps[name] = value + assert name in self.temps + + # only increment if the variable gets put in the scope + return name + + @property + def ntemps(self) -> int: + """The number of temporary variables in this scope""" + return len(self.temps) + + @property + def full_scope(self) -> DeepChainMap: + """ + Return the full scope for use with passing to engines transparently + as a mapping. + + Returns + ------- + vars : DeepChainMap + All variables in this scope. 
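+
+        Temporary variables shadow resolvers, which in turn shadow the
+        regular scope, since earlier maps in the chain take precedence.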
+ """ + maps = [self.temps] + self.resolvers.maps + self.scope.maps + return DeepChainMap(*maps) diff --git a/third_party/bigframes_vendored/pandas/core/dtypes/inference.py b/third_party/bigframes_vendored/pandas/core/dtypes/inference.py new file mode 100644 index 0000000000..fcbb4c242f --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/dtypes/inference.py @@ -0,0 +1,31 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/dtypes/inference.py +""" basic inference routines """ + +from __future__ import annotations + +from collections import abc + + +def iterable_not_string(obj) -> bool: + """ + Check if the object is an iterable but not a string. + + Parameters + ---------- + obj : The object to check. + + Returns + ------- + is_iter_not_string : bool + Whether `obj` is a non-string iterable. + + Examples + -------- + >>> iterable_not_string([1, 2, 3]) + True + >>> iterable_not_string("foo") + False + >>> iterable_not_string(1) + False + """ + return isinstance(obj, abc.Iterable) and not isinstance(obj, str) diff --git a/third_party/bigframes_vendored/pandas/util/_exceptions.py b/third_party/bigframes_vendored/pandas/util/_exceptions.py new file mode 100644 index 0000000000..4ca649153a --- /dev/null +++ b/third_party/bigframes_vendored/pandas/util/_exceptions.py @@ -0,0 +1,29 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/util/_exceptions.py +from __future__ import annotations + +import inspect +import os + + +def find_stack_level() -> int: + """ + Find the first place in the stack that is not inside pandas + (tests notwithstanding). + """ + + import pandas as pd + + pkg_dir = os.path.dirname(pd.__file__) + test_dir = os.path.join(pkg_dir, "tests") + + # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow + frame = inspect.currentframe() + n = 0 + while frame: + fname = inspect.getfile(frame) + if fname.startswith(pkg_dir) and not fname.startswith(test_dir): + frame = frame.f_back + n += 1 + else: + break + return n diff --git a/third_party/bigframes_vendored/pandas/util/_validators.py b/third_party/bigframes_vendored/pandas/util/_validators.py new file mode 100644 index 0000000000..1f36e0d528 --- /dev/null +++ b/third_party/bigframes_vendored/pandas/util/_validators.py @@ -0,0 +1,58 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/util/_validators.py +""" +Module that contains many useful utilities +for validating data or function arguments +""" +from __future__ import annotations + +from typing import TypeVar + +from pandas.core.dtypes.common import is_bool + +BoolishT = TypeVar("BoolishT", bool, int) +BoolishNoneT = TypeVar("BoolishNoneT", bool, int, None) + + +def validate_bool_kwarg( + value: BoolishNoneT, + arg_name: str, + none_allowed: bool = True, + int_allowed: bool = False, +) -> BoolishNoneT: + """ + Ensure that argument passed in arg_name can be interpreted as boolean. + + Parameters + ---------- + value : bool + Value to be validated. + arg_name : str + Name of the argument. To be reflected in the error message. + none_allowed : bool, default True + Whether to consider None to be a valid boolean. + int_allowed : bool, default False + Whether to consider integer value to be a valid boolean. + + Returns + ------- + value + The same value as input. + + Raises + ------ + ValueError + If the value is not a valid boolean. 
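+
+    Examples
+    --------
+    >>> validate_bool_kwarg(True, "inplace")
+    True
+    >>> validate_bool_kwarg(1, "copy", int_allowed=True)
+    1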
+ """ + good_value = is_bool(value) + if none_allowed: + good_value = good_value or (value is None) + + if int_allowed: + good_value = good_value or isinstance(value, int) + + if not good_value: + raise ValueError( + f'For argument "{arg_name}" expected type bool, received ' + f"type {type(value).__name__}." + ) + return value # pyright: ignore[reportGeneralTypeIssues] From fc4b26a01e728b4b2c5f74cb34f5f93e08d41d31 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 21 Mar 2024 21:56:46 +0000 Subject: [PATCH 5/9] amend eval docstring --- .../pandas/core/computation/eval.py | 173 ++++++++---------- 1 file changed, 79 insertions(+), 94 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index cbc41101df..c895d1009f 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -185,100 +185,11 @@ def eval( :class:`~pandas.DataFrame` objects are supported and behave as they would with plain ol' Python evaluation. - Parameters - ---------- - expr : str - The expression to evaluate. This string cannot contain any Python - `statements - `__, - only Python `expressions - `__. - parser : {'pandas', 'python'}, default 'pandas' - The parser to use to construct the syntax tree from the expression. The - default of ``'pandas'`` parses code slightly different than standard - Python. Alternatively, you can parse an expression using the - ``'python'`` parser to retain strict Python semantics. See the - :ref:`enhancing performance ` documentation for - more details. - engine : {'python', 'numexpr'}, default 'numexpr' - - The engine used to evaluate the expression. Supported engines are - - - None : tries to use ``numexpr``, falls back to ``python`` - - ``'numexpr'`` : This default engine evaluates pandas objects using - numexpr for large speed ups in complex expressions with large frames. - - ``'python'`` : Performs operations as if you had ``eval``'d in top - level python. This engine is generally not that useful. - - More backends may be available in the future. - local_dict : dict or None, optional - A dictionary of local variables, taken from locals() by default. - global_dict : dict or None, optional - A dictionary of global variables, taken from globals() by default. - resolvers : list of dict-like or None, optional - A list of objects implementing the ``__getitem__`` special method that - you can use to inject an additional collection of namespaces to use for - variable lookup. For example, this is used in the - :meth:`~DataFrame.query` method to inject the - ``DataFrame.index`` and ``DataFrame.columns`` - variables that refer to their respective :class:`~pandas.DataFrame` - instance attributes. - level : int, optional - The number of prior stack frames to traverse and add to the current - scope. Most users will **not** need to change this parameter. - target : object, optional, default None - This is the target object for assignment. It is used when there is - variable assignment in the expression. If so, then `target` must - support item assignment with string keys, and if a copy is being - returned, it must also support `.copy()`. - inplace : bool, default False - If `target` is provided, and the expression mutates `target`, whether - to modify `target` inplace. Otherwise, return a copy of `target` with - the mutation. 
- - Returns - ------- - ndarray, numeric scalar, DataFrame, Series, or None - The completion value of evaluating the given code or None if ``inplace=True``. + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None - Raises - ------ - ValueError - There are many instances where such an error can be raised: - - - `target=None`, but the expression is multiline. - - The expression is multiline, but not all them have item assignment. - An example of such an arrangement is this: - - a = b + 1 - a + 2 - - Here, there are expressions on different lines, making it multiline, - but the last line has no variable assigned to the output of `a + 2`. - - `inplace=True`, but the expression is missing item assignment. - - Item assignment is provided, but the `target` does not support - string item assignment. - - Item assignment is provided and `inplace=False`, but the `target` - does not support the `.copy()` method - - See Also - -------- - DataFrame.query : Evaluates a boolean expression to query the columns - of a frame. - DataFrame.eval : Evaluate a string describing operations on - DataFrame columns. - - Notes - ----- - The ``dtype`` of any objects involved in an arithmetic ``%`` operation are - recursively cast to ``float64``. - - See the :ref:`enhancing performance ` documentation for - more details. - - Examples - -------- - >>> df = pd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) + >>> df = bpd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) >>> df animal age 0 dog 10 @@ -286,10 +197,84 @@ def eval( We can add a new column using ``pd.eval``: - >>> pd.eval("double_age = df.age * 2", target=df) + >>> df.eval("double_age = df.age * 2") animal age double_age 0 dog 10 20 1 pig 20 40 + + Args: + expr (str): + The expression to evaluate. This string cannot contain any Python + `statements + `__, + only Python `expressions + `__. + parser ({'pandas', 'python'}, default 'pandas'): + The parser to use to construct the syntax tree from the expression. The + default of ``'pandas'`` parses code slightly different than standard + Python. Alternatively, you can parse an expression using the + ``'python'`` parser to retain strict Python semantics. See the + :ref:`enhancing performance ` documentation for + more details. + engine ({'python', 'numexpr'}, default 'numexpr'): + + The engine used to evaluate the expression. Supported engines are + + - None : tries to use ``numexpr``, falls back to ``python`` + - ``'numexpr'`` : This default engine evaluates pandas objects using + numexpr for large speed ups in complex expressions with large frames. + - ``'python'`` : Performs operations as if you had ``eval``'d in top + level python. This engine is generally not that useful. + + More backends may be available in the future. + local_dict (dict or None, optional): + A dictionary of local variables, taken from locals() by default. + global_dict (dict or None, optional): + A dictionary of global variables, taken from globals() by default. + resolvers (list of dict-like or None, optional): + A list of objects implementing the ``__getitem__`` special method that + you can use to inject an additional collection of namespaces to use for + variable lookup. For example, this is used in the + :meth:`~DataFrame.query` method to inject the + ``DataFrame.index`` and ``DataFrame.columns`` + variables that refer to their respective :class:`~pandas.DataFrame` + instance attributes. 
+ level (int, optional): + The number of prior stack frames to traverse and add to the current + scope. Most users will **not** need to change this parameter. + target (object, optional, default None): + This is the target object for assignment. It is used when there is + variable assignment in the expression. If so, then `target` must + support item assignment with string keys, and if a copy is being + returned, it must also support `.copy()`. + inplace (bool, default False): + If `target` is provided, and the expression mutates `target`, whether + to modify `target` inplace. Otherwise, return a copy of `target` with + the mutation. + + Returns: + ndarray, numeric scalar, DataFrame, Series, or None: + The completion value of evaluating the given code or None if ``inplace=True``. + + Raises: + ValueError: + There are many instances where such an error can be raised: + + - `target=None`, but the expression is multiline. + - The expression is multiline, but not all them have item assignment. + An example of such an arrangement is this: + + a = b + 1 + a + 2 + + Here, there are expressions on different lines, making it multiline, + but the last line has no variable assigned to the output of `a + 2`. + - `inplace=True`, but the expression is missing item assignment. + - Item assignment is provided, but the `target` does not support + string item assignment. + - Item assignment is provided and `inplace=False`, but the `target` + does not support the `.copy()` method + """ inplace = validate_bool_kwarg(inplace, "inplace") From 59888b4a57b70e022717745f871e753c0ed4f12d Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Thu, 21 Mar 2024 23:26:01 +0000 Subject: [PATCH 6/9] fix doctest expectation --- .../bigframes_vendored/pandas/core/computation/eval.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index c895d1009f..11a3c35e19 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -194,6 +194,8 @@ def eval( animal age 0 dog 10 1 pig 20 + + [2 rows x 2 columns] We can add a new column using ``pd.eval``: @@ -201,6 +203,8 @@ def eval( animal age double_age 0 dog 10 20 1 pig 20 40 + + [2 rows x 3 columns] Args: expr (str): From 838ff14f8af844228c614a0010fc2543654afb65 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Fri, 22 Mar 2024 16:59:45 +0000 Subject: [PATCH 7/9] amend doctest --- .../pandas/core/computation/eval.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index 11a3c35e19..0f568567c8 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -186,25 +186,25 @@ def eval( with plain ol' Python evaluation. 
**Examples:** - >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None - - >>> df = bpd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) - >>> df - animal age - 0 dog 10 - 1 pig 20 - - [2 rows x 2 columns] - - We can add a new column using ``pd.eval``: - - >>> df.eval("double_age = df.age * 2") - animal age double_age - 0 dog 10 20 - 1 pig 20 40 - - [2 rows x 3 columns] + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]}) + >>> df + animal age + 0 dog 10 + 1 pig 20 + + [2 rows x 2 columns] + + We can add a new column using ``pd.eval``: + + >>> df.eval("double_age = age * 2") + animal age double_age + 0 dog 10 20 + 1 pig 20 40 + + [2 rows x 3 columns] Args: expr (str): From b143097ca33ba29a69d4bbe0337571c021046604 Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Sat, 23 Mar 2024 00:34:04 +0000 Subject: [PATCH 8/9] pr comments --- .../pandas/core/computation/engines.py | 27 ----- .../pandas/core/computation/eval.py | 39 +------ .../pandas/core/computation/expr.py | 3 - .../bigframes_vendored/pandas/core/frame.py | 108 ++++++++++++++++++ 4 files changed, 111 insertions(+), 66 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/computation/engines.py b/third_party/bigframes_vendored/pandas/core/computation/engines.py index 4713565502..15fd48b237 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/engines.py +++ b/third_party/bigframes_vendored/pandas/core/computation/engines.py @@ -5,40 +5,13 @@ from __future__ import annotations import abc -from typing import TYPE_CHECKING from bigframes_vendored.pandas.core.computation.align import ( align_terms, reconstruct_object, ) -from bigframes_vendored.pandas.core.computation.ops import MATHOPS, REDUCTIONS -from pandas.errors import NumExprClobberingError from pandas.io.formats import printing -if TYPE_CHECKING: - from bigframes_vendored.pandas.core.computation.expr import Expr - -_ne_builtins = frozenset(MATHOPS + REDUCTIONS) - - -def _check_ne_builtin_clash(expr: Expr) -> None: - """ - Attempt to prevent foot-shooting in a helpful way. - - Parameters - ---------- - expr : Expr - Terms can contain - """ - names = expr.names - overlap = names & _ne_builtins - - if overlap: - s = ", ".join([repr(x) for x in overlap]) - raise NumExprClobberingError( - f'Variables in expression "{expr}" overlap with builtins: ({s})' - ) - class AbstractEngine(metaclass=abc.ABCMeta): """Object serving as a base class for all engines.""" diff --git a/third_party/bigframes_vendored/pandas/core/computation/eval.py b/third_party/bigframes_vendored/pandas/core/computation/eval.py index 0f568567c8..56d60174a6 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/eval.py +++ b/third_party/bigframes_vendored/pandas/core/computation/eval.py @@ -13,9 +13,7 @@ from bigframes_vendored.pandas.core.computation.parsing import tokenize_string from bigframes_vendored.pandas.core.computation.scope import ensure_scope from bigframes_vendored.pandas.core.generic import NDFrame -from bigframes_vendored.pandas.util._exceptions import find_stack_level from bigframes_vendored.pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.common import is_extension_array_dtype from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: @@ -35,19 +33,15 @@ def _check_engine(engine: str | None) -> str: ------ KeyError * If an invalid engine is passed. 
- ImportError - * If numexpr was requested but doesn't exist. Returns ------- str Engine name. """ - from pandas.core.computation.check import NUMEXPR_INSTALLED - from pandas.core.computation.expressions import USE_NUMEXPR if engine is None: - engine = "numexpr" if USE_NUMEXPR else "python" + engine = "python" if engine not in ENGINES: valid_engines = list(ENGINES.keys()) @@ -55,15 +49,6 @@ def _check_engine(engine: str | None) -> str: f"Invalid engine '{engine}' passed, valid engines are {valid_engines}" ) - # TODO: validate this in a more general way (thinking of future engines - # that won't necessarily be import-able) - # Could potentially be done on engine instantiation - if engine == "numexpr" and not NUMEXPR_INSTALLED: - raise ImportError( - "'numexpr' is not installed or an unsupported version. Cannot use " - "engine='numexpr' for query/eval if 'numexpr' is not installed" - ) - return engine @@ -220,13 +205,11 @@ def eval( ``'python'`` parser to retain strict Python semantics. See the :ref:`enhancing performance ` documentation for more details. - engine ({'python', 'numexpr'}, default 'numexpr'): + engine ({'python'}, default None): The engine used to evaluate the expression. Supported engines are - - None : tries to use ``numexpr``, falls back to ``python`` - - ``'numexpr'`` : This default engine evaluates pandas objects using - numexpr for large speed ups in complex expressions with large frames. + - None : defaults to ``python`` - ``'python'`` : Performs operations as if you had ``eval``'d in top level python. This engine is generally not that useful. @@ -319,22 +302,6 @@ def eval( parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) - if engine == "numexpr" and ( - is_extension_array_dtype(parsed_expr.terms.return_type) - or getattr(parsed_expr.terms, "operand_types", None) is not None - and any( - is_extension_array_dtype(elem) - for elem in parsed_expr.terms.operand_types - ) - ): - warnings.warn( - "Engine has switched to 'python' because numexpr does not support " - "extension array dtypes. Please set your engine to python manually.", - RuntimeWarning, - stacklevel=find_stack_level(), - ) - engine = "python" - # construct the engine and evaluate the parsed expression eng = ENGINES[engine] eng_inst = eng(parsed_expr) diff --git a/third_party/bigframes_vendored/pandas/core/computation/expr.py b/third_party/bigframes_vendored/pandas/core/computation/expr.py index 6a01ff845c..44f649e59d 100644 --- a/third_party/bigframes_vendored/pandas/core/computation/expr.py +++ b/third_party/bigframes_vendored/pandas/core/computation/expr.py @@ -21,9 +21,7 @@ FuncNode, is_term, LOCAL_TAG, - MATHOPS, Op, - REDUCTIONS, Term, UNARY_OPS_SYMS, UnaryOp, @@ -738,7 +736,6 @@ def visitor(x, y): _python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"]) -_numexpr_supported_calls = frozenset(REDUCTIONS + MATHOPS) @disallow( diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f45877a4d1..7eed1c5c07 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4911,6 +4911,68 @@ def eval(self, expr: str) -> DataFrame: `eval` to run arbitrary code, which can make you vulnerable to code injection if you pass user input to this function. 
+ **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + [5 rows x 2 columns] + >>> df.eval('A + B') + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. + + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + + [5 rows x 3 columns] + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + [5 rows x 2 columns] + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval( + ... ''' + ... C = A + B + ... D = A - B + ... ''' + ... ) + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + + [5 rows x 4 columns] + + Args: expr (str): The expression string to evaluate. @@ -4924,6 +4986,52 @@ def query(self, expr: str) -> DataFrame | None: """ Query the columns of a DataFrame with a boolean expression. + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'A': range(1, 6), + ... 'B': range(10, 0, -2), + ... 'C C': range(10, 5, -1)}) + >>> df + A B C C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + + [5 rows x 3 columns] + >>> df.query('A > B') + A B C C + 4 5 2 6 + + [1 rows x 3 columns] + + The previous expression is equivalent to + + >>> df[df.A > df.B] + A B C C + 4 5 2 6 + + [1 rows x 3 columns] + + For columns with spaces in their name, you can use backtick quoting. + + >>> df.query('B == `C C`') + A B C C + 0 1 10 10 + + [1 rows x 3 columns] + + The previous expression is equivalent to + + >>> df[df.B == df['C C']] + A B C C + 0 1 10 10 + + [1 rows x 3 columns] + Args: expr (str): The query string to evaluate. From 6887fc721ca4efb4738817efc7b894f6670d335b Mon Sep 17 00:00:00 2001 From: Trevor Bergeron Date: Sat, 23 Mar 2024 20:47:42 +0000 Subject: [PATCH 9/9] Fix doctest for eval --- third_party/bigframes_vendored/pandas/core/frame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 1f6faffe9a..2640cce6da 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2865,6 +2865,7 @@ def cov(self, *, numeric_only) -> DataFrame: Returns: DataFrame: The covariance matrix of the series of the DataFrame. """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def update( self, other, join: str = "left", overwrite: bool = True, filter_func=None @@ -4959,7 +4960,7 @@ def eval(self, expr: str) -> DataFrame: 2 9 3 8 4 7 - dtype: int64 + dtype: Int64 Assignment is allowed though by default the original DataFrame is not modified.