From 8b8155fef9c5cd36cfabf728ccebf6a14a1cbbda Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 22 Jan 2025 13:11:37 -0800 Subject: [PATCH 01/11] feat: Add DataFrame.mask method (#1302) --- bigframes/dataframe.py | 3 + tests/system/small/test_dataframe.py | 11 +++ .../bigframes_vendored/pandas/core/frame.py | 92 +++++++++++++++++++ 3 files changed, 106 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ce57661919..7f60f1c769 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2365,6 +2365,9 @@ def where(self, cond, other=None): result.columns.names = self.columns.names return result + def mask(self, cond, other=None): + return self.where(~cond, other=other) + def dropna( self, *, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e7d6ad67e1..93c865536c 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -331,6 +331,17 @@ def test_where_series_cond(scalars_df_index, scalars_pandas_df_index): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_mask_series_cond(scalars_df_index, scalars_pandas_df_index): + cond_bf = scalars_df_index["int64_col"] > 0 + cond_pd = scalars_pandas_df_index["int64_col"] > 0 + + bf_df = scalars_df_index[["int64_too", "int64_col", "float64_col"]] + pd_df = scalars_pandas_df_index[["int64_too", "int64_col", "float64_col"]] + bf_result = bf_df.mask(cond_bf, bf_df + 1).to_pandas() + pd_result = pd_df.mask(cond_pd, pd_df + 1) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + def test_where_series_multi_index(scalars_df_index, scalars_pandas_df_index): # Test when a dataframe has multi-index or multi-columns. 
columns = ["int64_col", "float64_col"] diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index c8ca1b74b5..bf4d2f2d0c 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -2048,6 +2048,98 @@ def where(self, cond, other): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def mask(self, cond, other): + """Replace values where the condition is True. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'a': [20, 10, 0], 'b': [0, 10, 20]}) + >>> df + a b + 0 20 0 + 1 10 10 + 2 0 20 + + [3 rows x 2 columns] + + You can filter the values in the dataframe based on a condition. The + values matching the condition would be replaced, and not matching would be + kept. The default replacement value is ``NA``. For example, when the + condition is a dataframe: + + >>> df.mask(df > 0) + a b + 0 <NA> 0 + 1 <NA> <NA> + 2 0 <NA> + + [3 rows x 2 columns] + + You can specify a custom replacement value for matching values. + + >>> df.mask(df > 0, -1) + a b + 0 -1 0 + 1 -1 -1 + 2 0 -1 + + [3 rows x 2 columns] + + Besides dataframe, the condition can be a series too. For example: + + >>> df.mask(df['a'] > 10, -1) + a b + 0 -1 -1 + 1 10 10 + 2 0 20 + + [3 rows x 2 columns] + + As for the replacement, it can be a dataframe too. For example: + + >>> df.mask(df > 10, -df) + a b + 0 -20 0 + 1 10 10 + 2 0 -20 + + [3 rows x 2 columns] + + >>> df.mask(df['a'] > 10, -df) + a b + 0 -20 0 + 1 10 10 + 2 0 20 + + [3 rows x 2 columns] + + Please note, replacement doesn't support Series for now. In pandas, when + specifying a Series as replacement, the axis value should be specified + at the same time, which is not supported in bigframes DataFrame. + + Args: + cond (bool Series/DataFrame, array-like, or callable): + Where cond is False, keep the original value. 
Where True, replace + with corresponding value from other. If cond is callable, it is + computed on the Series/DataFrame and returns boolean + Series/DataFrame or array. The callable must not change input + Series/DataFrame (though pandas doesn’t check it). + other (scalar, DataFrame, or callable): + Entries where cond is True are replaced with corresponding value + from other. If other is callable, it is computed on the + DataFrame and returns scalar or DataFrame. The callable must not + change input DataFrame (though pandas doesn’t check it). If not + specified, entries will be filled with the corresponding NULL + value (np.nan for numpy dtypes, pd.NA for extension dtypes). + + Returns: + DataFrame: DataFrame after the replacement. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ---------------------------------------------------------------------- # Sorting From 9597ba90644fc35ba8ea3c790cdf689b2db1c3d6 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 22 Jan 2025 17:11:22 -0800 Subject: [PATCH 02/11] refactor: Make window op node support non-unary ops (#1295) --- bigframes/core/__init__.py | 3 +- bigframes/core/compile/aggregate_compiler.py | 3 +- bigframes/core/compile/compiled.py | 48 ++++++++++++-------- bigframes/core/compile/compiler.py | 3 +- bigframes/core/compile/polars/compiler.py | 26 +++++++++-- bigframes/core/nodes.py | 18 ++++---- bigframes/operations/aggregations.py | 2 +- 7 files changed, 65 insertions(+), 38 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index ee9917f619..0bae094777 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -405,8 +405,7 @@ def project_window_op( ArrayValue( nodes.WindowOpNode( child=self.node, - column_name=ex.deref(column_name), - op=op, + expression=ex.UnaryAggregation(op, ex.deref(column_name)), window_spec=window_spec, output_name=ids.ColumnId(output_name), never_skip_nulls=never_skip_nulls, diff --git 
a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index f97856efa5..7a018a662e 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -479,10 +479,9 @@ def _( return _apply_window_if_present(column.dense_rank(), window) + 1 -@compile_unary_agg.register +@compile_nullary_agg.register def _( op: agg_ops.RowNumberOp, - column: ibis_types.Column, window=None, ) -> ibis_types.IntegerValue: return _apply_window_if_present(ibis_api.row_number(), window) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 526826495e..ae5e2ff8c0 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -861,8 +861,7 @@ def promote_offsets(self, col_id: str) -> OrderedIR: ## Methods that only work with ordering def project_window_op( self, - column_name: ex.DerefOp, - op: agg_ops.UnaryWindowOp, + expression: ex.Aggregation, window_spec: WindowSpec, output_name: str, *, @@ -881,8 +880,11 @@ def project_window_op( # See: https://github.com/ibis-project/ibis/issues/9773 used_exprs = map( self._compile_expression, - itertools.chain( - (column_name,), map(ex.DerefOp, window_spec.all_referenced_columns) + map( + ex.DerefOp, + itertools.chain( + expression.column_references, window_spec.all_referenced_columns + ), ), ) can_directly_window = not any( @@ -890,44 +892,54 @@ def project_window_op( ) if not can_directly_window: return self._reproject_to_table().project_window_op( - column_name, - op, + expression, window_spec, output_name, never_skip_nulls=never_skip_nulls, ) - column = typing.cast(ibis_types.Column, self._compile_expression(column_name)) window = self._ibis_window_from_spec( - window_spec, require_total_order=op.uses_total_row_ordering + window_spec, require_total_order=expression.op.uses_total_row_ordering ) bindings = {col: self._get_ibis_column(col) for col in self.column_ids} window_op = 
agg_compiler.compile_analytic( - ex.UnaryAggregation(op, column_name), + expression, window, bindings=bindings, ) + inputs = tuple( + typing.cast(ibis_types.Column, self._compile_expression(ex.DerefOp(column))) + for column in expression.column_references + ) clauses = [] - if op.skips_nulls and not never_skip_nulls: - clauses.append((column.isnull(), ibis_types.null())) - if window_spec.min_periods: - if op.skips_nulls: + if expression.op.skips_nulls and not never_skip_nulls: + for column in inputs: + clauses.append((column.isnull(), ibis_types.null())) + if window_spec.min_periods and len(inputs) > 0: + if expression.op.skips_nulls: # Most operations do not count NULL values towards min_periods + per_col_does_count = (column.notnull() for column in inputs) + # All inputs must be non-null for observation to count + is_observation = functools.reduce( + lambda x, y: x & y, per_col_does_count + ).cast(int) observation_count = agg_compiler.compile_analytic( - ex.UnaryAggregation(agg_ops.count_op, column_name), + ex.UnaryAggregation(agg_ops.sum_op, ex.deref("_observation_count")), window, - bindings=bindings, + bindings={"_observation_count": is_observation}, ) else: # Operations like count treat even NULLs as valid observations for the sake of min_periods # notnull is just used to convert null values to non-null (FALSE) values to be counted - denulled_value = typing.cast(ibis_types.BooleanColumn, column.notnull()) + is_observation = inputs[0].notnull() observation_count = agg_compiler.compile_analytic( - ex.UnaryAggregation(agg_ops.count_op, ex.deref("_denulled")), + ex.UnaryAggregation( + agg_ops.count_op, ex.deref("_observation_count") + ), window, - bindings={**bindings, "_denulled": denulled_value}, + bindings={"_observation_count": is_observation}, ) clauses.append( ( diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 9e87b4b4e8..9548bb48f4 100644 --- a/bigframes/core/compile/compiler.py +++ 
b/bigframes/core/compile/compiler.py @@ -364,8 +364,7 @@ def compile_aggregate(self, node: nodes.AggregateNode, ordered: bool = True): @_compile_node.register def compile_window(self, node: nodes.WindowOpNode, ordered: bool = True): result = self.compile_ordered_ir(node.child).project_window_op( - node.column_name, - node.op, + node.expression, node.window_spec, node.output_name.sql, never_skip_nulls=node.never_skip_nulls, diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 7d8d54a7f0..6d5b11a5e8 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -16,7 +16,7 @@ import dataclasses import functools import itertools -from typing import cast, Sequence, TYPE_CHECKING +from typing import cast, Sequence, Tuple, TYPE_CHECKING import bigframes.core import bigframes.core.expression as ex @@ -125,6 +125,24 @@ def get_args( f"Aggregation {agg} not yet supported in polars engine." ) + def compile_agg_expr(self, expr: ex.Aggregation): + if isinstance(expr, ex.NullaryAggregation): + inputs: Tuple = () + elif isinstance(expr, ex.UnaryAggregation): + assert isinstance(expr.arg, ex.DerefOp) + inputs = (expr.arg.id.sql,) + elif isinstance(expr, ex.BinaryAggregation): + assert isinstance(expr.left, ex.DerefOp) + assert isinstance(expr.right, ex.DerefOp) + inputs = ( + expr.left.id.sql, + expr.right.id.sql, + ) + else: + raise ValueError(f"Unexpected aggregation: {expr.op}") + + return self.compile_agg_op(expr.op, inputs) + def compile_agg_op(self, op: agg_ops.WindowOp, inputs: Sequence[str] = []): if isinstance(op, agg_ops.ProductOp): # TODO: Need schema to cast back to original type if posisble (eg float back to int) @@ -320,9 +338,9 @@ def compile_sample(self, node: nodes.RandomSampleNode): @compile_node.register def compile_window(self, node: nodes.WindowOpNode): df = self.compile_node(node.child) - agg_expr = self.agg_compiler.compile_agg_op( - node.op, 
[node.column_name.id.sql] - ).alias(node.output_name.sql) + agg_expr = self.agg_compiler.compile_agg_expr(node.expression).alias( + node.output_name.sql + ) # Three window types: completely unbound, grouped and row bounded window = node.window_spec diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index fe79da2bf6..88d55ac70b 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -33,7 +33,6 @@ import bigframes.core.slices as slices import bigframes.core.window_spec as window import bigframes.dtypes -import bigframes.operations.aggregations as agg_ops if typing.TYPE_CHECKING: import bigframes.core.ordering as orderings @@ -1325,8 +1324,7 @@ def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): @dataclasses.dataclass(frozen=True, eq=False) class WindowOpNode(UnaryNode): - column_name: ex.DerefOp - op: agg_ops.UnaryWindowOp + expression: ex.Aggregation window_spec: window.WindowSpec output_name: bigframes.core.identifiers.ColumnId never_skip_nulls: bool = False @@ -1334,7 +1332,7 @@ class WindowOpNode(UnaryNode): def _validate(self): """Validate the local data in the node.""" - assert self.column_name.id in self.child.ids + assert all(ref in self.child.ids for ref in self.expression.column_references) @property def non_local(self) -> bool: @@ -1363,9 +1361,11 @@ def row_count(self) -> Optional[int]: @functools.cached_property def added_field(self) -> Field: - input_type = self.child.get_type(self.column_name.id) - new_item_dtype = self.op.output_type(input_type) - return Field(self.output_name, new_item_dtype) + input_types = self.child._dtype_lookup + return Field( + self.output_name, + bigframes.dtypes.dtype_for_etype(self.expression.output_type(input_types)), + ) @property def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: @@ -1376,7 +1376,7 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: return self.child.prune(used_cols) consumed_ids = ( used_cols.difference([self.output_name]) - 
.union([self.column_name.id]) + .union(self.expression.column_references) .union(self.window_spec.all_referenced_columns) ) return self.transform_children(lambda x: x.prune(consumed_ids)) @@ -1391,7 +1391,7 @@ def remap_vars( def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): return dataclasses.replace( self, - column_name=self.column_name.remap_column_refs( + expression=self.expression.remap_column_refs( mappings, allow_partial_bindings=True ), window_spec=self.window_spec.remap_column_refs( diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 9de58fe5db..365b664ee0 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -381,7 +381,7 @@ def skips_nulls(self): # This should really by a NullaryWindowOp, but APIs don't support that yet. @dataclasses.dataclass(frozen=True) -class RowNumberOp(UnaryWindowOp): +class RowNumberOp(NullaryWindowOp): name: ClassVar[str] = "rownumber" @property From d941a84099b2d9fbbe648313249b9367c2ca3626 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 23 Jan 2025 06:43:04 +0000 Subject: [PATCH 03/11] chore: keep gemini code commented out in the notebook template (#1314) * Keep Gemini code commented out in the notebook template * disable progress bar to avoid leaking internal information --- .../bq_dataframes_template.ipynb | 917 ++++-------------- 1 file changed, 205 insertions(+), 712 deletions(-) diff --git a/notebooks/getting_started/bq_dataframes_template.ipynb b/notebooks/getting_started/bq_dataframes_template.ipynb index 2b47c40397..90186b297d 100644 --- a/notebooks/getting_started/bq_dataframes_template.ipynb +++ b/notebooks/getting_started/bq_dataframes_template.ipynb @@ -137,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "id": "oM1iC_MfAts1" }, @@ -158,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "id": 
"PyQmSRbKA8r-" }, @@ -179,14 +179,15 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "id": "NPPMuw2PXGeo" }, "outputs": [], "source": [ "# Note: The project option is not required in all environments.\n", - "# On BigQuery Studio, the project ID is automatically detected.\n", + "# For example, In BigQuery Studio, the project ID is automatically detected,\n", + "# But in Google Colab it must be set by the user.\n", "bpd.options.bigquery.project = PROJECT_ID\n", "\n", "# Note: The location option is not required.\n", @@ -198,7 +199,13 @@ "# Note: BigQuery DataFrames objects are by default fully ordered like Pandas.\n", "# If ordering is not important for you, you can uncomment the following\n", "# expression to run BigQuery DataFrames in partial ordering mode.\n", - "#bpd.options.bigquery.ordering_mode = \"partial\"" + "#bpd.options.bigquery.ordering_mode = \"partial\"\n", + "\n", + "# Note: By default BigQuery DataFrames emits out BigQuery job metadata via a\n", + "# progress bar. But in this notebook let's disable the progress bar to keep the\n", + "# experience less verbose. If you would like the default behavior, please\n", + "# comment out the following expression. \n", + "bpd.options.display.progress_bar = None" ] }, { @@ -242,24 +249,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": { "id": "Vyex9BQI-BNa" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 9c49a31b-7db6-49e1-b711-42eeebfdf7d3 is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# This is how you read a BigQuery table\n", "df = bpd.read_gbq(\"bigquery-public-data.ml_datasets.penguins\")\n", @@ -277,33 +271,9 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job a4004810-9249-4fe3-ab87-7cc33b69808d is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 09214d4a-8911-41b3-9f14-1c781cb7dc1b is DONE. 31.7 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -336,76 +306,76 @@ " \n", " \n", " \n", - " 171\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 41.1\n", - " 19.0\n", - " 182.0\n", - " 3425.0\n", - " MALE\n", - " \n", - " \n", - " 219\n", + " 198\n", " Gentoo penguin (Pygoscelis papua)\n", " Biscoe\n", - " 45.7\n", - " 13.9\n", - " 214.0\n", + " 43.3\n", + " 13.4\n", + " 209.0\n", " 4400.0\n", " FEMALE\n", " \n", " \n", - " 59\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 49.9\n", - " 16.1\n", - " 213.0\n", - " 5400.0\n", + " 235\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 35.1\n", + " 19.4\n", + " 193.0\n", + " 4200.0\n", " MALE\n", " \n", " \n", - " 132\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Biscoe\n", - " 39.6\n", - " 20.7\n", - " 191.0\n", - " 3900.0\n", + " 317\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 45.4\n", + " 18.7\n", + " 188.0\n", + " 3525.0\n", " FEMALE\n", " \n", " \n", - " 223\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 47.3\n", - " 13.8\n", - " 216.0\n", - " 4725.0\n", - " <NA>\n", + " 117\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + 
" 48.5\n", + " 17.5\n", + " 191.0\n", + " 3400.0\n", + " MALE\n", + " \n", + " \n", + " 159\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 45.6\n", + " 19.4\n", + " 194.0\n", + " 3525.0\n", + " FEMALE\n", " \n", " \n", "\n", "" ], "text/plain": [ - " species island culmen_length_mm \\\n", - "171 Adelie Penguin (Pygoscelis adeliae) Dream 41.1 \n", - "219 Gentoo penguin (Pygoscelis papua) Biscoe 45.7 \n", - "59 Gentoo penguin (Pygoscelis papua) Biscoe 49.9 \n", - "132 Adelie Penguin (Pygoscelis adeliae) Biscoe 39.6 \n", - "223 Gentoo penguin (Pygoscelis papua) Biscoe 47.3 \n", + " species island culmen_length_mm \\\n", + "198 Gentoo penguin (Pygoscelis papua) Biscoe 43.3 \n", + "235 Adelie Penguin (Pygoscelis adeliae) Torgersen 35.1 \n", + "317 Chinstrap penguin (Pygoscelis antarctica) Dream 45.4 \n", + "117 Chinstrap penguin (Pygoscelis antarctica) Dream 48.5 \n", + "159 Chinstrap penguin (Pygoscelis antarctica) Dream 45.6 \n", "\n", " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "171 19.0 182.0 3425.0 MALE \n", - "219 13.9 214.0 4400.0 FEMALE \n", - "59 16.1 213.0 5400.0 MALE \n", - "132 20.7 191.0 3900.0 FEMALE \n", - "223 13.8 216.0 4725.0 " + "198 13.4 209.0 4400.0 FEMALE \n", + "235 19.4 193.0 4200.0 MALE \n", + "317 18.7 188.0 3525.0 FEMALE \n", + "117 17.5 191.0 3400.0 MALE \n", + "159 19.4 194.0 3525.0 FEMALE " ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -454,28 +424,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": { "id": "YKwCW7Nsavap" }, "outputs": [ - { - "data": { - "text/html": [ - "Query job 788957fa-55af-40af-a17c-913c1d0ec170 is DONE. 2.7 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", "text": [ - "average_body_mass: 4201.754385964913\n" + "average_body_mass: 4201.754385964906\n" ] } ], @@ -495,35 +453,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": { "id": "4PyKMR61-Mjy" }, "outputs": [ - { - "data": { - "text/html": [ - "Query job 0026583d-b326-4393-82f9-a1d2629fa745 is DONE. 15.6 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 30e17097-f818-4e87-a4a1-3eef82bc38be is DONE. 163 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -580,7 +514,7 @@ "[3 rows x 1 columns]" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -609,7 +543,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -625,45 +559,21 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job 223af6b4-d58d-42c3-b7c9-13a303536e21 is DONE. 31.7 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 96fb4ef9-6c12-4cfa-aa2c-16377efed8f3 is DONE. 11.0 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/plain": [ "133 {'culmen_length_mm': None, 'culmen_depth_mm': ...\n", "279 {'culmen_length_mm': 37.9, 'culmen_depth_mm': ...\n", "34 {'culmen_length_mm': 37.8, 'culmen_depth_mm': ...\n", + "208 {'culmen_length_mm': 40.5, 'culmen_depth_mm': ...\n", "96 {'culmen_length_mm': 37.7, 'culmen_depth_mm': ...\n", - "18 {'culmen_length_mm': 38.8, 'culmen_depth_mm': ...\n", "dtype: struct[pyarrow]" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -684,45 +594,21 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job 81db1901-0704-4af2-8395-4c310e043f30 is DONE. 34.5 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job dfb40bba-3170-47b8-9ab4-c3ed5ab7550e is DONE. 8.2 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/plain": [ - "116 \n", - "308 15.5\n", - "285 15.9\n", - "2 16.0\n", - "245 16.1\n", + "133 \n", + "279 18.6\n", + "34 18.3\n", + "96 18.7\n", + "208 18.9\n", "dtype: Float64" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -753,34 +639,22 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job 4420834b-8f6f-46ce-9488-a7ae3960e72b is DONE. 34.5 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/plain": [ "" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -795,46 +669,22 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job 2e9d7f11-c442-4a6b-905f-0f5ac498b399 is DONE. 12.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 27db4f04-849b-4193-acae-6ecff5f4350f is DONE. 23.8 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/plain": [ "" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -859,72 +709,12 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "Query job 58f9cb42-382b-428b-9ed5-95c064e3ab29 is DONE. 12.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 18128891-8a44-42bb-9f30-a1e978203f41 is DONE. 12.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 1ddc23a6-4b05-448b-bb21-d778634ea5c5 is DONE. 12.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job b4617c85-24c9-4b63-8926-b5db63e7c319 is DONE. 12.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 95df92b9-1b7d-4c6d-9f20-69f928e36850 is DONE. 12.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -957,7 +747,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -966,46 +756,10 @@ "text": [ "\n", "We have a dataframe of \n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 6474b1c7-b6e5-4be7-93c6-cf8b0338dd58 is DONE. 34.5 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ "\n", - "We have a dataframe of \n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "Load job c93fdfb6-bbe1-446c-b7be-4922d67498ed is DONE. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + "\n", + "We have a dataframe of \n", + "\n", "\n", "We have a dataframe of \n", "\n" @@ -1047,33 +801,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 15, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job c8682a2f-c1df-437f-95c3-836a317752d5 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job e4198287-043e-45cf-b615-990b00abc521 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -1111,69 +841,9 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job c45c1c1c-7a9a-4598-929c-ef18455a5d58 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job a6ccb958-f34e-48c1-9ec6-961e1e07ae25 is DONE. 28.9 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 3f357436-61ea-4d75-9a3c-a7042d20330e is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 040b5cc6-9ded-43ea-9176-89a7eaf741ce is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 99f1db63-2cb7-40de-9dea-bc5624c540d0 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -1215,7 +885,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1231,7 +901,7 @@ " ('linreg', LinearRegression(fit_intercept=False))])" ] }, - "execution_count": 8, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1269,57 +939,9 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 19, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job 1ccf6bcb-f38d-4680-a5df-89aaa03d6ea5 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job cefb04e4-6fd2-477b-a579-be3b2ca745ba is DONE. 22.6 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job bebcf4df-d8d1-45c4-9efc-86c7645cd40d is DONE. 30.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 25d66657-3a7b-4e79-bca2-5a8116ba5c94 is DONE. 6.2 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -1352,52 +974,52 @@ " \n", " \n", " \n", - " 286\n", - " 3230.741308\n", + " 37\n", + " -18640.718256\n", " Biscoe\n", - " 37.9\n", - " 18.6\n", - " 172.0\n", - " FEMALE\n", - " Adelie Penguin (Pygoscelis adeliae)\n", + " 44.5\n", + " 15.7\n", + " 217.0\n", + " .\n", + " Gentoo penguin (Pygoscelis papua)\n", " \n", " \n", - " 41\n", - " 3186.002207\n", - " Torgersen\n", - " 40.2\n", - " 17.0\n", - " 176.0\n", + " 245\n", + " 3109.962252\n", + " Dream\n", + " 33.1\n", + " 16.1\n", + " 178.0\n", " FEMALE\n", " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 72\n", - " 3324.549608\n", - " Dream\n", - " 36.5\n", - " 18.0\n", + " 267\n", + " 3372.443434\n", + " Torgersen\n", + " 41.1\n", + " 17.6\n", " 182.0\n", " FEMALE\n", " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 36\n", - " 3426.252381\n", - " Dream\n", - " 36.0\n", - " 18.5\n", - " 186.0\n", + " 280\n", + " 3341.376012\n", + " Torgersen\n", + " 36.6\n", + " 17.8\n", + " 185.0\n", " FEMALE\n", " Adelie Penguin (Pygoscelis adeliae)\n", " \n", " \n", - " 247\n", - " 3413.687704\n", + " 40\n", + " 3310.178937\n", " Biscoe\n", - " 39.0\n", - " 17.5\n", - " 186.0\n", + " 37.6\n", + " 17.0\n", + " 185.0\n", " FEMALE\n", " Adelie Penguin (Pygoscelis adeliae)\n", " \n", @@ -1407,21 +1029,21 @@ ], "text/plain": [ " predicted_body_mass_g island culmen_length_mm culmen_depth_mm \\\n", - "286 3230.741308 Biscoe 37.9 18.6 \n", - "41 3186.002207 Torgersen 40.2 17.0 \n", - "72 3324.549608 Dream 36.5 18.0 \n", - "36 3426.252381 Dream 36.0 18.5 \n", - "247 3413.687704 Biscoe 39.0 17.5 \n", + "37 -18640.718256 Biscoe 44.5 15.7 \n", + "245 3109.962252 Dream 33.1 16.1 \n", + "267 3372.443434 Torgersen 41.1 17.6 \n", + "280 3341.376012 Torgersen 36.6 17.8 \n", + "40 3310.178937 Biscoe 37.6 17.0 \n", "\n", " flipper_length_mm sex species \n", - "286 172.0 FEMALE Adelie Penguin 
(Pygoscelis adeliae) \n", - "41 176.0 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", - "72 182.0 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", - "36 186.0 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", - "247 186.0 FEMALE Adelie Penguin (Pygoscelis adeliae) " + "37 217.0 . Gentoo penguin (Pygoscelis papua) \n", + "245 178.0 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", + "267 182.0 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", + "280 185.0 FEMALE Adelie Penguin (Pygoscelis adeliae) \n", + "40 185.0 FEMALE Adelie Penguin (Pygoscelis adeliae) " ] }, - "execution_count": 12, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1448,33 +1070,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 20, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job 33f30db0-43ab-47f4-ae17-bf8c7e5e8bdc is DONE. 30.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 0a49d654-7883-409b-bc8f-3b16eeee873b is DONE. 48 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -1507,12 +1105,12 @@ " \n", " \n", " 0\n", - " 233.280852\n", - " 79664.958768\n", - " 0.004725\n", - " 200.960554\n", - " 0.889414\n", - " 0.897374\n", + " 582.272638\n", + " 8337651.200465\n", + " 0.004989\n", + " 193.446297\n", + " -11.273389\n", + " -11.091156\n", " \n", " \n", "\n", @@ -1521,15 +1119,15 @@ ], "text/plain": [ " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 233.280852 79664.958768 0.004725 \n", + "0 582.272638 8337651.200465 0.004989 \n", "\n", - " median_absolute_error r2_score explained_variance \n", - "0 200.960554 0.889414 0.897374 \n", + " median_absolute_error r2_score explained_variance \n", + "0 193.446297 -11.273389 -11.091156 \n", "\n", "[1 rows x 6 columns]" ] }, - "execution_count": 13, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1547,52 +1145,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 21, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job d42f72e4-c1dc-4859-8d2f-99e09006ed64 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5b8dc932-3d79-47ab-b43a-8cd987091883 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job b93b8f2f-533d-46b7-b320-0349cee22635 is DONE. 30.0 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/plain": [ - "0.8894138438612413" + "-11.273389374372979" ] }, - "execution_count": 14, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1609,7 +1171,9 @@ "source": [ "## Generative AI with BigQuery DataFrames\n", "\n", - "BigQuery DataFrames integration with the Large Language Models (LLM) supported by BigQuery ML. Check out the [`bigframes.ml.llm`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm) module for all the available models." + "BigQuery DataFrames integration with the Large Language Models (LLM) supported by BigQuery ML. Check out the [`bigframes.ml.llm`](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm) module for all the available models.\n", + "\n", + "To use this feature you would need to have a few additional APIs enabled and IAM roles configured. Please make sure of that by following [this documentation](https://cloud.google.com/bigquery/docs/use-bigquery-dataframes#remote-models) and then uncomment the code in the following cells to try out the integration with Gemini." ] }, { @@ -1623,21 +1187,9 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 22, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job 180784d4-bff2-4165-9958-07f77b711980 is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -1689,17 +1241,17 @@ "[3 rows x 1 columns]" ] }, - "execution_count": 44, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = bpd.DataFrame(\n", - " {\n", - " \"prompt\": [\"What is BigQuery?\", \"What is BQML?\", \"What is BigQuery DataFrames?\"],\n", - " })\n", - "df" + "# df = bpd.DataFrame(\n", + "# {\n", + "# \"prompt\": [\"What is BigQuery?\", \"What is BQML?\", \"What is BigQuery DataFrames?\"],\n", + "# })\n", + "# df" ] }, { @@ -1713,65 +1265,17 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 23, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job aefe66a0-70da-44e4-89be-05a152b046f8 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 8b3b543d-e4a2-4534-b55a-6a499e958108 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shobs/code/bigframes/bigframes/core/__init__.py:109: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", - " warnings.warn(\n" + "/usr/local/google/home/shobs/code/bigframes/bigframes/core/__init__.py:114: PreviewWarning: Interpreting JSON column(s) as pyarrow.large_string. This behavior may change in future versions.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" ] }, - { - "data": { - "text/html": [ - "Query job cef454ea-1d31-4de4-a724-eed712e36d2c is DONE. 6 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 1eb4c2a5-9626-40fb-a346-1e6c137c5239 is DONE. 
10.7 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -1802,15 +1306,18 @@ " \n", " \n", " 0\n", - " BigQuery is a serverless, highly scalable, and...\n", - " [{\"category\":1,\"probability\":1,\"probability_sc...\n", + " ## BigQuery: Your Data Warehouse in the Cloud\n", + "...\n", + " [{\"category\":\"HARM_CATEGORY_HATE_SPEECH\",\"prob...\n", " \n", " What is BigQuery?\n", " \n", " \n", " 1\n", - " ## BQML: Bringing Quantum Machine Learning to ...\n", - " [{\"category\":1,\"probability\":1,\"probability_sc...\n", + " ## BQML - BigQuery Machine Learning\n", + "\n", + "BQML stan...\n", + " [{\"category\":\"HARM_CATEGORY_HATE_SPEECH\",\"prob...\n", " \n", " What is BQML?\n", " \n", @@ -1819,7 +1326,7 @@ " ## BigQuery DataFrames\n", "\n", "BigQuery DataFrames is...\n", - " [{\"category\":1,\"probability\":1,\"probability_sc...\n", + " [{\"category\":\"HARM_CATEGORY_HATE_SPEECH\",\"prob...\n", " \n", " What is BigQuery DataFrames?\n", " \n", @@ -1830,16 +1337,19 @@ ], "text/plain": [ " ml_generate_text_llm_result \\\n", - "0 BigQuery is a serverless, highly scalable, and... \n", - "1 ## BQML: Bringing Quantum Machine Learning to ... \n", + "0 ## BigQuery: Your Data Warehouse in the Cloud\n", + "... \n", + "1 ## BQML - BigQuery Machine Learning\n", + "\n", + "BQML stan... \n", "2 ## BigQuery DataFrames\n", "\n", "BigQuery DataFrames is... \n", "\n", " ml_generate_text_rai_result ml_generate_text_status \\\n", - "0 [{\"category\":1,\"probability\":1,\"probability_sc... \n", - "1 [{\"category\":1,\"probability\":1,\"probability_sc... \n", - "2 [{\"category\":1,\"probability\":1,\"probability_sc... \n", + "0 [{\"category\":\"HARM_CATEGORY_HATE_SPEECH\",\"prob... \n", + "1 [{\"category\":\"HARM_CATEGORY_HATE_SPEECH\",\"prob... \n", + "2 [{\"category\":\"HARM_CATEGORY_HATE_SPEECH\",\"prob... \n", "\n", " prompt \n", "0 What is BigQuery? 
\n", @@ -1849,18 +1359,18 @@ "[3 rows x 4 columns]" ] }, - "execution_count": 45, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from bigframes.ml.llm import GeminiTextGenerator\n", + "# from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "model = GeminiTextGenerator()\n", + "# model = GeminiTextGenerator()\n", "\n", - "pred = model.predict(df)\n", - "pred" + "# pred = model.predict(df)\n", + "# pred" ] }, { @@ -1872,66 +1382,49 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 24, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job d5fed4ed-26c2-45a6-b842-3af8c901985c is DONE. 10.7 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", "text": [ "## BigQuery DataFrames\n", "\n", - "BigQuery DataFrames is an open-source project offered by Google that provides the capabilities of using pandas-style APIs directly in BigQuery's serverless environment for performing SQL and DDL queries. This essentially means you have the flexibility to write pandas code within BigQuery for data exploration, transformation, visualization and building machine learning models. It acts as an intermediary bridge that facilitates SQL queries to the BigQuery engine for running your analysis with speed and scalability on large datasets without requiring manual configuration. It can further extend its functionalities through third-party libraries like scikit-learn, matplotlib, seaborn etc., enhancing its versatility within the realm of data manipulation.\n", - "\n", - "Here's are some key benefits associated with BigQuery DataFrames:\n", - "\n", - "\n", - "### Streamlined Experience:\n", - "\n", - "BigQuery DataFrames simplifies your development workflow by eliminating the back-and-forth communication between pandas and BigQuery environments for data operations. 
It allows working seamlessly within BigQuery to leverage powerful SQL features while using familiar pandas functions on data stored within. This enables a smoother process from ingesting, analyzing, and visualizing your data efficiently.\n", - "\n", - "### Serverless Infrastructure:\n", - "\n", - "One of the greatest advantages of BigQuery DataFrames is that it runs on serverless infrastructure, eliminating the need to maintain complex environments for development. This translates to less complexity, easy management, and a focus on efficient analysis rather than infrastructure upkeep.\n", - "\n", - "### Scalable Capabilities:\n", - "\n", - "As mentioned, BigQuery excels in dealing with immense data sets with efficient storage and processing power due to its architecture built for handling petabyte-scale datasets in Google Cloud Storage. DataFrames inherits this strength, empowering the analysis of vast information while ensuring speed and reliability throughout.\n", + "BigQuery DataFrames is a Python library that allows you to interact with BigQuery data using the familiar Pandas API. This means you can use all the powerful tools and methods from the Pandas library to explore, analyze, and transform your BigQuery data, without needing to learn a new language or API.\n", "\n", - "### Open-source Ecosystem:\n", + "Here are some of the key benefits of using BigQuery DataFrames:\n", "\n", - "While Google spearheads its initial creation, DataFrames benefits immensely from its open-source structure. 
This fosters community-wide involvement in its advancement; developers are continually making contributions that bolster functionalities, introduce improvements with regular updates and fixes.\n", + "* **Ease of use:** If you're already familiar with Pandas, you can start using BigQuery DataFrames with minimal learning curve.\n", + "* **Speed and efficiency:** BigQuery DataFrames leverages the power of BigQuery to perform complex operations on large datasets efficiently.\n", + "* **Flexibility:** You can use BigQuery DataFrames for a wide range of tasks, including data exploration, analysis, cleaning, and transformation.\n", + "* **Integration with other tools:** BigQuery DataFrames integrates seamlessly with other Google Cloud tools like Colab and Vertex AI, allowing you to build end-to-end data analysis pipelines.\n", "\n", - "Here are a few scenarios where using BigQuery DataFrames might prove particularly valuable:\n", + "Here are some of the key features of BigQuery DataFrames:\n", "\n", - "- Performing exploratory analysis on a diverse range of dataset directly on serverless infrastructure with scalability, saving valuable operational cost and time.\n", + "* **Support for most Pandas operations:** You can use most of the DataFrame methods you're familiar with, such as `groupby`, `filter`, `sort_values`, and `apply`.\n", + "* **Automatic schema inference:** BigQuery DataFrames automatically infers the schema of your data, so you don't need to manually specify it.\n", + "* **Efficient handling of large datasets:** BigQuery DataFrames pushes computations to BigQuery, which allows you to work with large datasets without running out of memory.\n", + "* **Support for both public and private datasets:** You can use BigQuery DataFrames to access both public and private datasets stored in BigQuery.\n", "\n", - "- Implementing data preprocessing steps using Python and DataFrames within Google Cloud Platform without having to transfer and analyze it elsewhere, 
streamlining workflow within the same framework.\n", + "## Getting Started with BigQuery DataFrames\n", "\n", - "- When building and training ML models directly from datasets without exporting the data outside, maintaining security within and improving efficiency.\n", + "Getting started with BigQuery DataFrames is easy. You just need to install the library and configure your authentication. Once you're set up, you can start using it to interact with your BigQuery data.\n", "\n", + "Here are some resources to help you get started:\n", "\n", - "However, be aware that DataFrames is an ever-evolving project and some aspects such as DML functionalities remain under active development to reach feature completion as compared to standard SQL commands which have matured functionality already in place within DataFrames.\n", + "* **Documentation:** https://cloud.google.com/bigquery/docs/reference/libraries/bigquery-dataframe\n", + "* **Quickstart:** https://cloud.google.com/bigquery/docs/reference/libraries/bigquery-dataframe-python-quickstart\n", + "* **Tutorials:** https://cloud.google.com/bigquery/docs/tutorials/bq-dataframe-pandas-tutorial\n", "\n", + "## Conclusion\n", "\n", - "Would you like me to delve deeper into specific features of DataFrames, its current limitations or perhaps provide examples of its applications or user cases?\n" + "BigQuery DataFrames is a powerful tool that can help you get the most out of your BigQuery data. 
If you're looking for a way to easily analyze and transform your BigQuery data using the familiar Pandas API, then BigQuery DataFrames is a great option.\n" ] } ], "source": [ - "print(pred.loc[2][\"ml_generate_text_llm_result\"])" + "# print(pred.loc[2][\"ml_generate_text_llm_result\"])" ] }, { @@ -1998,7 +1491,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.10.12" } }, "nbformat": 4, From 57746e141c95904d51907cb15141bf4399bbcb5d Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 23 Jan 2025 14:32:12 -0800 Subject: [PATCH 04/11] refactor: ReadLocal and Explode nodes support offset outputs (#1301) --- bigframes/core/compile/compiled.py | 116 ---------------------- bigframes/core/compile/compiler.py | 11 ++- bigframes/core/compile/explode.py | 151 +++++++++++++++++++++++++++++ bigframes/core/nodes.py | 35 ++++++- 4 files changed, 191 insertions(+), 122 deletions(-) create mode 100644 bigframes/core/compile/explode.py diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index ae5e2ff8c0..15805a38fc 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -40,7 +40,6 @@ from bigframes.core.ordering import ( ascending_over, encode_order_string, - join_orderings, OrderingExpression, RowOrdering, TotalOrdering, @@ -420,50 +419,6 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR: columns=columns, ) - def explode(self, columns: typing.Sequence[ex.DerefOp]) -> UnorderedIR: - table = self._to_ibis_expr() - column_ids = tuple(ref.id.sql for ref in columns) - - # The offset array ensures null represents empty arrays after unnesting. - offset_array_id = bigframes.core.guid.generate_guid("offset_array_") - offset_array = bigframes_vendored.ibis.range( - 0, - bigframes_vendored.ibis.greatest( - 1, # We always want at least 1 element to fill in NULLs for empty arrays. 
- bigframes_vendored.ibis.least( - *[table[column_id].length() for column_id in column_ids] - ), - ), - 1, - ).name(offset_array_id) - table_w_offset_array = table.select( - offset_array, - *self._column_names, - ) - - unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_") - unnest_offset = ( - table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id) - ) - table_w_offset = table_w_offset_array.select( - unnest_offset, - *self._column_names, - ) - - unnested_columns = [ - table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id) - if column_id in column_ids - else table_w_offset[column_id] - for column_id in self._column_names - ] - table_w_unnest = table_w_offset.select(*unnested_columns) - - columns = [table_w_unnest[column_name] for column_name in self._column_names] - return UnorderedIR( - table_w_unnest, - columns=columns, # type: ignore - ) - def as_ordered_ir(self) -> OrderedIR: """Convert to OrderedIr, but without any definite ordering.""" return OrderedIR(self._table, self._columns, predicates=self._predicates) @@ -746,77 +701,6 @@ def _uniform_sampling(self, fraction: float) -> OrderedIR: ordering=self._ordering, ) - def explode(self, columns: typing.Sequence[ex.DerefOp]) -> OrderedIR: - if self.order_non_deterministic: - id = bigframes.core.guid.generate_guid() - return self.promote_offsets(id) - table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) - column_ids = tuple(ref.id.sql for ref in columns) - - offset_array_id = bigframes.core.guid.generate_guid("offset_array_") - offset_array = bigframes_vendored.ibis.range( - 0, - bigframes_vendored.ibis.greatest( - 1, # We always want at least 1 element to fill in NULLs for empty arrays. 
- bigframes_vendored.ibis.least( - *[table[column_id].length() for column_id in column_ids] - ), - ), - 1, - ).name(offset_array_id) - table_w_offset_array = table.select( - offset_array, - *self._column_names, - *self._hidden_ordering_column_names, - ) - - unnest_offset_id = bigframes.core.guid.generate_guid("unnest_offset_") - unnest_offset = ( - table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id) - ) - table_w_offset = table_w_offset_array.select( - unnest_offset, - *self._column_names, - *self._hidden_ordering_column_names, - ) - - unnested_columns = [ - table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id) - if column_id in column_ids - else table_w_offset[column_id] - for column_id in self._column_names - ] - - table_w_unnest = table_w_offset.select( - table_w_offset[unnest_offset_id], - *unnested_columns, - *self._hidden_ordering_column_names, - ) - - columns = [table_w_unnest[column_name] for column_name in self._column_names] - hidden_ordering_columns = [ - *[ - table_w_unnest[column_name] - for column_name in self._hidden_ordering_column_names - ], - table_w_unnest[unnest_offset_id], - ] - l_mappings = {id: id for id in self._ordering.referenced_columns} - r_mappings = {ids.ColumnId(unnest_offset_id): ids.ColumnId(unnest_offset_id)} - ordering = join_orderings( - self._ordering, - TotalOrdering.from_offset_col(unnest_offset_id), - l_mappings, - r_mappings, - ) - - return OrderedIR( - table_w_unnest, - columns=columns, # type: ignore - hidden_ordering_columns=hidden_ordering_columns, - ordering=ordering, - ) - def promote_offsets(self, col_id: str) -> OrderedIR: """ Convenience function to promote copy of column offsets to a value column. Can be used to reset index. 
diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 9548bb48f4..6f47d198c5 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -26,6 +26,7 @@ import bigframes.core.compile.compiled as compiled import bigframes.core.compile.concat as concat_impl +import bigframes.core.compile.explode import bigframes.core.compile.ibis_types import bigframes.core.compile.scalar_op_compiler import bigframes.core.compile.scalar_op_compiler as compile_scalar @@ -373,7 +374,15 @@ def compile_window(self, node: nodes.WindowOpNode, ordered: bool = True): @_compile_node.register def compile_explode(self, node: nodes.ExplodeNode, ordered: bool = True): - return self.compile_node(node.child, ordered).explode(node.column_ids) + offsets_col = node.offsets_col.sql if (node.offsets_col is not None) else None + if ordered: + return bigframes.core.compile.explode.explode_ordered( + self.compile_ordered_ir(node.child), node.column_ids, offsets_col + ) + else: + return bigframes.core.compile.explode.explode_unordered( + self.compile_unordered_ir(node.child), node.column_ids, offsets_col + ) @_compile_node.register def compile_random_sample(self, node: nodes.RandomSampleNode, ordered: bool = True): diff --git a/bigframes/core/compile/explode.py b/bigframes/core/compile/explode.py new file mode 100644 index 0000000000..0dfc129810 --- /dev/null +++ b/bigframes/core/compile/explode.py @@ -0,0 +1,151 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import typing + +import bigframes_vendored.ibis + +import bigframes.core.compile.compiled as compiled +import bigframes.core.expression as ex +import bigframes.core.guid +import bigframes.core.identifiers as ids +import bigframes.core.ordering +from bigframes.core.ordering import TotalOrdering + + +def explode_unordered( + input: compiled.UnorderedIR, + columns: typing.Sequence[ex.DerefOp], + offsets_id: typing.Optional[str], +) -> compiled.UnorderedIR: + table = input._to_ibis_expr() + column_ids = tuple(ref.id.sql for ref in columns) + + # The offset array ensures null represents empty arrays after unnesting. + offset_array_id = bigframes.core.guid.generate_guid("offset_array_") + offset_array = bigframes_vendored.ibis.range( + 0, + bigframes_vendored.ibis.greatest( + 1, # We always want at least 1 element to fill in NULLs for empty arrays. + bigframes_vendored.ibis.least( + *[table[column_id].length() for column_id in column_ids] + ), + ), + 1, + ).name(offset_array_id) + table_w_offset_array = table.select( + offset_array, + *input._column_names, + ) + + unnest_offset_id = offsets_id or bigframes.core.guid.generate_guid("unnest_offset_") + unnest_offset = ( + table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id) + ) + table_w_offset = table_w_offset_array.select( + unnest_offset, + *input._column_names, + ) + + output_cols = tuple(input.column_ids) + ((offsets_id,) if offsets_id else ()) + unnested_columns = [ + table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id) + if column_id in column_ids + else table_w_offset[column_id] + for column_id in output_cols + ] + table_w_unnest = table_w_offset.select(*unnested_columns) + + columns = [table_w_unnest[column_name] for column_name in output_cols] + return compiled.UnorderedIR( + table_w_unnest, + columns=columns, # type: ignore + ) + + 
+def explode_ordered( + input: compiled.OrderedIR, + columns: typing.Sequence[ex.DerefOp], + offsets_id: typing.Optional[str], +) -> compiled.OrderedIR: + if input.order_non_deterministic: + id = bigframes.core.guid.generate_guid() + return input.promote_offsets(id) + table = input._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) + column_ids = tuple(ref.id.sql for ref in columns) + + offset_array_id = bigframes.core.guid.generate_guid("offset_array_") + offset_array = bigframes_vendored.ibis.range( + 0, + bigframes_vendored.ibis.greatest( + 1, # We always want at least 1 element to fill in NULLs for empty arrays. + bigframes_vendored.ibis.least( + *[table[column_id].length() for column_id in column_ids] + ), + ), + 1, + ).name(offset_array_id) + table_w_offset_array = table.select( + offset_array, + *input._column_names, + *input._hidden_ordering_column_names, + ) + + unnest_offset_id = offsets_id or bigframes.core.guid.generate_guid("unnest_offset_") + unnest_offset = ( + table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id) + ) + table_w_offset = table_w_offset_array.select( + unnest_offset, + *input._column_names, + *input._hidden_ordering_column_names, + ) + + unnested_columns = [ + table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id) + if column_id in column_ids + else table_w_offset[column_id] + for column_id in input._column_names + ] + + table_w_unnest = table_w_offset.select( + table_w_offset[unnest_offset_id], + *unnested_columns, + *input._hidden_ordering_column_names, + ) + + output_cols = tuple(input.column_ids) + ((offsets_id,) if offsets_id else ()) + columns = [table_w_unnest[column_name] for column_name in output_cols] + hidden_ordering_columns = [ + table_w_unnest[column_name] + for column_name in input._hidden_ordering_column_names + ] + if offsets_id is None: + hidden_ordering_columns.append(table_w_unnest[unnest_offset_id]) + l_mappings = {id: id for id in 
input._ordering.referenced_columns} + r_mappings = {ids.ColumnId(unnest_offset_id): ids.ColumnId(unnest_offset_id)} + ordering = bigframes.core.ordering.join_orderings( + input._ordering, + TotalOrdering.from_offset_col(unnest_offset_id), + l_mappings, + r_mappings, + ) + + return compiled.OrderedIR( + table_w_unnest, + columns=columns, # type: ignore + hidden_ordering_columns=hidden_ordering_columns, + ordering=ordering, + ) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 88d55ac70b..d5083c3737 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -659,16 +659,24 @@ class ScanList: @dataclasses.dataclass(frozen=True, eq=False) class ReadLocalNode(LeafNode): + # TODO: Combine feather_bytes, data_schema, n_rows into a LocalDataDef struct feather_bytes: bytes data_schema: schemata.ArraySchema n_rows: int # Mapping of local ids to bfet id. scan_list: ScanList + # Offsets are generated only if this is non-null + offsets_col: Optional[bigframes.core.identifiers.ColumnId] = None session: typing.Optional[bigframes.session.Session] = None @property def fields(self) -> Iterable[Field]: - return (Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items) + fields = (Field(col_id, dtype) for col_id, dtype, _ in self.scan_list.items) + if self.offsets_col is not None: + return itertools.chain( + fields, (Field(self.offsets_col, bigframes.dtypes.INT_DTYPE),) + ) + return fields @property def variables_introduced(self) -> int: @@ -697,7 +705,7 @@ def row_count(self) -> typing.Optional[int]: @property def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: - return tuple(item.id for item in self.scan_list.items) + return tuple(item.id for item in self.fields) def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: # Don't preoduce empty scan list no matter what, will result in broken sql syntax @@ -711,6 +719,7 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: self.data_schema, self.n_rows, new_scan_list, + 
self.offsets_col, self.session, ) @@ -723,7 +732,14 @@ def remap_vars( for item in self.scan_list.items ) ) - return dataclasses.replace(self, scan_list=new_scan_list) + new_offsets_col = ( + mappings.get(self.offsets_col, self.offsets_col) + if (self.offsets_col is not None) + else None + ) + return dataclasses.replace( + self, scan_list=new_scan_list, offsets_col=new_offsets_col + ) def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): return self @@ -1439,6 +1455,8 @@ def remap_refs( @dataclasses.dataclass(frozen=True, eq=False) class ExplodeNode(UnaryNode): column_ids: typing.Tuple[ex.DerefOp, ...] + # Offsets are generated only if this is non-null + offsets_col: Optional[bigframes.core.identifiers.ColumnId] = None @property def row_preserving(self) -> bool: @@ -1446,7 +1464,7 @@ def row_preserving(self) -> bool: @property def fields(self) -> Iterable[Field]: - return ( + fields = ( Field( field.id, bigframes.dtypes.arrow_dtype_to_bigframes_dtype( @@ -1457,6 +1475,11 @@ def fields(self) -> Iterable[Field]: else field for field in self.child.fields ) + if self.offsets_col is not None: + return itertools.chain( + fields, (Field(self.offsets_col, bigframes.dtypes.INT_DTYPE),) + ) + return fields @property def relation_ops_created(self) -> int: @@ -1472,7 +1495,7 @@ def row_count(self) -> Optional[int]: @property def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: - return () + return (self.offsets_col,) if (self.offsets_col is not None) else () def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: # Cannot prune explode op @@ -1482,6 +1505,8 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] ) -> BigFrameNode: + if (self.offsets_col is not None) and self.offsets_col in mappings: + return dataclasses.replace(self, offsets_col=mappings[self.offsets_col]) return self def remap_refs( From f6722629fb47eed5befb0ecae2e6b5ec9042d669 Mon Sep 17 00:00:00 
2001 From: Chelsea Lin Date: Thu, 23 Jan 2025 14:33:28 -0800 Subject: [PATCH 05/11] feat!: Enable reading JSON data with `dbjson` extension dtype (#1139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change updates how we handle JSON data types read from BigQuery. Previously, BigQuery JSON types were treated as generic large strings within our system. To improve accuracy and functionality, we now map them to a dedicated JSON data type (db_dtypes.JSONType or db_dtypes.JSONArrowType for pyarrow). While this provides a more appropriate representation of JSON data, it's important to note that this feature is still in preview and may evolve. Co-authored-by: Owl Bot Co-authored-by: Tim Sweña (Swast) Release-As: 1.34.0 --- bigframes/bigquery/_operations/json.py | 4 +- bigframes/core/__init__.py | 4 +- bigframes/core/compile/ibis_types.py | 11 +- bigframes/core/compile/scalar_op_compiler.py | 60 +++---- bigframes/dtypes.py | 6 +- bigframes/operations/json_ops.py | 5 +- bigframes/session/_io/pandas.py | 3 + setup.py | 1 + testing/constraints-3.9.txt | 1 + tests/system/small/bigquery/test_json.py | 64 +++++--- tests/system/small/test_dataframe_io.py | 157 +++++++++++++++++-- tests/system/small/test_series.py | 3 +- 12 files changed, 224 insertions(+), 95 deletions(-) diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index 52b01d3ef7..0223811ebc 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -53,7 +53,7 @@ def json_set( >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) 0 {"a":100,"b":"hi"} - Name: data, dtype: large_string[pyarrow] + Name: data, dtype: dbjson Args: input (bigframes.series.Series): @@ -253,7 +253,7 @@ def parse_json( dtype: string >>> bbq.parse_json(s) 0 {"class":{"students":[{"id":5},{"id":12}]}} - dtype: large_string[pyarrow] 
+ dtype: dbjson Args: input (bigframes.series.Series): diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 0bae094777..d9bba9bdb0 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -108,8 +108,8 @@ def from_table( raise ValueError("must set at most one of 'offests', 'primary_key'") if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names): msg = ( - "Interpreting JSON column(s) as pyarrow.large_string. " - "This behavior may change in future versions." + "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is" + "in preview; this behavior may change in future versions." ) warnings.warn(msg, bfe.PreviewWarning) # define data source only for needed columns, this makes row-hashing cheaper diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 18f0834903..a0afa29a15 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -16,7 +16,6 @@ import textwrap import typing from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union -import warnings import bigframes_vendored.constants as constants import bigframes_vendored.ibis @@ -26,6 +25,7 @@ dtype as python_type_to_ibis_type, ) import bigframes_vendored.ibis.expr.types as ibis_types +import db_dtypes # type: ignore import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery import numpy as np @@ -33,7 +33,6 @@ import pyarrow as pa import bigframes.dtypes -import bigframes.exceptions as bfe # Type hints for Ibis data types supported by BigQuery DataFrame IbisDtype = Union[ @@ -76,7 +75,7 @@ ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True), gpd.array.GeometryDtype(), ), - (ibis_dtypes.json, pd.ArrowDtype(pa.large_string())), + (ibis_dtypes.json, db_dtypes.JSONDtype()), ) BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = { @@ -305,13 +304,7 @@ def ibis_dtype_to_bigframes_dtype( if 
isinstance(ibis_dtype, ibis_dtypes.Integer): return pd.Int64Dtype() - # Temporary: Will eventually support an explicit json type instead of casting to string. if isinstance(ibis_dtype, ibis_dtypes.JSON): - msg = ( - "Interpreting JSON column(s) as pyarrow.large_string. This behavior may change " - "in future versions." - ) - warnings.warn(msg, category=bfe.PreviewWarning) return bigframes.dtypes.JSON_DTYPE if ibis_dtype in IBIS_TO_BIGFRAMES: diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 4f670b51ca..2ab10e025d 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1188,34 +1188,33 @@ def array_slice_op_impl(x: ibis_types.Value, op: ops.ArraySliceOp): # JSON Ops @scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True) def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet): - if x.type().is_json(): - return json_set( - json_obj=x, - json_path=op.json_path, - json_value=y, - ) - else: - # Enabling JSON type eliminates the need for less efficient string conversions. - return to_json_string( - json_set( # type: ignore - json_obj=parse_json(json_str=x), - json_path=op.json_path, - json_value=y, - ) - ) + return json_set(json_obj=x, json_path=op.json_path, json_value=y) @scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True) def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract): - if x.type().is_json(): - return json_extract(json_obj=x, json_path=op.json_path) - # json string - return json_extract_string(json_obj=x, json_path=op.json_path) + # Define a user-defined function whose returned type is dynamically matching the input. + def json_extract(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore + """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" + ... 
+ + return_type = x.type() + json_extract.__annotations__["return"] = return_type + json_extract_op = ibis_udf.scalar.builtin(json_extract) + return json_extract_op(json_or_json_string=x, json_path=op.json_path) @scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True) def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray): - return json_extract_array(json_obj=x, json_path=op.json_path) + # Define a user-defined function whose returned type is dynamically matching the input. + def json_extract_array(json_or_json_string, json_path: ibis_dtypes.str): # type: ignore + """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value.""" + ... + + return_type = x.type() + json_extract_array.__annotations__["return"] = ibis_dtypes.Array[return_type] # type: ignore + json_extract_op = ibis_udf.scalar.builtin(json_extract_array) + return json_extract_op(json_or_json_string=x, json_path=op.json_path) @scalar_op_compiler.register_unary_op(ops.JSONExtractStringArray, pass_op=True) @@ -1937,27 +1936,6 @@ def json_set( # type: ignore[empty-body] """Produces a new SQL JSON value with the specified JSON data inserted or replaced.""" -@ibis_udf.scalar.builtin(name="json_extract") -def json_extract( # type: ignore[empty-body] - json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String -) -> ibis_dtypes.JSON: - """Extracts a JSON value and converts it to a JSON value.""" - - -@ibis_udf.scalar.builtin(name="json_extract") -def json_extract_string( # type: ignore[empty-body] - json_obj: ibis_dtypes.String, json_path: ibis_dtypes.String -) -> ibis_dtypes.String: - """Extracts a JSON SRING value and converts it to a SQL JSON-formatted STRING.""" - - -@ibis_udf.scalar.builtin(name="json_extract_array") -def json_extract_array( # type: ignore[empty-body] - json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String -) -> ibis_dtypes.Array[ibis_dtypes.String]: - """Extracts a JSON array and converts it to a SQL ARRAY of 
JSON-formatted STRINGs or JSON values.""" - - @ibis_udf.scalar.builtin(name="json_extract_string_array") def json_extract_string_array( # type: ignore[empty-body] json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 3da3fa24f3..863615118a 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -21,6 +21,7 @@ from typing import Any, Dict, List, Literal, Union import bigframes_vendored.constants as constants +import db_dtypes # type: ignore import geopandas as gpd # type: ignore import google.cloud.bigquery import numpy as np @@ -59,7 +60,7 @@ # No arrow equivalent GEO_DTYPE = gpd.array.GeometryDtype() # JSON -JSON_DTYPE = pd.ArrowDtype(pa.large_string()) +JSON_DTYPE = db_dtypes.JSONDtype() OBJ_REF_DTYPE = pd.ArrowDtype( pa.struct( ( @@ -161,7 +162,7 @@ class SimpleDtypeInfo: ), SimpleDtypeInfo( dtype=JSON_DTYPE, - arrow_dtype=pa.large_string(), + arrow_dtype=db_dtypes.JSONArrowType(), type_kind=("JSON",), orderable=False, clusterable=False, @@ -320,7 +321,6 @@ def is_struct_like(type_: ExpressionType) -> bool: def is_json_like(type_: ExpressionType) -> bool: - # TODO: Add JSON type support return type_ == JSON_DTYPE or type_ == STRING_DTYPE # Including JSON string diff --git a/bigframes/operations/json_ops.py b/bigframes/operations/json_ops.py index 86c5a19ba7..1daacf4e6b 100644 --- a/bigframes/operations/json_ops.py +++ b/bigframes/operations/json_ops.py @@ -50,7 +50,7 @@ def output_type(self, *input_types): + f" Received type: {input_type}" ) return pd.ArrowDtype( - pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE)) + pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_type)) ) @@ -118,8 +118,7 @@ def output_type(self, *input_types): + f"Received type: {right_type}" ) - # After JSON type implementation, ONLY return JSON data. 
- return left_type + return dtypes.JSON_DTYPE @dataclasses.dataclass(frozen=True) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 2f6aade0e5..301e1c4ebb 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -17,6 +17,7 @@ from typing import Collection, Union import bigframes_vendored.constants as constants +import db_dtypes # type: ignore import geopandas # type: ignore import numpy as np import pandas @@ -122,6 +123,8 @@ def arrow_to_pandas( ) elif isinstance(dtype, pandas.ArrowDtype): series = _arrow_to_pandas_arrowdtype(column, dtype) + elif isinstance(dtype, db_dtypes.JSONDtype): + series = db_dtypes.JSONArray(column) else: series = column.to_pandas(types_mapper=lambda _: dtype) diff --git a/setup.py b/setup.py index 74a0d5475c..047da2348c 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,7 @@ "ipywidgets >=7.7.1", "humanize >=4.6.0", "matplotlib >=3.7.1", + "db-dtypes >=1.4.0", # For vendored ibis-framework. "atpublic>=2.3,<6", "parsy>=2,<3", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 015153cb01..8b7ad892c0 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -26,6 +26,7 @@ tabulate==0.9 ipywidgets==7.7.1 humanize==4.6.0 matplotlib==3.7.1 +db-dtypes==1.4.0 # For vendored ibis-framework. atpublic==2.3 parsy==2.0 diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index b01ac3aaf2..aa490749ae 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -118,7 +118,6 @@ def test_json_set_w_invalid_series_type(): def test_json_extract_from_json(): s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}]) actual = bbq.json_extract(s, "$.a.b").to_pandas() - # After the introduction of the JSON type, the output should be a JSON-formatted series. 
expected = _get_series_from_json([[1, 2], None, 0]).to_pandas() pd.testing.assert_series_equal( actual, @@ -129,12 +128,10 @@ def test_json_extract_from_json(): def test_json_extract_from_string(): s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}']) actual = bbq.json_extract(s, "$.a.b") - expected = _get_series_from_json([[1, 2], None, 0]) + expected = bpd.Series(["[1,2]", None, "0"]) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), - check_names=False, - check_dtype=False, # json_extract returns string type. While _get_series_from_json gives a JSON series (pa.large_string). ) @@ -143,20 +140,58 @@ def test_json_extract_w_invalid_series_type(): bbq.json_extract(bpd.Series([1, 2]), "$.a") +def test_json_extract_array_from_json(): + s = _get_series_from_json( + [{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4", "5"]}, {}] + ) + actual = bbq.json_extract_array(s, "$.a") + + # This code provides a workaround for issue https://github.com/apache/arrow/issues/45262, + # which currently prevents constructing a series using the pa.list_(db_types.JSONArrrowType()) + sql = """ + SELECT 0 AS id, [JSON '"ab"', JSON '"2"', JSON '"3 xy"'] AS data, + UNION ALL + SELECT 1, [], + UNION ALL + SELECT 2, [JSON '"4"', JSON '"5"'], + UNION ALL + SELECT 3, null, + """ + df = bpd.read_gbq(sql).set_index("id").sort_index() + expected = df["data"] + + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + def test_json_extract_array_from_json_strings(): - s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']) + s = bpd.Series( + ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"], + dtype=pd.StringDtype(storage="pyarrow"), + ) actual = bbq.json_extract_array(s, "$.a") - expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"']]) + expected = bpd.Series( + [['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None], + dtype=pd.StringDtype(storage="pyarrow"), + ) 
pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), ) -def test_json_extract_array_from_array_strings(): - s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"]) +def test_json_extract_array_from_json_array_strings(): + s = bpd.Series( + ["[1, 2, 3]", "[]", "[4,5]"], + dtype=pd.StringDtype(storage="pyarrow"), + ) actual = bbq.json_extract_array(s) - expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]]) + expected = bpd.Series( + [["1", "2", "3"], [], ["4", "5"]], + dtype=pd.StringDtype(storage="pyarrow"), + ) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -164,8 +199,9 @@ def test_json_extract_array_from_array_strings(): def test_json_extract_array_w_invalid_series_type(): + s = bpd.Series([1, 2]) with pytest.raises(TypeError): - bbq.json_extract_array(bpd.Series([1, 2])) + bbq.json_extract_array(s) def test_json_extract_string_array_from_json_strings(): @@ -203,14 +239,6 @@ def test_json_extract_string_array_w_invalid_series_type(): bbq.json_extract_string_array(bpd.Series([1, 2])) -# b/381148539 -def test_json_in_struct(): - df = bpd.read_gbq( - "SELECT STRUCT(JSON '{\\\"a\\\": 1}' AS data, 1 AS number) as struct_col" - ) - assert df["struct_col"].struct.field("data")[0] == '{"a":1}' - - def test_parse_json_w_invalid_series_type(): with pytest.raises(TypeError): bbq.parse_json(bpd.Series([1, 2])) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 848e21f6bd..10637b2395 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import math from typing import Tuple +import db_dtypes # type:ignore import google.api_core.exceptions import pandas as pd import pandas.testing @@ -247,23 +249,146 @@ def test_to_pandas_array_struct_correct_result(session): ) -def test_load_json(session): - df = session.read_gbq( - """SELECT - JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_column - """ - ) - +def test_load_json_w_unboxed_py_value(session): + sql = """ + SELECT 0 AS id, JSON_OBJECT('boolean', True) AS json_col, + UNION ALL + SELECT 1, JSON_OBJECT('int', 100), + UNION ALL + SELECT 2, JSON_OBJECT('float', 0.98), + UNION ALL + SELECT 3, JSON_OBJECT('string', 'hello world'), + UNION ALL + SELECT 4, JSON_OBJECT('array', [8, 9, 10]), + UNION ALL + SELECT 5, JSON_OBJECT('null', null), + UNION ALL + SELECT + 6, + JSON_OBJECT( + 'dict', + JSON_OBJECT( + 'int', 1, + 'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)] + ) + ), + """ + df = session.read_gbq(sql, index_col="id") + + assert df.dtypes["json_col"] == db_dtypes.JSONDtype() + assert isinstance(df["json_col"][0], dict) + + assert df["json_col"][0]["boolean"] + assert df["json_col"][1]["int"] == 100 + assert math.isclose(df["json_col"][2]["float"], 0.98) + assert df["json_col"][3]["string"] == "hello world" + assert df["json_col"][4]["array"] == [8, 9, 10] + assert df["json_col"][5]["null"] is None + assert df["json_col"][6]["dict"] == { + "int": 1, + "array": [{"bar": "hello"}, {"foo": 1}], + } + + +def test_load_json_to_pandas_has_correct_result(session): + df = session.read_gbq("SELECT JSON_OBJECT('foo', 10, 'bar', TRUE) AS json_col") + assert df.dtypes["json_col"] == db_dtypes.JSONDtype() result = df.to_pandas() - expected = pd.DataFrame( - { - "json_column": ['{"bar":true,"foo":10}'], - }, - dtype=pd.ArrowDtype(pa.large_string()), - ) - expected.index = expected.index.astype("Int64") - pd.testing.assert_series_equal(result.dtypes, expected.dtypes) - pd.testing.assert_series_equal(result["json_column"], expected["json_column"]) + + # The 
order of keys within the JSON object shouldn't matter for equality checks. + pd_df = pd.DataFrame( + {"json_col": [{"bar": True, "foo": 10}]}, + dtype=db_dtypes.JSONDtype(), + ) + pd_df.index = pd_df.index.astype("Int64") + pd.testing.assert_series_equal(result.dtypes, pd_df.dtypes) + pd.testing.assert_series_equal(result["json_col"], pd_df["json_col"]) + + +def test_load_json_in_struct(session): + """Avoid regressions for internal issue 381148539.""" + sql = """ + SELECT 0 AS id, STRUCT(JSON_OBJECT('boolean', True) AS data, 1 AS number) AS struct_col + UNION ALL + SELECT 1, STRUCT(JSON_OBJECT('int', 100), 2), + UNION ALL + SELECT 2, STRUCT(JSON_OBJECT('float', 0.98), 3), + UNION ALL + SELECT 3, STRUCT(JSON_OBJECT('string', 'hello world'), 4), + UNION ALL + SELECT 4, STRUCT(JSON_OBJECT('array', [8, 9, 10]), 5), + UNION ALL + SELECT 5, STRUCT(JSON_OBJECT('null', null), 6), + UNION ALL + SELECT + 6, + STRUCT(JSON_OBJECT( + 'dict', + JSON_OBJECT( + 'int', 1, + 'array', [JSON_OBJECT('bar', 'hello'), JSON_OBJECT('foo', 1)] + ) + ), 7), + """ + df = session.read_gbq(sql, index_col="id") + + assert isinstance(df.dtypes["struct_col"], pd.ArrowDtype) + assert isinstance(df.dtypes["struct_col"].pyarrow_dtype, pa.StructType) + + data = df["struct_col"].struct.field("data") + assert data.dtype == db_dtypes.JSONDtype() + + assert data[0]["boolean"] + assert data[1]["int"] == 100 + assert math.isclose(data[2]["float"], 0.98) + assert data[3]["string"] == "hello world" + assert data[4]["array"] == [8, 9, 10] + assert data[5]["null"] is None + assert data[6]["dict"] == { + "int": 1, + "array": [{"bar": "hello"}, {"foo": 1}], + } + + +def test_load_json_in_array(session): + sql = """ + SELECT + 0 AS id, + [ + JSON_OBJECT('boolean', True), + JSON_OBJECT('int', 100), + JSON_OBJECT('float', 0.98), + JSON_OBJECT('string', 'hello world'), + JSON_OBJECT('array', [8, 9, 10]), + JSON_OBJECT('null', null), + JSON_OBJECT( + 'dict', + JSON_OBJECT( + 'int', 1, + 'array', [JSON_OBJECT('bar', 
'hello'), JSON_OBJECT('foo', 1)] + ) + ) + ] AS array_col, + """ + df = session.read_gbq(sql, index_col="id") + + assert isinstance(df.dtypes["array_col"], pd.ArrowDtype) + assert isinstance(df.dtypes["array_col"].pyarrow_dtype, pa.ListType) + + data = df["array_col"].list + assert data.len()[0] == 7 + assert data[0].dtype == db_dtypes.JSONDtype() + + assert data[0][0]["boolean"] + assert data[1][0]["int"] == 100 + assert math.isclose(data[2][0]["float"], 0.98) + assert data[3][0]["string"] == "hello world" + assert data[4][0]["array"] == [8, 9, 10] + assert data[5][0]["null"] is None + assert data[6][0]["dict"] == { + "int": 1, + "array": [{"bar": "hello"}, {"foo": 1}], + } def test_to_pandas_batches_w_correct_dtypes(scalars_df_default_index): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 670828f616..3d76122e9d 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -17,6 +17,7 @@ import re import tempfile +import db_dtypes # type: ignore import geopandas as gpd # type: ignore import numpy from packaging.version import Version @@ -281,7 +282,7 @@ def test_get_column(scalars_dfs, col_name, expected_dtype): def test_get_column_w_json(json_df, json_pandas_df): series = json_df["json_col"] series_pandas = series.to_pandas() - assert series.dtype == pd.ArrowDtype(pa.large_string()) + assert series.dtype == db_dtypes.JSONDtype() assert series_pandas.shape[0] == json_pandas_df.shape[0] From dad522df84df3f5581ec58426be9bafb09124341 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 23 Jan 2025 15:48:06 -0800 Subject: [PATCH 06/11] chore: fix experimental blob video display in IPython 7.x (#1319) --- bigframes/operations/blob.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index a4de2f80c7..205a9fcf5c 100644 --- a/bigframes/operations/blob.py +++ 
b/bigframes/operations/blob.py @@ -216,7 +216,7 @@ def display_single_url(read_url: str, content_type: str): response = requests.get(read_url) ipy_display.display(ipy_display.Audio(response.content)) elif content_type.startswith("video"): - ipy_display.display(ipy_display.Video(url=read_url)) + ipy_display.display(ipy_display.Video(read_url)) else: # display as raw data response = requests.get(read_url) ipy_display.display(response.content) From b5033559a77a9bc5ffb7dc1e44e02aaaaf1e051e Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 23 Jan 2025 17:32:17 -0800 Subject: [PATCH 07/11] feat: Add DataFrame.corrwith method (#1315) --- bigframes/core/blocks.py | 16 ++--- bigframes/core/expression.py | 3 + bigframes/dataframe.py | 42 ++++++++++++ tests/system/small/test_dataframe.py | 66 +++++++++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 41 ++++++++++++ 5 files changed, 160 insertions(+), 8 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index afc03dbdea..727ee013f8 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2152,7 +2152,7 @@ def merge( def _align_both_axes( self, other: Block, how: str - ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.Expression, ex.Expression]]]: + ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.RefOrConstant, ex.RefOrConstant]]]: # Join rows aligned_block, (get_column_left, get_column_right) = self.join(other, how=how) # join columns schema @@ -2161,7 +2161,7 @@ def _align_both_axes( columns, lcol_indexer, rcol_indexer = self.column_labels, None, None else: columns, lcol_indexer, rcol_indexer = self.column_labels.join( - other.column_labels, how="outer", return_indexers=True + other.column_labels, how=how, return_indexers=True ) lcol_indexer = ( lcol_indexer if (lcol_indexer is not None) else range(len(columns)) @@ -2183,11 +2183,11 @@ def _align_both_axes( left_inputs = [left_input_lookup(i) for i in lcol_indexer] right_inputs = [righ_input_lookup(i) for i in rcol_indexer] 
- return aligned_block, columns, tuple(zip(left_inputs, right_inputs)) + return aligned_block, columns, tuple(zip(left_inputs, right_inputs)) # type: ignore def _align_axis_0( self, other: Block, how: str - ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.Expression, ex.Expression]]]: + ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.DerefOp, ex.DerefOp]]]: assert len(other.value_columns) == 1 aligned_block, (get_column_left, get_column_right) = self.join(other, how=how) @@ -2203,7 +2203,7 @@ def _align_axis_0( def _align_series_block_axis_1( self, other: Block, how: str - ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.Expression, ex.Expression]]]: + ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.RefOrConstant, ex.RefOrConstant]]]: assert len(other.value_columns) == 1 if other._transpose_cache is None: raise ValueError( @@ -2244,11 +2244,11 @@ def _align_series_block_axis_1( left_inputs = [left_input_lookup(i) for i in lcol_indexer] right_inputs = [righ_input_lookup(i) for i in rcol_indexer] - return aligned_block, columns, tuple(zip(left_inputs, right_inputs)) + return aligned_block, columns, tuple(zip(left_inputs, right_inputs)) # type: ignore def _align_pd_series_axis_1( self, other: pd.Series, how: str - ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.Expression, ex.Expression]]]: + ) -> Tuple[Block, pd.Index, Sequence[Tuple[ex.RefOrConstant, ex.RefOrConstant]]]: if self.column_labels.equals(other.index): columns, lcol_indexer, rcol_indexer = self.column_labels, None, None else: @@ -2275,7 +2275,7 @@ def _align_pd_series_axis_1( left_inputs = [left_input_lookup(i) for i in lcol_indexer] right_inputs = [righ_input_lookup(i) for i in rcol_indexer] - return self, columns, tuple(zip(left_inputs, right_inputs)) + return self, columns, tuple(zip(left_inputs, right_inputs)) # type: ignore def _apply_binop( self, diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 2d561657cb..9173bebfc4 100644 --- a/bigframes/core/expression.py +++ 
b/bigframes/core/expression.py @@ -420,3 +420,6 @@ def deterministic(self) -> bool: return ( all(input.deterministic for input in self.inputs) and self.op.deterministic ) + + +RefOrConstant = Union[DerefOp, ScalarConstantExpression] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 7f60f1c769..6c866ad4b5 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1473,6 +1473,48 @@ def cov(self, *, numeric_only: bool = False) -> DataFrame: return result + def corrwith( + self, + other: typing.Union[DataFrame, bigframes.series.Series], + *, + numeric_only: bool = False, + ): + other_frame = other if isinstance(other, DataFrame) else other.to_frame() + if numeric_only: + l_frame = self._drop_non_numeric() + r_frame = other_frame._drop_non_numeric() + else: + l_frame = self._raise_on_non_numeric("corrwith") + r_frame = other_frame._raise_on_non_numeric("corrwith") + + l_block = l_frame.astype(bigframes.dtypes.FLOAT_DTYPE)._block + r_block = r_frame.astype(bigframes.dtypes.FLOAT_DTYPE)._block + + if isinstance(other, DataFrame): + block, labels, expr_pairs = l_block._align_both_axes(r_block, how="inner") + else: + assert isinstance(other, bigframes.series.Series) + block, labels, expr_pairs = l_block._align_axis_0(r_block, how="inner") + + na_cols = l_block.column_labels.join( + r_block.column_labels, how="outer" + ).difference(labels) + + block, _ = block.aggregate( + aggregations=tuple( + ex.BinaryAggregation(agg_ops.CorrOp(), left_ex, right_ex) + for left_ex, right_ex in expr_pairs + ), + column_labels=labels, + ) + block = block.project_exprs( + (ex.const(float("nan")),) * len(na_cols), labels=na_cols + ) + block = block.transpose( + original_row_index=pandas.Index([None]), single_row_mode=True + ) + return bigframes.pandas.Series(block) + def to_arrow( self, *, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 93c865536c..4266cdba88 100644 --- a/tests/system/small/test_dataframe.py +++ 
b/tests/system/small/test_dataframe.py @@ -2246,6 +2246,72 @@ def test_cov_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only): ) +def test_df_corrwith_df(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + l_cols = ["int64_col", "float64_col", "int64_too"] + r_cols = ["int64_too", "float64_col"] + + bf_result = scalars_df[l_cols].corrwith(scalars_df[r_cols]).to_pandas() + pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_cols]) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_df_corrwith_df_numeric_only(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + l_cols = ["int64_col", "float64_col", "int64_too", "string_col"] + r_cols = ["int64_too", "float64_col", "bool_col"] + + bf_result = ( + scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=True).to_pandas() + ) + pd_result = scalars_pandas_df[l_cols].corrwith( + scalars_pandas_df[r_cols], numeric_only=True + ) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. 
+ pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_df_corrwith_df_non_numeric_error(scalars_dfs): + scalars_df, _ = scalars_dfs + + l_cols = ["int64_col", "float64_col", "int64_too", "string_col"] + r_cols = ["int64_too", "float64_col", "bool_col"] + + with pytest.raises(NotImplementedError): + scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=False) + + +@skip_legacy_pandas +def test_df_corrwith_series(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + l_cols = ["int64_col", "float64_col", "int64_too"] + r_col = "float64_col" + + bf_result = scalars_df[l_cols].corrwith(scalars_df[r_col]).to_pandas() + pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_col]) + + # BigFrames and Pandas differ in their data type handling: + # - Column types: BigFrames uses Float64, Pandas uses float64. + # - Index types: BigFrames uses strign, Pandas uses object. + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + @pytest.mark.parametrize( ("op"), [ diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index bf4d2f2d0c..f5aa23d00b 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4146,6 +4146,47 @@ def cov(self, *, numeric_only) -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def corrwith( + self, + other, + *, + numeric_only: bool = False, + ): + """ + Compute pairwise correlation. + + Pairwise correlation is computed between rows or columns of + DataFrame with rows or columns of Series or DataFrame. DataFrames + are first aligned along both axes before computing the + correlations. 
+ + **Examples:** + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> index = ["a", "b", "c", "d", "e"] + >>> columns = ["one", "two", "three", "four"] + >>> df1 = bpd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) + >>> df2 = bpd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns) + >>> df1.corrwith(df2) + one 1.0 + two 1.0 + three 1.0 + four 1.0 + dtype: Float64 + + Args: + other (DataFrame, Series): + Object with which to compute correlations. + + numeric_only (bool, default False): + Include only `float`, `int` or `boolean` data. + + Returns: + bigframes.pandas.Series: Pairwise correlations. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def update( self, other, join: str = "left", overwrite: bool = True, filter_func=None ) -> DataFrame: From bd3f584a7eab5d01dedebb7ca2485942ef5b5ebe Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 24 Jan 2025 14:24:21 -0800 Subject: [PATCH 08/11] feat: (df|s).hist(), (df|s).line(), (df|s).area(), (df|s).bar(), df.scatter() (#1320) --- bigframes/dataframe.py | 50 +++++++++ bigframes/series.py | 50 +++++++-- .../system/small/operations/test_plotting.py | 102 +++++++++++++++--- 3 files changed, 180 insertions(+), 22 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 6c866ad4b5..552cd0084c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4313,6 +4313,56 @@ def get_right_id(id): def plot(self): return plotting.PlotAccessor(self) + def hist( + self, by: typing.Optional[typing.Sequence[str]] = None, bins: int = 10, **kwargs + ): + return self.plot.hist(by=by, bins=bins, **kwargs) + + hist.__doc__ = inspect.getdoc(plotting.PlotAccessor.hist) + + def line( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + **kwargs, + ): + return self.plot.line(x=x, y=y, **kwargs) + + line.__doc__ = 
inspect.getdoc(plotting.PlotAccessor.line) + + def area( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + stacked: bool = True, + **kwargs, + ): + return self.plot.area(x=x, y=y, stacked=stacked, **kwargs) + + area.__doc__ = inspect.getdoc(plotting.PlotAccessor.area) + + def bar( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + **kwargs, + ): + return self.plot.bar(x=x, y=y, **kwargs) + + bar.__doc__ = inspect.getdoc(plotting.PlotAccessor.bar) + + def scatter( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + s: typing.Union[typing.Hashable, typing.Sequence[typing.Hashable]] = None, + c: typing.Union[typing.Hashable, typing.Sequence[typing.Hashable]] = None, + **kwargs, + ): + return self.plot.scatter(x=x, y=y, s=s, c=c, **kwargs) + + scatter.__doc__ = inspect.getdoc(plotting.PlotAccessor.scatter) + def __matmul__(self, other) -> DataFrame: return self.dot(other) diff --git a/bigframes/series.py b/bigframes/series.py index 46847996f1..e705a97fa9 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1984,16 +1984,48 @@ def __array_ufunc__( return NotImplemented - # Keep this at the bottom of the Series class to avoid - # confusing type checker by overriding str - @property - def str(self) -> strings.StringMethods: - return strings.StringMethods(self._block) - @property def plot(self): return plotting.PlotAccessor(self) + def hist( + self, by: typing.Optional[typing.Sequence[str]] = None, bins: int = 10, **kwargs + ): + return self.plot.hist(by=by, bins=bins, **kwargs) + + hist.__doc__ = inspect.getdoc(plotting.PlotAccessor.hist) + + def line( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + **kwargs, + ): + return self.plot.line(x=x, y=y, **kwargs) + + line.__doc__ = inspect.getdoc(plotting.PlotAccessor.line) + + def area( + self, + x: 
typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + stacked: bool = True, + **kwargs, + ): + return self.plot.area(x=x, y=y, stacked=stacked, **kwargs) + + area.__doc__ = inspect.getdoc(plotting.PlotAccessor.area) + + def bar( + self, + x: typing.Optional[typing.Hashable] = None, + y: typing.Optional[typing.Hashable] = None, + **kwargs, + ): + return self.plot.bar(x=x, y=y, **kwargs) + + bar.__doc__ = inspect.getdoc(plotting.PlotAccessor.bar) + def _slice( self, start: typing.Optional[int] = None, @@ -2022,6 +2054,12 @@ def _cached(self, *, force: bool = True, session_aware: bool = True) -> Series: self._block.cached(force=force, session_aware=session_aware) return self + # Keep this at the bottom of the Series class to avoid + # confusing type checker by overriding str + @property + def str(self) -> strings.StringMethods: + return strings.StringMethods(self._block) + def _is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence]: return pandas.api.types.is_list_like(obj) diff --git a/tests/system/small/operations/test_plotting.py b/tests/system/small/operations/test_plotting.py index 3624232ea0..c2f3ba423f 100644 --- a/tests/system/small/operations/test_plotting.py +++ b/tests/system/small/operations/test_plotting.py @@ -34,10 +34,20 @@ def _check_legend_labels(ax, labels): assert label == e -def test_series_hist_bins(scalars_dfs): +@pytest.mark.parametrize( + ("alias"), + [ + pytest.param(True), + pytest.param(False), + ], +) +def test_series_hist_bins(scalars_dfs, alias): scalars_df, scalars_pandas_df = scalars_dfs bins = 5 - ax = scalars_df["int64_col"].plot.hist(bins=bins) + if alias: + ax = scalars_df["int64_col"].hist(bins=bins) + else: + ax = scalars_df["int64_col"].plot.hist(bins=bins) pd_ax = scalars_pandas_df["int64_col"].plot.hist(bins=bins) # Compares axis values and height between bigframes and pandas histograms. 
@@ -49,11 +59,21 @@ def test_series_hist_bins(scalars_dfs): assert ax.patches[i]._height == pd_ax.patches[i]._height -def test_dataframes_hist_bins(scalars_dfs): +@pytest.mark.parametrize( + ("alias"), + [ + pytest.param(True), + pytest.param(False), + ], +) +def test_dataframes_hist_bins(scalars_dfs, alias): scalars_df, scalars_pandas_df = scalars_dfs bins = 7 columns = ["int64_col", "int64_too", "float64_col"] - ax = scalars_df[columns].plot.hist(bins=bins) + if alias: + ax = scalars_df[columns].hist(bins=bins) + else: + ax = scalars_df[columns].plot.hist(bins=bins) pd_ax = scalars_pandas_df[columns].plot.hist(bins=bins) # Compares axis values and height between bigframes and pandas histograms. @@ -171,10 +191,25 @@ def test_hist_kwargs_ticks_props(scalars_dfs): tm.assert_almost_equal(ylabels[i].get_rotation(), pd_ylables[i].get_rotation()) -def test_line(scalars_dfs): +@pytest.mark.parametrize( + ("col_names", "alias"), + [ + pytest.param( + ["int64_col", "float64_col", "int64_too", "bool_col"], True, id="df_alias" + ), + pytest.param( + ["int64_col", "float64_col", "int64_too", "bool_col"], False, id="df" + ), + pytest.param(["int64_col"], True, id="series_alias"), + pytest.param(["int64_col"], False, id="series"), + ], +) +def test_line(scalars_dfs, col_names, alias): scalars_df, scalars_pandas_df = scalars_dfs - col_names = ["int64_col", "float64_col", "int64_too", "bool_col"] - ax = scalars_df[col_names].plot.line() + if alias: + ax = scalars_df[col_names].line() + else: + ax = scalars_df[col_names].plot.line() pd_ax = scalars_pandas_df[col_names].plot.line() tm.assert_almost_equal(ax.get_xticks(), pd_ax.get_xticks()) tm.assert_almost_equal(ax.get_yticks(), pd_ax.get_yticks()) @@ -183,10 +218,21 @@ def test_line(scalars_dfs): tm.assert_almost_equal(line.get_data()[1], pd_line.get_data()[1]) -def test_area(scalars_dfs): +@pytest.mark.parametrize( + ("col_names", "alias"), + [ + pytest.param(["int64_col", "float64_col", "int64_too"], True, id="df_alias"), + 
pytest.param(["int64_col", "float64_col", "int64_too"], False, id="df"), + pytest.param(["int64_col"], True, id="series_alias"), + pytest.param(["int64_col"], False, id="series"), + ], +) +def test_area(scalars_dfs, col_names, alias): scalars_df, scalars_pandas_df = scalars_dfs - col_names = ["int64_col", "float64_col", "int64_too"] - ax = scalars_df[col_names].plot.area(stacked=False) + if alias: + ax = scalars_df[col_names].area(stacked=False) + else: + ax = scalars_df[col_names].plot.area(stacked=False) pd_ax = scalars_pandas_df[col_names].plot.area(stacked=False) tm.assert_almost_equal(ax.get_xticks(), pd_ax.get_xticks()) tm.assert_almost_equal(ax.get_yticks(), pd_ax.get_yticks()) @@ -195,10 +241,21 @@ def test_area(scalars_dfs): tm.assert_almost_equal(line.get_data()[1], pd_line.get_data()[1]) -def test_bar(scalars_dfs): +@pytest.mark.parametrize( + ("col_names", "alias"), + [ + pytest.param(["int64_col", "float64_col", "int64_too"], True, id="df_alias"), + pytest.param(["int64_col", "float64_col", "int64_too"], False, id="df"), + pytest.param(["int64_col"], True, id="series_alias"), + pytest.param(["int64_col"], False, id="series"), + ], +) +def test_bar(scalars_dfs, col_names, alias): scalars_df, scalars_pandas_df = scalars_dfs - col_names = ["int64_col", "float64_col", "int64_too"] - ax = scalars_df[col_names].plot.bar() + if alias: + ax = scalars_df[col_names].bar() + else: + ax = scalars_df[col_names].plot.bar() pd_ax = scalars_pandas_df[col_names].plot.bar() tm.assert_almost_equal(ax.get_xticks(), pd_ax.get_xticks()) tm.assert_almost_equal(ax.get_yticks(), pd_ax.get_yticks()) @@ -207,10 +264,23 @@ def test_bar(scalars_dfs): tm.assert_almost_equal(line.get_data()[1], pd_line.get_data()[1]) -def test_scatter(scalars_dfs): +@pytest.mark.parametrize( + ("col_names", "alias"), + [ + pytest.param( + ["int64_col", "float64_col", "int64_too", "bool_col"], True, id="df_alias" + ), + pytest.param( + ["int64_col", "float64_col", "int64_too", "bool_col"], False, 
id="df" + ), + ], +) +def test_scatter(scalars_dfs, col_names, alias): scalars_df, scalars_pandas_df = scalars_dfs - col_names = ["int64_col", "float64_col", "int64_too", "bool_col"] - ax = scalars_df[col_names].plot.scatter(x="int64_col", y="float64_col") + if alias: + ax = scalars_df[col_names].scatter(x="int64_col", y="float64_col") + else: + ax = scalars_df[col_names].plot.scatter(x="int64_col", y="float64_col") pd_ax = scalars_pandas_df[col_names].plot.scatter(x="int64_col", y="float64_col") tm.assert_almost_equal(ax.get_xticks(), pd_ax.get_xticks()) tm.assert_almost_equal(ax.get_yticks(), pd_ax.get_yticks()) From fe4fbb4dea3f6d0751b8db281cc2f3192fced71d Mon Sep 17 00:00:00 2001 From: jialuoo Date: Fri, 24 Jan 2025 17:06:52 -0800 Subject: [PATCH 09/11] refactor: rename the bigframes function files (#1312) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: rename the bigframes function files * fix the import * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot --- bigframes/dataframe.py | 2 +- ...function_client.py => _function_client.py} | 4 +- ...nction_session.py => _function_session.py} | 5 +- .../{remote_function.py => function.py} | 8 +-- ...ction_template.py => function_template.py} | 0 bigframes/pandas/__init__.py | 4 +- bigframes/session/__init__.py | 14 +++--- tests/system/large/test_remote_function.py | 10 ++-- tests/system/small/test_remote_function.py | 50 +++++++++---------- tests/system/utils.py | 4 +- ..._template.py => test_function_template.py} | 12 ++--- tests/unit/polars_session.py | 2 +- tests/unit/test_remote_function.py | 14 ++---- 13 files changed, 60 insertions(+), 69 deletions(-) rename bigframes/functions/{_remote_function_client.py => _function_client.py} (99%) rename bigframes/functions/{_remote_function_session.py => _function_session.py} (99%) rename 
bigframes/functions/{remote_function.py => function.py} (96%) rename bigframes/functions/{remote_function_template.py => function_template.py} (100%) rename tests/unit/functions/{test_remote_function_template.py => test_function_template.py} (92%) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 552cd0084c..fec53dbf01 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -3996,7 +3996,7 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): # Early check whether the dataframe dtypes are currently supported # in the remote function # NOTE: Keep in sync with the value converters used in the gcf code - # generated in remote_function_template.py + # generated in function_template.py remote_function_supported_dtypes = ( bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE, diff --git a/bigframes/functions/_remote_function_client.py b/bigframes/functions/_function_client.py similarity index 99% rename from bigframes/functions/_remote_function_client.py rename to bigframes/functions/_function_client.py index 0d0cc08128..104119a510 100644 --- a/bigframes/functions/_remote_function_client.py +++ b/bigframes/functions/_function_client.py @@ -29,7 +29,7 @@ from bigframes_vendored import constants import requests -import bigframes.functions.remote_function_template +import bigframes.functions.function_template as bff_template if TYPE_CHECKING: from bigframes.session import Session @@ -215,7 +215,7 @@ def generate_cloud_function_code( f.write("\n".join(package_requirements)) # main.py - entry_point = bigframes.functions.remote_function_template.generate_cloud_function_main_code( + entry_point = bff_template.generate_cloud_function_main_code( def_, directory, input_types=input_types, diff --git a/bigframes/functions/_remote_function_session.py b/bigframes/functions/_function_session.py similarity index 99% rename from bigframes/functions/_remote_function_session.py rename to bigframes/functions/_function_session.py index 
d6b729bf6e..00626a252f 100644 --- a/bigframes/functions/_remote_function_session.py +++ b/bigframes/functions/_function_session.py @@ -52,8 +52,7 @@ import pandas -from . import _remote_function_client as rf_client -from . import _utils +from . import _function_client, _utils class RemoteFunctionSession: @@ -468,7 +467,7 @@ def wrapper(func): signature, input_types, output_type # type: ignore ) - remote_function_client = rf_client.RemoteFunctionClient( + remote_function_client = _function_client.RemoteFunctionClient( dataset_ref.project, cloud_function_region, cloud_functions_client, diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/function.py similarity index 96% rename from bigframes/functions/remote_function.py rename to bigframes/functions/function.py index 533c93e7cb..57df8f9407 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/function.py @@ -35,9 +35,9 @@ import bigframes.core.compile.ibis_types import bigframes.dtypes import bigframes.exceptions as bfe -import bigframes.functions.remote_function_template +import bigframes.functions.function_template -from . import _remote_function_session as rf_session +from . import _function_session as bff_session from . 
import _utils logger = logging.getLogger(__name__) @@ -120,11 +120,11 @@ def get_routine_reference( def remote_function(*args, **kwargs): - remote_function_session = rf_session.RemoteFunctionSession() + remote_function_session = bff_session.RemoteFunctionSession() return remote_function_session.remote_function(*args, **kwargs) -remote_function.__doc__ = rf_session.RemoteFunctionSession.remote_function.__doc__ +remote_function.__doc__ = bff_session.RemoteFunctionSession.remote_function.__doc__ def read_gbq_function( diff --git a/bigframes/functions/remote_function_template.py b/bigframes/functions/function_template.py similarity index 100% rename from bigframes/functions/remote_function_template.py rename to bigframes/functions/function_template.py diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 395b573916..c744d3b945 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -34,7 +34,7 @@ import bigframes.core.tools import bigframes.dataframe import bigframes.enums -import bigframes.functions._utils as functions_utils +import bigframes.functions._utils as bff_utils from bigframes.pandas.io.api import ( from_glob_path, read_csv, @@ -222,7 +222,7 @@ def clean_up_by_session_id( session.bqclient, dataset, session_id ) - functions_utils._clean_up_by_session_id( + bff_utils._clean_up_by_session_id( session.bqclient, session.cloudfunctionsclient, dataset, session_id ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 1d85967729..02f79a7d99 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -67,8 +67,8 @@ import bigframes.dtypes import bigframes.exceptions import bigframes.exceptions as bfe -import bigframes.functions._remote_function_session as bigframes_rf_session -import bigframes.functions.remote_function as bigframes_rf +import bigframes.functions._function_session as bff_session +import bigframes.functions.function as bff import 
bigframes.session._io.bigquery as bf_io_bigquery import bigframes.session.clients import bigframes.session.executor @@ -245,7 +245,7 @@ def __init__( ) self._metrics = bigframes.session.metrics.ExecutionMetrics() - self._remote_function_session = bigframes_rf_session.RemoteFunctionSession() + self._function_session = bff_session.RemoteFunctionSession() self._temp_storage_manager = ( bigframes.session.temp_storage.TemporaryGbqStorageManager( self._clients_provider.bqclient, @@ -377,9 +377,9 @@ def close(self): if temp_storage_manager: self._temp_storage_manager.clean_up_tables() - remote_function_session = getattr(self, "_remote_function_session", None) + remote_function_session = getattr(self, "_function_session", None) if remote_function_session: - self._remote_function_session.clean_up( + self._function_session.clean_up( self.bqclient, self.cloudfunctionsclient, self.session_id ) @@ -1380,7 +1380,7 @@ def remote_function( `bigframes_remote_function` - The bigquery remote function capable of calling into `bigframes_cloud_function`. """ - return self._remote_function_session.remote_function( + return self._function_session.remote_function( input_types, output_type, session=self, @@ -1556,7 +1556,7 @@ def read_gbq_function( not including the `bigframes_cloud_function` property. 
""" - return bigframes_rf.read_gbq_function( + return bff.read_gbq_function( function_name=function_name, session=self, is_row_processor=is_row_processor, diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index d0eb6c1904..f226143b50 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -32,7 +32,7 @@ import bigframes.dataframe import bigframes.dtypes import bigframes.exceptions -import bigframes.functions._utils as functions_utils +import bigframes.functions._utils as bff_utils import bigframes.pandas as bpd import bigframes.series from tests.system.utils import ( @@ -633,11 +633,9 @@ def add_one(x): add_one_uniq, add_one_uniq_dir = make_uniq_udf(add_one) # Expected cloud function name for the unique udf - package_requirements = functions_utils._get_updated_package_requirements() - add_one_uniq_hash = functions_utils._get_hash( - add_one_uniq, package_requirements - ) - add_one_uniq_cf_name = functions_utils.get_cloud_function_name( + package_requirements = bff_utils._get_updated_package_requirements() + add_one_uniq_hash = bff_utils._get_hash(add_one_uniq, package_requirements) + add_one_uniq_cf_name = bff_utils.get_cloud_function_name( add_one_uniq_hash, session.session_id ) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index c3f3890459..0dc8960f62 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -25,8 +25,8 @@ import bigframes import bigframes.dtypes import bigframes.exceptions -from bigframes.functions import _utils as rf_utils -from bigframes.functions import remote_function as rf +from bigframes.functions import _utils as bff_utils +from bigframes.functions import function as bff from tests.system.utils import assert_pandas_df_equal _prefixer = test_utils.prefixer.Prefixer("bigframes", "") @@ -94,12 +94,12 @@ def get_rf_name(func, 
package_requirements=None, is_row_processor=False): """Get a remote function name for testing given a udf.""" # Augment user package requirements with any internal package # requirements - package_requirements = rf_utils._get_updated_package_requirements( + package_requirements = bff_utils._get_updated_package_requirements( package_requirements, is_row_processor ) # Compute a unique hash representing the user code - function_hash = rf_utils._get_hash(func, package_requirements) + function_hash = bff_utils._get_hash(func, package_requirements) return f"bigframes_{function_hash}" @@ -117,7 +117,7 @@ def test_remote_function_direct_no_session_param( def square(x): return x * x - square = rf.remote_function( + square = bff.remote_function( int, int, bigquery_client=bigquery_client, @@ -176,7 +176,7 @@ def test_remote_function_direct_no_session_param_location_specified( def square(x): return x * x - square = rf.remote_function( + square = bff.remote_function( int, int, bigquery_client=bigquery_client, @@ -235,7 +235,7 @@ def square(x): ValueError, match=re.escape("The location does not match BigQuery connection location:"), ): - rf.remote_function( + bff.remote_function( int, int, bigquery_client=bigquery_client, @@ -263,7 +263,7 @@ def test_remote_function_direct_no_session_param_location_project_specified( def square(x): return x * x - square = rf.remote_function( + square = bff.remote_function( int, int, bigquery_client=bigquery_client, @@ -324,7 +324,7 @@ def square(x): "The project_id does not match BigQuery connection gcp_project_id:" ), ): - rf.remote_function( + bff.remote_function( int, int, bigquery_client=bigquery_client, @@ -346,7 +346,7 @@ def test_remote_function_direct_session_param( def square(x): return x * x - square = rf.remote_function( + square = bff.remote_function( int, int, session=session_with_bq_connection, @@ -636,7 +636,7 @@ def add_one(x): def test_read_gbq_function_detects_invalid_function(session, dataset_id): dataset_ref = 
bigquery.DatasetReference.from_string(dataset_id) with pytest.raises(ValueError) as e: - rf.read_gbq_function( + bff.read_gbq_function( str(dataset_ref.routine("not_a_function")), session=session, ) @@ -658,7 +658,7 @@ def test_read_gbq_function_like_original( def square1(x): return x * x - square1 = rf.remote_function( + square1 = bff.remote_function( [int], int, bigquery_client=bigquery_client, @@ -674,7 +674,7 @@ def square1(x): # Function should still work normally. assert square1(2) == 4 - square2 = rf.read_gbq_function( + square2 = bff.read_gbq_function( function_name=square1.bigframes_remote_function, # type: ignore session=session, ) @@ -745,7 +745,7 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): for routine in (sql_routine, js_routine): # Create the routine in BigQuery and read it back using read_gbq_function. bigquery_client.create_routine(routine, exists_ok=True) - square = rf.read_gbq_function( + square = bff.read_gbq_function( str(routine.reference), session=session, ) @@ -757,7 +757,7 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): src = {"x": [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]} - routine_ref_str = rf_utils.routine_ref_to_string_for_query(routine.reference) + routine_ref_str = bff_utils.routine_ref_to_string_for_query(routine.reference) direct_sql = " UNION ALL ".join( [f"SELECT {x} AS x, {routine_ref_str}({x}) AS y" for x in src["x"]] ) @@ -818,7 +818,7 @@ def test_read_gbq_function_requires_explicit_types( bigquery_client.create_routine(only_arg_type_specified, exists_ok=True) bigquery_client.create_routine(neither_type_specified, exists_ok=True) - rf.read_gbq_function( + bff.read_gbq_function( str(both_types_specified.reference), session=session, ) @@ -826,17 +826,17 @@ def test_read_gbq_function_requires_explicit_types( bigframes.exceptions.UnknownDataTypeWarning, match="missing input data types.*assume default data type", ): - rf.read_gbq_function( + bff.read_gbq_function( 
str(only_return_type_specified.reference), session=session, ) with pytest.raises(ValueError): - rf.read_gbq_function( + bff.read_gbq_function( str(only_arg_type_specified.reference), session=session, ) with pytest.raises(ValueError): - rf.read_gbq_function( + bff.read_gbq_function( str(neither_type_specified.reference), session=session, ) @@ -878,13 +878,13 @@ def test_read_gbq_function_respects_python_output_type( body="TO_JSON_STRING([x, x+1, x+2])", arguments=[arg], return_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.STRING), - description=rf_utils.get_bigframes_metadata(python_output_type=array_type), + description=bff_utils.get_bigframes_metadata(python_output_type=array_type), type_=bigquery.RoutineType.SCALAR_FUNCTION, ) # Create the routine in BigQuery and read it back using read_gbq_function. bigquery_client.create_routine(sql_routine, exists_ok=True) - func = rf.read_gbq_function(str(sql_routine.reference), session=session) + func = bff.read_gbq_function(str(sql_routine.reference), session=session) # test that the function works as expected s = bigframes.series.Series([1, 10, 100]) @@ -920,7 +920,7 @@ def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( body="x+1", arguments=[arg], return_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64), - description=rf_utils.get_bigframes_metadata(python_output_type=array_type), + description=bff_utils.get_bigframes_metadata(python_output_type=array_type), type_=bigquery.RoutineType.SCALAR_FUNCTION, ) @@ -933,7 +933,7 @@ def test_read_gbq_function_supports_python_output_type_only_for_string_outputs( TypeError, match="An explicit output_type should be provided only for a BigQuery function with STRING output.", ): - rf.read_gbq_function(str(sql_routine.reference), session=session) + bff.read_gbq_function(str(sql_routine.reference), session=session) @pytest.mark.parametrize( @@ -959,13 +959,13 @@ def test_read_gbq_function_supported_python_output_type( 
body="CAST(x AS STRING)", arguments=[arg], return_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.STRING), - description=rf_utils.get_bigframes_metadata(python_output_type=array_type), + description=bff_utils.get_bigframes_metadata(python_output_type=array_type), type_=bigquery.RoutineType.SCALAR_FUNCTION, ) # Create the routine in BigQuery and read it back using read_gbq_function. bigquery_client.create_routine(sql_routine, exists_ok=True) - rf.read_gbq_function(str(sql_routine.reference), session=session) + bff.read_gbq_function(str(sql_routine.reference), session=session) @pytest.mark.flaky(retries=2, delay=120) diff --git a/tests/system/utils.py b/tests/system/utils.py index 83d0e683bc..7c12c8033a 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -26,7 +26,7 @@ import pyarrow as pa # type: ignore import pytest -import bigframes.functions._utils as functions_utils +import bigframes.functions._utils as bff_utils import bigframes.pandas ML_REGRESSION_METRICS = [ @@ -351,7 +351,7 @@ def get_cloud_functions( not name or not name_prefix ), "Either 'name' or 'name_prefix' can be passed but not both." 
- _, location = functions_utils.get_remote_function_locations(location) + _, location = bff_utils.get_remote_function_locations(location) parent = f"projects/{project}/locations/{location}" request = functions_v2.ListFunctionsRequest(parent=parent) page_result = functions_client.list_functions(request=request) diff --git a/tests/unit/functions/test_remote_function_template.py b/tests/unit/functions/test_function_template.py similarity index 92% rename from tests/unit/functions/test_remote_function_template.py rename to tests/unit/functions/test_function_template.py index 70b033d938..11db01ed9e 100644 --- a/tests/unit/functions/test_remote_function_template.py +++ b/tests/unit/functions/test_function_template.py @@ -20,7 +20,7 @@ import pytest import bigframes.dtypes -import bigframes.functions.remote_function_template as remote_function_template +import bigframes.functions.function_template as bff_template HELLO_WORLD_BASE64_BYTES = b"SGVsbG8sIFdvcmxkIQ==" HELLO_WORLD_BASE64_STR = "SGVsbG8sIFdvcmxkIQ==" @@ -59,7 +59,7 @@ ), ) def test_convert_from_bq_json(type_, json_value, expected): - got = remote_function_template.convert_from_bq_json(type_, json_value) + got = bff_template.convert_from_bq_json(type_, json_value) assert got == expected @@ -76,7 +76,7 @@ def test_convert_from_bq_json(type_, json_value, expected): ], ) def test_convert_from_bq_json_none(type_): - got = remote_function_template.convert_from_bq_json(type_, None) + got = bff_template.convert_from_bq_json(type_, None) assert got is None @@ -113,7 +113,7 @@ def test_convert_from_bq_json_none(type_): ), ) def test_convert_to_bq_json(type_, value, expected): - got = remote_function_template.convert_to_bq_json(type_, value) + got = bff_template.convert_to_bq_json(type_, value) assert got == expected @@ -130,7 +130,7 @@ def test_convert_to_bq_json(type_, value, expected): ], ) def test_convert_to_bq_json_none(type_): - got = remote_function_template.convert_to_bq_json(type_, None) + got = 
bff_template.convert_to_bq_json(type_, None) assert got is None @@ -176,7 +176,7 @@ def test_convert_to_bq_json_none(type_): ), ) def test_get_pd_series(row_json, expected): - got = remote_function_template.get_pd_series(row_json) + got = bff_template.get_pd_series(row_json) pandas.testing.assert_series_equal(got, expected) diff --git a/tests/unit/polars_session.py b/tests/unit/polars_session.py index dfb1f5bfa6..cffd8ff7ca 100644 --- a/tests/unit/polars_session.py +++ b/tests/unit/polars_session.py @@ -82,7 +82,7 @@ def __init__(self): self._allow_ambiguity = False # type: ignore self._default_index_type = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64 self._metrics = bigframes.session.metrics.ExecutionMetrics() - self._remote_function_session = None # type: ignore + self._function_session = None # type: ignore self._temp_storage_manager = None # type: ignore self._executor = TestExecutor() self._loader = None # type: ignore diff --git a/tests/unit/test_remote_function.py b/tests/unit/test_remote_function.py index a8c4f2ac2e..413a694680 100644 --- a/tests/unit/test_remote_function.py +++ b/tests/unit/test_remote_function.py @@ -21,7 +21,7 @@ import bigframes.core.compile.ibis_types import bigframes.dtypes -import bigframes.functions.remote_function +import bigframes.functions.function as bff import bigframes.series from tests.unit import resources @@ -42,9 +42,7 @@ def test_series_input_types_to_str(series_type): """Check that is_row_processor=True uses str as the input type to serialize a row.""" session = resources.create_bigquery_session() - remote_function_decorator = bigframes.functions.remote_function.remote_function( - session=session - ) + remote_function_decorator = bff.remote_function(session=session) with pytest.warns( bigframes.exceptions.PreviewWarning, @@ -75,9 +73,7 @@ def test_supported_types_correspond(): def test_missing_input_types(): session = resources.create_bigquery_session() - remote_function_decorator = 
bigframes.functions.remote_function.remote_function( - session=session - ) + remote_function_decorator = bff.remote_function(session=session) def function_without_parameter_annotations(myparam) -> str: return str(myparam) @@ -93,9 +89,7 @@ def function_without_parameter_annotations(myparam) -> str: def test_missing_output_type(): session = resources.create_bigquery_session() - remote_function_decorator = bigframes.functions.remote_function.remote_function( - session=session - ) + remote_function_decorator = bff.remote_function(session=session) def function_without_return_annotation(myparam: int): return str(myparam) From 39019510d0c2758096589ecd0d83175f313a8cf5 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 27 Jan 2025 13:48:58 -0800 Subject: [PATCH 10/11] chore: define timedelta type and to_timedelta function (#1317) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * define timedelta type and to_timedelta function * remove unnecessary file * remove TypeAlias type for 3.9 compatibility * fix mypy * fix lint * move timedelta out of the simple dtype list * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix type casts in tests --------- Co-authored-by: Owl Bot --- bigframes/core/compile/ibis_types.py | 1 + bigframes/core/compile/scalar_op_compiler.py | 7 ++ bigframes/dtypes.py | 4 + bigframes/operations/__init__.py | 3 + bigframes/operations/timedelta_ops.py | 31 ++++++ bigframes/pandas/__init__.py | 2 + bigframes/pandas/core/__init__.py | 13 +++ bigframes/pandas/core/api.py | 17 ++++ bigframes/pandas/core/tools/__init__.py | 13 +++ bigframes/pandas/core/tools/timedeltas.py | 64 ++++++++++++ tests/system/small/test_pandas.py | 67 +++++++++++++ .../pandas/core/tools/timedeltas.py | 99 +++++++++++++++++++ 12 files changed, 321 insertions(+) create mode 100644 bigframes/operations/timedelta_ops.py create mode 100644 
bigframes/pandas/core/__init__.py create mode 100644 bigframes/pandas/core/api.py create mode 100644 bigframes/pandas/core/tools/__init__.py create mode 100644 bigframes/pandas/core/tools/timedeltas.py create mode 100644 third_party/bigframes_vendored/pandas/core/tools/timedeltas.py diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index a0afa29a15..e5d637e426 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -81,6 +81,7 @@ BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = { pandas: ibis for ibis, pandas in BIDIRECTIONAL_MAPPINGS } +BIGFRAMES_TO_IBIS.update({bigframes.dtypes.TIMEDETLA_DTYPE: ibis_dtypes.int64}) IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, bigframes.dtypes.Dtype] = { ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS } diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 2ab10e025d..b42f983619 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1140,6 +1140,13 @@ def to_timestamp_op_impl(x: ibis_types.Value, op: ops.ToTimestampOp): return x.cast(ibis_dtypes.Timestamp(timezone="UTC")) +@scalar_op_compiler.register_unary_op(ops.ToTimedeltaOp, pass_op=True) +def to_timedelta_op_impl(x: ibis_types.Value, op: ops.ToTimedeltaOp): + return ( + typing.cast(ibis_types.NumericValue, x) * UNIT_TO_US_CONVERSION_FACTORS[op.unit] # type: ignore + ).floor() + + @scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True) def remote_function_op_impl(x: ibis_types.Value, op: ops.RemoteFunctionOp): ibis_node = getattr(op.func, "ibis_node", None) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 863615118a..4db124134a 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -55,6 +55,7 @@ TIME_DTYPE = pd.ArrowDtype(pa.time64("us")) DATETIME_DTYPE = pd.ArrowDtype(pa.timestamp("us")) TIMESTAMP_DTYPE = 
pd.ArrowDtype(pa.timestamp("us", tz="UTC")) +TIMEDETLA_DTYPE = pd.ArrowDtype(pa.duration("us")) NUMERIC_DTYPE = pd.ArrowDtype(pa.decimal128(38, 9)) BIGNUMERIC_DTYPE = pd.ArrowDtype(pa.decimal256(76, 38)) # No arrow equivalent @@ -632,6 +633,9 @@ def convert_to_schema_field( return google.cloud.bigquery.SchemaField( name, "RECORD", fields=inner_fields ) + if bigframes_dtype.pyarrow_dtype == pa.duration("us"): + # Timedeltas are represented as integers in microseconds. + return google.cloud.bigquery.SchemaField(name, "INTEGER") raise ValueError( f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}" ) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index e55cbc4925..d8b0447686 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -170,6 +170,7 @@ ) from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op +from bigframes.operations.timedelta_ops import ToTimedeltaOp __all__ = [ # Base ops @@ -240,6 +241,8 @@ "minute_op", "second_op", "normalize_op", + # Timedelta ops + "ToTimedeltaOp", # Datetime ops "date_op", "time_op", diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py new file mode 100644 index 0000000000..0bcd6eb08f --- /dev/null +++ b/bigframes/operations/timedelta_ops.py @@ -0,0 +1,31 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import dataclasses +import typing + +from bigframes import dtypes +from bigframes.operations import base_ops + + +@dataclasses.dataclass(frozen=True) +class ToTimedeltaOp(base_ops.UnaryOp): + name: typing.ClassVar[str] = "to_timedelta" + unit: typing.Literal["us", "ms", "s", "m", "h", "d", "W"] + + def output_type(self, *input_types): + if input_types[0] is not dtypes.INT_DTYPE: + raise TypeError("expected integer input") + return dtypes.TIMEDETLA_DTYPE diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index c744d3b945..4a5e4d4b3a 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -35,6 +35,7 @@ import bigframes.dataframe import bigframes.enums import bigframes.functions._utils as bff_utils +from bigframes.pandas.core.api import to_timedelta from bigframes.pandas.io.api import ( from_glob_path, read_csv, @@ -313,6 +314,7 @@ def reset_session(): "read_pickle", "remote_function", "to_datetime", + "to_timedelta", "from_glob_path", # pandas dtype attributes "NA", diff --git a/bigframes/pandas/core/__init__.py b/bigframes/pandas/core/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/bigframes/pandas/core/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/bigframes/pandas/core/api.py b/bigframes/pandas/core/api.py new file mode 100644 index 0000000000..0f3161afcc --- /dev/null +++ b/bigframes/pandas/core/api.py @@ -0,0 +1,17 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from bigframes.pandas.core.tools.timedeltas import to_timedelta + +__all__ = ["to_timedelta"] diff --git a/bigframes/pandas/core/tools/__init__.py b/bigframes/pandas/core/tools/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/bigframes/pandas/core/tools/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/bigframes/pandas/core/tools/timedeltas.py b/bigframes/pandas/core/tools/timedeltas.py new file mode 100644 index 0000000000..0cedf425fe --- /dev/null +++ b/bigframes/pandas/core/tools/timedeltas.py @@ -0,0 +1,64 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import typing + +from bigframes_vendored.pandas.core.tools import ( + timedeltas as vendored_pandas_timedeltas, +) +import pandas as pd + +from bigframes import operations as ops +from bigframes import series + + +def to_timedelta( + arg: typing.Union[series.Series, str, int, float], + unit: typing.Optional[vendored_pandas_timedeltas.UnitChoices] = None, +) -> typing.Union[series.Series, pd.Timedelta]: + if not isinstance(arg, series.Series): + return pd.to_timedelta(arg, unit) + + canonical_unit = "us" if unit is None else _canonicalize_unit(unit) + return arg._apply_unary_op(ops.ToTimedeltaOp(canonical_unit)) + + +to_timedelta.__doc__ = vendored_pandas_timedeltas.to_timedelta.__doc__ + + +def _canonicalize_unit( + unit: vendored_pandas_timedeltas.UnitChoices, +) -> typing.Literal["us", "ms", "s", "m", "h", "d", "W"]: + if unit in {"w", "W"}: + return "W" + + if unit in {"D", "d", "days", "day"}: + return "d" + + if unit in {"hours", "hour", "hr", "h"}: + return "h" + + if unit in {"m", "minute", "min", "minutes"}: + return "m" + + if unit in {"s", "seconds", "sec", "second"}: + return "s" + + if unit in {"ms", "milliseconds", "millisecond", "milli", "millis"}: + 
return "ms" + + if unit in {"us", "microseconds", "microsecond", "µs", "micro", "micros"}: + return "us" + + raise TypeError(f"Unrecognized unit: {unit}") diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 30ffaa8a7d..e46d073056 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -13,6 +13,7 @@ # limitations under the License. from datetime import datetime +import typing import pandas as pd import pytest @@ -726,3 +727,69 @@ def test_to_datetime_timestamp_inputs(arg, utc, output_in_utc): pd.testing.assert_series_equal( bf_result, pd_result, check_index_type=False, check_names=False ) + + +@pytest.mark.parametrize( + "unit", + [ + "W", + "w", + "D", + "d", + "days", + "day", + "hours", + "hour", + "hr", + "h", + "m", + "minute", + "min", + "minutes", + "s", + "seconds", + "sec", + "second", + "ms", + "milliseconds", + "millisecond", + "milli", + "millis", + "us", + "microseconds", + "microsecond", + "µs", + "micro", + "micros", + ], +) +def test_to_timedelta_with_bf_series(session, unit): + bf_series = bpd.Series([1, 2, 3], session=session) + pd_series = pd.Series([1, 2, 3]) + + actual_result = ( + typing.cast(bpd.Series, bpd.to_timedelta(bf_series, unit)) + .to_pandas() + .astype("timedelta64[ns]") + ) + + expected_result = pd.to_timedelta(pd_series, unit) + pd.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +@pytest.mark.parametrize( + "unit", + ["Y", "M", "whatever"], +) +def test_to_timedelta_with_bf_series_invalid_unit(session, unit): + bf_series = bpd.Series([1, 2, 3], session=session) + + with pytest.raises(TypeError): + bpd.to_timedelta(bf_series, unit) + + +@pytest.mark.parametrize("input", [1, 1.2, "1s"]) +def test_to_timedelta_non_bf_series(input): + assert bpd.to_timedelta(input) == pd.to_timedelta(input) diff --git a/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py 
b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py new file mode 100644 index 0000000000..9442e965fa --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/tools/timedeltas.py @@ -0,0 +1,99 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/v2.2.3/pandas/core/tools/timedeltas.py + +import typing + +from bigframes_vendored import constants +import pandas as pd + +from bigframes import series + +UnitChoices = typing.Literal[ + "W", + "w", + "D", + "d", + "days", + "day", + "hours", + "hour", + "hr", + "h", + "m", + "minute", + "min", + "minutes", + "s", + "seconds", + "sec", + "second", + "ms", + "milliseconds", + "millisecond", + "milli", + "millis", + "us", + "microseconds", + "microsecond", + "µs", + "micro", + "micros", +] + + +def to_timedelta( + arg: typing.Union[series.Series, str, int, float], + unit: typing.Optional[UnitChoices] = None, +) -> typing.Union[series.Series, pd.Timedelta]: + """ + Converts a scalar or Series to a timedelta object. + + .. note:: + BigQuery only supports precision up to microseconds (us). Therefore, when working + with timedeltas that have a finer granularity than microseconds, be aware that + the additional precision will not be represented in BigQuery. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Converting a Scalar to timedelta + + >>> scalar = 2 + >>> bpd.to_timedelta(scalar, unit='s') + Timedelta('0 days 00:00:02') + + Converting a Series of integers to a Series of timedeltas + + >>> int_series = bpd.Series([1,2,3]) + >>> bpd.to_timedelta(int_series, unit='s') + 0 0 days 00:00:01 + 1 0 days 00:00:02 + 2 0 days 00:00:03 + dtype: duration[us][pyarrow] + + Args: + arg (int, float, str, Series): + The object to convert to a timedelta + unit (str, default 'us'): + Denotes the unit of the arg for numeric `arg`. Defaults to ``"us"``.
+ + Possible values: + + * 'W' / 'w' + * 'D' / 'd' / 'days' / 'day' + * 'hours' / 'hour' / 'hr' / 'h' + * 'm' / 'minute' / 'min' / 'minutes' + * 's' / 'seconds' / 'sec' / 'second' + * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' + * 'us' / 'microseconds' / 'microsecond' / 'µs' / 'micro' / 'micros' + + Returns: + Union[pandas.Timedelta, bigframes.pandas.Series]: + Return type depends on input + - Series: Series of duration[us][pyarrow] dtype + - scalar: timedelta + + """ + + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From a1cafa239c806b3027852a82dfde1417e27f9fde Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Mon, 27 Jan 2025 16:35:42 -0800 Subject: [PATCH 11/11] chore(main): release 1.34.0 (#1313) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 14 ++++++++++++++ bigframes/version.py | 2 +- third_party/bigframes_vendored/version.py | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d8befd372d..886e4f8921 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,20 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.34.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.33.0...v1.34.0) (2025-01-27) + + +### ⚠ BREAKING CHANGES + +* Enable reading JSON data with `dbjson` extension dtype ([#1139](https://github.com/googleapis/python-bigquery-dataframes/issues/1139)) + +### Features + +* (df|s).hist(), (df|s).line(), (df|s).area(), (df|s).bar(), df.scatter() ([#1320](https://github.com/googleapis/python-bigquery-dataframes/issues/1320)) ([bd3f584](https://github.com/googleapis/python-bigquery-dataframes/commit/bd3f584a7eab5d01dedebb7ca2485942ef5b5ebe)) +* Add DataFrame.corrwith method ([#1315](https://github.com/googleapis/python-bigquery-dataframes/issues/1315))
([b503355](https://github.com/googleapis/python-bigquery-dataframes/commit/b5033559a77a9bc5ffb7dc1e44e02aaaaf1e051e)) +* Add DataFrame.mask method ([#1302](https://github.com/googleapis/python-bigquery-dataframes/issues/1302)) ([8b8155f](https://github.com/googleapis/python-bigquery-dataframes/commit/8b8155fef9c5cd36cfabf728ccebf6a14a1cbbda)) +* Enable reading JSON data with `dbjson` extension dtype ([#1139](https://github.com/googleapis/python-bigquery-dataframes/issues/1139)) ([f672262](https://github.com/googleapis/python-bigquery-dataframes/commit/f6722629fb47eed5befb0ecae2e6b5ec9042d669)) + ## [1.33.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.32.0...v1.33.0) (2025-01-22) diff --git a/bigframes/version.py b/bigframes/version.py index 50dde36b01..1fef294cef 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.33.0" +__version__ = "1.34.0" diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 50dde36b01..1fef294cef 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.33.0" +__version__ = "1.34.0"