diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 8659d83d82..a7805de447 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-22.04
strategy:
matrix:
- python: ['3.9', '3.10', '3.11', '3.12']
+ python: ['3.9', '3.10', '3.11', '3.12', '3.13']
steps:
- name: Checkout
uses: actions/checkout@v4
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 886e4f8921..af87cae3b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,41 @@
[1]: https://pypi.org/project/bigframes/#history
+## [1.35.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.34.0...v1.35.0) (2025-02-04)
+
+
+### Features
+
+* Add Series.keys() ([#1342](https://github.com/googleapis/python-bigquery-dataframes/issues/1342)) ([deb015d](https://github.com/googleapis/python-bigquery-dataframes/commit/deb015dc1276549519d51363501355272f8976d8))
+* Allow `case_when` to change dtypes if case list contains the condition `(True, some_default_value)` ([#1311](https://github.com/googleapis/python-bigquery-dataframes/issues/1311)) ([5c2a2c6](https://github.com/googleapis/python-bigquery-dataframes/commit/5c2a2c6086be20cba7da08ecd37899699aab518f))
+* Support python type as astype arg ([#1316](https://github.com/googleapis/python-bigquery-dataframes/issues/1316)) ([b26e135](https://github.com/googleapis/python-bigquery-dataframes/commit/b26e13570f198ec4d252590a8c07253624db667a))
+* Support time_series_id_col in ARIMAPlus ([#1282](https://github.com/googleapis/python-bigquery-dataframes/issues/1282)) ([97532c9](https://github.com/googleapis/python-bigquery-dataframes/commit/97532c9ba02cd709d69666dd0afca5c1df8b9faf))
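+
+A minimal sketch of two of the features above, `Series.keys()` and the
+Python-type `astype` argument (the `bpd` alias and sample data are
+illustrative assumptions, not part of the release notes):
+
+```python
+import bigframes.pandas as bpd
+
+s = bpd.Series([10, 20, 30])
+idx = s.keys()             # alias for s.index, mirroring pandas
+floats = s.astype(float)   # builtin Python types now accepted as dtype args
+```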
+
+
+### Bug Fixes
+
+* Exclude `DataFrame` and `Series` `__call__` from unimplemented API metrics ([#1351](https://github.com/googleapis/python-bigquery-dataframes/issues/1351)) ([f2d5264](https://github.com/googleapis/python-bigquery-dataframes/commit/f2d526445da7dae29c49c8d6dacdfee7d2fa9d79))
+* Make `DataFrame` `__getattr__` and `__setattr__` more robust to subclassing ([#1352](https://github.com/googleapis/python-bigquery-dataframes/issues/1352)) ([417de3a](https://github.com/googleapis/python-bigquery-dataframes/commit/417de3a449e5d0748831b502f4f5b9fb9ba38714))
+
+
+### Performance Improvements
+
+* Fall back to ordering by bq pk when possible ([#1350](https://github.com/googleapis/python-bigquery-dataframes/issues/1350)) ([3c4abf2](https://github.com/googleapis/python-bigquery-dataframes/commit/3c4abf24ea186e98f629b6f83c0f3e36dc0571c6))
+* Improve isin performance ([#1203](https://github.com/googleapis/python-bigquery-dataframes/issues/1203)) ([db087b0](https://github.com/googleapis/python-bigquery-dataframes/commit/db087b0bfe4b3ba965682d620079c923e098e362))
+* Prevent inlining of remote ops ([#1347](https://github.com/googleapis/python-bigquery-dataframes/issues/1347)) ([012081a](https://github.com/googleapis/python-bigquery-dataframes/commit/012081af9ef825ced96ec1e772b9646cbe09d9a1))
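+
+An illustrative use of the improved `isin` path (sample data and names are
+assumptions):
+
+```python
+import bigframes.pandas as bpd
+
+df = bpd.DataFrame({"id": [1, 2, 3, 4]})
+allowed = bpd.Series([2, 4])
+mask = df["id"].isin(allowed)  # now compiled via a dedicated InNode
+```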
+
+
+### Dependencies
+
+* Add support for Python 3.13 for everything but remote functions ([#1307](https://github.com/googleapis/python-bigquery-dataframes/issues/1307)) ([533db96](https://github.com/googleapis/python-bigquery-dataframes/commit/533db9685d159de2bc76307b0e0add676bd679a0))
+
+
+### Documentation
+
+* Add `GeoSeries` docs ([#1327](https://github.com/googleapis/python-bigquery-dataframes/issues/1327)) ([05f83d1](https://github.com/googleapis/python-bigquery-dataframes/commit/05f83d18d276091a1549dbba1f2baf8c91c8c37e))
+* Add link to DataFrames intro to improve SEO ([#1176](https://github.com/googleapis/python-bigquery-dataframes/issues/1176)) ([aafb5be](https://github.com/googleapis/python-bigquery-dataframes/commit/aafb5be3e9c50f477fca2a1ebb5338194672913f))
+* Add snippet to explain the univariate model's forecast result in the Forecast a single time series with a univariate model tutorial ([#1272](https://github.com/googleapis/python-bigquery-dataframes/issues/1272)) ([c22126b](https://github.com/googleapis/python-bigquery-dataframes/commit/c22126b846db428d21c0f5cbd2d439ecc56365b2))
+
## [1.34.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.33.0...v1.34.0) (2025-01-27)
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 8d68e4fc27..5374e7e377 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -22,7 +22,7 @@ In order to add a feature:
documentation.
- The feature must work fully on the following CPython versions:
- 3.9, 3.10, 3.11 and 3.12 on both UNIX and Windows.
+ 3.9, 3.10, 3.11, 3.12 and 3.13 on both UNIX and Windows.
- The feature must not add unnecessary dependencies (where
"unnecessary" is of course subjective, but new dependencies should
@@ -72,7 +72,7 @@ We use `nox `__ to instrument our tests.
- To run a single unit test::
- $ nox -s unit-3.12 -- -k
+ $ nox -s unit-3.13 -- -k
.. note::
@@ -143,12 +143,12 @@ Running System Tests
$ nox -s system
# Run a single system test
- $ nox -s system-3.12 -- -k
+ $ nox -s system-3.13 -- -k
.. note::
- System tests are only configured to run under Python 3.9, 3.11 and 3.12.
+ System tests are only configured to run under Python 3.9, 3.11, 3.12 and 3.13.
For expediency, we do not run them in older versions of Python 3.
This alone will not run the tests. You'll need to change some local
@@ -262,11 +262,13 @@ We support:
- `Python 3.10`_
- `Python 3.11`_
- `Python 3.12`_
+- `Python 3.13`_
.. _Python 3.9: https://docs.python.org/3.9/
.. _Python 3.10: https://docs.python.org/3.10/
.. _Python 3.11: https://docs.python.org/3.11/
.. _Python 3.12: https://docs.python.org/3.12/
+.. _Python 3.13: https://docs.python.org/3.13/
Supported versions can be found in our ``noxfile.py`` `config`_.
diff --git a/README.rst b/README.rst
index 70041c7c8e..185c50c14a 100644
--- a/README.rst
+++ b/README.rst
@@ -30,7 +30,8 @@ Documentation
Getting started with BigQuery DataFrames
----------------------------------------
-Try the `BigQuery DataFrames quickstart `_
+Read `Introduction to BigQuery DataFrames `_
+and try the `BigQuery DataFrames quickstart `_
to get up and running in just a few minutes.
diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py
index d9bba9bdb0..5f64bf68dd 100644
--- a/bigframes/core/__init__.py
+++ b/bigframes/core/__init__.py
@@ -120,7 +120,9 @@ def from_table(
if offsets_col:
ordering = orderings.TotalOrdering.from_offset_col(offsets_col)
elif primary_key:
- ordering = orderings.TotalOrdering.from_primary_key(primary_key)
+ ordering = orderings.TotalOrdering.from_primary_key(
+ [ids.ColumnId(key_part) for key_part in primary_key]
+ )
# Scan all columns by default, we define this list as it can be pruned while preserving source_def
scan_list = nodes.ScanList(
@@ -415,6 +417,18 @@ def project_window_op(
output_name,
)
+ def isin(
+ self, other: ArrayValue, lcol: str, rcol: str
+ ) -> typing.Tuple[ArrayValue, str]:
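+        """Join `other` to mark which rows of `lcol` appear in `other`'s `rcol`.
+
+        Returns the resulting ArrayValue together with the name of the new
+        boolean indicator column.
+        """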
+ node = nodes.InNode(
+ self.node,
+ other.node,
+ ex.deref(lcol),
+ ex.deref(rcol),
+ indicator_col=ids.ColumnId.unique(),
+ )
+ return ArrayValue(node), node.indicator_col.name
+
def relational_join(
self,
other: ArrayValue,
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 727ee013f8..43f605dc03 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -707,7 +707,7 @@ def split(
# Create an ordering col and convert to string
block, ordering_col = block.promote_offsets()
block, string_ordering_col = block.apply_unary_op(
- ordering_col, ops.AsTypeOp(to_type="string[pyarrow]")
+ ordering_col, ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE)
)
# Apply hash method to sum col and order by it.
@@ -1410,7 +1410,7 @@ def grouped_head(
block, result_id = self.apply_window_op(
value_columns[0],
- agg_ops.rank_op,
+ agg_ops.count_op,
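+            # count_op is order-independent, unlike rank_op, so this window
+            # no longer needs a total row ordering to compile.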
window_spec=window_spec,
)
@@ -1479,7 +1479,9 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block:
expr, new_col = expr.project_to_id(
expression=ops.add_op.as_expr(
ex.const(prefix),
- ops.AsTypeOp(to_type="string").as_expr(index_col),
+ ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr(
+ index_col
+ ),
),
)
new_index_cols.append(new_col)
@@ -1502,7 +1504,9 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block:
for index_col in self._index_columns:
expr, new_col = expr.project_to_id(
expression=ops.add_op.as_expr(
- ops.AsTypeOp(to_type="string").as_expr(index_col),
+ ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr(
+ index_col
+ ),
ex.const(suffix),
),
)
@@ -2036,23 +2040,15 @@ def isin(self, other: Block):
return block
def _isin_inner(self: Block, col: str, unique_values: core.ArrayValue) -> Block:
- unique_values, const = unique_values.create_constant(
- True, dtype=bigframes.dtypes.BOOL_DTYPE
- )
- expr, (l_map, r_map) = self._expr.relational_join(
- unique_values, ((col, unique_values.column_ids[0]),), type="left"
- )
- expr, matches = expr.project_to_id(ops.notnull_op.as_expr(r_map[const]))
+ expr, matches = self._expr.isin(unique_values, col, unique_values.column_ids[0])
- new_index_cols = tuple(l_map[idx_col] for idx_col in self.index_columns)
new_value_cols = tuple(
- l_map[val_col] if val_col != col else matches
- for val_col in self.value_columns
+ val_col if val_col != col else matches for val_col in self.value_columns
)
- expr = expr.select_columns((*new_index_cols, *new_value_cols))
+ expr = expr.select_columns((*self.index_columns, *new_value_cols))
return Block(
expr,
- index_columns=new_index_cols,
+ index_columns=self.index_columns,
column_labels=self.column_labels,
index_labels=self._index_labels,
)
diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py
index 7a018a662e..02c7ae128b 100644
--- a/bigframes/core/compile/aggregate_compiler.py
+++ b/bigframes/core/compile/aggregate_compiler.py
@@ -55,7 +55,7 @@ def compile_aggregate(
return compile_nullary_agg(aggregate.op)
if isinstance(aggregate, ex.UnaryAggregation):
input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings)
- if aggregate.op.can_order_by:
+ if not aggregate.op.order_independent:
return compile_ordered_unary_agg(aggregate.op, input, order_by=order_by) # type: ignore
else:
return compile_unary_agg(aggregate.op, input) # type: ignore
@@ -150,6 +150,11 @@ def _(op: agg_ops.SizeOp, window=None) -> ibis_types.NumericValue:
return _apply_window_if_present(ibis_ops.count(1), window)
+@compile_unary_agg.register
+def _(op: agg_ops.SizeUnaryOp, _, window=None) -> ibis_types.NumericValue:
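+    # Identical to SizeOp: counts rows; the unary input column is ignored.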
+ return _apply_window_if_present(ibis_ops.count(1), window)
+
+
@compile_unary_agg.register
@numeric_op
def _(
@@ -171,13 +176,6 @@ def _(
column: ibis_types.NumericColumn,
window=None,
) -> ibis_types.NumericValue:
- # PERCENTILE_CONT has very few allowed windows. For example, "window
- # framing clause is not allowed for analytic function percentile_cont".
- if window is not None:
- raise NotImplementedError(
- f"Median with windowing is not supported. {constants.FEEDBACK_LINK}"
- )
-
# TODO(swast): Allow switching between exact and approximate median.
# For now, the best we can do is an approximate median when we're doing
# an aggregation, as PERCENTILE_CONT is only an analytic function.
diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py
index 61eaa63f85..9280cfbb7b 100644
--- a/bigframes/core/compile/api.py
+++ b/bigframes/core/compile/api.py
@@ -24,9 +24,7 @@
import bigframes.core.ordering
import bigframes.core.schema
-_STRICT_COMPILER = compiler.Compiler(
- strict=True, enable_pruning=True, enable_densify_ids=True
-)
+_STRICT_COMPILER = compiler.Compiler(strict=True)
class SQLCompiler:
@@ -72,9 +70,7 @@ def compile_raw(
def test_only_try_evaluate(node: bigframes.core.nodes.BigFrameNode):
"""Use only for unit testing paths - not fully featured. Will throw exception if fails."""
node = _STRICT_COMPILER._preprocess(node)
- ibis = _STRICT_COMPILER.compile_ordered_ir(node)._to_ibis_expr(
- ordering_mode="unordered"
- )
+ ibis = _STRICT_COMPILER.compile_node(node)._to_ibis_expr()
return ibis.pandas.connect({}).execute(ibis)
@@ -83,7 +79,7 @@ def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode):
import bigframes.core.schema
node = _STRICT_COMPILER._preprocess(node)
- compiled = _STRICT_COMPILER.compile_unordered_ir(node)
+ compiled = _STRICT_COMPILER.compile_node(node)
items = tuple(
bigframes.core.schema.SchemaItem(name, compiled.get_column_type(ibis_id))
for name, ibis_id in zip(node.schema.names, compiled.column_ids)
diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py
index 15805a38fc..906bdb1f0d 100644
--- a/bigframes/core/compile/compiled.py
+++ b/bigframes/core/compile/compiled.py
@@ -13,21 +13,17 @@
# limitations under the License.
from __future__ import annotations
-import abc
import functools
import itertools
import typing
-from typing import Collection, Literal, Optional, Sequence
+from typing import Optional, Sequence
import bigframes_vendored.ibis
import bigframes_vendored.ibis.backends.bigquery.backend as ibis_bigquery
-import bigframes_vendored.ibis.backends.bigquery.datatypes as ibis_bigquery_dtatatypes
import bigframes_vendored.ibis.common.deferred as ibis_deferred # type: ignore
import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
import bigframes_vendored.ibis.expr.operations as ibis_ops
-import bigframes_vendored.ibis.expr.schema as ibis_schema
import bigframes_vendored.ibis.expr.types as ibis_types
-import google.cloud.bigquery
import pandas
import bigframes.core.compile.aggregate_compiler as agg_compiler
@@ -36,39 +32,23 @@
import bigframes.core.compile.scalar_op_compiler as op_compilers
import bigframes.core.expression as ex
import bigframes.core.guid
-import bigframes.core.identifiers as ids
-from bigframes.core.ordering import (
- ascending_over,
- encode_order_string,
- OrderingExpression,
- RowOrdering,
- TotalOrdering,
-)
+from bigframes.core.ordering import OrderingExpression
import bigframes.core.sql
from bigframes.core.window_spec import RangeWindowBounds, RowsWindowBounds, WindowSpec
import bigframes.dtypes
import bigframes.operations.aggregations as agg_ops
-ORDER_ID_COLUMN = "bigframes_ordering_id"
-PREDICATE_COLUMN = "bigframes_predicate"
-
-
-T = typing.TypeVar("T", bound="BaseIbisIR")
-
op_compiler = op_compilers.scalar_op_compiler
-class BaseIbisIR(abc.ABC):
- """Implementation detail, contains common logic between ordered and unordered IR"""
-
+# Ibis Implementations
+class UnorderedIR:
def __init__(
self,
table: ibis_types.Table,
columns: Sequence[ibis_types.Value],
- predicates: Optional[Collection[ibis_types.BooleanValue]] = None,
):
self._table = table
- self._predicates = tuple(predicates) if predicates is not None else ()
# Allow creating a DataFrame directly from an Ibis table expression.
# TODO(swast): Validate that each column references the same table (or
# no table for literal values).
@@ -83,6 +63,36 @@ def __init__(
# dictionary mapping names to column values.
self._column_names = {column.get_name(): column for column in self._columns}
+ def to_sql(
+ self,
+ *,
+ order_by: Sequence[OrderingExpression] = (),
+ limit: Optional[int] = None,
+ selections: Optional[Sequence[str]] = None,
+ ) -> str:
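+        """Compiles this expression tree to a BigQuery SQL string.
+
+        When order_by or limit is given, the compiled query is wrapped in an
+        outer SELECT so that ORDER BY and LIMIT clauses can be appended.
+        """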
+ ibis_table = self._to_ibis_expr()
+        # This set of output transforms should perhaps be its own output node.
+ if order_by or limit:
+ sql = ibis_bigquery.Backend().compile(ibis_table)
+ sql = (
+ bigframes.core.compile.googlesql.Select()
+ .from_(sql)
+ .select(selections or self.column_ids)
+ .sql()
+ )
+
+ # Single row frames may not have any ordering columns
+ if len(order_by) > 0:
+ order_by_clause = bigframes.core.sql.ordering_clause(order_by)
+ sql += f"\n{order_by_clause}"
+ if limit is not None:
+ if not isinstance(limit, int):
+ raise TypeError(f"Limit param: {limit} must be an int.")
+ sql += f"\nLIMIT {limit}"
+ else:
+ sql = ibis_bigquery.Backend().compile(self._to_ibis_expr())
+ return typing.cast(str, sql)
+
@property
def columns(self) -> typing.Tuple[ibis_types.Value, ...]:
return self._columns
@@ -91,74 +101,40 @@ def columns(self) -> typing.Tuple[ibis_types.Value, ...]:
def column_ids(self) -> typing.Sequence[str]:
return tuple(self._column_names.keys())
- @property
- def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]:
- """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred."""
- return (
- _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN)
- if self._predicates
- else None
- )
-
@property
def _ibis_bindings(self) -> dict[str, ibis_types.Value]:
return {col: self._get_ibis_column(col) for col in self.column_ids}
- @property
- @abc.abstractmethod
- def is_ordered_ir(self: T) -> bool:
- """Whether it is a OrderedIR or UnorderedIR."""
- ...
-
- @abc.abstractmethod
- def filter(self: T, predicate: ex.Expression) -> T:
- """Filter the table on a given expression, the predicate must be a boolean expression."""
- ...
-
- @abc.abstractmethod
- def _reproject_to_table(self: T) -> T:
- """
- Internal operators that projects the internal representation into a
- new ibis table expression where each value column is a direct
- reference to a column in that table expression. Needed after
- some operations such as window operations that cannot be used
- recursively in projections.
- """
- ...
-
def projection(
- self: T,
+ self,
expression_id_pairs: typing.Tuple[typing.Tuple[ex.Expression, str], ...],
- ) -> T:
+ ) -> UnorderedIR:
"""Apply an expression to the ArrayValue and assign the output to a column."""
+ cannot_inline = any(expr.expensive for expr, _ in expression_id_pairs)
+
bindings = {col: self._get_ibis_column(col) for col in self.column_ids}
new_values = [
op_compiler.compile_expression(expression, bindings).name(id)
for expression, id in expression_id_pairs
]
- result = self._select(tuple([*self._columns, *new_values])) # type: ignore
- return result
+ result = UnorderedIR(self._table, (*self._columns, *new_values))
+ if cannot_inline:
+ return result._reproject_to_table()
+ else:
+ # Cheap ops can defer "SELECT" and inline into later ops
+ return result
def selection(
- self: T,
+ self,
input_output_pairs: typing.Tuple[typing.Tuple[ex.DerefOp, str], ...],
- ) -> T:
+ ) -> UnorderedIR:
"""Apply an expression to the ArrayValue and assign the output to a column."""
bindings = {col: self._get_ibis_column(col) for col in self.column_ids}
values = [
op_compiler.compile_expression(input, bindings).name(id)
for input, id in input_output_pairs
]
- result = self._select(tuple(values)) # type: ignore
- return result
-
- @abc.abstractmethod
- def _select(self: T, values: typing.Tuple[ibis_types.Value]) -> T:
- ...
-
- @abc.abstractmethod
- def _set_or_replace_by_id(self: T, id: str, new_value: ibis_types.Value) -> T:
- ...
+ return UnorderedIR(self._table, tuple(values))
def _get_ibis_column(self, key: str) -> ibis_types.Value:
"""Gets the Ibis expression for a given column."""
@@ -178,188 +154,39 @@ def get_column_type(self, key: str) -> bigframes.dtypes.Dtype:
bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype(ibis_type),
)
- def _aggregate_base(
- self,
- table: ibis_types.Table,
- order_by: typing.Sequence[ibis_types.Value] = [],
- aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]] = [],
- by_column_ids: typing.Sequence[ex.DerefOp] = (),
- dropna: bool = True,
- ) -> OrderedIR:
- assert not self.is_ordered_ir or len(order_by) > 0
-
- bindings = {col: table[col] for col in self.column_ids}
- stats = {
- col_out: agg_compiler.compile_aggregate(
- aggregate, bindings, order_by=order_by
- )
- for aggregate, col_out in aggregations
- }
- if by_column_ids:
- result = table.group_by((ref.id.sql for ref in by_column_ids)).aggregate(
- **stats
- )
- # Must have deterministic ordering, so order by the unique "by" column
- ordering = TotalOrdering(
- tuple(
- [
- OrderingExpression(ex.DerefOp(ref.id.local_normalized))
- for ref in by_column_ids
- ]
- ),
- total_ordering_columns=frozenset(
- [ex.DerefOp(ref.id.local_normalized) for ref in by_column_ids]
- ),
- )
- columns = tuple(result[key] for key in result.columns)
- expr = OrderedIR(result, columns=columns, ordering=ordering)
- if dropna:
- for ref in by_column_ids:
- expr = expr._filter(expr._compile_expression(ref).notnull())
- return expr
- else:
- aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)}
- result = table.aggregate(**aggregates)
- # Ordering is irrelevant for single-row output, but set ordering id regardless
- # as other ops(join etc.) expect it.
- # TODO: Maybe can make completely empty
- ordering = TotalOrdering(
- ordering_value_columns=tuple([]),
- total_ordering_columns=frozenset([]),
- )
- return OrderedIR(
- result,
- columns=[result[col_id] for col_id in [*stats.keys()]],
- hidden_ordering_columns=[result[ORDER_ID_COLUMN]],
- ordering=ordering,
- )
-
-
-# Ibis Implementations
-class UnorderedIR(BaseIbisIR):
- def __init__(
- self,
- table: ibis_types.Table,
- columns: Sequence[ibis_types.Value],
- predicates: Optional[Collection[ibis_types.BooleanValue]] = None,
- ):
- super().__init__(table, columns, predicates)
-
- @property
- def is_ordered_ir(self) -> bool:
- return False
-
- def builder(self):
- """Creates a mutable builder for expressions."""
- # Since ArrayValue is intended to be immutable (immutability offers
- # potential opportunities for caching, though we might need to introduce
- # more node types for that to be useful), we create a builder class.
- return UnorderedIR.Builder(
- self._table,
- columns=self._columns,
- predicates=self._predicates,
- )
-
- def peek_sql(self, n: int):
- # Peek currently implemented as top level LIMIT op.
- # Execution engine handles limit pushdown.
- # In future, may push down limit/filters in compilation.
- sql = ibis_bigquery.Backend().compile(self._to_ibis_expr().limit(n))
- return typing.cast(str, sql)
-
- def to_sql(
- self,
- offset_column: typing.Optional[str] = None,
- ordered: bool = False,
- ) -> str:
- if offset_column or ordered:
- raise ValueError("Cannot produce sorted sql in partial ordering mode")
- sql = ibis_bigquery.Backend().compile(self._to_ibis_expr())
- return typing.cast(str, sql)
-
- def with_total_order(self, by: Sequence[OrderingExpression]) -> OrderedIR:
- return OrderedIR(
- table=self._table,
- columns=self._columns,
- predicates=self._predicates,
- ordering=TotalOrdering(
- ordering_value_columns=tuple(by),
- total_ordering_columns=frozenset(
- map(
- ex.DerefOp,
- itertools.chain.from_iterable(
- col.referenced_columns for col in by
- ),
- )
- ),
- ),
- )
-
- def row_count(self, name: str) -> OrderedIR:
+ def row_count(self, name: str) -> UnorderedIR:
original_table = self._to_ibis_expr()
ibis_table = original_table.agg(
[
original_table.count().name(name),
]
)
- return OrderedIR(
+ return UnorderedIR(
ibis_table,
(ibis_table[name],),
- ordering=TotalOrdering(
- ordering_value_columns=(ascending_over(name),),
- total_ordering_columns=frozenset([ex.deref(name)]),
- ),
)
def _to_ibis_expr(
self,
*,
- expose_hidden_cols: bool = False,
fraction: Optional[float] = None,
):
"""
Creates an Ibis table expression representing the DataFrame.
- ArrayValue objects are sorted, so the following options are available
- to reflect this in the ibis expression.
-
- * "string_encoded": An ordered string column is provided in output table.
- * "unordered": No ordering information will be provided in output. Only
- value columns are projected.
-
- For offset or ordered column, order_col_name can be used to assign the
- output label for the ordering column. If none is specified, the default
- column name will be 'bigframes_ordering_id'
-
Args:
-            expose_hidden_cols:
-                If True, include the hidden ordering columns in the results.
-                Only compatible with `order_by` and `unordered`
-                ``ordering_mode``.
+            fraction:
+                If set, filters the table to an approximately uniform random
+                sample of rows, keeping each row with the given probability.
+
Returns:
-            An ibis expression representing the data help by the ArrayValue object.
+            An ibis expression representing the data held by the ArrayValue object.
"""
- columns = list(self._columns)
- columns_to_drop: list[
- str
- ] = [] # Ordering/Filtering columns that will be dropped at end
-
- if self._reduced_predicate is not None:
- columns.append(self._reduced_predicate)
- # Usually drop predicate as it is will be all TRUE after filtering
- if not expose_hidden_cols:
- columns_to_drop.append(self._reduced_predicate.get_name())
-
# Special case for empty tables, since we can't create an empty
# projection.
- if not columns:
+ if not self._columns:
return bigframes_vendored.ibis.memtable([])
- table = self._table.select(columns)
- base_table = table
- if self._reduced_predicate is not None:
- table = table.filter(base_table[PREDICATE_COLUMN])
- table = table.drop(*columns_to_drop)
+ table = self._table.select(self._columns)
if fraction is not None:
table = table.filter(
bigframes_vendored.ibis.random() < ibis_types.literal(fraction)
@@ -367,29 +194,20 @@ def _to_ibis_expr(
return table
def filter(self, predicate: ex.Expression) -> UnorderedIR:
- for ref in predicate.column_references:
- ibis_value = self._get_ibis_column(ref.sql)
- if is_window(ibis_value):
- # ibis doesn't support qualify syntax, so create CTE if filtering over window expression
- # https://github.com/ibis-project/ibis/issues/9775
- return self._reproject_to_table().filter(predicate)
-
- bindings = {col: self._get_ibis_column(col) for col in self.column_ids}
- condition = op_compiler.compile_expression(predicate, bindings)
- return self._filter(condition) # type:ignore
-
- def _filter(self, predicate_value: ibis_types.BooleanValue) -> UnorderedIR:
- """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression."""
- expr = self.builder()
- expr.predicates = [*self._predicates, predicate_value]
- return expr.build()
+ table = self._to_ibis_expr()
+ condition = op_compiler.compile_expression(predicate, table)
+ table = table.filter(condition)
+ return UnorderedIR(
+ table, tuple(table[column_name] for column_name in self._column_names)
+ )
def aggregate(
self,
aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]],
by_column_ids: typing.Sequence[ex.DerefOp] = (),
dropna: bool = True,
- ) -> OrderedIR:
+ order_by: typing.Sequence[OrderingExpression] = (),
+ ) -> UnorderedIR:
"""
Apply aggregations to the expression.
Arguments:
@@ -402,9 +220,32 @@ def aggregate(
information.
"""
table = self._to_ibis_expr()
- return self._aggregate_base(
- table, aggregations=aggregations, by_column_ids=by_column_ids, dropna=dropna
- )
+ bindings = {col: table[col] for col in self.column_ids}
+ stats = {
+ col_out: agg_compiler.compile_aggregate(
+ aggregate,
+ bindings,
+ order_by=_convert_ordering_to_table_values(table, order_by),
+ )
+ for aggregate, col_out in aggregations
+ }
+ if by_column_ids:
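+            # When dropna is set, rows with null grouping keys are removed
+            # before aggregating, matching pandas groupby(dropna=True).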
+ if dropna:
+ table = table.filter(
+ [table[ref.id.sql].notnull() for ref in by_column_ids]
+ )
+ result = table.group_by((ref.id.sql for ref in by_column_ids)).aggregate(
+ **stats
+ )
+ return UnorderedIR(
+ result, columns=tuple(result[key] for key in result.columns)
+ )
+ else:
+ result = table.aggregate(**stats)
+ return UnorderedIR(
+ result,
+ columns=[result[col_id] for col_id in [*stats.keys()]],
+ )
def _uniform_sampling(self, fraction: float) -> UnorderedIR:
"""Sampling the table on given fraction.
@@ -419,29 +260,7 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR:
columns=columns,
)
- def as_ordered_ir(self) -> OrderedIR:
- """Convert to OrderedIr, but without any definite ordering."""
- return OrderedIR(self._table, self._columns, predicates=self._predicates)
-
## Helpers
- def _set_or_replace_by_id(
- self, id: str, new_value: ibis_types.Value
- ) -> UnorderedIR:
- builder = self.builder()
- if id in self.column_ids:
- builder.columns = [
- val if (col_id != id) else new_value.name(id)
- for col_id, val in zip(self.column_ids, self._columns)
- ]
- else:
- builder.columns = [*self.columns, new_value.name(id)]
- return builder.build()
-
- def _select(self, values: typing.Tuple[ibis_types.Value, ...]) -> UnorderedIR:
- builder = self.builder()
- builder.columns = values
- return builder.build()
-
def _reproject_to_table(self) -> UnorderedIR:
"""
Internal operators that projects the internal representation into a
@@ -457,113 +276,14 @@ def _reproject_to_table(self) -> UnorderedIR:
columns=columns,
)
- class Builder:
- def __init__(
- self,
- table: ibis_types.Table,
- columns: Collection[ibis_types.Value] = (),
- predicates: Optional[Collection[ibis_types.BooleanValue]] = None,
- ):
- self.table = table
- self.columns = list(columns)
- self.predicates = list(predicates) if predicates is not None else None
-
- def build(self) -> UnorderedIR:
- return UnorderedIR(
- table=self.table,
- columns=self.columns,
- predicates=self.predicates,
- )
-
-
-class OrderedIR(BaseIbisIR):
- """Immutable BigQuery DataFrames expression tree.
-
- Note: Usage of this class is considered to be private and subject to change
- at any time.
-
- This class is a wrapper around Ibis expressions. Its purpose is to defer
- Ibis projection operations to keep generated SQL small and correct when
- mixing and matching columns from different versions of a DataFrame.
-
- Args:
- table: An Ibis table expression.
- columns: Ibis value expressions that can be projected as columns.
- hidden_ordering_columns: Ibis value expressions to store ordering.
- ordering: An ordering property of the data frame.
- predicates: A list of filters on the data frame.
- """
-
- def __init__(
- self,
- table: ibis_types.Table,
- columns: Sequence[ibis_types.Value],
- hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None,
- ordering: RowOrdering = RowOrdering(),
- predicates: Optional[Collection[ibis_types.BooleanValue]] = None,
- ):
- super().__init__(table, columns, predicates)
- self._ordering = ordering
- # Meta columns store ordering, or other data that doesn't correspond to dataframe columns
- self._hidden_ordering_columns = (
- tuple(hidden_ordering_columns)
- if hidden_ordering_columns is not None
- else ()
- )
-
- # To allow for more efficient lookup by column name, create a
- # dictionary mapping names to column values.
- self._column_names = {
- (
- column.resolve(table) # type: ignore
- # TODO(https://github.com/ibis-project/ibis/issues/7613): use
- # public API to refer to Deferred type.
- if isinstance(column, ibis_deferred.Deferred)
- else column
- ).get_name(): column
- for column in self._columns
- }
- self._hidden_ordering_column_names = {
- typing.cast(str, column.get_name()): column
- for column in self._hidden_ordering_columns
- }
- ### Validation
- value_col_ids = self._column_names.keys()
- hidden_col_ids = self._hidden_ordering_column_names.keys()
-
- all_columns = value_col_ids | hidden_col_ids
- ordering_valid = all(
- set(ref.sql for ref in col.scalar_expression.column_references).issubset(
- all_columns
- )
- for col in ordering.all_ordering_columns
- )
- if value_col_ids & hidden_col_ids:
- raise ValueError(
- f"Keys in both hidden and exposed list: {value_col_ids & hidden_col_ids}"
- )
- if not ordering_valid:
- raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}")
-
- @property
- def is_ordered_ir(self) -> bool:
- return True
-
- @property
- def order_non_deterministic(self) -> bool:
- # ordering suffix non-determinism is ok, as rand() is used as suffix for auto-generated order keys.
- # but must be resolved before or explode, otherwise the engine might pull the rand() evaluation above the join,
- # creating inconsistencies
- return not all(col.deterministic for col in self._ordering.all_ordering_columns)
-
- @property
- def has_total_order(self) -> bool:
- return isinstance(self._ordering, TotalOrdering)
-
@classmethod
def from_pandas(
- cls, pd_df: pandas.DataFrame, scan_cols: bigframes.core.nodes.ScanList
- ) -> OrderedIR:
+ cls,
+ pd_df: pandas.DataFrame,
+ scan_cols: bigframes.core.nodes.ScanList,
+ offsets: typing.Optional[str] = None,
+ ) -> UnorderedIR:
+ # TODO: add offsets
"""
Builds an in-memory only (SQL only) expr from a pandas dataframe.
@@ -574,7 +294,8 @@ def from_pandas(
# ibis memtable cannot handle NA, must convert to None
# this destroys the schema however
ibis_values = pd_df.astype("object").where(pandas.notnull(pd_df), None) # type: ignore
- ibis_values = ibis_values.assign(**{ORDER_ID_COLUMN: range(len(pd_df))})
+ if offsets:
+ ibis_values = ibis_values.assign(**{offsets: range(len(pd_df))})
# derive the ibis schema from the original pandas schema
ibis_schema = [
(
@@ -583,165 +304,25 @@ def from_pandas(
)
for id, dtype, local_label in scan_cols.items
]
- ibis_schema.append((ORDER_ID_COLUMN, ibis_dtypes.int64))
+ if offsets:
+ ibis_schema.append((offsets, ibis_dtypes.int64))
keys_memtable = bigframes_vendored.ibis.memtable(
ibis_values, schema=bigframes_vendored.ibis.schema(ibis_schema)
)
+ columns = [
+ keys_memtable[local_label].name(col_id.sql)
+ for col_id, _, local_label in scan_cols.items
+ ]
+ if offsets:
+ columns.append(keys_memtable[offsets].name(offsets))
+
return cls(
keys_memtable,
- columns=[
- keys_memtable[local_label].name(col_id.sql)
- for col_id, _, local_label in scan_cols.items
- ],
- ordering=TotalOrdering.from_offset_col(ORDER_ID_COLUMN),
- hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],),
- )
-
- @property
- def _ibis_bindings(self) -> dict[str, ibis_types.Value]:
- all_keys = itertools.chain(self.column_ids, self._hidden_column_ids)
- return {col: self._get_any_column(col) for col in all_keys}
-
- @property
- def _hidden_column_ids(self) -> typing.Sequence[str]:
- return tuple(self._hidden_ordering_column_names.keys())
-
- @property
- def _ibis_order(self) -> Sequence[ibis_types.Value]:
- """Returns a sequence of ibis values which can be directly used to order a
- table expression. Has direction modifiers applied."""
- return _convert_ordering_to_table_values(
- {**self._column_names, **self._hidden_ordering_column_names},
- self._ordering.all_ordering_columns,
- )
-
- def to_unordered(self) -> UnorderedIR:
- return UnorderedIR(self._table, self._columns, self._predicates)
-
- def builder(self) -> OrderedIR.Builder:
- """Creates a mutable builder for expressions."""
- # Since ArrayValue is intended to be immutable (immutability offers
- # potential opportunities for caching, though we might need to introduce
- # more node types for that to be useful), we create a builder class.
- return OrderedIR.Builder(
- self._table,
- columns=self._columns,
- hidden_ordering_columns=self._hidden_ordering_columns,
- ordering=self._ordering,
- predicates=self._predicates,
- )
-
- def order_by(self, by: Sequence[OrderingExpression]) -> OrderedIR:
- expr_builder = self.builder()
- expr_builder.ordering = self._ordering.with_ordering_columns(by)
- return expr_builder.build()
-
- def reversed(self) -> OrderedIR:
- expr_builder = self.builder()
- expr_builder.ordering = self._ordering.with_reverse()
- return expr_builder.build()
-
- def aggregate(
- self,
- aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]],
- by_column_ids: typing.Sequence[ex.DerefOp] = (),
- dropna: bool = True,
- ) -> OrderedIR:
- """
- Apply aggregations to the expression.
- Arguments:
- aggregations: input_column_id, operation, output_column_id tuples
- by_column_ids: column ids of the aggregation key, this is preserved through
- the transform
- dropna: whether null keys should be dropped
- Returns:
- OrderedIR
- """
- table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True)
-
- all_columns = {
- column_name: table[column_name]
- for column_name in {
- **self._column_names,
- **self._hidden_ordering_column_names,
- }
- }
- order_by = _convert_ordering_to_table_values(
- all_columns,
- self._ordering.all_ordering_columns,
- )
-
- return self._aggregate_base(
- table,
- order_by=order_by,
- aggregations=aggregations,
- by_column_ids=by_column_ids,
- dropna=dropna,
- )
-
- def _uniform_sampling(self, fraction: float) -> OrderedIR:
- """Sampling the table on given fraction.
-
- .. warning::
- The row numbers of result is non-deterministic, avoid to use.
- """
- table = self._to_ibis_expr(
- ordering_mode="unordered", expose_hidden_cols=True, fraction=fraction
- )
- columns = [table[column_name] for column_name in self._column_names]
- hidden_ordering_columns = [
- table[column_name] for column_name in self._hidden_ordering_column_names
- ]
- return OrderedIR(
- table,
columns=columns,
- hidden_ordering_columns=hidden_ordering_columns,
- ordering=self._ordering,
)
- def promote_offsets(self, col_id: str) -> OrderedIR:
- """
- Convenience function to promote copy of column offsets to a value column. Can be used to reset index.
- """
- # Special case: offsets already exist
- ordering = self._ordering
- # Case 1, already have offsets, just create column from them
- if ordering.is_sequential and (ordering.total_order_col is not None):
- expr_builder = self.builder()
- expr_builder.columns = [
- *self.columns,
- self._compile_expression(
- ordering.total_order_col.scalar_expression
- ).name(col_id),
- ]
- return expr_builder.build()
- # Cannot nest analytic expressions, so reproject to cte first if needed.
- # Also ibis cannot window literals, so need to reproject those (even though this is legal in googlesql)
- # Seee: https://github.com/ibis-project/ibis/issues/9773
- can_directly_window = not any(
- map(
- lambda x: is_literal(x) or is_window(x),
- itertools.chain(self._ibis_order, self._predicates),
- )
- )
- if not can_directly_window:
- return self._reproject_to_table().promote_offsets(col_id)
-
- window = bigframes_vendored.ibis.window(order_by=self._ibis_order)
- if self._predicates:
- window = window.group_by(self._reduced_predicate)
- offsets = bigframes_vendored.ibis.row_number().over(window)
- expr_builder = self.builder()
- expr_builder.columns = [
- *self.columns,
- offsets.name(col_id),
- ]
- # Reproject, so that offsets are just a scalar value that can be used elsewhere
- expr_builder.ordering = TotalOrdering.from_offset_col(col_id)
- return expr_builder.build()._reproject_to_table()
-
-    ## Methods that only work with ordering
+    ## Window operations
def project_window_op(
self,
@@ -750,7 +331,7 @@ def project_window_op(
output_name: str,
*,
never_skip_nulls=False,
- ) -> OrderedIR:
+ ) -> UnorderedIR:
"""
Creates a new expression based on this expression with unary operation applied to one column.
column_name: the id of the input column present in the expression
@@ -782,9 +363,10 @@ def project_window_op(
never_skip_nulls=never_skip_nulls,
)
- window = self._ibis_window_from_spec(
- window_spec, require_total_order=expression.op.uses_total_row_ordering
- )
+ if expression.op.order_independent and not window_spec.row_bounded:
+            # Notably, percentile_cont does not support an ordering clause.
+ window_spec = window_spec.without_order()
+ window = self._ibis_window_from_spec(window_spec)
bindings = {col: self._get_ibis_column(col) for col in self.column_ids}
window_op = agg_compiler.compile_analytic(
@@ -838,404 +420,12 @@ def project_window_op(
case_statement = case_statement.else_(window_op).end() # type: ignore
window_op = case_statement # type: ignore
- result = self._set_or_replace_by_id(output_name, window_op)
- return result
-
- def _reproject_to_table(self) -> OrderedIR:
- table = self._to_ibis_expr(
- ordering_mode="unordered",
- expose_hidden_cols=True,
- )
- columns = [table[column_name] for column_name in self._column_names]
- ordering_col_ids = list(
- id.sql
- for id in itertools.chain.from_iterable(
- ref.scalar_expression.column_references
- for ref in self._ordering.all_ordering_columns
- )
- )
- hidden_ordering_columns = [
- table[column_name]
- for column_name in self._hidden_ordering_column_names
- if column_name in ordering_col_ids
- ]
- return OrderedIR(
- table,
- columns=columns,
- hidden_ordering_columns=hidden_ordering_columns,
- ordering=self._ordering,
- )
-
- def to_sql(
- self,
- ordered: bool = False,
- limit: Optional[int] = None,
- ) -> str:
- if ordered or limit:
- # Need to bake ordering expressions into the selected column in order for our ordering clause builder to work.
- baked_ir = self._bake_ordering()
- sql = ibis_bigquery.Backend().compile(
- baked_ir._to_ibis_expr(
- ordering_mode="unordered",
- expose_hidden_cols=True,
- )
- )
- sql = (
- bigframes.core.compile.googlesql.Select()
- .from_(sql)
- .select(self.column_ids)
- .sql()
- )
-
- # Single row frames may not have any ordering columns
- if len(baked_ir._ordering.all_ordering_columns) > 0:
- order_by_clause = bigframes.core.sql.ordering_clause(
- baked_ir._ordering.all_ordering_columns
- )
- sql += f"\n{order_by_clause}"
- if limit is not None:
- if not isinstance(limit, int):
- raise TypeError(f"Limit param: {limit} must be an int.")
- sql += f"\nLIMIT {limit}"
- else:
- sql = ibis_bigquery.Backend().compile(
- self._to_ibis_expr(
- ordering_mode="unordered",
- expose_hidden_cols=False,
- )
- )
- return typing.cast(str, sql)
-
- def raw_sql_and_schema(
- self,
- column_ids: typing.Sequence[str],
- ) -> typing.Tuple[str, typing.Sequence[google.cloud.bigquery.SchemaField]]:
- """Return sql with all hidden columns. Used to cache with ordering information.
-
- Also returns schema, as the extra ordering columns are determined compile-time.
- """
- col_id_overrides = dict(zip(self.column_ids, column_ids))
- all_columns = (*self.column_ids, *self._hidden_ordering_column_names.keys())
- as_ibis = self._to_ibis_expr(
- ordering_mode="unordered",
- expose_hidden_cols=True,
- )
- as_ibis = as_ibis.select(all_columns).rename(col_id_overrides)
-
- # Ibis will produce non-nullable schema types, but bigframes should always be nullable
- fixed_ibis_schema = ibis_schema.Schema.from_tuples(
- (name, dtype.copy(nullable=True))
- for (name, dtype) in as_ibis.schema().items()
- )
- bq_schema = ibis_bigquery_dtatatypes.BigQuerySchema.from_ibis(fixed_ibis_schema)
- return ibis_bigquery.Backend().compile(as_ibis), bq_schema
-
- def _to_ibis_expr(
- self,
- *,
- expose_hidden_cols: bool = False,
- fraction: Optional[float] = None,
- ordering_mode: Literal["string_encoded", "unordered"],
- order_col_name: Optional[str] = ORDER_ID_COLUMN,
- ):
- """
- Creates an Ibis table expression representing the DataFrame.
-
- ArrayValue objects are sorted, so the following options are available
- to reflect this in the ibis expression.
-
-
- * "string_encoded": An ordered string column is provided in output table.
- * "unordered": No ordering information will be provided in output. Only
- value columns are projected.
-
- For offset or ordered column, order_col_name can be used to assign the
- output label for the ordering column. If none is specified, the default
- column name will be 'bigframes_ordering_id'
-
- Args:
- expose_hidden_cols:
- If True, include the hidden ordering columns in the results.
- Only compatible with `order_by` and `unordered`
- ``ordering_mode``.
- ordering_mode:
- How to construct the Ibis expression from the ArrayValue. See
- above for details.
- order_col_name:
- If the ordering mode outputs a single ordering or offsets
- column, use this as the column name.
- Returns:
- An ibis expression representing the data help by the ArrayValue object.
- """
- assert ordering_mode in (
- "string_encoded",
- "unordered",
- )
- if expose_hidden_cols and ordering_mode in ("ordered_col"):
- raise ValueError(
- f"Cannot expose hidden ordering columns with ordering_mode {ordering_mode}"
- )
-
- columns = list(self._columns)
- columns_to_drop: list[
- str
- ] = [] # Ordering/Filtering columns that will be dropped at end
-
- if self._reduced_predicate is not None:
- columns.append(self._reduced_predicate)
- # Usually drop predicate as it is will be all TRUE after filtering
- if not expose_hidden_cols:
- columns_to_drop.append(self._reduced_predicate.get_name())
-
- order_columns = self._create_order_columns(
- ordering_mode, order_col_name, expose_hidden_cols
- )
- columns.extend(order_columns)
-
- # Special case for empty tables, since we can't create an empty
- # projection.
- if not columns:
- return bigframes_vendored.ibis.memtable([])
-
- # Make sure we don't have any unbound (deferred) columns.
- table = self._table.select(columns)
-
- table = table.select(table[column] for column in table.columns)
- base_table = table
- if self._reduced_predicate is not None:
- table = table.filter(base_table[PREDICATE_COLUMN])
- table = table.drop(*columns_to_drop)
- if fraction is not None:
- table = table.filter(
- bigframes_vendored.ibis.random() < ibis_types.literal(fraction)
- )
- return table
-
- def filter(self, predicate: ex.Expression) -> OrderedIR:
- for ref in predicate.column_references:
- ibis_value = self._get_ibis_column(ref.sql)
- if is_window(ibis_value):
- # ibis doesn't support qualify syntax, so create CTE if filtering over window expression
- # https://github.com/ibis-project/ibis/issues/9775
- return self._reproject_to_table().filter(predicate)
-
- bindings = {col: self._get_ibis_column(col) for col in self.column_ids}
- condition = op_compiler.compile_expression(predicate, bindings)
- return self._filter(condition) # type: ignore
-
- def _filter(self, predicate_value: ibis_types.BooleanValue) -> OrderedIR:
- """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression."""
- expr = self.builder()
- expr.ordering = expr.ordering.with_non_sequential()
- expr.predicates = [*self._predicates, predicate_value]
- return expr.build()
-
- def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> OrderedIR:
- """Safely assign by id while maintaining ordering integrity."""
- # TODO: Split into explicit set and replace methods
- ordering_col_ids = set(
- id.sql
- for id in itertools.chain.from_iterable(
- col_ref.scalar_expression.column_references
- for col_ref in self._ordering.ordering_value_columns
- )
- )
- if id in ordering_col_ids:
- return self._hide_column(id)._set_or_replace_by_id(id, new_value)
-
- builder = self.builder()
- if id in self.column_ids:
- builder.columns = [
- val if (col_id != id) else new_value.name(id)
- for col_id, val in zip(self.column_ids, self._columns)
- ]
- else:
- builder.columns = [*self.columns, new_value.name(id)]
- return builder.build()
-
- def _select(self, values: typing.Tuple[ibis_types.Value, ...]) -> OrderedIR:
- """Safely assign by id while maintaining ordering integrity."""
- # TODO: Split into explicit set and replace methods
- ordering_col_ids = set(
- id.sql
- for id in itertools.chain.from_iterable(
- [
- col_ref.scalar_expression.column_references
- for col_ref in self._ordering.ordering_value_columns
- ]
- )
- )
- ir = self
- mappings = {typing.cast(str, value.get_name()): value for value in values}
- for ordering_id in ordering_col_ids:
- # Drop case
- if (ordering_id not in mappings) and (ordering_id in ir.column_ids):
- # id is being dropped, hide it first
- ir = ir._hide_column(ordering_id)
- # Mutate case
- elif (ordering_id in mappings) and not mappings[ordering_id].equals(
- ir._get_any_column(ordering_id)
- ):
- ir = ir._hide_column(ordering_id)
-
- builder = ir.builder()
- builder.columns = list(values)
- return builder.build()
-
- ## Ordering specific helpers
- def _get_any_column(self, key: str) -> ibis_types.Value:
- """Gets the Ibis expression for a given column. Will also get hidden columns."""
- all_columns = {**self._column_names, **self._hidden_ordering_column_names}
- if key not in all_columns.keys():
- raise ValueError(
- "Column name {} not in set of values: {}".format(
- key, all_columns.keys()
- )
- )
- return typing.cast(ibis_types.Value, all_columns[key])
-
- def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column:
- """Gets the Ibis expression for a given hidden column."""
- if key not in self._hidden_ordering_column_names.keys():
- raise ValueError(
- "Column name {} not in set of values: {}".format(
- key, self._hidden_ordering_column_names.keys()
- )
- )
- return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key])
-
- def _hide_column(self, column_id: str) -> OrderedIR:
- """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated."""
- expr_builder = self.builder()
- # Need to rename column as caller might be creating a new row with the same name but different values.
- # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class.
- new_name = ids.ColumnId(
- bigframes.core.guid.generate_guid(prefix="bigframes_hidden_")
- )
- expr_builder.hidden_ordering_columns = [
- *self._hidden_ordering_columns,
- self._get_ibis_column(column_id).name(new_name.sql),
- ]
- matching_ref = next(
- ref for ref in self._ordering.referenced_columns if ref.sql == column_id
- )
- # allow_partial_bindings since only remapping hidden column, not all columns
- expr_builder.ordering = self._ordering.remap_column_refs(
- {matching_ref: new_name}, allow_partial_bindings=True
- )
- return expr_builder.build()
-
- def _bake_ordering(self) -> OrderedIR:
- """Bakes ordering expression into the selection, maybe creating hidden columns."""
- ordering_expressions = self._ordering.all_ordering_columns
- new_exprs: list[OrderingExpression] = []
- new_baked_cols: list[ibis_types.Value] = []
- for expr in ordering_expressions:
- if isinstance(expr.scalar_expression, ex.OpExpression):
- baked_column = self._compile_expression(expr.scalar_expression).name(
- bigframes.core.guid.generate_guid()
- )
- new_baked_cols.append(baked_column)
- new_expr = OrderingExpression(
- ex.deref(baked_column.get_name()), expr.direction, expr.na_last
- )
- new_exprs.append(new_expr)
- elif isinstance(expr.scalar_expression, ex.DerefOp):
- order_col = expr.scalar_expression.id
- new_exprs.append(expr)
- if order_col.sql not in self.column_ids:
- new_baked_cols.append(
- self._ibis_bindings[expr.scalar_expression.id.sql]
- )
-
- if isinstance(self._ordering, TotalOrdering):
- new_ordering: RowOrdering = TotalOrdering(
- tuple(new_exprs),
- self._ordering.integer_encoding,
- self._ordering.string_encoding,
- total_ordering_columns=frozenset(
- map(
- ex.DerefOp,
- itertools.chain.from_iterable(
- col.referenced_columns for col in new_exprs
- ),
- )
- ),
- )
- else:
- new_ordering = RowOrdering(
- tuple(new_exprs),
- self._ordering.integer_encoding,
- self._ordering.string_encoding,
- )
- return OrderedIR(
- self._table,
- columns=self.columns,
- hidden_ordering_columns=tuple(new_baked_cols),
- ordering=new_ordering,
- predicates=self._predicates,
- )
-
- def _create_order_columns(
- self,
- ordering_mode: str,
- order_col_name: Optional[str],
- expose_hidden_cols: bool,
- ) -> typing.Sequence[ibis_types.Value]:
- # Generate offsets if current ordering id semantics are not sufficiently strict
- if ordering_mode == "string_encoded":
- return (self._create_string_ordering_column().name(order_col_name),)
- elif expose_hidden_cols:
- return self._hidden_ordering_columns
- return ()
-
- def _create_offset_column(self) -> ibis_types.IntegerColumn:
- if self._ordering.total_order_col and self._ordering.is_sequential:
- offsets = self._compile_expression(
- self._ordering.total_order_col.scalar_expression
- )
- return typing.cast(ibis_types.IntegerColumn, offsets)
- else:
- window = bigframes_vendored.ibis.window(order_by=self._ibis_order)
- if self._predicates:
- window = window.group_by(self._reduced_predicate)
- offsets = bigframes_vendored.ibis.row_number().over(window)
- return typing.cast(ibis_types.IntegerColumn, offsets)
-
- def _create_string_ordering_column(self) -> ibis_types.StringColumn:
- if self._ordering.total_order_col and self._ordering.is_string_encoded:
- string_order_ids = op_compiler.compile_expression(
- self._ordering.total_order_col.scalar_expression, self._ibis_bindings
- )
- return typing.cast(ibis_types.StringColumn, string_order_ids)
- if (
- self._ordering.total_order_col
- and self._ordering.integer_encoding.is_encoded
- ):
- # Special case: non-negative integer ordering id can be converted directly to string without regenerating row numbers
- int_values = self._compile_expression(
- self._ordering.total_order_col.scalar_expression
- )
- return encode_order_string(
- typing.cast(ibis_types.IntegerColumn, int_values),
- )
- else:
- # Have to build string from scratch
- window = bigframes_vendored.ibis.window(order_by=self._ibis_order)
- if self._predicates:
- window = window.group_by(self._reduced_predicate)
- row_nums = typing.cast(
- ibis_types.IntegerColumn,
- bigframes_vendored.ibis.row_number().over(window),
- )
- return encode_order_string(row_nums)
+ return UnorderedIR(self._table, (*self.columns, window_op.name(output_name)))
def _compile_expression(self, expr: ex.Expression):
return op_compiler.compile_expression(expr, self._ibis_bindings)
- def _ibis_window_from_spec(
- self, window_spec: WindowSpec, require_total_order: bool
- ):
+ def _ibis_window_from_spec(self, window_spec: WindowSpec):
group_by: typing.List[ibis_types.Value] = (
[
typing.cast(
@@ -1246,8 +436,6 @@ def _ibis_window_from_spec(
if window_spec.grouping_keys
else []
)
- if self._reduced_predicate is not None:
- group_by.append(self._reduced_predicate)
# Construct ordering. There are basically 3 main cases
# 1. Order-independent op (aggregation, cut, rank) with unbound window - no ordering clause needed
@@ -1255,15 +443,12 @@ def _ibis_window_from_spec(
-        # 3. Order-depedenpent op (navigation functions, array_agg) or rows bounds - use total row order to break ties.
+        # 3. Order-dependent op (navigation functions, array_agg) or row bounds - use total row order to break ties.
if window_spec.ordering:
order_by = _convert_ordering_to_table_values(
- {**self._column_names, **self._hidden_ordering_column_names},
+ self._column_names,
window_spec.ordering,
)
- if require_total_order or isinstance(window_spec.bounds, RowsWindowBounds):
- # Some operators need an unambiguous ordering, so the table's total ordering is appended
- order_by = tuple([*order_by, *self._ibis_order])
- elif require_total_order or isinstance(window_spec.bounds, RowsWindowBounds):
+ elif window_spec.row_bounded:
# If window spec has following or preceding bounds, we need to apply an unambiguous ordering.
- order_by = tuple(self._ibis_order)
+ raise ValueError("No ordering provided for ordered analytic function")
else:
# Unbound grouping window. Suitable for aggregations but not for analytic function application.
order_by = None
@@ -1284,30 +469,6 @@ def _ibis_window_from_spec(
raise ValueError(f"unrecognized window bounds {bounds}")
return window
- class Builder:
- def __init__(
- self,
- table: ibis_types.Table,
- ordering: RowOrdering,
- columns: Collection[ibis_types.Value] = (),
- hidden_ordering_columns: Collection[ibis_types.Value] = (),
- predicates: Optional[Collection[ibis_types.BooleanValue]] = None,
- ):
- self.table = table
- self.columns = list(columns)
- self.hidden_ordering_columns = list(hidden_ordering_columns)
- self.ordering = ordering
- self.predicates = list(predicates) if predicates is not None else None
-
- def build(self) -> OrderedIR:
- return OrderedIR(
- table=self.table,
- columns=self.columns,
- hidden_ordering_columns=self.hidden_ordering_columns,
- ordering=self.ordering,
- predicates=self.predicates,
- )
-
def is_literal(column: ibis_types.Value) -> bool:
# Unfortunately, Literals in ibis are not "Columns"s and therefore can't be aggregated.
@@ -1325,18 +486,6 @@ def is_window(column: ibis_types.Value) -> bool:
return any(isinstance(op, ibis_ops.WindowFunction) for op in matches)
-def _reduce_predicate_list(
- predicate_list: typing.Collection[ibis_types.BooleanValue],
-) -> ibis_types.BooleanValue:
- """Converts a list of predicates BooleanValues into a single BooleanValue."""
- if len(predicate_list) == 0:
- raise ValueError("Cannot reduce empty list of predicates")
- if len(predicate_list) == 1:
- (item,) = predicate_list
- return item
- return functools.reduce(lambda acc, pred: acc.__and__(pred), predicate_list)
-
-
def _convert_ordering_to_table_values(
value_lookup: typing.Mapping[str, ibis_types.Value],
ordering_columns: typing.Sequence[OrderingExpression],
diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py
index 6f47d198c5..a72ca47190 100644
--- a/bigframes/core/compile/compiler.py
+++ b/bigframes/core/compile/compiler.py
@@ -24,16 +24,17 @@
import google.cloud.bigquery
import pandas as pd
+from bigframes.core import utils
import bigframes.core.compile.compiled as compiled
import bigframes.core.compile.concat as concat_impl
import bigframes.core.compile.explode
import bigframes.core.compile.ibis_types
+import bigframes.core.compile.isin
import bigframes.core.compile.scalar_op_compiler
import bigframes.core.compile.scalar_op_compiler as compile_scalar
import bigframes.core.compile.schema_translator
import bigframes.core.compile.single_column
import bigframes.core.expression as ex
-import bigframes.core.guid as guids
import bigframes.core.identifiers as ids
import bigframes.core.nodes as nodes
import bigframes.core.ordering as bf_ordering
@@ -50,23 +51,40 @@ class Compiler:
# In unstrict mode, ordering from ReadTable or after joins may be left ambiguous to improve query performance.
strict: bool = True
scalar_op_compiler = compile_scalar.ScalarOpCompiler()
- enable_pruning: bool = False
- enable_densify_ids: bool = False
def compile_sql(
self, node: nodes.BigFrameNode, ordered: bool, output_ids: typing.Sequence[str]
) -> str:
- node = self.set_output_names(node, output_ids)
+ # TODO: get rid of output_ids arg
+ assert len(output_ids) == len(list(node.fields))
+ node = set_output_names(node, output_ids)
if ordered:
node, limit = rewrites.pullup_limit_from_slice(node)
- ir = self.compile_ordered_ir(self._preprocess(node))
- return ir.to_sql(ordered=True, limit=limit)
+ node = nodes.bottom_up(node, rewrites.rewrite_slice)
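+ # Pull the ordering up to the root so it can be applied as a single
+ # ORDER BY on the final SELECT, rather than tracked per-node.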
+ node, ordering = rewrites.pull_up_order(
+ node, order_root=True, ordered_joins=self.strict
+ )
+ ir = self.compile_node(node)
+ return ir.to_sql(
+ order_by=ordering.all_ordering_columns,
+ limit=limit,
+ selections=output_ids,
+ )
else:
- ir = self.compile_unordered_ir(self._preprocess(node)) # type: ignore
- return ir.to_sql()
+ node = nodes.bottom_up(node, rewrites.rewrite_slice)
+ node, _ = rewrites.pull_up_order(
+ node, order_root=False, ordered_joins=self.strict
+ )
+ ir = self.compile_node(node)
+ return ir.to_sql(selections=output_ids)
def compile_peek_sql(self, node: nodes.BigFrameNode, n_rows: int) -> str:
- return self.compile_unordered_ir(self._preprocess(node)).peek_sql(n_rows)
+ ids = [id.sql for id in node.ids]
+ node = nodes.bottom_up(node, rewrites.rewrite_slice)
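+ # Peeking at n rows does not require a deterministic order, so discard the root ordering.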
+ node, _ = rewrites.pull_up_order(
+ node, order_root=False, ordered_joins=self.strict
+ )
+ return self.compile_node(node).to_sql(limit=n_rows, selections=ids)
def compile_raw(
self,
@@ -74,98 +92,60 @@ def compile_raw(
) -> typing.Tuple[
str, typing.Sequence[google.cloud.bigquery.SchemaField], bf_ordering.RowOrdering
]:
- ir = self.compile_ordered_ir(self._preprocess(node))
- sql, schema = ir.raw_sql_and_schema(column_ids=node.schema.names)
- return sql, schema, ir._ordering
+ node = nodes.bottom_up(node, rewrites.rewrite_slice)
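+ # Return the pulled-up ordering to the caller instead of applying it,
+ # so cached results keep their ordering information.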
+ node, ordering = rewrites.pull_up_order(node, ordered_joins=self.strict)
+ ir = self.compile_node(node)
+ sql = ir.to_sql()
+ return sql, node.schema.to_bigquery(), ordering
def _preprocess(self, node: nodes.BigFrameNode):
- if self.enable_pruning:
- used_fields = frozenset(field.id for field in node.fields)
- node = node.prune(used_fields)
node = nodes.bottom_up(node, rewrites.rewrite_slice)
- if self.enable_densify_ids:
- original_names = [id.name for id in node.ids]
- node, _ = rewrites.remap_variables(
- node, id_generator=ids.anonymous_serial_ids()
- )
- node = self.set_output_names(node, original_names)
- return node
-
- def set_output_names(
- self, node: bigframes.core.nodes.BigFrameNode, output_ids: typing.Sequence[str]
- ):
- # TODO: Create specialized output operators that will handle final names
- return nodes.SelectionNode(
- node,
- tuple(
- (ex.DerefOp(old_id), ids.ColumnId(out_id))
- for old_id, out_id in zip(node.ids, output_ids)
- ),
+ node, _ = rewrites.pull_up_order(
+ node, order_root=False, ordered_joins=self.strict
)
-
- def compile_ordered_ir(self, node: nodes.BigFrameNode) -> compiled.OrderedIR:
- return typing.cast(compiled.OrderedIR, self.compile_node(node, True))
-
- def compile_unordered_ir(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR:
- return typing.cast(compiled.UnorderedIR, self.compile_node(node, False))
+ return node
# TODO: Remove cache when schema no longer requires compilation to derive schema (and therefore only compiles for execution)
@functools.lru_cache(maxsize=5000)
- def compile_node(
- self, node: nodes.BigFrameNode, ordered: bool = True
- ) -> compiled.UnorderedIR | compiled.OrderedIR:
+ def compile_node(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR:
"""Compile node into CompileArrayValue. Caches result."""
- return self._compile_node(node, ordered)
+ return self._compile_node(node)
@functools.singledispatchmethod
- def _compile_node(
- self, node: nodes.BigFrameNode, ordered: bool = True
- ) -> compiled.UnorderedIR:
+ def _compile_node(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR:
"""Defines transformation but isn't cached, always use compile_node instead"""
raise ValueError(f"Can't compile unrecognized node: {node}")
@_compile_node.register
- def compile_join(self, node: nodes.JoinNode, ordered: bool = True):
+ def compile_join(self, node: nodes.JoinNode):
condition_pairs = tuple(
(left.id.sql, right.id.sql) for left, right in node.conditions
)
- if ordered:
- # In general, joins are an ordering destroying operation.
- # With ordering_mode = "partial", make this explicit. In
- # this case, we don't need to provide a deterministic ordering.
- if self.strict:
- left_ordered = self.compile_ordered_ir(node.left_child)
- right_ordered = self.compile_ordered_ir(node.right_child)
- return bigframes.core.compile.single_column.join_by_column_ordered(
- left=left_ordered,
- right=right_ordered,
- type=node.type,
- conditions=condition_pairs,
- )
- else:
- left_unordered = self.compile_unordered_ir(node.left_child)
- right_unordered = self.compile_unordered_ir(node.right_child)
- return bigframes.core.compile.single_column.join_by_column_unordered(
- left=left_unordered,
- right=right_unordered,
- type=node.type,
- conditions=condition_pairs,
- ).as_ordered_ir()
- else:
- left_unordered = self.compile_unordered_ir(node.left_child)
- right_unordered = self.compile_unordered_ir(node.right_child)
- return bigframes.core.compile.single_column.join_by_column_unordered(
- left=left_unordered,
- right=right_unordered,
- type=node.type,
- conditions=condition_pairs,
- )
+ left_unordered = self.compile_node(node.left_child)
+ right_unordered = self.compile_node(node.right_child)
+ return bigframes.core.compile.single_column.join_by_column_unordered(
+ left=left_unordered,
+ right=right_unordered,
+ type=node.type,
+ conditions=condition_pairs,
+ )
+
+ @_compile_node.register
+ def compile_isin(self, node: nodes.InNode):
+ left_unordered = self.compile_node(node.left_child)
+ right_unordered = self.compile_node(node.right_child)
+ return bigframes.core.compile.isin.isin_unordered(
+ left=left_unordered,
+ right=right_unordered,
+ indicator_col=node.indicator_col.sql,
+ conditions=(node.left_col.id.sql, node.right_col.id.sql),
+ )
@_compile_node.register
- def compile_fromrange(self, node: nodes.FromRangeNode, ordered: bool = True):
+ def compile_fromrange(self, node: nodes.FromRangeNode):
# Both start and end are single elements and do not inherently have an order
- start = self.compile_unordered_ir(node.start)
- end = self.compile_unordered_ir(node.end)
+ start = self.compile_node(node.start)
+ end = self.compile_node(node.end)
start_table = start._to_ibis_expr()
end_table = end._to_ibis_expr()
@@ -183,36 +163,29 @@ def compile_fromrange(self, node: nodes.FromRangeNode, ordered: bool = True):
.as_table()
.unnest([node.output_id.sql])
)
- if ordered:
- return compiled.OrderedIR(
- labels,
- columns=[labels[labels.columns[0]]],
- ordering=bf_ordering.TotalOrdering().from_offset_col(labels.columns[0]),
- )
- else:
- return compiled.UnorderedIR(
- labels,
- columns=[labels[labels.columns[0]]],
- )
+ return compiled.UnorderedIR(
+ labels,
+ columns=[labels[labels.columns[0]]],
+ )
@_compile_node.register
- def compile_readlocal(self, node: nodes.ReadLocalNode, ordered: bool = True):
+ def compile_readlocal(self, node: nodes.ReadLocalNode):
array_as_pd = pd.read_feather(
io.BytesIO(node.feather_bytes),
columns=[item.source_id for item in node.scan_list.items],
)
- ordered_ir = compiled.OrderedIR.from_pandas(array_as_pd, node.scan_list)
- if ordered:
- return ordered_ir
- else:
- return ordered_ir.to_unordered()
+
+ # Convert timedeltas to microseconds for compatibility with BigQuery
+ _ = utils.replace_timedeltas_with_micros(array_as_pd)
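+ # The conversion mutates array_as_pd in place; the return value is not needed here.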
+
+ offsets = node.offsets_col.sql if node.offsets_col else None
+ return compiled.UnorderedIR.from_pandas(
+ array_as_pd, node.scan_list, offsets=offsets
+ )
@_compile_node.register
- def compile_readtable(self, node: nodes.ReadTableNode, ordered: bool = True):
- if ordered:
- return self.compile_read_table_ordered(node.source, node.scan_list)
- else:
- return self.compile_read_table_unordered(node.source, node.scan_list)
+ def compile_readtable(self, node: nodes.ReadTableNode):
+ return self.compile_read_table_unordered(node.source, node.scan_list)
def read_table_as_unordered_ibis(
self, source: nodes.BigqueryDataSource
@@ -250,140 +223,71 @@ def compile_read_table_unordered(
),
)
- def compile_read_table_ordered(
- self, source: nodes.BigqueryDataSource, scan_list: nodes.ScanList
- ):
- ibis_table = self.read_table_as_unordered_ibis(source)
- if source.ordering is not None:
- visible_column_mapping = {
- ids.ColumnId(scan_item.source_id): scan_item.id
- for scan_item in scan_list.items
- }
- full_mapping = {
- ids.ColumnId(col.name): ids.ColumnId(guids.generate_guid())
- for col in source.ordering.referenced_columns
- }
- full_mapping.update(visible_column_mapping)
-
- ordering = source.ordering.remap_column_refs(full_mapping)
- hidden_columns = tuple(
- ibis_table[source_id.sql].name(out_id.sql)
- for source_id, out_id in full_mapping.items()
- if source_id not in visible_column_mapping
- )
- else:
- # In unstrict mode, don't generate total ordering from hashing as this is
- # expensive (prevent removing any columns from table scan)
- ordering, hidden_columns = bf_ordering.RowOrdering(), ()
-
- return compiled.OrderedIR(
- ibis_table,
- columns=tuple(
- bigframes.core.compile.ibis_types.ibis_value_to_canonical_type(
- ibis_table[scan_item.source_id].name(scan_item.id.sql)
- )
- for scan_item in scan_list.items
- ),
- ordering=ordering,
- hidden_ordering_columns=hidden_columns,
- )
-
@_compile_node.register
- def compile_promote_offsets(
- self, node: nodes.PromoteOffsetsNode, ordered: bool = True
- ):
- result = self.compile_ordered_ir(node.child).promote_offsets(node.col_id.sql)
- return result if ordered else result.to_unordered()
+ def compile_filter(self, node: nodes.FilterNode):
+ return self.compile_node(node.child).filter(node.predicate)
@_compile_node.register
- def compile_filter(self, node: nodes.FilterNode, ordered: bool = True):
- return self.compile_node(node.child, ordered).filter(node.predicate)
-
- @_compile_node.register
- def compile_orderby(self, node: nodes.OrderByNode, ordered: bool = True):
- if ordered:
- if node.is_total_order:
- # more efficient, can just discard any previous ordering and get same result
- return self.compile_unordered_ir(node.child).with_total_order(node.by)
- else:
- return self.compile_ordered_ir(node.child).order_by(node.by)
- else:
- return self.compile_unordered_ir(node.child)
-
- @_compile_node.register
- def compile_reversed(self, node: nodes.ReversedNode, ordered: bool = True):
- if ordered:
- return self.compile_ordered_ir(node.child).reversed()
- else:
- return self.compile_unordered_ir(node.child)
-
- @_compile_node.register
- def compile_selection(self, node: nodes.SelectionNode, ordered: bool = True):
- result = self.compile_node(node.child, ordered)
+ def compile_selection(self, node: nodes.SelectionNode):
+ result = self.compile_node(node.child)
selection = tuple((ref, id.sql) for ref, id in node.input_output_pairs)
return result.selection(selection)
@_compile_node.register
- def compile_projection(self, node: nodes.ProjectionNode, ordered: bool = True):
- result = self.compile_node(node.child, ordered)
+ def compile_projection(self, node: nodes.ProjectionNode):
+ result = self.compile_node(node.child)
projections = ((expr, id.sql) for expr, id in node.assignments)
return result.projection(tuple(projections))
@_compile_node.register
- def compile_concat(self, node: nodes.ConcatNode, ordered: bool = True):
+ def compile_concat(self, node: nodes.ConcatNode):
output_ids = [id.sql for id in node.output_ids]
- if ordered:
- compiled_ordered = [self.compile_ordered_ir(node) for node in node.children]
- return concat_impl.concat_ordered(compiled_ordered, output_ids)
- else:
- compiled_unordered = [
- self.compile_unordered_ir(node) for node in node.children
- ]
- return concat_impl.concat_unordered(compiled_unordered, output_ids)
+ compiled_unordered = [self.compile_node(node) for node in node.children]
+ return concat_impl.concat_unordered(compiled_unordered, output_ids)
@_compile_node.register
- def compile_rowcount(self, node: nodes.RowCountNode, ordered: bool = True):
- result = self.compile_unordered_ir(node.child).row_count(name=node.col_id.sql)
- return result if ordered else result.to_unordered()
+ def compile_rowcount(self, node: nodes.RowCountNode):
+ result = self.compile_node(node.child).row_count(name=node.col_id.sql)
+ return result
@_compile_node.register
- def compile_aggregate(self, node: nodes.AggregateNode, ordered: bool = True):
- has_ordered_aggregation_ops = any(
- aggregate.op.can_order_by for aggregate, _ in node.aggregations
- )
+ def compile_aggregate(self, node: nodes.AggregateNode):
aggs = tuple((agg, id.sql) for agg, id in node.aggregations)
- if ordered and has_ordered_aggregation_ops:
- return self.compile_ordered_ir(node.child).aggregate(
- aggs, node.by_column_ids, node.dropna
- )
- else:
- result = self.compile_unordered_ir(node.child).aggregate(
- aggs, node.by_column_ids, node.dropna
- )
- return result if ordered else result.to_unordered()
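+ # Order-dependent aggregations (e.g. array_agg) now receive the pulled-up ordering explicitly via order_by.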
+ result = self.compile_node(node.child).aggregate(
+ aggs, node.by_column_ids, node.dropna, order_by=node.order_by
+ )
+ return result
@_compile_node.register
- def compile_window(self, node: nodes.WindowOpNode, ordered: bool = True):
- result = self.compile_ordered_ir(node.child).project_window_op(
+ def compile_window(self, node: nodes.WindowOpNode):
+ result = self.compile_node(node.child).project_window_op(
node.expression,
node.window_spec,
node.output_name.sql,
never_skip_nulls=node.never_skip_nulls,
)
- return result if ordered else result.to_unordered()
+ return result
@_compile_node.register
- def compile_explode(self, node: nodes.ExplodeNode, ordered: bool = True):
+ def compile_explode(self, node: nodes.ExplodeNode):
offsets_col = node.offsets_col.sql if (node.offsets_col is not None) else None
- if ordered:
- return bigframes.core.compile.explode.explode_ordered(
- self.compile_ordered_ir(node.child), node.column_ids, offsets_col
- )
- else:
- return bigframes.core.compile.explode.explode_unordered(
- self.compile_unordered_ir(node.child), node.column_ids, offsets_col
- )
+ return bigframes.core.compile.explode.explode_unordered(
+ self.compile_node(node.child), node.column_ids, offsets_col
+ )
@_compile_node.register
- def compile_random_sample(self, node: nodes.RandomSampleNode, ordered: bool = True):
- return self.compile_node(node.child, ordered)._uniform_sampling(node.fraction)
+ def compile_random_sample(self, node: nodes.RandomSampleNode):
+ return self.compile_node(node.child)._uniform_sampling(node.fraction)
+
+
+def set_output_names(
+ node: bigframes.core.nodes.BigFrameNode, output_ids: typing.Sequence[str]
+):
+ # TODO: Create specialized output operators that will handle final names
+ return nodes.SelectionNode(
+ node,
+ tuple(
+ (ex.DerefOp(old_id), ids.ColumnId(out_id))
+ for old_id, out_id in zip(node.ids, output_ids)
+ ),
+ )
diff --git a/bigframes/core/compile/concat.py b/bigframes/core/compile/concat.py
index ede326d00b..742f429f54 100644
--- a/bigframes/core/compile/concat.py
+++ b/bigframes/core/compile/concat.py
@@ -13,21 +13,11 @@
# limitations under the License.
from __future__ import annotations
-import math
import typing
import bigframes_vendored.ibis.expr.api as ibis_api
import bigframes.core.compile.compiled as compiled
-import bigframes.core.expression as ex
-from bigframes.core.ordering import (
- ascending_over,
- reencode_order_string,
- StringEncoding,
- TotalOrdering,
-)
-
-ORDER_ID_COLUMN = "bigframes_ordering_id"
def concat_unordered(
@@ -49,57 +39,3 @@ def concat_unordered(
combined_table,
columns=[combined_table[col] for col in combined_table.columns],
)
-
-
-def concat_ordered(
- items: typing.Sequence[compiled.OrderedIR],
- output_ids: typing.Sequence[str],
-) -> compiled.OrderedIR:
- """Append together multiple ArrayValue objects."""
- if len(items) == 1:
- return items[0]
-
- tables = []
- prefix_base = 10
- prefix_size = math.ceil(math.log(len(items), prefix_base))
- # Must normalize all ids to the same encoding size
- max_encoding_size = max(
- *[expression._ordering.string_encoding.length for expression in items],
- )
- for i, expr in enumerate(items):
- ordering_prefix = str(i).zfill(prefix_size)
- renames = {
- old_id: new_id for old_id, new_id in zip(expr.column_ids, output_ids)
- }
- table = expr._to_ibis_expr(
- ordering_mode="string_encoded",
- order_col_name=ORDER_ID_COLUMN,
- )
- table = table.select(
- [
- table[col].name(renames[col])
- if col != ORDER_ID_COLUMN
- else (
- ordering_prefix
- + reencode_order_string(table[ORDER_ID_COLUMN], max_encoding_size)
- ).name(ORDER_ID_COLUMN)
- for col in table.columns
- ]
- )
- tables.append(table)
- combined_table = ibis_api.union(*tables)
- ordering = TotalOrdering(
- ordering_value_columns=tuple([ascending_over(ORDER_ID_COLUMN)]),
- total_ordering_columns=frozenset([ex.deref(ORDER_ID_COLUMN)]),
- string_encoding=StringEncoding(True, prefix_size + max_encoding_size),
- )
- return compiled.OrderedIR(
- combined_table,
- columns=[
- combined_table[col]
- for col in combined_table.columns
- if col != ORDER_ID_COLUMN
- ],
- hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]],
- ordering=ordering,
- )
diff --git a/bigframes/core/compile/explode.py b/bigframes/core/compile/explode.py
index 0dfc129810..59e3a13d02 100644
--- a/bigframes/core/compile/explode.py
+++ b/bigframes/core/compile/explode.py
@@ -20,9 +20,7 @@
import bigframes.core.compile.compiled as compiled
import bigframes.core.expression as ex
import bigframes.core.guid
-import bigframes.core.identifiers as ids
import bigframes.core.ordering
-from bigframes.core.ordering import TotalOrdering
def explode_unordered(
@@ -73,79 +71,3 @@ def explode_unordered(
table_w_unnest,
columns=columns, # type: ignore
)
-
-
-def explode_ordered(
- input: compiled.OrderedIR,
- columns: typing.Sequence[ex.DerefOp],
- offsets_id: typing.Optional[str],
-) -> compiled.OrderedIR:
- if input.order_non_deterministic:
- id = bigframes.core.guid.generate_guid()
- return input.promote_offsets(id)
- table = input._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True)
- column_ids = tuple(ref.id.sql for ref in columns)
-
- offset_array_id = bigframes.core.guid.generate_guid("offset_array_")
- offset_array = bigframes_vendored.ibis.range(
- 0,
- bigframes_vendored.ibis.greatest(
- 1, # We always want at least 1 element to fill in NULLs for empty arrays.
- bigframes_vendored.ibis.least(
- *[table[column_id].length() for column_id in column_ids]
- ),
- ),
- 1,
- ).name(offset_array_id)
- table_w_offset_array = table.select(
- offset_array,
- *input._column_names,
- *input._hidden_ordering_column_names,
- )
-
- unnest_offset_id = offsets_id or bigframes.core.guid.generate_guid("unnest_offset_")
- unnest_offset = (
- table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id)
- )
- table_w_offset = table_w_offset_array.select(
- unnest_offset,
- *input._column_names,
- *input._hidden_ordering_column_names,
- )
-
- unnested_columns = [
- table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id)
- if column_id in column_ids
- else table_w_offset[column_id]
- for column_id in input._column_names
- ]
-
- table_w_unnest = table_w_offset.select(
- table_w_offset[unnest_offset_id],
- *unnested_columns,
- *input._hidden_ordering_column_names,
- )
-
- output_cols = tuple(input.column_ids) + ((offsets_id,) if offsets_id else ())
- columns = [table_w_unnest[column_name] for column_name in output_cols]
- hidden_ordering_columns = [
- table_w_unnest[column_name]
- for column_name in input._hidden_ordering_column_names
- ]
- if offsets_id is None:
- hidden_ordering_columns.append(table_w_unnest[unnest_offset_id])
- l_mappings = {id: id for id in input._ordering.referenced_columns}
- r_mappings = {ids.ColumnId(unnest_offset_id): ids.ColumnId(unnest_offset_id)}
- ordering = bigframes.core.ordering.join_orderings(
- input._ordering,
- TotalOrdering.from_offset_col(unnest_offset_id),
- l_mappings,
- r_mappings,
- )
-
- return compiled.OrderedIR(
- table_w_unnest,
- columns=columns, # type: ignore
- hidden_ordering_columns=hidden_ordering_columns,
- ordering=ordering,
- )
diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py
index e5d637e426..8a55f6775d 100644
--- a/bigframes/core/compile/ibis_types.py
+++ b/bigframes/core/compile/ibis_types.py
@@ -13,9 +13,8 @@
# limitations under the License.
from __future__ import annotations
-import textwrap
import typing
-from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union
+from typing import cast, Dict, Iterable, Optional, Tuple, Union
import bigframes_vendored.constants as constants
import bigframes_vendored.ibis
@@ -28,7 +27,6 @@
import db_dtypes # type: ignore
import geopandas as gpd # type: ignore
import google.cloud.bigquery as bigquery
-import numpy as np
import pandas as pd
import pyarrow as pa
@@ -228,9 +226,7 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value:
def bigframes_dtype_to_ibis_dtype(
- bigframes_dtype: Union[
- bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[Any]
- ]
+ bigframes_dtype: bigframes.dtypes.Dtype,
) -> ibis_dtypes.DataType:
"""Converts a BigQuery DataFrames supported dtype to an Ibis dtype.
@@ -244,11 +240,6 @@ def bigframes_dtype_to_ibis_dtype(
Raises:
ValueError: If passed a dtype not supported by BigQuery DataFrames.
"""
- if str(bigframes_dtype) in bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES:
- bigframes_dtype = bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[
- cast(bigframes.dtypes.DtypeString, str(bigframes_dtype))
- ]
-
if bigframes_dtype in BIGFRAMES_TO_IBIS.keys():
return BIGFRAMES_TO_IBIS[bigframes_dtype]
@@ -256,24 +247,7 @@ def bigframes_dtype_to_ibis_dtype(
return _arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype)
else:
- raise ValueError(
- textwrap.dedent(
- f"""
- Unexpected data type {bigframes_dtype}. The following
- str dtypes are supppted: 'boolean','Float64','Int64',
- 'int64[pyarrow]','string','string[pyarrow]',
- 'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
- 'date32[day][pyarrow]','time64[us][pyarrow]'.
- The following pandas.ExtensionDtype are supported:
- pandas.BooleanDtype(), pandas.Float64Dtype(),
- pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
- pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
- pd.ArrowDtype(pa.timestamp("us")),
- pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
- {constants.FEEDBACK_LINK}
- """
- )
- )
+ raise ValueError(f"Datatype has no ibis type mapping: {bigframes_dtype}")
def ibis_dtype_to_bigframes_dtype(
diff --git a/bigframes/core/compile/isin.py b/bigframes/core/compile/isin.py
new file mode 100644
index 0000000000..29acf9e284
--- /dev/null
+++ b/bigframes/core/compile/isin.py
@@ -0,0 +1,71 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helpers to join ArrayValue objects."""
+
+from __future__ import annotations
+
+import itertools
+from typing import Tuple
+
+import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
+import bigframes_vendored.ibis.expr.types as ibis_types
+
+import bigframes.core.compile.compiled as compiled
+
+
+def isin_unordered(
+ left: compiled.UnorderedIR,
+ right: compiled.UnorderedIR,
+ indicator_col: str,
+ conditions: Tuple[str, str],
+) -> compiled.UnorderedIR:
+ """Join two expressions by column equality.
+
+ Arguments:
+ left: Expression for left table to join.
+ right: Expression for right table to join.
+ conditions: Id pairs to compare
+ Returns:
+ The joined expression.
+ """
+ left_table = left._to_ibis_expr()
+ right_table = right._to_ibis_expr()
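+ # For each left row, test whether its key appears in the right-side column.
+ # Keys are normalized so that NULLs match each other, as pandas expects.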
+ new_column = (
+ value_to_join_key(left_table[conditions[0]])
+ .isin(value_to_join_key(right_table[conditions[1]]))
+ .name(indicator_col)
+ )
+
+ columns = tuple(
+ itertools.chain(
+ (left_table[col.get_name()] for col in left.columns), (new_column,)
+ )
+ )
+
+ return compiled.UnorderedIR(
+ left_table,
+ columns=columns,
+ )
+
+
+def value_to_join_key(value: ibis_types.Value):
+ """Converts nullable values to non-null string SQL will not match null keys together - but pandas does."""
+ if not value.type().is_string():
+ value = value.cast(ibis_dtypes.str)
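+ # Prefer the newer ibis fill_null API, falling back to fillna on older versions.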
+ return (
+ value.fill_null(ibis_types.literal("$NULL_SENTINEL$"))
+ if hasattr(value, "fill_null")
+ else value.fillna(ibis_types.literal("$NULL_SENTINEL$"))
+ )
diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py
index b903f9b552..9216051d91 100644
--- a/bigframes/core/compile/single_column.py
+++ b/bigframes/core/compile/single_column.py
@@ -23,110 +23,6 @@
import bigframes_vendored.ibis.expr.types as ibis_types
import bigframes.core.compile.compiled as compiled
-import bigframes.core.guid as guids
-import bigframes.core.identifiers as ids
-import bigframes.core.ordering as orderings
-
-
-def join_by_column_ordered(
- left: compiled.OrderedIR,
- right: compiled.OrderedIR,
- conditions: Tuple[Tuple[str, str], ...],
- type: Literal["inner", "outer", "left", "right", "cross"],
-) -> compiled.OrderedIR:
- """Join two expressions by column equality.
-
- Arguments:
- left: Expression for left table to join.
- left_column_ids: Column IDs (not label) to join by.
- right: Expression for right table to join.
- right_column_ids: Column IDs (not label) to join by.
- how: The type of join to perform.
- allow_row_identity_join (bool):
- If True, allow matching by row identity. Set to False to always
- perform a true JOIN in generated SQL.
- Returns:
- The joined expression. The resulting columns will be, in order,
- first the coalesced join keys, then, all the left columns, and
- finally, all the right columns.
- """
- if type == "right":
- if left.order_non_deterministic:
- right = right._bake_ordering()
- else:
- if left.order_non_deterministic:
- left = left._bake_ordering()
-
- # Do not reset the generator
- l_value_mapping = dict(zip(left.column_ids, left.column_ids))
- r_value_mapping = dict(zip(right.column_ids, right.column_ids))
-
- # hidden columns aren't necessarily unique, so need to remap to guids
- l_hidden_mapping = {
- id: guids.generate_guid("hidden_") for id in left._hidden_column_ids
- }
- r_hidden_mapping = {
- id: guids.generate_guid("hidden_") for id in right._hidden_column_ids
- }
-
- l_mapping = {**l_value_mapping, **l_hidden_mapping}
- r_mapping = {**r_value_mapping, **r_hidden_mapping}
-
- left_table = left._to_ibis_expr(
- ordering_mode="unordered",
- expose_hidden_cols=True,
- )
- left_table = left_table.rename({val: key for key, val in l_hidden_mapping.items()})
- right_table = right._to_ibis_expr(
- ordering_mode="unordered",
- expose_hidden_cols=True,
- )
- right_table = right_table.rename(
- {val: key for key, val in r_hidden_mapping.items()}
- )
- join_conditions = [
- value_to_join_key(left_table[l_mapping[left_index]])
- == value_to_join_key(right_table[r_mapping[right_index]])
- for left_index, right_index in conditions
- ]
-
- combined_table = ibis_api.join(
- left_table,
- right_table,
- predicates=join_conditions,
- how=type, # type: ignore
- )
-
- # Preserve ordering accross joins.
- ordering = orderings.join_orderings(
- left._ordering,
- right._ordering,
- {ids.ColumnId(lin): ids.ColumnId(lout) for lin, lout in l_mapping.items()},
- {ids.ColumnId(rin): ids.ColumnId(rout) for rin, rout in r_mapping.items()},
- left_order_dominates=(type != "right"),
- )
-
- # We could filter out the original join columns, but predicates/ordering
- # might still reference them in implicit joins.
- columns = [combined_table[l_mapping[col.get_name()]] for col in left.columns] + [
- combined_table[r_mapping[col.get_name()]] for col in right.columns
- ]
- hidden_ordering_columns = [
- *[
- combined_table[l_hidden_mapping[col.get_name()]]
- for col in left._hidden_ordering_columns
- ],
- *[
- combined_table[r_hidden_mapping[col.get_name()]]
- for col in right._hidden_ordering_columns
- ],
- ]
- return compiled.OrderedIR(
- combined_table,
- columns=columns,
- hidden_ordering_columns=hidden_ordering_columns,
- ordering=ordering,
- )
def join_by_column_unordered(
@@ -167,8 +63,6 @@ def join_by_column_unordered(
predicates=join_conditions,
how=type, # type: ignore
)
- # We could filter out the original join columns, but predicates/ordering
- # might still reference them in implicit joins.
columns = [combined_table[col.get_name()] for col in left.columns] + [
combined_table[col.get_name()] for col in right.columns
]
diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py
index 9173bebfc4..8621d5d915 100644
--- a/bigframes/core/expression.py
+++ b/bigframes/core/expression.py
@@ -18,7 +18,9 @@
import dataclasses
import itertools
import typing
-from typing import Mapping, TypeVar, Union
+from typing import Generator, Mapping, TypeVar, Union
+
+import pandas as pd
import bigframes.core.identifiers as ids
import bigframes.dtypes as dtypes
@@ -153,6 +155,16 @@ class Expression(abc.ABC):
def free_variables(self) -> typing.Tuple[str, ...]:
return ()
+ @property
+ def children(self) -> typing.Tuple[Expression, ...]:
+ return ()
+
+ @property
+ def expensive(self) -> bool:
+ return any(
+ isinstance(ex, OpExpression) and ex.op.expensive for ex in self.walk()
+ )
+
@property
@abc.abstractmethod
def column_references(self) -> typing.Tuple[ids.ColumnId, ...]:
@@ -214,6 +226,11 @@ def is_identity(self) -> bool:
"""True for identity operation that does not transform input."""
return False
+ def walk(self) -> Generator[Expression, None, None]:
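+ # Depth-first pre-order traversal of this expression and all sub-expressions.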
+ yield self
+ for child in self.children:
+ yield from child.walk()
+
@dataclasses.dataclass(frozen=True)
class ScalarConstantExpression(Expression):
@@ -253,6 +270,17 @@ def is_bijective(self) -> bool:
# () <-> value
return True
+ def __eq__(self, other):
+ if not isinstance(other, ScalarConstantExpression):
+ return False
+
+ # With python 3.13 and the pre-release version of pandas,
+ # NA == NA is NA instead of True
+ if pd.isna(self.value) and pd.isna(other.value): # type: ignore
+ return self.dtype == other.dtype
+
+ return self.value == other.value and self.dtype == other.dtype
+
@dataclasses.dataclass(frozen=True)
class UnboundVariableExpression(Expression):
@@ -376,6 +404,10 @@ def free_variables(self) -> typing.Tuple[str, ...]:
def is_const(self) -> bool:
return all(child.is_const for child in self.inputs)
+ @property
+ def children(self):
+ return self.inputs
+
def output_type(
self, input_types: dict[ids.ColumnId, dtypes.ExpressionType]
) -> dtypes.ExpressionType:
diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py
index 5fb5fb14d2..5f9fcb257e 100644
--- a/bigframes/core/groupby/__init__.py
+++ b/bigframes/core/groupby/__init__.py
@@ -683,10 +683,12 @@ def cummin(self, *args, **kwargs) -> series.Series:
@validations.requires_ordering()
def cumcount(self, *args, **kwargs) -> series.Series:
+ # TODO: Add nullary op support to implement more cleanly
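+ # SizeUnaryOp counts every row in the cumulative window (nulls included),
+ # so subtracting one yields the 0-based cumcount.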
return (
self._apply_window_op(
- agg_ops.rank_op,
+ agg_ops.SizeUnaryOp(),
discard_name=True,
+ never_skip_nulls=True,
)
- 1
)
@@ -758,6 +760,7 @@ def _apply_window_op(
op: agg_ops.WindowOp,
discard_name=False,
window: typing.Optional[core.WindowSpec] = None,
+ never_skip_nulls: bool = False,
):
"""Apply window op to groupby. Defaults to grouped cumulative window."""
window_spec = window or window_specs.cumulative_rows(
@@ -770,6 +773,7 @@ def _apply_window_op(
op,
result_label=label,
window_spec=window_spec,
+ never_skip_nulls=never_skip_nulls,
)
return series.Series(block.select_column(result_id))
diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py
index da0daf027a..6ad0973262 100644
--- a/bigframes/core/indexes/base.py
+++ b/bigframes/core/indexes/base.py
@@ -78,7 +78,8 @@ def __new__(
if name is not None:
index.name = name
if dtype is not None:
- index = index.astype(dtype)
+ bf_dtype = bigframes.dtypes.bigframes_type(dtype)
+ index = index.astype(bf_dtype)
block = index._block
elif isinstance(data, pandas.Index):
pd_df = pandas.DataFrame(index=data)
@@ -310,7 +311,7 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"):
def astype(
self,
- dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype],
+ dtype,
*,
errors: Literal["raise", "null"] = "raise",
) -> Index:
@@ -318,6 +319,7 @@ def astype(
raise ValueError("Argument 'errors' must be one of 'raise' or 'null'")
if self.nlevels > 1:
raise TypeError("Multiindex does not support 'astype'")
+ dtype = bigframes.dtypes.bigframes_type(dtype)
return self._apply_unary_expr(
ops.AsTypeOp(to_type=dtype, safe=(errors == "null")).as_expr(
ex.free_var("arg")
diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py
index 573562cefa..d891e385d5 100644
--- a/bigframes/core/local_data.py
+++ b/bigframes/core/local_data.py
@@ -59,6 +59,15 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType:
if pa.types.is_time64(type):
# This is potentially lossy, but BigFrames doesn't support ns
return pa.time64("us")
+ if pa.types.is_duration(type):
+ # This is potentially lossy, but BigFrames doesn't support ns
+ return pa.duration("us")
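+ # Normalize decimals to BigQuery NUMERIC / BIGNUMERIC precision and scale.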
+ if pa.types.is_decimal128(type):
+ return pa.decimal128(38, 9)
+ if pa.types.is_decimal256(type):
+ return pa.decimal256(76, 38)
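+ # Unwrap dictionary-encoded arrays to their underlying value type.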
+ if pa.types.is_dictionary(type):
+ return arrow_type_replacements(type.value_type)
if pa.types.is_large_string(type):
# simple string type can handle the largest strings needed
return pa.string()
diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py
index 36aa6682bd..714a522183 100644
--- a/bigframes/core/log_adapter.py
+++ b/bigframes/core/log_adapter.py
@@ -63,6 +63,9 @@ def submit_pandas_labels(
- 'PANDAS_PARAM_TRACKING_TASK': Indicates that the unimplemented feature is a
parameter of a method.
"""
+ if method_name.startswith("_") and not method_name.startswith("__"):
+ return
+
labels_dict = {
"task": task,
"class_name": class_name.lower(),
@@ -75,7 +78,9 @@ def submit_pandas_labels(
else:
return
- if hasattr(cls, method_name):
+ # Omit __call__, because it's not implemented on the actual instances of
+ # DataFrame/Series, only as the constructor.
+ if method_name != "__call__" and hasattr(cls, method_name):
method = getattr(cls, method_name)
else:
return
diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py
index d5083c3737..085d52daa6 100644
--- a/bigframes/core/nodes.py
+++ b/bigframes/core/nodes.py
@@ -208,6 +208,12 @@ def explicitly_ordered(self) -> bool:
"""
...
+ @functools.cached_property
+ def height(self) -> int:
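+ # Longest path from this node down to a leaf; leaf nodes have height 0.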
+ if len(self.child_nodes) == 0:
+ return 0
+ return max(child.height for child in self.child_nodes) + 1
+
@functools.cached_property
def total_variables(self) -> int:
return self.variables_introduced + sum(
@@ -284,6 +290,34 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode:
return self.transform_children(lambda x: x.prune(used_cols))
+class AdditiveNode:
+ """Definition of additive - if you drop added_fields, you end up with the descendent.
+
+ .. code-block:: text
+
+ AdditiveNode (fields: a, b, c; added_fields: c)
+ |
+ | additive_base
+ V
+ BigFrameNode (fields: a, b)
+
+ """
+
+ @property
+ @abc.abstractmethod
+ def added_fields(self) -> Tuple[Field, ...]:
+ ...
+
+ @property
+ @abc.abstractmethod
+ def additive_base(self) -> BigFrameNode:
+ ...
+
+ @abc.abstractmethod
+ def replace_additive_base(self, node: BigFrameNode):
+ ...
+
+
@dataclasses.dataclass(frozen=True, eq=False)
class UnaryNode(BigFrameNode):
child: BigFrameNode
@@ -381,6 +415,106 @@ def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]):
return self
+@dataclasses.dataclass(frozen=True, eq=False)
+class InNode(BigFrameNode, AdditiveNode):
+ """
+ Special join type that returns only rows from the left side, adding a bool column that indicates whether a match exists on the right side.
+
+ Modelled separately from join node, as this operation preserves row identity.
+ """
+
+ left_child: BigFrameNode
+ right_child: BigFrameNode
+ left_col: ex.DerefOp
+ right_col: ex.DerefOp
+ indicator_col: bfet_ids.ColumnId
+
+ def _validate(self):
+ assert not (
+ set(self.left_child.ids) & set(self.right_child.ids)
+ ), "Join ids collide"
+
+ @property
+ def row_preserving(self) -> bool:
+ return False
+
+ @property
+ def non_local(self) -> bool:
+ return True
+
+ @property
+ def child_nodes(self) -> typing.Sequence[BigFrameNode]:
+ return (self.left_child, self.right_child)
+
+ @property
+ def order_ambiguous(self) -> bool:
+ return False
+
+ @property
+ def explicitly_ordered(self) -> bool:
+ # Preserves left ordering always
+ return True
+
+ @property
+ def added_fields(self) -> Tuple[Field, ...]:
+ return (Field(self.indicator_col, bigframes.dtypes.BOOL_DTYPE),)
+
+ @property
+ def fields(self) -> Iterable[Field]:
+ return itertools.chain(
+ self.left_child.fields,
+ self.added_fields,
+ )
+
+ @functools.cached_property
+ def variables_introduced(self) -> int:
+ """Defines the number of variables generated by the current node. Used to estimate query planning complexity."""
+ return 1
+
+ @property
+ def joins(self) -> bool:
+ return True
+
+ @property
+ def row_count(self) -> Optional[int]:
+ return self.left_child.row_count
+
+ @property
+ def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]:
+ return (self.indicator_col,)
+
+ @property
+ def additive_base(self) -> BigFrameNode:
+ return self.left_child
+
+ def replace_additive_base(self, node: BigFrameNode):
+ return dataclasses.replace(self, left_child=node)
+
+ def transform_children(
+ self, t: Callable[[BigFrameNode], BigFrameNode]
+ ) -> BigFrameNode:
+ transformed = dataclasses.replace(
+ self, left_child=t(self.left_child), right_child=t(self.right_child)
+ )
+ if self == transformed:
+ # reusing existing object speeds up eq, and saves a small amount of memory
+ return self
+ return transformed
+
+ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode:
+ return self
+
+ def remap_vars(
+ self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]
+ ) -> BigFrameNode:
+ return dataclasses.replace(
+ self, indicator_col=mappings.get(self.indicator_col, self.indicator_col)
+ )
+
+ def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]):
+ return dataclasses.replace(self, left_col=self.left_col.remap_column_refs(mappings, allow_partial_bindings=True), right_col=self.right_col.remap_column_refs(mappings, allow_partial_bindings=True)) # type: ignore
+
+
@dataclasses.dataclass(frozen=True, eq=False)
class JoinNode(BigFrameNode):
left_child: BigFrameNode
@@ -890,6 +1024,32 @@ def remap_vars(
def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]):
return self
+ def with_order_cols(self):
+ # Maybe the ordering should be required to always be in the scan list, and then we won't need this?
+ if self.source.ordering is None:
+ return self, orderings.RowOrdering()
+
+ order_cols = {col.sql for col in self.source.ordering.referenced_columns}
+ scan_cols = {col.source_id for col in self.scan_list.items}
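+ # Expand the scan list with any ordering columns it does not already expose,
+ # so the pulled-up ordering can reference them after compilation.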
+ new_scan_cols = [
+ ScanItem(
+ bigframes.core.ids.ColumnId.unique(),
+ dtype=bigframes.dtypes.convert_schema_field(field)[1],
+ source_id=field.name,
+ )
+ for field in self.source.table.physical_schema
+ if (field.name in order_cols) and (field.name not in scan_cols)
+ ]
+ new_scan_list = ScanList(items=(*self.scan_list.items, *new_scan_cols))
+ new_order = self.source.ordering.remap_column_refs(
+ {
+ bigframes.core.ids.ColumnId(item.source_id): item.id
+ for item in new_scan_cols
+ },
+ allow_partial_bindings=True,
+ )
+ return dataclasses.replace(self, scan_list=new_scan_list), new_order
+
@dataclasses.dataclass(frozen=True, eq=False)
class CachedTableNode(ReadTableNode):
@@ -900,7 +1060,7 @@ class CachedTableNode(ReadTableNode):
# Unary nodes
@dataclasses.dataclass(frozen=True, eq=False)
-class PromoteOffsetsNode(UnaryNode):
+class PromoteOffsetsNode(UnaryNode, AdditiveNode):
col_id: bigframes.core.identifiers.ColumnId
@property
@@ -933,6 +1093,13 @@ def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]:
def added_fields(self) -> Tuple[Field, ...]:
return (Field(self.col_id, bigframes.dtypes.INT_DTYPE),)
+ @property
+ def additive_base(self) -> BigFrameNode:
+ return self.child
+
+ def replace_additive_base(self, node: BigFrameNode):
+ return dataclasses.replace(self, child=node)
+
def prune(self, used_cols: COLUMN_SET) -> BigFrameNode:
if self.col_id not in used_cols:
return self.child.prune(used_cols)
@@ -1113,6 +1280,9 @@ def row_count(self) -> Optional[int]:
def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]:
return tuple(id for _, id in self.input_output_pairs)
+ def get_id_mapping(self) -> dict[bfet_ids.ColumnId, bfet_ids.ColumnId]:
+ return {ref.id: out_id for ref, out_id in self.input_output_pairs}
+
def prune(self, used_cols: COLUMN_SET) -> BigFrameNode:
pruned_selections = (
tuple(
@@ -1142,7 +1312,7 @@ def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]):
@dataclasses.dataclass(frozen=True, eq=False)
-class ProjectionNode(UnaryNode):
+class ProjectionNode(UnaryNode, AdditiveNode):
"""Assigns new variables (without modifying existing ones)"""
assignments: typing.Tuple[
@@ -1183,6 +1353,13 @@ def row_count(self) -> Optional[int]:
def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]:
return tuple(id for _, id in self.assignments)
+ @property
+ def additive_base(self) -> BigFrameNode:
+ return self.child
+
+ def replace_additive_base(self, node: BigFrameNode):
+ return dataclasses.replace(self, child=node)
+
def prune(self, used_cols: COLUMN_SET) -> BigFrameNode:
pruned_assignments = tuple(i for i in self.assignments if i[1] in used_cols)
if len(pruned_assignments) == 0:
@@ -1260,6 +1437,7 @@ class AggregateNode(UnaryNode):
typing.Tuple[ex.Aggregation, bigframes.core.identifiers.ColumnId], ...
]
by_column_ids: typing.Tuple[ex.DerefOp, ...] = tuple([])
+ order_by: Tuple[OrderingExpression, ...] = ()
dropna: bool = True
@property
@@ -1308,6 +1486,12 @@ def row_count(self) -> Optional[int]:
def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]:
return tuple(id for _, id in self.aggregations)
+ @property
+ def has_ordered_ops(self) -> bool:
+ return not all(
+ aggregate.op.order_independent for aggregate, _ in self.aggregations
+ )
+
def prune(self, used_cols: COLUMN_SET) -> BigFrameNode:
by_ids = (ref.id for ref in self.by_column_ids)
pruned_aggs = (
@@ -1319,7 +1503,9 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode:
)
consumed_ids = frozenset(itertools.chain(by_ids, agg_inputs))
pruned_child = self.child.prune(consumed_ids)
- return AggregateNode(pruned_child, pruned_aggs, self.by_column_ids, self.dropna)
+ return AggregateNode(
+ pruned_child, pruned_aggs, self.by_column_ids, dropna=self.dropna
+ )
def remap_vars(
self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]
@@ -1333,13 +1519,14 @@ def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]):
for agg, id in self.aggregations
)
new_by_ids = tuple(id.remap_column_refs(mappings) for id in self.by_column_ids)
+ new_order_by = tuple(part.remap_column_refs(mappings) for part in self.order_by)
return dataclasses.replace(
- self, by_column_ids=new_by_ids, aggregations=new_aggs
+ self, by_column_ids=new_by_ids, aggregations=new_aggs, order_by=new_order_by
)
@dataclasses.dataclass(frozen=True, eq=False)
-class WindowOpNode(UnaryNode):
+class WindowOpNode(UnaryNode, AdditiveNode):
expression: ex.Aggregation
window_spec: window.WindowSpec
output_name: bigframes.core.identifiers.ColumnId
@@ -1348,6 +1535,10 @@ class WindowOpNode(UnaryNode):
def _validate(self):
"""Validate the local data in the node."""
+ # Since inner order and row bounds are coupled, rank ops can't be row bounded
+ assert (
+ not self.window_spec.row_bounded
+ ) or self.expression.op.implicitly_inherits_order
assert all(ref in self.child.ids for ref in self.expression.column_references)
@property
@@ -1387,6 +1578,21 @@ def added_field(self) -> Field:
def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]:
return (self.output_name,)
+ @property
+ def inherits_order(self) -> bool:
+ # Does the op use ordering at all, and if so, can it inherit order?
+ op_inherits_order = (
+ not self.expression.op.order_independent
+ ) and self.expression.op.implicitly_inherits_order
+ return op_inherits_order or self.window_spec.row_bounded
+
+ @property
+ def additive_base(self) -> BigFrameNode:
+ return self.child
+
+ def replace_additive_base(self, node: BigFrameNode):
+ return dataclasses.replace(self, child=node)
+
def prune(self, used_cols: COLUMN_SET) -> BigFrameNode:
if self.output_name not in used_cols:
return self.child.prune(used_cols)
diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py
index 357cc8145c..2fc7573b21 100644
--- a/bigframes/core/ordering.py
+++ b/bigframes/core/ordering.py
@@ -16,21 +16,12 @@
from dataclasses import dataclass, field
from enum import Enum
-import math
import typing
-from typing import Mapping, Optional, Sequence, Set
-
-import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
-import bigframes_vendored.ibis.expr.types as ibis_types
+from typing import Mapping, Optional, Sequence, Set, Union
import bigframes.core.expression as expression
import bigframes.core.identifiers as ids
-# TODO(tbergeron): Encode more efficiently
-ORDERING_ID_STRING_BASE: int = 10
-# Sufficient to store any value up to 2^63
-DEFAULT_ORDERING_ID_LENGTH: int = math.ceil(63 * math.log(2, ORDERING_ID_STRING_BASE))
-
class OrderingDirection(Enum):
ASC = 1
@@ -93,16 +84,6 @@ def with_reverse(self) -> OrderingExpression:
# Encoding classes specify additional properties for some ordering representations
-@dataclass(frozen=True)
-class StringEncoding:
- """String encoded order ids are fixed length and can be concat together in joins."""
-
- is_encoded: bool = False
- # Encoding size must be tracked in order to know what how to combine ordering ids across tables (eg how much to pad when combining different length).
- # Also will be needed to determine when length is too large and need to compact ordering id with a ROW_NUMBER operation.
- length: int = DEFAULT_ORDERING_ID_LENGTH
-
-
@dataclass(frozen=True)
class IntegerEncoding:
"""Integer encoded order ids are guaranteed non-negative."""
@@ -117,7 +98,6 @@ class RowOrdering:
ordering_value_columns: typing.Tuple[OrderingExpression, ...] = ()
integer_encoding: IntegerEncoding = IntegerEncoding(False)
- string_encoding: StringEncoding = StringEncoding(False)
@property
def all_ordering_columns(self) -> Sequence[OrderingExpression]:
@@ -131,11 +111,6 @@ def referenced_columns(self) -> Set[ids.ColumnId]:
for col in part.referenced_columns
)
- @property
- def is_string_encoded(self) -> bool:
- """True if ordering is fully defined by a fixed length string column."""
- return self.string_encoding.is_encoded
-
@property
def is_sequential(self) -> bool:
return self.integer_encoding.is_encoded and self.integer_encoding.is_sequential
@@ -207,6 +182,13 @@ def with_ordering_columns(
new_ordering,
)
+ def join(
+ self,
+ other: RowOrdering,
+ ) -> RowOrdering:
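+ # Left ordering keys take precedence; right keys only break remaining ties.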
+ joined_refs = [*self.all_ordering_columns, *other.all_ordering_columns]
+ return RowOrdering(tuple(joined_refs))
+
def _truncate_ordering(
self, order_refs: tuple[OrderingExpression, ...]
) -> tuple[OrderingExpression, ...]:
@@ -239,19 +221,20 @@ def __post_init__(self):
)
@classmethod
- def from_offset_col(cls, col: str) -> TotalOrdering:
+ def from_offset_col(cls, col: Union[ids.ColumnId, str]) -> TotalOrdering:
+ col_id = ids.ColumnId(col) if isinstance(col, str) else col
return TotalOrdering(
(ascending_over(col),),
integer_encoding=IntegerEncoding(True, is_sequential=True),
- total_ordering_columns=frozenset({expression.deref(col)}),
+ total_ordering_columns=frozenset({expression.DerefOp(col_id)}),
)
@classmethod
- def from_primary_key(cls, primary_key: Sequence[str]) -> TotalOrdering:
+ def from_primary_key(cls, primary_key: Sequence[ids.ColumnId]) -> TotalOrdering:
return TotalOrdering(
tuple(ascending_over(col) for col in primary_key),
total_ordering_columns=frozenset(
- {expression.deref(col) for col in primary_key}
+ {expression.DerefOp(col) for col in primary_key}
),
)
@@ -342,10 +325,38 @@ def remap_column_refs(
return TotalOrdering(
tuple(new_value_columns),
integer_encoding=self.integer_encoding,
- string_encoding=self.string_encoding,
total_ordering_columns=new_total_order,
)
+ @typing.overload
+ def join(
+ self,
+ other: TotalOrdering,
+ ) -> TotalOrdering:
+ ...
+
+ @typing.overload
+ def join(
+ self,
+ other: RowOrdering,
+ ) -> RowOrdering:
+ ...
+
+ def join(
+ self,
+ other: RowOrdering,
+ ) -> RowOrdering:
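+ # Joining two total orderings yields a total ordering: the union of both
+ # key sets still uniquely identifies each joined row.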
+ joined_refs = [*self.all_ordering_columns, *other.all_ordering_columns]
+ if isinstance(other, TotalOrdering):
+ left_total_order_cols = frozenset(self.total_ordering_columns)
+ right_total_order_cols = frozenset(other.total_ordering_columns)
+ return TotalOrdering(
+ ordering_value_columns=tuple(joined_refs),
+ total_ordering_columns=left_total_order_cols | right_total_order_cols,
+ )
+ else:
+ return RowOrdering(tuple(joined_refs))
+
@property
def total_order_col(self) -> Optional[OrderingExpression]:
"""Returns column id of columns that defines total ordering, if such as column exists"""
@@ -357,93 +368,18 @@ def total_order_col(self) -> Optional[OrderingExpression]:
return order_ref
-def encode_order_string(
- order_id: ibis_types.IntegerColumn, length: int = DEFAULT_ORDERING_ID_LENGTH
-) -> ibis_types.StringColumn:
- """Converts an order id value to string if it is not already a string. MUST produced fixed-length strings."""
- # This is very inefficient encoding base-10 string uses only 10 characters per byte(out of 256 bit combinations)
- # Furthermore, if know tighter bounds on order id are known, can produce smaller strings.
- # 19 characters chosen as it can represent any positive Int64 in base-10
- # For missing values, ":" * 19 is used as it is larger than any other value this function produces, so null values will be last.
- string_order_id = typing.cast(
- ibis_types.StringValue,
- order_id.cast(ibis_dtypes.string),
- ).lpad(length, "0")
- return typing.cast(ibis_types.StringColumn, string_order_id)
-
-
-def reencode_order_string(
- order_id: ibis_types.StringColumn, length: int
-) -> ibis_types.StringColumn:
- return typing.cast(
- ibis_types.StringColumn,
- (typing.cast(ibis_types.StringValue, order_id).lpad(length, "0")),
- )
-
-
# Convenience functions
-def ascending_over(id: str, nulls_last: bool = True) -> OrderingExpression:
- return OrderingExpression(expression.deref(id), na_last=nulls_last)
+def ascending_over(
+ id: Union[ids.ColumnId, str], nulls_last: bool = True
+) -> OrderingExpression:
+ col_id = ids.ColumnId(id) if isinstance(id, str) else id
+ return OrderingExpression(expression.DerefOp(col_id), na_last=nulls_last)
-def descending_over(id: str, nulls_last: bool = True) -> OrderingExpression:
+def descending_over(
+ id: Union[ids.ColumnId, str], nulls_last: bool = True
+) -> OrderingExpression:
+ col_id = ids.ColumnId(id) if isinstance(id, str) else id
return OrderingExpression(
- expression.deref(id), direction=OrderingDirection.DESC, na_last=nulls_last
+ expression.DerefOp(col_id), direction=OrderingDirection.DESC, na_last=nulls_last
)
-
-
-@typing.overload
-def join_orderings(
- left: TotalOrdering,
- right: TotalOrdering,
- left_id_mapping: Mapping[ids.ColumnId, ids.ColumnId],
- right_id_mapping: Mapping[ids.ColumnId, ids.ColumnId],
- left_order_dominates: bool = True,
-) -> TotalOrdering:
- ...
-
-
-@typing.overload
-def join_orderings(
- left: RowOrdering,
- right: RowOrdering,
- left_id_mapping: Mapping[ids.ColumnId, ids.ColumnId],
- right_id_mapping: Mapping[ids.ColumnId, ids.ColumnId],
- left_order_dominates: bool = True,
-) -> RowOrdering:
- ...
-
-
-def join_orderings(
- left: RowOrdering,
- right: RowOrdering,
- left_id_mapping: Mapping[ids.ColumnId, ids.ColumnId],
- right_id_mapping: Mapping[ids.ColumnId, ids.ColumnId],
- left_order_dominates: bool = True,
-) -> RowOrdering:
- left_ordering_refs = [
- ref.remap_column_refs(left_id_mapping) for ref in left.all_ordering_columns
- ]
- right_ordering_refs = [
- ref.remap_column_refs(right_id_mapping) for ref in right.all_ordering_columns
- ]
- if left_order_dominates:
- joined_refs = [*left_ordering_refs, *right_ordering_refs]
- else:
- joined_refs = [*right_ordering_refs, *left_ordering_refs]
-
- if isinstance(left, TotalOrdering) and isinstance(right, TotalOrdering):
- left_total_order_cols = frozenset(
- [left_id_mapping[ref.id] for ref in left.total_ordering_columns]
- )
- right_total_order_cols = frozenset(
- [right_id_mapping[ref.id] for ref in right.total_ordering_columns]
- )
- return TotalOrdering(
- ordering_value_columns=tuple(joined_refs),
- total_ordering_columns=frozenset(
- map(expression.DerefOp, left_total_order_cols | right_total_order_cols)
- ),
- )
- else:
- return RowOrdering(tuple(joined_refs))
diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py
index f5275239d9..9044cb25f9 100644
--- a/bigframes/core/rewrite/__init__.py
+++ b/bigframes/core/rewrite/__init__.py
@@ -15,6 +15,7 @@
from bigframes.core.rewrite.identifiers import remap_variables
from bigframes.core.rewrite.implicit_align import try_row_join
from bigframes.core.rewrite.legacy_align import legacy_join_as_projection
+from bigframes.core.rewrite.order import pull_up_order
from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice
__all__ = [
@@ -23,4 +24,5 @@
"rewrite_slice",
"pullup_limit_from_slice",
"remap_variables",
+ "pull_up_order",
]
diff --git a/bigframes/core/rewrite/implicit_align.py b/bigframes/core/rewrite/implicit_align.py
index 41cc1ce82a..1b864fb919 100644
--- a/bigframes/core/rewrite/implicit_align.py
+++ b/bigframes/core/rewrite/implicit_align.py
@@ -14,7 +14,8 @@
from __future__ import annotations
import dataclasses
-from typing import Iterable, Optional, Tuple
+import itertools
+from typing import cast, Optional, Sequence, Set, Tuple
import bigframes.core.expression
import bigframes.core.guid
@@ -24,16 +25,13 @@
import bigframes.core.window_spec
import bigframes.operations.aggregations
-# Additive nodes leave existing columns completely intact, and only add new columns to the end
-ADDITIVE_NODES = (
- bigframes.core.nodes.ProjectionNode,
- bigframes.core.nodes.WindowOpNode,
- bigframes.core.nodes.PromoteOffsetsNode,
-)
# Combination of selects and additive nodes can be merged as an explicit keyless "row join"
ALIGNABLE_NODES = (
- *ADDITIVE_NODES,
bigframes.core.nodes.SelectionNode,
+ bigframes.core.nodes.ProjectionNode,
+ bigframes.core.nodes.WindowOpNode,
+ bigframes.core.nodes.PromoteOffsetsNode,
+ bigframes.core.nodes.InNode,
)
@@ -73,6 +71,7 @@ def get_expression_spec(
(
bigframes.core.nodes.WindowOpNode,
bigframes.core.nodes.PromoteOffsetsNode,
+ bigframes.core.nodes.InNode,
),
):
if set(expression.column_references).isdisjoint(
@@ -85,7 +84,7 @@ def get_expression_spec(
return ExpressionSpec(expression, curr_node)
else:
return ExpressionSpec(expression, curr_node)
- curr_node = curr_node.child
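+ # For unary nodes this is the child; for the binary InNode, row identity
+ # flows from the left child, which is child_nodes[0].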
+ curr_node = curr_node.child_nodes[0]
def try_row_join(
@@ -95,7 +94,7 @@ def try_row_join(
) -> Optional[bigframes.core.nodes.BigFrameNode]:
"""Joins the two nodes"""
divergent_node = first_shared_descendent(
- l_node, r_node, descendable_types=ALIGNABLE_NODES
+ {l_node, r_node}, descendable_types=ALIGNABLE_NODES
)
if divergent_node is None:
return None
@@ -124,11 +123,11 @@ def _linearize_trees(
# base case: append tree does not have any divergent nodes to linearize
if append_tree == divergent_node:
return base_tree
- else:
- assert isinstance(append_tree, ADDITIVE_NODES)
- return append_tree.replace_child(
- _linearize_trees(base_tree, append_tree.child)
- )
+
+ assert isinstance(append_tree, bigframes.core.nodes.AdditiveNode)
+ return append_tree.replace_additive_base(
+ _linearize_trees(base_tree, append_tree.additive_base)
+ )
merged_node = _linearize_trees(l_node, r_node)
return bigframes.core.nodes.SelectionNode(merged_node, combined_selection)
@@ -161,13 +160,40 @@ def pull_up_selection(
(bigframes.core.expression.DerefOp(field.id), field.id)
for field in node.fields
)
- assert isinstance(node, (bigframes.core.nodes.SelectionNode, *ADDITIVE_NODES))
- child_node, child_selections = pull_up_selection(
- node.child, stop, rename_vars=rename_vars
- )
- mapping = {out: ref.id for ref, out in child_selections}
- if isinstance(node, ADDITIVE_NODES):
- new_node: bigframes.core.nodes.BigFrameNode = node.replace_child(child_node)
+ # InNode needs special handling, as it's a binary node, but row identity comes from the left side only.
+ # TODO: Merge code with unary op paths
+ if isinstance(node, bigframes.core.nodes.InNode):
+ child_node, child_selections = pull_up_selection(
+ node.left_child, stop=stop, rename_vars=rename_vars
+ )
+ mapping = {out: ref.id for ref, out in child_selections}
+
+ new_in_node: bigframes.core.nodes.InNode = dataclasses.replace(
+ node, left_child=child_node
+ )
+ new_in_node = new_in_node.remap_refs(mapping)
+ if rename_vars:
+ new_in_node = cast(
+ bigframes.core.nodes.InNode,
+ new_in_node.remap_vars(
+ {node.indicator_col: bigframes.core.identifiers.ColumnId.unique()}
+ ),
+ )
+ added_selection = (
+ bigframes.core.expression.DerefOp(new_in_node.indicator_col),
+ node.indicator_col,
+ )
+ new_selection = (*child_selections, added_selection)
+ return new_in_node, new_selection
+
+ if isinstance(node, bigframes.core.nodes.AdditiveNode):
+ child_node, child_selections = pull_up_selection(
+ node.additive_base, stop, rename_vars=rename_vars
+ )
+ mapping = {out: ref.id for ref, out in child_selections}
+ new_node: bigframes.core.nodes.BigFrameNode = node.replace_additive_base(
+ child_node
+ )
new_node = new_node.remap_refs(mapping)
if rename_vars:
var_renames = {
@@ -177,7 +203,7 @@ def pull_up_selection(
new_node = new_node.remap_vars(var_renames)
else:
var_renames = {}
- assert isinstance(new_node, ADDITIVE_NODES)
+ assert isinstance(new_node, bigframes.core.nodes.AdditiveNode)
added_selections = (
(
bigframes.core.expression.DerefOp(var_renames.get(field.id, field.id)),
@@ -188,6 +214,10 @@ def pull_up_selection(
new_selection = (*child_selections, *added_selections)
return new_node, new_selection
elif isinstance(node, bigframes.core.nodes.SelectionNode):
+ child_node, child_selections = pull_up_selection(
+ node.child, stop, rename_vars=rename_vars
+ )
+ mapping = {out: ref.id for ref, out in child_selections}
new_selection = tuple(
(
bigframes.core.expression.DerefOp(mapping[ref.id]),
@@ -201,26 +231,31 @@ def pull_up_selection(
## Traversal helpers
def first_shared_descendent(
- left: bigframes.core.nodes.BigFrameNode,
- right: bigframes.core.nodes.BigFrameNode,
- descendable_types: Tuple[type[bigframes.core.nodes.UnaryNode], ...],
+ roots: Set[bigframes.core.nodes.BigFrameNode],
+ descendable_types: Tuple[type[bigframes.core.nodes.BigFrameNode], ...],
) -> Optional[bigframes.core.nodes.BigFrameNode]:
- l_path = tuple(descend(left, descendable_types))
- r_path = tuple(descend(right, descendable_types))
- if l_path[-1] != r_path[-1]:
+ if not roots:
return None
+ if len(roots) == 1:
+ return next(iter(roots))
- for l_node, r_node in zip(l_path[-len(r_path) :], r_path[-len(l_path) :]):
- if l_node == r_node:
- return l_node
- # should be impossible, as l_path[-1] == r_path[-1]
- raise ValueError()
+ min_height = min(root.height for root in roots)
+ def descend(
+ root: bigframes.core.nodes.BigFrameNode,
+ ) -> Sequence[bigframes.core.nodes.BigFrameNode]:
+ # Special case: don't descend into the right side of an InNode
+ if isinstance(root, bigframes.core.nodes.AdditiveNode):
+ return (root.additive_base,)
+ return root.child_nodes
-def descend(
- root: bigframes.core.nodes.BigFrameNode,
- descendable_types: Tuple[type[bigframes.core.nodes.UnaryNode], ...],
-) -> Iterable[bigframes.core.nodes.BigFrameNode]:
- yield root
- if isinstance(root, descendable_types):
- yield from descend(root.child, descendable_types)
+ roots_to_descend = set(root for root in roots if root.height > min_height)
+ if not roots_to_descend:
+ roots_to_descend = roots
+ if any(not isinstance(root, descendable_types) for root in roots_to_descend):
+ return None
+ as_is = roots - roots_to_descend
+ descended = set(
+ itertools.chain.from_iterable(descend(root) for root in roots_to_descend)
+ )
+ return first_shared_descendent(as_is.union(descended), descendable_types)
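
For intuition, the rewritten first_shared_descendent walks a set of roots downward, expanding only the roots that are still taller than the shortest one, until every path converges on a single shared node (or a non-descendable node blocks the search). A minimal self-contained sketch of that height-based convergence, using a toy Node type rather than the bigframes node classes:

from __future__ import annotations

import dataclasses
import itertools
from typing import Optional, Tuple


@dataclasses.dataclass(frozen=True)
class Node:
    name: str
    children: Tuple["Node", ...] = ()

    @property
    def height(self) -> int:
        return 1 + max((c.height for c in self.children), default=0)


def shared_descendent(roots: frozenset, descendable=(Node,)) -> Optional[Node]:
    if not roots:
        return None
    if len(roots) == 1:
        return next(iter(roots))
    min_height = min(r.height for r in roots)
    # Descend only the roots still taller than the shortest one.
    to_descend = {r for r in roots if r.height > min_height} or set(roots)
    if any(not isinstance(r, descendable) for r in to_descend):
        return None
    descended = set(itertools.chain.from_iterable(r.children for r in to_descend))
    return shared_descendent(frozenset((roots - to_descend) | descended), descendable)


base = Node("base")
left = Node("left", (base,))
right = Node("right", (Node("mid", (base,)),))
assert shared_descendent(frozenset({left, right})) is base

The real implementation additionally restricts descent through an AdditiveNode to its additive_base, which is what keeps the search out of the right side of an InNode.
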
diff --git a/bigframes/core/rewrite/legacy_align.py b/bigframes/core/rewrite/legacy_align.py
index a671f34bd4..05641130fb 100644
--- a/bigframes/core/rewrite/legacy_align.py
+++ b/bigframes/core/rewrite/legacy_align.py
@@ -361,5 +361,5 @@ def common_selection_root(
) -> Optional[nodes.BigFrameNode]:
"""Find common subtree between join subtrees"""
return bigframes.core.rewrite.implicit_align.first_shared_descendent(
- l_tree, r_tree, descendable_types=LEGACY_REWRITER_NODES
+ {l_tree, r_tree}, descendable_types=LEGACY_REWRITER_NODES
)
diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py
new file mode 100644
index 0000000000..3f8c409b76
--- /dev/null
+++ b/bigframes/core/rewrite/order.py
@@ -0,0 +1,436 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import dataclasses
+import functools
+from typing import Mapping, Tuple
+
+import bigframes.core.expression
+import bigframes.core.identifiers
+import bigframes.core.nodes
+import bigframes.core.ordering
+import bigframes.core.window_spec
+import bigframes.operations.aggregations
+
+
+# Makes ordering explicit in window definitions
+def pull_up_order(
+ root: bigframes.core.nodes.BigFrameNode,
+ *,
+ order_root: bool = True,
+ ordered_joins: bool = True,
+) -> Tuple[bigframes.core.nodes.BigFrameNode, bigframes.core.ordering.RowOrdering]:
+ """
+ Pull the ordering up, putting full order definition into window ops.
+
+ May create extra columns, which must be removed by callers if they want to preserve the original schema.
+
+ Requires the following nodes to be removed/rewritten: SliceNode
+
+ """
+
+ @functools.cache
+ def pull_up_order_inner(
+ node: bigframes.core.nodes.BigFrameNode,
+ ) -> Tuple[bigframes.core.nodes.BigFrameNode, bigframes.core.ordering.RowOrdering]:
+ """Pull filter nodes out of a tree section."""
+ if isinstance(node, bigframes.core.nodes.ReversedNode):
+ child_result, child_order = pull_up_order_inner(node.child)
+ return child_result, child_order.with_reverse()
+ elif isinstance(node, bigframes.core.nodes.OrderByNode):
+ if node.is_total_order:
+ new_node = remove_order(node.child)
+ else:
+ new_node, child_order = pull_up_order_inner(node.child)
+
+ new_by = []
+ ids: list[bigframes.core.identifiers.ColumnId] = []
+ for part in node.by:
+ if not isinstance(
+ part.scalar_expression, bigframes.core.expression.DerefOp
+ ):
+ id = bigframes.core.identifiers.ColumnId.unique()
+ new_node = bigframes.core.nodes.ProjectionNode(
+ new_node, ((part.scalar_expression, id),)
+ )
+ new_part = bigframes.core.ordering.OrderingExpression(
+ bigframes.core.expression.DerefOp(id),
+ part.direction,
+ part.na_last,
+ )
+ new_by.append(new_part)
+ ids.append(id)
+ else:
+ new_by.append(part)
+ ids.append(part.scalar_expression.id)
+
+ if node.is_total_order:
+ new_order: bigframes.core.ordering.RowOrdering = (
+ bigframes.core.ordering.TotalOrdering(
+ ordering_value_columns=tuple(new_by),
+ total_ordering_columns=frozenset(
+ map(lambda x: bigframes.core.expression.DerefOp(x), ids)
+ ),
+ )
+ )
+ else:
+ assert child_order
+ new_order = child_order.with_ordering_columns(new_by)
+ return new_node, new_order
+ elif isinstance(node, bigframes.core.nodes.ProjectionNode):
+ child_result, child_order = pull_up_order_inner(node.child)
+ return node.replace_child(child_result), child_order
+ elif isinstance(node, bigframes.core.nodes.JoinNode):
+ if ordered_joins:
+ return pull_order_join(node)
+ else:
+ return (
+ dataclasses.replace(
+ node,
+ left_child=remove_order_strict(node.left_child),
+ right_child=remove_order_strict(node.right_child),
+ ),
+ bigframes.core.ordering.RowOrdering(),
+ )
+ elif isinstance(node, bigframes.core.nodes.ConcatNode):
+ return pull_order_concat(node)
+ elif isinstance(node, bigframes.core.nodes.FromRangeNode):
+ new_start = remove_order_strict(node.start)
+ new_end = remove_order_strict(node.end)
+
+ new_node = dataclasses.replace(node, start=new_start, end=new_end)
+ return new_node, bigframes.core.ordering.TotalOrdering.from_primary_key(
+ [node.output_id]
+ )
+ elif isinstance(node, bigframes.core.nodes.ReadLocalNode):
+ if node.offsets_col is None:
+ offsets_id = bigframes.core.identifiers.ColumnId.unique()
+ new_root = dataclasses.replace(node, offsets_col=offsets_id)
+ return new_root, bigframes.core.ordering.TotalOrdering.from_offset_col(
+ offsets_id
+ )
+ else:
+ return node, bigframes.core.ordering.TotalOrdering.from_offset_col(
+ node.offsets_col
+ )
+ elif isinstance(node, bigframes.core.nodes.ReadTableNode):
+ if node.source.ordering is not None:
+ return node.with_order_cols()
+ else:
+ # No defined ordering
+ return node, bigframes.core.ordering.RowOrdering()
+ elif isinstance(node, bigframes.core.nodes.PromoteOffsetsNode):
+ child_result, child_order = pull_up_order_inner(node.child)
+ if child_order.is_total_ordering and child_order.is_sequential:
+ # special case, we can just project the ordering
+ order_expression = child_order.total_order_col
+ assert order_expression is not None
+ new_node = bigframes.core.nodes.ProjectionNode(
+ child_result, ((order_expression.scalar_expression, node.col_id),)
+ )
+ return new_node, bigframes.core.ordering.TotalOrdering.from_offset_col(
+ node.col_id
+ )
+ else:
+ # Otherwise we need to generate offsets
+ agg = bigframes.core.expression.NullaryAggregation(
+ bigframes.operations.aggregations.RowNumberOp()
+ )
+ window_spec = bigframes.core.window_spec.unbound(
+ ordering=tuple(child_order.all_ordering_columns)
+ )
+ new_offsets_node = bigframes.core.nodes.WindowOpNode(
+ child_result, agg, window_spec, node.col_id
+ )
+ return (
+ new_offsets_node,
+ bigframes.core.ordering.TotalOrdering.from_offset_col(node.col_id),
+ )
+ elif isinstance(node, bigframes.core.nodes.FilterNode):
+ child_result, child_order = pull_up_order_inner(node.child)
+ return node.replace_child(child_result), child_order.with_non_sequential()
+ elif isinstance(node, bigframes.core.nodes.InNode):
+ child_result, child_order = pull_up_order_inner(node.left_child)
+ subquery_result = remove_order_strict(node.right_child)
+ return (
+ dataclasses.replace(
+ node, left_child=child_result, right_child=subquery_result
+ ),
+ child_order,
+ )
+ elif isinstance(node, bigframes.core.nodes.SelectionNode):
+ child_result, child_order = pull_up_order_inner(node.child)
+ selected_ids = set(ref.id for ref, _ in node.input_output_pairs)
+ unselected_order_cols = tuple(
+ col for col in child_order.referenced_columns if col not in selected_ids
+ )
+ # Create unique ids just to be safe
+ new_selections = {
+ col: bigframes.core.identifiers.ColumnId.unique()
+ for col in unselected_order_cols
+ }
+ all_selections = (
+ *node.input_output_pairs,
+ *(
+ (bigframes.core.expression.DerefOp(k), v)
+ for k, v in new_selections.items()
+ ),
+ )
+
+ new_select_node = dataclasses.replace(
+ node, child=child_result, input_output_pairs=all_selections
+ )
+ new_order = child_order.remap_column_refs(new_select_node.get_id_mapping())
+ return new_select_node, new_order
+ elif isinstance(node, bigframes.core.nodes.RowCountNode):
+ child_result = remove_order(node.child)
+ return node.replace_child(
+ child_result
+ ), bigframes.core.ordering.TotalOrdering.from_primary_key([node.col_id])
+ elif isinstance(node, bigframes.core.nodes.AggregateNode):
+ if node.has_ordered_ops:
+ child_result, child_order = pull_up_order_inner(node.child)
+ new_order_by = child_order.with_ordering_columns(node.order_by)
+ new_order = bigframes.core.ordering.TotalOrdering.from_primary_key(
+ [ref.id for ref in node.by_column_ids]
+ )
+ return (
+ dataclasses.replace(
+ node,
+ child=child_result,
+ order_by=tuple(new_order_by.all_ordering_columns),
+ ),
+ new_order,
+ )
+ else:
+ child_result = remove_order(node.child)
+ return node.replace_child(
+ child_result
+ ), bigframes.core.ordering.TotalOrdering.from_primary_key(
+ [ref.id for ref in node.by_column_ids]
+ )
+ elif isinstance(node, bigframes.core.nodes.WindowOpNode):
+ child_result, child_order = pull_up_order_inner(node.child)
+ if node.inherits_order:
+ new_window_order = (
+ *node.window_spec.ordering,
+ *child_order.all_ordering_columns,
+ )
+ new_window_spec = dataclasses.replace(
+ node.window_spec, ordering=new_window_order
+ )
+ else:
+ new_window_spec = node.window_spec
+ return (
+ dataclasses.replace(
+ node, child=child_result, window_spec=new_window_spec
+ ),
+ child_order,
+ )
+ elif isinstance(node, bigframes.core.nodes.RandomSampleNode):
+ child_result, child_order = pull_up_order_inner(node.child)
+ return node.replace_child(child_result), child_order.with_non_sequential()
+ elif isinstance(node, bigframes.core.nodes.ExplodeNode):
+ child_result, child_order = pull_up_order_inner(node.child)
+ if node.offsets_col is None:
+ offsets_id = bigframes.core.identifiers.ColumnId.unique()
+ new_explode: bigframes.core.nodes.BigFrameNode = dataclasses.replace(
+ node, child=child_result, offsets_col=offsets_id
+ )
+ else:
+ offsets_id = node.offsets_col
+ new_explode = node.replace_child(child_result)
+ inner_order = bigframes.core.ordering.TotalOrdering.from_offset_col(
+ offsets_id
+ )
+ return new_explode, child_order.join(inner_order)
+ raise ValueError(f"Unexpected node: {node}")
+
+ def pull_order_concat(
+ node: bigframes.core.nodes.ConcatNode,
+ ) -> Tuple[
+ bigframes.core.nodes.BigFrameNode, bigframes.core.ordering.TotalOrdering
+ ]:
+ new_sources = []
+ for i, source in enumerate(node.child_nodes):
+ new_source, order = pull_up_order_inner(source)
+ offsets_id = bigframes.core.identifiers.ColumnId.unique()
+ table_id = bigframes.core.identifiers.ColumnId.unique()
+ if order.is_total_ordering and order.integer_encoding.is_encoded:
+ order_expression = order.total_order_col
+ assert order_expression is not None
+ new_source = bigframes.core.nodes.ProjectionNode(
+ new_source, ((order_expression.scalar_expression, offsets_id),)
+ )
+ else:
+ agg = bigframes.core.expression.NullaryAggregation(
+ bigframes.operations.aggregations.RowNumberOp()
+ )
+ window_spec = bigframes.core.window_spec.unbound(
+ ordering=tuple(order.all_ordering_columns)
+ )
+ new_source = bigframes.core.nodes.WindowOpNode(
+ new_source, agg, window_spec, offsets_id
+ )
+ new_source = bigframes.core.nodes.ProjectionNode(
+ new_source, ((bigframes.core.expression.const(i), table_id),)
+ )
+ selection = tuple(
+ (
+ (bigframes.core.expression.DerefOp(id), id)
+ for id in (*source.ids, table_id, offsets_id)
+ )
+ )
+ new_source = bigframes.core.nodes.SelectionNode(new_source, selection)
+ new_sources.append(new_source)
+
+ union_offsets_id = bigframes.core.identifiers.ColumnId.unique()
+ union_table_id = bigframes.core.identifiers.ColumnId.unique()
+ new_ids = (*node.output_ids, union_table_id, union_offsets_id)
+ new_node = dataclasses.replace(
+ node, children=tuple(new_sources), output_ids=new_ids
+ )
+ new_ordering = bigframes.core.ordering.TotalOrdering.from_primary_key(
+ (union_table_id, union_offsets_id)
+ )
+ return new_node, new_ordering
+
+ def pull_order_join(
+ node: bigframes.core.nodes.JoinNode,
+ ) -> Tuple[bigframes.core.nodes.BigFrameNode, bigframes.core.ordering.RowOrdering]:
+ left_child, left_order = pull_up_order_inner(node.left_child)
+ # As the tree is a DAG and pull_up_order_inner is memoized, self-joins can create conflicts among the new columns
+ right_child, right_order = pull_up_order_inner(node.right_child)
+ conflicts = set(left_child.ids) & set(right_child.ids)
+ if conflicts:
+ right_child, mapping = rename_cols(right_child, conflicts)
+ right_order = right_order.remap_column_refs(
+ mapping, allow_partial_bindings=True
+ )
+
+ if node.type in ("right", "outer"):
+ # left side is nullable
+ left_indicator = bigframes.core.identifiers.ColumnId.unique()
+ left_child = bigframes.core.nodes.ProjectionNode(
+ left_child, ((bigframes.core.expression.const(True), left_indicator),)
+ )
+ left_order = left_order.with_ordering_columns(
+ [bigframes.core.ordering.descending_over(left_indicator)]
+ )
+ if node.type in ("left", "outer"):
+ # right side is nullable
+ right_indicator = bigframes.core.identifiers.ColumnId.unique()
+ right_child = bigframes.core.nodes.ProjectionNode(
+ right_child, ((bigframes.core.expression.const(True), right_indicator),)
+ )
+ right_order = right_order.with_ordering_columns(
+ [bigframes.core.ordering.descending_over(right_indicator)]
+ )
+
+ new_join = dataclasses.replace(
+ node, left_child=left_child, right_child=right_child
+ )
+ new_order = (
+ left_order.join(right_order)
+ if (node.type != "right")
+ else right_order.join(left_order)
+ )
+ return new_join, new_order
+
+ @functools.cache
+ def remove_order(
+ node: bigframes.core.nodes.BigFrameNode,
+ ) -> bigframes.core.nodes.BigFrameNode:
+ if isinstance(
+ node, (bigframes.core.nodes.OrderByNode, bigframes.core.nodes.ReversedNode)
+ ):
+ return remove_order(node.child)
+ elif isinstance(
+ node,
+ (
+ bigframes.core.nodes.WindowOpNode,
+ bigframes.core.nodes.PromoteOffsetsNode,
+ ),
+ ):
+ if isinstance(node, bigframes.core.nodes.PromoteOffsetsNode):
+ node = rewrite_promote_offsets(node)
+ if node.inherits_order:
+ child_result, child_order = pull_up_order_inner(node.child)
+ new_window_order = (
+ *node.window_spec.ordering,
+ *child_order.all_ordering_columns,
+ )
+ new_window_spec = dataclasses.replace(
+ node.window_spec, ordering=new_window_order
+ )
+ return dataclasses.replace(
+ node, child=child_result, window_spec=new_window_spec
+ )
+ elif isinstance(node, bigframes.core.nodes.AggregateNode):
+ if node.has_ordered_ops:
+ child_result, child_order = pull_up_order_inner(node.child)
+ new_order_by = child_order.with_ordering_columns(node.order_by)
+ return dataclasses.replace(
+ node,
+ child=child_result,
+ order_by=tuple(new_order_by.all_ordering_columns),
+ )
+
+ return node.transform_children(remove_order)
+
+ def remove_order_strict(
+ node: bigframes.core.nodes.BigFrameNode,
+ ) -> bigframes.core.nodes.BigFrameNode:
+ result = remove_order(node)
+ if result.ids != node.ids:
+ return bigframes.core.nodes.SelectionNode(
+ result,
+ tuple((bigframes.core.expression.DerefOp(id), id) for id in node.ids),
+ )
+ return result
+
+ return (
+ pull_up_order_inner(root)
+ if order_root
+ else (remove_order(root), bigframes.core.ordering.RowOrdering())
+ )
+
+
+def rewrite_promote_offsets(
+ node: bigframes.core.nodes.PromoteOffsetsNode,
+) -> bigframes.core.nodes.WindowOpNode:
+ agg = bigframes.core.expression.NullaryAggregation(
+ bigframes.operations.aggregations.RowNumberOp()
+ )
+ window_spec = bigframes.core.window_spec.unbound()
+ return bigframes.core.nodes.WindowOpNode(node.child, agg, window_spec, node.col_id)
+
+
+def rename_cols(
+ node: bigframes.core.nodes.BigFrameNode, cols: set[bigframes.core.identifiers.ColumnId]
+) -> Tuple[
+ bigframes.core.nodes.BigFrameNode,
+ Mapping[bigframes.core.identifiers.ColumnId, bigframes.core.identifiers.ColumnId],
+]:
+ mappings = {id: bigframes.core.identifiers.ColumnId.unique() for id in cols}
+
+ result_node = bigframes.core.nodes.SelectionNode(
+ node,
+ tuple(
+ (bigframes.core.expression.DerefOp(id), mappings.get(id, id))
+ for id in node.ids
+ ),
+ )
+
+ return result_node, dict(mappings)
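
The net effect of pull_up_order is that implicit row order becomes an explicit ROW_NUMBER()-style offsets column wherever one is required (PromoteOffsetsNode, ConcatNode). As a rough pandas analogy (not bigframes APIs), promoting offsets over an ordering is just a stable rank over the ordering columns:

import pandas as pd

df = pd.DataFrame({"k": ["b", "a", "c"], "v": [1, 2, 3]})

# ROW_NUMBER() OVER (ORDER BY k), zero-based, as an explicit offsets column
df["offsets"] = (df["k"].rank(method="first") - 1).astype(int)
print(df.sort_values("offsets"))
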
diff --git a/bigframes/core/schema.py b/bigframes/core/schema.py
index 2b49f81d85..e3808dfffd 100644
--- a/bigframes/core/schema.py
+++ b/bigframes/core/schema.py
@@ -38,9 +38,13 @@ class ArraySchema:
items: typing.Tuple[SchemaItem, ...]
@classmethod
- def from_bq_table(cls, table: google.cloud.bigquery.Table):
+ def from_bq_table(
+ cls,
+ table: google.cloud.bigquery.Table,
+ column_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = {},
+ ):
items = tuple(
- SchemaItem(name, dtype)
+ SchemaItem(name, column_type_overrides.get(name, dtype))
for name, dtype in bigframes.dtypes.bf_type_from_type_kind(
table.schema
).items()
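
The override hook is a plain dict lookup: a caller-supplied dtype wins, anything else keeps the dtype inferred from the table schema. A toy mirror of that one line, with invented column names:

import pandas as pd
import pyarrow as pa

inferred = {
    "ts": pd.ArrowDtype(pa.timestamp("us", tz="UTC")),
    "delta": pd.Int64Dtype(),  # e.g. a duration stored as INTEGER microseconds
}
overrides = {"delta": pd.ArrowDtype(pa.duration("us"))}
items = tuple((name, overrides.get(name, dtype)) for name, dtype in inferred.items())
print(items)  # "delta" becomes duration[us]; "ts" keeps its inferred dtype
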
diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py
index f9ca6cb5f0..7cb2ec7535 100644
--- a/bigframes/core/utils.py
+++ b/bigframes/core/utils.py
@@ -19,6 +19,7 @@
import bigframes_vendored.pandas.io.common as vendored_pandas_io_common
import pandas as pd
+import pandas.api.types as pdtypes
import typing_extensions
import bigframes.exceptions as bfe
@@ -184,3 +185,29 @@ def wrapper(*args, **kwargs):
return wrapper
return decorator
+
+
+def timedelta_to_micros(td: pd.Timedelta) -> int:
+ # td.value returns total nanoseconds.
+ return td.value // 1000
+
+
+def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]:
+ """
+ Replaces timedelta columns in-place with integer values in microseconds. The nanosecond part is truncated.
+
+ Returns:
+ The names of updated columns
+ """
+ updated_columns = []
+
+ for col in dataframe.columns:
+ if pdtypes.is_timedelta64_dtype(dataframe[col].dtype):
+ dataframe[col] = dataframe[col].apply(timedelta_to_micros)
+ updated_columns.append(col)
+
+ if pdtypes.is_timedelta64_dtype(dataframe.index.dtype):
+ dataframe.index = dataframe.index.map(timedelta_to_micros)
+ updated_columns.append(dataframe.index.name)
+
+ return updated_columns
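
A quick runnable check of the truncation semantics, using pandas only: Timedelta.value is the total in nanoseconds, so floor division by 1000 yields microseconds with the nanosecond remainder dropped rather than rounded.

import pandas as pd

td = pd.Timedelta(seconds=1, microseconds=5, nanoseconds=999)
assert td.value == 1_000_005_999      # total nanoseconds
assert td.value // 1000 == 1_000_005  # nanoseconds truncated, not rounded

df = pd.DataFrame({"dur": pd.to_timedelta(["1s", "250ms"])})
df["dur"] = df["dur"].apply(lambda t: t.value // 1000)
print(df)  # 1000000 and 250000 (microseconds)
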
diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py
index d8098f18f7..b4a3d35471 100644
--- a/bigframes/core/window_spec.py
+++ b/bigframes/core/window_spec.py
@@ -13,7 +13,7 @@
# limitations under the License.
from __future__ import annotations
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
import itertools
from typing import Mapping, Optional, Set, Tuple, Union
@@ -181,6 +181,12 @@ def all_referenced_columns(self) -> Set[ids.ColumnId]:
)
return set(itertools.chain((i.id for i in self.grouping_keys), ordering_vars))
+ def without_order(self) -> WindowSpec:
+ """Removes ordering clause if ordering isn't required to define bounds."""
+ if self.row_bounded:
+ raise ValueError("Cannot remove order from row-bounded window")
+ return replace(self, ordering=())
+
def remap_column_refs(
self,
mapping: Mapping[ids.ColumnId, ids.ColumnId],
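
A minimal sketch of the without_order contract, using a toy dataclass in place of WindowSpec (row_bounded is a derived property on the real class, a plain field here): the ordering may only be dropped when the frame bounds don't depend on row position.

from dataclasses import dataclass, replace
from typing import Tuple


@dataclass(frozen=True)
class ToyWindowSpec:
    ordering: Tuple[str, ...] = ()
    row_bounded: bool = False  # stand-in for the derived property

    def without_order(self) -> "ToyWindowSpec":
        if self.row_bounded:
            raise ValueError("Cannot remove order from row-bounded window")
        return replace(self, ordering=())


print(ToyWindowSpec(ordering=("col_a",)).without_order())  # ordering=()
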
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index fec53dbf01..6308dcc8da 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -118,6 +118,8 @@ def __init__(
):
global bigframes
+ self._query_job: Optional[bigquery.QueryJob] = None
+
if copy is not None and not copy:
raise ValueError(
f"DataFrame constructor only supports copy=True. {constants.FEEDBACK_LINK}"
@@ -180,8 +182,8 @@ def __init__(
if columns:
block = block.select_columns(list(columns)) # type:ignore
if dtype:
- block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=dtype))
- self._block = block
+ bf_dtype = bigframes.dtypes.bigframes_type(dtype)
+ block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
else:
import bigframes.pandas
@@ -193,10 +195,14 @@ def __init__(
dtype=dtype, # type:ignore
)
if session:
- self._block = session.read_pandas(pd_dataframe)._get_block()
+ block = session.read_pandas(pd_dataframe)._get_block()
else:
- self._block = bigframes.pandas.read_pandas(pd_dataframe)._get_block()
- self._query_job: Optional[bigquery.QueryJob] = None
+ block = bigframes.pandas.read_pandas(pd_dataframe)._get_block()
+
+ # We use _block as an indicator in __getattr__ and __setattr__ to see
+ # if the object is fully initialized, so make sure we set the _block
+ # attribute last.
+ self._block = block
self._block.session._register_object(self)
def __dir__(self):
@@ -368,6 +374,7 @@ def astype(
dtype: Union[
bigframes.dtypes.DtypeString,
bigframes.dtypes.Dtype,
+ type,
dict[str, Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype]],
],
*,
@@ -378,23 +385,15 @@ def astype(
safe_cast = errors == "null"
- # Type strings check
- if dtype in bigframes.dtypes.DTYPE_STRINGS:
- return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))
-
- # Type instances check
- if type(dtype) in bigframes.dtypes.DTYPES:
- return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))
-
if isinstance(dtype, dict):
result = self.copy()
for col, to_type in dtype.items():
result[col] = result[col].astype(to_type)
return result
- raise TypeError(
- f"Invalid type {type(dtype)} for dtype input. {constants.FEEDBACK_LINK}"
- )
+ dtype = bigframes.dtypes.bigframes_type(dtype)
+
+ return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))
def _to_sql_query(
self, include_index: bool, enable_cache: bool = True
@@ -631,13 +630,17 @@ def _getitem_bool_series(self, key: bigframes.series.Series) -> DataFrame:
return DataFrame(block)
def __getattr__(self, key: str):
- # Protect against recursion errors with uninitialized DataFrame
- # objects. See:
+ # To allow subclasses to set private attributes before the class is
+ # fully initialized, protect against recursion errors with
+ # uninitialized DataFrame objects. Note: this comes with the downside
+ # that columns with a leading `_` won't be treated as columns.
+ #
+ # See:
# https://github.com/googleapis/python-bigquery-dataframes/issues/728
# and
# https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html
if key == "_block":
- raise AttributeError("_block")
+ raise AttributeError(key)
if key in self._block.column_labels:
return self.__getitem__(key)
@@ -657,26 +660,36 @@ def __getattr__(self, key: str):
raise AttributeError(key)
def __setattr__(self, key: str, value):
- if key in ["_block", "_query_job"]:
+ if key == "_block":
+ object.__setattr__(self, key, value)
+ return
+
+ # To allow subclasses to set private attributes before the class is
+ # fully initialized, assume anything set before `_block` is initialized
+ # is a regular attribute.
+ if not hasattr(self, "_block"):
object.__setattr__(self, key, value)
return
- # Can this be removed???
+
+ # If someone has a column named the same as a normal attribute
+ # (e.g. index), we want to set the normal attribute, not the column.
+ # To do that, check if there is a normal attribute by using
+ # __getattribute__ (not __getattr__, because that includes columns).
+ # If that returns a value without raising, then we know this is a
+ # normal attribute and we should prefer that.
try:
- # boring attributes go through boring old path
object.__getattribute__(self, key)
return object.__setattr__(self, key, value)
except AttributeError:
pass
- # if this fails, go on to more involved attribute setting
- # (note that this matches __getattr__, above).
- try:
- if key in self.columns:
- self[key] = value
- else:
- object.__setattr__(self, key, value)
- # Can this be removed?
- except (AttributeError, TypeError):
+ # If we made it here, then we know that it's not a regular attribute
+ # already, so it might be a column to update. Note: we don't allow
+ # adding new columns using __setattr__, only __setitem__, that way we
+ # can still add regular new attributes.
+ if key in self._block.column_labels:
+ self[key] = value
+ else:
object.__setattr__(self, key, value)
def __repr__(self) -> str:
@@ -768,11 +781,11 @@ def _repr_html_(self) -> str:
def obj_ref_rt_to_html(obj_ref_rt) -> str:
obj_ref_rt_json = json.loads(obj_ref_rt)
+ gcs_metadata = obj_ref_rt_json["objectref"]["details"][
+ "gcs_metadata"
+ ]
content_type = typing.cast(
- str,
- obj_ref_rt_json["objectref"]["details"]["gcs_metadata"][
- "content_type"
- ],
+ str, gcs_metadata.get("content_type", "")
)
if content_type.startswith("image"):
url = obj_ref_rt_json["access_urls"]["read_url"]
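
A toy model of the _block-sentinel pattern above (not bigframes itself): __getattr__ must raise for "_block" so that hasattr(self, "_block") can safely mean "fully initialized" without recursing, and __setattr__ treats anything set before _block exists as a plain attribute.

class Frame:
    def __init__(self, columns):
        self._cache = {}  # safe: set before _block exists
        self._block = dict(columns)  # set last; marks the object initialized

    def __getattr__(self, key):
        if key == "_block":  # not initialized yet; avoids infinite recursion
            raise AttributeError(key)
        if key in self._block:
            return self._block[key]
        raise AttributeError(key)

    def __setattr__(self, key, value):
        if key == "_block" or not hasattr(self, "_block"):
            object.__setattr__(self, key, value)
            return
        try:  # prefer real attributes over same-named columns
            object.__getattribute__(self, key)
            object.__setattr__(self, key, value)
            return
        except AttributeError:
            pass
        if key in self._block:
            self._block[key] = value  # update the existing "column"
        else:
            object.__setattr__(self, key, value)  # brand-new plain attribute


f = Frame({"col": 1})
f.col = 2  # updates the column
f.note = "metadata"  # stored as a regular attribute
assert f._block["col"] == 2 and f.note == "metadata"
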
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 4db124134a..8b1ca3b0c8 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -17,6 +17,7 @@
from dataclasses import dataclass
import datetime
import decimal
+import textwrap
import typing
from typing import Any, Dict, List, Literal, Union
@@ -79,7 +80,7 @@
),
pa.field(
"details",
- pa.large_string(), # JSON
+ db_dtypes.JSONArrowType(),
),
)
)
@@ -408,11 +409,16 @@ def dtype_for_etype(etype: ExpressionType) -> Dtype:
def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype:
if arrow_dtype in _ARROW_TO_BIGFRAMES:
return _ARROW_TO_BIGFRAMES[arrow_dtype]
+
if pa.types.is_list(arrow_dtype):
return pd.ArrowDtype(arrow_dtype)
+
if pa.types.is_struct(arrow_dtype):
return pd.ArrowDtype(arrow_dtype)
+ if pa.types.is_duration(arrow_dtype):
+ return pd.ArrowDtype(arrow_dtype)
+
# BigFrames doesn't distinguish between string and large_string because the
# largest string (2 GB) is already larger than the largest BigQuery row.
if pa.types.is_string(arrow_dtype) or pa.types.is_large_string(arrow_dtype):
@@ -422,7 +428,7 @@ def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype:
return DEFAULT_DTYPE
# No other types matched.
- raise ValueError(
+ raise TypeError(
f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}"
)
@@ -447,7 +453,7 @@ def bigframes_dtype_to_arrow_dtype(
if pa.types.is_struct(bigframes_dtype.pyarrow_dtype):
return bigframes_dtype.pyarrow_dtype
else:
- raise ValueError(
+ raise TypeError(
f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
)
@@ -474,7 +480,7 @@ def bigframes_dtype_to_literal(
if isinstance(bigframes_dtype, gpd.array.GeometryDtype):
return shapely.Point((0, 0))
- raise ValueError(
+ raise TypeError(
f"No literal conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
)
@@ -515,11 +521,91 @@ def arrow_type_to_literal(
if pa.types.is_time(arrow_type):
return datetime.time(1, 1, 1)
- raise ValueError(
+ raise TypeError(
f"No literal conversion for {arrow_type}. {constants.FEEDBACK_LINK}"
)
+def bigframes_type(dtype) -> Dtype:
+ """Convert type object to canoncial bigframes dtype."""
+ if _is_bigframes_dtype(dtype):
+ return dtype
+ elif isinstance(dtype, str):
+ return _dtype_from_string(dtype)
+ elif isinstance(dtype, type):
+ return _infer_dtype_from_python_type(dtype)
+ elif isinstance(dtype, pa.DataType):
+ return arrow_dtype_to_bigframes_dtype(dtype)
+ else:
+ raise TypeError(
+ f"Cannot infer supported datatype for: {dtype}. {constants.FEEDBACK_LINK}"
+ )
+
+
+def _is_bigframes_dtype(dtype) -> bool:
+ """True iff dtyps is a canonical bigframes dtype"""
+ # have to be quite strict, as pyarrow dtypes equal their string form, and we don't consider that a canonical form.
+ if (type(dtype), dtype) in set(
+ (type(item.dtype), item.dtype) for item in SIMPLE_TYPES
+ ):
+ return True
+ if isinstance(dtype, pd.ArrowDtype):
+ try:
+ _ = arrow_dtype_to_bigframes_dtype(dtype.pyarrow_dtype)
+ return True
+ except TypeError:
+ return False
+ return False
+
+
+def _infer_dtype_from_python_type(type: type) -> Dtype:
+ if issubclass(type, (bool, np.bool_)):
+ return BOOL_DTYPE
+ if issubclass(type, (int, np.integer)):
+ return INT_DTYPE
+ if issubclass(type, (float, np.floating)):
+ return FLOAT_DTYPE
+ if issubclass(type, decimal.Decimal):
+ return NUMERIC_DTYPE
+ if issubclass(type, (str, np.str_)):
+ return STRING_DTYPE
+ if issubclass(type, (bytes, np.bytes_)):
+ return BYTES_DTYPE
+ if issubclass(type, datetime.date):
+ return DATE_DTYPE
+ if issubclass(type, datetime.time):
+ return TIME_DTYPE
+ else:
+ raise TypeError(
+ f"No matching datatype for python type: {type}. {constants.FEEDBACK_LINK}"
+ )
+
+
+def _dtype_from_string(dtype_string: str) -> Dtype:
+ if str(dtype_string) in BIGFRAMES_STRING_TO_BIGFRAMES:
+ return BIGFRAMES_STRING_TO_BIGFRAMES[
+ typing.cast(DtypeString, str(dtype_string))
+ ]
+ raise TypeError(
+ textwrap.dedent(
+ f"""
+ Unexpected data type string {dtype_string}. The following
+ dtypes are supported: 'boolean', 'Float64', 'Int64',
+ 'int64[pyarrow]','string','string[pyarrow]',
+ 'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
+ 'date32[day][pyarrow]','time64[us][pyarrow]'.
+ The following pandas.ExtensionDtype are supported:
+ pandas.BooleanDtype(), pandas.Float64Dtype(),
+ pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
+ pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
+ pd.ArrowDtype(pa.timestamp("us")),
+ pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
+ {constants.FEEDBACK_LINK}
+ """
+ )
+ )
+
+
def infer_literal_type(literal) -> typing.Optional[Dtype]:
# Maybe also normalize literal to canonical python representation to remove this burden from compilers?
if pd.api.types.is_list_like(literal):
@@ -539,30 +625,17 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]:
return pd.ArrowDtype(pa.struct(fields))
if pd.isna(literal):
return None # Null value without a definite type
- if isinstance(literal, (bool, np.bool_)):
- return BOOL_DTYPE
- if isinstance(literal, (int, np.integer)):
- return INT_DTYPE
- if isinstance(literal, (float, np.floating)):
- return FLOAT_DTYPE
- if isinstance(literal, decimal.Decimal):
- return NUMERIC_DTYPE
- if isinstance(literal, (str, np.str_)):
- return STRING_DTYPE
- if isinstance(literal, (bytes, np.bytes_)):
- return BYTES_DTYPE
# Make sure to check datetime before date as datetimes are also dates
if isinstance(literal, (datetime.datetime, pd.Timestamp)):
if literal.tzinfo is not None:
return TIMESTAMP_DTYPE
else:
return DATETIME_DTYPE
- if isinstance(literal, datetime.date):
- return DATE_DTYPE
- if isinstance(literal, datetime.time):
- return TIME_DTYPE
+ from_python_type = _infer_dtype_from_python_type(type(literal))
+ if from_python_type is not None:
+ return from_python_type
else:
- raise ValueError(f"Unable to infer type for value: {literal}")
+ raise TypeError(f"Unable to infer type for value: {literal}")
def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]:
@@ -602,7 +675,7 @@ def convert_schema_field(
return field.name, pd.ArrowDtype(pa_type)
return field.name, _TK_TO_BIGFRAMES[field.field_type]
else:
- raise ValueError(f"Cannot handle type: {field.field_type}")
+ raise TypeError(f"Cannot handle type: {field.field_type}")
def convert_to_schema_field(
@@ -636,7 +709,7 @@ def convert_to_schema_field(
if bigframes_dtype.pyarrow_dtype == pa.duration("us"):
# Timedeltas are represented as integers in microseconds.
return google.cloud.bigquery.SchemaField(name, "INTEGER")
- raise ValueError(
+ raise TypeError(
f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
)
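
One detail worth noting in _infer_dtype_from_python_type: the issubclass checks must test bool before int, because issubclass(bool, int) is True in Python. A runnable toy mirror of that dispatch, with stand-in dtypes (the real canonical objects live in bigframes.dtypes):

import decimal

import pandas as pd
import pyarrow as pa


def toy_infer(py_type: type):
    if issubclass(py_type, bool):  # must precede the int check
        return pd.BooleanDtype()
    if issubclass(py_type, int):
        return pd.Int64Dtype()
    if issubclass(py_type, float):
        return pd.Float64Dtype()
    if issubclass(py_type, decimal.Decimal):
        return pd.ArrowDtype(pa.decimal128(38, 9))  # NUMERIC-like stand-in
    if issubclass(py_type, str):
        return pd.StringDtype(storage="pyarrow")
    raise TypeError(f"No matching datatype for python type: {py_type}")


assert toy_infer(bool) == pd.BooleanDtype()  # not Int64Dtype
assert toy_infer(int) == pd.Int64Dtype()
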
diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py
index 104119a510..f5001ff909 100644
--- a/bigframes/functions/_function_client.py
+++ b/bigframes/functions/_function_client.py
@@ -54,10 +54,12 @@
)
-class RemoteFunctionClient:
+class FunctionClient:
# Wait time (in seconds) for an IAM binding to take effect after creation
_iam_wait_seconds = 120
+ # TODO(b/392707725): Convert all necessary parameters for cloud function
+ # deployment into method parameters.
def __init__(
self,
gcp_project_id,
diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py
index 00626a252f..a0518978a3 100644
--- a/bigframes/functions/_function_session.py
+++ b/bigframes/functions/_function_session.py
@@ -55,18 +55,18 @@
from . import _function_client, _utils
-class RemoteFunctionSession:
- """Session to manage remote functions."""
+class FunctionSession:
+ """Session to manage bigframes functions."""
def __init__(self):
- # Session level mapping of remote function artifacts
+ # Session level mapping of function artifacts
self._temp_artifacts: Dict[str, str] = dict()
# Lock to synchronize the update of the session artifacts
self._artifacts_lock = threading.Lock()
def _update_temp_artifacts(self, bqrf_routine: str, gcf_path: str):
- """Update remote function artifacts in the current session."""
+ """Update function artifacts in the current session."""
with self._artifacts_lock:
self._temp_artifacts[bqrf_routine] = gcf_path
@@ -76,11 +76,11 @@ def clean_up(
gcfclient: functions_v2.FunctionServiceClient,
session_id: str,
):
- """Delete remote function artifacts in the current session."""
+ """Delete function artifacts in the current session."""
with self._artifacts_lock:
for bqrf_routine, gcf_path in self._temp_artifacts.items():
- # Let's accept the possibility that the remote function may have
- # been deleted directly by the user
+ # Let's accept the possibility that the function may have been
+ # deleted directly by the user
bqclient.delete_routine(bqrf_routine, not_found_ok=True)
# Let's accept the possibility that the cloud function may have
@@ -467,7 +467,7 @@ def wrapper(func):
signature, input_types, output_type # type: ignore
)
- remote_function_client = _function_client.RemoteFunctionClient(
+ remote_function_client = _function_client.FunctionClient(
dataset_ref.project,
cloud_function_region,
cloud_functions_client,
diff --git a/bigframes/functions/_utils.py b/bigframes/functions/_utils.py
index 591da01dd0..f1f8c97e7f 100644
--- a/bigframes/functions/_utils.py
+++ b/bigframes/functions/_utils.py
@@ -30,8 +30,8 @@
import bigframes.core.compile.ibis_types
import bigframes.dtypes
-# Naming convention for the remote function artifacts
-_BIGFRAMES_REMOTE_FUNCTION_PREFIX = "bigframes"
+# Naming convention for the function artifacts
+_BIGFRAMES_FUNCTION_PREFIX = "bigframes"
_BQ_FUNCTION_NAME_SEPERATOR = "_"
_GCF_FUNCTION_NAME_SEPERATOR = "-"
@@ -66,10 +66,10 @@ def _get_updated_package_requirements(
):
requirements = [f"cloudpickle=={cloudpickle.__version__}"]
if is_row_processor:
- # bigframes remote function will send an entire row of data as json,
- # which would be converted to a pandas series and processed
- # Ensure numpy versions match to avoid unpickling problems. See
- # internal issue b/347934471.
+ # bigframes function will send an entire row of data as json, which
+ # would be converted to a pandas series and processed. Ensure numpy
+ # versions match to avoid unpickling problems. See internal issue
+ # b/347934471.
requirements.append(f"numpy=={numpy.__version__}")
requirements.append(f"pandas=={pandas.__version__}")
requirements.append(f"pyarrow=={pyarrow.__version__}")
@@ -94,14 +94,14 @@ def _clean_up_by_session_id(
point in time.
"""
- # First clean up the BQ remote functions and then the underlying
- # cloud functions, so that at no point we are left with a remote function
- # that is pointing to a cloud function that does not exist
+ # First clean up the BQ remote functions and then the underlying cloud
+ # functions, so that at no point we are left with a remote function that is
+ # pointing to a cloud function that does not exist
endpoints_to_be_deleted: Set[str] = set()
match_prefix = "".join(
[
- _BIGFRAMES_REMOTE_FUNCTION_PREFIX,
+ _BIGFRAMES_FUNCTION_PREFIX,
_BQ_FUNCTION_NAME_SEPERATOR,
session_id,
_BQ_FUNCTION_NAME_SEPERATOR,
@@ -176,7 +176,7 @@ def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> s
def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None):
"Get a name for the cloud function for the given user defined function."
- parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX]
+ parts = [_BIGFRAMES_FUNCTION_PREFIX]
if session_id:
parts.append(session_id)
parts.append(function_hash)
@@ -186,8 +186,8 @@ def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None):
def get_remote_function_name(function_hash, session_id, uniq_suffix=None):
- "Get a name for the BQ remote function for the given user defined function."
- parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX, session_id, function_hash]
+ "Get a name for the remote function for the given user defined function."
+ parts = [_BIGFRAMES_FUNCTION_PREFIX, session_id, function_hash]
if uniq_suffix:
parts.append(uniq_suffix)
return _BQ_FUNCTION_NAME_SEPERATOR.join(parts)
diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py
index 57df8f9407..ef2c81a953 100644
--- a/bigframes/functions/function.py
+++ b/bigframes/functions/function.py
@@ -120,11 +120,11 @@ def get_routine_reference(
def remote_function(*args, **kwargs):
- remote_function_session = bff_session.RemoteFunctionSession()
+ remote_function_session = bff_session.FunctionSession()
return remote_function_session.remote_function(*args, **kwargs)
-remote_function.__doc__ = bff_session.RemoteFunctionSession.remote_function.__doc__
+remote_function.__doc__ = bff_session.FunctionSession.remote_function.__doc__
def read_gbq_function(
@@ -174,7 +174,7 @@ def read_gbq_function(
# The name "args" conflicts with the Ibis operator, so we use
# non-standard names for the arguments here.
def func(*bigframes_args, **bigframes_kwargs):
- f"""Remote function {str(routine_ref)}."""
+ f"""Bigframes function {str(routine_ref)}."""
nonlocal node # type: ignore
expr = node(*bigframes_args, **bigframes_kwargs) # type: ignore
diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py
index f06de99181..c353e47f3a 100644
--- a/bigframes/ml/base.py
+++ b/bigframes/ml/base.py
@@ -165,6 +165,34 @@ def fit(
return self._fit(X, y)
+class SupervisedTrainableWithIdColPredictor(SupervisedTrainablePredictor):
+ """Inherits from SupervisedTrainablePredictor,
+ but adds an optional id_col parameter to fit()."""
+
+ def __init__(self):
+ super().__init__()
+ self.id_col = None
+
+ def _fit(
+ self,
+ X: utils.ArrayType,
+ y: utils.ArrayType,
+ transforms=None,
+ id_col: Optional[utils.ArrayType] = None,
+ ):
+ return self
+
+ def fit(
+ self,
+ X: utils.ArrayType,
+ y: utils.ArrayType,
+ transforms=None,
+ id_col: Optional[utils.ArrayType] = None,
+ ):
+ self.id_col = id_col
+ return self._fit(X, y, transforms=transforms, id_col=self.id_col)
+
+
class TrainableWithEvaluationPredictor(TrainablePredictor):
"""A BigQuery DataFrames ML Model base class that can be used to fit and predict outputs.
diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py
index d038b8f4c0..ad00ed3f2c 100644
--- a/bigframes/ml/core.py
+++ b/bigframes/ml/core.py
@@ -181,15 +181,23 @@ def detect_anomalies(
def forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame:
sql = self._model_manipulation_sql_generator.ml_forecast(struct_options=options)
- return self._session.read_gbq(sql, index_col="forecast_timestamp").reset_index()
+ timestamp_col_name = "forecast_timestamp"
+ index_cols = [timestamp_col_name]
+ first_col_name = self._session.read_gbq(sql).columns.values[0]
+ if timestamp_col_name != first_col_name:
+ index_cols.append(first_col_name)
+ return self._session.read_gbq(sql, index_col=index_cols).reset_index()
def explain_forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame:
sql = self._model_manipulation_sql_generator.ml_explain_forecast(
struct_options=options
)
- return self._session.read_gbq(
- sql, index_col="time_series_timestamp"
- ).reset_index()
+ timestamp_col_name = "time_series_timestamp"
+ index_cols = [timestamp_col_name]
+ first_col_name = self._session.read_gbq(sql).columns.values[0]
+ if timestamp_col_name != first_col_name:
+ index_cols.append(first_col_name)
+ return self._session.read_gbq(sql, index_col=index_cols).reset_index()
def evaluate(self, input_data: Optional[bpd.DataFrame] = None):
sql = self._model_manipulation_sql_generator.ml_evaluate(
@@ -390,6 +398,7 @@ def create_time_series_model(
self,
X_train: bpd.DataFrame,
y_train: bpd.DataFrame,
+ id_col: Optional[bpd.DataFrame] = None,
transforms: Optional[Iterable[str]] = None,
options: Mapping[str, Union[str, int, float, Iterable[str]]] = {},
) -> BqmlModel:
@@ -399,13 +408,21 @@ def create_time_series_model(
assert (
y_train.columns.size == 1
), "Time stamp data input must only contain 1 column."
+ assert (
+ id_col is None or id_col.columns.size == 1
+ ), "Time series id input is either None or must only contain 1 column."
options = dict(options)
# Cache dataframes to make sure base table is not a snapshot
# cached dataframe creates a full copy, never uses snapshot
- input_data = X_train.join(y_train, how="outer").cache()
+ input_data = X_train.join(y_train, how="outer")
+ if id_col is not None:
+ input_data = input_data.join(id_col, how="outer")
+ input_data = input_data.cache()
options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]})
options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]})
+ if id_col is not None:
+ options.update({"TIME_SERIES_ID_COL": id_col.columns.tolist()[0]})
session = X_train._session
model_ref = self._create_model_ref(session._anonymous_dataset)
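
A toy mirror of the new forecast()/explain_forecast() index handling: when the model was fit with a time-series id column, the query result leads with that id column instead of the timestamp column, so it is pulled into the index too (column names below are illustrative):

def forecast_index_cols(result_columns):
    timestamp_col = "forecast_timestamp"
    index_cols = [timestamp_col]
    if result_columns[0] != timestamp_col:
        index_cols.append(result_columns[0])  # leading column is the id col
    return index_cols


assert forecast_index_cols(["forecast_timestamp", "forecast_value"]) == [
    "forecast_timestamp"
]
assert forecast_index_cols(["id", "forecast_timestamp", "forecast_value"]) == [
    "forecast_timestamp",
    "id",
]
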
diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py
index 4e6c5036e7..7aa8ba5a5f 100644
--- a/bigframes/ml/forecasting.py
+++ b/bigframes/ml/forecasting.py
@@ -45,7 +45,7 @@
@log_adapter.class_logger
-class ARIMAPlus(base.SupervisedTrainablePredictor):
+class ARIMAPlus(base.SupervisedTrainableWithIdColPredictor):
"""Time Series ARIMA Plus model.
Args:
@@ -183,18 +183,26 @@ def _fit(
X: utils.ArrayType,
y: utils.ArrayType,
transforms: Optional[List[str]] = None,
- ):
+ id_col: Optional[utils.ArrayType] = None,
+ ) -> ARIMAPlus:
"""Fit the model to training data.
Args:
- X (bigframes.dataframe.DataFrame or bigframes.series.Series):
- A dataframe of training timestamp.
-
- y (bigframes.dataframe.DataFrame or bigframes.series.Series):
+ X (bigframes.dataframe.DataFrame or bigframes.series.Series
+ or pandas.core.frame.DataFrame or pandas.core.series.Series):
+ A dataframe or series of training timestamps.
+ y (bigframes.dataframe.DataFrame or bigframes.series.Series
+ or pandas.core.frame.DataFrame or pandas.core.series.Series):
Target values for training.
transforms (Optional[List[str]], default None):
Do not use. Internal param to be deprecated.
Use bigframes.ml.pipeline instead.
+ id_col (Optional[bigframes.dataframe.DataFrame]
+ or Optional[bigframes.series.Series]
+ or Optional[pandas.core.frame.DataFrame]
+ or Optional[pandas.core.series.Series]
+ or None, default None):
+ An optional dataframe or series holding the training id column.
Returns:
ARIMAPlus: Fitted estimator.
@@ -202,18 +210,26 @@ def _fit(
X, y = utils.batch_convert_to_dataframe(X, y)
if X.columns.size != 1:
- raise ValueError(
- "Time series timestamp input X must only contain 1 column."
- )
+ raise ValueError("Time series timestamp input X contain at least 1 column.")
if y.columns.size != 1:
raise ValueError("Time series data input y must only contain 1 column.")
+ if id_col is not None:
+ (id_col,) = utils.batch_convert_to_dataframe(id_col)
+
+ if id_col.columns.size != 1:
+ raise ValueError(
+ "Time series id input id_col must only contain 1 column."
+ )
+
self._bqml_model = self._bqml_model_factory.create_time_series_model(
X,
y,
+ id_col=id_col,
transforms=transforms,
options=self._bqml_options,
)
+ return self
def predict(
self, X=None, *, horizon: int = 3, confidence_level: float = 0.95
@@ -237,7 +253,7 @@ def predict(
Returns:
bigframes.dataframe.DataFrame: The predicted DataFrames. Which
- contains 2 columns: "forecast_timestamp" and "forecast_value".
+ contains the columns "forecast_timestamp", the time-series id column (when the model was fit with id_col), and "forecast_value".
"""
if horizon < 1 or horizon > 1000:
raise ValueError(f"horizon must be [1, 1000], but is {horizon}.")
@@ -345,6 +361,7 @@ def score(
self,
X: utils.ArrayType,
y: utils.ArrayType,
+ id_col: Optional[utils.ArrayType] = None,
) -> bpd.DataFrame:
"""Calculate evaluation metrics of the model.
@@ -355,13 +372,22 @@ def score(
for the outputs relevant to this model type.
Args:
- X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
- A BigQuery DataFrame only contains 1 column as
+ X (bigframes.dataframe.DataFrame or bigframes.series.Series
+ or pandas.core.frame.DataFrame or pandas.core.series.Series):
+ A dataframe or series that contains only 1 column as
evaluation timestamp. The timestamp must be within the horizon
of the model, which by default is 1000 data points.
- y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
- A BigQuery DataFrame only contains 1 column as
+ y (bigframes.dataframe.DataFrame or bigframes.series.Series
+ or pandas.core.frame.DataFrame or pandas.core.series.Series):
+ A dataframe or series that contains only 1 column as
evaluation numeric values.
+ id_col (Optional[bigframes.dataframe.DataFrame]
+ or Optional[bigframes.series.Series]
+ or Optional[pandas.core.frame.DataFrame]
+ or Optional[pandas.core.series.Series]
+ or None, default None):
+ An optional dataframe or series that contains at least 1 column as
+ the evaluation id column.
Returns:
bigframes.dataframe.DataFrame: A DataFrame as evaluation result.
@@ -371,6 +397,10 @@ def score(
X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session)
input_data = X.join(y, how="outer")
+ if id_col is not None:
+ (id_col,) = utils.batch_convert_to_dataframe(id_col)
+ input_data = input_data.join(id_col, how="outer")
+
return self._bqml_model.evaluate(input_data)
def summary(
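
Hypothetical end-to-end usage of the new id_col parameter (requires a live BigQuery session; the table and column names are invented):

# from bigframes.ml.forecasting import ARIMAPlus
# import bigframes.pandas as bpd
#
# df = bpd.read_gbq("my-project.my_dataset.sales")
# model = ARIMAPlus()
# model.fit(
#     X=df[["date"]],           # TIME_SERIES_TIMESTAMP_COL
#     y=df[["num_sold"]],       # TIME_SERIES_DATA_COL
#     id_col=df[["store_id"]],  # TIME_SERIES_ID_COL: one series per store
# )
# forecasts = model.predict(horizon=30)  # output includes the id column
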
diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index bdefc793f9..d2e97a7608 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -643,37 +643,16 @@ def __init__(
):
self.model_name = model_name
self.session = session or global_session.get_global_session()
- self._bq_connection_manager = self.session.bqconnectionmanager
-
- connection_name = connection_name or self.session._bq_connection
- self.connection_name = clients.resolve_full_bq_connection_name(
- connection_name,
- default_project=self.session._project,
- default_location=self.session._location,
- )
+ self.connection_name = connection_name
self._bqml_model_factory = globals.bqml_model_factory()
self._bqml_model: core.BqmlModel = self._create_bqml_model()
def _create_bqml_model(self):
# Parse and create connection if needed.
- if not self.connection_name:
- raise ValueError(
- "Must provide connection_name, either in constructor or through session options."
- )
-
- if self._bq_connection_manager:
- connection_name_parts = self.connection_name.split(".")
- if len(connection_name_parts) != 3:
- raise ValueError(
- f"connection_name must be of the format .., got {self.connection_name}."
- )
- self._bq_connection_manager.create_bq_connection(
- project_id=connection_name_parts[0],
- location=connection_name_parts[1],
- connection_id=connection_name_parts[2],
- iam_role="aiplatform.user",
- )
+ self.connection_name = self.session._create_bq_connection(
+ connection=self.connection_name, iam_role="aiplatform.user"
+ )
if self.model_name not in _TEXT_EMBEDDING_ENDPOINTS:
msg = _MODEL_NOT_SUPPORTED_WARNING.format(
@@ -828,37 +807,16 @@ def __init__(
self.model_name = model_name
self.session = session or global_session.get_global_session()
self.max_iterations = max_iterations
- self._bq_connection_manager = self.session.bqconnectionmanager
-
- connection_name = connection_name or self.session._bq_connection
- self.connection_name = clients.resolve_full_bq_connection_name(
- connection_name,
- default_project=self.session._project,
- default_location=self.session._location,
- )
+ self.connection_name = connection_name
self._bqml_model_factory = globals.bqml_model_factory()
self._bqml_model: core.BqmlModel = self._create_bqml_model()
def _create_bqml_model(self):
# Parse and create connection if needed.
- if not self.connection_name:
- raise ValueError(
- "Must provide connection_name, either in constructor or through session options."
- )
-
- if self._bq_connection_manager:
- connection_name_parts = self.connection_name.split(".")
- if len(connection_name_parts) != 3:
- raise ValueError(
- f"connection_name must be of the format .., got {self.connection_name}."
- )
- self._bq_connection_manager.create_bq_connection(
- project_id=connection_name_parts[0],
- location=connection_name_parts[1],
- connection_id=connection_name_parts[2],
- iam_role="aiplatform.user",
- )
+ self.connection_name = self.session._create_bq_connection(
+ connection=self.connection_name, iam_role="aiplatform.user"
+ )
if self.model_name not in _GEMINI_ENDPOINTS:
msg = _MODEL_NOT_SUPPORTED_WARNING.format(
@@ -953,10 +911,7 @@ def fit(
options["prompt_col"] = X.columns.tolist()[0]
self._bqml_model = self._bqml_model_factory.create_llm_remote_model(
- X,
- y,
- options=options,
- connection_name=self.connection_name,
+ X, y, options=options, connection_name=cast(str, self.connection_name)
)
return self
@@ -1179,37 +1134,16 @@ def __init__(
):
self.model_name = model_name
self.session = session or global_session.get_global_session()
- self._bq_connection_manager = self.session.bqconnectionmanager
-
- connection_name = connection_name or self.session._bq_connection
- self.connection_name = clients.resolve_full_bq_connection_name(
- connection_name,
- default_project=self.session._project,
- default_location=self.session._location,
- )
+ self.connection_name = connection_name
self._bqml_model_factory = globals.bqml_model_factory()
self._bqml_model: core.BqmlModel = self._create_bqml_model()
def _create_bqml_model(self):
# Parse and create connection if needed.
- if not self.connection_name:
- raise ValueError(
- "Must provide connection_name, either in constructor or through session options."
- )
-
- if self._bq_connection_manager:
- connection_name_parts = self.connection_name.split(".")
- if len(connection_name_parts) != 3:
- raise ValueError(
- f"connection_name must be of the format .., got {self.connection_name}."
- )
- self._bq_connection_manager.create_bq_connection(
- project_id=connection_name_parts[0],
- location=connection_name_parts[1],
- connection_id=connection_name_parts[2],
- iam_role="aiplatform.user",
- )
+ self.connection_name = self.session._create_bq_connection(
+ connection=self.connection_name, iam_role="aiplatform.user"
+ )
if self.model_name not in _CLAUDE_3_ENDPOINTS:
msg = _MODEL_NOT_SUPPORTED_WARNING.format(
diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py
index 21a3a50421..6ee6840656 100644
--- a/bigframes/ml/remote.py
+++ b/bigframes/ml/remote.py
@@ -19,7 +19,6 @@
from typing import Mapping, Optional
import warnings
-from bigframes import clients
from bigframes.core import global_session, log_adapter
import bigframes.dataframe
from bigframes.ml import base, core, globals, utils
@@ -63,35 +62,16 @@ def __init__(
self.session = session or global_session.get_global_session()
self._bq_connection_manager = self.session.bqconnectionmanager
- connection_name = connection_name or self.session._bq_connection
- self.connection_name = clients.resolve_full_bq_connection_name(
- connection_name,
- default_project=self.session._project,
- default_location=self.session._location,
- )
+ self.connection_name = connection_name
self._bqml_model_factory = globals.bqml_model_factory()
self._bqml_model: core.BqmlModel = self._create_bqml_model()
def _create_bqml_model(self):
# Parse and create connection if needed.
- if not self.connection_name:
- raise ValueError(
- "Must provide connection_name, either in constructor or through session options."
- )
-
- if self._bq_connection_manager:
- connection_name_parts = self.connection_name.split(".")
- if len(connection_name_parts) != 3:
- raise ValueError(
- f"connection_name must be of the format .., got {self.connection_name}."
- )
- self._bq_connection_manager.create_bq_connection(
- project_id=connection_name_parts[0],
- location=connection_name_parts[1],
- connection_id=connection_name_parts[2],
- iam_role="aiplatform.user",
- )
+ self.connection_name = self.session._create_bq_connection(
+ connection=self.connection_name, iam_role="aiplatform.user"
+ )
options = {
"endpoint": self.endpoint,
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
index 365b664ee0..5f32cb980a 100644
--- a/bigframes/operations/aggregations.py
+++ b/bigframes/operations/aggregations.py
@@ -34,19 +34,22 @@ def skips_nulls(self):
return True
@property
- def uses_total_row_ordering(self):
- """Whether the operator needs total row ordering. (eg. lead, lag, array_agg)"""
- return False
+ def implicitly_inherits_order(self):
+ """
+ Whether the operator implicitly inherits the underlying array order, should it exist.
- @property
- def can_order_by(self):
- return False
+ Notably, rank operations do not want to inherit ordering. Even order-independent operations
+ may inherit order when needed for row bounds.
+ """
+ return True
@property
def order_independent(self):
"""
True if the output of the operator does not depend on the ordering of input rows.
+ Aggregation functions are usually order independent, except for array_agg and string_agg.
+
Navigation functions are a notable case that are not order independent.
"""
return False
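The two new properties split what `uses_total_row_ordering` and `can_order_by` used to conflate. A minimal sketch of how a lowering step might consult them; the helper name `needs_inherited_order` and the `has_row_bounds` flag are hypothetical, not the library's actual planner:

```python
import dataclasses


@dataclasses.dataclass(frozen=True)
class WindowOp:
    @property
    def implicitly_inherits_order(self) -> bool:
        # Default: pick up the underlying array order if one exists.
        return True

    @property
    def order_independent(self) -> bool:
        # Default: the output may depend on the order of input rows.
        return False


def needs_inherited_order(op: WindowOp, has_row_bounds: bool) -> bool:
    # Rank-like ops opt out entirely: they order by their own key.
    if not op.implicitly_inherits_order:
        return False
    # Order-independent ops still need an order to evaluate row bounds.
    return (not op.order_independent) or has_row_bounds
```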
@@ -89,12 +92,11 @@ def arguments(self) -> int:
@property
def order_independent(self):
- """
- True if results don't depend on the order of the input.
+ return True
- Almost all aggregation functions are order independent, excepting ``array_agg`` and ``string_agg``.
- """
- return not self.can_order_by
+ @property
+ def uses_total_row_ordering(self):
+ return False
@dataclasses.dataclass(frozen=True)
@@ -126,6 +128,15 @@ def output_type(self, *input_types: dtypes.ExpressionType):
return dtypes.INT_DTYPE
+# TODO: Remove this temporary hack once nullary ops are better supported in APIs
+@dataclasses.dataclass(frozen=True)
+class SizeUnaryOp(UnaryAggregateOp):
+ name: ClassVar[str] = "size"
+
+ def output_type(self, *input_types: dtypes.ExpressionType):
+ return dtypes.INT_DTYPE
+
+
@dataclasses.dataclass(frozen=True)
class SumOp(UnaryAggregateOp):
name: ClassVar[str] = "sum"
@@ -143,6 +154,10 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
class MedianOp(UnaryAggregateOp):
name: ClassVar[str] = "median"
+ @property
+ def order_independent(self) -> bool:
+ return True
+
def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
# These will change if median is changed to exact implementation.
if not dtypes.is_orderable(input_types[0]):
@@ -161,6 +176,10 @@ class QuantileOp(UnaryAggregateOp):
def name(self):
return f"{int(self.q * 100)}%"
+ @property
+ def order_independent(self) -> bool:
+ return True
+
def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0])
@@ -288,8 +307,8 @@ class ArrayAggOp(UnaryAggregateOp):
name: ClassVar[str] = "arrayagg"
@property
- def can_order_by(self):
- return True
+ def order_independent(self):
+ return False
@property
def skips_nulls(self):
@@ -335,7 +354,7 @@ def order_independent(self):
@dataclasses.dataclass(frozen=True)
-class QcutOp(UnaryWindowOp):
+class QcutOp(UnaryWindowOp): # bucket op
quantiles: typing.Union[int, typing.Tuple[float, ...]]
@property
@@ -392,6 +411,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
return dtypes.INT_DTYPE
+# TODO: Convert to NullaryWindowOp
@dataclasses.dataclass(frozen=True)
class RankOp(UnaryWindowOp):
name: ClassVar[str] = "rank"
@@ -401,15 +421,14 @@ def skips_nulls(self):
return False
def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
- return signatures.FixedOutputType(
- dtypes.is_orderable, dtypes.INT_DTYPE, "orderable"
- ).output_type(input_types[0])
+ return dtypes.INT_DTYPE
@property
- def order_independent(self):
- return True
+ def implicitly_inherits_order(self):
+ return False
+# TODO: Convert to NullaryWindowOp
@dataclasses.dataclass(frozen=True)
class DenseRankOp(UnaryWindowOp):
@property
@@ -417,30 +436,20 @@ def skips_nulls(self):
return False
def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
- return signatures.FixedOutputType(
- dtypes.is_orderable, dtypes.INT_DTYPE, "orderable"
- ).output_type(input_types[0])
+ return dtypes.INT_DTYPE
@property
- def order_independent(self):
- return True
+ def implicitly_inherits_order(self):
+ return False
@dataclasses.dataclass(frozen=True)
class FirstOp(UnaryWindowOp):
name: ClassVar[str] = "first"
- @property
- def uses_total_row_ordering(self):
- return True
-
@dataclasses.dataclass(frozen=True)
class FirstNonNullOp(UnaryWindowOp):
- @property
- def uses_total_row_ordering(self):
- return True
-
@property
def skips_nulls(self):
return False
@@ -450,17 +459,9 @@ def skips_nulls(self):
class LastOp(UnaryWindowOp):
name: ClassVar[str] = "last"
- @property
- def uses_total_row_ordering(self):
- return True
-
@dataclasses.dataclass(frozen=True)
class LastNonNullOp(UnaryWindowOp):
- @property
- def uses_total_row_ordering(self):
- return True
-
@property
def skips_nulls(self):
return False
@@ -470,10 +471,6 @@ def skips_nulls(self):
class ShiftOp(UnaryWindowOp):
periods: int
- @property
- def uses_total_row_ordering(self):
- return True
-
@property
def skips_nulls(self):
return False
@@ -483,10 +480,6 @@ def skips_nulls(self):
class DiffOp(UnaryWindowOp):
periods: int
- @property
- def uses_total_row_ordering(self):
- return True
-
@property
def skips_nulls(self):
return False
diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py
index f6e8223aa0..75db2f48e9 100644
--- a/bigframes/operations/base.py
+++ b/bigframes/operations/base.py
@@ -87,7 +87,8 @@ def __init__(
if name is not None:
data.name = name
if dtype is not None:
- data = data.astype(dtype)
+ bf_dtype = bigframes.dtypes.bigframes_type(dtype)
+ data = data.astype(bf_dtype)
else: # local dict-like data
data = read_pandas_func(pd.Series(data, name=name, dtype=dtype)) # type: ignore
data_block = data._block
diff --git a/bigframes/operations/base_ops.py b/bigframes/operations/base_ops.py
index 0308283ad4..fc92ffe760 100644
--- a/bigframes/operations/base_ops.py
+++ b/bigframes/operations/base_ops.py
@@ -48,6 +48,11 @@ def deterministic(self) -> bool:
"""Whether the operation is deterministic" (given deterministic inputs)"""
...
+ @property
+ def expensive(self) -> bool:
+ """Whether the operation is expensive to calculate. Such ops shouldn't be inlined if referenced multiple places."""
+ ...
+
@dataclasses.dataclass(frozen=True)
class ScalarOp:
@@ -73,6 +78,10 @@ def deterministic(self) -> bool:
"""Whether the operation is deterministic" (given deterministic inputs)"""
return True
+ @property
+ def expensive(self) -> bool:
+ return False
+
@dataclasses.dataclass(frozen=True)
class NaryOp(ScalarOp):
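Per the docstring above, `expensive` exists so that tree rewrites can avoid duplicating costly operations. A hedged sketch of the kind of check a rewriter might perform; `should_inline` and `ref_count` are illustrative names, not the library's internals:

```python
def should_inline(op, ref_count: int) -> bool:
    # Cheap scalar ops can be duplicated freely at each use site; expensive
    # ops (e.g. remote functions) should be computed once and referenced.
    return not (op.expensive and ref_count > 1)


class _RemoteStub:
    expensive = True


assert should_inline(_RemoteStub(), ref_count=1)
assert not should_inline(_RemoteStub(), ref_count=2)
```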
diff --git a/bigframes/operations/datetime_ops.py b/bigframes/operations/datetime_ops.py
index 3ee8a00141..5086de27d3 100644
--- a/bigframes/operations/datetime_ops.py
+++ b/bigframes/operations/datetime_ops.py
@@ -43,7 +43,7 @@ class ToDatetimeOp(base_ops.UnaryOp):
format: typing.Optional[str] = None
unit: typing.Optional[str] = None
- def output_type(self, *input_types):
+ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
if input_types[0] not in (
dtypes.FLOAT_DTYPE,
dtypes.INT_DTYPE,
@@ -59,7 +59,7 @@ class ToTimestampOp(base_ops.UnaryOp):
format: typing.Optional[str] = None
unit: typing.Optional[str] = None
- def output_type(self, *input_types):
+ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
# Must be numeric or string
if input_types[0] not in (
dtypes.FLOAT_DTYPE,
@@ -75,7 +75,7 @@ class StrftimeOp(base_ops.UnaryOp):
name: typing.ClassVar[str] = "strftime"
date_format: str
- def output_type(self, *input_types):
+ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
return dtypes.STRING_DTYPE
@@ -83,7 +83,9 @@ def output_type(self, *input_types):
class UnixSeconds(base_ops.UnaryOp):
name: typing.ClassVar[str] = "unix_seconds"
- def output_type(self, *input_types):
+ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
+ if input_types[0] is not dtypes.TIMESTAMP_DTYPE:
+ raise TypeError("expected timestamp input")
return dtypes.INT_DTYPE
@@ -91,7 +93,9 @@ def output_type(self, *input_types):
class UnixMillis(base_ops.UnaryOp):
name: typing.ClassVar[str] = "unix_millis"
- def output_type(self, *input_types):
+ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
+ if input_types[0] is not dtypes.TIMESTAMP_DTYPE:
+ raise TypeError("expected timestamp input")
return dtypes.INT_DTYPE
@@ -99,5 +103,7 @@ def output_type(self, *input_types):
class UnixMicros(base_ops.UnaryOp):
name: typing.ClassVar[str] = "unix_micros"
- def output_type(self, *input_types):
+ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
+ if input_types[0] is not dtypes.TIMESTAMP_DTYPE:
+ raise TypeError("expected timestamp input")
return dtypes.INT_DTYPE
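With the stricter signatures above, the unix-epoch ops now reject non-timestamp inputs at type-check time instead of silently returning INT64. A usage sketch, assuming `bigframes` is installed and the constants exist as shown in this diff:

```python
from bigframes import dtypes
from bigframes.operations.datetime_ops import UnixSeconds

# Timezone-aware timestamps are accepted...
assert UnixSeconds().output_type(dtypes.TIMESTAMP_DTYPE) == dtypes.INT_DTYPE

# ...but a naive datetime is not.
try:
    UnixSeconds().output_type(dtypes.DATETIME_DTYPE)
except TypeError as exc:
    print(exc)  # expected timestamp input
```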
diff --git a/bigframes/operations/generic_ops.py b/bigframes/operations/generic_ops.py
index ef7e1f5cea..b90a43b091 100644
--- a/bigframes/operations/generic_ops.py
+++ b/bigframes/operations/generic_ops.py
@@ -16,8 +16,6 @@
import functools
import typing
-import pyarrow as pa
-
from bigframes import dtypes
from bigframes.operations import base_ops
import bigframes.operations.type as op_typing
@@ -56,17 +54,10 @@
class AsTypeOp(base_ops.UnaryOp):
name: typing.ClassVar[str] = "astype"
# TODO: Convert strings to dtype earlier
- to_type: typing.Union[dtypes.DtypeString, dtypes.Dtype]
+ to_type: dtypes.Dtype
safe: bool = False
def output_type(self, *input_types):
- # TODO: We should do this conversion earlier
- if self.to_type == pa.string():
- return dtypes.STRING_DTYPE
- if isinstance(self.to_type, str):
- return dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[
- typing.cast(dtypes.DtypeString, self.to_type)
- ]
return self.to_type
diff --git a/bigframes/operations/remote_function_ops.py b/bigframes/operations/remote_function_ops.py
index 0bced56f8d..5b738c0bb5 100644
--- a/bigframes/operations/remote_function_ops.py
+++ b/bigframes/operations/remote_function_ops.py
@@ -25,6 +25,10 @@ class RemoteFunctionOp(base_ops.UnaryOp):
func: typing.Callable
apply_on_null: bool
+ @property
+ def expensive(self) -> bool:
+ return True
+
def output_type(self, *input_types):
# This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method
if hasattr(self.func, "output_dtype"):
@@ -45,6 +49,10 @@ class BinaryRemoteFunctionOp(base_ops.BinaryOp):
name: typing.ClassVar[str] = "binary_remote_function"
func: typing.Callable
+ @property
+ def expensive(self) -> bool:
+ return True
+
def output_type(self, *input_types):
# This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method
if hasattr(self.func, "output_dtype"):
@@ -65,6 +73,10 @@ class NaryRemoteFunctionOp(base_ops.NaryOp):
name: typing.ClassVar[str] = "nary_remote_function"
func: typing.Callable
+ @property
+ def expensive(self) -> bool:
+ return True
+
def output_type(self, *input_types):
# This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method
if hasattr(self.func, "output_dtype"):
diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py
index 2b094064cd..46d4344499 100644
--- a/bigframes/operations/strings.py
+++ b/bigframes/operations/strings.py
@@ -20,7 +20,6 @@
import bigframes_vendored.constants as constants
import bigframes_vendored.pandas.core.strings.accessor as vendorstr
-from bigframes import clients
from bigframes.core import log_adapter
import bigframes.dataframe as df
import bigframes.operations as ops
@@ -306,11 +305,8 @@ def to_blob(self, connection: Optional[str] = None) -> series.Series:
raise NotImplementedError()
session = self._block.session
- connection = connection or session._bq_connection
- connection = clients.resolve_full_bq_connection_name(
- connection,
- default_project=session._project,
- default_location=session._location,
+ connection = session._create_bq_connection(
+ connection=connection, iam_role="storage.objectUser"
)
return self._apply_binary_op(connection, ops.obj_make_ref_op)
diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py
index 4a5e4d4b3a..93c08a22aa 100644
--- a/bigframes/pandas/__init__.py
+++ b/bigframes/pandas/__init__.py
@@ -42,6 +42,7 @@
read_gbq,
read_gbq_function,
read_gbq_model,
+ read_gbq_object_table,
read_gbq_query,
read_gbq_table,
read_json,
@@ -306,6 +307,7 @@ def reset_session():
"read_gbq",
"read_gbq_function",
"read_gbq_model",
+ "read_gbq_object_table",
"read_gbq_query",
"read_gbq_table",
"read_json",
diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py
index 454b2e729e..a119ff67b0 100644
--- a/bigframes/pandas/io/api.py
+++ b/bigframes/pandas/io/api.py
@@ -193,6 +193,21 @@ def read_gbq_model(model_name: str):
read_gbq_model.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_model)
+def read_gbq_object_table(
+ object_table: str, *, name: Optional[str] = None
+) -> bigframes.dataframe.DataFrame:
+ return global_session.with_default_session(
+ bigframes.session.Session.read_gbq_object_table,
+ object_table,
+ name=name,
+ )
+
+
+read_gbq_object_table.__doc__ = inspect.getdoc(
+ bigframes.session.Session.read_gbq_object_table
+)
+
+
def read_gbq_query(
query: str,
*,
diff --git a/bigframes/series.py b/bigframes/series.py
index e705a97fa9..706c0f4f09 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -168,6 +168,10 @@ def values(self) -> numpy.ndarray:
def index(self) -> indexes.Index:
return indexes.Index.from_frame(self)
+ @validations.requires_index
+ def keys(self) -> indexes.Index:
+ return self.index
+
@property
def query_job(self) -> Optional[bigquery.QueryJob]:
"""BigQuery job metadata for the most recent query.
@@ -362,6 +366,7 @@ def astype(
) -> Series:
if errors not in ["raise", "null"]:
raise ValueError("Argument 'errors' must be one of 'raise' or 'null'")
+ dtype = bigframes.dtypes.bigframes_type(dtype)
return self._apply_unary_op(
bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null"))
)
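Converting `dtype` through `bigframes_type` up front lets `astype` accept Python builtin types as well as dtype strings. A hedged usage sketch, assuming the natural builtin-to-BigFrames mappings:

```python
import bigframes.pandas as bpd

s = bpd.Series([1, 2, 3])

# These are expected to be equivalent spellings:
s.astype(float)  # same as s.astype("Float64")
s.astype(str)    # same as s.astype("string[pyarrow]")
```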
@@ -483,7 +488,19 @@ def between(self, left, right, inclusive="both"):
)
def case_when(self, caselist) -> Series:
- cases = list(itertools.chain(*caselist, (True, self)))
+ cases = []
+
+ for condition, output in itertools.chain(caselist, [(True, self)]):
+ cases.append(condition)
+ cases.append(output)
+ # In pandas, the default value if no case matches is the original value.
+ # This makes it impossible to change the type of the column, but if
+ # the condition is always True, we know it will match and no subsequent
+ # conditions matter (including the fallback to `self`). This break allows
+ # the type to change (see: internal issue 349926559).
+ if condition is True:
+ break
+
return self._apply_nary_op(
ops.case_when_op,
cases,
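A usage sketch of the early-break behavior implemented above: a `(True, default)` case acts as an unconditional match, so later cases and the implicit fallback to `self` are dropped and the result dtype may differ from the input's (values below are illustrative):

```python
import bigframes.pandas as bpd

s = bpd.Series([10, 20, 30])
labels = s.case_when(
    [
        (s < 15, "small"),
        (True, "large"),  # always matches; fallback to `s` is skipped
    ]
)
# `labels` is a string series even though `s` holds integers.
```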
diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
index 02f79a7d99..c8c44be40b 100644
--- a/bigframes/session/__init__.py
+++ b/bigframes/session/__init__.py
@@ -120,6 +120,7 @@
pandas.ArrowDtype(pa.timestamp("us", tz="UTC")),
pandas.ArrowDtype(pa.decimal128(38, 9)),
pandas.ArrowDtype(pa.decimal256(76, 38)),
+ pandas.ArrowDtype(pa.duration("us")),
)
@@ -245,7 +246,7 @@ def __init__(
)
self._metrics = bigframes.session.metrics.ExecutionMetrics()
- self._function_session = bff_session.RemoteFunctionSession()
+ self._function_session = bff_session.FunctionSession()
self._temp_storage_manager = (
bigframes.session.temp_storage.TemporaryGbqStorageManager(
self._clients_provider.bqclient,
@@ -806,6 +807,7 @@ def _read_pandas_inline(
pa.ArrowInvalid, # Thrown by arrow for unsupported types, such as geo.
pa.ArrowTypeError, # Thrown by arrow for types without mapping (geo).
ValueError, # Thrown by ibis for some unhandled types
+ TypeError, # Not all types can be handled by the local code path
) as exc:
if should_raise:
raise ValueError(
@@ -1482,7 +1484,7 @@ def read_gbq_function(
2 TestCad$123456Str
dtype: string
- Another use case is to define your own remote funtion and use it later.
+ Another use case is to define your own remote function and use it later.
For example, define the remote function:
>>> @bpd.remote_function()
@@ -1624,6 +1626,11 @@ def from_glob_path(
self, path: str, *, connection: Optional[str] = None, name: Optional[str] = None
) -> dataframe.DataFrame:
r"""Create a BigFrames DataFrame that contains a BigFrames Blob column from a global wildcard path.
+ This operation creates a temporary BQ Object Table under the hood and requires bigquery.connections.delegate permission or BigQuery Connection Admin role.
+ If you have an existing BQ Object Table, use read_gbq_object_table().
+
+ .. note::
+ BigFrames Blob is still experimental. It may not work as expected and is subject to change in the future.
Args:
path (str):
@@ -1641,16 +1648,64 @@ def from_glob_path(
if not bigframes.options.experiments.blob:
raise NotImplementedError()
- connection = connection or self._bq_connection
+ # TODO(garrettwu): switch to pseudocolumn when b/374988109 is done.
+ connection = self._create_bq_connection(
+ connection=connection, iam_role="storage.objectUser"
+ )
+
+ table = self._create_object_table(path, connection)
+
+ s = self.read_gbq(table)["uri"].str.to_blob(connection)
+ return s.rename(name).to_frame()
+
+ def _create_bq_connection(
+ self, iam_role: str, *, connection: Optional[str] = None
+ ) -> str:
+ """Create the connection with the session settings and try to attach iam role to the connection SA.
+ If any of project, location or connection isn't specified, use the session defaults. Returns fully-qualified connection name."""
+ connection = self._bq_connection if not connection else connection
connection = bigframes.clients.resolve_full_bq_connection_name(
- connection,
+ connection_name=connection,
default_project=self._project,
default_location=self._location,
)
+ connection_parts = connection.split(".")
+ assert len(connection_parts) == 3
+
+ self.bqconnectionmanager.create_bq_connection(
+ project_id=connection_parts[0],
+ location=connection_parts[1],
+ connection_id=connection_parts[2],
+ iam_role=iam_role,
+ )
- table = self._create_object_table(path, connection)
+ return connection
- s = self.read_gbq(table)["uri"].str.to_blob(connection)
+ def read_gbq_object_table(
+ self, object_table: str, *, name: Optional[str] = None
+ ) -> dataframe.DataFrame:
+ """Read an existing object table to create a BigFrames Blob DataFrame. Use the connection of the object table for the connection of the blob.
+ This function dosen't retrieve the object table data. If you want to read the data, use read_gbq() instead.
+
+ .. note::
+ BigFrames Blob is still experimental. It may not work as expected and is subject to change in the future.
+
+ Args:
+ object_table (str): name of the object table, in the form <project>.<dataset>.<table>.
+ name (str or None): the returned blob column name.
+
+ Returns:
+ bigframes.pandas.DataFrame:
+ Result BigFrames DataFrame.
+ """
+ if not bigframes.options.experiments.blob:
+ raise NotImplementedError()
+
+ # TODO(garrettwu): switch to pseudocolumn when b/374988109 is done.
+ table = self.bqclient.get_table(object_table)
+ connection = table._properties["externalDataConfiguration"]["connectionId"]
+
+ s = self.read_gbq(object_table)["uri"].str.to_blob(connection)
return s.rename(name).to_frame()
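A hedged end-to-end sketch of the new entry point (the table name and column label are placeholders, and the experiment flag must be on, as the guard above shows):

```python
import bigframes
import bigframes.pandas as bpd

bigframes.options.experiments.blob = True

# Wrap an existing object table's `uri` column as a blob column, reusing
# the object table's own connection.
df = bpd.read_gbq_object_table(
    "my-project.my_dataset.my_object_table",  # placeholder name
    name="blob",
)
```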
diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py
index ac9523243e..ed68762ee8 100644
--- a/bigframes/session/_io/bigquery/read_gbq_table.py
+++ b/bigframes/session/_io/bigquery/read_gbq_table.py
@@ -152,24 +152,28 @@ def validate_table(
return False
-def are_index_cols_unique(
+def infer_unique_columns(
bqclient: bigquery.Client,
table: bigquery.table.Table,
index_cols: List[str],
api_name: str,
metadata_only: bool = False,
-) -> bool:
- if len(index_cols) == 0:
- return False
+) -> Tuple[str, ...]:
+ """Return a set of columns that can provide a unique row key or empty if none can be inferred.
+
+ Note: primary keys are not enforced, but these are assumed to be unique
+ by the query engine, so we make the same assumption here.
+ """
# If index_cols contain the primary_keys, the query engine assumes they
# provide a unique index.
- primary_keys = frozenset(_get_primary_keys(table))
- if (len(primary_keys) > 0) and primary_keys <= frozenset(index_cols):
- return True
+ primary_keys = tuple(_get_primary_keys(table))
+ if (len(primary_keys) > 0) and frozenset(primary_keys) <= frozenset(index_cols):
+ # Essentially, just reordering the primary key to match the index col order
+ return tuple(index_col for index_col in index_cols if index_col in primary_keys)
- if metadata_only:
+ if primary_keys or metadata_only or (not index_cols):
# Sometimes not worth scanning data to check uniqueness
- return False
+ return primary_keys
# TODO(b/337925142): Avoid a "SELECT *" subquery here by ensuring
# table_expression only selects just index_cols.
is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table.reference)
@@ -178,7 +182,9 @@ def are_index_cols_unique(
results = bqclient.query_and_wait(is_unique_sql, job_config=job_config)
row = next(iter(results))
- return row["total_count"] == row["distinct_count"]
+ if row["total_count"] == row["distinct_count"]:
+ return tuple(index_cols)
+ return ()
def _get_primary_keys(
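Hedged usage sketch of the renamed helper: the return value is now the ordered key tuple itself rather than a boolean (the client, table, and column names below are placeholders):

```python
from google.cloud import bigquery

from bigframes.session._io.bigquery import read_gbq_table as bf_read_gbq_table

bqclient = bigquery.Client()
table = bqclient.get_table("my-project.my_dataset.events")  # placeholder

primary_key = bf_read_gbq_table.infer_unique_columns(
    bqclient=bqclient,
    table=table,
    index_cols=["user_id", "event_ts"],  # placeholder columns
    api_name="read_gbq_table",
    metadata_only=True,  # trust table metadata; skip the uniqueness query
)
if not primary_key:
    ...  # strict mode falls back to ordering rows by hash
```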
@@ -279,54 +285,3 @@ def get_index_cols(
index_cols = primary_keys
return index_cols
-
-
-def get_time_travel_datetime_and_table_metadata(
- bqclient: bigquery.Client,
- table_ref: bigquery.TableReference,
- *,
- api_name: str,
- cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]],
- use_cache: bool = True,
-) -> Tuple[datetime.datetime, bigquery.Table]:
- cached_table = cache.get(table_ref)
- if use_cache and cached_table is not None:
- snapshot_timestamp, _ = cached_table
-
- # Cache hit could be unexpected. See internal issue 329545805.
- # Raise a warning with more information about how to avoid the
- # problems with the cache.
- msg = (
- f"Reading cached table from {snapshot_timestamp} to avoid "
- "incompatibilies with previous reads of this table. To read "
- "the latest version, set `use_cache=False` or close the "
- "current session with Session.close() or "
- "bigframes.pandas.close_session()."
- )
- # There are many layers before we get to (possibly) the user's code:
- # pandas.read_gbq_table
- # -> with_default_session
- # -> Session.read_gbq_table
- # -> _read_gbq_table
- # -> _get_snapshot_sql_and_primary_key
- # -> get_snapshot_datetime_and_table_metadata
- warnings.warn(msg, stacklevel=7)
- return cached_table
-
- # TODO(swast): It's possible that the table metadata is changed between now
- # and when we run the CURRENT_TIMESTAMP() query to see when we can time
- # travel to. Find a way to fetch the table metadata and BQ's current time
- # atomically.
- table = bqclient.get_table(table_ref)
-
- job_config = bigquery.QueryJobConfig()
- job_config.labels["bigframes-api"] = api_name
- snapshot_timestamp = list(
- bqclient.query(
- "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`",
- job_config=job_config,
- ).result()
- )[0][0]
- cached_table = (snapshot_timestamp, table)
- cache[table_ref] = cached_table
- return cached_table
diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py
index 301e1c4ebb..532a909430 100644
--- a/bigframes/session/_io/pandas.py
+++ b/bigframes/session/_io/pandas.py
@@ -14,7 +14,7 @@
from __future__ import annotations
import dataclasses
-from typing import Collection, Union
+from typing import Collection, List, Union
import bigframes_vendored.constants as constants
import db_dtypes # type: ignore
@@ -38,6 +38,7 @@ class DataFrameAndLabels:
column_labels: Collection
index_labels: Collection
ordering_col: str
+ timedelta_cols: List[str]
def _arrow_to_pandas_arrowdtype(
@@ -163,9 +164,12 @@ def pandas_to_bq_compatible(pandas_dataframe: pandas.DataFrame) -> DataFrameAndL
pandas_dataframe_copy.columns = pandas.Index(new_col_ids)
pandas_dataframe_copy[ordering_col] = np.arange(pandas_dataframe_copy.shape[0])
+ timedelta_cols = utils.replace_timedeltas_with_micros(pandas_dataframe_copy)
+
return DataFrameAndLabels(
df=pandas_dataframe_copy,
column_labels=col_labels,
index_labels=idx_labels,
ordering_col=ordering_col,
+ timedelta_cols=timedelta_cols,
)
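A minimal sketch of what the new `timedelta_cols` bookkeeping implies, assuming `replace_timedeltas_with_micros` rewrites timedelta64 columns to integer microseconds and returns the affected column names so the loader can restore the dtype later:

```python
import pandas as pd

df = pd.DataFrame({"wait": pd.to_timedelta(["1s", "2s"])})

# Store as integer microseconds for the BQ load job...
df["wait"] = df["wait"] // pd.Timedelta(microseconds=1)
timedelta_cols = ["wait"]

# ...so the loader can override the column type on the way back
# (cf. the TIMEDETLA_DTYPE overrides in loader.py below).
```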
diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py
index 43faae37c3..ba693696c3 100644
--- a/bigframes/session/loader.py
+++ b/bigframes/session/loader.py
@@ -176,10 +176,16 @@ def read_pandas_load_job(
self._start_generic_job(load_job)
destination_table = self._bqclient.get_table(load_table_destination)
+ col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = {
+ col: bigframes.dtypes.TIMEDETLA_DTYPE
+ for col in df_and_labels.timedelta_cols
+ }
array_value = core.ArrayValue.from_table(
table=destination_table,
- # TODO: Generate this directly from original pandas df.
- schema=schemata.ArraySchema.from_bq_table(destination_table),
+ # TODO (b/394156190): Generate this directly from original pandas df.
+ schema=schemata.ArraySchema.from_bq_table(
+ destination_table, col_type_overrides
+ ),
session=self._session,
offsets_col=ordering_col,
).drop_columns([ordering_col])
@@ -229,10 +235,16 @@ def read_pandas_streaming(
f"Problem loading at least one row from DataFrame: {errors}. {constants.FEEDBACK_LINK}"
)
+ col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = {
+ col: bigframes.dtypes.TIMEDETLA_DTYPE
+ for col in df_and_labels.timedelta_cols
+ }
array_value = (
core.ArrayValue.from_table(
table=destination_table,
- schema=schemata.ArraySchema.from_bq_table(destination_table),
+ schema=schemata.ArraySchema.from_bq_table(
+ destination_table, col_type_overrides
+ ),
session=self._session,
# Don't set the offsets column because we want to group by it.
)
@@ -424,7 +436,7 @@ def read_gbq_table(
# in the query that checks for index uniqueness.
# TODO(b/338065601): Provide a way to assume uniqueness and avoid this
# check.
- is_index_unique = bf_read_gbq_table.are_index_cols_unique(
+ primary_key = bf_read_gbq_table.infer_unique_columns(
bqclient=self._bqclient,
table=table,
index_cols=index_cols,
@@ -440,12 +452,12 @@ def read_gbq_table(
schema=schema,
predicate=filter_str,
at_time=time_travel_timestamp if enable_snapshot else None,
- primary_key=index_cols if is_index_unique else (),
+ primary_key=primary_key,
session=self._session,
)
# if we don't have a unique index, we order by row hash if we are in strict mode
if self._force_total_order:
- if not is_index_unique:
+ if not primary_key:
array_value = array_value.order_by(
[
bigframes.core.ordering.OrderingExpression(
diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py
index 352cd0d892..33bcd7fbf5 100644
--- a/bigframes/session/metrics.py
+++ b/bigframes/session/metrics.py
@@ -30,28 +30,31 @@ class ExecutionMetrics:
slot_millis: int = 0
bytes_processed: int = 0
execution_secs: float = 0
+ query_char_count: int = 0
def count_job_stats(self, query_job: bq_job.QueryJob):
stats = get_performance_stats(query_job)
if stats is not None:
- bytes_processed, slot_millis, execution_secs = stats
+ bytes_processed, slot_millis, execution_secs, query_char_count = stats
self.execution_count += 1
self.bytes_processed += bytes_processed
self.slot_millis += slot_millis
self.execution_secs += execution_secs
+ self.query_char_count += query_char_count
if LOGGING_NAME_ENV_VAR in os.environ:
# when running notebooks via pytest nbmake
- write_stats_to_disk(bytes_processed, slot_millis, execution_secs)
+ write_stats_to_disk(
+ bytes_processed, slot_millis, execution_secs, query_char_count
+ )
def get_performance_stats(
query_job: bigquery.QueryJob,
-) -> Optional[Tuple[int, int, float]]:
+) -> Optional[Tuple[int, int, float, int]]:
"""Parse the query job for performance stats.
Return None if the stats do not reflect real work done in BigQuery.
"""
-
if (
query_job.configuration.dry_run
or query_job.created is None
@@ -68,12 +71,13 @@ def get_performance_stats(
return None # filter out mocks
execution_secs = (query_job.ended - query_job.created).total_seconds()
+ query_char_count = len(query_job.query)
- return bytes_processed, slot_millis, execution_secs
+ return bytes_processed, slot_millis, execution_secs, query_char_count
def write_stats_to_disk(
- bytes_processed: int, slot_millis: int, exec_seconds: Optional[float]
+ bytes_processed: int, slot_millis: int, exec_seconds: float, query_char_count: int
):
"""For pytest runs only, log information about the query job
to a file in order to create a performance report.
@@ -103,3 +107,10 @@ def write_stats_to_disk(
)
with open(exec_time_file, "a") as f:
f.write(str(exec_seconds) + "\n")
+
+ # store length of query
+ query_char_count_file = os.path.join(
+ current_directory, test_name + ".query_char_count"
+ )
+ with open(query_char_count_file, "a") as f:
+ f.write(str(query_char_count) + "\n")
diff --git a/bigframes/version.py b/bigframes/version.py
index 1fef294cef..d9b9875805 100644
--- a/bigframes/version.py
+++ b/bigframes/version.py
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-__version__ = "1.34.0"
+__version__ = "1.35.0"
diff --git a/docs/reference/bigframes.geopandas/geoseries.rst b/docs/reference/bigframes.geopandas/geoseries.rst
index 1819613955..91e853b1f8 100644
--- a/docs/reference/bigframes.geopandas/geoseries.rst
+++ b/docs/reference/bigframes.geopandas/geoseries.rst
@@ -11,7 +11,7 @@ GeoSeries
Series
------
-.. autoclass:: bigframes.geopandas.geoseries.GeoSeries
+.. autoclass:: bigframes.geopandas.GeoSeries
:members:
:inherited-members:
:undoc-members:
diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml
index 47d9e97d7a..c17a1788df 100644
--- a/docs/templates/toc.yml
+++ b/docs/templates/toc.yml
@@ -207,6 +207,10 @@
- name: BigQuery built-in functions
uid: bigframes.bigquery
name: bigframes.bigquery
+ - items:
+ - name: GeoSeries
+ uid: bigframes.geopandas
+ name: bigframes.geopandas
- items:
- name: Overview
uid: bigframes.streaming
diff --git a/notebooks/geo/geoseries.ipynb b/notebooks/geo/geoseries.ipynb
new file mode 100644
index 0000000000..160d19ce91
--- /dev/null
+++ b/notebooks/geo/geoseries.ipynb
@@ -0,0 +1,371 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright 2025 Google LLC\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Analyzing a GEOGRAPHY column with `bigframes.geopandas.GeoSeries`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import bigframes\n",
+ "import bigframes.geopandas\n",
+ "import bigframes.pandas as bpd\n",
+ "import shapely\n",
+ "bpd.options.display.progress_bar = None"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Load the Counties table from the Census Bureau US Boundaries dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/session/_io/bigquery/read_gbq_table.py:274: DefaultIndexWarning: Table 'bigquery-public-data.geo_us_boundaries.counties' is clustered and/or partitioned, but BigQuery DataFrames was not able to find a suitable index. To avoid this warning, set at least one of: `index_col` or `filters`.\n",
+ " warnings.warn(msg, category=bfe.DefaultIndexWarning)\n"
+ ]
+ }
+ ],
+ "source": [
+ "df = bpd.read_gbq(\"bigquery-public-data.geo_us_boundaries.counties\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Create a series from the int_point_geom column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "point_geom_series = df['int_point_geom']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## The `GeoSeries` constructor accepts local data or a `bigframes.pandas.Series` object."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Create a GeoSeries from local data with `Peek`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "54 POINT (-93.47523 45.00612)\n",
+ "256 POINT (-89.60507 42.67552)\n",
+ "266 POINT (-104.11408 39.31516)\n",
+ "485 POINT (-91.23193 32.34688)\n",
+ "765 POINT (-83.42808 38.20427)\n",
+ "Name: int_point_geom, dtype: geometry"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "five_geo_points = point_geom_series.peek(n = 5)\n",
+ "five_geo_points"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Convert the five geo points to `bigframes.gopandas.GeoSeries`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Note: TypeError is raised if the GEOGRAPHY column contains geometry type other than `Point`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 POINT (-86.87338 38.37334)\n",
+ "1 POINT (-118.48037 46.25461)\n",
+ "2 POINT (-92.5617 32.30429)\n",
+ "3 POINT (-83.46189 39.55525)\n",
+ "4 POINT (-119.46779 47.21363)\n",
+ "dtype: geometry"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "geo_points = bigframes.geopandas.GeoSeries(\n",
+ " [point for point in five_geo_points]\n",
+ ")\n",
+ "geo_points"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Retrieve the x (longitude) and y (latitude) from the GeoSeries with `.x` and `.y`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### `.x`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 -86.873385\n",
+ "1 -118.48037\n",
+ "2 -92.5617\n",
+ "3 -83.461893\n",
+ "4 -119.467788\n",
+ "dtype: Float64"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "geo_points.x"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### `.y`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 38.373344\n",
+ "1 46.254606\n",
+ "2 32.30429\n",
+ "3 39.555246\n",
+ "4 47.213633\n",
+ "dtype: Float64"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "geo_points.y"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Alternatively, use the `.geo` accessor to access GeoSeries methods from a `bigframes.pandas.Series` object."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### `geo.x`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 -101.298265\n",
+ "1 -99.111085\n",
+ "2 -66.58687\n",
+ "3 -102.601791\n",
+ "4 -71.578625\n",
+ "5 -88.961529\n",
+ "6 -87.492986\n",
+ "7 -82.422666\n",
+ "8 -100.208166\n",
+ "9 -85.815939\n",
+ "10 -101.681133\n",
+ "11 -119.516659\n",
+ "12 -89.398306\n",
+ "13 -107.78848\n",
+ "14 -91.159306\n",
+ "15 -113.887042\n",
+ "16 -83.470416\n",
+ "17 -98.520146\n",
+ "18 -83.911718\n",
+ "19 -87.321865\n",
+ "20 -91.727626\n",
+ "21 -93.466093\n",
+ "22 -101.143324\n",
+ "23 -78.657634\n",
+ "24 -94.272323\n",
+ "dtype: Float64"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "point_geom_series.geo.x"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### `geo.y`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 46.710819\n",
+ "1 29.353661\n",
+ "2 18.211152\n",
+ "3 38.835646\n",
+ "4 41.869768\n",
+ "5 39.860237\n",
+ "6 36.892059\n",
+ "7 38.143642\n",
+ "8 34.524623\n",
+ "9 30.862007\n",
+ "10 40.180165\n",
+ "11 46.228125\n",
+ "12 36.054196\n",
+ "13 38.154731\n",
+ "14 38.761902\n",
+ "15 44.928506\n",
+ "16 30.447232\n",
+ "17 29.448671\n",
+ "18 42.602532\n",
+ "19 34.529776\n",
+ "20 33.957675\n",
+ "21 42.037538\n",
+ "22 29.875285\n",
+ "23 36.299884\n",
+ "24 44.821657\n",
+ "dtype: Float64"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "point_geom_series.geo.y"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.19"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb
index 38ce75cc25..c5deeef1c5 100644
--- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb
+++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb
@@ -1448,6 +1448,20 @@
"Running your own Python functions (or being able to bring your packages) and using them at scale is a challenge many data scientists face. BigQuery DataFrames makes it easy to deploy [remote functions](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.pandas#bigframes_pandas_remote_function) that run scalar Python functions at BigQuery scale. These functions are persisted as [BigQuery remote functions](https://cloud.google.com/bigquery/docs/remote-functions) that you can then re-use."
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "\n",
+ "# Python 3.13 is not yet a supported runtime for remote functions.\n",
+ "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n",
+ "if sys.version_info >= (3, 13, 0):\n",
+ " sys.exit(0)"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {
diff --git a/notebooks/location/regionalized.ipynb b/notebooks/location/regionalized.ipynb
index 5a8239a42a..1b138c6a66 100644
--- a/notebooks/location/regionalized.ipynb
+++ b/notebooks/location/regionalized.ipynb
@@ -1339,6 +1339,20 @@
"# Using the Remote Functions"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "\n",
+ "# Python 3.13 is not yet a supported runtime for remote functions.\n",
+ "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n",
+ "if sys.version_info >= (3, 13, 0):\n",
+ " sys.exit(0)"
+ ]
+ },
{
"attachments": {},
"cell_type": "markdown",
diff --git a/notebooks/remote_functions/remote_function.ipynb b/notebooks/remote_functions/remote_function.ipynb
index 1c1048d356..2114311e10 100644
--- a/notebooks/remote_functions/remote_function.ipynb
+++ b/notebooks/remote_functions/remote_function.ipynb
@@ -1,5 +1,20 @@
{
"cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bcff4fc4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "\n",
+ "# Python 3.13 is not yet a supported runtime for remote functions.\n",
+ "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n",
+ "if sys.version_info >= (3, 13, 0):\n",
+ " sys.exit(0)"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 19,
diff --git a/notebooks/remote_functions/remote_function_usecases.ipynb b/notebooks/remote_functions/remote_function_usecases.ipynb
index b897def4e8..d4dde6e6b1 100644
--- a/notebooks/remote_functions/remote_function_usecases.ipynb
+++ b/notebooks/remote_functions/remote_function_usecases.ipynb
@@ -21,6 +21,20 @@
"# limitations under the License."
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "\n",
+ "# Python 3.13 is not yet a supported runtime for remote functions.\n",
+ "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n",
+ "if sys.version_info >= (3, 13, 0):\n",
+ " sys.exit(0)"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
diff --git a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb
index 641a30e104..78f0d27474 100644
--- a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb
+++ b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb
@@ -28,6 +28,20 @@
""
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "\n",
+ "# Python 3.13 is not yet a supported runtime for remote functions.\n",
+ "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n",
+ "if sys.version_info >= (3, 13, 0):\n",
+ " sys.exit(0)"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
diff --git a/noxfile.py b/noxfile.py
index 863c7b26d3..b851bf160d 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -24,7 +24,6 @@
import shutil
import time
from typing import Dict, List
-import warnings
import nox
import nox.sessions
@@ -32,6 +31,19 @@
BLACK_VERSION = "black==22.3.0"
ISORT_VERSION = "isort==5.12.0"
+# TODO: switch to 3.13 once remote functions / cloud run adds a runtime for it (internal issue 333742751)
+LATEST_FULLY_SUPPORTED_PYTHON = "3.12"
+
+# Notebook tests should match colab and BQ Studio.
+# Check with import sys; sys.version_info
+# on a fresh notebook runtime.
+COLAB_AND_BQ_STUDIO_PYTHON_VERSIONS = [
+ # BQ Studio
+ "3.10",
+ # colab.research.google.com
+ "3.11",
+]
+
# pytest-retry is not yet compatible with pytest 8.x.
# https://github.com/str0zzapreti/pytest-retry/issues/32
PYTEST_VERSION = "pytest<8.0.0dev"
@@ -47,7 +59,7 @@
DEFAULT_PYTHON_VERSION = "3.10"
-UNIT_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12"]
+UNIT_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13"]
UNIT_TEST_STANDARD_DEPENDENCIES = [
"mock",
"asyncmock",
@@ -57,15 +69,15 @@
"pytest-asyncio",
"pytest-mock",
]
-UNIT_TEST_EXTERNAL_DEPENDENCIES: List[str] = []
UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = []
UNIT_TEST_DEPENDENCIES: List[str] = []
UNIT_TEST_EXTRAS: List[str] = []
UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {"3.12": ["polars"]}
-# There are 4 different ibis-framework 9.x versions we want to test against.
-# 3.10 is needed for Windows tests.
-SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12"]
+# 3.10 is needed for Windows tests as it is the only version installed in the
+# bigframes-windows container image. For more information, search
+# bigframes/windows-docker internally.
+SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.12", "3.13"]
SYSTEM_TEST_STANDARD_DEPENDENCIES = [
"jinja2",
"mock",
@@ -169,14 +181,6 @@ def install_unittest_dependencies(session, install_test_extra, *constraints):
standard_deps = UNIT_TEST_STANDARD_DEPENDENCIES + UNIT_TEST_DEPENDENCIES
session.install(*standard_deps, *constraints)
- if UNIT_TEST_EXTERNAL_DEPENDENCIES:
- msg = (
- "'unit_test_external_dependencies' is deprecated. Instead, please "
- "use 'unit_test_dependencies' or 'unit_test_local_dependencies'.",
- )
- warnings.warn(msg, DeprecationWarning)
- session.install(*UNIT_TEST_EXTERNAL_DEPENDENCIES, *constraints)
-
if UNIT_TEST_LOCAL_DEPENDENCIES:
session.install(*UNIT_TEST_LOCAL_DEPENDENCIES, *constraints)
@@ -375,7 +379,7 @@ def system(session: nox.sessions.Session):
)
-@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS[-1])
+@nox.session(python=LATEST_FULLY_SUPPORTED_PYTHON)
def system_noextras(session: nox.sessions.Session):
"""Run the system test suite."""
run_system(
@@ -386,7 +390,7 @@ def system_noextras(session: nox.sessions.Session):
)
-@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS[-1])
+@nox.session(python=LATEST_FULLY_SUPPORTED_PYTHON)
def doctest(session: nox.sessions.Session):
"""Run the system test suite."""
run_system(
@@ -444,7 +448,7 @@ def cover(session):
"report",
"--include=bigframes/*",
"--show-missing",
- "--fail-under=86",
+ "--fail-under=85",
)
# Make sure there is no dead code in our test directories.
@@ -694,7 +698,7 @@ def system_prerelease(session: nox.sessions.Session):
# This would mean that we will only rely on the standard remote function
# tests.
small_remote_function_tests = os.path.join(
- small_tests_dir, "test_remote_function.py"
+ small_tests_dir, "functions", "test_remote_function.py"
)
assert os.path.exists(small_remote_function_tests)
@@ -705,7 +709,7 @@ def system_prerelease(session: nox.sessions.Session):
)
-@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS)
+@nox.session(python=COLAB_AND_BQ_STUDIO_PYTHON_VERSIONS)
def notebook(session: nox.Session):
google_cloud_project = os.getenv("GOOGLE_CLOUD_PROJECT")
if not google_cloud_project:
@@ -762,6 +766,20 @@ def notebook(session: nox.Session):
"notebooks/apps/synthetic_data_generation.ipynb",
]
+ # TODO: remove exception for Python 3.13 once cloud run adds a runtime for it (internal issue 333742751)
+ # TODO: remove exception for Python 3.13 if nbmake adds support for
+ # sys.exit(0) or pytest.skip(...).
+ # See: https://github.com/treebeardtech/nbmake/issues/134
+ if session.python == "3.13":
+ denylist.extend(
+ [
+ "notebooks/getting_started/getting_started_bq_dataframes.ipynb",
+ "notebooks/remote_functions/remote_function_usecases.ipynb",
+ "notebooks/remote_functions/remote_function_vertex_claude_model.ipynb",
+ "notebooks/remote_functions/remote_function.ipynb",
+ ]
+ )
+
# Convert each Path notebook object to a string using a list comprehension.
notebooks = [str(nb) for nb in notebooks_list]
diff --git a/owlbot.py b/owlbot.py
index 5de70bcad6..10fc47ebd7 100644
--- a/owlbot.py
+++ b/owlbot.py
@@ -31,8 +31,8 @@
# ----------------------------------------------------------------------------
templated_files = common.py_library(
default_python_version="3.10",
- unit_test_python_versions=["3.9", "3.10", "3.11", "3.12"],
- system_test_python_versions=["3.9", "3.11", "3.12"],
+ unit_test_python_versions=["3.9", "3.10", "3.11", "3.12", "3.13"],
+ system_test_python_versions=["3.9", "3.11", "3.12", "3.13"],
cov_level=35,
intersphinx_dependencies={
"pandas": "https://pandas.pydata.org/pandas-docs/stable/",
diff --git a/samples/snippets/create_multiple_timeseries_forecasting_model_test.py b/samples/snippets/create_multiple_timeseries_forecasting_model_test.py
index e414fdea9c..b749c37d50 100644
--- a/samples/snippets/create_multiple_timeseries_forecasting_model_test.py
+++ b/samples/snippets/create_multiple_timeseries_forecasting_model_test.py
@@ -17,6 +17,7 @@ def test_multiple_timeseries_forecasting_model(random_model_id: str) -> None:
your_model_id = random_model_id
# [START bigquery_dataframes_bqml_arima_multiple_step_2_visualize]
+
import bigframes.pandas as bpd
df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips")
diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py
index 60b8d13149..9965da2817 100644
--- a/samples/snippets/create_single_timeseries_forecasting_model_test.py
+++ b/samples/snippets/create_single_timeseries_forecasting_model_test.py
@@ -104,7 +104,22 @@ def test_create_single_timeseries() -> None:
# 25 2017-08-27 00:00:00+00:00 1853.735689 410.596551 0.8 1327.233216 2380.238162 1327.233216 2380.238162
# 1 2017-08-03 00:00:00+00:00 2621.33159 241.093355 0.8 2312.180802 2930.482379 2312.180802 2930.482379
# [END bigquery_dataframes_single_timeseries_forecasting_model_tutorial_forecast]
+
+ # [START bigquery_dataframes_single_timeseries_forecasting_model_tutorial_explain_forecast]
+ ex_pred = model.predict_explain(horizon=30, confidence_level=0.8)
+
+ print(ex_pred.head(4))
+ # Expected output:
+ # time_series_timestamp time_series_type time_series_data time_series_adjusted_data standard_error confidence_level prediction_interval_lower_bound prediction_interval_upper_bound trend seasonal_period_yearly seasonal_period_quarterly seasonal_period_monthly seasonal_period_weekly seasonal_period_daily holiday_effect spikes_and_dips step_changes residual
+ # 0 2016-08-01 00:00:00+00:00 history 1711.0 505.716474 206.939556 0.0 169.611938 1205.283526 336.104536
+ # 1 2016-08-02 00:00:00+00:00 history 2140.0 623.137701 206.939556 336.104428 287.033273 1205.283526 311.578773
+ # 2 2016-08-03 00:00:00+00:00 history 2890.0 1008.655091 206.939556 563.514213 445.140878 1205.283526 676.061383
+ # 3 2016-08-04 00:00:00+00:00 history 3161.0 1389.40959 206.939556 986.317236 403.092354 1205.283526 566.306884
+ # 4 2016-08-05 00:00:00+00:00 history 2702.0 1394.395741 206.939556 1248.707386 145.688355 1205.283526 102.320733
+ # 5 2016-08-06 00:00:00+00:00 history 1663.0 437.09243 206.939556 1188.59004 -751.49761 1205.283526 20.624044
+ # [END bigquery_dataframes_single_timeseries_forecasting_model_tutorial_explain_forecast]
assert coef is not None
+ assert ex_pred is not None
assert summary is not None
assert model is not None
assert parsed_date is not None
diff --git a/scripts/run_and_publish_benchmark.py b/scripts/run_and_publish_benchmark.py
index 8b55493770..28605a8155 100644
--- a/scripts/run_and_publish_benchmark.py
+++ b/scripts/run_and_publish_benchmark.py
@@ -88,6 +88,8 @@ def collect_benchmark_result(
millis_files = sorted(path.rglob("*.slotmillis"))
bq_seconds_files = sorted(path.rglob("*.bq_exec_time_seconds"))
local_seconds_files = sorted(path.rglob("*.local_exec_time_seconds"))
+ query_char_count_files = sorted(path.rglob("*.query_char_count"))
+
error_files = sorted(path.rglob("*.error"))
if not (
@@ -95,15 +97,18 @@ def collect_benchmark_result(
== len(millis_files)
== len(local_seconds_files)
== len(bq_seconds_files)
+ == len(query_char_count_files)
):
raise ValueError(
- "Mismatch in the number of report files for bytes, millis, and seconds."
+ "Mismatch in the number of report files for bytes, millis, seconds and query char count."
)
for idx in range(len(bytes_files)):
bytes_file = bytes_files[idx]
millis_file = millis_files[idx]
bq_seconds_file = bq_seconds_files[idx]
+ query_char_count_file = query_char_count_files[idx]
+
filename = bytes_file.relative_to(path).with_suffix("")
if filename != millis_file.relative_to(path).with_suffix(
@@ -136,12 +141,17 @@ def collect_benchmark_result(
lines = file.read().splitlines()
bq_seconds = sum(float(line) for line in lines) / iterations
+ with open(query_char_count_file, "r") as file:
+ lines = file.read().splitlines()
+ query_char_count = sum(int(line) for line in lines) / iterations
+
results_dict[str(filename)] = [
query_count,
total_bytes,
total_slot_millis,
local_seconds,
bq_seconds,
+ query_char_count,
]
finally:
for files_to_remove in (
@@ -149,6 +159,7 @@ def collect_benchmark_result(
path.rglob("*.slotmillis"),
path.rglob("*.local_exec_time_seconds"),
path.rglob("*.bq_exec_time_seconds"),
+ path.rglob("*.query_char_count"),
path.rglob("*.error"),
):
for log_file in files_to_remove:
@@ -160,6 +171,7 @@ def collect_benchmark_result(
"Slot_Millis",
"Local_Execution_Time_Sec",
"BigQuery_Execution_Time_Sec",
+ "Query_Char_Count",
]
benchmark_metrics = pd.DataFrame.from_dict(
@@ -182,15 +194,19 @@ def collect_benchmark_result(
)
print(
f"{index} - query count: {row['Query_Count']},"
+ f" query char count: {row['Query_Char_Count']},",
f" bytes processed sum: {row['Bytes_Processed']},"
f" slot millis sum: {row['Slot_Millis']},"
f" local execution time: {formatted_local_exec_time} seconds,"
- f" bigquery execution time: {round(row['BigQuery_Execution_Time_Sec'], 1)} seconds"
+ f" bigquery execution time: {round(row['BigQuery_Execution_Time_Sec'], 1)} seconds",
)
geometric_mean_queries = geometric_mean_excluding_zeros(
benchmark_metrics["Query_Count"]
)
+ geometric_mean_query_char_count = geometric_mean_excluding_zeros(
+ benchmark_metrics["Query_Char_Count"]
+ )
geometric_mean_bytes = geometric_mean_excluding_zeros(
benchmark_metrics["Bytes_Processed"]
)
@@ -206,6 +222,7 @@ def collect_benchmark_result(
print(
f"---Geometric mean of queries: {geometric_mean_queries}, "
+ f"Geometric mean of queries char counts: {geometric_mean_query_char_count}, "
f"Geometric mean of bytes processed: {geometric_mean_bytes}, "
f"Geometric mean of slot millis: {geometric_mean_slot_millis}, "
f"Geometric mean of local execution time: {geometric_mean_local_seconds} seconds, "
diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py
index 0b87563482..034a266177 100644
--- a/scripts/test_publish_api_coverage.py
+++ b/scripts/test_publish_api_coverage.py
@@ -25,6 +25,10 @@ def api_coverage_df():
return publish_api_coverage.build_api_coverage_table("my_bf_ver", "my_release_ver")
+@pytest.mark.skipif(
+ sys.version_info >= (3, 13),
+ reason="Issues with installing sklearn for this test in python 3.13",
+)
def test_api_coverage_produces_expected_schema(api_coverage_df):
if sys.version.split(".")[:2] == ["3", "9"]:
pytest.skip(
@@ -54,6 +58,10 @@ def test_api_coverage_produces_expected_schema(api_coverage_df):
)
+@pytest.mark.skipif(
+ sys.version_info >= (3, 13),
+ reason="Issues with installing sklearn for this test in python 3.13",
+)
def test_api_coverage_produces_missing_parameters(api_coverage_df):
"""Make sure at least some functions have reported missing parameters."""
assert (api_coverage_df["missing_parameters"].str.len() > 0).any()
diff --git a/scripts/tpch_result_verify.py b/scripts/tpch_result_verify.py
index e241327a4a..0c932f6eac 100644
--- a/scripts/tpch_result_verify.py
+++ b/scripts/tpch_result_verify.py
@@ -24,705 +24,16 @@
project_id = "bigframes-dev-perf"
dataset_id = "tpch_0001g"
-line_item_ds = f"bigframes-dev-perf.{dataset_id}.LINEITEM"
-region_ds = f"bigframes-dev-perf.{dataset_id}.REGION"
-nation_ds = f"bigframes-dev-perf.{dataset_id}.NATION"
-supplier_ds = f"bigframes-dev-perf.{dataset_id}.SUPPLIER"
-part_ds = f"bigframes-dev-perf.{dataset_id}.PART"
-part_supp_ds = f"bigframes-dev-perf.{dataset_id}.PARTSUPP"
-customer_ds = f"bigframes-dev-perf.{dataset_id}.CUSTOMER"
-orders_ds = f"bigframes-dev-perf.{dataset_id}.ORDERS"
-
-q1_query = f"""
- select
- l_returnflag,
- l_linestatus,
- sum(l_quantity) as sum_qty,
- sum(l_extendedprice) as sum_base_price,
- sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
- sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
- avg(l_quantity) as avg_qty,
- avg(l_extendedprice) as avg_price,
- avg(l_discount) as avg_disc,
- count(*) as count_order
- from
- {line_item_ds}
- where
- l_shipdate <= '1998-09-02'
- group by
- l_returnflag,
- l_linestatus
- order by
- l_returnflag,
- l_linestatus
-"""
-
-q2_query = f"""
- select
- s_acctbal,
- s_name,
- n_name,
- p_partkey,
- p_mfgr,
- s_address,
- s_phone,
- s_comment
- from
- {part_ds},
- {supplier_ds},
- {part_supp_ds},
- {nation_ds},
- {region_ds}
- where
- p_partkey = ps_partkey
- and s_suppkey = ps_suppkey
- and p_size = 15
- and p_type like '%BRASS'
- and s_nationkey = n_nationkey
- and n_regionkey = r_regionkey
- and r_name = 'EUROPE'
- and ps_supplycost = (
- select
- min(ps_supplycost)
- from
- {part_supp_ds},
- {supplier_ds},
- {nation_ds},
- {region_ds}
- where
- p_partkey = ps_partkey
- and s_suppkey = ps_suppkey
- and s_nationkey = n_nationkey
- and n_regionkey = r_regionkey
- and r_name = 'EUROPE'
- )
- order by
- s_acctbal desc,
- n_name,
- s_name,
- p_partkey
- limit 100
-"""
-
-q3_query = f"""
- select
- l_orderkey,
- sum(l_extendedprice * (1 - l_discount)) as revenue,
- o_orderdate,
- o_shippriority
- from
- {customer_ds},
- {orders_ds},
- {line_item_ds}
- where
- c_mktsegment = 'BUILDING'
- and c_custkey = o_custkey
- and l_orderkey = o_orderkey
- and o_orderdate < '1995-03-15'
- and l_shipdate > '1995-03-15'
- group by
- l_orderkey,
- o_orderdate,
- o_shippriority
- order by
- revenue desc,
- o_orderdate
- limit 10
-"""
-
-q4_query = f"""
- select
- o_orderpriority,
- count(*) as order_count
- from
- {orders_ds}
- where
- o_orderdate >= date '1993-07-01'
- and o_orderdate < date '1993-10-01'
- and exists (
- select
- *
- from
- {line_item_ds}
- where
- l_orderkey = o_orderkey
- and l_commitdate < l_receiptdate
- )
- group by
- o_orderpriority
- order by
- o_orderpriority
-"""
-
-q5_query = f"""
- select
- n_name,
- sum(l_extendedprice * (1 - l_discount)) as revenue
- from
- {customer_ds},
- {orders_ds},
- {line_item_ds},
- {supplier_ds},
- {nation_ds},
- {region_ds}
- where
- c_custkey = o_custkey
- and l_orderkey = o_orderkey
- and l_suppkey = s_suppkey
- and c_nationkey = s_nationkey
- and s_nationkey = n_nationkey
- and n_regionkey = r_regionkey
- and r_name = 'ASIA'
- and o_orderdate >= date '1994-01-01'
- and o_orderdate < date '1995-01-01'
- group by
- n_name
- order by
- revenue desc
-"""
-
-q6_query = f"""
- select
- sum(l_extendedprice * l_discount) as revenue
- from
- {line_item_ds}
- where
- l_shipdate >= date '1994-01-01'
- and l_shipdate < date '1994-01-01' + interval '1' year
- and l_discount between .05 and .07
- and l_quantity < 24
-"""
-
-q7_query = f"""
- select
- supp_nation,
- cust_nation,
- l_year,
- sum(volume) as revenue
- from
- (
- select
- n1.n_name as supp_nation,
- n2.n_name as cust_nation,
- EXTRACT(YEAR FROM l_shipdate) as l_year,
- l_extendedprice * (1 - l_discount) as volume
- from
- {supplier_ds},
- {line_item_ds},
- {orders_ds},
- {customer_ds},
- {nation_ds} n1,
- {nation_ds} n2
- where
- s_suppkey = l_suppkey
- and o_orderkey = l_orderkey
- and c_custkey = o_custkey
- and s_nationkey = n1.n_nationkey
- and c_nationkey = n2.n_nationkey
- and (
- (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY')
- or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE')
- )
- and l_shipdate between date '1995-01-01' and date '1996-12-31'
- ) as shipping
- group by
- supp_nation,
- cust_nation,
- l_year
- order by
- supp_nation,
- cust_nation,
- l_year
-"""
-
-q8_query = f"""
- select
- o_year,
- round(
- sum(case
- when nation = 'BRAZIL' then volume
- else 0
- end) / sum(volume)
- , 2) as mkt_share
- from
- (
- select
- extract(year from o_orderdate) as o_year,
- l_extendedprice * (1 - l_discount) as volume,
- n2.n_name as nation
- from
- {part_ds},
- {supplier_ds},
- {line_item_ds},
- {orders_ds},
- {customer_ds},
- {nation_ds} n1,
- {nation_ds} n2,
- {region_ds}
- where
- p_partkey = l_partkey
- and s_suppkey = l_suppkey
- and l_orderkey = o_orderkey
- and o_custkey = c_custkey
- and c_nationkey = n1.n_nationkey
- and n1.n_regionkey = r_regionkey
- and r_name = 'AMERICA'
- and s_nationkey = n2.n_nationkey
- and o_orderdate between date '1995-01-01' and date '1996-12-31'
- and p_type = 'ECONOMY ANODIZED STEEL'
- ) as all_nations
- group by
- o_year
- order by
- o_year
-"""
-
-q9_query = f"""
- select
- nation,
- o_year,
- round(sum(amount), 2) as sum_profit
- from
- (
- select
- n_name as nation,
- EXTRACT(YEAR FROM o_orderdate) as o_year,
- l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
- from
- {part_ds},
- {supplier_ds},
- {line_item_ds},
- {part_supp_ds},
- {orders_ds},
- {nation_ds}
- where
- s_suppkey = l_suppkey
- and ps_suppkey = l_suppkey
- and ps_partkey = l_partkey
- and p_partkey = l_partkey
- and o_orderkey = l_orderkey
- and s_nationkey = n_nationkey
- and p_name like '%green%'
- ) as profit
- group by
- nation,
- o_year
- order by
- nation,
- o_year desc
-"""
-
-q10_query = f"""
- select
- c_custkey,
- c_name,
- round(sum(l_extendedprice * (1 - l_discount)), 2) as revenue,
- c_acctbal,
- n_name,
- c_address,
- c_phone,
- c_comment
- from
- {customer_ds},
- {orders_ds},
- {line_item_ds},
- {nation_ds}
- where
- c_custkey = o_custkey
- and l_orderkey = o_orderkey
- and o_orderdate >= date '1993-10-01'
- and o_orderdate < date '1993-10-01' + interval '3' month
- and l_returnflag = 'R'
- and c_nationkey = n_nationkey
- group by
- c_custkey,
- c_name,
- c_acctbal,
- c_phone,
- n_name,
- c_address,
- c_comment
- order by
- revenue desc
- limit 20
-"""
-
-q11_query = f"""
- select
- ps_partkey,
- round(sum(ps_supplycost * ps_availqty), 2) as value
- from
- {part_supp_ds},
- {supplier_ds},
- {nation_ds}
- where
- ps_suppkey = s_suppkey
- and s_nationkey = n_nationkey
- and n_name = 'GERMANY'
- group by
- ps_partkey having
- sum(ps_supplycost * ps_availqty) > (
- select
- sum(ps_supplycost * ps_availqty) * 0.0001
- from
- {part_supp_ds},
- {supplier_ds},
- {nation_ds}
- where
- ps_suppkey = s_suppkey
- and s_nationkey = n_nationkey
- and n_name = 'GERMANY'
- )
- order by
- value desc
-"""
-
-q12_query = f"""
- select
- l_shipmode,
- sum(case
- when o_orderpriority = '1-URGENT'
- or o_orderpriority = '2-HIGH'
- then 1
- else 0
- end) as high_line_count,
- sum(case
- when o_orderpriority <> '1-URGENT'
- and o_orderpriority <> '2-HIGH'
- then 1
- else 0
- end) as low_line_count
- from
- {orders_ds},
- {line_item_ds}
- where
- o_orderkey = l_orderkey
- and l_shipmode in ('MAIL', 'SHIP')
- and l_commitdate < l_receiptdate
- and l_shipdate < l_commitdate
- and l_receiptdate >= date '1994-01-01'
- and l_receiptdate < date '1994-01-01' + interval '1' year
- group by
- l_shipmode
- order by
- l_shipmode
-"""
-
-q13_query = f"""
- SELECT
- c_count, COUNT(*) AS custdist
- FROM (
- SELECT
- c_custkey,
- COUNT(o_orderkey) AS c_count
- FROM
- {customer_ds} LEFT OUTER JOIN {orders_ds} ON
- c_custkey = o_custkey
- AND o_comment NOT LIKE '%special%requests%'
- GROUP BY
- c_custkey
- ) AS c_orders
- GROUP BY
- c_count
- ORDER BY
- custdist DESC,
- c_count DESC
-"""
-
-q14_query = f"""
- select
- round(100.00 * sum(case
- when p_type like 'PROMO%'
- then l_extendedprice * (1 - l_discount)
- else 0
- end) / sum(l_extendedprice * (1 - l_discount)), 2) as promo_revenue
- from
- {line_item_ds},
- {part_ds}
- where
- l_partkey = p_partkey
- and l_shipdate >= date '1995-09-01'
- and l_shipdate < date '1995-09-01' + interval '1' month
-"""
-
-q15_query = f"""
- WITH revenue AS (
- SELECT
- l_suppkey AS supplier_no,
- SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
- FROM
- {line_item_ds}
- WHERE
- l_shipdate >= DATE '1996-01-01'
- AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' month
- GROUP BY
- l_suppkey
- )
- SELECT
- s.s_suppkey,
- s.s_name,
- s.s_address,
- s.s_phone,
- r.total_revenue
- FROM
- {supplier_ds} s
- JOIN
- revenue r ON s.s_suppkey = r.supplier_no
- WHERE
- r.total_revenue = (SELECT MAX(total_revenue) FROM revenue)
- ORDER BY
- s.s_suppkey;
-"""
-
-q16_query = f"""
- select
- p_brand,
- p_type,
- p_size,
- count(distinct ps_suppkey) as supplier_cnt
- from
- {part_supp_ds},
- {part_ds}
- where
- p_partkey = ps_partkey
- and p_brand <> 'Brand#45'
- and p_type not like 'MEDIUM POLISHED%'
- and p_size in (49, 14, 23, 45, 19, 3, 36, 9)
- and ps_suppkey not in (
- select
- s_suppkey
- from
- {supplier_ds}
- where
- s_comment like '%Customer%Complaints%'
- )
- group by
- p_brand,
- p_type,
- p_size
- order by
- supplier_cnt desc,
- p_brand,
- p_type,
- p_size
-"""
-
-q17_query = f"""
- select
- round(sum(l_extendedprice) / 7.0, 2) as avg_yearly
- from
- {line_item_ds},
- {part_ds}
- where
- p_partkey = l_partkey
- and p_brand = 'Brand#23'
- and p_container = 'MED BOX'
- and l_quantity < (
- select
- 0.2 * avg(l_quantity)
- from
- {line_item_ds}
- where
- l_partkey = p_partkey
- )
-"""
-
-q18_query = f"""
- select
- c_name,
- c_custkey,
- o_orderkey,
- o_orderdate as o_orderdat,
- o_totalprice,
- sum(l_quantity) as col6
- from
- {customer_ds},
- {orders_ds},
- {line_item_ds}
- where
- o_orderkey in (
- select
- l_orderkey
- from
- {line_item_ds}
- group by
- l_orderkey having
- sum(l_quantity) > 300
- )
- and c_custkey = o_custkey
- and o_orderkey = l_orderkey
- group by
- c_name,
- c_custkey,
- o_orderkey,
- o_orderdate,
- o_totalprice
- order by
- o_totalprice desc,
- o_orderdate
- limit 100
-"""
-
-q19_query = f"""
- select
- round(sum(l_extendedprice* (1 - l_discount)), 2) as revenue
- from
- {line_item_ds},
- {part_ds}
- where
- (
- p_partkey = l_partkey
- and p_brand = 'Brand#12'
- and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
- and l_quantity >= 1 and l_quantity <= 1 + 10
- and p_size between 1 and 5
- and l_shipmode in ('AIR', 'AIR REG')
- and l_shipinstruct = 'DELIVER IN PERSON'
- )
- or
- (
- p_partkey = l_partkey
- and p_brand = 'Brand#23'
- and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
- and l_quantity >= 10 and l_quantity <= 20
- and p_size between 1 and 10
- and l_shipmode in ('AIR', 'AIR REG')
- and l_shipinstruct = 'DELIVER IN PERSON'
- )
- or
- (
- p_partkey = l_partkey
- and p_brand = 'Brand#34'
- and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
- and l_quantity >= 20 and l_quantity <= 30
- and p_size between 1 and 15
- and l_shipmode in ('AIR', 'AIR REG')
- and l_shipinstruct = 'DELIVER IN PERSON'
- )
-"""
-
-q20_query = f"""
- select
- s_name,
- s_address
- from
- {supplier_ds},
- {nation_ds}
- where
- s_suppkey in (
- select
- ps_suppkey
- from
- {part_supp_ds}
- where
- ps_partkey in (
- select
- p_partkey
- from
- {part_ds}
- where
- p_name like 'forest%'
- )
- and ps_availqty > (
- select
- 0.5 * sum(l_quantity)
- from
- {line_item_ds}
- where
- l_partkey = ps_partkey
- and l_suppkey = ps_suppkey
- and l_shipdate >= date '1994-01-01'
- and l_shipdate < date '1994-01-01' + interval '1' year
- )
- )
- and s_nationkey = n_nationkey
- and n_name = 'CANADA'
- order by
- s_name
-"""
-
-
-q21_query = f"""
- select
- s_name,
- count(*) as numwait
- from
- {supplier_ds},
- {line_item_ds} l1,
- {orders_ds},
- {nation_ds}
- where
- s_suppkey = l1.l_suppkey
- and o_orderkey = l1.l_orderkey
- and o_orderstatus = 'F'
- and l1.l_receiptdate > l1.l_commitdate
- and exists (
- select
- *
- from
- {line_item_ds} l2
- where
- l2.l_orderkey = l1.l_orderkey
- and l2.l_suppkey <> l1.l_suppkey
- )
- and not exists (
- select
- *
- from
- {line_item_ds} l3
- where
- l3.l_orderkey = l1.l_orderkey
- and l3.l_suppkey <> l1.l_suppkey
- and l3.l_receiptdate > l3.l_commitdate
- )
- and s_nationkey = n_nationkey
- and n_name = 'SAUDI ARABIA'
- group by
- s_name
- order by
- numwait desc,
- s_name
- limit 100
-"""
-
-q22_query = f"""
- select
- cntrycode,
- count(*) as numcust,
- sum(c_acctbal) as totacctbal
- from (
- select
- SUBSTR(c_phone, 1, 2) AS cntrycode,
- c_acctbal
- from
- {customer_ds}
- where
- SUBSTR(c_phone, 1, 2) in
- ('13', '31', '23', '29', '30', '18', '17')
- and c_acctbal > (
- select
- avg(c_acctbal)
- from
- {customer_ds}
- where
- c_acctbal > 0.00
- and SUBSTR(c_phone, 1, 2) in
- ('13', '31', '23', '29', '30', '18', '17')
- )
- and not exists (
- select
- *
- from
- {orders_ds}
- where
- o_custkey = c_custkey
- )
- ) as custsale
- group by
- cntrycode
- order by
- cntrycode
-"""
+dataset = {
+ "line_item_ds": f"bigframes-dev-perf.{dataset_id}.LINEITEM",
+ "region_ds": f"bigframes-dev-perf.{dataset_id}.REGION",
+ "nation_ds": f"bigframes-dev-perf.{dataset_id}.NATION",
+ "supplier_ds": f"bigframes-dev-perf.{dataset_id}.SUPPLIER",
+ "part_ds": f"bigframes-dev-perf.{dataset_id}.PART",
+ "part_supp_ds": f"bigframes-dev-perf.{dataset_id}.PARTSUPP",
+ "customer_ds": f"bigframes-dev-perf.{dataset_id}.CUSTOMER",
+ "orders_ds": f"bigframes-dev-perf.{dataset_id}.ORDERS",
+}
def _execute_query(query):
@@ -764,15 +75,21 @@ def verify(query_num=None):
for i in tqdm(range_iter, desc="Processing queries"):
if query_num is not None and i != query_num:
continue
- query_var_name = f"q{i}_query"
- sql_query = globals().get(query_var_name, "Query not defined")
+
+ # Execute SQL:
+ sql_file_path = f"third_party/bigframes_vendored/tpch/sql_queries/q{i}.sql"
+ with open(sql_file_path, "r") as f:
+ sql_query = f.read()
+ sql_query = sql_query.format(**dataset)
file_path = f"third_party/bigframes_vendored/tpch/queries/q{i}.py"
if os.path.exists(file_path):
with open(file_path, "r") as file:
file_content = file.read()
file_content = re.sub(
- r"(\w+)\.to_gbq\(\)", r"return \1.to_pandas()", file_content
+ r"next\((\w+)\.to_pandas_batches\((.*?)\)\)",
+ r"return \1.to_pandas()",
+ file_content,
)
file_content = re.sub(r"_\s*=\s*(\w+)", r"return \1", file_content)
sql_result = _execute_query(sql_query)
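
This refactor replaces the twenty-two module-level f-string queries with templates under third_party/bigframes_vendored/tpch/sql_queries/, resolved through the dataset mapping, and rewrites each vendored benchmark body so it returns its result instead of draining it through to_pandas_batches. A minimal sketch of both steps, using inline stand-ins for the real template and script files:

import re

dataset = {"line_item_ds": "bigframes-dev-perf.tpch_0001g.LINEITEM"}

# Stand-in for the contents of sql_queries/q1.sql: named placeholders
# replace the removed module-level f-strings.
sql_template = "select l_returnflag, count(*) from {line_item_ds} group by l_returnflag"

# str.format(**dataset) substitutes every placeholder, reproducing the
# fully qualified table ids the deleted constants hard-coded.
sql_query = sql_template.format(**dataset)
assert "LINEITEM" in sql_query

# Stand-in for a vendored benchmark body: the benchmark pages through
# results, but verification needs the frame itself, so the call is
# rewritten into a return statement before the script is executed.
file_content = "next(result.to_pandas_batches(100))"
file_content = re.sub(
    r"next\((\w+)\.to_pandas_batches\((.*?)\)\)",
    r"return \1.to_pandas()",
    file_content,
)
assert file_content == "return result.to_pandas()"

# The second substitution handles bodies that end with a bare `_ = df`.
assert re.sub(r"_\s*=\s*(\w+)", r"return \1", "_ = result") == "return result"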
diff --git a/setup.py b/setup.py
index 047da2348c..4386177a5e 100644
--- a/setup.py
+++ b/setup.py
@@ -126,6 +126,7 @@
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
"Operating System :: OS Independent",
"Topic :: Internet",
],
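
Trove classifiers are advertisement-only metadata on PyPI; installation is gated by python_requires, not by this list. A sketch of where the new entry sits, with names and versions here standing in for the real metadata in setup.py (the ">=3.9" floor is assumed from the CI matrix, not quoted from the file):

from setuptools import setup

setup(
    name="bigframes",  # illustrative; the real setup.py carries full metadata
    version="0.0.0",  # placeholder
    python_requires=">=3.9",  # this, not the classifiers, gates installs
    classifiers=[
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",  # newly advertised
    ],
)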
diff --git a/tests/benchmark/tpch/config.jsonl b/tests/benchmark/tpch/config.jsonl
index e6f7a444f6..779b0fe2d7 100644
--- a/tests/benchmark/tpch/config.jsonl
+++ b/tests/benchmark/tpch/config.jsonl
@@ -6,5 +6,3 @@
{"benchmark_suffix": "100g_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0100g", "ordered": false}
{"benchmark_suffix": "1t_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0001t", "ordered": true}
{"benchmark_suffix": "1t_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0001t", "ordered": false}
-{"benchmark_suffix": "10t_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0010t", "ordered": true}
-{"benchmark_suffix": "10t_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0010t", "ordered": false}
diff --git a/tests/data/time_series.jsonl b/tests/data/time_series.jsonl
index e0f9ca7ae2..329e5a8b61 100644
--- a/tests/data/time_series.jsonl
+++ b/tests/data/time_series.jsonl
@@ -1,366 +1,732 @@
-{"parsed_date":"2017-07-01 00:00:00 UTC","total_visits":"2048"}
-{"parsed_date":"2016-09-07 00:00:00 UTC","total_visits":"2562"}
-{"parsed_date":"2016-10-25 00:00:00 UTC","total_visits":"3842"}
-{"parsed_date":"2017-04-10 00:00:00 UTC","total_visits":"2563"}
-{"parsed_date":"2017-01-09 00:00:00 UTC","total_visits":"2308"}
-{"parsed_date":"2017-05-02 00:00:00 UTC","total_visits":"2564"}
-{"parsed_date":"2016-11-11 00:00:00 UTC","total_visits":"3588"}
-{"parsed_date":"2017-07-30 00:00:00 UTC","total_visits":"1799"}
-{"parsed_date":"2017-06-10 00:00:00 UTC","total_visits":"1545"}
-{"parsed_date":"2016-08-14 00:00:00 UTC","total_visits":"1801"}
-{"parsed_date":"2017-05-14 00:00:00 UTC","total_visits":"1290"}
-{"parsed_date":"2017-02-08 00:00:00 UTC","total_visits":"2570"}
-{"parsed_date":"2017-06-01 00:00:00 UTC","total_visits":"2826"}
-{"parsed_date":"2017-04-23 00:00:00 UTC","total_visits":"1548"}
-{"parsed_date":"2016-11-04 00:00:00 UTC","total_visits":"3596"}
-{"parsed_date":"2017-02-04 00:00:00 UTC","total_visits":"1549"}
-{"parsed_date":"2016-12-09 00:00:00 UTC","total_visits":"2830"}
-{"parsed_date":"2016-10-30 00:00:00 UTC","total_visits":"3086"}
-{"parsed_date":"2017-03-28 00:00:00 UTC","total_visits":"2577"}
-{"parsed_date":"2017-06-11 00:00:00 UTC","total_visits":"1555"}
-{"parsed_date":"2016-12-17 00:00:00 UTC","total_visits":"2324"}
-{"parsed_date":"2016-09-22 00:00:00 UTC","total_visits":"2581"}
-{"parsed_date":"2017-01-29 00:00:00 UTC","total_visits":"1814"}
-{"parsed_date":"2017-03-22 00:00:00 UTC","total_visits":"2582"}
-{"parsed_date":"2017-02-21 00:00:00 UTC","total_visits":"2582"}
-{"parsed_date":"2016-10-14 00:00:00 UTC","total_visits":"2838"}
-{"parsed_date":"2017-04-27 00:00:00 UTC","total_visits":"2838"}
-{"parsed_date":"2016-10-26 00:00:00 UTC","total_visits":"4375"}
-{"parsed_date":"2016-08-22 00:00:00 UTC","total_visits":"2584"}
-{"parsed_date":"2016-12-07 00:00:00 UTC","total_visits":"2840"}
-{"parsed_date":"2017-01-20 00:00:00 UTC","total_visits":"2074"}
-{"parsed_date":"2017-03-07 00:00:00 UTC","total_visits":"2586"}
-{"parsed_date":"2017-05-16 00:00:00 UTC","total_visits":"3098"}
-{"parsed_date":"2017-05-03 00:00:00 UTC","total_visits":"2588"}
-{"parsed_date":"2017-05-01 00:00:00 UTC","total_visits":"2588"}
-{"parsed_date":"2016-11-27 00:00:00 UTC","total_visits":"3356"}
-{"parsed_date":"2017-04-29 00:00:00 UTC","total_visits":"1566"}
-{"parsed_date":"2016-09-18 00:00:00 UTC","total_visits":"1822"}
-{"parsed_date":"2017-03-23 00:00:00 UTC","total_visits":"2847"}
-{"parsed_date":"2017-03-14 00:00:00 UTC","total_visits":"2338"}
-{"parsed_date":"2016-12-21 00:00:00 UTC","total_visits":"2594"}
-{"parsed_date":"2016-10-11 00:00:00 UTC","total_visits":"2850"}
-{"parsed_date":"2017-01-24 00:00:00 UTC","total_visits":"3618"}
-{"parsed_date":"2017-03-05 00:00:00 UTC","total_visits":"1827"}
-{"parsed_date":"2017-01-19 00:00:00 UTC","total_visits":"2083"}
-{"parsed_date":"2016-08-09 00:00:00 UTC","total_visits":"2851"}
-{"parsed_date":"2017-04-08 00:00:00 UTC","total_visits":"1829"}
-{"parsed_date":"2017-04-12 00:00:00 UTC","total_visits":"2341"}
-{"parsed_date":"2016-09-29 00:00:00 UTC","total_visits":"2597"}
-{"parsed_date":"2016-12-20 00:00:00 UTC","total_visits":"3110"}
-{"parsed_date":"2017-01-15 00:00:00 UTC","total_visits":"1576"}
-{"parsed_date":"2017-04-14 00:00:00 UTC","total_visits":"1834"}
-{"parsed_date":"2017-02-28 00:00:00 UTC","total_visits":"2347"}
-{"parsed_date":"2016-09-16 00:00:00 UTC","total_visits":"2603"}
-{"parsed_date":"2016-10-18 00:00:00 UTC","total_visits":"3628"}
-{"parsed_date":"2017-02-24 00:00:00 UTC","total_visits":"2093"}
-{"parsed_date":"2017-05-17 00:00:00 UTC","total_visits":"3117"}
-{"parsed_date":"2017-06-23 00:00:00 UTC","total_visits":"2095"}
-{"parsed_date":"2016-11-12 00:00:00 UTC","total_visits":"3119"}
-{"parsed_date":"2016-11-21 00:00:00 UTC","total_visits":"4143"}
-{"parsed_date":"2017-02-27 00:00:00 UTC","total_visits":"2352"}
-{"parsed_date":"2016-12-26 00:00:00 UTC","total_visits":"1586"}
-{"parsed_date":"2017-04-25 00:00:00 UTC","total_visits":"2354"}
-{"parsed_date":"2017-03-21 00:00:00 UTC","total_visits":"2611"}
-{"parsed_date":"2016-12-22 00:00:00 UTC","total_visits":"2100"}
-{"parsed_date":"2016-10-01 00:00:00 UTC","total_visits":"1589"}
-{"parsed_date":"2016-09-24 00:00:00 UTC","total_visits":"1845"}
-{"parsed_date":"2017-06-21 00:00:00 UTC","total_visits":"2357"}
-{"parsed_date":"2016-09-02 00:00:00 UTC","total_visits":"2613"}
-{"parsed_date":"2016-08-26 00:00:00 UTC","total_visits":"2359"}
-{"parsed_date":"2016-10-12 00:00:00 UTC","total_visits":"2871"}
-{"parsed_date":"2017-05-15 00:00:00 UTC","total_visits":"2360"}
-{"parsed_date":"2017-06-12 00:00:00 UTC","total_visits":"2361"}
-{"parsed_date":"2016-08-16 00:00:00 UTC","total_visits":"2873"}
-{"parsed_date":"2017-04-30 00:00:00 UTC","total_visits":"1594"}
-{"parsed_date":"2017-04-05 00:00:00 UTC","total_visits":"2619"}
-{"parsed_date":"2016-08-12 00:00:00 UTC","total_visits":"2619"}
-{"parsed_date":"2016-11-08 00:00:00 UTC","total_visits":"3899"}
-{"parsed_date":"2016-08-13 00:00:00 UTC","total_visits":"1596"}
-{"parsed_date":"2017-05-09 00:00:00 UTC","total_visits":"2108"}
-{"parsed_date":"2017-02-23 00:00:00 UTC","total_visits":"2364"}
-{"parsed_date":"2017-07-31 00:00:00 UTC","total_visits":"2620"}
-{"parsed_date":"2017-06-25 00:00:00 UTC","total_visits":"1597"}
-{"parsed_date":"2017-07-29 00:00:00 UTC","total_visits":"1597"}
-{"parsed_date":"2016-09-17 00:00:00 UTC","total_visits":"1853"}
-{"parsed_date":"2016-12-27 00:00:00 UTC","total_visits":"1855"}
-{"parsed_date":"2017-05-20 00:00:00 UTC","total_visits":"1855"}
-{"parsed_date":"2016-10-08 00:00:00 UTC","total_visits":"2114"}
-{"parsed_date":"2016-10-27 00:00:00 UTC","total_visits":"4162"}
-{"parsed_date":"2017-07-08 00:00:00 UTC","total_visits":"1859"}
-{"parsed_date":"2016-08-24 00:00:00 UTC","total_visits":"2627"}
-{"parsed_date":"2016-12-23 00:00:00 UTC","total_visits":"1604"}
-{"parsed_date":"2017-02-02 00:00:00 UTC","total_visits":"2372"}
-{"parsed_date":"2016-09-08 00:00:00 UTC","total_visits":"2628"}
-{"parsed_date":"2017-04-02 00:00:00 UTC","total_visits":"1861"}
-{"parsed_date":"2017-02-15 00:00:00 UTC","total_visits":"2629"}
-{"parsed_date":"2017-07-05 00:00:00 UTC","total_visits":"2885"}
-{"parsed_date":"2016-10-17 00:00:00 UTC","total_visits":"3397"}
-{"parsed_date":"2017-02-20 00:00:00 UTC","total_visits":"2374"}
-{"parsed_date":"2017-03-24 00:00:00 UTC","total_visits":"2374"}
-{"parsed_date":"2017-04-20 00:00:00 UTC","total_visits":"2374"}
-{"parsed_date":"2016-11-18 00:00:00 UTC","total_visits":"3654"}
-{"parsed_date":"2017-07-25 00:00:00 UTC","total_visits":"2631"}
-{"parsed_date":"2016-11-13 00:00:00 UTC","total_visits":"3144"}
-{"parsed_date":"2017-03-18 00:00:00 UTC","total_visits":"1610"}
-{"parsed_date":"2016-08-03 00:00:00 UTC","total_visits":"2890"}
-{"parsed_date":"2016-08-19 00:00:00 UTC","total_visits":"2379"}
-{"parsed_date":"2017-02-14 00:00:00 UTC","total_visits":"2379"}
-{"parsed_date":"2017-07-11 00:00:00 UTC","total_visits":"2635"}
-{"parsed_date":"2017-04-22 00:00:00 UTC","total_visits":"1612"}
-{"parsed_date":"2016-10-07 00:00:00 UTC","total_visits":"2892"}
-{"parsed_date":"2016-09-05 00:00:00 UTC","total_visits":"2125"}
-{"parsed_date":"2016-09-23 00:00:00 UTC","total_visits":"2381"}
-{"parsed_date":"2016-11-15 00:00:00 UTC","total_visits":"4685"}
-{"parsed_date":"2017-01-28 00:00:00 UTC","total_visits":"1614"}
-{"parsed_date":"2017-07-14 00:00:00 UTC","total_visits":"2382"}
-{"parsed_date":"2017-01-07 00:00:00 UTC","total_visits":"1615"}
-{"parsed_date":"2017-04-03 00:00:00 UTC","total_visits":"2383"}
-{"parsed_date":"2017-03-20 00:00:00 UTC","total_visits":"2383"}
-{"parsed_date":"2016-12-18 00:00:00 UTC","total_visits":"2128"}
-{"parsed_date":"2017-03-17 00:00:00 UTC","total_visits":"2129"}
-{"parsed_date":"2017-05-23 00:00:00 UTC","total_visits":"2129"}
-{"parsed_date":"2016-11-30 00:00:00 UTC","total_visits":"4435"}
-{"parsed_date":"2017-01-01 00:00:00 UTC","total_visits":"1364"}
-{"parsed_date":"2017-01-02 00:00:00 UTC","total_visits":"1620"}
-{"parsed_date":"2016-09-25 00:00:00 UTC","total_visits":"1877"}
-{"parsed_date":"2016-08-07 00:00:00 UTC","total_visits":"1622"}
-{"parsed_date":"2016-10-09 00:00:00 UTC","total_visits":"2134"}
-{"parsed_date":"2017-03-01 00:00:00 UTC","total_visits":"2390"}
-{"parsed_date":"2017-01-04 00:00:00 UTC","total_visits":"2390"}
-{"parsed_date":"2017-06-06 00:00:00 UTC","total_visits":"2391"}
-{"parsed_date":"2017-04-18 00:00:00 UTC","total_visits":"2391"}
-{"parsed_date":"2017-04-06 00:00:00 UTC","total_visits":"2647"}
-{"parsed_date":"2017-01-30 00:00:00 UTC","total_visits":"2392"}
-{"parsed_date":"2016-10-16 00:00:00 UTC","total_visits":"2649"}
-{"parsed_date":"2016-08-04 00:00:00 UTC","total_visits":"3161"}
-{"parsed_date":"2016-10-21 00:00:00 UTC","total_visits":"3419"}
-{"parsed_date":"2016-08-02 00:00:00 UTC","total_visits":"2140"}
-{"parsed_date":"2017-03-06 00:00:00 UTC","total_visits":"2396"}
-{"parsed_date":"2016-09-13 00:00:00 UTC","total_visits":"2396"}
-{"parsed_date":"2016-09-14 00:00:00 UTC","total_visits":"2652"}
-{"parsed_date":"2017-04-19 00:00:00 UTC","total_visits":"2397"}
-{"parsed_date":"2017-06-19 00:00:00 UTC","total_visits":"2142"}
-{"parsed_date":"2016-12-13 00:00:00 UTC","total_visits":"3166"}
-{"parsed_date":"2017-06-20 00:00:00 UTC","total_visits":"2143"}
-{"parsed_date":"2016-10-10 00:00:00 UTC","total_visits":"2911"}
-{"parsed_date":"2017-07-06 00:00:00 UTC","total_visits":"2658"}
-{"parsed_date":"2017-01-03 00:00:00 UTC","total_visits":"2403"}
-{"parsed_date":"2017-01-08 00:00:00 UTC","total_visits":"1637"}
-{"parsed_date":"2017-02-25 00:00:00 UTC","total_visits":"1638"}
-{"parsed_date":"2017-05-24 00:00:00 UTC","total_visits":"2406"}
-{"parsed_date":"2016-11-22 00:00:00 UTC","total_visits":"3942"}
-{"parsed_date":"2017-05-06 00:00:00 UTC","total_visits":"1383"}
-{"parsed_date":"2017-07-02 00:00:00 UTC","total_visits":"1895"}
-{"parsed_date":"2016-12-01 00:00:00 UTC","total_visits":"4200"}
-{"parsed_date":"2017-03-16 00:00:00 UTC","total_visits":"2409"}
-{"parsed_date":"2016-12-12 00:00:00 UTC","total_visits":"3433"}
-{"parsed_date":"2016-12-25 00:00:00 UTC","total_visits":"1386"}
-{"parsed_date":"2017-02-26 00:00:00 UTC","total_visits":"1643"}
-{"parsed_date":"2017-04-28 00:00:00 UTC","total_visits":"2411"}
-{"parsed_date":"2016-08-11 00:00:00 UTC","total_visits":"2667"}
-{"parsed_date":"2017-07-20 00:00:00 UTC","total_visits":"2668"}
-{"parsed_date":"2017-05-21 00:00:00 UTC","total_visits":"1645"}
-{"parsed_date":"2017-06-17 00:00:00 UTC","total_visits":"1391"}
-{"parsed_date":"2016-12-29 00:00:00 UTC","total_visits":"1647"}
-{"parsed_date":"2017-07-17 00:00:00 UTC","total_visits":"2671"}
-{"parsed_date":"2017-01-16 00:00:00 UTC","total_visits":"1906"}
-{"parsed_date":"2017-03-03 00:00:00 UTC","total_visits":"2162"}
-{"parsed_date":"2016-11-14 00:00:00 UTC","total_visits":"4466"}
-{"parsed_date":"2016-08-30 00:00:00 UTC","total_visits":"2675"}
-{"parsed_date":"2016-08-27 00:00:00 UTC","total_visits":"1654"}
-{"parsed_date":"2017-02-09 00:00:00 UTC","total_visits":"2678"}
-{"parsed_date":"2017-06-03 00:00:00 UTC","total_visits":"1399"}
-{"parsed_date":"2017-05-07 00:00:00 UTC","total_visits":"1400"}
-{"parsed_date":"2016-11-02 00:00:00 UTC","total_visits":"3960"}
-{"parsed_date":"2016-12-15 00:00:00 UTC","total_visits":"2937"}
-{"parsed_date":"2017-04-01 00:00:00 UTC","total_visits":"2170"}
-{"parsed_date":"2017-07-21 00:00:00 UTC","total_visits":"2427"}
-{"parsed_date":"2016-08-06 00:00:00 UTC","total_visits":"1663"}
-{"parsed_date":"2016-09-01 00:00:00 UTC","total_visits":"2687"}
-{"parsed_date":"2017-06-28 00:00:00 UTC","total_visits":"2687"}
-{"parsed_date":"2016-08-20 00:00:00 UTC","total_visits":"1664"}
-{"parsed_date":"2017-04-26 00:00:00 UTC","total_visits":"4224"}
-{"parsed_date":"2017-07-09 00:00:00 UTC","total_visits":"1921"}
-{"parsed_date":"2017-07-28 00:00:00 UTC","total_visits":"2433"}
-{"parsed_date":"2016-09-19 00:00:00 UTC","total_visits":"2689"}
-{"parsed_date":"2017-07-24 00:00:00 UTC","total_visits":"2436"}
-{"parsed_date":"2017-06-13 00:00:00 UTC","total_visits":"2181"}
-{"parsed_date":"2016-09-15 00:00:00 UTC","total_visits":"2949"}
-{"parsed_date":"2017-02-03 00:00:00 UTC","total_visits":"2182"}
-{"parsed_date":"2016-09-10 00:00:00 UTC","total_visits":"1671"}
-{"parsed_date":"2017-06-09 00:00:00 UTC","total_visits":"1927"}
-{"parsed_date":"2017-01-11 00:00:00 UTC","total_visits":"2185"}
-{"parsed_date":"2017-02-19 00:00:00 UTC","total_visits":"2187"}
-{"parsed_date":"2017-01-17 00:00:00 UTC","total_visits":"2443"}
-{"parsed_date":"2017-05-12 00:00:00 UTC","total_visits":"1932"}
-{"parsed_date":"2016-12-16 00:00:00 UTC","total_visits":"2956"}
-{"parsed_date":"2017-02-01 00:00:00 UTC","total_visits":"2445"}
-{"parsed_date":"2016-11-26 00:00:00 UTC","total_visits":"3213"}
-{"parsed_date":"2017-06-02 00:00:00 UTC","total_visits":"2190"}
-{"parsed_date":"2016-08-05 00:00:00 UTC","total_visits":"2702"}
-{"parsed_date":"2016-11-01 00:00:00 UTC","total_visits":"3728"}
-{"parsed_date":"2017-01-05 00:00:00 UTC","total_visits":"2193"}
-{"parsed_date":"2017-03-08 00:00:00 UTC","total_visits":"2449"}
-{"parsed_date":"2016-08-28 00:00:00 UTC","total_visits":"1682"}
-{"parsed_date":"2017-07-04 00:00:00 UTC","total_visits":"1938"}
-{"parsed_date":"2017-03-10 00:00:00 UTC","total_visits":"2194"}
-{"parsed_date":"2017-07-07 00:00:00 UTC","total_visits":"2450"}
-{"parsed_date":"2016-10-29 00:00:00 UTC","total_visits":"2964"}
-{"parsed_date":"2016-10-13 00:00:00 UTC","total_visits":"2964"}
-{"parsed_date":"2016-12-04 00:00:00 UTC","total_visits":"3220"}
-{"parsed_date":"2017-01-21 00:00:00 UTC","total_visits":"1685"}
-{"parsed_date":"2017-06-29 00:00:00 UTC","total_visits":"2709"}
-{"parsed_date":"2016-08-29 00:00:00 UTC","total_visits":"2454"}
-{"parsed_date":"2016-12-19 00:00:00 UTC","total_visits":"3222"}
-{"parsed_date":"2017-05-30 00:00:00 UTC","total_visits":"2199"}
-{"parsed_date":"2017-02-10 00:00:00 UTC","total_visits":"2199"}
-{"parsed_date":"2016-08-31 00:00:00 UTC","total_visits":"3223"}
-{"parsed_date":"2017-06-18 00:00:00 UTC","total_visits":"1432"}
-{"parsed_date":"2017-01-12 00:00:00 UTC","total_visits":"2203"}
-{"parsed_date":"2017-05-18 00:00:00 UTC","total_visits":"2715"}
-{"parsed_date":"2016-10-23 00:00:00 UTC","total_visits":"2971"}
-{"parsed_date":"2016-09-04 00:00:00 UTC","total_visits":"1692"}
-{"parsed_date":"2016-12-10 00:00:00 UTC","total_visits":"2207"}
-{"parsed_date":"2016-12-11 00:00:00 UTC","total_visits":"2208"}
-{"parsed_date":"2017-04-11 00:00:00 UTC","total_visits":"2464"}
-{"parsed_date":"2016-09-21 00:00:00 UTC","total_visits":"2720"}
-{"parsed_date":"2016-11-06 00:00:00 UTC","total_visits":"3232"}
-{"parsed_date":"2017-01-26 00:00:00 UTC","total_visits":"2209"}
-{"parsed_date":"2016-09-12 00:00:00 UTC","total_visits":"2465"}
-{"parsed_date":"2017-04-21 00:00:00 UTC","total_visits":"2210"}
-{"parsed_date":"2017-01-06 00:00:00 UTC","total_visits":"2210"}
-{"parsed_date":"2017-04-04 00:00:00 UTC","total_visits":"2978"}
-{"parsed_date":"2017-01-22 00:00:00 UTC","total_visits":"1700"}
-{"parsed_date":"2017-07-26 00:00:00 UTC","total_visits":"2725"}
-{"parsed_date":"2016-08-18 00:00:00 UTC","total_visits":"2725"}
-{"parsed_date":"2016-09-27 00:00:00 UTC","total_visits":"2727"}
-{"parsed_date":"2016-12-02 00:00:00 UTC","total_visits":"3751"}
-{"parsed_date":"2017-05-05 00:00:00 UTC","total_visits":"1960"}
-{"parsed_date":"2016-11-19 00:00:00 UTC","total_visits":"2984"}
-{"parsed_date":"2016-11-09 00:00:00 UTC","total_visits":"3752"}
-{"parsed_date":"2016-12-05 00:00:00 UTC","total_visits":"4265"}
-{"parsed_date":"2017-05-11 00:00:00 UTC","total_visits":"2218"}
-{"parsed_date":"2017-01-25 00:00:00 UTC","total_visits":"2986"}
-{"parsed_date":"2017-03-11 00:00:00 UTC","total_visits":"1707"}
-{"parsed_date":"2017-03-30 00:00:00 UTC","total_visits":"2731"}
-{"parsed_date":"2016-10-20 00:00:00 UTC","total_visits":"3755"}
-{"parsed_date":"2017-02-07 00:00:00 UTC","total_visits":"2476"}
-{"parsed_date":"2017-02-22 00:00:00 UTC","total_visits":"2477"}
-{"parsed_date":"2017-07-23 00:00:00 UTC","total_visits":"1966"}
-{"parsed_date":"2016-11-03 00:00:00 UTC","total_visits":"4014"}
-{"parsed_date":"2016-08-01 00:00:00 UTC","total_visits":"1711"}
-{"parsed_date":"2017-01-13 00:00:00 UTC","total_visits":"1967"}
-{"parsed_date":"2017-05-19 00:00:00 UTC","total_visits":"2223"}
-{"parsed_date":"2016-11-20 00:00:00 UTC","total_visits":"3247"}
-{"parsed_date":"2016-11-25 00:00:00 UTC","total_visits":"3759"}
-{"parsed_date":"2017-03-25 00:00:00 UTC","total_visits":"1712"}
-{"parsed_date":"2017-01-27 00:00:00 UTC","total_visits":"1969"}
-{"parsed_date":"2017-06-26 00:00:00 UTC","total_visits":"2226"}
-{"parsed_date":"2017-05-25 00:00:00 UTC","total_visits":"2228"}
-{"parsed_date":"2017-01-31 00:00:00 UTC","total_visits":"2229"}
-{"parsed_date":"2017-07-13 00:00:00 UTC","total_visits":"2741"}
-{"parsed_date":"2017-03-15 00:00:00 UTC","total_visits":"2486"}
-{"parsed_date":"2017-05-28 00:00:00 UTC","total_visits":"1463"}
-{"parsed_date":"2017-03-09 00:00:00 UTC","total_visits":"2231"}
-{"parsed_date":"2017-07-15 00:00:00 UTC","total_visits":"1721"}
-{"parsed_date":"2016-11-24 00:00:00 UTC","total_visits":"3770"}
-{"parsed_date":"2016-10-05 00:00:00 UTC","total_visits":"3770"}
-{"parsed_date":"2016-12-31 00:00:00 UTC","total_visits":"1211"}
-{"parsed_date":"2016-10-02 00:00:00 UTC","total_visits":"1724"}
-{"parsed_date":"2017-07-22 00:00:00 UTC","total_visits":"1724"}
-{"parsed_date":"2016-09-11 00:00:00 UTC","total_visits":"1725"}
-{"parsed_date":"2017-06-15 00:00:00 UTC","total_visits":"2237"}
-{"parsed_date":"2017-06-05 00:00:00 UTC","total_visits":"2493"}
-{"parsed_date":"2017-02-06 00:00:00 UTC","total_visits":"2238"}
-{"parsed_date":"2016-10-15 00:00:00 UTC","total_visits":"2495"}
-{"parsed_date":"2016-08-21 00:00:00 UTC","total_visits":"1730"}
-{"parsed_date":"2016-08-23 00:00:00 UTC","total_visits":"2754"}
-{"parsed_date":"2017-06-30 00:00:00 UTC","total_visits":"2499"}
-{"parsed_date":"2017-01-18 00:00:00 UTC","total_visits":"2245"}
-{"parsed_date":"2016-08-10 00:00:00 UTC","total_visits":"2757"}
-{"parsed_date":"2016-12-08 00:00:00 UTC","total_visits":"3013"}
-{"parsed_date":"2016-11-28 00:00:00 UTC","total_visits":"4807"}
-{"parsed_date":"2017-05-22 00:00:00 UTC","total_visits":"2248"}
-{"parsed_date":"2016-09-20 00:00:00 UTC","total_visits":"2760"}
-{"parsed_date":"2016-10-06 00:00:00 UTC","total_visits":"3016"}
-{"parsed_date":"2016-09-06 00:00:00 UTC","total_visits":"2508"}
-{"parsed_date":"2016-09-03 00:00:00 UTC","total_visits":"1741"}
-{"parsed_date":"2016-12-06 00:00:00 UTC","total_visits":"3021"}
-{"parsed_date":"2016-12-24 00:00:00 UTC","total_visits":"1231"}
-{"parsed_date":"2016-10-28 00:00:00 UTC","total_visits":"3791"}
-{"parsed_date":"2016-12-30 00:00:00 UTC","total_visits":"1232"}
-{"parsed_date":"2017-05-29 00:00:00 UTC","total_visits":"1745"}
-{"parsed_date":"2017-07-10 00:00:00 UTC","total_visits":"2769"}
-{"parsed_date":"2017-06-22 00:00:00 UTC","total_visits":"2258"}
-{"parsed_date":"2017-07-19 00:00:00 UTC","total_visits":"2514"}
-{"parsed_date":"2016-10-03 00:00:00 UTC","total_visits":"2514"}
-{"parsed_date":"2017-06-14 00:00:00 UTC","total_visits":"2517"}
-{"parsed_date":"2016-10-22 00:00:00 UTC","total_visits":"3029"}
-{"parsed_date":"2017-01-23 00:00:00 UTC","total_visits":"2262"}
-{"parsed_date":"2017-04-24 00:00:00 UTC","total_visits":"2263"}
-{"parsed_date":"2016-11-10 00:00:00 UTC","total_visits":"4055"}
-{"parsed_date":"2016-09-26 00:00:00 UTC","total_visits":"2776"}
-{"parsed_date":"2016-10-19 00:00:00 UTC","total_visits":"3544"}
-{"parsed_date":"2017-03-04 00:00:00 UTC","total_visits":"1753"}
-{"parsed_date":"2017-05-26 00:00:00 UTC","total_visits":"2009"}
-{"parsed_date":"2017-02-13 00:00:00 UTC","total_visits":"2266"}
-{"parsed_date":"2017-02-18 00:00:00 UTC","total_visits":"1755"}
-{"parsed_date":"2017-03-02 00:00:00 UTC","total_visits":"2267"}
-{"parsed_date":"2017-03-31 00:00:00 UTC","total_visits":"2268"}
-{"parsed_date":"2017-01-10 00:00:00 UTC","total_visits":"2268"}
-{"parsed_date":"2017-03-29 00:00:00 UTC","total_visits":"2525"}
-{"parsed_date":"2017-03-27 00:00:00 UTC","total_visits":"2525"}
-{"parsed_date":"2016-11-23 00:00:00 UTC","total_visits":"3805"}
-{"parsed_date":"2017-05-27 00:00:00 UTC","total_visits":"1502"}
-{"parsed_date":"2016-10-24 00:00:00 UTC","total_visits":"4063"}
-{"parsed_date":"2016-12-14 00:00:00 UTC","total_visits":"3040"}
-{"parsed_date":"2017-02-11 00:00:00 UTC","total_visits":"1761"}
-{"parsed_date":"2017-07-27 00:00:00 UTC","total_visits":"2529"}
-{"parsed_date":"2017-02-17 00:00:00 UTC","total_visits":"2785"}
-{"parsed_date":"2017-04-15 00:00:00 UTC","total_visits":"1506"}
-{"parsed_date":"2016-11-05 00:00:00 UTC","total_visits":"3042"}
-{"parsed_date":"2016-10-04 00:00:00 UTC","total_visits":"4322"}
-{"parsed_date":"2017-05-13 00:00:00 UTC","total_visits":"1251"}
-{"parsed_date":"2017-04-16 00:00:00 UTC","total_visits":"1507"}
-{"parsed_date":"2016-12-28 00:00:00 UTC","total_visits":"1763"}
-{"parsed_date":"2016-08-15 00:00:00 UTC","total_visits":"3043"}
-{"parsed_date":"2016-12-03 00:00:00 UTC","total_visits":"3044"}
-{"parsed_date":"2017-06-27 00:00:00 UTC","total_visits":"2789"}
-{"parsed_date":"2017-06-24 00:00:00 UTC","total_visits":"1510"}
-{"parsed_date":"2017-07-16 00:00:00 UTC","total_visits":"1766"}
-{"parsed_date":"2017-04-09 00:00:00 UTC","total_visits":"1766"}
-{"parsed_date":"2017-06-07 00:00:00 UTC","total_visits":"2279"}
-{"parsed_date":"2017-04-17 00:00:00 UTC","total_visits":"2279"}
-{"parsed_date":"2016-09-28 00:00:00 UTC","total_visits":"2535"}
-{"parsed_date":"2017-03-26 00:00:00 UTC","total_visits":"1768"}
-{"parsed_date":"2017-05-10 00:00:00 UTC","total_visits":"2024"}
-{"parsed_date":"2017-06-08 00:00:00 UTC","total_visits":"2280"}
-{"parsed_date":"2017-05-08 00:00:00 UTC","total_visits":"2025"}
-{"parsed_date":"2017-03-13 00:00:00 UTC","total_visits":"2537"}
-{"parsed_date":"2016-11-17 00:00:00 UTC","total_visits":"4074"}
-{"parsed_date":"2016-08-25 00:00:00 UTC","total_visits":"2539"}
-{"parsed_date":"2017-02-16 00:00:00 UTC","total_visits":"2539"}
-{"parsed_date":"2017-06-16 00:00:00 UTC","total_visits":"2028"}
-{"parsed_date":"2016-11-16 00:00:00 UTC","total_visits":"4334"}
-{"parsed_date":"2016-08-17 00:00:00 UTC","total_visits":"2799"}
-{"parsed_date":"2017-03-19 00:00:00 UTC","total_visits":"1776"}
-{"parsed_date":"2016-11-29 00:00:00 UTC","total_visits":"4337"}
-{"parsed_date":"2017-02-05 00:00:00 UTC","total_visits":"1522"}
-{"parsed_date":"2016-10-31 00:00:00 UTC","total_visits":"3827"}
-{"parsed_date":"2017-05-31 00:00:00 UTC","total_visits":"2292"}
-{"parsed_date":"2017-07-18 00:00:00 UTC","total_visits":"2804"}
-{"parsed_date":"2017-03-12 00:00:00 UTC","total_visits":"1781"}
-{"parsed_date":"2016-09-09 00:00:00 UTC","total_visits":"2549"}
-{"parsed_date":"2017-01-14 00:00:00 UTC","total_visits":"1526"}
-{"parsed_date":"2017-05-04 00:00:00 UTC","total_visits":"2806"}
-{"parsed_date":"2016-11-07 00:00:00 UTC","total_visits":"3832"}
-{"parsed_date":"2017-04-07 00:00:00 UTC","total_visits":"2297"}
-{"parsed_date":"2017-07-12 00:00:00 UTC","total_visits":"2554"}
-{"parsed_date":"2017-04-13 00:00:00 UTC","total_visits":"2300"}
-{"parsed_date":"2017-08-01 00:00:00 UTC","total_visits":"2556"}
-{"parsed_date":"2017-06-04 00:00:00 UTC","total_visits":"1534"}
-{"parsed_date":"2017-02-12 00:00:00 UTC","total_visits":"1790"}
-{"parsed_date":"2017-07-03 00:00:00 UTC","total_visits":"2046"}
-{"parsed_date":"2016-09-30 00:00:00 UTC","total_visits":"2303"}
-{"parsed_date":"2016-08-08 00:00:00 UTC","total_visits":"2815"}
+{"parsed_date":"2017-07-01 00:00:00 UTC","id":"1","total_visits":"2048"}
+{"parsed_date":"2016-09-07 00:00:00 UTC","id":"1","total_visits":"2562"}
+{"parsed_date":"2016-10-25 00:00:00 UTC","id":"1","total_visits":"3842"}
+{"parsed_date":"2017-04-10 00:00:00 UTC","id":"1","total_visits":"2563"}
+{"parsed_date":"2017-01-09 00:00:00 UTC","id":"1","total_visits":"2308"}
+{"parsed_date":"2017-05-02 00:00:00 UTC","id":"1","total_visits":"2564"}
+{"parsed_date":"2016-11-11 00:00:00 UTC","id":"1","total_visits":"3588"}
+{"parsed_date":"2017-07-30 00:00:00 UTC","id":"1","total_visits":"1799"}
+{"parsed_date":"2017-06-10 00:00:00 UTC","id":"1","total_visits":"1545"}
+{"parsed_date":"2016-08-14 00:00:00 UTC","id":"1","total_visits":"1801"}
+{"parsed_date":"2017-05-14 00:00:00 UTC","id":"1","total_visits":"1290"}
+{"parsed_date":"2017-02-08 00:00:00 UTC","id":"1","total_visits":"2570"}
+{"parsed_date":"2017-06-01 00:00:00 UTC","id":"1","total_visits":"2826"}
+{"parsed_date":"2017-04-23 00:00:00 UTC","id":"1","total_visits":"1548"}
+{"parsed_date":"2016-11-04 00:00:00 UTC","id":"1","total_visits":"3596"}
+{"parsed_date":"2017-02-04 00:00:00 UTC","id":"1","total_visits":"1549"}
+{"parsed_date":"2016-12-09 00:00:00 UTC","id":"1","total_visits":"2830"}
+{"parsed_date":"2016-10-30 00:00:00 UTC","id":"1","total_visits":"3086"}
+{"parsed_date":"2017-03-28 00:00:00 UTC","id":"1","total_visits":"2577"}
+{"parsed_date":"2017-06-11 00:00:00 UTC","id":"1","total_visits":"1555"}
+{"parsed_date":"2016-12-17 00:00:00 UTC","id":"1","total_visits":"2324"}
+{"parsed_date":"2016-09-22 00:00:00 UTC","id":"1","total_visits":"2581"}
+{"parsed_date":"2017-01-29 00:00:00 UTC","id":"1","total_visits":"1814"}
+{"parsed_date":"2017-03-22 00:00:00 UTC","id":"1","total_visits":"2582"}
+{"parsed_date":"2017-02-21 00:00:00 UTC","id":"1","total_visits":"2582"}
+{"parsed_date":"2016-10-14 00:00:00 UTC","id":"1","total_visits":"2838"}
+{"parsed_date":"2017-04-27 00:00:00 UTC","id":"1","total_visits":"2838"}
+{"parsed_date":"2016-10-26 00:00:00 UTC","id":"1","total_visits":"4375"}
+{"parsed_date":"2016-08-22 00:00:00 UTC","id":"1","total_visits":"2584"}
+{"parsed_date":"2016-12-07 00:00:00 UTC","id":"1","total_visits":"2840"}
+{"parsed_date":"2017-01-20 00:00:00 UTC","id":"1","total_visits":"2074"}
+{"parsed_date":"2017-03-07 00:00:00 UTC","id":"1","total_visits":"2586"}
+{"parsed_date":"2017-05-16 00:00:00 UTC","id":"1","total_visits":"3098"}
+{"parsed_date":"2017-05-03 00:00:00 UTC","id":"1","total_visits":"2588"}
+{"parsed_date":"2017-05-01 00:00:00 UTC","id":"1","total_visits":"2588"}
+{"parsed_date":"2016-11-27 00:00:00 UTC","id":"1","total_visits":"3356"}
+{"parsed_date":"2017-04-29 00:00:00 UTC","id":"1","total_visits":"1566"}
+{"parsed_date":"2016-09-18 00:00:00 UTC","id":"1","total_visits":"1822"}
+{"parsed_date":"2017-03-23 00:00:00 UTC","id":"1","total_visits":"2847"}
+{"parsed_date":"2017-03-14 00:00:00 UTC","id":"1","total_visits":"2338"}
+{"parsed_date":"2016-12-21 00:00:00 UTC","id":"1","total_visits":"2594"}
+{"parsed_date":"2016-10-11 00:00:00 UTC","id":"1","total_visits":"2850"}
+{"parsed_date":"2017-01-24 00:00:00 UTC","id":"1","total_visits":"3618"}
+{"parsed_date":"2017-03-05 00:00:00 UTC","id":"1","total_visits":"1827"}
+{"parsed_date":"2017-01-19 00:00:00 UTC","id":"1","total_visits":"2083"}
+{"parsed_date":"2016-08-09 00:00:00 UTC","id":"1","total_visits":"2851"}
+{"parsed_date":"2017-04-08 00:00:00 UTC","id":"1","total_visits":"1829"}
+{"parsed_date":"2017-04-12 00:00:00 UTC","id":"1","total_visits":"2341"}
+{"parsed_date":"2016-09-29 00:00:00 UTC","id":"1","total_visits":"2597"}
+{"parsed_date":"2016-12-20 00:00:00 UTC","id":"1","total_visits":"3110"}
+{"parsed_date":"2017-01-15 00:00:00 UTC","id":"1","total_visits":"1576"}
+{"parsed_date":"2017-04-14 00:00:00 UTC","id":"1","total_visits":"1834"}
+{"parsed_date":"2017-02-28 00:00:00 UTC","id":"1","total_visits":"2347"}
+{"parsed_date":"2016-09-16 00:00:00 UTC","id":"1","total_visits":"2603"}
+{"parsed_date":"2016-10-18 00:00:00 UTC","id":"1","total_visits":"3628"}
+{"parsed_date":"2017-02-24 00:00:00 UTC","id":"1","total_visits":"2093"}
+{"parsed_date":"2017-05-17 00:00:00 UTC","id":"1","total_visits":"3117"}
+{"parsed_date":"2017-06-23 00:00:00 UTC","id":"1","total_visits":"2095"}
+{"parsed_date":"2016-11-12 00:00:00 UTC","id":"1","total_visits":"3119"}
+{"parsed_date":"2016-11-21 00:00:00 UTC","id":"1","total_visits":"4143"}
+{"parsed_date":"2017-02-27 00:00:00 UTC","id":"1","total_visits":"2352"}
+{"parsed_date":"2016-12-26 00:00:00 UTC","id":"1","total_visits":"1586"}
+{"parsed_date":"2017-04-25 00:00:00 UTC","id":"1","total_visits":"2354"}
+{"parsed_date":"2017-03-21 00:00:00 UTC","id":"1","total_visits":"2611"}
+{"parsed_date":"2016-12-22 00:00:00 UTC","id":"1","total_visits":"2100"}
+{"parsed_date":"2016-10-01 00:00:00 UTC","id":"1","total_visits":"1589"}
+{"parsed_date":"2016-09-24 00:00:00 UTC","id":"1","total_visits":"1845"}
+{"parsed_date":"2017-06-21 00:00:00 UTC","id":"1","total_visits":"2357"}
+{"parsed_date":"2016-09-02 00:00:00 UTC","id":"1","total_visits":"2613"}
+{"parsed_date":"2016-08-26 00:00:00 UTC","id":"1","total_visits":"2359"}
+{"parsed_date":"2016-10-12 00:00:00 UTC","id":"1","total_visits":"2871"}
+{"parsed_date":"2017-05-15 00:00:00 UTC","id":"1","total_visits":"2360"}
+{"parsed_date":"2017-06-12 00:00:00 UTC","id":"1","total_visits":"2361"}
+{"parsed_date":"2016-08-16 00:00:00 UTC","id":"1","total_visits":"2873"}
+{"parsed_date":"2017-04-30 00:00:00 UTC","id":"1","total_visits":"1594"}
+{"parsed_date":"2017-04-05 00:00:00 UTC","id":"1","total_visits":"2619"}
+{"parsed_date":"2016-08-12 00:00:00 UTC","id":"1","total_visits":"2619"}
+{"parsed_date":"2016-11-08 00:00:00 UTC","id":"1","total_visits":"3899"}
+{"parsed_date":"2016-08-13 00:00:00 UTC","id":"1","total_visits":"1596"}
+{"parsed_date":"2017-05-09 00:00:00 UTC","id":"1","total_visits":"2108"}
+{"parsed_date":"2017-02-23 00:00:00 UTC","id":"1","total_visits":"2364"}
+{"parsed_date":"2017-07-31 00:00:00 UTC","id":"1","total_visits":"2620"}
+{"parsed_date":"2017-06-25 00:00:00 UTC","id":"1","total_visits":"1597"}
+{"parsed_date":"2017-07-29 00:00:00 UTC","id":"1","total_visits":"1597"}
+{"parsed_date":"2016-09-17 00:00:00 UTC","id":"1","total_visits":"1853"}
+{"parsed_date":"2016-12-27 00:00:00 UTC","id":"1","total_visits":"1855"}
+{"parsed_date":"2017-05-20 00:00:00 UTC","id":"1","total_visits":"1855"}
+{"parsed_date":"2016-10-08 00:00:00 UTC","id":"1","total_visits":"2114"}
+{"parsed_date":"2016-10-27 00:00:00 UTC","id":"1","total_visits":"4162"}
+{"parsed_date":"2017-07-08 00:00:00 UTC","id":"1","total_visits":"1859"}
+{"parsed_date":"2016-08-24 00:00:00 UTC","id":"1","total_visits":"2627"}
+{"parsed_date":"2016-12-23 00:00:00 UTC","id":"1","total_visits":"1604"}
+{"parsed_date":"2017-02-02 00:00:00 UTC","id":"1","total_visits":"2372"}
+{"parsed_date":"2016-09-08 00:00:00 UTC","id":"1","total_visits":"2628"}
+{"parsed_date":"2017-04-02 00:00:00 UTC","id":"1","total_visits":"1861"}
+{"parsed_date":"2017-02-15 00:00:00 UTC","id":"1","total_visits":"2629"}
+{"parsed_date":"2017-07-05 00:00:00 UTC","id":"1","total_visits":"2885"}
+{"parsed_date":"2016-10-17 00:00:00 UTC","id":"1","total_visits":"3397"}
+{"parsed_date":"2017-02-20 00:00:00 UTC","id":"1","total_visits":"2374"}
+{"parsed_date":"2017-03-24 00:00:00 UTC","id":"1","total_visits":"2374"}
+{"parsed_date":"2017-04-20 00:00:00 UTC","id":"1","total_visits":"2374"}
+{"parsed_date":"2016-11-18 00:00:00 UTC","id":"1","total_visits":"3654"}
+{"parsed_date":"2017-07-25 00:00:00 UTC","id":"1","total_visits":"2631"}
+{"parsed_date":"2016-11-13 00:00:00 UTC","id":"1","total_visits":"3144"}
+{"parsed_date":"2017-03-18 00:00:00 UTC","id":"1","total_visits":"1610"}
+{"parsed_date":"2016-08-03 00:00:00 UTC","id":"1","total_visits":"2890"}
+{"parsed_date":"2016-08-19 00:00:00 UTC","id":"1","total_visits":"2379"}
+{"parsed_date":"2017-02-14 00:00:00 UTC","id":"1","total_visits":"2379"}
+{"parsed_date":"2017-07-11 00:00:00 UTC","id":"1","total_visits":"2635"}
+{"parsed_date":"2017-04-22 00:00:00 UTC","id":"1","total_visits":"1612"}
+{"parsed_date":"2016-10-07 00:00:00 UTC","id":"1","total_visits":"2892"}
+{"parsed_date":"2016-09-05 00:00:00 UTC","id":"1","total_visits":"2125"}
+{"parsed_date":"2016-09-23 00:00:00 UTC","id":"1","total_visits":"2381"}
+{"parsed_date":"2016-11-15 00:00:00 UTC","id":"1","total_visits":"4685"}
+{"parsed_date":"2017-01-28 00:00:00 UTC","id":"1","total_visits":"1614"}
+{"parsed_date":"2017-07-14 00:00:00 UTC","id":"1","total_visits":"2382"}
+{"parsed_date":"2017-01-07 00:00:00 UTC","id":"1","total_visits":"1615"}
+{"parsed_date":"2017-04-03 00:00:00 UTC","id":"1","total_visits":"2383"}
+{"parsed_date":"2017-03-20 00:00:00 UTC","id":"1","total_visits":"2383"}
+{"parsed_date":"2016-12-18 00:00:00 UTC","id":"1","total_visits":"2128"}
+{"parsed_date":"2017-03-17 00:00:00 UTC","id":"1","total_visits":"2129"}
+{"parsed_date":"2017-05-23 00:00:00 UTC","id":"1","total_visits":"2129"}
+{"parsed_date":"2016-11-30 00:00:00 UTC","id":"1","total_visits":"4435"}
+{"parsed_date":"2017-01-01 00:00:00 UTC","id":"1","total_visits":"1364"}
+{"parsed_date":"2017-01-02 00:00:00 UTC","id":"1","total_visits":"1620"}
+{"parsed_date":"2016-09-25 00:00:00 UTC","id":"1","total_visits":"1877"}
+{"parsed_date":"2016-08-07 00:00:00 UTC","id":"1","total_visits":"1622"}
+{"parsed_date":"2016-10-09 00:00:00 UTC","id":"1","total_visits":"2134"}
+{"parsed_date":"2017-03-01 00:00:00 UTC","id":"1","total_visits":"2390"}
+{"parsed_date":"2017-01-04 00:00:00 UTC","id":"1","total_visits":"2390"}
+{"parsed_date":"2017-06-06 00:00:00 UTC","id":"1","total_visits":"2391"}
+{"parsed_date":"2017-04-18 00:00:00 UTC","id":"1","total_visits":"2391"}
+{"parsed_date":"2017-04-06 00:00:00 UTC","id":"1","total_visits":"2647"}
+{"parsed_date":"2017-01-30 00:00:00 UTC","id":"1","total_visits":"2392"}
+{"parsed_date":"2016-10-16 00:00:00 UTC","id":"1","total_visits":"2649"}
+{"parsed_date":"2016-08-04 00:00:00 UTC","id":"1","total_visits":"3161"}
+{"parsed_date":"2016-10-21 00:00:00 UTC","id":"1","total_visits":"3419"}
+{"parsed_date":"2016-08-02 00:00:00 UTC","id":"1","total_visits":"2140"}
+{"parsed_date":"2017-03-06 00:00:00 UTC","id":"1","total_visits":"2396"}
+{"parsed_date":"2016-09-13 00:00:00 UTC","id":"1","total_visits":"2396"}
+{"parsed_date":"2016-09-14 00:00:00 UTC","id":"1","total_visits":"2652"}
+{"parsed_date":"2017-04-19 00:00:00 UTC","id":"1","total_visits":"2397"}
+{"parsed_date":"2017-06-19 00:00:00 UTC","id":"1","total_visits":"2142"}
+{"parsed_date":"2016-12-13 00:00:00 UTC","id":"1","total_visits":"3166"}
+{"parsed_date":"2017-06-20 00:00:00 UTC","id":"1","total_visits":"2143"}
+{"parsed_date":"2016-10-10 00:00:00 UTC","id":"1","total_visits":"2911"}
+{"parsed_date":"2017-07-06 00:00:00 UTC","id":"1","total_visits":"2658"}
+{"parsed_date":"2017-01-03 00:00:00 UTC","id":"1","total_visits":"2403"}
+{"parsed_date":"2017-01-08 00:00:00 UTC","id":"1","total_visits":"1637"}
+{"parsed_date":"2017-02-25 00:00:00 UTC","id":"1","total_visits":"1638"}
+{"parsed_date":"2017-05-24 00:00:00 UTC","id":"1","total_visits":"2406"}
+{"parsed_date":"2016-11-22 00:00:00 UTC","id":"1","total_visits":"3942"}
+{"parsed_date":"2017-05-06 00:00:00 UTC","id":"1","total_visits":"1383"}
+{"parsed_date":"2017-07-02 00:00:00 UTC","id":"1","total_visits":"1895"}
+{"parsed_date":"2016-12-01 00:00:00 UTC","id":"1","total_visits":"4200"}
+{"parsed_date":"2017-03-16 00:00:00 UTC","id":"1","total_visits":"2409"}
+{"parsed_date":"2016-12-12 00:00:00 UTC","id":"1","total_visits":"3433"}
+{"parsed_date":"2016-12-25 00:00:00 UTC","id":"1","total_visits":"1386"}
+{"parsed_date":"2017-02-26 00:00:00 UTC","id":"1","total_visits":"1643"}
+{"parsed_date":"2017-04-28 00:00:00 UTC","id":"1","total_visits":"2411"}
+{"parsed_date":"2016-08-11 00:00:00 UTC","id":"1","total_visits":"2667"}
+{"parsed_date":"2017-07-20 00:00:00 UTC","id":"1","total_visits":"2668"}
+{"parsed_date":"2017-05-21 00:00:00 UTC","id":"1","total_visits":"1645"}
+{"parsed_date":"2017-06-17 00:00:00 UTC","id":"1","total_visits":"1391"}
+{"parsed_date":"2016-12-29 00:00:00 UTC","id":"1","total_visits":"1647"}
+{"parsed_date":"2017-07-17 00:00:00 UTC","id":"1","total_visits":"2671"}
+{"parsed_date":"2017-01-16 00:00:00 UTC","id":"1","total_visits":"1906"}
+{"parsed_date":"2017-03-03 00:00:00 UTC","id":"1","total_visits":"2162"}
+{"parsed_date":"2016-11-14 00:00:00 UTC","id":"1","total_visits":"4466"}
+{"parsed_date":"2016-08-30 00:00:00 UTC","id":"1","total_visits":"2675"}
+{"parsed_date":"2016-08-27 00:00:00 UTC","id":"1","total_visits":"1654"}
+{"parsed_date":"2017-02-09 00:00:00 UTC","id":"1","total_visits":"2678"}
+{"parsed_date":"2017-06-03 00:00:00 UTC","id":"1","total_visits":"1399"}
+{"parsed_date":"2017-05-07 00:00:00 UTC","id":"1","total_visits":"1400"}
+{"parsed_date":"2016-11-02 00:00:00 UTC","id":"1","total_visits":"3960"}
+{"parsed_date":"2016-12-15 00:00:00 UTC","id":"1","total_visits":"2937"}
+{"parsed_date":"2017-04-01 00:00:00 UTC","id":"1","total_visits":"2170"}
+{"parsed_date":"2017-07-21 00:00:00 UTC","id":"1","total_visits":"2427"}
+{"parsed_date":"2016-08-06 00:00:00 UTC","id":"1","total_visits":"1663"}
+{"parsed_date":"2016-09-01 00:00:00 UTC","id":"1","total_visits":"2687"}
+{"parsed_date":"2017-06-28 00:00:00 UTC","id":"1","total_visits":"2687"}
+{"parsed_date":"2016-08-20 00:00:00 UTC","id":"1","total_visits":"1664"}
+{"parsed_date":"2017-04-26 00:00:00 UTC","id":"1","total_visits":"4224"}
+{"parsed_date":"2017-07-09 00:00:00 UTC","id":"1","total_visits":"1921"}
+{"parsed_date":"2017-07-28 00:00:00 UTC","id":"1","total_visits":"2433"}
+{"parsed_date":"2016-09-19 00:00:00 UTC","id":"1","total_visits":"2689"}
+{"parsed_date":"2017-07-24 00:00:00 UTC","id":"1","total_visits":"2436"}
+{"parsed_date":"2017-06-13 00:00:00 UTC","id":"1","total_visits":"2181"}
+{"parsed_date":"2016-09-15 00:00:00 UTC","id":"1","total_visits":"2949"}
+{"parsed_date":"2017-02-03 00:00:00 UTC","id":"1","total_visits":"2182"}
+{"parsed_date":"2016-09-10 00:00:00 UTC","id":"1","total_visits":"1671"}
+{"parsed_date":"2017-06-09 00:00:00 UTC","id":"1","total_visits":"1927"}
+{"parsed_date":"2017-01-11 00:00:00 UTC","id":"1","total_visits":"2185"}
+{"parsed_date":"2017-02-19 00:00:00 UTC","id":"1","total_visits":"2187"}
+{"parsed_date":"2017-01-17 00:00:00 UTC","id":"1","total_visits":"2443"}
+{"parsed_date":"2017-05-12 00:00:00 UTC","id":"1","total_visits":"1932"}
+{"parsed_date":"2016-12-16 00:00:00 UTC","id":"1","total_visits":"2956"}
+{"parsed_date":"2017-02-01 00:00:00 UTC","id":"1","total_visits":"2445"}
+{"parsed_date":"2016-11-26 00:00:00 UTC","id":"1","total_visits":"3213"}
+{"parsed_date":"2017-06-02 00:00:00 UTC","id":"1","total_visits":"2190"}
+{"parsed_date":"2016-08-05 00:00:00 UTC","id":"1","total_visits":"2702"}
+{"parsed_date":"2016-11-01 00:00:00 UTC","id":"1","total_visits":"3728"}
+{"parsed_date":"2017-01-05 00:00:00 UTC","id":"1","total_visits":"2193"}
+{"parsed_date":"2017-03-08 00:00:00 UTC","id":"1","total_visits":"2449"}
+{"parsed_date":"2016-08-28 00:00:00 UTC","id":"1","total_visits":"1682"}
+{"parsed_date":"2017-07-04 00:00:00 UTC","id":"1","total_visits":"1938"}
+{"parsed_date":"2017-03-10 00:00:00 UTC","id":"1","total_visits":"2194"}
+{"parsed_date":"2017-07-07 00:00:00 UTC","id":"1","total_visits":"2450"}
+{"parsed_date":"2016-10-29 00:00:00 UTC","id":"1","total_visits":"2964"}
+{"parsed_date":"2016-10-13 00:00:00 UTC","id":"1","total_visits":"2964"}
+{"parsed_date":"2016-12-04 00:00:00 UTC","id":"1","total_visits":"3220"}
+{"parsed_date":"2017-01-21 00:00:00 UTC","id":"1","total_visits":"1685"}
+{"parsed_date":"2017-06-29 00:00:00 UTC","id":"1","total_visits":"2709"}
+{"parsed_date":"2016-08-29 00:00:00 UTC","id":"1","total_visits":"2454"}
+{"parsed_date":"2016-12-19 00:00:00 UTC","id":"1","total_visits":"3222"}
+{"parsed_date":"2017-05-30 00:00:00 UTC","id":"1","total_visits":"2199"}
+{"parsed_date":"2017-02-10 00:00:00 UTC","id":"1","total_visits":"2199"}
+{"parsed_date":"2016-08-31 00:00:00 UTC","id":"1","total_visits":"3223"}
+{"parsed_date":"2017-06-18 00:00:00 UTC","id":"1","total_visits":"1432"}
+{"parsed_date":"2017-01-12 00:00:00 UTC","id":"1","total_visits":"2203"}
+{"parsed_date":"2017-05-18 00:00:00 UTC","id":"1","total_visits":"2715"}
+{"parsed_date":"2016-10-23 00:00:00 UTC","id":"1","total_visits":"2971"}
+{"parsed_date":"2016-09-04 00:00:00 UTC","id":"1","total_visits":"1692"}
+{"parsed_date":"2016-12-10 00:00:00 UTC","id":"1","total_visits":"2207"}
+{"parsed_date":"2016-12-11 00:00:00 UTC","id":"1","total_visits":"2208"}
+{"parsed_date":"2017-04-11 00:00:00 UTC","id":"1","total_visits":"2464"}
+{"parsed_date":"2016-09-21 00:00:00 UTC","id":"1","total_visits":"2720"}
+{"parsed_date":"2016-11-06 00:00:00 UTC","id":"1","total_visits":"3232"}
+{"parsed_date":"2017-01-26 00:00:00 UTC","id":"1","total_visits":"2209"}
+{"parsed_date":"2016-09-12 00:00:00 UTC","id":"1","total_visits":"2465"}
+{"parsed_date":"2017-04-21 00:00:00 UTC","id":"1","total_visits":"2210"}
+{"parsed_date":"2017-01-06 00:00:00 UTC","id":"1","total_visits":"2210"}
+{"parsed_date":"2017-04-04 00:00:00 UTC","id":"1","total_visits":"2978"}
+{"parsed_date":"2017-01-22 00:00:00 UTC","id":"1","total_visits":"1700"}
+{"parsed_date":"2017-07-26 00:00:00 UTC","id":"1","total_visits":"2725"}
+{"parsed_date":"2016-08-18 00:00:00 UTC","id":"1","total_visits":"2725"}
+{"parsed_date":"2016-09-27 00:00:00 UTC","id":"1","total_visits":"2727"}
+{"parsed_date":"2016-12-02 00:00:00 UTC","id":"1","total_visits":"3751"}
+{"parsed_date":"2017-05-05 00:00:00 UTC","id":"1","total_visits":"1960"}
+{"parsed_date":"2016-11-19 00:00:00 UTC","id":"1","total_visits":"2984"}
+{"parsed_date":"2016-11-09 00:00:00 UTC","id":"1","total_visits":"3752"}
+{"parsed_date":"2016-12-05 00:00:00 UTC","id":"1","total_visits":"4265"}
+{"parsed_date":"2017-05-11 00:00:00 UTC","id":"1","total_visits":"2218"}
+{"parsed_date":"2017-01-25 00:00:00 UTC","id":"1","total_visits":"2986"}
+{"parsed_date":"2017-03-11 00:00:00 UTC","id":"1","total_visits":"1707"}
+{"parsed_date":"2017-03-30 00:00:00 UTC","id":"1","total_visits":"2731"}
+{"parsed_date":"2016-10-20 00:00:00 UTC","id":"1","total_visits":"3755"}
+{"parsed_date":"2017-02-07 00:00:00 UTC","id":"1","total_visits":"2476"}
+{"parsed_date":"2017-02-22 00:00:00 UTC","id":"1","total_visits":"2477"}
+{"parsed_date":"2017-07-23 00:00:00 UTC","id":"1","total_visits":"1966"}
+{"parsed_date":"2016-11-03 00:00:00 UTC","id":"1","total_visits":"4014"}
+{"parsed_date":"2016-08-01 00:00:00 UTC","id":"1","total_visits":"1711"}
+{"parsed_date":"2017-01-13 00:00:00 UTC","id":"1","total_visits":"1967"}
+{"parsed_date":"2017-05-19 00:00:00 UTC","id":"1","total_visits":"2223"}
+{"parsed_date":"2016-11-20 00:00:00 UTC","id":"1","total_visits":"3247"}
+{"parsed_date":"2016-11-25 00:00:00 UTC","id":"1","total_visits":"3759"}
+{"parsed_date":"2017-03-25 00:00:00 UTC","id":"1","total_visits":"1712"}
+{"parsed_date":"2017-01-27 00:00:00 UTC","id":"1","total_visits":"1969"}
+{"parsed_date":"2017-06-26 00:00:00 UTC","id":"1","total_visits":"2226"}
+{"parsed_date":"2017-05-25 00:00:00 UTC","id":"1","total_visits":"2228"}
+{"parsed_date":"2017-01-31 00:00:00 UTC","id":"1","total_visits":"2229"}
+{"parsed_date":"2017-07-13 00:00:00 UTC","id":"1","total_visits":"2741"}
+{"parsed_date":"2017-03-15 00:00:00 UTC","id":"1","total_visits":"2486"}
+{"parsed_date":"2017-05-28 00:00:00 UTC","id":"1","total_visits":"1463"}
+{"parsed_date":"2017-03-09 00:00:00 UTC","id":"1","total_visits":"2231"}
+{"parsed_date":"2017-07-15 00:00:00 UTC","id":"1","total_visits":"1721"}
+{"parsed_date":"2016-11-24 00:00:00 UTC","id":"1","total_visits":"3770"}
+{"parsed_date":"2016-10-05 00:00:00 UTC","id":"1","total_visits":"3770"}
+{"parsed_date":"2016-12-31 00:00:00 UTC","id":"1","total_visits":"1211"}
+{"parsed_date":"2016-10-02 00:00:00 UTC","id":"1","total_visits":"1724"}
+{"parsed_date":"2017-07-22 00:00:00 UTC","id":"1","total_visits":"1724"}
+{"parsed_date":"2016-09-11 00:00:00 UTC","id":"1","total_visits":"1725"}
+{"parsed_date":"2017-06-15 00:00:00 UTC","id":"1","total_visits":"2237"}
+{"parsed_date":"2017-06-05 00:00:00 UTC","id":"1","total_visits":"2493"}
+{"parsed_date":"2017-02-06 00:00:00 UTC","id":"1","total_visits":"2238"}
+{"parsed_date":"2016-10-15 00:00:00 UTC","id":"1","total_visits":"2495"}
+{"parsed_date":"2016-08-21 00:00:00 UTC","id":"1","total_visits":"1730"}
+{"parsed_date":"2016-08-23 00:00:00 UTC","id":"1","total_visits":"2754"}
+{"parsed_date":"2017-06-30 00:00:00 UTC","id":"1","total_visits":"2499"}
+{"parsed_date":"2017-01-18 00:00:00 UTC","id":"1","total_visits":"2245"}
+{"parsed_date":"2016-08-10 00:00:00 UTC","id":"1","total_visits":"2757"}
+{"parsed_date":"2016-12-08 00:00:00 UTC","id":"1","total_visits":"3013"}
+{"parsed_date":"2016-11-28 00:00:00 UTC","id":"1","total_visits":"4807"}
+{"parsed_date":"2017-05-22 00:00:00 UTC","id":"1","total_visits":"2248"}
+{"parsed_date":"2016-09-20 00:00:00 UTC","id":"1","total_visits":"2760"}
+{"parsed_date":"2016-10-06 00:00:00 UTC","id":"1","total_visits":"3016"}
+{"parsed_date":"2016-09-06 00:00:00 UTC","id":"1","total_visits":"2508"}
+{"parsed_date":"2016-09-03 00:00:00 UTC","id":"1","total_visits":"1741"}
+{"parsed_date":"2016-12-06 00:00:00 UTC","id":"1","total_visits":"3021"}
+{"parsed_date":"2016-12-24 00:00:00 UTC","id":"1","total_visits":"1231"}
+{"parsed_date":"2016-10-28 00:00:00 UTC","id":"1","total_visits":"3791"}
+{"parsed_date":"2016-12-30 00:00:00 UTC","id":"1","total_visits":"1232"}
+{"parsed_date":"2017-05-29 00:00:00 UTC","id":"1","total_visits":"1745"}
+{"parsed_date":"2017-07-10 00:00:00 UTC","id":"1","total_visits":"2769"}
+{"parsed_date":"2017-06-22 00:00:00 UTC","id":"1","total_visits":"2258"}
+{"parsed_date":"2017-07-19 00:00:00 UTC","id":"1","total_visits":"2514"}
+{"parsed_date":"2016-10-03 00:00:00 UTC","id":"1","total_visits":"2514"}
+{"parsed_date":"2017-06-14 00:00:00 UTC","id":"1","total_visits":"2517"}
+{"parsed_date":"2016-10-22 00:00:00 UTC","id":"1","total_visits":"3029"}
+{"parsed_date":"2017-01-23 00:00:00 UTC","id":"1","total_visits":"2262"}
+{"parsed_date":"2017-04-24 00:00:00 UTC","id":"1","total_visits":"2263"}
+{"parsed_date":"2016-11-10 00:00:00 UTC","id":"1","total_visits":"4055"}
+{"parsed_date":"2016-09-26 00:00:00 UTC","id":"1","total_visits":"2776"}
+{"parsed_date":"2016-10-19 00:00:00 UTC","id":"1","total_visits":"3544"}
+{"parsed_date":"2017-03-04 00:00:00 UTC","id":"1","total_visits":"1753"}
+{"parsed_date":"2017-05-26 00:00:00 UTC","id":"1","total_visits":"2009"}
+{"parsed_date":"2017-02-13 00:00:00 UTC","id":"1","total_visits":"2266"}
+{"parsed_date":"2017-02-18 00:00:00 UTC","id":"1","total_visits":"1755"}
+{"parsed_date":"2017-03-02 00:00:00 UTC","id":"1","total_visits":"2267"}
+{"parsed_date":"2017-03-31 00:00:00 UTC","id":"1","total_visits":"2268"}
+{"parsed_date":"2017-01-10 00:00:00 UTC","id":"1","total_visits":"2268"}
+{"parsed_date":"2017-03-29 00:00:00 UTC","id":"1","total_visits":"2525"}
+{"parsed_date":"2017-03-27 00:00:00 UTC","id":"1","total_visits":"2525"}
+{"parsed_date":"2016-11-23 00:00:00 UTC","id":"1","total_visits":"3805"}
+{"parsed_date":"2017-05-27 00:00:00 UTC","id":"1","total_visits":"1502"}
+{"parsed_date":"2016-10-24 00:00:00 UTC","id":"1","total_visits":"4063"}
+{"parsed_date":"2016-12-14 00:00:00 UTC","id":"1","total_visits":"3040"}
+{"parsed_date":"2017-02-11 00:00:00 UTC","id":"1","total_visits":"1761"}
+{"parsed_date":"2017-07-27 00:00:00 UTC","id":"1","total_visits":"2529"}
+{"parsed_date":"2017-02-17 00:00:00 UTC","id":"1","total_visits":"2785"}
+{"parsed_date":"2017-04-15 00:00:00 UTC","id":"1","total_visits":"1506"}
+{"parsed_date":"2016-11-05 00:00:00 UTC","id":"1","total_visits":"3042"}
+{"parsed_date":"2016-10-04 00:00:00 UTC","id":"1","total_visits":"4322"}
+{"parsed_date":"2017-05-13 00:00:00 UTC","id":"1","total_visits":"1251"}
+{"parsed_date":"2017-04-16 00:00:00 UTC","id":"1","total_visits":"1507"}
+{"parsed_date":"2016-12-28 00:00:00 UTC","id":"1","total_visits":"1763"}
+{"parsed_date":"2016-08-15 00:00:00 UTC","id":"1","total_visits":"3043"}
+{"parsed_date":"2016-12-03 00:00:00 UTC","id":"1","total_visits":"3044"}
+{"parsed_date":"2017-06-27 00:00:00 UTC","id":"1","total_visits":"2789"}
+{"parsed_date":"2017-06-24 00:00:00 UTC","id":"1","total_visits":"1510"}
+{"parsed_date":"2017-07-16 00:00:00 UTC","id":"1","total_visits":"1766"}
+{"parsed_date":"2017-04-09 00:00:00 UTC","id":"1","total_visits":"1766"}
+{"parsed_date":"2017-06-07 00:00:00 UTC","id":"1","total_visits":"2279"}
+{"parsed_date":"2017-04-17 00:00:00 UTC","id":"1","total_visits":"2279"}
+{"parsed_date":"2016-09-28 00:00:00 UTC","id":"1","total_visits":"2535"}
+{"parsed_date":"2017-03-26 00:00:00 UTC","id":"1","total_visits":"1768"}
+{"parsed_date":"2017-05-10 00:00:00 UTC","id":"1","total_visits":"2024"}
+{"parsed_date":"2017-06-08 00:00:00 UTC","id":"1","total_visits":"2280"}
+{"parsed_date":"2017-05-08 00:00:00 UTC","id":"1","total_visits":"2025"}
+{"parsed_date":"2017-03-13 00:00:00 UTC","id":"1","total_visits":"2537"}
+{"parsed_date":"2016-11-17 00:00:00 UTC","id":"1","total_visits":"4074"}
+{"parsed_date":"2016-08-25 00:00:00 UTC","id":"1","total_visits":"2539"}
+{"parsed_date":"2017-02-16 00:00:00 UTC","id":"1","total_visits":"2539"}
+{"parsed_date":"2017-06-16 00:00:00 UTC","id":"1","total_visits":"2028"}
+{"parsed_date":"2016-11-16 00:00:00 UTC","id":"1","total_visits":"4334"}
+{"parsed_date":"2016-08-17 00:00:00 UTC","id":"1","total_visits":"2800"}
+{"parsed_date":"2017-03-19 00:00:00 UTC","id":"1","total_visits":"1776"}
+{"parsed_date":"2016-11-29 00:00:00 UTC","id":"1","total_visits":"4337"}
+{"parsed_date":"2017-02-05 00:00:00 UTC","id":"1","total_visits":"1522"}
+{"parsed_date":"2016-10-31 00:00:00 UTC","id":"1","total_visits":"3827"}
+{"parsed_date":"2017-05-31 00:00:00 UTC","id":"1","total_visits":"2292"}
+{"parsed_date":"2017-07-18 00:00:00 UTC","id":"1","total_visits":"2804"}
+{"parsed_date":"2017-03-12 00:00:00 UTC","id":"1","total_visits":"1781"}
+{"parsed_date":"2016-09-09 00:00:00 UTC","id":"1","total_visits":"2549"}
+{"parsed_date":"2017-01-14 00:00:00 UTC","id":"1","total_visits":"1526"}
+{"parsed_date":"2017-05-04 00:00:00 UTC","id":"1","total_visits":"2806"}
+{"parsed_date":"2016-11-07 00:00:00 UTC","id":"1","total_visits":"3832"}
+{"parsed_date":"2017-04-07 00:00:00 UTC","id":"1","total_visits":"2297"}
+{"parsed_date":"2017-07-12 00:00:00 UTC","id":"1","total_visits":"2554"}
+{"parsed_date":"2017-04-13 00:00:00 UTC","id":"1","total_visits":"2300"}
+{"parsed_date":"2017-08-01 00:00:00 UTC","id":"1","total_visits":"2556"}
+{"parsed_date":"2017-06-04 00:00:00 UTC","id":"1","total_visits":"1534"}
+{"parsed_date":"2017-02-12 00:00:00 UTC","id":"1","total_visits":"1790"}
+{"parsed_date":"2017-07-03 00:00:00 UTC","id":"1","total_visits":"2046"}
+{"parsed_date":"2016-09-30 00:00:00 UTC","id":"1","total_visits":"2303"}
+{"parsed_date":"2016-08-08 00:00:00 UTC","id":"1","total_visits":"2815"}
+{"parsed_date":"2017-07-01 00:00:00 UTC","id":"2","total_visits":"2048"}
+{"parsed_date":"2016-09-07 00:00:00 UTC","id":"2","total_visits":"2562"}
+{"parsed_date":"2016-10-25 00:00:00 UTC","id":"2","total_visits":"3842"}
+{"parsed_date":"2017-04-10 00:00:00 UTC","id":"2","total_visits":"2563"}
+{"parsed_date":"2017-01-09 00:00:00 UTC","id":"2","total_visits":"2308"}
+{"parsed_date":"2017-05-02 00:00:00 UTC","id":"2","total_visits":"2564"}
+{"parsed_date":"2016-11-11 00:00:00 UTC","id":"2","total_visits":"3588"}
+{"parsed_date":"2017-07-30 00:00:00 UTC","id":"2","total_visits":"1799"}
+{"parsed_date":"2017-06-10 00:00:00 UTC","id":"2","total_visits":"1545"}
+{"parsed_date":"2016-08-14 00:00:00 UTC","id":"2","total_visits":"1801"}
+{"parsed_date":"2017-05-14 00:00:00 UTC","id":"2","total_visits":"1290"}
+{"parsed_date":"2017-02-08 00:00:00 UTC","id":"2","total_visits":"2570"}
+{"parsed_date":"2017-06-01 00:00:00 UTC","id":"2","total_visits":"2826"}
+{"parsed_date":"2017-04-23 00:00:00 UTC","id":"2","total_visits":"1548"}
+{"parsed_date":"2016-11-04 00:00:00 UTC","id":"2","total_visits":"3596"}
+{"parsed_date":"2017-02-04 00:00:00 UTC","id":"2","total_visits":"1549"}
+{"parsed_date":"2016-12-09 00:00:00 UTC","id":"2","total_visits":"2830"}
+{"parsed_date":"2016-10-30 00:00:00 UTC","id":"2","total_visits":"3086"}
+{"parsed_date":"2017-03-28 00:00:00 UTC","id":"2","total_visits":"2577"}
+{"parsed_date":"2017-06-11 00:00:00 UTC","id":"2","total_visits":"1555"}
+{"parsed_date":"2016-12-17 00:00:00 UTC","id":"2","total_visits":"2324"}
+{"parsed_date":"2016-09-22 00:00:00 UTC","id":"2","total_visits":"2581"}
+{"parsed_date":"2017-01-29 00:00:00 UTC","id":"2","total_visits":"1814"}
+{"parsed_date":"2017-03-22 00:00:00 UTC","id":"2","total_visits":"2582"}
+{"parsed_date":"2017-02-21 00:00:00 UTC","id":"2","total_visits":"2582"}
+{"parsed_date":"2016-10-14 00:00:00 UTC","id":"2","total_visits":"2838"}
+{"parsed_date":"2017-04-27 00:00:00 UTC","id":"2","total_visits":"2838"}
+{"parsed_date":"2016-10-26 00:00:00 UTC","id":"2","total_visits":"4375"}
+{"parsed_date":"2016-08-22 00:00:00 UTC","id":"2","total_visits":"2584"}
+{"parsed_date":"2016-12-07 00:00:00 UTC","id":"2","total_visits":"2840"}
+{"parsed_date":"2017-01-20 00:00:00 UTC","id":"2","total_visits":"2074"}
+{"parsed_date":"2017-03-07 00:00:00 UTC","id":"2","total_visits":"2586"}
+{"parsed_date":"2017-05-16 00:00:00 UTC","id":"2","total_visits":"3098"}
+{"parsed_date":"2017-05-03 00:00:00 UTC","id":"2","total_visits":"2588"}
+{"parsed_date":"2017-05-01 00:00:00 UTC","id":"2","total_visits":"2588"}
+{"parsed_date":"2016-11-27 00:00:00 UTC","id":"2","total_visits":"3356"}
+{"parsed_date":"2017-04-29 00:00:00 UTC","id":"2","total_visits":"1566"}
+{"parsed_date":"2016-09-18 00:00:00 UTC","id":"2","total_visits":"1822"}
+{"parsed_date":"2017-03-23 00:00:00 UTC","id":"2","total_visits":"2847"}
+{"parsed_date":"2017-03-14 00:00:00 UTC","id":"2","total_visits":"2338"}
+{"parsed_date":"2016-12-21 00:00:00 UTC","id":"2","total_visits":"2594"}
+{"parsed_date":"2016-10-11 00:00:00 UTC","id":"2","total_visits":"2850"}
+{"parsed_date":"2017-01-24 00:00:00 UTC","id":"2","total_visits":"3618"}
+{"parsed_date":"2017-03-05 00:00:00 UTC","id":"2","total_visits":"1827"}
+{"parsed_date":"2017-01-19 00:00:00 UTC","id":"2","total_visits":"2083"}
+{"parsed_date":"2016-08-09 00:00:00 UTC","id":"2","total_visits":"2851"}
+{"parsed_date":"2017-04-08 00:00:00 UTC","id":"2","total_visits":"1829"}
+{"parsed_date":"2017-04-12 00:00:00 UTC","id":"2","total_visits":"2341"}
+{"parsed_date":"2016-09-29 00:00:00 UTC","id":"2","total_visits":"2597"}
+{"parsed_date":"2016-12-20 00:00:00 UTC","id":"2","total_visits":"3110"}
+{"parsed_date":"2017-01-15 00:00:00 UTC","id":"2","total_visits":"1576"}
+{"parsed_date":"2017-04-14 00:00:00 UTC","id":"2","total_visits":"1834"}
+{"parsed_date":"2017-02-28 00:00:00 UTC","id":"2","total_visits":"2347"}
+{"parsed_date":"2016-09-16 00:00:00 UTC","id":"2","total_visits":"2603"}
+{"parsed_date":"2016-10-18 00:00:00 UTC","id":"2","total_visits":"3628"}
+{"parsed_date":"2017-02-24 00:00:00 UTC","id":"2","total_visits":"2093"}
+{"parsed_date":"2017-05-17 00:00:00 UTC","id":"2","total_visits":"3117"}
+{"parsed_date":"2017-06-23 00:00:00 UTC","id":"2","total_visits":"2095"}
+{"parsed_date":"2016-11-12 00:00:00 UTC","id":"2","total_visits":"3119"}
+{"parsed_date":"2016-11-21 00:00:00 UTC","id":"2","total_visits":"4143"}
+{"parsed_date":"2017-02-27 00:00:00 UTC","id":"2","total_visits":"2352"}
+{"parsed_date":"2016-12-26 00:00:00 UTC","id":"2","total_visits":"1586"}
+{"parsed_date":"2017-04-25 00:00:00 UTC","id":"2","total_visits":"2354"}
+{"parsed_date":"2017-03-21 00:00:00 UTC","id":"2","total_visits":"2611"}
+{"parsed_date":"2016-12-22 00:00:00 UTC","id":"2","total_visits":"2100"}
+{"parsed_date":"2016-10-01 00:00:00 UTC","id":"2","total_visits":"1589"}
+{"parsed_date":"2016-09-24 00:00:00 UTC","id":"2","total_visits":"1845"}
+{"parsed_date":"2017-06-21 00:00:00 UTC","id":"2","total_visits":"2357"}
+{"parsed_date":"2016-09-02 00:00:00 UTC","id":"2","total_visits":"2613"}
+{"parsed_date":"2016-08-26 00:00:00 UTC","id":"2","total_visits":"2359"}
+{"parsed_date":"2016-10-12 00:00:00 UTC","id":"2","total_visits":"2871"}
+{"parsed_date":"2017-05-15 00:00:00 UTC","id":"2","total_visits":"2360"}
+{"parsed_date":"2017-06-12 00:00:00 UTC","id":"2","total_visits":"2361"}
+{"parsed_date":"2016-08-16 00:00:00 UTC","id":"2","total_visits":"2873"}
+{"parsed_date":"2017-04-30 00:00:00 UTC","id":"2","total_visits":"1594"}
+{"parsed_date":"2017-04-05 00:00:00 UTC","id":"2","total_visits":"2619"}
+{"parsed_date":"2016-08-12 00:00:00 UTC","id":"2","total_visits":"2619"}
+{"parsed_date":"2016-11-08 00:00:00 UTC","id":"2","total_visits":"3899"}
+{"parsed_date":"2016-08-13 00:00:00 UTC","id":"2","total_visits":"1596"}
+{"parsed_date":"2017-05-09 00:00:00 UTC","id":"2","total_visits":"2108"}
+{"parsed_date":"2017-02-23 00:00:00 UTC","id":"2","total_visits":"2364"}
+{"parsed_date":"2017-07-31 00:00:00 UTC","id":"2","total_visits":"2620"}
+{"parsed_date":"2017-06-25 00:00:00 UTC","id":"2","total_visits":"1597"}
+{"parsed_date":"2017-07-29 00:00:00 UTC","id":"2","total_visits":"1597"}
+{"parsed_date":"2016-09-17 00:00:00 UTC","id":"2","total_visits":"1853"}
+{"parsed_date":"2016-12-27 00:00:00 UTC","id":"2","total_visits":"1855"}
+{"parsed_date":"2017-05-20 00:00:00 UTC","id":"2","total_visits":"1855"}
+{"parsed_date":"2016-10-08 00:00:00 UTC","id":"2","total_visits":"2114"}
+{"parsed_date":"2016-10-27 00:00:00 UTC","id":"2","total_visits":"4162"}
+{"parsed_date":"2017-07-08 00:00:00 UTC","id":"2","total_visits":"1859"}
+{"parsed_date":"2016-08-24 00:00:00 UTC","id":"2","total_visits":"2627"}
+{"parsed_date":"2016-12-23 00:00:00 UTC","id":"2","total_visits":"1604"}
+{"parsed_date":"2017-02-02 00:00:00 UTC","id":"2","total_visits":"2372"}
+{"parsed_date":"2016-09-08 00:00:00 UTC","id":"2","total_visits":"2628"}
+{"parsed_date":"2017-04-02 00:00:00 UTC","id":"2","total_visits":"1861"}
+{"parsed_date":"2017-02-15 00:00:00 UTC","id":"2","total_visits":"2629"}
+{"parsed_date":"2017-07-05 00:00:00 UTC","id":"2","total_visits":"2885"}
+{"parsed_date":"2016-10-17 00:00:00 UTC","id":"2","total_visits":"3397"}
+{"parsed_date":"2017-02-20 00:00:00 UTC","id":"2","total_visits":"2374"}
+{"parsed_date":"2017-03-24 00:00:00 UTC","id":"2","total_visits":"2374"}
+{"parsed_date":"2017-04-20 00:00:00 UTC","id":"2","total_visits":"2374"}
+{"parsed_date":"2016-11-18 00:00:00 UTC","id":"2","total_visits":"3654"}
+{"parsed_date":"2017-07-25 00:00:00 UTC","id":"2","total_visits":"2631"}
+{"parsed_date":"2016-11-13 00:00:00 UTC","id":"2","total_visits":"3144"}
+{"parsed_date":"2017-03-18 00:00:00 UTC","id":"2","total_visits":"1610"}
+{"parsed_date":"2016-08-03 00:00:00 UTC","id":"2","total_visits":"2890"}
+{"parsed_date":"2016-08-19 00:00:00 UTC","id":"2","total_visits":"2379"}
+{"parsed_date":"2017-02-14 00:00:00 UTC","id":"2","total_visits":"2379"}
+{"parsed_date":"2017-07-11 00:00:00 UTC","id":"2","total_visits":"2635"}
+{"parsed_date":"2017-04-22 00:00:00 UTC","id":"2","total_visits":"1612"}
+{"parsed_date":"2016-10-07 00:00:00 UTC","id":"2","total_visits":"2892"}
+{"parsed_date":"2016-09-05 00:00:00 UTC","id":"2","total_visits":"2125"}
+{"parsed_date":"2016-09-23 00:00:00 UTC","id":"2","total_visits":"2381"}
+{"parsed_date":"2016-11-15 00:00:00 UTC","id":"2","total_visits":"4685"}
+{"parsed_date":"2017-01-28 00:00:00 UTC","id":"2","total_visits":"1614"}
+{"parsed_date":"2017-07-14 00:00:00 UTC","id":"2","total_visits":"2382"}
+{"parsed_date":"2017-01-07 00:00:00 UTC","id":"2","total_visits":"1615"}
+{"parsed_date":"2017-04-03 00:00:00 UTC","id":"2","total_visits":"2383"}
+{"parsed_date":"2017-03-20 00:00:00 UTC","id":"2","total_visits":"2383"}
+{"parsed_date":"2016-12-18 00:00:00 UTC","id":"2","total_visits":"2128"}
+{"parsed_date":"2017-03-17 00:00:00 UTC","id":"2","total_visits":"2129"}
+{"parsed_date":"2017-05-23 00:00:00 UTC","id":"2","total_visits":"2129"}
+{"parsed_date":"2016-11-30 00:00:00 UTC","id":"2","total_visits":"4435"}
+{"parsed_date":"2017-01-01 00:00:00 UTC","id":"2","total_visits":"1364"}
+{"parsed_date":"2017-01-02 00:00:00 UTC","id":"2","total_visits":"1620"}
+{"parsed_date":"2016-09-25 00:00:00 UTC","id":"2","total_visits":"1877"}
+{"parsed_date":"2016-08-07 00:00:00 UTC","id":"2","total_visits":"1622"}
+{"parsed_date":"2016-10-09 00:00:00 UTC","id":"2","total_visits":"2134"}
+{"parsed_date":"2017-03-01 00:00:00 UTC","id":"2","total_visits":"2390"}
+{"parsed_date":"2017-01-04 00:00:00 UTC","id":"2","total_visits":"2390"}
+{"parsed_date":"2017-06-06 00:00:00 UTC","id":"2","total_visits":"2391"}
+{"parsed_date":"2017-04-18 00:00:00 UTC","id":"2","total_visits":"2391"}
+{"parsed_date":"2017-04-06 00:00:00 UTC","id":"2","total_visits":"2647"}
+{"parsed_date":"2017-01-30 00:00:00 UTC","id":"2","total_visits":"2392"}
+{"parsed_date":"2016-10-16 00:00:00 UTC","id":"2","total_visits":"2649"}
+{"parsed_date":"2016-08-04 00:00:00 UTC","id":"2","total_visits":"3161"}
+{"parsed_date":"2016-10-21 00:00:00 UTC","id":"2","total_visits":"3419"}
+{"parsed_date":"2016-08-02 00:00:00 UTC","id":"2","total_visits":"2140"}
+{"parsed_date":"2017-03-06 00:00:00 UTC","id":"2","total_visits":"2396"}
+{"parsed_date":"2016-09-13 00:00:00 UTC","id":"2","total_visits":"2396"}
+{"parsed_date":"2016-09-14 00:00:00 UTC","id":"2","total_visits":"2652"}
+{"parsed_date":"2017-04-19 00:00:00 UTC","id":"2","total_visits":"2397"}
+{"parsed_date":"2017-06-19 00:00:00 UTC","id":"2","total_visits":"2142"}
+{"parsed_date":"2016-12-13 00:00:00 UTC","id":"2","total_visits":"3166"}
+{"parsed_date":"2017-06-20 00:00:00 UTC","id":"2","total_visits":"2143"}
+{"parsed_date":"2016-10-10 00:00:00 UTC","id":"2","total_visits":"2911"}
+{"parsed_date":"2017-07-06 00:00:00 UTC","id":"2","total_visits":"2658"}
+{"parsed_date":"2017-01-03 00:00:00 UTC","id":"2","total_visits":"2403"}
+{"parsed_date":"2017-01-08 00:00:00 UTC","id":"2","total_visits":"1637"}
+{"parsed_date":"2017-02-25 00:00:00 UTC","id":"2","total_visits":"1638"}
+{"parsed_date":"2017-05-24 00:00:00 UTC","id":"2","total_visits":"2406"}
+{"parsed_date":"2016-11-22 00:00:00 UTC","id":"2","total_visits":"3942"}
+{"parsed_date":"2017-05-06 00:00:00 UTC","id":"2","total_visits":"1383"}
+{"parsed_date":"2017-07-02 00:00:00 UTC","id":"2","total_visits":"1895"}
+{"parsed_date":"2016-12-01 00:00:00 UTC","id":"2","total_visits":"4200"}
+{"parsed_date":"2017-03-16 00:00:00 UTC","id":"2","total_visits":"2409"}
+{"parsed_date":"2016-12-12 00:00:00 UTC","id":"2","total_visits":"3433"}
+{"parsed_date":"2016-12-25 00:00:00 UTC","id":"2","total_visits":"1386"}
+{"parsed_date":"2017-02-26 00:00:00 UTC","id":"2","total_visits":"1643"}
+{"parsed_date":"2017-04-28 00:00:00 UTC","id":"2","total_visits":"2411"}
+{"parsed_date":"2016-08-11 00:00:00 UTC","id":"2","total_visits":"2667"}
+{"parsed_date":"2017-07-20 00:00:00 UTC","id":"2","total_visits":"2668"}
+{"parsed_date":"2017-05-21 00:00:00 UTC","id":"2","total_visits":"1645"}
+{"parsed_date":"2017-06-17 00:00:00 UTC","id":"2","total_visits":"1391"}
+{"parsed_date":"2016-12-29 00:00:00 UTC","id":"2","total_visits":"1647"}
+{"parsed_date":"2017-07-17 00:00:00 UTC","id":"2","total_visits":"2671"}
+{"parsed_date":"2017-01-16 00:00:00 UTC","id":"2","total_visits":"1906"}
+{"parsed_date":"2017-03-03 00:00:00 UTC","id":"2","total_visits":"2162"}
+{"parsed_date":"2016-11-14 00:00:00 UTC","id":"2","total_visits":"4466"}
+{"parsed_date":"2016-08-30 00:00:00 UTC","id":"2","total_visits":"2675"}
+{"parsed_date":"2016-08-27 00:00:00 UTC","id":"2","total_visits":"1654"}
+{"parsed_date":"2017-02-09 00:00:00 UTC","id":"2","total_visits":"2678"}
+{"parsed_date":"2017-06-03 00:00:00 UTC","id":"2","total_visits":"1399"}
+{"parsed_date":"2017-05-07 00:00:00 UTC","id":"2","total_visits":"1400"}
+{"parsed_date":"2016-11-02 00:00:00 UTC","id":"2","total_visits":"3960"}
+{"parsed_date":"2016-12-15 00:00:00 UTC","id":"2","total_visits":"2937"}
+{"parsed_date":"2017-04-01 00:00:00 UTC","id":"2","total_visits":"2170"}
+{"parsed_date":"2017-07-21 00:00:00 UTC","id":"2","total_visits":"2427"}
+{"parsed_date":"2016-08-06 00:00:00 UTC","id":"2","total_visits":"1663"}
+{"parsed_date":"2016-09-01 00:00:00 UTC","id":"2","total_visits":"2687"}
+{"parsed_date":"2017-06-28 00:00:00 UTC","id":"2","total_visits":"2687"}
+{"parsed_date":"2016-08-20 00:00:00 UTC","id":"2","total_visits":"1664"}
+{"parsed_date":"2017-04-26 00:00:00 UTC","id":"2","total_visits":"4224"}
+{"parsed_date":"2017-07-09 00:00:00 UTC","id":"2","total_visits":"1921"}
+{"parsed_date":"2017-07-28 00:00:00 UTC","id":"2","total_visits":"2433"}
+{"parsed_date":"2016-09-19 00:00:00 UTC","id":"2","total_visits":"2689"}
+{"parsed_date":"2017-07-24 00:00:00 UTC","id":"2","total_visits":"2436"}
+{"parsed_date":"2017-06-13 00:00:00 UTC","id":"2","total_visits":"2181"}
+{"parsed_date":"2016-09-15 00:00:00 UTC","id":"2","total_visits":"2949"}
+{"parsed_date":"2017-02-03 00:00:00 UTC","id":"2","total_visits":"2182"}
+{"parsed_date":"2016-09-10 00:00:00 UTC","id":"2","total_visits":"1671"}
+{"parsed_date":"2017-06-09 00:00:00 UTC","id":"2","total_visits":"1927"}
+{"parsed_date":"2017-01-11 00:00:00 UTC","id":"2","total_visits":"2185"}
+{"parsed_date":"2017-02-19 00:00:00 UTC","id":"2","total_visits":"2187"}
+{"parsed_date":"2017-01-17 00:00:00 UTC","id":"2","total_visits":"2443"}
+{"parsed_date":"2017-05-12 00:00:00 UTC","id":"2","total_visits":"1932"}
+{"parsed_date":"2016-12-16 00:00:00 UTC","id":"2","total_visits":"2956"}
+{"parsed_date":"2017-02-01 00:00:00 UTC","id":"2","total_visits":"2445"}
+{"parsed_date":"2016-11-26 00:00:00 UTC","id":"2","total_visits":"3213"}
+{"parsed_date":"2017-06-02 00:00:00 UTC","id":"2","total_visits":"2190"}
+{"parsed_date":"2016-08-05 00:00:00 UTC","id":"2","total_visits":"2702"}
+{"parsed_date":"2016-11-01 00:00:00 UTC","id":"2","total_visits":"3728"}
+{"parsed_date":"2017-01-05 00:00:00 UTC","id":"2","total_visits":"2193"}
+{"parsed_date":"2017-03-08 00:00:00 UTC","id":"2","total_visits":"2449"}
+{"parsed_date":"2016-08-28 00:00:00 UTC","id":"2","total_visits":"1682"}
+{"parsed_date":"2017-07-04 00:00:00 UTC","id":"2","total_visits":"1938"}
+{"parsed_date":"2017-03-10 00:00:00 UTC","id":"2","total_visits":"2194"}
+{"parsed_date":"2017-07-07 00:00:00 UTC","id":"2","total_visits":"2450"}
+{"parsed_date":"2016-10-29 00:00:00 UTC","id":"2","total_visits":"2964"}
+{"parsed_date":"2016-10-13 00:00:00 UTC","id":"2","total_visits":"2964"}
+{"parsed_date":"2016-12-04 00:00:00 UTC","id":"2","total_visits":"3220"}
+{"parsed_date":"2017-01-21 00:00:00 UTC","id":"2","total_visits":"1685"}
+{"parsed_date":"2017-06-29 00:00:00 UTC","id":"2","total_visits":"2709"}
+{"parsed_date":"2016-08-29 00:00:00 UTC","id":"2","total_visits":"2454"}
+{"parsed_date":"2016-12-19 00:00:00 UTC","id":"2","total_visits":"3222"}
+{"parsed_date":"2017-05-30 00:00:00 UTC","id":"2","total_visits":"2199"}
+{"parsed_date":"2017-02-10 00:00:00 UTC","id":"2","total_visits":"2199"}
+{"parsed_date":"2016-08-31 00:00:00 UTC","id":"2","total_visits":"3223"}
+{"parsed_date":"2017-06-18 00:00:00 UTC","id":"2","total_visits":"1432"}
+{"parsed_date":"2017-01-12 00:00:00 UTC","id":"2","total_visits":"2203"}
+{"parsed_date":"2017-05-18 00:00:00 UTC","id":"2","total_visits":"2715"}
+{"parsed_date":"2016-10-23 00:00:00 UTC","id":"2","total_visits":"2971"}
+{"parsed_date":"2016-09-04 00:00:00 UTC","id":"2","total_visits":"1692"}
+{"parsed_date":"2016-12-10 00:00:00 UTC","id":"2","total_visits":"2207"}
+{"parsed_date":"2016-12-11 00:00:00 UTC","id":"2","total_visits":"2208"}
+{"parsed_date":"2017-04-11 00:00:00 UTC","id":"2","total_visits":"2464"}
+{"parsed_date":"2016-09-21 00:00:00 UTC","id":"2","total_visits":"2720"}
+{"parsed_date":"2016-11-06 00:00:00 UTC","id":"2","total_visits":"3232"}
+{"parsed_date":"2017-01-26 00:00:00 UTC","id":"2","total_visits":"2209"}
+{"parsed_date":"2016-09-12 00:00:00 UTC","id":"2","total_visits":"2465"}
+{"parsed_date":"2017-04-21 00:00:00 UTC","id":"2","total_visits":"2210"}
+{"parsed_date":"2017-01-06 00:00:00 UTC","id":"2","total_visits":"2210"}
+{"parsed_date":"2017-04-04 00:00:00 UTC","id":"2","total_visits":"2978"}
+{"parsed_date":"2017-01-22 00:00:00 UTC","id":"2","total_visits":"1700"}
+{"parsed_date":"2017-07-26 00:00:00 UTC","id":"2","total_visits":"2725"}
+{"parsed_date":"2016-08-18 00:00:00 UTC","id":"2","total_visits":"2725"}
+{"parsed_date":"2016-09-27 00:00:00 UTC","id":"2","total_visits":"2727"}
+{"parsed_date":"2016-12-02 00:00:00 UTC","id":"2","total_visits":"3751"}
+{"parsed_date":"2017-05-05 00:00:00 UTC","id":"2","total_visits":"1960"}
+{"parsed_date":"2016-11-19 00:00:00 UTC","id":"2","total_visits":"2984"}
+{"parsed_date":"2016-11-09 00:00:00 UTC","id":"2","total_visits":"3752"}
+{"parsed_date":"2016-12-05 00:00:00 UTC","id":"2","total_visits":"4265"}
+{"parsed_date":"2017-05-11 00:00:00 UTC","id":"2","total_visits":"2218"}
+{"parsed_date":"2017-01-25 00:00:00 UTC","id":"2","total_visits":"2986"}
+{"parsed_date":"2017-03-11 00:00:00 UTC","id":"2","total_visits":"1707"}
+{"parsed_date":"2017-03-30 00:00:00 UTC","id":"2","total_visits":"2731"}
+{"parsed_date":"2016-10-20 00:00:00 UTC","id":"2","total_visits":"3755"}
+{"parsed_date":"2017-02-07 00:00:00 UTC","id":"2","total_visits":"2476"}
+{"parsed_date":"2017-02-22 00:00:00 UTC","id":"2","total_visits":"2477"}
+{"parsed_date":"2017-07-23 00:00:00 UTC","id":"2","total_visits":"1966"}
+{"parsed_date":"2016-11-03 00:00:00 UTC","id":"2","total_visits":"4014"}
+{"parsed_date":"2016-08-01 00:00:00 UTC","id":"2","total_visits":"1711"}
+{"parsed_date":"2017-01-13 00:00:00 UTC","id":"2","total_visits":"1967"}
+{"parsed_date":"2017-05-19 00:00:00 UTC","id":"2","total_visits":"2223"}
+{"parsed_date":"2016-11-20 00:00:00 UTC","id":"2","total_visits":"3247"}
+{"parsed_date":"2016-11-25 00:00:00 UTC","id":"2","total_visits":"3759"}
+{"parsed_date":"2017-03-25 00:00:00 UTC","id":"2","total_visits":"1712"}
+{"parsed_date":"2017-01-27 00:00:00 UTC","id":"2","total_visits":"1969"}
+{"parsed_date":"2017-06-26 00:00:00 UTC","id":"2","total_visits":"2226"}
+{"parsed_date":"2017-05-25 00:00:00 UTC","id":"2","total_visits":"2228"}
+{"parsed_date":"2017-01-31 00:00:00 UTC","id":"2","total_visits":"2229"}
+{"parsed_date":"2017-07-13 00:00:00 UTC","id":"2","total_visits":"2741"}
+{"parsed_date":"2017-03-15 00:00:00 UTC","id":"2","total_visits":"2486"}
+{"parsed_date":"2017-05-28 00:00:00 UTC","id":"2","total_visits":"1463"}
+{"parsed_date":"2017-03-09 00:00:00 UTC","id":"2","total_visits":"2231"}
+{"parsed_date":"2017-07-15 00:00:00 UTC","id":"2","total_visits":"1721"}
+{"parsed_date":"2016-11-24 00:00:00 UTC","id":"2","total_visits":"3770"}
+{"parsed_date":"2016-10-05 00:00:00 UTC","id":"2","total_visits":"3770"}
+{"parsed_date":"2016-12-31 00:00:00 UTC","id":"2","total_visits":"1211"}
+{"parsed_date":"2016-10-02 00:00:00 UTC","id":"2","total_visits":"1724"}
+{"parsed_date":"2017-07-22 00:00:00 UTC","id":"2","total_visits":"1724"}
+{"parsed_date":"2016-09-11 00:00:00 UTC","id":"2","total_visits":"1725"}
+{"parsed_date":"2017-06-15 00:00:00 UTC","id":"2","total_visits":"2237"}
+{"parsed_date":"2017-06-05 00:00:00 UTC","id":"2","total_visits":"2493"}
+{"parsed_date":"2017-02-06 00:00:00 UTC","id":"2","total_visits":"2238"}
+{"parsed_date":"2016-10-15 00:00:00 UTC","id":"2","total_visits":"2495"}
+{"parsed_date":"2016-08-21 00:00:00 UTC","id":"2","total_visits":"1730"}
+{"parsed_date":"2016-08-23 00:00:00 UTC","id":"2","total_visits":"2754"}
+{"parsed_date":"2017-06-30 00:00:00 UTC","id":"2","total_visits":"2499"}
+{"parsed_date":"2017-01-18 00:00:00 UTC","id":"2","total_visits":"2245"}
+{"parsed_date":"2016-08-10 00:00:00 UTC","id":"2","total_visits":"2757"}
+{"parsed_date":"2016-12-08 00:00:00 UTC","id":"2","total_visits":"3013"}
+{"parsed_date":"2016-11-28 00:00:00 UTC","id":"2","total_visits":"4807"}
+{"parsed_date":"2017-05-22 00:00:00 UTC","id":"2","total_visits":"2248"}
+{"parsed_date":"2016-09-20 00:00:00 UTC","id":"2","total_visits":"2760"}
+{"parsed_date":"2016-10-06 00:00:00 UTC","id":"2","total_visits":"3016"}
+{"parsed_date":"2016-09-06 00:00:00 UTC","id":"2","total_visits":"2508"}
+{"parsed_date":"2016-09-03 00:00:00 UTC","id":"2","total_visits":"1741"}
+{"parsed_date":"2016-12-06 00:00:00 UTC","id":"2","total_visits":"3021"}
+{"parsed_date":"2016-12-24 00:00:00 UTC","id":"2","total_visits":"1231"}
+{"parsed_date":"2016-10-28 00:00:00 UTC","id":"2","total_visits":"3791"}
+{"parsed_date":"2016-12-30 00:00:00 UTC","id":"2","total_visits":"1232"}
+{"parsed_date":"2017-05-29 00:00:00 UTC","id":"2","total_visits":"1745"}
+{"parsed_date":"2017-07-10 00:00:00 UTC","id":"2","total_visits":"2769"}
+{"parsed_date":"2017-06-22 00:00:00 UTC","id":"2","total_visits":"2258"}
+{"parsed_date":"2017-07-19 00:00:00 UTC","id":"2","total_visits":"2514"}
+{"parsed_date":"2016-10-03 00:00:00 UTC","id":"2","total_visits":"2514"}
+{"parsed_date":"2017-06-14 00:00:00 UTC","id":"2","total_visits":"2517"}
+{"parsed_date":"2016-10-22 00:00:00 UTC","id":"2","total_visits":"3029"}
+{"parsed_date":"2017-01-23 00:00:00 UTC","id":"2","total_visits":"2262"}
+{"parsed_date":"2017-04-24 00:00:00 UTC","id":"2","total_visits":"2263"}
+{"parsed_date":"2016-11-10 00:00:00 UTC","id":"2","total_visits":"4055"}
+{"parsed_date":"2016-09-26 00:00:00 UTC","id":"2","total_visits":"2776"}
+{"parsed_date":"2016-10-19 00:00:00 UTC","id":"2","total_visits":"3544"}
+{"parsed_date":"2017-03-04 00:00:00 UTC","id":"2","total_visits":"1753"}
+{"parsed_date":"2017-05-26 00:00:00 UTC","id":"2","total_visits":"2009"}
+{"parsed_date":"2017-02-13 00:00:00 UTC","id":"2","total_visits":"2266"}
+{"parsed_date":"2017-02-18 00:00:00 UTC","id":"2","total_visits":"1755"}
+{"parsed_date":"2017-03-02 00:00:00 UTC","id":"2","total_visits":"2267"}
+{"parsed_date":"2017-03-31 00:00:00 UTC","id":"2","total_visits":"2268"}
+{"parsed_date":"2017-01-10 00:00:00 UTC","id":"2","total_visits":"2268"}
+{"parsed_date":"2017-03-29 00:00:00 UTC","id":"2","total_visits":"2525"}
+{"parsed_date":"2017-03-27 00:00:00 UTC","id":"2","total_visits":"2525"}
+{"parsed_date":"2016-11-23 00:00:00 UTC","id":"2","total_visits":"3805"}
+{"parsed_date":"2017-05-27 00:00:00 UTC","id":"2","total_visits":"1502"}
+{"parsed_date":"2016-10-24 00:00:00 UTC","id":"2","total_visits":"4063"}
+{"parsed_date":"2016-12-14 00:00:00 UTC","id":"2","total_visits":"3040"}
+{"parsed_date":"2017-02-11 00:00:00 UTC","id":"2","total_visits":"1761"}
+{"parsed_date":"2017-07-27 00:00:00 UTC","id":"2","total_visits":"2529"}
+{"parsed_date":"2017-02-17 00:00:00 UTC","id":"2","total_visits":"2785"}
+{"parsed_date":"2017-04-15 00:00:00 UTC","id":"2","total_visits":"1506"}
+{"parsed_date":"2016-11-05 00:00:00 UTC","id":"2","total_visits":"3042"}
+{"parsed_date":"2016-10-04 00:00:00 UTC","id":"2","total_visits":"4322"}
+{"parsed_date":"2017-05-13 00:00:00 UTC","id":"2","total_visits":"1251"}
+{"parsed_date":"2017-04-16 00:00:00 UTC","id":"2","total_visits":"1507"}
+{"parsed_date":"2016-12-28 00:00:00 UTC","id":"2","total_visits":"1763"}
+{"parsed_date":"2016-08-15 00:00:00 UTC","id":"2","total_visits":"3043"}
+{"parsed_date":"2016-12-03 00:00:00 UTC","id":"2","total_visits":"3044"}
+{"parsed_date":"2017-06-27 00:00:00 UTC","id":"2","total_visits":"2789"}
+{"parsed_date":"2017-06-24 00:00:00 UTC","id":"2","total_visits":"1510"}
+{"parsed_date":"2017-07-16 00:00:00 UTC","id":"2","total_visits":"1766"}
+{"parsed_date":"2017-04-09 00:00:00 UTC","id":"2","total_visits":"1766"}
+{"parsed_date":"2017-06-07 00:00:00 UTC","id":"2","total_visits":"2279"}
+{"parsed_date":"2017-04-17 00:00:00 UTC","id":"2","total_visits":"2279"}
+{"parsed_date":"2016-09-28 00:00:00 UTC","id":"2","total_visits":"2535"}
+{"parsed_date":"2017-03-26 00:00:00 UTC","id":"2","total_visits":"1768"}
+{"parsed_date":"2017-05-10 00:00:00 UTC","id":"2","total_visits":"2024"}
+{"parsed_date":"2017-06-08 00:00:00 UTC","id":"2","total_visits":"2280"}
+{"parsed_date":"2017-05-08 00:00:00 UTC","id":"2","total_visits":"2025"}
+{"parsed_date":"2017-03-13 00:00:00 UTC","id":"2","total_visits":"2537"}
+{"parsed_date":"2016-11-17 00:00:00 UTC","id":"2","total_visits":"4074"}
+{"parsed_date":"2016-08-25 00:00:00 UTC","id":"2","total_visits":"2539"}
+{"parsed_date":"2017-02-16 00:00:00 UTC","id":"2","total_visits":"2539"}
+{"parsed_date":"2017-06-16 00:00:00 UTC","id":"2","total_visits":"2028"}
+{"parsed_date":"2016-11-16 00:00:00 UTC","id":"2","total_visits":"4334"}
+{"parsed_date":"2016-08-17 00:00:00 UTC","id":"2","total_visits":"2800"}
+{"parsed_date":"2017-03-19 00:00:00 UTC","id":"2","total_visits":"1776"}
+{"parsed_date":"2016-11-29 00:00:00 UTC","id":"2","total_visits":"4337"}
+{"parsed_date":"2017-02-05 00:00:00 UTC","id":"2","total_visits":"1522"}
+{"parsed_date":"2016-10-31 00:00:00 UTC","id":"2","total_visits":"3827"}
+{"parsed_date":"2017-05-31 00:00:00 UTC","id":"2","total_visits":"2292"}
+{"parsed_date":"2017-07-18 00:00:00 UTC","id":"2","total_visits":"2804"}
+{"parsed_date":"2017-03-12 00:00:00 UTC","id":"2","total_visits":"1781"}
+{"parsed_date":"2016-09-09 00:00:00 UTC","id":"2","total_visits":"2549"}
+{"parsed_date":"2017-01-14 00:00:00 UTC","id":"2","total_visits":"1526"}
+{"parsed_date":"2017-05-04 00:00:00 UTC","id":"2","total_visits":"2806"}
+{"parsed_date":"2016-11-07 00:00:00 UTC","id":"2","total_visits":"3832"}
+{"parsed_date":"2017-04-07 00:00:00 UTC","id":"2","total_visits":"2297"}
+{"parsed_date":"2017-07-12 00:00:00 UTC","id":"2","total_visits":"2554"}
+{"parsed_date":"2017-04-13 00:00:00 UTC","id":"2","total_visits":"2300"}
+{"parsed_date":"2017-08-01 00:00:00 UTC","id":"2","total_visits":"2556"}
+{"parsed_date":"2017-06-04 00:00:00 UTC","id":"2","total_visits":"1534"}
+{"parsed_date":"2017-02-12 00:00:00 UTC","id":"2","total_visits":"1790"}
+{"parsed_date":"2017-07-03 00:00:00 UTC","id":"2","total_visits":"2046"}
+{"parsed_date":"2016-09-30 00:00:00 UTC","id":"2","total_visits":"2303"}
+{"parsed_date":"2016-08-08 00:00:00 UTC","id":"2","total_visits":"2815"}
diff --git a/tests/data/time_series_schema.json b/tests/data/time_series_schema.json
index 857595b9e6..35473dc0e3 100644
--- a/tests/data/time_series_schema.json
+++ b/tests/data/time_series_schema.json
@@ -4,6 +4,11 @@
"name": "parsed_date",
"type": "TIMESTAMP"
},
+ {
+ "mode": "NULLABLE",
+ "name": "id",
+ "type": "STRING"
+ },
{
"mode": "NULLABLE",
"name": "total_visits",
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
index 251b9da4ac..29234bc4ef 100644
--- a/tests/system/conftest.py
+++ b/tests/system/conftest.py
@@ -772,6 +772,31 @@ def new_time_series_df(session, new_time_series_pandas_df):
return session.read_pandas(new_time_series_pandas_df)
+@pytest.fixture(scope="session")
+def new_time_series_pandas_df_w_id():
+ """Additional data matching the time series dataset. The values are dummy ones used to basically check the prediction scores."""
+ utc = pytz.utc
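+    # Two observations per day, one per series id ("1" and "2"); both series
+    # share the same dummy visit counts.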
+ return pd.DataFrame(
+ {
+ "parsed_date": [
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ ],
+ "id": ["1", "2", "1", "2", "1", "2"],
+ "total_visits": [2500, 2500, 2500, 2500, 2500, 2500],
+ }
+ )
+
+
+@pytest.fixture(scope="session")
+def new_time_series_df_w_id(session, new_time_series_pandas_df_w_id):
+ return session.read_pandas(new_time_series_pandas_df_w_id)
+
+
@pytest.fixture(scope="session")
def penguins_pandas_df_default_index() -> pd.DataFrame:
"""Consistently ordered pandas dataframe for penguins test data"""
@@ -1015,12 +1040,34 @@ def penguins_xgbregressor_model_name(
return model_name
+def _get_or_create_arima_plus_model(
+ session: bigframes.Session, dataset_id_permanent, sql
+) -> str:
+ """Internal helper to compute a model name by hasing the given SQL.
+ attempst to retreive the model, create it if not exist.
+ retursn the fully qualitifed model"""
+
+ # We use the SQL hash as the name to ensure the model is regenerated if this fixture is edited
+ model_name = f"{dataset_id_permanent}.time_series_arima_plus_{hashlib.md5(sql.encode()).hexdigest()}"
+ sql = sql.replace("$model_name", model_name)
+ try:
+ session.bqclient.get_model(model_name)
+ except google.cloud.exceptions.NotFound:
+ logging.info(
+ "time_series_arima_plus_model fixture was not found in the permanent dataset, regenerating it..."
+ )
+ session.bqclient.query(sql).result()
+ finally:
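+        # Returning from ``finally`` hands back the name on both paths: after
+        # a cache hit and after the CREATE MODEL query completes.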
+ return model_name
+
+
@pytest.fixture(scope="session")
def time_series_arima_plus_model_name(
session: bigframes.Session, dataset_id_permanent, time_series_table_id
) -> str:
"""Provides a pretrained model as a test fixture that is cached across test runs.
- This lets us run system tests without having to wait for a model.fit(...)"""
+ This lets us run system tests without having to wait for a model.fit(...).
+ This version does not include time_series_id_col."""
sql = f"""
CREATE OR REPLACE MODEL `$model_name`
OPTIONS (
@@ -1028,21 +1075,30 @@ def time_series_arima_plus_model_name(
time_series_timestamp_col = 'parsed_date',
time_series_data_col = 'total_visits'
) AS SELECT
- *
+ parsed_date,
+ total_visits
FROM `{time_series_table_id}`"""
- # We use the SQL hash as the name to ensure the model is regenerated if this fixture is edited
- model_name = f"{dataset_id_permanent}.time_series_arima_plus_{hashlib.md5(sql.encode()).hexdigest()}"
- sql = sql.replace("$model_name", model_name)
+ return _get_or_create_arima_plus_model(session, dataset_id_permanent, sql)
- try:
- session.bqclient.get_model(model_name)
- except google.cloud.exceptions.NotFound:
- logging.info(
- "time_series_arima_plus_model fixture was not found in the permanent dataset, regenerating it..."
- )
- session.bqclient.query(sql).result()
- finally:
- return model_name
+
+@pytest.fixture(scope="session")
+def time_series_arima_plus_model_name_w_id(
+ session: bigframes.Session, dataset_id_permanent, time_series_table_id
+) -> str:
+ """Provides a pretrained model as a test fixture that is cached across test runs.
+ This lets us run system tests without having to wait for a model.fit(...).
+ This version includes time_series_id_col."""
+ sql = f"""
+CREATE OR REPLACE MODEL `$model_name`
+OPTIONS (
+ model_type='ARIMA_PLUS',
+ time_series_timestamp_col = 'parsed_date',
+ time_series_data_col = 'total_visits',
+ time_series_id_col = 'id'
+) AS SELECT
+ *
+FROM `{time_series_table_id}`"""
+ return _get_or_create_arima_plus_model(session, dataset_id_permanent, sql)
@pytest.fixture(scope="session")
diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/functions/test_remote_function.py
similarity index 99%
rename from tests/system/large/test_remote_function.py
rename to tests/system/large/functions/test_remote_function.py
index f226143b50..54ba0549a0 100644
--- a/tests/system/large/test_remote_function.py
+++ b/tests/system/large/functions/test_remote_function.py
@@ -18,6 +18,7 @@
import math # must keep this at top level to test udf referring global import
import os.path
import shutil
+import sys
import tempfile
import textwrap
@@ -47,6 +48,12 @@
_team_euler = "Team Euler"
+pytestmark = pytest.mark.skipif(
+ sys.version_info >= (3, 13),
+ reason="Runtime 'python313' is not supported yet. Skip for now.",
+)
+
+
def cleanup_remote_function_assets(
bigquery_client, cloudfunctions_client, remote_udf, ignore_failures=True
):
diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py
index bb53305b94..7c070fd200 100644
--- a/tests/system/large/ml/test_forecasting.py
+++ b/tests/system/large/ml/test_forecasting.py
@@ -33,38 +33,65 @@
]
-@pytest.fixture(scope="module")
-def arima_model(time_series_df_default_index):
+def _fit_arima_model(time_series_df_default_index):
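+    # Despite the name, this helper only builds the estimator and the train
+    # splits; callers invoke ``model.fit`` themselves.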
model = forecasting.ARIMAPlus()
X_train = time_series_df_default_index["parsed_date"]
y_train = time_series_df_default_index[["total_visits"]]
+ return model, X_train, y_train
+
+
+@pytest.fixture(scope="module")
+def arima_model(time_series_df_default_index):
+ model, X_train, y_train = _fit_arima_model(time_series_df_default_index)
model.fit(X_train, y_train)
return model
+@pytest.fixture(scope="module")
+def arima_model_w_id(time_series_df_default_index):
+ model, X_train, y_train = _fit_arima_model(time_series_df_default_index)
+ id_cols = time_series_df_default_index[["id"]]
+ model.fit(X_train, y_train, id_col=id_cols)
+ return model
+
+
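+# Each parametrized test below runs twice: once against the plain model
+# (id_col_name=None) and once against the model trained with a series id.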
+@pytest.mark.parametrize("id_col_name", [None, "id"])
def test_arima_plus_model_fit_score(
dataset_id,
new_time_series_df,
+ new_time_series_df_w_id,
arima_model,
+ arima_model_w_id,
+ id_col_name,
):
-
- result = arima_model.score(
- new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]]
- ).to_pandas()
+ curr_model = arima_model_w_id if id_col_name else arima_model
+ if id_col_name:
+ result = curr_model.score(
+ new_time_series_df_w_id[["parsed_date"]],
+ new_time_series_df_w_id[["total_visits"]],
+ id_col=new_time_series_df_w_id[[id_col_name]],
+ ).to_pandas()
+ else:
+ result = curr_model.score(
+ new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]]
+ ).to_pandas()
+ expected_columns = [
+ "mean_absolute_error",
+ "mean_squared_error",
+ "root_mean_squared_error",
+ "mean_absolute_percentage_error",
+ "symmetric_mean_absolute_percentage_error",
+ ]
+ if id_col_name:
+ expected_columns.insert(0, id_col_name)
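+    # With the id column the model is scored per series, so the result has two
+    # rows instead of one.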
utils.check_pandas_df_schema_and_index(
result,
- columns=[
- "mean_absolute_error",
- "mean_squared_error",
- "root_mean_squared_error",
- "mean_absolute_percentage_error",
- "symmetric_mean_absolute_percentage_error",
- ],
- index=1,
+ columns=expected_columns,
+ index=2 if id_col_name else 1,
)
# save, load to ensure configuration was kept
- reloaded_model = arima_model.to_gbq(
+ reloaded_model = curr_model.to_gbq(
f"{dataset_id}.temp_arima_plus_model", replace=True
)
assert (
@@ -72,14 +99,22 @@ def test_arima_plus_model_fit_score(
)
-def test_arima_plus_model_fit_summary(dataset_id, arima_model):
- result = arima_model.summary().to_pandas()
+@pytest.mark.parametrize("id_col_name", [None, "id"])
+def test_arima_plus_model_fit_summary(
+ dataset_id, arima_model, arima_model_w_id, id_col_name
+):
+ curr_model = arima_model_w_id if id_col_name else arima_model
+ result = curr_model.summary().to_pandas()
+ expected_columns = (
+ [id_col_name] + ARIMA_EVALUATE_OUTPUT_COL
+ if id_col_name
+ else ARIMA_EVALUATE_OUTPUT_COL
+ )
utils.check_pandas_df_schema_and_index(
- result, columns=ARIMA_EVALUATE_OUTPUT_COL, index=1
+ result, columns=expected_columns, index=2 if id_col_name else 1
)
-
# save, load to ensure configuration was kept
- reloaded_model = arima_model.to_gbq(
+ reloaded_model = curr_model.to_gbq(
f"{dataset_id}.temp_arima_plus_model", replace=True
)
assert (
@@ -87,17 +122,29 @@ def test_arima_plus_model_fit_summary(dataset_id, arima_model):
)
-def test_arima_coefficients(arima_model):
- result = arima_model.coef_.to_pandas()
+@pytest.mark.parametrize("id_col_name", [None, "id"])
+def test_arima_coefficients(arima_model, arima_model_w_id, id_col_name):
+ result = (
+ arima_model_w_id.coef_.to_pandas()
+ if id_col_name
+ else arima_model.coef_.to_pandas()
+ )
expected_columns = [
"ar_coefficients",
"ma_coefficients",
"intercept_or_drift",
]
- utils.check_pandas_df_schema_and_index(result, columns=expected_columns, index=1)
+ if id_col_name:
+ expected_columns.insert(0, id_col_name)
+ utils.check_pandas_df_schema_and_index(
+ result, columns=expected_columns, index=2 if id_col_name else 1
+ )
-def test_arima_plus_model_fit_params(time_series_df_default_index, dataset_id):
+@pytest.mark.parametrize("id_col_name", [None, "id"])
+def test_arima_plus_model_fit_params(
+ time_series_df_default_index, dataset_id, id_col_name
+):
model = forecasting.ARIMAPlus(
horizon=100,
auto_arima=True,
@@ -115,7 +162,11 @@ def test_arima_plus_model_fit_params(time_series_df_default_index, dataset_id):
X_train = time_series_df_default_index[["parsed_date"]]
y_train = time_series_df_default_index["total_visits"]
- model.fit(X_train, y_train)
+ if id_col_name is None:
+ model.fit(X_train, y_train)
+ else:
+ id_cols = time_series_df_default_index[[id_col_name]]
+ model.fit(X_train, y_train, id_col=id_cols)
# save, load to ensure configuration was kept
reloaded_model = model.to_gbq(f"{dataset_id}.temp_arima_plus_model", replace=True)
diff --git a/tests/system/small/bigquery/test_datetime.py b/tests/system/small/bigquery/test_datetime.py
index 984e75ac10..b839031263 100644
--- a/tests/system/small/bigquery/test_datetime.py
+++ b/tests/system/small/bigquery/test_datetime.py
@@ -15,6 +15,7 @@
import typing
import pandas as pd
+import pytest
from bigframes import bigquery
@@ -32,6 +33,13 @@ def test_unix_seconds(scalars_dfs):
pd.testing.assert_series_equal(actual_res, expected_res)
+def test_unix_seconds_incorrect_input_type_raise_error(scalars_dfs):
+ df, _ = scalars_dfs
+
+ with pytest.raises(TypeError):
+ bigquery.unix_seconds(df["string_col"])
+
+
def test_unix_millis(scalars_dfs):
bigframes_df, pandas_df = scalars_dfs
@@ -45,6 +53,13 @@ def test_unix_millis(scalars_dfs):
pd.testing.assert_series_equal(actual_res, expected_res)
+def test_unix_millis_incorrect_input_type_raise_error(scalars_dfs):
+ df, _ = scalars_dfs
+
+ with pytest.raises(TypeError):
+ bigquery.unix_millis(df["string_col"])
+
+
def test_unix_micros(scalars_dfs):
bigframes_df, pandas_df = scalars_dfs
@@ -58,6 +73,13 @@ def test_unix_micros(scalars_dfs):
pd.testing.assert_series_equal(actual_res, expected_res)
+def test_unix_micros_incorrect_input_type_raise_error(scalars_dfs):
+ df, _ = scalars_dfs
+
+ with pytest.raises(TypeError):
+ bigquery.unix_micros(df["string_col"])
+
+
def _to_unix_epoch(
ts: pd.Timestamp, unit: typing.Literal["s", "ms", "us"]
) -> typing.Optional[int]:
diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/functions/test_remote_function.py
similarity index 100%
rename from tests/system/small/test_remote_function.py
rename to tests/system/small/functions/test_remote_function.py
diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py
index c1643776a5..0e8489c513 100644
--- a/tests/system/small/ml/conftest.py
+++ b/tests/system/small/ml/conftest.py
@@ -304,6 +304,14 @@ def time_series_bqml_arima_plus_model(
return core.BqmlModel(session, model)
+@pytest.fixture(scope="session")
+def time_series_bqml_arima_plus_model_w_id(
+ session, time_series_arima_plus_model_name_w_id
+) -> core.BqmlModel:
+ model = session.bqclient.get_model(time_series_arima_plus_model_name_w_id)
+ return core.BqmlModel(session, model)
+
+
@pytest.fixture(scope="session")
def time_series_arima_plus_model(
session, time_series_arima_plus_model_name
@@ -314,6 +322,16 @@ def time_series_arima_plus_model(
)
+@pytest.fixture(scope="session")
+def time_series_arima_plus_model_w_id(
+ session, time_series_arima_plus_model_name_w_id
+) -> forecasting.ARIMAPlus:
+ return cast(
+ forecasting.ARIMAPlus,
+ session.read_gbq_model(time_series_arima_plus_model_name_w_id),
+ )
+
+
@pytest.fixture(scope="session")
def imported_tensorflow_model_path() -> str:
return "gs://cloud-training-demos/txtclass/export/exporter/1549825580/*"
diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py
index 2a2e68b230..1c2591b90a 100644
--- a/tests/system/small/ml/test_core.py
+++ b/tests/system/small/ml/test_core.py
@@ -410,22 +410,65 @@ def test_model_generate_text(
)
-def test_model_forecast(time_series_bqml_arima_plus_model: core.BqmlModel):
+@pytest.mark.parametrize("id_col_name", [None, "id"])
+def test_model_forecast(
+ time_series_bqml_arima_plus_model: core.BqmlModel,
+ time_series_bqml_arima_plus_model_w_id: core.BqmlModel,
+ id_col_name,
+):
utc = pytz.utc
- forecast = time_series_bqml_arima_plus_model.forecast(
- {"horizon": 4, "confidence_level": 0.8}
- ).to_pandas()[["forecast_timestamp", "forecast_value"]]
- expected = pd.DataFrame(
- {
- "forecast_timestamp": [
- datetime(2017, 8, 2, tzinfo=utc),
- datetime(2017, 8, 3, tzinfo=utc),
- datetime(2017, 8, 4, tzinfo=utc),
- datetime(2017, 8, 5, tzinfo=utc),
- ],
- "forecast_value": [2724.472284, 2593.368389, 2353.613034, 1781.623071],
- }
- )
+ forecast_cols = ["forecast_timestamp", "forecast_value"]
+ if id_col_name:
+ forecast_cols.insert(0, id_col_name)
+
+ forecast = (
+ time_series_bqml_arima_plus_model_w_id.forecast(
+ {"horizon": 4, "confidence_level": 0.8}
+ )
+ if id_col_name
+ else time_series_bqml_arima_plus_model.forecast(
+ {"horizon": 4, "confidence_level": 0.8}
+ )
+ ).to_pandas()[forecast_cols]
+ if id_col_name:
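+        # Ids "1" and "2" hold identical data, so their forecast values repeat
+        # pairwise at every timestamp.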
+ expected = pd.DataFrame(
+ {
+ "id": ["1", "2", "1", "2", "1", "2", "1", "2"],
+ "forecast_timestamp": [
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ datetime(2017, 8, 5, tzinfo=utc),
+ datetime(2017, 8, 5, tzinfo=utc),
+ ],
+ "forecast_value": [
+ 2634.796023,
+ 2634.796023,
+ 2621.332462,
+ 2621.332462,
+ 2396.095463,
+ 2396.095463,
+ 1742.878278,
+ 1742.878278,
+ ],
+ }
+ )
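+        # bigframes returns STRING columns as pyarrow-backed pandas strings.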
+ expected["id"] = expected["id"].astype("string[pyarrow]")
+ else:
+ expected = pd.DataFrame(
+ {
+ "forecast_timestamp": [
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ datetime(2017, 8, 5, tzinfo=utc),
+ ],
+ "forecast_value": [2634.796023, 2621.332462, 2396.095463, 1742.878278],
+ }
+ )
expected["forecast_value"] = expected["forecast_value"].astype(pd.Float64Dtype())
expected["forecast_timestamp"] = expected["forecast_timestamp"].astype(
pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py
index 1b3a650388..d1b6b18fbe 100644
--- a/tests/system/small/ml/test_forecasting.py
+++ b/tests/system/small/ml/test_forecasting.py
@@ -16,6 +16,7 @@
import pandas as pd
import pyarrow as pa
+import pytest
import pytz
from bigframes.ml import forecasting
@@ -35,23 +36,66 @@
]
+@pytest.mark.parametrize("id_col_name", [None, "id"])
def test_arima_plus_predict_default(
time_series_arima_plus_model: forecasting.ARIMAPlus,
+ time_series_arima_plus_model_w_id: forecasting.ARIMAPlus,
+ id_col_name,
):
utc = pytz.utc
- predictions = time_series_arima_plus_model.predict().to_pandas()
- assert predictions.shape == (3, 8)
- result = predictions[["forecast_timestamp", "forecast_value"]]
- expected = pd.DataFrame(
- {
- "forecast_timestamp": [
- datetime(2017, 8, 2, tzinfo=utc),
- datetime(2017, 8, 3, tzinfo=utc),
- datetime(2017, 8, 4, tzinfo=utc),
- ],
- "forecast_value": [2724.472284, 2593.368389, 2353.613034],
- }
+ predictions = (
+ (
+ time_series_arima_plus_model_w_id
+ if id_col_name
+ else time_series_arima_plus_model
+ )
+ .predict()
+ .to_pandas()
)
+ assert predictions.shape == ((6, 9) if id_col_name else (3, 8))
+ result = predictions[["forecast_timestamp", "forecast_value"]]
+ if id_col_name:
+ result["id"] = predictions[["id"]]
+ result = result[["id", "forecast_timestamp", "forecast_value"]]
+
+ if id_col_name:
+ expected = pd.DataFrame(
+ {
+ "id": ["1", "2", "1", "2", "1", "2"],
+ "forecast_timestamp": [
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ ],
+ "forecast_value": [
+ 2634.796023,
+ 2634.796023,
+ 2621.332461,
+ 2621.332461,
+ 2396.095462,
+ 2396.095462,
+ ],
+ }
+ )
+ expected["id"] = expected["id"].astype("string[pyarrow]")
+ else:
+ expected = pd.DataFrame(
+ {
+ "forecast_timestamp": [
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ ],
+ "forecast_value": [
+ 2634.796023,
+ 2621.332461,
+ 2396.095462,
+ ],
+ }
+ )
expected["forecast_value"] = expected["forecast_value"].astype(pd.Float64Dtype())
expected["forecast_timestamp"] = expected["forecast_timestamp"].astype(
pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
@@ -65,27 +109,69 @@ def test_arima_plus_predict_default(
)
+@pytest.mark.parametrize("id_col_name", [None, "id"])
def test_arima_plus_predict_explain_default(
time_series_arima_plus_model: forecasting.ARIMAPlus,
+ time_series_arima_plus_model_w_id: forecasting.ARIMAPlus,
+ id_col_name,
):
utc = pytz.utc
- predictions = time_series_arima_plus_model.predict_explain().to_pandas()
- assert predictions.shape[0] == 369
+ predictions = (
+ (
+ time_series_arima_plus_model_w_id
+ if id_col_name
+ else time_series_arima_plus_model
+ )
+ .predict_explain()
+ .to_pandas()
+ )
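+    # predict_explain emits rows for every time point of every series, so the
+    # row count doubles with two series (2 x 369 = 738).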
+ assert predictions.shape[0] == (738 if id_col_name else 369)
predictions = predictions[
predictions["time_series_type"] == "forecast"
].reset_index(drop=True)
- assert predictions.shape[0] == 3
+ assert predictions.shape[0] == (6 if id_col_name else 3)
result = predictions[["time_series_timestamp", "time_series_data"]]
- expected = pd.DataFrame(
- {
- "time_series_timestamp": [
- datetime(2017, 8, 2, tzinfo=utc),
- datetime(2017, 8, 3, tzinfo=utc),
- datetime(2017, 8, 4, tzinfo=utc),
- ],
- "time_series_data": [2727.693349, 2595.290749, 2370.86767],
- }
- )
+ if id_col_name:
+ result["id"] = predictions[["id"]]
+ result = result[["id", "time_series_timestamp", "time_series_data"]]
+ if id_col_name:
+ expected = pd.DataFrame(
+ {
+ "id": ["1", "2", "1", "2", "1", "2"],
+ "time_series_timestamp": [
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ ],
+ "time_series_data": [
+ 2634.796023,
+ 2634.796023,
+ 2621.332461,
+ 2621.332461,
+ 2396.095462,
+ 2396.095462,
+ ],
+ }
+ )
+ expected["id"] = expected["id"].astype("string[pyarrow]")
+ else:
+ expected = pd.DataFrame(
+ {
+ "time_series_timestamp": [
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ ],
+ "time_series_data": [
+ 2634.796023,
+ 2621.332461,
+ 2396.095462,
+ ],
+ }
+ )
expected["time_series_data"] = expected["time_series_data"].astype(
pd.Float64Dtype()
)
@@ -101,24 +187,72 @@ def test_arima_plus_predict_explain_default(
)
-def test_arima_plus_predict_params(time_series_arima_plus_model: forecasting.ARIMAPlus):
+@pytest.mark.parametrize("id_col_name", [None, "id"])
+def test_arima_plus_predict_params(
+ time_series_arima_plus_model: forecasting.ARIMAPlus,
+ time_series_arima_plus_model_w_id: forecasting.ARIMAPlus,
+ id_col_name,
+):
utc = pytz.utc
- predictions = time_series_arima_plus_model.predict(
- horizon=4, confidence_level=0.9
- ).to_pandas()
- assert predictions.shape == (4, 8)
- result = predictions[["forecast_timestamp", "forecast_value"]]
- expected = pd.DataFrame(
- {
- "forecast_timestamp": [
- datetime(2017, 8, 2, tzinfo=utc),
- datetime(2017, 8, 3, tzinfo=utc),
- datetime(2017, 8, 4, tzinfo=utc),
- datetime(2017, 8, 5, tzinfo=utc),
- ],
- "forecast_value": [2724.472284, 2593.368389, 2353.613034, 1781.623071],
- }
+ predictions = (
+ (
+ time_series_arima_plus_model_w_id
+ if id_col_name
+ else time_series_arima_plus_model
+ )
+ .predict(horizon=4, confidence_level=0.9)
+ .to_pandas()
)
+ assert predictions.shape == ((8, 9) if id_col_name else (4, 8))
+ result = predictions[["forecast_timestamp", "forecast_value"]]
+ if id_col_name:
+ result["id"] = predictions[["id"]]
+ result = result[["id", "forecast_timestamp", "forecast_value"]]
+
+ if id_col_name:
+ expected = pd.DataFrame(
+ {
+ "id": ["1", "2", "1", "2", "1", "2", "1", "2"],
+ "forecast_timestamp": [
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ datetime(2017, 8, 5, tzinfo=utc),
+ datetime(2017, 8, 5, tzinfo=utc),
+ ],
+ "forecast_value": [
+ 2634.796023,
+ 2634.796023,
+ 2621.332461,
+ 2621.332461,
+ 2396.095462,
+ 2396.095462,
+ 1781.623071,
+ 1781.623071,
+ ],
+ }
+ )
+ expected["id"] = expected["id"].astype("string[pyarrow]")
+ else:
+ expected = pd.DataFrame(
+ {
+ "forecast_timestamp": [
+ datetime(2017, 8, 2, tzinfo=utc),
+ datetime(2017, 8, 3, tzinfo=utc),
+ datetime(2017, 8, 4, tzinfo=utc),
+ datetime(2017, 8, 5, tzinfo=utc),
+ ],
+ "forecast_value": [
+ 2634.796023,
+ 2621.332461,
+ 2396.095462,
+ 1781.623071,
+ ],
+ }
+ )
expected["forecast_value"] = expected["forecast_value"].astype(pd.Float64Dtype())
expected["forecast_timestamp"] = expected["forecast_timestamp"].astype(
pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
@@ -132,12 +266,21 @@ def test_arima_plus_predict_params(time_series_arima_plus_model: forecasting.ARI
)
+@pytest.mark.parametrize("id_col_name", [None, "id"])
def test_arima_plus_predict_explain_params(
time_series_arima_plus_model: forecasting.ARIMAPlus,
+ time_series_arima_plus_model_w_id: forecasting.ARIMAPlus,
+ id_col_name,
):
- predictions = time_series_arima_plus_model.predict_explain(
- horizon=4, confidence_level=0.9
- ).to_pandas()
+ predictions = (
+ (
+ time_series_arima_plus_model_w_id
+ if id_col_name
+ else time_series_arima_plus_model
+ )
+ .predict_explain(horizon=4, confidence_level=0.9)
+ .to_pandas()
+ )
assert predictions.shape[0] >= 1
prediction_columns = set(predictions.columns)
expected_columns = {
@@ -156,24 +299,70 @@ def test_arima_plus_predict_explain_params(
"seasonal_period_daily",
"holiday_effect",
}
+ if id_col_name:
+ expected_columns.add("id")
assert expected_columns <= prediction_columns
+@pytest.mark.parametrize("id_col_name", [None, "id"])
def test_arima_plus_detect_anomalies(
- time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
+ time_series_arima_plus_model: forecasting.ARIMAPlus,
+ time_series_arima_plus_model_w_id: forecasting.ARIMAPlus,
+ new_time_series_df,
+ new_time_series_df_w_id,
+ id_col_name,
):
- anomalies = time_series_arima_plus_model.detect_anomalies(
- new_time_series_df
- ).to_pandas()
-
- expected = pd.DataFrame(
- {
- "is_anomaly": [False, False, False],
- "lower_bound": [2349.301736, 2153.614829, 1849.040192],
- "upper_bound": [3099.642833, 3033.12195, 2858.185876],
- "anomaly_probability": [0.757824, 0.322559, 0.43011],
- },
+ anomalies = (
+ (
+ time_series_arima_plus_model_w_id
+ if id_col_name
+ else time_series_arima_plus_model
+ )
+ .detect_anomalies(
+ new_time_series_df_w_id if id_col_name else new_time_series_df
+ )
+ .to_pandas()
)
+
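+ # With an id column, anomalies are detected per series; the two series in
+ # the fixture hold identical data, so the expected bounds repeat pairwise.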
+ if id_col_name:
+ expected = pd.DataFrame(
+ {
+ "is_anomaly": [False, False, False, False, False, False],
+ "lower_bound": [
+ 2229.930578,
+ 2229.930578,
+ 2149.645455,
+ 2149.645455,
+ 1892.873256,
+ 1892.873256,
+ ],
+ "upper_bound": [
+ 3039.6614686,
+ 3039.6614686,
+ 3093.019467,
+ 3093.019467,
+ 2899.317669,
+ 2899.317669,
+ ],
+ "anomaly_probability": [
+ 0.48545926,
+ 0.48545926,
+ 0.3856835,
+ 0.3856835,
+ 0.314156,
+ 0.314156,
+ ],
+ },
+ )
+ else:
+ expected = pd.DataFrame(
+ {
+ "is_anomaly": [False, False, False],
+ "lower_bound": [2229.930578, 2149.645455, 1892.873256],
+ "upper_bound": [3039.6614686, 3093.019467, 2899.317669],
+ "anomaly_probability": [0.48545926, 0.3856835, 0.314156],
+ },
+ )
pd.testing.assert_frame_equal(
anomalies[["is_anomaly", "lower_bound", "upper_bound", "anomaly_probability"]],
expected,
@@ -183,21 +372,65 @@ def test_arima_plus_detect_anomalies(
)
+@pytest.mark.parametrize("id_col_name", [None, "id"])
def test_arima_plus_detect_anomalies_params(
- time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
+ time_series_arima_plus_model: forecasting.ARIMAPlus,
+ time_series_arima_plus_model_w_id: forecasting.ARIMAPlus,
+ new_time_series_df,
+ new_time_series_df_w_id,
+ id_col_name,
):
- anomalies = time_series_arima_plus_model.detect_anomalies(
- new_time_series_df, anomaly_prob_threshold=0.7
- ).to_pandas()
-
- expected = pd.DataFrame(
- {
- "is_anomaly": [True, False, False],
- "lower_bound": [2525.5363, 2360.1870, 2086.0609],
- "upper_bound": [2923.408256, 2826.54981, 2621.165188],
- "anomaly_probability": [0.757824, 0.322559, 0.43011],
- },
+ anomalies = (
+ (
+ time_series_arima_plus_model_w_id
+ if id_col_name
+ else time_series_arima_plus_model
+ )
+ .detect_anomalies(
+ new_time_series_df_w_id if id_col_name else new_time_series_df,
+ anomaly_prob_threshold=0.7,
+ )
+ .to_pandas()
)
+ if id_col_name:
+ expected = pd.DataFrame(
+ {
+ "is_anomaly": [False, False, False, False, False, False],
+ "lower_bound": [
+ 2420.11419,
+ 2420.11419,
+ 2360.1870,
+ 2360.1870,
+ 2086.0609,
+ 2086.0609,
+ ],
+ "upper_bound": [
+ 2849.47785,
+ 2849.47785,
+ 2826.54981,
+ 2826.54981,
+ 2621.165188,
+ 2621.165188,
+ ],
+ "anomaly_probability": [
+ 0.485459,
+ 0.485459,
+ 0.385683,
+ 0.385683,
+ 0.314156,
+ 0.314156,
+ ],
+ },
+ )
+ else:
+ expected = pd.DataFrame(
+ {
+ "is_anomaly": [False, False, False],
+ "lower_bound": [2420.11419, 2360.1870, 2086.0609],
+ "upper_bound": [2849.47785, 2826.54981, 2621.165188],
+ "anomaly_probability": [0.485459, 0.385683, 0.314156],
+ },
+ )
pd.testing.assert_frame_equal(
anomalies[["is_anomaly", "lower_bound", "upper_bound", "anomaly_probability"]],
expected,
@@ -207,22 +440,49 @@ def test_arima_plus_detect_anomalies_params(
)
+@pytest.mark.parametrize("id_col_name", [None, "id"])
def test_arima_plus_score(
- time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
+ time_series_arima_plus_model: forecasting.ARIMAPlus,
+ time_series_arima_plus_model_w_id: forecasting.ARIMAPlus,
+ new_time_series_df,
+ new_time_series_df_w_id,
+ id_col_name,
):
- result = time_series_arima_plus_model.score(
- new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]]
- ).to_pandas()
- expected = pd.DataFrame(
- {
- "mean_absolute_error": [154.742547],
- "mean_squared_error": [26844.868855],
- "root_mean_squared_error": [163.844038],
- "mean_absolute_percentage_error": [6.189702],
- "symmetric_mean_absolute_percentage_error": [6.097155],
- },
- dtype="Float64",
- )
+ if id_col_name:
+ result = time_series_arima_plus_model_w_id.score(
+ new_time_series_df_w_id[["parsed_date"]],
+ new_time_series_df_w_id[["total_visits"]],
+ new_time_series_df_w_id[["id"]],
+ ).to_pandas()
+ else:
+ result = time_series_arima_plus_model.score(
+ new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]]
+ ).to_pandas()
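+ # Scoring a multi-series model requires the id column as a third argument
+ # and returns one metric row per series.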
+ if id_col_name:
+ expected = pd.DataFrame(
+ {
+ "id": ["2", "1"],
+ "mean_absolute_error": [120.011007, 120.011007],
+ "mean_squared_error": [14562.562359, 14562.562359],
+ "root_mean_squared_error": [120.675442, 120.675442],
+ "mean_absolute_percentage_error": [4.80044, 4.80044],
+ "symmetric_mean_absolute_percentage_error": [4.744332, 4.744332],
+ },
+ dtype="Float64",
+ )
+ expected["id"] = expected["id"].astype(str).str.replace(r"\.0$", "", regex=True)
+ expected["id"] = expected["id"].astype("string[pyarrow]")
+ else:
+ expected = pd.DataFrame(
+ {
+ "mean_absolute_error": [120.0110074],
+ "mean_squared_error": [14562.5623594],
+ "root_mean_squared_error": [120.675442],
+ "mean_absolute_percentage_error": [4.80044],
+ "symmetric_mean_absolute_percentage_error": [4.744332],
+ },
+ dtype="Float64",
+ )
pd.testing.assert_frame_equal(
result,
expected,
@@ -231,38 +491,91 @@ def test_arima_plus_score(
)
-def test_arima_plus_summary(time_series_arima_plus_model: forecasting.ARIMAPlus):
- result = time_series_arima_plus_model.summary()
- assert result.shape == (1, 12)
- assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)
+@pytest.mark.parametrize("id_col_name", [None, "id"])
+def test_arima_plus_summary(
+ time_series_arima_plus_model: forecasting.ARIMAPlus,
+ time_series_arima_plus_model_w_id: forecasting.ARIMAPlus,
+ id_col_name,
+):
+ result = (
+ time_series_arima_plus_model_w_id
+ if id_col_name
+ else time_series_arima_plus_model
+ ).summary()
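+ # A multi-series model yields one summary row per id plus an extra id column.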
+ assert result.shape == ((2, 13) if id_col_name else (1, 12))
+ expected_columns = (
+ [id_col_name] + ARIMA_EVALUATE_OUTPUT_COL
+ if id_col_name
+ else ARIMA_EVALUATE_OUTPUT_COL
+ )
+ assert all(column in result.columns for column in expected_columns)
+@pytest.mark.parametrize("id_col_name", [None, "id"])
def test_arima_plus_summary_show_all_candidates(
time_series_arima_plus_model: forecasting.ARIMAPlus,
+ time_series_arima_plus_model_w_id: forecasting.ARIMAPlus,
+ id_col_name,
):
- result = time_series_arima_plus_model.summary(
+ result = (
+ time_series_arima_plus_model_w_id
+ if id_col_name
+ else time_series_arima_plus_model
+ ).summary(
show_all_candidate_models=True,
)
assert result.shape[0] > 1
- assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)
+ expected_columns = (
+ [id_col_name] + ARIMA_EVALUATE_OUTPUT_COL
+ if id_col_name
+ else ARIMA_EVALUATE_OUTPUT_COL
+ )
+ assert all(column in result.columns for column in expected_columns)
+@pytest.mark.parametrize("id_col_name", [None, "id"])
def test_arima_plus_score_series(
- time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df
+ time_series_arima_plus_model: forecasting.ARIMAPlus,
+ time_series_arima_plus_model_w_id: forecasting.ARIMAPlus,
+ new_time_series_df,
+ new_time_series_df_w_id,
+ id_col_name,
):
- result = time_series_arima_plus_model.score(
- new_time_series_df["parsed_date"], new_time_series_df["total_visits"]
- ).to_pandas()
- expected = pd.DataFrame(
- {
- "mean_absolute_error": [154.742547],
- "mean_squared_error": [26844.868855],
- "root_mean_squared_error": [163.844038],
- "mean_absolute_percentage_error": [6.189702],
- "symmetric_mean_absolute_percentage_error": [6.097155],
- },
- dtype="Float64",
- )
+ if id_col_name:
+ result = time_series_arima_plus_model_w_id.score(
+ new_time_series_df_w_id["parsed_date"],
+ new_time_series_df_w_id["total_visits"],
+ new_time_series_df_w_id["id"],
+ ).to_pandas()
+ else:
+ result = time_series_arima_plus_model.score(
+ new_time_series_df["parsed_date"], new_time_series_df["total_visits"]
+ ).to_pandas()
+ if id_col_name:
+ expected = pd.DataFrame(
+ {
+ "id": ["2", "1"],
+ "mean_absolute_error": [120.011007, 120.011007],
+ "mean_squared_error": [14562.562359, 14562.562359],
+ "root_mean_squared_error": [120.675442, 120.675442],
+ "mean_absolute_percentage_error": [4.80044, 4.80044],
+ "symmetric_mean_absolute_percentage_error": [4.744332, 4.744332],
+ },
+ dtype="Float64",
+ )
+ expected["id"] = expected["id"].astype(str).str.replace(r"\.0$", "", regex=True)
+ expected["id"] = expected["id"].astype("string[pyarrow]")
+ else:
+ expected = pd.DataFrame(
+ {
+ "mean_absolute_error": [120.0110074],
+ "mean_squared_error": [14562.5623594],
+ "root_mean_squared_error": [120.675442],
+ "mean_absolute_percentage_error": [4.80044],
+ "symmetric_mean_absolute_percentage_error": [4.744332],
+ },
+ dtype="Float64",
+ )
pd.testing.assert_frame_equal(
result,
expected,
@@ -271,7 +584,21 @@ def test_arima_plus_score_series(
)
-def test_arima_plus_summary_series(time_series_arima_plus_model: forecasting.ARIMAPlus):
- result = time_series_arima_plus_model.summary()
- assert result.shape == (1, 12)
- assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL)
+@pytest.mark.parametrize("id_col_name", [None, "id"])
+def test_arima_plus_summary_series(
+ time_series_arima_plus_model: forecasting.ARIMAPlus,
+ time_series_arima_plus_model_w_id: forecasting.ARIMAPlus,
+ id_col_name,
+):
+ result = (
+ time_series_arima_plus_model_w_id
+ if id_col_name
+ else time_series_arima_plus_model
+ ).summary()
+ assert result.shape == ((2, 13) if id_col_name else (1, 12))
+ expected_columns = (
+ [id_col_name] + ARIMA_EVALUATE_OUTPUT_COL
+ if id_col_name
+ else ARIMA_EVALUATE_OUTPUT_COL
+ )
+ assert all(column in result.columns for column in expected_columns)
diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py
index 29f504443a..90d5e9f1d7 100644
--- a/tests/system/small/ml/test_llm.py
+++ b/tests/system/small/ml/test_llm.py
@@ -868,7 +868,6 @@ def test_llm_palm_score_params(llm_fine_tune_df_default_index):
"label",
"evaluation_status",
],
- index=6,
)
@@ -928,7 +927,6 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, model_name)
"label",
"evaluation_status",
],
- index=6,
)
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 4266cdba88..aa038c62d8 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -5315,7 +5315,7 @@ def test__resample_start_time(rule, origin, data):
),
],
)
-def test_astype(scalars_dfs, dtype):
+def test_df_astype(scalars_dfs, dtype):
bf_df, pd_df = scalars_dfs
target_cols = ["bool_col", "int64_col"]
bf_df = bf_df[target_cols]
@@ -5327,6 +5327,20 @@ def test_astype(scalars_dfs, dtype):
pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+def test_df_astype_python_types(scalars_dfs):
+ bf_df, pd_df = scalars_dfs
+ target_cols = ["bool_col", "int64_col"]
+ bf_df = bf_df[target_cols]
+ pd_df = pd_df[target_cols]
+
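+ # Python built-ins are accepted as astype targets: str maps to
+ # "string[pyarrow]" and float maps to Float64.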
+ bf_result = bf_df.astype({"bool_col": str, "int64_col": float}).to_pandas()
+ pd_result = pd_df.astype(
+ {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()}
+ )
+
+ pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+
+
def test_astype_invalid_type_fail(scalars_dfs):
bf_df, _ = scalars_dfs
diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py
index cdf4fa6511..4d01bc5ee9 100644
--- a/tests/system/small/test_index.py
+++ b/tests/system/small/test_index.py
@@ -123,6 +123,12 @@ def test_index_astype(scalars_df_index, scalars_pandas_df_index):
pd.testing.assert_index_equal(bf_result, pd_result)
+def test_index_astype_python(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.set_index("int64_col").index.astype(float).to_pandas()
+ pd_result = scalars_pandas_df_index.set_index("int64_col").index.astype("Float64")
+ pd.testing.assert_index_equal(bf_result, pd_result)
+
+
def test_index_astype_error_error(session):
input = pd.Index(["hello", "world", "3.11", "4000"])
with pytest.raises(ValueError):
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 3d76122e9d..fb48bf58b4 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -228,6 +228,13 @@ def test_series_construct_geodata():
)
+def test_series_keys(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_result = scalars_df["int64_col"].keys().to_pandas()
+ pd_result = scalars_pandas_df["int64_col"].keys()
+ pd.testing.assert_index_equal(bf_result, pd_result)
+
+
@pytest.mark.parametrize(
["data", "index"],
[
@@ -1241,6 +1248,51 @@ def test_isin_bigframes_values(scalars_dfs, col_name, test_set, session):
)
+@pytest.mark.parametrize(
+ (
+ "col_name",
+ "test_set",
+ ),
+ [
+ (
+ "int64_col",
+ [314159, 2.0, 3, pd.NA],
+ ),
+ (
+ "int64_col",
+ [2, 55555, 4],
+ ),
+ (
+ "float64_col",
+ [-123.456, 1.25, pd.NA],
+ ),
+ (
+ "int64_too",
+ [1, 2, pd.NA],
+ ),
+ (
+ "string_col",
+ ["Hello, World!", "Hi", "こんにちは"],
+ ),
+ ],
+)
+def test_isin_bigframes_values_as_predicate(
+ scalars_dfs_maybe_ordered, col_name, test_set
+):
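+ # An isin() result should be usable as a boolean mask, matching pandas.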
+ scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
+ bf_predicate = scalars_df[col_name].isin(
+ series.Series(test_set, session=scalars_df._session)
+ )
+ bf_result = scalars_df[bf_predicate].to_pandas()
+ pd_predicate = scalars_pandas_df[col_name].isin(test_set)
+ pd_result = scalars_pandas_df[pd_predicate]
+
+ pd.testing.assert_frame_equal(
+ pd_result.reset_index(),
+ bf_result.reset_index(),
+ )
+
+
def test_isnull(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "float64_col"
@@ -1854,6 +1906,7 @@ def test_groupby_window_ops(scalars_df_index, scalars_pandas_df_index, operator)
pd_series = operator(
scalars_pandas_df_index[col_name].groupby(scalars_pandas_df_index[group_key])
).astype(bf_series.dtype)
+
pd.testing.assert_series_equal(
pd_series,
bf_series,
@@ -2862,6 +2915,42 @@ def test_series_case_when(scalars_dfs_maybe_ordered):
)
+def test_series_case_when_change_type(scalars_dfs_maybe_ordered):
+ pytest.importorskip(
+ "pandas",
+ minversion="2.2.0",
+ reason="case_when added in pandas 2.2.0",
+ )
+ scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
+
+ bf_series = scalars_df["int64_col"]
+ pd_series = scalars_pandas_df["int64_col"]
+
+ # TODO(tswast): pandas case_when appears to assume True when a value is
+ # null. I suspect this should be considered a bug in pandas.
+
+ bf_conditions = [
+ ((bf_series > 645).fillna(True), scalars_df["string_col"]),
+ ((bf_series <= -100).fillna(True), pd.NA),
+ (True, "not_found"),
+ ]
+
+ pd_conditions = [
+ ((pd_series > 645).fillna(True), scalars_pandas_df["string_col"]),
+ ((pd_series <= -100).fillna(True), pd.NA),
+ # pandas currently fails if both the condition and the value are literals.
+ ([True] * len(pd_series), ["not_found"] * len(pd_series)),
+ ]
+
+ bf_result = bf_series.case_when(bf_conditions).to_pandas()
+ pd_result = pd_series.case_when(pd_conditions)
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result.astype("string[pyarrow]"),
+ )
+
+
def test_to_frame(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
@@ -3219,6 +3308,17 @@ def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, erro
pd.testing.assert_series_equal(bf_result, pd_result)
+def test_series_astype_python(session):
+ input = pd.Series(["hello", "world", "3.11", "4000"])
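+ # errors="null" converts values that cannot be cast to NULL instead of raising.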
+ expected = pd.Series(
+ [None, None, 3.11, 4000],
+ dtype="Float64",
+ index=pd.Index([0, 1, 2, 3], dtype="Int64"),
+ )
+ result = session.read_pandas(input).astype(float, errors="null").to_pandas()
+ pd.testing.assert_series_equal(result, expected)
+
+
def test_astype_safe(session):
input = pd.Series(["hello", "world", "3.11", "4000"])
exepcted = pd.Series(
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index e95509e033..a4acb72117 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -691,6 +691,62 @@ def test_read_pandas_tokyo(
assert len(expected) == result.total_rows
+@pytest.mark.parametrize(
+ "write_engine",
+ ["default", "bigquery_inline", "bigquery_load", "bigquery_streaming"],
+)
+def test_read_pandas_timedelta_dataframes(session, write_engine):
+ expected_df = pd.DataFrame({"my_col": pd.to_timedelta([1, 2, 3], unit="d")})
+
+ actual_result = (
+ session.read_pandas(expected_df, write_engine=write_engine)
+ .to_pandas()
+ .astype("timedelta64[ns]")
+ )
+
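+ # The streaming write engine does not preserve the original index values.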
+ if write_engine == "bigquery_streaming":
+ expected_df.index = pd.Index([pd.NA] * 3, dtype="Int64")
+ pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False)
+
+
+@pytest.mark.parametrize(
+ "write_engine",
+ ["default", "bigquery_inline", "bigquery_load", "bigquery_streaming"],
+)
+def test_read_pandas_timedelta_series(session, write_engine):
+ expected_series = pd.Series(pd.to_timedelta([1, 2, 3], unit="d"))
+
+ actual_result = (
+ session.read_pandas(expected_series, write_engine=write_engine)
+ .to_pandas()
+ .astype("timedelta64[ns]")
+ )
+
+ if write_engine == "bigquery_streaming":
+ expected_series.index = pd.Index([pd.NA] * 3, dtype="Int64")
+ pd.testing.assert_series_equal(
+ actual_result, expected_series, check_index_type=False
+ )
+
+
+@pytest.mark.parametrize(
+ "write_engine",
+ ["default", "bigquery_inline", "bigquery_load"],
+)
+def test_read_pandas_timedelta_index(session, write_engine):
+ expected_index = pd.to_timedelta(
+ [1, 2, 3], unit="d"
+ ) # to_timedelta returns an index
+
+ actual_result = (
+ session.read_pandas(expected_index, write_engine=write_engine)
+ .to_pandas()
+ .astype("timedelta64[ns]")
+ )
+
+ pd.testing.assert_index_equal(actual_result, expected_index)
+
+
@utils.skip_legacy_pandas
@pytest.mark.parametrize(
("write_engine",),
diff --git a/tests/system/utils.py b/tests/system/utils.py
index 7c12c8033a..0772468085 100644
--- a/tests/system/utils.py
+++ b/tests/system/utils.py
@@ -298,7 +298,7 @@ def assert_pandas_df_equal_pca(actual, expected, **kwargs):
def check_pandas_df_schema_and_index(
pd_df: pd.DataFrame,
columns: Iterable,
- index: Union[int, Iterable],
+ index: Optional[Union[int, Iterable]] = None,
col_exact: bool = True,
):
"""Check pandas df schema and index. But not the values.
@@ -306,7 +306,7 @@ def check_pandas_df_schema_and_index(
Args:
pd_df: the input pandas df
columns: target columns to check with
- index: int or Iterable. If int, only check the length (index size) of the df. If Iterable, check index values match
+ index: int or Iterable or None, default None. If int, only check the length (index size) of the df. If Iterable, check index values match. If None, skip checking index.
col_exact: If True, check the columns param are exact match. Otherwise only check the df contains all of those columns
"""
if col_exact:
@@ -314,7 +314,9 @@ def check_pandas_df_schema_and_index(
else:
assert set(columns) <= set(pd_df.columns)
- if isinstance(index, int):
+ if index is None:
+ pass
+ elif isinstance(index, int):
assert len(pd_df) == index
elif isinstance(index, Iterable):
assert list(pd_df.index) == list(index)
diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py
index e1fac624d7..3d420de51f 100644
--- a/tests/unit/core/test_dtypes.py
+++ b/tests/unit/core/test_dtypes.py
@@ -219,20 +219,20 @@ def test_bigframes_dtype_converts(ibis_dtype, bigframes_dtype):
def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str):
"""Test all the Ibis data types needed to read BigQuery tables"""
result = bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(
- bigframes_dtype_str
+ bigframes.dtypes.bigframes_type(bigframes_dtype_str)
)
assert result == ibis_dtype
def test_unsupported_dtype_raises_unexpected_datatype():
"""Incompatible dtypes should fail when passed into BigQuery DataFrames"""
- with pytest.raises(ValueError, match="Unexpected data type"):
+ with pytest.raises(ValueError, match="Datatype has no ibis type mapping"):
bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(np.float32)
def test_unsupported_dtype_str_raises_unexpected_datatype():
"""Incompatible dtypes should fail when passed into BigQuery DataFrames"""
- with pytest.raises(ValueError, match="Unexpected data type"):
+ with pytest.raises(ValueError, match="Datatype has no ibis type mapping"):
bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype("int64")
diff --git a/tests/unit/core/test_expression.py b/tests/unit/core/test_expression.py
index 72e200f007..ab6402a909 100644
--- a/tests/unit/core/test_expression.py
+++ b/tests/unit/core/test_expression.py
@@ -47,7 +47,7 @@ def test_expression_dtype_where():
def test_expression_dtype_astype():
- expression = ops.AsTypeOp("Int64").as_expr(ex.const(3.14159))
+ expression = ops.AsTypeOp(dtypes.INT_DTYPE).as_expr(ex.const(3.14159))
result = expression.output_type({})
diff --git a/tests/unit/core/test_log_adapter.py b/tests/unit/core/test_log_adapter.py
index 7b626838ac..6bc9c91f3a 100644
--- a/tests/unit/core/test_log_adapter.py
+++ b/tests/unit/core/test_log_adapter.py
@@ -155,3 +155,33 @@ def test_submit_pandas_labels_without_valid_params_for_param_logging(mock_bqclie
# For param tracking task without kwargs, we won't submit labels
mock_bqclient.query.assert_not_called()
+
+
+@pytest.mark.parametrize(
+ ("class_name", "method_name"),
+ (
+ ("Series", "_repr_latex_"),
+ (
+ "DataFrame",
+ # __call__ should be excluded.
+ # It's implemented on the pd.DataFrame class but not pd.DataFrame instances.
+ "__call__",
+ ),
+ (
+ "Series",
+ # __call__ should be excluded.
+ # It's implemented on the pd.Series class but not pd.Series instances.
+ "__call__",
+ ),
+ ),
+)
+def test_submit_pandas_labels_with_internal_method(
+ mock_bqclient, class_name, method_name
+):
+ log_adapter.submit_pandas_labels(
+ mock_bqclient,
+ class_name,
+ method_name,
+ task=log_adapter.PANDAS_API_TRACKING_TASK,
+ )
+ mock_bqclient.query.assert_not_called()
diff --git a/tests/unit/test_remote_function.py b/tests/unit/functions/test_remote_function.py
similarity index 100%
rename from tests/unit/test_remote_function.py
rename to tests/unit/functions/test_remote_function.py
diff --git a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py
index 6933957e53..8f01820fd3 100644
--- a/tests/unit/session/test_read_gbq_table.py
+++ b/tests/unit/session/test_read_gbq_table.py
@@ -27,8 +27,13 @@
@pytest.mark.parametrize(
("index_cols", "primary_keys", "values_distinct", "expected"),
(
- (["col1", "col2"], ["col1", "col2", "col3"], False, False),
- (["col1", "col2", "col3"], ["col1", "col2", "col3"], True, True),
+ (["col1", "col2"], ["col1", "col2", "col3"], False, ("col1", "col2", "col3")),
+ (
+ ["col1", "col2", "col3"],
+ ["col1", "col2", "col3"],
+ True,
+ ("col1", "col2", "col3"),
+ ),
(
["col2", "col3", "col1"],
[
@@ -36,14 +41,14 @@
"col2",
],
True,
- True,
+ ("col2", "col3"),
),
- (["col1", "col2"], [], False, False),
- ([], ["col1", "col2", "col3"], False, False),
- ([], [], False, False),
+ (["col1", "col2"], [], False, ()),
+ ([], ["col1", "col2", "col3"], False, ("col1", "col2", "col3")),
+ ([], [], False, ()),
),
)
-def test_are_index_cols_unique(index_cols, primary_keys, values_distinct, expected):
+def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expected):
"""If a primary key is set on the table, we use that as the index column
by default; no error should be raised in this case.
@@ -87,6 +92,6 @@ def test_are_index_cols_unique(index_cols, primary_keys, values_distinct, expect
)
table._properties["location"] = session._location
- result = bf_read_gbq_table.are_index_cols_unique(bqclient, table, index_cols, "")
+ result = bf_read_gbq_table.infer_unique_columns(bqclient, table, index_cols, "")
assert result == expected
diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py
index 210fc5d633..13531acbea 100644
--- a/tests/unit/session/test_session.py
+++ b/tests/unit/session/test_session.py
@@ -217,10 +217,10 @@ def test_read_gbq_cached_table():
table,
)
- session.bqclient.get_table.return_value = table
- session.bqclient.query_and_wait.return_value = (
- {"total_count": 3, "distinct_count": 2},
+ session.bqclient.query_and_wait = mock.MagicMock(
+ return_value=({"total_count": 3, "distinct_count": 2},)
)
+ session.bqclient.get_table.return_value = table
with pytest.warns(UserWarning, match=re.escape("use_cache=False")):
df = session.read_gbq("my-project.my_dataset.my_table")
diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py
index 560c0cf0f4..a6ad5e3821 100644
--- a/tests/unit/test_dataframe.py
+++ b/tests/unit/test_dataframe.py
@@ -41,6 +41,15 @@ def test_dataframe_repr_with_uninitialized_object():
assert "DataFrame" in got
+def test_dataframe_setattr_with_uninitialized_object():
+ """Ensures DataFrame can be subclassed without trying to set attributes as columns."""
+ # Avoid calling __init__ since it might be called later in a subclass.
+ # https://stackoverflow.com/a/6384982/101923
+ dataframe = bigframes.dataframe.DataFrame.__new__(bigframes.dataframe.DataFrame)
+ dataframe.lineage = "my-test-value"
+ assert dataframe.lineage == "my-test-value" # Should just be a regular attribute.
+
+
def test_dataframe_to_gbq_invalid_destination(monkeypatch: pytest.MonkeyPatch):
dataframe = resources.create_dataframe(monkeypatch)
diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py
index ccd4a57e11..d1ab36c41a 100644
--- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py
+++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py
@@ -432,7 +432,7 @@ class SQLGlotCompiler(abc.ABC):
ops.IntervalSubtract,
)
- NEEDS_PARENS = BINARY_INFIX_OPS + (ops.IsNull,)
+ NEEDS_PARENS = BINARY_INFIX_OPS + (ops.IsNull, ops.NotNull)
# Constructed dynamically in `__init_subclass__` from their respective
# UPPERCASE values to handle inheritance, do not modify directly here.
@@ -1022,7 +1022,7 @@ def visit_IsNull(self, op, *, arg):
return arg.is_(NULL)
def visit_NotNull(self, op, *, arg):
- return arg.is_(sg.not_(NULL, copy=False))
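+ # Parenthesize compound operands so that, e.g., (a OR b) IS NOT NULL keeps
+ # its intended precedence instead of binding to b alone.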
+ return self._add_parens(op, arg).is_(sg.not_(NULL, copy=False))
def visit_InValues(self, op, *, value, options):
return value.isin(*options)
diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py
index 3793a09229..7d6cd6d2b4 100644
--- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py
+++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py
@@ -1039,7 +1039,7 @@ def visit_InMemoryTable(self, op, *, name, schema, data):
nested=True,
)
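+ # BigQuery represents inline rows as an ARRAY of STRUCTs; tuple literals
+ # are not valid array elements.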
array_values = [
- sge.Tuple(
+ sge.Struct(
expressions=tuple(
self.visit_Literal(None, value=value, dtype=type_)
for value, type_ in zip(row, schema.types)
diff --git a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py
index b2ef6a15d3..652f04757b 100644
--- a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py
+++ b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py
@@ -245,6 +245,8 @@ def merge_select_select(_, **kwargs):
ops.InSubquery,
ops.Unnest,
ops.Impure,
+ # This is used for remote functions, which we don't want to copy
+ ops.ScalarUDF,
)
if _.find_below(blocking, filter=ops.Value):
return _
diff --git a/third_party/bigframes_vendored/ibis/expr/types/core.py b/third_party/bigframes_vendored/ibis/expr/types/core.py
index 9e1853fe52..9685e4ddca 100644
--- a/third_party/bigframes_vendored/ibis/expr/types/core.py
+++ b/third_party/bigframes_vendored/ibis/expr/types/core.py
@@ -79,7 +79,7 @@ def _interactive_repr(self) -> str:
return capture.get().rstrip()
def __repr__(self) -> str:
- if ibis.options.interactive:
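+ # Check the vendored ibis options so the repr mode matches the vendored
+ # configuration rather than any separately installed ibis package.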
+ if bigframes_vendored.ibis.options.interactive:
return self._interactive_repr()
else:
return self._noninteractive_repr()
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
index 83a24f7a9c..9dae802b6e 100644
--- a/third_party/bigframes_vendored/pandas/core/generic.py
+++ b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -165,7 +165,7 @@ def astype(self, dtype):
dtype: Int64
Args:
- dtype (str or pandas.ExtensionDtype):
+ dtype (str, data type, or pandas.ExtensionDtype):
Dtypes supported by BigQuery DataFrames include ``'boolean'``,
``'Float64'``, ``'Int64'``, ``'int64\\[pyarrow\\]'``,
``'string'``, ``'string\\[pyarrow\\]'``,
diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py
index c48c07424d..59504ee68c 100644
--- a/third_party/bigframes_vendored/pandas/core/indexes/base.py
+++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py
@@ -445,7 +445,7 @@ def astype(self, dtype):
Args:
- dtype (str or pandas.ExtensionDtype):
+ dtype (str, data type, or pandas.ExtensionDtype):
Dtypes supported by BigQuery DataFrames include ``'boolean'``,
``'Float64'``, ``'Int64'``, ``'int64\\[pyarrow\\]'``,
``'string'``, ``'string\\[pyarrow\\]'``,
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 727e25836a..57f7dfbb79 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -424,6 +424,25 @@ def __repr__(self) -> str:
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+ def keys(self):
+ """
+ Return alias for index.
+
+ **Examples:**
+
+ >>> import bigframes.pandas as bpd
+ >>> bpd.options.display.progress_bar = None
+
+ >>> s = bpd.Series([1, 2, 3], index=[0, 1, 2])
+ >>> s.keys()
+ Index([0, 1, 2], dtype='Int64')
+
+ Returns:
+ Index:
+ Index of the Series.
+ """
+ return self.index
+
# ----------------------------------------------------------------------
# IO methods (to / from other formats)
@@ -2648,6 +2667,21 @@ def case_when(
3 2
Name: c, dtype: Int64
+ If you'd like to change the dtype, add a case with the condition True at the end of the case list:
+
+ >>> c.case_when(
+ ... caselist=[
+ ... (a.gt(0), 'a'), # condition, replacement
+ ... (b.gt(0), 'b'),
+ ... (True, 'c'),
+ ... ]
+ ... )
+ 0 c
+ 1 b
+ 2 a
+ 3 a
+ Name: c, dtype: string
+
**See also:**
- :func:`bigframes.pandas.Series.mask` : Replace values where the condition is True.
@@ -5038,13 +5072,13 @@ def mask(self, cond, other):
with corresponding value from other. If cond is callable, it is
computed on the Series/DataFrame and should return boolean
Series/DataFrame or array. The callable must not change input
- Series/DataFrame (though pandas doesn’t check it).
+ Series/DataFrame (though pandas doesn't check it).
other (scalar, Series/DataFrame, or callable):
Entries where cond is True are replaced with corresponding value
from other. If other is callable, it is computed on the
Series/DataFrame and should return scalar or Series/DataFrame.
The callable must not change input Series/DataFrame (though pandas
- doesn’t check it). If not specified, entries will be filled with
+ doesn't check it). If not specified, entries will be filled with
the corresponding NULL value (np.nan for numpy dtypes, pd.NA for
extension dtypes).
diff --git a/third_party/bigframes_vendored/tpch/TPC-EULA.txt b/third_party/bigframes_vendored/tpch/TPC-EULA.txt
new file mode 100644
index 0000000000..feed8c4973
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/TPC-EULA.txt
@@ -0,0 +1,320 @@
+END USER LICENSE AGREEMENT
+VERSION 2.2
+
+READ THE TERMS AND CONDITIONS OF THIS AGREEMENT ("AGREEMENT") CAREFULLY
+BEFORE INSTALLING OR USING THE ACCOMPANYING SOFTWARE. BY INSTALLING OR
+USING THE SOFTWARE OR RELATED DOCUMENTATION, YOU AGREE TO BE BOUND BY
+THE TERMS OF THIS AGREEMENT. IF YOU DO NOT AGREE TO THE TERMS OF THIS
+AGREEMENT, DO NOT INSTALL OR USE THE SOFTWARE. IF YOU ARE ACCESSING THE
+SOFTWARE ON BEHALF OF YOUR ORGANIZATION, YOU REPRESENT AND WARRANT THAT
+YOU HAVE SUFFICIENT AUTHORITY TO BIND YOUR ORGANIZATION TO THIS
+AGREEMENT.
+
+USE AND RE-EXPORT OF THE SOFTWARE IS SUBJECT TO THE UNITED STATES EXPORT
+CONTROL ADMINISTRATION REGULATIONS. THE SOFTWARE MAY NOT BE USED BY
+UNLICENSED PERSONS OR ENTITIES, AND MAY NOT BE RE-EXPORTED TO ANOTHER
+COUNTRY. SEE EXPORT ASSURANCE (CLAUSE 13) OF THIS LICENSE.
+
+This is a legal agreement between you (or, if you are accessing the
+software on behalf of your organization, your organization) ("You" or
+"User") and the Transaction Processing Performance Council ("TPC"). This
+Agreement states the terms and conditions upon which TPC offers to
+license the Software, including, but not limited to, the source code,
+scripts, executable programs, drivers, libraries and data files
+associated with such programs, and modifications thereof (the
+"Software"), and online, electronic or printed documentation
+("Documentation," together with the Software, "Materials").
+
+LICENSE
+
+1. Definitions
+
+"Executive Summary" shall mean a short summary of a TPC Benchmark Result
+that shows the configuration, primary metrics, performance data, and
+pricing details. The exact requirements for the Executive Summary are
+defined in each TPC Benchmark Standard.
+"Full Disclosure Report (FDR)" shall mean a document that describes The
+TPC Benchmark Result in sufficient detail such that the Result could be
+recreated. The exact requirements for the FDR are defined in each TPC
+Benchmark Standard.
+"TPC Benchmark Result (Result)" shall mean a performance test submitted
+to the TPC attested to meet the requirements of a TPC Benchmark Standard
+at the time of submission. A Result is documented by an Executive
+Summary and, if required, a FDR.
+"TPC Benchmark Standard" shall mean a TPC Benchmark Specification and
+any associated code or binaries approved by the TPC. The various TPC
+Benchmark Standards can be found at
+http://www.tpc.org/information/current_specifications.asp.
+"TPC Policies" shall mean the guiding principles for how the TPC
+conducts its operations and business. The current TPC Policies can be
+found at http://www.tpc.org/information/current_specifications.asp.
+
+2. Ownership. The Materials are licensed, not sold, to You for use only
+under the terms of this Agreement. As between You and TPC (and, to the
+extent applicable, its licensors), TPC retains all rights, title and
+interest to and ownership of the Materials and reserves all rights not
+expressly granted to You.
+
+3. License Grant. Subject to Your compliance in all material respects
+with the terms and conditions of this Agreement, TPC grants You a
+restricted, non-exclusive, revocable license to install and use the
+Materials, but only as expressly permitted herein. You may only use the
+Software on computer systems under Your direct control. You may download
+multiple copies of the Materials and make verbatim copies of the
+original of the Software so long as Your use of such copies complies
+with the terms of this Agreement.
+a. Use by Individual. If You are accessing the Materials as an
+individual, only You (as an individual) may access and use the
+Materials.
+b. Use by Organization. If You are accessing the Materials on behalf of
+Your organization, only You and those within Your organization may use
+the Materials. Your organization must identify a contact person to TPC
+and conduct communications with TPC through that contact person.
+
+4. Restrictions. The following restrictions apply to all use of the
+Materials by You.
+a. General: You may not:
+(1) use, copy, print, modify, adapt, create derivative works from,
+market, deliver, rent, lease, sublicense, make, have made, assign,
+pledge, transfer, sell, offer to sell, import, reproduce, distribute,
+publicly perform, publicly display or otherwise grant rights to the
+Materials, or any copy thereof, in whole or in part, except as expressly
+permitted under this Agreement; or
+(2) use the Materials in any way that does not comply with all
+applicable laws and regulations.
+b. Modification: You may modify the Software.
+c. Public Disclosure: You may not publicly disclose any performance
+results produced while using the Software except in the following
+circumstances:
+(1) as part of a TPC Benchmark Result. For purposes of this Agreement, a
+"TPC Benchmark Result" is a performance test submitted to the TPC,
+documented by a Full Disclosure Report and Executive Summary, claiming
+to meet the requirements of an official TPC Benchmark Standard. You
+agree that TPC Benchmark Results may only be published in accordance
+with the TPC Policies, viewable at http://www.tpc.org.
+(2) as part of an academic or research effort that does not imply or
+state a marketing position
+(3) any other use of the Software, provided that any performance results
+must be clearly identified as not being comparable to TPC Benchmark
+Results unless specifically authorized by TPC.
+
+5. License Modification. Requests for modification of this license shall
+be addressed to info@tpc.org. You may not remove or modify this license
+without permission.
+
+6. Copyright. The Materials are owned by TPC and/or its licensors, and
+are protected by United States copyright laws and international treaty
+provisions. You may not remove the copyright notice from the original or
+any copy of the Materials, and You must apply the notice if You extract
+part of the Materials not bearing a notice.
+
+7. Use of Name. You acknowledge and agree that TPC owns all trademark
+and trade name rights in the names, trademarks and logos used by TPC in
+the Materials. User shall preserve any notices regarding such ownership.
+User may only use such names, trademarks and logos in accordance with
+the usage guidelines specified by the TPC Policies.
+
+8. Merger or Integration. Any portion of the Materials merged into or
+integrated with other software or documentation will continue to be
+subject to the terms and conditions of this Agreement.
+
+9. Limited Grants of Sublicense. You may distribute the Software as
+provided or as modified as permitted under clause 4 b. of this
+Agreement, provided You comply with all of the terms of this Agreement
+and the following conditions:
+
+a. If You distribute any portion of the Software in its original form
+You may do so only under this Agreement by including a complete copy of
+this Agreement with Your distribution, and if You distribute the
+Software in modified form, You may only do so under a license that at a
+minimum provides all of the protections and conditions of use contained
+within this Agreement;
+
+b. You must include on each copy of the Software that You distribute the
+following legend in all caps, at the top of the label and license, and
+in a font not less than 12 point and no less prominent than any other
+printing: "THE TPC SOFTWARE IS AVAILABLE WITHOUT CHARGE FROM TPC.";
+
+c. You must retain all copyright, patent, trademark, and attribution
+notices that are present in the Software; and
+
+d. You may not charge a fee for the distribution of this Software,
+including any modifications permitted under clause 4.b.
+
+10. Term and Termination.
+a. Term. The license granted to You is effective until terminated.
+b. Termination.
+(1) By You. You may terminate this Agreement at any time by returning
+the Materials (including any portions or copies thereof) to TPC or
+providing written notice to the TPC that all copies of the Materials
+within Your custody or control have been deleted or destroyed.
+(2) By TPC. In the event You materially fail to comply with any term or
+condition of this Agreement, and You fail to remedy such non-compliance
+within 30 days after the receipt of notice to that effect, then TPC
+shall have the right to terminate this Agreement immediately upon
+written notice at the end of such 30-day period.
+c. Effect of Termination. Termination of this Agreement in accordance
+with this clause 10 will not terminate the rights of end users
+sublicensed by You pursuant to this Agreement. Moreover, upon
+termination and at TPC's written request, You agree to either (1) return
+the Materials (including any portions or copies thereof) to TPC or (2)
+immediately destroy all copies of the Materials within Your custody or
+control and inform the TPC of the destruction of the Materials. Upon
+termination, TPC may also enforce any rights provided by law. The
+provisions of this Agreement that protect the proprietary rights of TPC
+and its Licensors will continue in force after termination.
+
+11. No Warranty; Materials Provided "As Is". TO THE MAXIMUM EXTENT
+PERMITTED BY APPLICABLE LAW, THE MATERIALS ARE PROVIDED "AS IS" AND WITH
+ALL FAULTS, AND TPC (AND ITS LICENSORS) AND THE AUTHORS AND DEVELOPERS
+OF THE MATERIALS HEREBY DISCLAIM ALL WARRANTIES, REPRESENTATIONS AND
+CONDITIONS, EITHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, BUT NOT
+LIMITED TO, ANY IMPLIED WARRANTIES, DUTIES OR CONDITIONS RELATING TO
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, ACCURACY OR
+COMPLETENESS OF RESPONSES, RESULTS, WORKMANLIKE EFFORT, LACK OF VIRUSES,
+LACK OF NEGLIGENCE, TITLE, QUIET ENJOYMENT, QUIET POSSESSION,
+CORRESPONDENCE TO DESCRIPTION OR NONINFRINGEMENT. USER RECOGNIZES THAT
+THE MATERIALS ARE THE RESULT OF A COOPERATIVE, NON-PROFIT EFFORT AND
+THAT TPC DOES NOT CONDUCT A TYPICAL BUSINESS. USER ACCEPTS THE MATERIALS
+"AS IS" AND WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED.
+
+Without limitation, TPC (and its licensors) do not warrant that the
+functions contained in the Software or Materials will meet Your
+requirements or that the operation of the Software will be
+uninterrupted, error-free or free from malicious code. For purposes of
+this paragraph, "malicious code" means any program code designed to
+contaminate other computer programs or computer data, consume computer
+resources, modify, destroy, record, or transmit data, or in some other
+fashion usurp the normal operation of the computer, computer system, or
+computer network, including viruses, Trojan horses, droppers, worms,
+logic bombs, and the like. TPC (and its licensors) shall not be liable
+for the accuracy of any information provided by TPC or third-party
+technical support personnel, or any damages caused, either directly or
+indirectly, by acts taken or omissions made by You as a result of such
+technical support.
+
+You assume full responsibility for the selection of the Materials to
+achieve Your intended results, and for the installation, use and results
+obtained from the Materials. You also assume the entire risk as it
+applies to the quality and performance of the Materials. Should the
+Materials prove defective, You (and not TPC) assume the entire liability
+of any and all necessary servicing, repair or correction. Some
+countries/states do not allow the exclusion of implied warranties, so
+the above exclusion may not apply to You. TPC (and its licensors)
+further disclaims all warranties of any kind if the Materials were
+customized, repackaged or altered in any way by any party other than TPC
+(or its licensors).
+
+12. Disclaimer of Liability. TPC (and its licensors) assumes no
+liability with respect to the Materials, including liability for
+infringement of intellectual property rights, negligence, or any other
+liability. TPC is not aware of any infringement of copyright or patent
+that may result from its grant of rights to User of the Materials. If
+User receives any notice of infringement, such notice shall be
+immediately communicated to TPC who will have sole discretion to take
+action to evaluate the claim and, if practicable, modify the Materials
+as necessary to avoid infringement. In the event that TPC determines
+that the Materials cannot be modified to avoid such infringement (or any
+other infringement claim communicated to TPC), TPC may terminate this
+Agreement immediately. User shall suspend use of the Materials until
+modifications to avoid claims of infringement have been completed. User
+waives any claim against TPC in the event of such infringement claims by
+others.
+
+13. Export Assurance. Use and re-export of the Materials and related
+technical information is subject to the Export Administration
+Regulations (EAR) of the United States Department of Commerce. User
+hereby agrees that User (a) assumes responsibility for compliance with
+the EAR in its use of the Materials and technical information, and (b)
+will not export, re-export, or otherwise disclose directly or
+indirectly, the Materials, technical data, or any direct product of the
+Materials or technical data in violation of the EAR.
+
+14. Limitation of Remedies And Damages. IN NO EVENT WILL TPC OR ITS
+LICENSORS OR LICENSEE BE LIABLE FOR ANY INDIRECT, INCIDENTAL, SPECIAL OR
+CONSEQUENTIAL DAMAGES OR FOR ANY LOST PROFITS, LOST SAVINGS, LOST
+REVENUES OR LOST DATA ARISING FROM OR RELATING TO THE MATERIALS OR THIS
+AGREEMENT, EVEN IF TPC OR ITS LICENSORS OR LICENSEE HAVE BEEN ADVISED OF
+THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL TPC'S OR ITS
+LICENSORS' LIABILITY OR DAMAGES TO YOU OR ANY OTHER PERSON EVER EXCEED
+U.S. ONE HUNDRED DOLLARS (US $100), REGARDLESS OF THE FORM OF THE CLAIM.
+IN NO EVENT WILL LICENSEE'S LIABILITY OR DAMAGES TO TPC OR ANY OTHER
+PERSON EVER EXCEED $1,000,000, REGARDLESS OF THE FORM OF THE CLAIM. Some
+countries/states do not allow the limitation or exclusion of liability
+for incidental or consequential damages, so the above limitation or
+exclusion may not apply to You.
+
+15. U.S. Government Restricted Rights. All Software and related
+documentation are provided with restricted rights. Use, duplication or
+disclosure by the U.S. Government is subject to restrictions as set
+forth in subdivision (b)(3)(ii) of the Rights in Technical Data and
+Computer Software Clause at 252.227-7013. If You are using the Software
+outside of the United States, You will comply with the applicable local
+laws of Your country, U.S. export control law, and the English version
+of this Agreement.
+
+16. Contractor/Manufacturer. The Contractor/Manufacturer for the
+Software is:
+
+Transaction Processing Performance Council
+572B Ruger Street, P.O. Box 29920
+San Francisco, CA 94129
+
+17. General. This Agreement is binding on You as well as Your employees,
+employers, contractors and agents, and on any successors and assignees.
+This Agreement is governed by the laws of the State of California
+(except to the extent federal law governs copyrights and trademarks)
+without respect to any provisions of California law that would cause
+application of the law of another state or country. The parties agree
+that the United Nations Convention on Contracts for the International
+Sale of Goods will not govern this Agreement. This Agreement is the
+entire agreement between us regarding the subject matter hereof and
+supersedes any other understandings or agreements with respect to the
+Materials or the subject matter hereof. If any provision of this
+Agreement is deemed invalid or unenforceable by any court having
+jurisdiction, that particular provision will be deemed modified to the
+extent necessary to make the provision valid and enforceable, and the
+remaining provisions will remain in full force and effect.
+
+SPECIAL PROVISIONS APPLICABLE TO THE EUROPEAN UNION
+
+If You acquired the Materials in the European Union (EU), the following
+provisions also apply to You. If there is any inconsistency between the
+terms of the Software License Agreement set out earlier and the
+following provisions, the following provisions shall take precedence.
+
+1. Distribution. You may sublicense modifications of the Software
+covered in this Agreement if they meet the requirements of clause 9
+above.
+
+2. Limited Warranty. EXCEPT AS STATED EARLIER IN THIS AGREEMENT, AND AS
+PROVIDED UNDER THE HEADING "STATUTORY RIGHTS", THE SOFTWARE IS PROVIDED
+AS-IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED WARRANTIES, NONINFRINGEMENT,
+OR CONDITIONS OF MERCHANTABILITY, QUALITY AND FITNESS FOR A PARTICULAR
+PURPOSE.
+
+3. Limitation of Remedy and Damages. THE LIMITATIONS OF REMEDIES AND
+DAMAGES IN THE SOFTWARE LICENSE AGREEMENT SHALL NOT APPLY TO PERSONAL
+INJURY (INCLUDING DEATH) TO ANY PERSON CAUSED BY TPC'S NEGLIGENCE AND
+ARE SUBJECT TO THE PROVISION SET OUT UNDER THE HEADING "STATUTORY
+RIGHTS".
+
+4. Statutory Rights: Irish law provides that certain conditions and
+warranties may be implied in contracts for the sale of goods and in
+contracts for the supply of services. Such conditions and warranties are
+hereby excluded, to the extent such exclusion, in the context of this
+transaction, is lawful under Irish law. Conversely, such conditions and
+warranties, insofar as they may not be lawfully excluded, shall apply.
+Accordingly nothing in this Agreement shall prejudice any rights that
+You may enjoy by virtue of Sections 12, 13, 14 or 15 of the Irish Sale
+of Goods Act 1893 (as amended).
+
+5. General. This Agreement is governed by the laws of the Republic of
+Ireland. The local language version of this agreement shall apply to
+Materials acquired in the EU. This Agreement is the entire agreement
+between us with respect to the subject matter hereof and You agree that
+TPC will not have any liability for any untrue statement or
+representation made by it, its agents or anyone else (whether innocently
+or negligently) upon which You relied upon entering this Agreement,
+unless such untrue statement or representation was made fraudulently.
diff --git a/third_party/bigframes_vendored/tpch/queries/q1.py b/third_party/bigframes_vendored/tpch/queries/q1.py
index e1fdf85f58..f533776e85 100644
--- a/third_party/bigframes_vendored/tpch/queries/q1.py
+++ b/third_party/bigframes_vendored/tpch/queries/q1.py
@@ -39,4 +39,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
["L_RETURNFLAG", "L_LINESTATUS"]
)
- result.to_gbq()
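+ # Execute the query but fetch only the first batch of rows instead of
+ # writing the full result to a BigQuery table.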
+ next(result.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q10.py b/third_party/bigframes_vendored/tpch/queries/q10.py
index 1650e9ca34..8c0d93dc26 100644
--- a/third_party/bigframes_vendored/tpch/queries/q10.py
+++ b/third_party/bigframes_vendored/tpch/queries/q10.py
@@ -76,4 +76,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
.head(20)
)
- q_final.to_gbq()
+ next(q_final.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q11.py b/third_party/bigframes_vendored/tpch/queries/q11.py
index 385393f781..365aa12eb9 100644
--- a/third_party/bigframes_vendored/tpch/queries/q11.py
+++ b/third_party/bigframes_vendored/tpch/queries/q11.py
@@ -18,20 +18,20 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
index_col=bigframes.enums.DefaultIndexKind.NULL,
)
- merged_df = partsupp.merge(supplier, left_on="PS_SUPPKEY", right_on="S_SUPPKEY")
- merged_df = merged_df.merge(nation, left_on="S_NATIONKEY", right_on="N_NATIONKEY")
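+ # Filter nation down to GERMANY before joining so the merges scan less data.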
+ nation = nation[nation["N_NAME"] == "GERMANY"]
- filtered_df = merged_df[merged_df["N_NAME"] == "GERMANY"]
+ merged_df = nation.merge(supplier, left_on="N_NATIONKEY", right_on="S_NATIONKEY")
+ merged_df = merged_df.merge(partsupp, left_on="S_SUPPKEY", right_on="PS_SUPPKEY")
- filtered_df["VALUE"] = filtered_df["PS_SUPPLYCOST"] * filtered_df["PS_AVAILQTY"]
- grouped = filtered_df.groupby("PS_PARTKEY", as_index=False).agg(
+ merged_df["VALUE"] = merged_df["PS_SUPPLYCOST"] * merged_df["PS_AVAILQTY"]
+ grouped = merged_df.groupby("PS_PARTKEY", as_index=False).agg(
VALUE=bpd.NamedAgg(column="VALUE", aggfunc="sum")
)
grouped["VALUE"] = grouped["VALUE"].round(2)
total_value = (
- (filtered_df["PS_SUPPLYCOST"] * filtered_df["PS_AVAILQTY"]).to_frame().sum()
+ (merged_df["PS_SUPPLYCOST"] * merged_df["PS_AVAILQTY"]).to_frame().sum()
)
threshold = (total_value * 0.0001).rename("THRESHOLD")
@@ -43,4 +43,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
result_df = result_df.sort_values(by="VALUE", ascending=False)
- result_df.to_gbq()
+ next(result_df.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q12.py b/third_party/bigframes_vendored/tpch/queries/q12.py
index e2b7aaf9f2..1bc22f1167 100644
--- a/third_party/bigframes_vendored/tpch/queries/q12.py
+++ b/third_party/bigframes_vendored/tpch/queries/q12.py
@@ -46,4 +46,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
agg_results = typing.cast(bpd.DataFrame, agg_results).sort_values("L_SHIPMODE")
- agg_results.to_gbq()
+ next(agg_results.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q13.py b/third_party/bigframes_vendored/tpch/queries/q13.py
index ea2f0da284..8201a1191d 100644
--- a/third_party/bigframes_vendored/tpch/queries/q13.py
+++ b/third_party/bigframes_vendored/tpch/queries/q13.py
@@ -34,4 +34,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
["CUSTDIST", "C_COUNT"], ascending=[False, False]
)
- q_final.to_gbq()
+ next(q_final.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q14.py b/third_party/bigframes_vendored/tpch/queries/q14.py
index e2a5a73214..f3b747219b 100644
--- a/third_party/bigframes_vendored/tpch/queries/q14.py
+++ b/third_party/bigframes_vendored/tpch/queries/q14.py
@@ -42,4 +42,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
.to_frame(name="PROMO_REVENUE")
)
- promo_revenue_percent.to_gbq()
+ next(promo_revenue_percent.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q15.py b/third_party/bigframes_vendored/tpch/queries/q15.py
index adf37f9892..1cba0ca4bc 100644
--- a/third_party/bigframes_vendored/tpch/queries/q15.py
+++ b/third_party/bigframes_vendored/tpch/queries/q15.py
@@ -50,4 +50,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
q_final = max_revenue_suppliers[
["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE", "TOTAL_REVENUE"]
].sort_values("S_SUPPKEY")
- q_final.to_gbq()
+ next(q_final.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q16.py b/third_party/bigframes_vendored/tpch/queries/q16.py
index 79f42ec42c..a02dcef5dc 100644
--- a/third_party/bigframes_vendored/tpch/queries/q16.py
+++ b/third_party/bigframes_vendored/tpch/queries/q16.py
@@ -47,4 +47,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
ascending=[False, True, True, True],
)
- q_final.to_gbq()
+ next(q_final.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q17.py b/third_party/bigframes_vendored/tpch/queries/q17.py
index 56289d57ad..e6a87dc482 100644
--- a/third_party/bigframes_vendored/tpch/queries/q17.py
+++ b/third_party/bigframes_vendored/tpch/queries/q17.py
@@ -37,4 +37,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
(q_final[["L_EXTENDEDPRICE"]].sum() / 7.0).round(2).to_frame(name="AVG_YEARLY")
)
- q_final.to_gbq()
+ next(q_final.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q18.py b/third_party/bigframes_vendored/tpch/queries/q18.py
index f645a08681..c6802e6808 100644
--- a/third_party/bigframes_vendored/tpch/queries/q18.py
+++ b/third_party/bigframes_vendored/tpch/queries/q18.py
@@ -48,4 +48,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
)
q_final = final_result.head(100)
- q_final.to_gbq()
+ next(q_final.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q2.py b/third_party/bigframes_vendored/tpch/queries/q2.py
index f388252993..e154e8ae98 100644
--- a/third_party/bigframes_vendored/tpch/queries/q2.py
+++ b/third_party/bigframes_vendored/tpch/queries/q2.py
@@ -59,4 +59,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
)
result_df = sort.head(100)
- result_df.to_gbq()
+ next(result_df.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q20.py b/third_party/bigframes_vendored/tpch/queries/q20.py
index fded5f5c97..5c2d8d391f 100644
--- a/third_party/bigframes_vendored/tpch/queries/q20.py
+++ b/third_party/bigframes_vendored/tpch/queries/q20.py
@@ -44,8 +44,6 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
filtered_parts = part[part["P_NAME"].str.startswith(var4)]
- if not session._strictly_ordered:
- filtered_parts = filtered_parts[["P_PARTKEY"]].sort_values(by=["P_PARTKEY"])
filtered_parts = filtered_parts["P_PARTKEY"].unique(keep_order=False).to_frame()
joined_parts = filtered_parts.merge(
partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY"
@@ -61,4 +59,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
final_result = final_filtered.merge(q3, left_on="PS_SUPPKEY", right_on="S_SUPPKEY")
final_result = final_result[["S_NAME", "S_ADDRESS"]].sort_values(by="S_NAME")
- final_result.to_gbq()
+ next(final_result.to_pandas_batches(max_results=1500))
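The q20 hunk above also deletes a workaround: sessions without strict ordering previously had to pre-sort the part keys before deduplicating. With `unique(keep_order=False)`, deduplication no longer depends on row order, so the code behaves the same in strictly and partially ordered sessions. A hedged sketch of that pattern; the table path is a placeholder, not the benchmark's actual input:

```python
import bigframes.pandas as bpd

# Hypothetical table path standing in for the TPC-H PART table.
part = bpd.read_gbq("my-project.my_dataset.PART")

filtered = part[part["P_NAME"].str.startswith("forest")]

# keep_order=False (as used in the diff above) deduplicates without
# imposing an ordering on the result, avoiding the ordered-window plan
# that made the old partially-ordered branch necessary.
keys = filtered["P_PARTKEY"].unique(keep_order=False).to_frame()
```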
diff --git a/third_party/bigframes_vendored/tpch/queries/q21.py b/third_party/bigframes_vendored/tpch/queries/q21.py
index 097a730d43..c27aab0e69 100644
--- a/third_party/bigframes_vendored/tpch/queries/q21.py
+++ b/third_party/bigframes_vendored/tpch/queries/q21.py
@@ -56,4 +56,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
by=["NUMWAIT", "S_NAME"], ascending=[False, True]
).head(100)
- q_final.to_gbq()
+ next(q_final.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q22.py b/third_party/bigframes_vendored/tpch/queries/q22.py
index bc648ef392..153ef63c5d 100644
--- a/third_party/bigframes_vendored/tpch/queries/q22.py
+++ b/third_party/bigframes_vendored/tpch/queries/q22.py
@@ -52,4 +52,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
result = result.sort_values(by="CNTRYCODE")
- result.to_gbq()
+ next(result.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q3.py b/third_party/bigframes_vendored/tpch/queries/q3.py
index fb09abe159..60d181a603 100644
--- a/third_party/bigframes_vendored/tpch/queries/q3.py
+++ b/third_party/bigframes_vendored/tpch/queries/q3.py
@@ -39,4 +39,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
sorted_sel = sel.sort_values(by=["REVENUE", "O_ORDERDATE"], ascending=[False, True])
result_df = sorted_sel.head(10)
- result_df.to_gbq()
+ next(result_df.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q4.py b/third_party/bigframes_vendored/tpch/queries/q4.py
index d149a71f71..3782a7273f 100644
--- a/third_party/bigframes_vendored/tpch/queries/q4.py
+++ b/third_party/bigframes_vendored/tpch/queries/q4.py
@@ -32,4 +32,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
agg = gb.agg(ORDER_COUNT=bpd.NamedAgg(column="L_ORDERKEY", aggfunc="count"))
result_df = typing.cast(bpd.DataFrame, agg).sort_values(["O_ORDERPRIORITY"])
- result_df.to_gbq()
+ next(result_df.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q5.py b/third_party/bigframes_vendored/tpch/queries/q5.py
index 9839c025a5..406df79a5a 100644
--- a/third_party/bigframes_vendored/tpch/queries/q5.py
+++ b/third_party/bigframes_vendored/tpch/queries/q5.py
@@ -52,4 +52,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
gb = jn5.groupby("N_NAME", as_index=False)["REVENUE"].sum()
result_df = gb.sort_values("REVENUE", ascending=False)
- result_df.to_gbq()
+ next(result_df.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q6.py b/third_party/bigframes_vendored/tpch/queries/q6.py
index b883837fe2..8fe067bafe 100644
--- a/third_party/bigframes_vendored/tpch/queries/q6.py
+++ b/third_party/bigframes_vendored/tpch/queries/q6.py
@@ -27,4 +27,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
.to_frame()
)
- result_df.to_gbq()
+ next(result_df.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q7.py b/third_party/bigframes_vendored/tpch/queries/q7.py
index 93047dc299..81cdda8788 100644
--- a/third_party/bigframes_vendored/tpch/queries/q7.py
+++ b/third_party/bigframes_vendored/tpch/queries/q7.py
@@ -60,4 +60,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
result_df = typing.cast(bpd.DataFrame, agg).sort_values(
["SUPP_NATION", "CUST_NATION", "L_YEAR"]
)
- result_df.to_gbq()
+ next(result_df.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/queries/q8.py b/third_party/bigframes_vendored/tpch/queries/q8.py
index 1676ec6349..67e1af1241 100644
--- a/third_party/bigframes_vendored/tpch/queries/q8.py
+++ b/third_party/bigframes_vendored/tpch/queries/q8.py
@@ -62,17 +62,11 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
jn7["VOLUME"] = jn7["L_EXTENDEDPRICE"] * (1.0 - jn7["L_DISCOUNT"])
jn7 = jn7.rename(columns={"N_NAME": "NATION"})
- denominator = jn7.groupby("O_YEAR")["VOLUME"].sum().rename("DENOMINATOR")
- numerator = (
- jn7[jn7["NATION"] == var1]
- .groupby(jn7["O_YEAR"])["VOLUME"]
- .sum()
- .rename("NUMERATOR")
- )
- jn8 = denominator.to_frame().join(numerator.to_frame(), how="left")
+ jn7["numerator"] = jn7["VOLUME"].where(jn7["NATION"] == var1, 0)
+ jn7["denominator"] = jn7["VOLUME"]
- # ValueError: Caching with offsets only supported in strictly ordered mode.
- jn8["MKT_SHARE"] = (jn8["NUMERATOR"] / jn8["DENOMINATOR"]).round(2)
+ sums = jn7.groupby("O_YEAR")[["numerator", "denominator"]].sum()
+ sums["MKT_SHARE"] = (sums["numerator"] / sums["denominator"]).round(2)
- result_df = jn8["MKT_SHARE"].sort_index().rename("MKT_SHARE").reset_index()
- result_df.to_gbq()
+ result_df = sums["MKT_SHARE"].sort_index().rename("MKT_SHARE").reset_index()
+ next(result_df.to_pandas_batches(max_results=1500))
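The q8 rewrite above replaces two separate groupby aggregations joined back together with a single pass: `VOLUME` is masked to zero outside the target nation, then numerator and denominator are summed in one `groupby`. This sidesteps the caching-with-offsets error noted in the deleted comment and halves the aggregation work. A self-contained pandas sketch of the same arithmetic, with toy data:

```python
import pandas as pd

var1 = "BRAZIL"  # the benchmark's nation parameter
jn7 = pd.DataFrame(
    {
        "O_YEAR": [1995, 1995, 1996],
        "NATION": ["BRAZIL", "FRANCE", "BRAZIL"],
        "VOLUME": [10.0, 30.0, 20.0],
    }
)

# Mask VOLUME to zero outside the target nation, then aggregate both
# columns in one groupby instead of joining two separate aggregations.
jn7["numerator"] = jn7["VOLUME"].where(jn7["NATION"] == var1, 0)
jn7["denominator"] = jn7["VOLUME"]
sums = jn7.groupby("O_YEAR")[["numerator", "denominator"]].sum()
sums["MKT_SHARE"] = (sums["numerator"] / sums["denominator"]).round(2)
print(sums["MKT_SHARE"])  # 1995 -> 0.25, 1996 -> 1.0
```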
diff --git a/third_party/bigframes_vendored/tpch/queries/q9.py b/third_party/bigframes_vendored/tpch/queries/q9.py
index c2b52789bd..6af33f7569 100644
--- a/third_party/bigframes_vendored/tpch/queries/q9.py
+++ b/third_party/bigframes_vendored/tpch/queries/q9.py
@@ -65,4 +65,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
["NATION", "O_YEAR"], ascending=[True, False]
)
- q_final.to_gbq()
+ next(q_final.to_pandas_batches(max_results=1500))
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q1.sql b/third_party/bigframes_vendored/tpch/sql_queries/q1.sql
new file mode 100644
index 0000000000..c359614583
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q1.sql
@@ -0,0 +1,21 @@
+select
+ l_returnflag,
+ l_linestatus,
+ sum(l_quantity) as sum_qty,
+ sum(l_extendedprice) as sum_base_price,
+ sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+ sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+ avg(l_quantity) as avg_qty,
+ avg(l_extendedprice) as avg_price,
+ avg(l_discount) as avg_disc,
+ count(*) as count_order
+from
+ {line_item_ds}
+where
+ l_shipdate <= '1998-09-02'
+group by
+ l_returnflag,
+ l_linestatus
+order by
+ l_returnflag,
+ l_linestatus
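The new files under sql_queries/ carry the SQL counterparts of the DataFrame queries, with `{...}` placeholders (`{line_item_ds}`, `{orders_ds}`, and so on) for the fully qualified table names. A hedged sketch of how a harness might render and run one of these templates; the helper and table naming below are illustrative assumptions, not the repository's actual runner:

```python
import pathlib

import bigframes

def run_sql_query(
    path: str, session: bigframes.Session, project_id: str, dataset_id: str
):
    # Substitute the dataset placeholders; each query file declares only
    # the subset of placeholders it actually uses (q1 needs just one).
    sql = pathlib.Path(path).read_text().format(
        line_item_ds=f"`{project_id}.{dataset_id}.LINEITEM`",
    )
    # read_gbq accepts a query string and returns a bigframes DataFrame;
    # fetch one page of results, mirroring the Python queries above.
    return next(session.read_gbq(sql).to_pandas_batches(max_results=1500))
```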
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q10.sql b/third_party/bigframes_vendored/tpch/sql_queries/q10.sql
new file mode 100644
index 0000000000..c07aa0b4c9
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q10.sql
@@ -0,0 +1,32 @@
+select
+ c_custkey,
+ c_name,
+ round(sum(l_extendedprice * (1 - l_discount)), 2) as revenue,
+ c_acctbal,
+ n_name,
+ c_address,
+ c_phone,
+ c_comment
+from
+ {customer_ds},
+ {orders_ds},
+ {line_item_ds},
+ {nation_ds}
+where
+ c_custkey = o_custkey
+ and l_orderkey = o_orderkey
+ and o_orderdate >= date '1993-10-01'
+ and o_orderdate < date '1993-10-01' + interval '3' month
+ and l_returnflag = 'R'
+ and c_nationkey = n_nationkey
+group by
+ c_custkey,
+ c_name,
+ c_acctbal,
+ c_phone,
+ n_name,
+ c_address,
+ c_comment
+order by
+ revenue desc
+limit 20
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q11.sql b/third_party/bigframes_vendored/tpch/sql_queries/q11.sql
new file mode 100644
index 0000000000..08c4560423
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q11.sql
@@ -0,0 +1,27 @@
+select
+ ps_partkey,
+ round(sum(ps_supplycost * ps_availqty), 2) as value
+from
+ {part_supp_ds},
+ {supplier_ds},
+ {nation_ds}
+where
+ ps_suppkey = s_suppkey
+ and s_nationkey = n_nationkey
+ and n_name = 'GERMANY'
+group by
+ ps_partkey having
+ sum(ps_supplycost * ps_availqty) > (
+ select
+ sum(ps_supplycost * ps_availqty) * 0.0001
+ from
+ {part_supp_ds},
+ {supplier_ds},
+ {nation_ds}
+ where
+ ps_suppkey = s_suppkey
+ and s_nationkey = n_nationkey
+ and n_name = 'GERMANY'
+ )
+ order by
+ value desc
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q12.sql b/third_party/bigframes_vendored/tpch/sql_queries/q12.sql
new file mode 100644
index 0000000000..cb97f1fb3c
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q12.sql
@@ -0,0 +1,28 @@
+select
+ l_shipmode,
+ sum(case
+ when o_orderpriority = '1-URGENT'
+ or o_orderpriority = '2-HIGH'
+ then 1
+ else 0
+ end) as high_line_count,
+ sum(case
+ when o_orderpriority <> '1-URGENT'
+ and o_orderpriority <> '2-HIGH'
+ then 1
+ else 0
+ end) as low_line_count
+from
+ {orders_ds},
+ {line_item_ds}
+where
+ o_orderkey = l_orderkey
+ and l_shipmode in ('MAIL', 'SHIP')
+ and l_commitdate < l_receiptdate
+ and l_shipdate < l_commitdate
+ and l_receiptdate >= date '1994-01-01'
+ and l_receiptdate < date '1994-01-01' + interval '1' year
+group by
+ l_shipmode
+order by
+ l_shipmode
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q13.sql b/third_party/bigframes_vendored/tpch/sql_queries/q13.sql
new file mode 100644
index 0000000000..d1616f5360
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q13.sql
@@ -0,0 +1,18 @@
+SELECT
+ c_count, COUNT(*) AS custdist
+FROM (
+ SELECT
+ c_custkey,
+ COUNT(o_orderkey) AS c_count
+ FROM
+ {customer_ds} LEFT OUTER JOIN {orders_ds} ON
+ c_custkey = o_custkey
+ AND o_comment NOT LIKE '%special%requests%'
+ GROUP BY
+ c_custkey
+) AS c_orders
+GROUP BY
+ c_count
+ORDER BY
+ custdist DESC,
+ c_count DESC
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q14.sql b/third_party/bigframes_vendored/tpch/sql_queries/q14.sql
new file mode 100644
index 0000000000..1620ab9762
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q14.sql
@@ -0,0 +1,13 @@
+select
+ round(100.00 * sum(case
+ when p_type like 'PROMO%'
+ then l_extendedprice * (1 - l_discount)
+ else 0
+ end) / sum(l_extendedprice * (1 - l_discount)), 2) as promo_revenue
+from
+ {line_item_ds},
+ {part_ds}
+where
+ l_partkey = p_partkey
+ and l_shipdate >= date '1995-09-01'
+ and l_shipdate < date '1995-09-01' + interval '1' month
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q15.sql b/third_party/bigframes_vendored/tpch/sql_queries/q15.sql
new file mode 100644
index 0000000000..cbf77827bc
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q15.sql
@@ -0,0 +1,26 @@
+WITH revenue AS (
+ SELECT
+ l_suppkey AS supplier_no,
+ SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
+ FROM
+ {line_item_ds}
+ WHERE
+ l_shipdate >= DATE '1996-01-01'
+ AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' month
+ GROUP BY
+ l_suppkey
+)
+SELECT
+ s.s_suppkey,
+ s.s_name,
+ s.s_address,
+ s.s_phone,
+ r.total_revenue
+FROM
+ {supplier_ds} s
+JOIN
+ revenue r ON s.s_suppkey = r.supplier_no
+WHERE
+ r.total_revenue = (SELECT MAX(total_revenue) FROM revenue)
+ORDER BY
+ s.s_suppkey;
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q16.sql b/third_party/bigframes_vendored/tpch/sql_queries/q16.sql
new file mode 100644
index 0000000000..193c8e462d
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q16.sql
@@ -0,0 +1,30 @@
+select
+ p_brand,
+ p_type,
+ p_size,
+ count(distinct ps_suppkey) as supplier_cnt
+from
+ {part_supp_ds},
+ {part_ds}
+where
+ p_partkey = ps_partkey
+ and p_brand <> 'Brand#45'
+ and p_type not like 'MEDIUM POLISHED%'
+ and p_size in (49, 14, 23, 45, 19, 3, 36, 9)
+ and ps_suppkey not in (
+ select
+ s_suppkey
+ from
+ {supplier_ds}
+ where
+ s_comment like '%Customer%Complaints%'
+ )
+group by
+ p_brand,
+ p_type,
+ p_size
+order by
+ supplier_cnt desc,
+ p_brand,
+ p_type,
+ p_size
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q17.sql b/third_party/bigframes_vendored/tpch/sql_queries/q17.sql
new file mode 100644
index 0000000000..390ecdd33b
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q17.sql
@@ -0,0 +1,17 @@
+select
+ round(sum(l_extendedprice) / 7.0, 2) as avg_yearly
+from
+ {line_item_ds},
+ {part_ds}
+where
+ p_partkey = l_partkey
+ and p_brand = 'Brand#23'
+ and p_container = 'MED BOX'
+ and l_quantity < (
+ select
+ 0.2 * avg(l_quantity)
+ from
+ {line_item_ds}
+ where
+ l_partkey = p_partkey
+ )
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q18.sql b/third_party/bigframes_vendored/tpch/sql_queries/q18.sql
new file mode 100644
index 0000000000..4a98abafb9
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q18.sql
@@ -0,0 +1,33 @@
+select
+ c_name,
+ c_custkey,
+ o_orderkey,
+ o_orderdate as o_orderdat,
+ o_totalprice,
+ sum(l_quantity) as col6
+from
+ {customer_ds},
+ {orders_ds},
+ {line_item_ds}
+where
+ o_orderkey in (
+ select
+ l_orderkey
+ from
+ {line_item_ds}
+ group by
+ l_orderkey having
+ sum(l_quantity) > 300
+ )
+ and c_custkey = o_custkey
+ and o_orderkey = l_orderkey
+group by
+ c_name,
+ c_custkey,
+ o_orderkey,
+ o_orderdate,
+ o_totalprice
+order by
+ o_totalprice desc,
+ o_orderdate
+limit 100
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q19.sql b/third_party/bigframes_vendored/tpch/sql_queries/q19.sql
new file mode 100644
index 0000000000..30b41ff3ff
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q19.sql
@@ -0,0 +1,35 @@
+select
+ round(sum(l_extendedprice* (1 - l_discount)), 2) as revenue
+from
+ {line_item_ds},
+ {part_ds}
+where
+ (
+ p_partkey = l_partkey
+ and p_brand = 'Brand#12'
+ and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
+ and l_quantity >= 1 and l_quantity <= 1 + 10
+ and p_size between 1 and 5
+ and l_shipmode in ('AIR', 'AIR REG')
+ and l_shipinstruct = 'DELIVER IN PERSON'
+ )
+ or
+ (
+ p_partkey = l_partkey
+ and p_brand = 'Brand#23'
+ and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
+ and l_quantity >= 10 and l_quantity <= 20
+ and p_size between 1 and 10
+ and l_shipmode in ('AIR', 'AIR REG')
+ and l_shipinstruct = 'DELIVER IN PERSON'
+ )
+ or
+ (
+ p_partkey = l_partkey
+ and p_brand = 'Brand#34'
+ and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
+ and l_quantity >= 20 and l_quantity <= 30
+ and p_size between 1 and 15
+ and l_shipmode in ('AIR', 'AIR REG')
+ and l_shipinstruct = 'DELIVER IN PERSON'
+ )
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q2.sql b/third_party/bigframes_vendored/tpch/sql_queries/q2.sql
new file mode 100644
index 0000000000..082e1e7f53
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q2.sql
@@ -0,0 +1,44 @@
+select
+ s_acctbal,
+ s_name,
+ n_name,
+ p_partkey,
+ p_mfgr,
+ s_address,
+ s_phone,
+ s_comment
+from
+ {part_ds},
+ {supplier_ds},
+ {part_supp_ds},
+ {nation_ds},
+ {region_ds}
+where
+ p_partkey = ps_partkey
+ and s_suppkey = ps_suppkey
+ and p_size = 15
+ and p_type like '%BRASS'
+ and s_nationkey = n_nationkey
+ and n_regionkey = r_regionkey
+ and r_name = 'EUROPE'
+ and ps_supplycost = (
+ select
+ min(ps_supplycost)
+ from
+ {part_supp_ds},
+ {supplier_ds},
+ {nation_ds},
+ {region_ds}
+ where
+ p_partkey = ps_partkey
+ and s_suppkey = ps_suppkey
+ and s_nationkey = n_nationkey
+ and n_regionkey = r_regionkey
+ and r_name = 'EUROPE'
+ )
+order by
+ s_acctbal desc,
+ n_name,
+ s_name,
+ p_partkey
+limit 100
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q20.sql b/third_party/bigframes_vendored/tpch/sql_queries/q20.sql
new file mode 100644
index 0000000000..03348e82b8
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q20.sql
@@ -0,0 +1,37 @@
+select
+ s_name,
+ s_address
+from
+ {supplier_ds},
+ {nation_ds}
+where
+ s_suppkey in (
+ select
+ ps_suppkey
+ from
+ {part_supp_ds}
+ where
+ ps_partkey in (
+ select
+ p_partkey
+ from
+ {part_ds}
+ where
+ p_name like 'forest%'
+ )
+ and ps_availqty > (
+ select
+ 0.5 * sum(l_quantity)
+ from
+ {line_item_ds}
+ where
+ l_partkey = ps_partkey
+ and l_suppkey = ps_suppkey
+ and l_shipdate >= date '1994-01-01'
+ and l_shipdate < date '1994-01-01' + interval '1' year
+ )
+ )
+ and s_nationkey = n_nationkey
+ and n_name = 'CANADA'
+order by
+ s_name
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q21.sql b/third_party/bigframes_vendored/tpch/sql_queries/q21.sql
new file mode 100644
index 0000000000..444d127469
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q21.sql
@@ -0,0 +1,40 @@
+select
+ s_name,
+ count(*) as numwait
+from
+ {supplier_ds},
+ {line_item_ds} l1,
+ {orders_ds},
+ {nation_ds}
+where
+ s_suppkey = l1.l_suppkey
+ and o_orderkey = l1.l_orderkey
+ and o_orderstatus = 'F'
+ and l1.l_receiptdate > l1.l_commitdate
+ and exists (
+ select
+ *
+ from
+ {line_item_ds} l2
+ where
+ l2.l_orderkey = l1.l_orderkey
+ and l2.l_suppkey <> l1.l_suppkey
+ )
+ and not exists (
+ select
+ *
+ from
+ {line_item_ds} l3
+ where
+ l3.l_orderkey = l1.l_orderkey
+ and l3.l_suppkey <> l1.l_suppkey
+ and l3.l_receiptdate > l3.l_commitdate
+ )
+ and s_nationkey = n_nationkey
+ and n_name = 'SAUDI ARABIA'
+group by
+ s_name
+order by
+ numwait desc,
+ s_name
+limit 100
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q22.sql b/third_party/bigframes_vendored/tpch/sql_queries/q22.sql
new file mode 100644
index 0000000000..a1e1b2a253
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q22.sql
@@ -0,0 +1,36 @@
+select
+ cntrycode,
+ count(*) as numcust,
+ sum(c_acctbal) as totacctbal
+from (
+ select
+ SUBSTR(c_phone, 1, 2) AS cntrycode,
+ c_acctbal
+ from
+ {customer_ds}
+ where
+ SUBSTR(c_phone, 1, 2) in
+ ('13', '31', '23', '29', '30', '18', '17')
+ and c_acctbal > (
+ select
+ avg(c_acctbal)
+ from
+ {customer_ds}
+ where
+ c_acctbal > 0.00
+ and SUBSTR(c_phone, 1, 2) in
+ ('13', '31', '23', '29', '30', '18', '17')
+ )
+ and not exists (
+ select
+ *
+ from
+ {orders_ds}
+ where
+ o_custkey = c_custkey
+ )
+ ) as custsale
+group by
+ cntrycode
+order by
+ cntrycode
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q3.sql b/third_party/bigframes_vendored/tpch/sql_queries/q3.sql
new file mode 100644
index 0000000000..69a40b8ef7
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q3.sql
@@ -0,0 +1,23 @@
+select
+ l_orderkey,
+ sum(l_extendedprice * (1 - l_discount)) as revenue,
+ o_orderdate,
+ o_shippriority
+from
+ {customer_ds},
+ {orders_ds},
+ {line_item_ds}
+where
+ c_mktsegment = 'BUILDING'
+ and c_custkey = o_custkey
+ and l_orderkey = o_orderkey
+ and o_orderdate < '1995-03-15'
+ and l_shipdate > '1995-03-15'
+group by
+ l_orderkey,
+ o_orderdate,
+ o_shippriority
+order by
+ revenue desc,
+ o_orderdate
+limit 10
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q4.sql b/third_party/bigframes_vendored/tpch/sql_queries/q4.sql
new file mode 100644
index 0000000000..57204e8d70
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q4.sql
@@ -0,0 +1,21 @@
+select
+ o_orderpriority,
+ count(*) as order_count
+from
+ {orders_ds}
+where
+ o_orderdate >= date '1993-07-01'
+ and o_orderdate < date '1993-10-01'
+ and exists (
+ select
+ *
+ from
+ {line_item_ds}
+ where
+ l_orderkey = o_orderkey
+ and l_commitdate < l_receiptdate
+ )
+group by
+ o_orderpriority
+order by
+ o_orderpriority
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q5.sql b/third_party/bigframes_vendored/tpch/sql_queries/q5.sql
new file mode 100644
index 0000000000..78dbb96ffa
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q5.sql
@@ -0,0 +1,24 @@
+select
+ n_name,
+ sum(l_extendedprice * (1 - l_discount)) as revenue
+from
+ {customer_ds},
+ {orders_ds},
+ {line_item_ds},
+ {supplier_ds},
+ {nation_ds},
+ {region_ds}
+where
+ c_custkey = o_custkey
+ and l_orderkey = o_orderkey
+ and l_suppkey = s_suppkey
+ and c_nationkey = s_nationkey
+ and s_nationkey = n_nationkey
+ and n_regionkey = r_regionkey
+ and r_name = 'ASIA'
+ and o_orderdate >= date '1994-01-01'
+ and o_orderdate < date '1995-01-01'
+group by
+ n_name
+order by
+ revenue desc
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q6.sql b/third_party/bigframes_vendored/tpch/sql_queries/q6.sql
new file mode 100644
index 0000000000..0ea158332e
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q6.sql
@@ -0,0 +1,9 @@
+select
+ sum(l_extendedprice * l_discount) as revenue
+from
+ {line_item_ds}
+where
+ l_shipdate >= date '1994-01-01'
+ and l_shipdate < date '1994-01-01' + interval '1' year
+ and l_discount between .05 and .07
+ and l_quantity < 24
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q7.sql b/third_party/bigframes_vendored/tpch/sql_queries/q7.sql
new file mode 100644
index 0000000000..002e89e4a0
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q7.sql
@@ -0,0 +1,50 @@
+select
+ supp_nation,
+ cust_nation,
+ l_year,
+ sum(volume) as revenue
+from
+ (
+ select
+ n1.n_name as supp_nation,
+ n2.n_name as cust_nation,
+ EXTRACT(
+ YEAR
+ FROM
+ l_shipdate
+ ) as l_year,
+ l_extendedprice * (1 - l_discount) as volume
+ from
+ {supplier_ds},
+ {line_item_ds},
+ {orders_ds},
+ {customer_ds},
+ {nation_ds} n1,
+ {nation_ds} n2
+ where
+ s_suppkey = l_suppkey
+ and o_orderkey = l_orderkey
+ and c_custkey = o_custkey
+ and s_nationkey = n1.n_nationkey
+ and c_nationkey = n2.n_nationkey
+ and (
+ (
+ n1.n_name = 'FRANCE'
+ and n2.n_name = 'GERMANY'
+ )
+ or (
+ n1.n_name = 'GERMANY'
+ and n2.n_name = 'FRANCE'
+ )
+ )
+ and l_shipdate between date '1995-01-01'
+ and date '1996-12-31'
+ ) as shipping
+group by
+ supp_nation,
+ cust_nation,
+ l_year
+order by
+ supp_nation,
+ cust_nation,
+ l_year
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q8.sql b/third_party/bigframes_vendored/tpch/sql_queries/q8.sql
new file mode 100644
index 0000000000..d4d1ddd275
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q8.sql
@@ -0,0 +1,39 @@
+select
+ o_year,
+ round(
+ sum(case
+ when nation = 'BRAZIL' then volume
+ else 0
+ end) / sum(volume)
+ , 2) as mkt_share
+from
+ (
+ select
+ extract(year from o_orderdate) as o_year,
+ l_extendedprice * (1 - l_discount) as volume,
+ n2.n_name as nation
+ from
+ {part_ds},
+ {supplier_ds},
+ {line_item_ds},
+ {orders_ds},
+ {customer_ds},
+ {nation_ds} n1,
+ {nation_ds} n2,
+ {region_ds}
+ where
+ p_partkey = l_partkey
+ and s_suppkey = l_suppkey
+ and l_orderkey = o_orderkey
+ and o_custkey = c_custkey
+ and c_nationkey = n1.n_nationkey
+ and n1.n_regionkey = r_regionkey
+ and r_name = 'AMERICA'
+ and s_nationkey = n2.n_nationkey
+ and o_orderdate between date '1995-01-01' and date '1996-12-31'
+ and p_type = 'ECONOMY ANODIZED STEEL'
+ ) as all_nations
+group by
+ o_year
+order by
+ o_year
diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q9.sql b/third_party/bigframes_vendored/tpch/sql_queries/q9.sql
new file mode 100644
index 0000000000..fcc3e19400
--- /dev/null
+++ b/third_party/bigframes_vendored/tpch/sql_queries/q9.sql
@@ -0,0 +1,32 @@
+select
+ nation,
+ o_year,
+ round(sum(amount), 2) as sum_profit
+from
+ (
+ select
+ n_name as nation,
+ EXTRACT(YEAR FROM o_orderdate) as o_year,
+ l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
+ from
+ {part_ds},
+ {supplier_ds},
+ {line_item_ds},
+ {part_supp_ds},
+ {orders_ds},
+ {nation_ds}
+ where
+ s_suppkey = l_suppkey
+ and ps_suppkey = l_suppkey
+ and ps_partkey = l_partkey
+ and p_partkey = l_partkey
+ and o_orderkey = l_orderkey
+ and s_nationkey = n_nationkey
+ and p_name like '%green%'
+ ) as profit
+group by
+ nation,
+ o_year
+order by
+ nation,
+ o_year desc
diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py
index 1fef294cef..d9b9875805 100644
--- a/third_party/bigframes_vendored/version.py
+++ b/third_party/bigframes_vendored/version.py
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-__version__ = "1.34.0"
+__version__ = "1.35.0"