Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings
This repository was archived by the owner on May 7, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 2 bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -967,7 +967,7 @@ def _compute_dry_run(
}

dry_run_stats = dry_runs.get_query_stats_with_dtypes(
query_job, column_dtypes, self.index.dtypes
query_job, column_dtypes, self.index.dtypes, self.expr.node
)
return dry_run_stats, query_job

Expand Down
38 changes: 36 additions & 2 deletions 38 bigframes/session/dry_runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import pandas

from bigframes import dtypes
from bigframes.core import bigframe_node, nodes


def get_table_stats(table: bigquery.Table) -> pandas.Series:
Expand Down Expand Up @@ -86,13 +87,26 @@ def get_query_stats_with_dtypes(
query_job: bigquery.QueryJob,
column_dtypes: Dict[str, dtypes.Dtype],
index_dtypes: Sequence[dtypes.Dtype],
expr_root: bigframe_node.BigFrameNode | None = None,
) -> pandas.Series:
"""
Returns important stats from the query job as a Pandas Series. The dtypes information is added too.

Args:
expr_root (Optional):
The root of the expression tree that may contain local data, whose size is added to the
total bytes count if available.

"""
index = ["columnCount", "columnDtypes", "indexLevel", "indexDtypes"]
values = [len(column_dtypes), column_dtypes, len(index_dtypes), index_dtypes]

s = pandas.Series(values, index=index)

return pandas.concat([s, get_query_stats(query_job)])
result = pandas.concat([s, get_query_stats(query_job)])
if expr_root is not None:
result["totalBytesProcessed"] += get_local_bytes(expr_root)
return result


def get_query_stats(
Expand Down Expand Up @@ -145,4 +159,24 @@ def get_query_stats(
else None
)

return pandas.Series(values, index=index)
result = pandas.Series(values, index=index)
if result["totalBytesProcessed"] is None:
result["totalBytesProcessed"] = 0
else:
result["totalBytesProcessed"] = int(result["totalBytesProcessed"])

return result


def get_local_bytes(root: bigframe_node.BigFrameNode) -> int:
    """Return the combined buffer size of all local-data nodes under ``root``.

    The tree is folded with ``root.reduce_up``: every node reports the sum of
    its children's results, and ``ReadLocalNode`` instances additionally
    contribute ``local_data_source.data.get_total_buffer_size()``.

    Args:
        root: Root of the expression tree to inspect.

    Returns:
        The accumulated byte count; nodes that are not ``ReadLocalNode`` add
        nothing of their own.
    """

    def _accumulate(
        node: bigframe_node.BigFrameNode, subtree_totals: tuple[int, ...]
    ) -> int:
        # Only nodes that read local data carry bytes of their own; every
        # other node just forwards what its children reported.
        own_bytes = (
            node.local_data_source.data.get_total_buffer_size()
            if isinstance(node, nodes.ReadLocalNode)
            else 0
        )
        return sum(subtree_totals) + own_bytes

    return root.reduce_up(_accumulate)
16 changes: 16 additions & 0 deletions 16 tests/system/small/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -2173,6 +2173,22 @@ def test_read_gbq_query_dry_run(scalars_table_id, session):
_assert_query_dry_run_stats_are_valid(result)


def test_block_dry_run_includes_local_data(session):
    """Dry-run stats of a cross join must account for both local inputs."""
    left = bigframes.dataframe.DataFrame({"col_1": [1, 2, 3]}, session=session)
    right = bigframes.dataframe.DataFrame({"col_2": [1, 2, 3]}, session=session)

    result = left.merge(right, how="cross").to_pandas(dry_run=True)

    assert isinstance(result, pd.Series)
    _assert_query_dry_run_stats_are_valid(result)

    joined_bytes = result["totalBytesProcessed"]
    assert joined_bytes > 0

    # The joined frame's byte count should equal the sum of the byte counts
    # reported when each input frame is dry-run on its own.
    left_bytes = left.to_pandas(dry_run=True)["totalBytesProcessed"]
    right_bytes = right.to_pandas(dry_run=True)["totalBytesProcessed"]
    assert left_bytes + right_bytes == joined_bytes


def _assert_query_dry_run_stats_are_valid(result: pd.Series):
expected_index = pd.Index(
[
Expand Down
Morty Proxy This is a proxified and sanitized view of the page, visit original site.