From 26c6838db53e2012dd4ca2382245812ea0ebb3ec Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 27 Jan 2025 20:26:28 -0800 Subject: [PATCH 01/38] chore: Amend tpch q8 to reduce join (#1328) --- .../bigframes_vendored/tpch/queries/q8.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/third_party/bigframes_vendored/tpch/queries/q8.py b/third_party/bigframes_vendored/tpch/queries/q8.py index 1676ec6349..7ed1dd0150 100644 --- a/third_party/bigframes_vendored/tpch/queries/q8.py +++ b/third_party/bigframes_vendored/tpch/queries/q8.py @@ -62,17 +62,11 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): jn7["VOLUME"] = jn7["L_EXTENDEDPRICE"] * (1.0 - jn7["L_DISCOUNT"]) jn7 = jn7.rename(columns={"N_NAME": "NATION"}) - denominator = jn7.groupby("O_YEAR")["VOLUME"].sum().rename("DENOMINATOR") - numerator = ( - jn7[jn7["NATION"] == var1] - .groupby(jn7["O_YEAR"])["VOLUME"] - .sum() - .rename("NUMERATOR") - ) - jn8 = denominator.to_frame().join(numerator.to_frame(), how="left") + jn7["numerator"] = jn7["VOLUME"].where(jn7["NATION"] == var1, 0) + jn7["denominator"] = jn7["VOLUME"] - # ValueError: Caching with offsets only supported in strictly ordered mode. - jn8["MKT_SHARE"] = (jn8["NUMERATOR"] / jn8["DENOMINATOR"]).round(2) + sums = jn7.groupby("O_YEAR")[["numerator", "denominator"]].sum() + sums["MKT_SHARE"] = (sums["numerator"] / sums["denominator"]).round(2) - result_df = jn8["MKT_SHARE"].sort_index().rename("MKT_SHARE").reset_index() + result_df = sums["MKT_SHARE"].sort_index().rename("MKT_SHARE").reset_index() result_df.to_gbq() From 16b357ee67ac9c8791fc52ac0a7905881ad763d0 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 27 Jan 2025 22:08:58 -0800 Subject: [PATCH 02/38] chore: update tpch benchmark to use to_pandas_batch (#1325) --- scripts/tpch_result_verify.py | 4 +++- third_party/bigframes_vendored/tpch/queries/q1.py | 2 +- third_party/bigframes_vendored/tpch/queries/q10.py | 2 +- third_party/bigframes_vendored/tpch/queries/q11.py | 2 +- third_party/bigframes_vendored/tpch/queries/q12.py | 2 +- third_party/bigframes_vendored/tpch/queries/q13.py | 2 +- third_party/bigframes_vendored/tpch/queries/q14.py | 2 +- third_party/bigframes_vendored/tpch/queries/q15.py | 2 +- third_party/bigframes_vendored/tpch/queries/q16.py | 2 +- third_party/bigframes_vendored/tpch/queries/q17.py | 2 +- third_party/bigframes_vendored/tpch/queries/q18.py | 2 +- third_party/bigframes_vendored/tpch/queries/q2.py | 2 +- third_party/bigframes_vendored/tpch/queries/q20.py | 2 +- third_party/bigframes_vendored/tpch/queries/q21.py | 2 +- third_party/bigframes_vendored/tpch/queries/q22.py | 2 +- third_party/bigframes_vendored/tpch/queries/q3.py | 2 +- third_party/bigframes_vendored/tpch/queries/q4.py | 2 +- third_party/bigframes_vendored/tpch/queries/q5.py | 2 +- third_party/bigframes_vendored/tpch/queries/q6.py | 2 +- third_party/bigframes_vendored/tpch/queries/q7.py | 2 +- third_party/bigframes_vendored/tpch/queries/q8.py | 2 +- third_party/bigframes_vendored/tpch/queries/q9.py | 2 +- 22 files changed, 24 insertions(+), 22 deletions(-) diff --git a/scripts/tpch_result_verify.py b/scripts/tpch_result_verify.py index e241327a4a..c16d7cdc84 100644 --- a/scripts/tpch_result_verify.py +++ b/scripts/tpch_result_verify.py @@ -772,7 +772,9 @@ def verify(query_num=None): file_content = file.read() file_content = re.sub( - r"(\w+)\.to_gbq\(\)", r"return \1.to_pandas()", file_content + 
r"next\((\w+)\.to_pandas_batches\(\)\)", + r"return \1.to_pandas()", + file_content, ) file_content = re.sub(r"_\s*=\s*(\w+)", r"return \1", file_content) sql_result = _execute_query(sql_query) diff --git a/third_party/bigframes_vendored/tpch/queries/q1.py b/third_party/bigframes_vendored/tpch/queries/q1.py index e1fdf85f58..a3d61bce6f 100644 --- a/third_party/bigframes_vendored/tpch/queries/q1.py +++ b/third_party/bigframes_vendored/tpch/queries/q1.py @@ -39,4 +39,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ["L_RETURNFLAG", "L_LINESTATUS"] ) - result.to_gbq() + next(result.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q10.py b/third_party/bigframes_vendored/tpch/queries/q10.py index 1650e9ca34..41165e1ba2 100644 --- a/third_party/bigframes_vendored/tpch/queries/q10.py +++ b/third_party/bigframes_vendored/tpch/queries/q10.py @@ -76,4 +76,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): .head(20) ) - q_final.to_gbq() + next(q_final.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q11.py b/third_party/bigframes_vendored/tpch/queries/q11.py index 385393f781..223bc8aee8 100644 --- a/third_party/bigframes_vendored/tpch/queries/q11.py +++ b/third_party/bigframes_vendored/tpch/queries/q11.py @@ -43,4 +43,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): result_df = result_df.sort_values(by="VALUE", ascending=False) - result_df.to_gbq() + next(result_df.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q12.py b/third_party/bigframes_vendored/tpch/queries/q12.py index e2b7aaf9f2..4a8aca9228 100644 --- a/third_party/bigframes_vendored/tpch/queries/q12.py +++ b/third_party/bigframes_vendored/tpch/queries/q12.py @@ -46,4 +46,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): agg_results = typing.cast(bpd.DataFrame, agg_results).sort_values("L_SHIPMODE") - agg_results.to_gbq() + next(agg_results.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q13.py b/third_party/bigframes_vendored/tpch/queries/q13.py index ea2f0da284..3a69e44c50 100644 --- a/third_party/bigframes_vendored/tpch/queries/q13.py +++ b/third_party/bigframes_vendored/tpch/queries/q13.py @@ -34,4 +34,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ["CUSTDIST", "C_COUNT"], ascending=[False, False] ) - q_final.to_gbq() + next(q_final.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q14.py b/third_party/bigframes_vendored/tpch/queries/q14.py index e2a5a73214..36b5e569cb 100644 --- a/third_party/bigframes_vendored/tpch/queries/q14.py +++ b/third_party/bigframes_vendored/tpch/queries/q14.py @@ -42,4 +42,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): .to_frame(name="PROMO_REVENUE") ) - promo_revenue_percent.to_gbq() + next(promo_revenue_percent.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q15.py b/third_party/bigframes_vendored/tpch/queries/q15.py index adf37f9892..7e73935160 100644 --- a/third_party/bigframes_vendored/tpch/queries/q15.py +++ b/third_party/bigframes_vendored/tpch/queries/q15.py @@ -50,4 +50,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): q_final = max_revenue_suppliers[ ["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE", "TOTAL_REVENUE"] ].sort_values("S_SUPPKEY") - q_final.to_gbq() + next(q_final.to_pandas_batches()) diff --git 
a/third_party/bigframes_vendored/tpch/queries/q16.py b/third_party/bigframes_vendored/tpch/queries/q16.py index 79f42ec42c..2559d7ace6 100644 --- a/third_party/bigframes_vendored/tpch/queries/q16.py +++ b/third_party/bigframes_vendored/tpch/queries/q16.py @@ -47,4 +47,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ascending=[False, True, True, True], ) - q_final.to_gbq() + next(q_final.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q17.py b/third_party/bigframes_vendored/tpch/queries/q17.py index 56289d57ad..62c66acad8 100644 --- a/third_party/bigframes_vendored/tpch/queries/q17.py +++ b/third_party/bigframes_vendored/tpch/queries/q17.py @@ -37,4 +37,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): (q_final[["L_EXTENDEDPRICE"]].sum() / 7.0).round(2).to_frame(name="AVG_YEARLY") ) - q_final.to_gbq() + next(q_final.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q18.py b/third_party/bigframes_vendored/tpch/queries/q18.py index f645a08681..ac42613b17 100644 --- a/third_party/bigframes_vendored/tpch/queries/q18.py +++ b/third_party/bigframes_vendored/tpch/queries/q18.py @@ -48,4 +48,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ) q_final = final_result.head(100) - q_final.to_gbq() + next(q_final.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q2.py b/third_party/bigframes_vendored/tpch/queries/q2.py index f388252993..5a745db6fb 100644 --- a/third_party/bigframes_vendored/tpch/queries/q2.py +++ b/third_party/bigframes_vendored/tpch/queries/q2.py @@ -59,4 +59,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ) result_df = sort.head(100) - result_df.to_gbq() + next(result_df.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q20.py b/third_party/bigframes_vendored/tpch/queries/q20.py index fded5f5c97..fc36dd8b82 100644 --- a/third_party/bigframes_vendored/tpch/queries/q20.py +++ b/third_party/bigframes_vendored/tpch/queries/q20.py @@ -61,4 +61,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): final_result = final_filtered.merge(q3, left_on="PS_SUPPKEY", right_on="S_SUPPKEY") final_result = final_result[["S_NAME", "S_ADDRESS"]].sort_values(by="S_NAME") - final_result.to_gbq() + next(final_result.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q21.py b/third_party/bigframes_vendored/tpch/queries/q21.py index 097a730d43..37fc2e75d1 100644 --- a/third_party/bigframes_vendored/tpch/queries/q21.py +++ b/third_party/bigframes_vendored/tpch/queries/q21.py @@ -56,4 +56,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): by=["NUMWAIT", "S_NAME"], ascending=[False, True] ).head(100) - q_final.to_gbq() + next(q_final.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q22.py b/third_party/bigframes_vendored/tpch/queries/q22.py index bc648ef392..e593b7beac 100644 --- a/third_party/bigframes_vendored/tpch/queries/q22.py +++ b/third_party/bigframes_vendored/tpch/queries/q22.py @@ -52,4 +52,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): result = result.sort_values(by="CNTRYCODE") - result.to_gbq() + next(result.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q3.py b/third_party/bigframes_vendored/tpch/queries/q3.py index fb09abe159..9fb089fcef 100644 --- a/third_party/bigframes_vendored/tpch/queries/q3.py +++ 
b/third_party/bigframes_vendored/tpch/queries/q3.py @@ -39,4 +39,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): sorted_sel = sel.sort_values(by=["REVENUE", "O_ORDERDATE"], ascending=[False, True]) result_df = sorted_sel.head(10) - result_df.to_gbq() + next(result_df.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q4.py b/third_party/bigframes_vendored/tpch/queries/q4.py index d149a71f71..bc91aa1ada 100644 --- a/third_party/bigframes_vendored/tpch/queries/q4.py +++ b/third_party/bigframes_vendored/tpch/queries/q4.py @@ -32,4 +32,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): agg = gb.agg(ORDER_COUNT=bpd.NamedAgg(column="L_ORDERKEY", aggfunc="count")) result_df = typing.cast(bpd.DataFrame, agg).sort_values(["O_ORDERPRIORITY"]) - result_df.to_gbq() + next(result_df.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q5.py b/third_party/bigframes_vendored/tpch/queries/q5.py index 9839c025a5..4b0c522b37 100644 --- a/third_party/bigframes_vendored/tpch/queries/q5.py +++ b/third_party/bigframes_vendored/tpch/queries/q5.py @@ -52,4 +52,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): gb = jn5.groupby("N_NAME", as_index=False)["REVENUE"].sum() result_df = gb.sort_values("REVENUE", ascending=False) - result_df.to_gbq() + next(result_df.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q6.py b/third_party/bigframes_vendored/tpch/queries/q6.py index b883837fe2..2e5272073b 100644 --- a/third_party/bigframes_vendored/tpch/queries/q6.py +++ b/third_party/bigframes_vendored/tpch/queries/q6.py @@ -27,4 +27,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): .to_frame() ) - result_df.to_gbq() + next(result_df.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q7.py b/third_party/bigframes_vendored/tpch/queries/q7.py index 93047dc299..7325166871 100644 --- a/third_party/bigframes_vendored/tpch/queries/q7.py +++ b/third_party/bigframes_vendored/tpch/queries/q7.py @@ -60,4 +60,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): result_df = typing.cast(bpd.DataFrame, agg).sort_values( ["SUPP_NATION", "CUST_NATION", "L_YEAR"] ) - result_df.to_gbq() + next(result_df.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q8.py b/third_party/bigframes_vendored/tpch/queries/q8.py index 7ed1dd0150..0dfe2c1208 100644 --- a/third_party/bigframes_vendored/tpch/queries/q8.py +++ b/third_party/bigframes_vendored/tpch/queries/q8.py @@ -69,4 +69,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): sums["MKT_SHARE"] = (sums["numerator"] / sums["denominator"]).round(2) result_df = sums["MKT_SHARE"].sort_index().rename("MKT_SHARE").reset_index() - result_df.to_gbq() + next(result_df.to_pandas_batches()) diff --git a/third_party/bigframes_vendored/tpch/queries/q9.py b/third_party/bigframes_vendored/tpch/queries/q9.py index c2b52789bd..cd95fa8b56 100644 --- a/third_party/bigframes_vendored/tpch/queries/q9.py +++ b/third_party/bigframes_vendored/tpch/queries/q9.py @@ -65,4 +65,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ["NATION", "O_YEAR"], ascending=[True, False] ) - q_final.to_gbq() + next(q_final.to_pandas_batches()) From 5c2a2c6086be20cba7da08ecd37899699aab518f Mon Sep 17 00:00:00 2001 From: rey-esp Date: Tue, 28 Jan 2025 12:36:10 -0600 Subject: [PATCH 03/38] feat: allow `case_when` to change dtypes if case list contains 
the condition `(True, some_default_value)` (#1311) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: support forecast_limit_lower_bound and forecast_limit_upper_bound in ARIMA_PLUS (and ARIMA_PLUS_XREG) models * update doc string * feat: allow case_when to change dtypes if case list contains the condition True * revert bigframes/ml/forecasting.py * revert bigframes/ml/utils.py * revert tests/system/large/ml/test_forecasting.py * Update third_party/bigframes_vendored/pandas/core/series.py Co-authored-by: Tim Sweña (Swast) * Update third_party/bigframes_vendored/pandas/core/series.py * Update bigframes/series.py * Update tests/system/small/test_series.py --------- Co-authored-by: Tim Sweña (Swast) --- bigframes/series.py | 14 +++++++- tests/system/small/test_series.py | 36 +++++++++++++++++++ .../bigframes_vendored/pandas/core/series.py | 15 ++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/bigframes/series.py b/bigframes/series.py index e705a97fa9..8a0aaf8d59 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -483,7 +483,19 @@ def between(self, left, right, inclusive="both"): ) def case_when(self, caselist) -> Series: - cases = list(itertools.chain(*caselist, (True, self))) + cases = [] + + for condition, output in itertools.chain(caselist, [(True, self)]): + cases.append(condition) + cases.append(output) + # In pandas, the default value if no case matches is the original value. + # This makes it impossible to change the type of the column, but if + # the condition is always True, we know it will match and no subsequent + # conditions matter (including the fallback to `self`). This break allows + # the type to change (see: internal issue 349926559). + if condition is True: + break + return self._apply_nary_op( ops.case_when_op, cases, diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 3d76122e9d..aa1d6262d5 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -2862,6 +2862,42 @@ def test_series_case_when(scalars_dfs_maybe_ordered): ) +def test_series_case_when_change_type(scalars_dfs_maybe_ordered): + pytest.importorskip( + "pandas", + minversion="2.2.0", + reason="case_when added in pandas 2.2.0", + ) + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + bf_series = scalars_df["int64_col"] + pd_series = scalars_pandas_df["int64_col"] + + # TODO(tswast): pandas case_when appears to assume True when a value is + # null. I suspect this should be considered a bug in pandas. + + bf_conditions = [ + ((bf_series > 645).fillna(True), scalars_df["string_col"]), + ((bf_series <= -100).fillna(True), pd.NA), + (True, "not_found"), + ] + + pd_conditions = [ + ((pd_series > 645).fillna(True), scalars_pandas_df["string_col"]), + ((pd_series <= -100).fillna(True), pd.NA), + # pandas currently fails if both the condition and the value are literals. 
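+        # As a workaround, both are expanded to per-row lists of matching length.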
+ ([True] * len(pd_series), ["not_found"] * len(pd_series)), + ] + + bf_result = bf_series.case_when(bf_conditions).to_pandas() + pd_result = pd_series.case_when(pd_conditions) + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype("string[pyarrow]"), + ) + + def test_to_frame(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 727e25836a..8b9a76d441 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2648,6 +2648,21 @@ def case_when( 3 2 Name: c, dtype: Int64 + If you'd like to change the type, add a case with the condition True at the end of the case list + + >>> c.case_when( + ... caselist=[ + ... (a.gt(0), 'a'), # condition, replacement + ... (b.gt(0), 'b'), + ... (True, 'c'), + ... ] + ... ) + 0 c + 1 b + 2 a + 3 a + Name: c, dtype: string + **See also:** - :func:`bigframes.pandas.Series.mask` : Replace values where the condition is True. From f3061f8f28dbe044d632867385780d5cfc906969 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Tue, 28 Jan 2025 10:51:10 -0800 Subject: [PATCH 04/38] chore: filter internal calls. (#1329) --- bigframes/core/log_adapter.py | 3 +++ tests/unit/core/test_log_adapter.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py index 36aa6682bd..d234d9be28 100644 --- a/bigframes/core/log_adapter.py +++ b/bigframes/core/log_adapter.py @@ -63,6 +63,9 @@ def submit_pandas_labels( - 'PANDAS_PARAM_TRACKING_TASK': Indicates that the unimplemented feature is a parameter of a method. """ + if method_name.startswith("_") and not method_name.startswith("__"): + return + labels_dict = { "task": task, "class_name": class_name.lower(), diff --git a/tests/unit/core/test_log_adapter.py b/tests/unit/core/test_log_adapter.py index 7b626838ac..d183f4479e 100644 --- a/tests/unit/core/test_log_adapter.py +++ b/tests/unit/core/test_log_adapter.py @@ -155,3 +155,13 @@ def test_submit_pandas_labels_without_valid_params_for_param_logging(mock_bqclie # For param tracking task without kwargs, we won't submit labels mock_bqclient.query.assert_not_called() + + +def test_submit_pandas_labels_with_internal_method(mock_bqclient): + log_adapter.submit_pandas_labels( + mock_bqclient, + "Series", + "_repr_latex_", + task=log_adapter.PANDAS_API_TRACKING_TASK, + ) + mock_bqclient.query.assert_not_called() From 2bb068f07dad86a36d17763d746b94bba019783e Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 28 Jan 2025 11:37:14 -0800 Subject: [PATCH 05/38] chore: add experiment blob read_gbq_object_table (#1324) * chore: add experiment blob read_gbq_object_table * fix unit --- bigframes/pandas/__init__.py | 2 ++ bigframes/pandas/io/api.py | 15 +++++++++++++++ bigframes/session/__init__.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 4a5e4d4b3a..93c08a22aa 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -42,6 +42,7 @@ read_gbq, read_gbq_function, read_gbq_model, + read_gbq_object_table, read_gbq_query, read_gbq_table, read_json, @@ -306,6 +307,7 @@ def reset_session(): "read_gbq", "read_gbq_function", "read_gbq_model", + "read_gbq_object_table", "read_gbq_query", "read_gbq_table", 
"read_json", diff --git a/bigframes/pandas/io/api.py b/bigframes/pandas/io/api.py index 454b2e729e..a119ff67b0 100644 --- a/bigframes/pandas/io/api.py +++ b/bigframes/pandas/io/api.py @@ -193,6 +193,21 @@ def read_gbq_model(model_name: str): read_gbq_model.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_model) +def read_gbq_object_table( + object_table: str, *, name: Optional[str] = None +) -> bigframes.dataframe.DataFrame: + return global_session.with_default_session( + bigframes.session.Session.read_gbq_object_table, + object_table, + name=name, + ) + + +read_gbq_object_table.__doc__ = inspect.getdoc( + bigframes.session.Session.read_gbq_object_table +) + + def read_gbq_query( query: str, *, diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 02f79a7d99..89ea0eee69 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1625,6 +1625,9 @@ def from_glob_path( ) -> dataframe.DataFrame: r"""Create a BigFrames DataFrame that contains a BigFrames Blob column from a global wildcard path. + .. note:: + BigFrames Blob is still under experiments. It may not work and subject to change in the future. + Args: path (str): The wildcard global path, such as "gs:////\*". @@ -1641,6 +1644,7 @@ def from_glob_path( if not bigframes.options.experiments.blob: raise NotImplementedError() + # TODO(garrettwu): switch to pseudocolumn when b/374988109 is done. connection = connection or self._bq_connection connection = bigframes.clients.resolve_full_bq_connection_name( connection, @@ -1653,6 +1657,33 @@ def from_glob_path( s = self.read_gbq(table)["uri"].str.to_blob(connection) return s.rename(name).to_frame() + def read_gbq_object_table( + self, object_table: str, *, name: Optional[str] = None + ) -> dataframe.DataFrame: + """Read an existing object table to create a BigFrames Blob DataFrame. Use the connection of the object table for the connection of the blob. + This function dosen't retrieve the object table data. If you want to read the data, use read_gbq() instead. + + .. note:: + BigFrames Blob is still under experiments. It may not work and subject to change in the future. + + Args: + object_table (str): name of the object table of form ... + name (str or None): the returned blob column name. + + Returns: + bigframes.pandas.DataFrame: + Result BigFrames DataFrame. + """ + if not bigframes.options.experiments.blob: + raise NotImplementedError() + + # TODO(garrettwu): switch to pseudocolumn when b/374988109 is done. 
+ table = self.bqclient.get_table(object_table) + connection = table._properties["externalDataConfiguration"]["connectionId"] + + s = self.read_gbq(object_table)["uri"].str.to_blob(connection) + return s.rename(name).to_frame() + def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) From 68d73a9217b73e4c878d71227a29e3094a448265 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 28 Jan 2025 13:24:58 -0800 Subject: [PATCH 06/38] refactor: Preprocess expression trees to pull up ordering (#1321) --- bigframes/core/__init__.py | 4 +- bigframes/core/blocks.py | 2 +- bigframes/core/compile/aggregate_compiler.py | 14 +- bigframes/core/compile/api.py | 10 +- bigframes/core/compile/compiled.py | 982 ++---------------- bigframes/core/compile/compiler.py | 309 ++---- bigframes/core/compile/concat.py | 64 -- bigframes/core/compile/explode.py | 78 -- bigframes/core/compile/single_column.py | 106 -- bigframes/core/groupby/__init__.py | 6 +- bigframes/core/nodes.py | 55 +- bigframes/core/ordering.py | 168 +-- bigframes/core/rewrite/__init__.py | 2 + bigframes/core/rewrite/order.py | 427 ++++++++ bigframes/core/window_spec.py | 8 +- bigframes/operations/aggregations.py | 89 +- noxfile.py | 4 +- tests/system/small/test_series.py | 1 + .../sql/compilers/bigquery/__init__.py | 2 +- 19 files changed, 815 insertions(+), 1516 deletions(-) create mode 100644 bigframes/core/rewrite/order.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index d9bba9bdb0..f573a5bbb3 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -120,7 +120,9 @@ def from_table( if offsets_col: ordering = orderings.TotalOrdering.from_offset_col(offsets_col) elif primary_key: - ordering = orderings.TotalOrdering.from_primary_key(primary_key) + ordering = orderings.TotalOrdering.from_primary_key( + [ids.ColumnId(key_part) for key_part in primary_key] + ) # Scan all columns by default, we define this list as it can be pruned while preserving source_def scan_list = nodes.ScanList( diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 727ee013f8..e6d0480114 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1410,7 +1410,7 @@ def grouped_head( block, result_id = self.apply_window_op( value_columns[0], - agg_ops.rank_op, + agg_ops.count_op, window_spec=window_spec, ) diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 7a018a662e..02c7ae128b 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -55,7 +55,7 @@ def compile_aggregate( return compile_nullary_agg(aggregate.op) if isinstance(aggregate, ex.UnaryAggregation): input = scalar_compiler.compile_expression(aggregate.arg, bindings=bindings) - if aggregate.op.can_order_by: + if not aggregate.op.order_independent: return compile_ordered_unary_agg(aggregate.op, input, order_by=order_by) # type: ignore else: return compile_unary_agg(aggregate.op, input) # type: ignore @@ -150,6 +150,11 @@ def _(op: agg_ops.SizeOp, window=None) -> ibis_types.NumericValue: return _apply_window_if_present(ibis_ops.count(1), window) +@compile_unary_agg.register +def _(op: agg_ops.SizeUnaryOp, _, window=None) -> ibis_types.NumericValue: + return _apply_window_if_present(ibis_ops.count(1), window) + + @compile_unary_agg.register @numeric_op def _( @@ -171,13 +176,6 @@ def _( column: ibis_types.NumericColumn, window=None, ) -> ibis_types.NumericValue: - # 
PERCENTILE_CONT has very few allowed windows. For example, "window - # framing clause is not allowed for analytic function percentile_cont". - if window is not None: - raise NotImplementedError( - f"Median with windowing is not supported. {constants.FEEDBACK_LINK}" - ) - # TODO(swast): Allow switching between exact and approximate median. # For now, the best we can do is an approximate median when we're doing # an aggregation, as PERCENTILE_CONT is only an analytic function. diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py index 61eaa63f85..9280cfbb7b 100644 --- a/bigframes/core/compile/api.py +++ b/bigframes/core/compile/api.py @@ -24,9 +24,7 @@ import bigframes.core.ordering import bigframes.core.schema -_STRICT_COMPILER = compiler.Compiler( - strict=True, enable_pruning=True, enable_densify_ids=True -) +_STRICT_COMPILER = compiler.Compiler(strict=True) class SQLCompiler: @@ -72,9 +70,7 @@ def compile_raw( def test_only_try_evaluate(node: bigframes.core.nodes.BigFrameNode): """Use only for unit testing paths - not fully featured. Will throw exception if fails.""" node = _STRICT_COMPILER._preprocess(node) - ibis = _STRICT_COMPILER.compile_ordered_ir(node)._to_ibis_expr( - ordering_mode="unordered" - ) + ibis = _STRICT_COMPILER.compile_node(node)._to_ibis_expr() return ibis.pandas.connect({}).execute(ibis) @@ -83,7 +79,7 @@ def test_only_ibis_inferred_schema(node: bigframes.core.nodes.BigFrameNode): import bigframes.core.schema node = _STRICT_COMPILER._preprocess(node) - compiled = _STRICT_COMPILER.compile_unordered_ir(node) + compiled = _STRICT_COMPILER.compile_node(node) items = tuple( bigframes.core.schema.SchemaItem(name, compiled.get_column_type(ibis_id)) for name, ibis_id in zip(node.schema.names, compiled.column_ids) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 15805a38fc..a55307e0a4 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -13,21 +13,17 @@ # limitations under the License. 
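+# This refactor collapses the old BaseIbisIR/OrderedIR hierarchy into a single
+# UnorderedIR: ordering is now pulled up by rewrite passes (see
+# bigframes/core/rewrite/order.py in this patch) before compilation.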
from __future__ import annotations -import abc import functools import itertools import typing -from typing import Collection, Literal, Optional, Sequence +from typing import Collection, Optional, Sequence import bigframes_vendored.ibis import bigframes_vendored.ibis.backends.bigquery.backend as ibis_bigquery -import bigframes_vendored.ibis.backends.bigquery.datatypes as ibis_bigquery_dtatatypes import bigframes_vendored.ibis.common.deferred as ibis_deferred # type: ignore import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.operations as ibis_ops -import bigframes_vendored.ibis.expr.schema as ibis_schema import bigframes_vendored.ibis.expr.types as ibis_types -import google.cloud.bigquery import pandas import bigframes.core.compile.aggregate_compiler as agg_compiler @@ -36,31 +32,20 @@ import bigframes.core.compile.scalar_op_compiler as op_compilers import bigframes.core.expression as ex import bigframes.core.guid -import bigframes.core.identifiers as ids -from bigframes.core.ordering import ( - ascending_over, - encode_order_string, - OrderingExpression, - RowOrdering, - TotalOrdering, -) +from bigframes.core.ordering import OrderingExpression import bigframes.core.sql from bigframes.core.window_spec import RangeWindowBounds, RowsWindowBounds, WindowSpec import bigframes.dtypes import bigframes.operations.aggregations as agg_ops -ORDER_ID_COLUMN = "bigframes_ordering_id" PREDICATE_COLUMN = "bigframes_predicate" -T = typing.TypeVar("T", bound="BaseIbisIR") - op_compiler = op_compilers.scalar_op_compiler -class BaseIbisIR(abc.ABC): - """Implementation detail, contains common logic between ordered and unordered IR""" - +# Ibis Implementations +class UnorderedIR: def __init__( self, table: ibis_types.Table, @@ -68,6 +53,7 @@ def __init__( predicates: Optional[Collection[ibis_types.BooleanValue]] = None, ): self._table = table + # Deferred predicates probably no longer needed? self._predicates = tuple(predicates) if predicates is not None else () # Allow creating a DataFrame directly from an Ibis table expression. # TODO(swast): Validate that each column references the same table (or @@ -83,6 +69,47 @@ def __init__( # dictionary mapping names to column values. self._column_names = {column.get_name(): column for column in self._columns} + def builder(self): + """Creates a mutable builder for expressions.""" + # Since ArrayValue is intended to be immutable (immutability offers + # potential opportunities for caching, though we might need to introduce + # more node types for that to be useful), we create a builder class. + return UnorderedIR.Builder( + self._table, + columns=self._columns, + predicates=self._predicates, + ) + + def to_sql( + self, + *, + order_by: Sequence[OrderingExpression] = (), + limit: Optional[int] = None, + selections: Optional[Sequence[str]] = None, + ) -> str: + ibis_table = self._to_ibis_expr() + # This set of output transforms maybe should be its own output node?? 
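+        # The ibis expression is compiled to SQL first, then wrapped in an
+        # outer SELECT so that ORDER BY / LIMIT apply to the final projection.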
+ if order_by or limit: + sql = ibis_bigquery.Backend().compile(ibis_table) + sql = ( + bigframes.core.compile.googlesql.Select() + .from_(sql) + .select(selections or self.column_ids) + .sql() + ) + + # Single row frames may not have any ordering columns + if len(order_by) > 0: + order_by_clause = bigframes.core.sql.ordering_clause(order_by) + sql += f"\n{order_by_clause}" + if limit is not None: + if not isinstance(limit, int): + raise TypeError(f"Limit param: {limit} must be an int.") + sql += f"\nLIMIT {limit}" + else: + sql = ibis_bigquery.Backend().compile(self._to_ibis_expr()) + return typing.cast(str, sql) + @property def columns(self) -> typing.Tuple[ibis_types.Value, ...]: return self._columns @@ -104,61 +131,33 @@ def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: def _ibis_bindings(self) -> dict[str, ibis_types.Value]: return {col: self._get_ibis_column(col) for col in self.column_ids} - @property - @abc.abstractmethod - def is_ordered_ir(self: T) -> bool: - """Whether it is a OrderedIR or UnorderedIR.""" - ... - - @abc.abstractmethod - def filter(self: T, predicate: ex.Expression) -> T: - """Filter the table on a given expression, the predicate must be a boolean expression.""" - ... - - @abc.abstractmethod - def _reproject_to_table(self: T) -> T: - """ - Internal operators that projects the internal representation into a - new ibis table expression where each value column is a direct - reference to a column in that table expression. Needed after - some operations such as window operations that cannot be used - recursively in projections. - """ - ... - def projection( - self: T, + self, expression_id_pairs: typing.Tuple[typing.Tuple[ex.Expression, str], ...], - ) -> T: + ) -> UnorderedIR: """Apply an expression to the ArrayValue and assign the output to a column.""" bindings = {col: self._get_ibis_column(col) for col in self.column_ids} new_values = [ op_compiler.compile_expression(expression, bindings).name(id) for expression, id in expression_id_pairs ] - result = self._select(tuple([*self._columns, *new_values])) # type: ignore - return result + builder = self.builder() + builder.columns = tuple([*self._columns, *new_values]) + return builder.build() def selection( - self: T, + self, input_output_pairs: typing.Tuple[typing.Tuple[ex.DerefOp, str], ...], - ) -> T: + ) -> UnorderedIR: """Apply an expression to the ArrayValue and assign the output to a column.""" bindings = {col: self._get_ibis_column(col) for col in self.column_ids} values = [ op_compiler.compile_expression(input, bindings).name(id) for input, id in input_output_pairs ] - result = self._select(tuple(values)) # type: ignore - return result - - @abc.abstractmethod - def _select(self: T, values: typing.Tuple[ibis_types.Value]) -> T: - ... - - @abc.abstractmethod - def _set_or_replace_by_id(self: T, id: str, new_value: ibis_types.Value) -> T: - ... 
+ builder = self.builder() + builder.columns = tuple(values) + return builder.build() def _get_ibis_column(self, key: str) -> ibis_types.Value: """Gets the Ibis expression for a given column.""" @@ -178,137 +177,16 @@ def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: bigframes.core.compile.ibis_types.ibis_dtype_to_bigframes_dtype(ibis_type), ) - def _aggregate_base( - self, - table: ibis_types.Table, - order_by: typing.Sequence[ibis_types.Value] = [], - aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]] = [], - by_column_ids: typing.Sequence[ex.DerefOp] = (), - dropna: bool = True, - ) -> OrderedIR: - assert not self.is_ordered_ir or len(order_by) > 0 - - bindings = {col: table[col] for col in self.column_ids} - stats = { - col_out: agg_compiler.compile_aggregate( - aggregate, bindings, order_by=order_by - ) - for aggregate, col_out in aggregations - } - if by_column_ids: - result = table.group_by((ref.id.sql for ref in by_column_ids)).aggregate( - **stats - ) - # Must have deterministic ordering, so order by the unique "by" column - ordering = TotalOrdering( - tuple( - [ - OrderingExpression(ex.DerefOp(ref.id.local_normalized)) - for ref in by_column_ids - ] - ), - total_ordering_columns=frozenset( - [ex.DerefOp(ref.id.local_normalized) for ref in by_column_ids] - ), - ) - columns = tuple(result[key] for key in result.columns) - expr = OrderedIR(result, columns=columns, ordering=ordering) - if dropna: - for ref in by_column_ids: - expr = expr._filter(expr._compile_expression(ref).notnull()) - return expr - else: - aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} - result = table.aggregate(**aggregates) - # Ordering is irrelevant for single-row output, but set ordering id regardless - # as other ops(join etc.) expect it. - # TODO: Maybe can make completely empty - ordering = TotalOrdering( - ordering_value_columns=tuple([]), - total_ordering_columns=frozenset([]), - ) - return OrderedIR( - result, - columns=[result[col_id] for col_id in [*stats.keys()]], - hidden_ordering_columns=[result[ORDER_ID_COLUMN]], - ordering=ordering, - ) - - -# Ibis Implementations -class UnorderedIR(BaseIbisIR): - def __init__( - self, - table: ibis_types.Table, - columns: Sequence[ibis_types.Value], - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, - ): - super().__init__(table, columns, predicates) - - @property - def is_ordered_ir(self) -> bool: - return False - - def builder(self): - """Creates a mutable builder for expressions.""" - # Since ArrayValue is intended to be immutable (immutability offers - # potential opportunities for caching, though we might need to introduce - # more node types for that to be useful), we create a builder class. - return UnorderedIR.Builder( - self._table, - columns=self._columns, - predicates=self._predicates, - ) - - def peek_sql(self, n: int): - # Peek currently implemented as top level LIMIT op. - # Execution engine handles limit pushdown. - # In future, may push down limit/filters in compilation. 
- sql = ibis_bigquery.Backend().compile(self._to_ibis_expr().limit(n)) - return typing.cast(str, sql) - - def to_sql( - self, - offset_column: typing.Optional[str] = None, - ordered: bool = False, - ) -> str: - if offset_column or ordered: - raise ValueError("Cannot produce sorted sql in partial ordering mode") - sql = ibis_bigquery.Backend().compile(self._to_ibis_expr()) - return typing.cast(str, sql) - - def with_total_order(self, by: Sequence[OrderingExpression]) -> OrderedIR: - return OrderedIR( - table=self._table, - columns=self._columns, - predicates=self._predicates, - ordering=TotalOrdering( - ordering_value_columns=tuple(by), - total_ordering_columns=frozenset( - map( - ex.DerefOp, - itertools.chain.from_iterable( - col.referenced_columns for col in by - ), - ) - ), - ), - ) - - def row_count(self, name: str) -> OrderedIR: + def row_count(self, name: str) -> UnorderedIR: original_table = self._to_ibis_expr() ibis_table = original_table.agg( [ original_table.count().name(name), ] ) - return OrderedIR( + return UnorderedIR( ibis_table, (ibis_table[name],), - ordering=TotalOrdering( - ordering_value_columns=(ascending_over(name),), - total_ordering_columns=frozenset([ex.deref(name)]), - ), ) def _to_ibis_expr( @@ -320,22 +198,10 @@ def _to_ibis_expr( """ Creates an Ibis table expression representing the DataFrame. - ArrayValue objects are sorted, so the following options are available - to reflect this in the ibis expression. - - * "string_encoded": An ordered string column is provided in output table. - * "unordered": No ordering information will be provided in output. Only - value columns are projected. - - For offset or ordered column, order_col_name can be used to assign the - output label for the ordering column. If none is specified, the default - column name will be 'bigframes_ordering_id' - Args: expose_hidden_cols: If True, include the hidden ordering columns in the results. - Only compatible with `order_by` and `unordered` - ``ordering_mode``. + Returns: An ibis expression representing the data help by the ArrayValue object. """ @@ -389,7 +255,8 @@ def aggregate( aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], by_column_ids: typing.Sequence[ex.DerefOp] = (), dropna: bool = True, - ) -> OrderedIR: + order_by: typing.Sequence[OrderingExpression] = (), + ) -> UnorderedIR: """ Apply aggregations to the expression. Arguments: @@ -402,9 +269,32 @@ def aggregate( information. """ table = self._to_ibis_expr() - return self._aggregate_base( - table, aggregations=aggregations, by_column_ids=by_column_ids, dropna=dropna - ) + bindings = {col: table[col] for col in self.column_ids} + stats = { + col_out: agg_compiler.compile_aggregate( + aggregate, + bindings, + order_by=_convert_ordering_to_table_values(table, order_by), + ) + for aggregate, col_out in aggregations + } + if by_column_ids: + result = table.group_by((ref.id.sql for ref in by_column_ids)).aggregate( + **stats + ) + columns = tuple(result[key] for key in result.columns) + expr = UnorderedIR(result, columns=columns) + if dropna: + for ref in by_column_ids: + expr = expr._filter(expr._compile_expression(ref).notnull()) + return expr + else: + aggregates = {**stats} + result = table.aggregate(**aggregates) + return UnorderedIR( + result, + columns=[result[col_id] for col_id in [*stats.keys()]], + ) def _uniform_sampling(self, fraction: float) -> UnorderedIR: """Sampling the table on given fraction. 
@@ -419,10 +309,6 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR: columns=columns, ) - def as_ordered_ir(self) -> OrderedIR: - """Convert to OrderedIr, but without any definite ordering.""" - return OrderedIR(self._table, self._columns, predicates=self._predicates) - ## Helpers def _set_or_replace_by_id( self, id: str, new_value: ibis_types.Value @@ -437,11 +323,6 @@ def _set_or_replace_by_id( builder.columns = [*self.columns, new_value.name(id)] return builder.build() - def _select(self, values: typing.Tuple[ibis_types.Value, ...]) -> UnorderedIR: - builder = self.builder() - builder.columns = values - return builder.build() - def _reproject_to_table(self) -> UnorderedIR: """ Internal operators that projects the internal representation into a @@ -475,95 +356,14 @@ def build(self) -> UnorderedIR: predicates=self.predicates, ) - -class OrderedIR(BaseIbisIR): - """Immutable BigQuery DataFrames expression tree. - - Note: Usage of this class is considered to be private and subject to change - at any time. - - This class is a wrapper around Ibis expressions. Its purpose is to defer - Ibis projection operations to keep generated SQL small and correct when - mixing and matching columns from different versions of a DataFrame. - - Args: - table: An Ibis table expression. - columns: Ibis value expressions that can be projected as columns. - hidden_ordering_columns: Ibis value expressions to store ordering. - ordering: An ordering property of the data frame. - predicates: A list of filters on the data frame. - """ - - def __init__( - self, - table: ibis_types.Table, - columns: Sequence[ibis_types.Value], - hidden_ordering_columns: Optional[Sequence[ibis_types.Value]] = None, - ordering: RowOrdering = RowOrdering(), - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, - ): - super().__init__(table, columns, predicates) - self._ordering = ordering - # Meta columns store ordering, or other data that doesn't correspond to dataframe columns - self._hidden_ordering_columns = ( - tuple(hidden_ordering_columns) - if hidden_ordering_columns is not None - else () - ) - - # To allow for more efficient lookup by column name, create a - # dictionary mapping names to column values. - self._column_names = { - ( - column.resolve(table) # type: ignore - # TODO(https://github.com/ibis-project/ibis/issues/7613): use - # public API to refer to Deferred type. - if isinstance(column, ibis_deferred.Deferred) - else column - ).get_name(): column - for column in self._columns - } - self._hidden_ordering_column_names = { - typing.cast(str, column.get_name()): column - for column in self._hidden_ordering_columns - } - ### Validation - value_col_ids = self._column_names.keys() - hidden_col_ids = self._hidden_ordering_column_names.keys() - - all_columns = value_col_ids | hidden_col_ids - ordering_valid = all( - set(ref.sql for ref in col.scalar_expression.column_references).issubset( - all_columns - ) - for col in ordering.all_ordering_columns - ) - if value_col_ids & hidden_col_ids: - raise ValueError( - f"Keys in both hidden and exposed list: {value_col_ids & hidden_col_ids}" - ) - if not ordering_valid: - raise ValueError(f"Illegal ordering keys: {ordering.all_ordering_columns}") - - @property - def is_ordered_ir(self) -> bool: - return True - - @property - def order_non_deterministic(self) -> bool: - # ordering suffix non-determinism is ok, as rand() is used as suffix for auto-generated order keys. 
- # but must be resolved before or explode, otherwise the engine might pull the rand() evaluation above the join, - # creating inconsistencies - return not all(col.deterministic for col in self._ordering.all_ordering_columns) - - @property - def has_total_order(self) -> bool: - return isinstance(self._ordering, TotalOrdering) - @classmethod def from_pandas( - cls, pd_df: pandas.DataFrame, scan_cols: bigframes.core.nodes.ScanList - ) -> OrderedIR: + cls, + pd_df: pandas.DataFrame, + scan_cols: bigframes.core.nodes.ScanList, + offsets: typing.Optional[str] = None, + ) -> UnorderedIR: + # TODO: add offsets """ Builds an in-memory only (SQL only) expr from a pandas dataframe. @@ -574,7 +374,8 @@ def from_pandas( # ibis memtable cannot handle NA, must convert to None # this destroys the schema however ibis_values = pd_df.astype("object").where(pandas.notnull(pd_df), None) # type: ignore - ibis_values = ibis_values.assign(**{ORDER_ID_COLUMN: range(len(pd_df))}) + if offsets: + ibis_values = ibis_values.assign(**{offsets: range(len(pd_df))}) # derive the ibis schema from the original pandas schema ibis_schema = [ ( @@ -583,165 +384,25 @@ def from_pandas( ) for id, dtype, local_label in scan_cols.items ] - ibis_schema.append((ORDER_ID_COLUMN, ibis_dtypes.int64)) + if offsets: + ibis_schema.append((offsets, ibis_dtypes.int64)) keys_memtable = bigframes_vendored.ibis.memtable( ibis_values, schema=bigframes_vendored.ibis.schema(ibis_schema) ) + columns = [ + keys_memtable[local_label].name(col_id.sql) + for col_id, _, local_label in scan_cols.items + ] + if offsets: + columns.append(keys_memtable[offsets].name(offsets)) + return cls( keys_memtable, - columns=[ - keys_memtable[local_label].name(col_id.sql) - for col_id, _, local_label in scan_cols.items - ], - ordering=TotalOrdering.from_offset_col(ORDER_ID_COLUMN), - hidden_ordering_columns=(keys_memtable[ORDER_ID_COLUMN],), - ) - - @property - def _ibis_bindings(self) -> dict[str, ibis_types.Value]: - all_keys = itertools.chain(self.column_ids, self._hidden_column_ids) - return {col: self._get_any_column(col) for col in all_keys} - - @property - def _hidden_column_ids(self) -> typing.Sequence[str]: - return tuple(self._hidden_ordering_column_names.keys()) - - @property - def _ibis_order(self) -> Sequence[ibis_types.Value]: - """Returns a sequence of ibis values which can be directly used to order a - table expression. Has direction modifiers applied.""" - return _convert_ordering_to_table_values( - {**self._column_names, **self._hidden_ordering_column_names}, - self._ordering.all_ordering_columns, - ) - - def to_unordered(self) -> UnorderedIR: - return UnorderedIR(self._table, self._columns, self._predicates) - - def builder(self) -> OrderedIR.Builder: - """Creates a mutable builder for expressions.""" - # Since ArrayValue is intended to be immutable (immutability offers - # potential opportunities for caching, though we might need to introduce - # more node types for that to be useful), we create a builder class. 
- return OrderedIR.Builder( - self._table, - columns=self._columns, - hidden_ordering_columns=self._hidden_ordering_columns, - ordering=self._ordering, - predicates=self._predicates, - ) - - def order_by(self, by: Sequence[OrderingExpression]) -> OrderedIR: - expr_builder = self.builder() - expr_builder.ordering = self._ordering.with_ordering_columns(by) - return expr_builder.build() - - def reversed(self) -> OrderedIR: - expr_builder = self.builder() - expr_builder.ordering = self._ordering.with_reverse() - return expr_builder.build() - - def aggregate( - self, - aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], - by_column_ids: typing.Sequence[ex.DerefOp] = (), - dropna: bool = True, - ) -> OrderedIR: - """ - Apply aggregations to the expression. - Arguments: - aggregations: input_column_id, operation, output_column_id tuples - by_column_ids: column ids of the aggregation key, this is preserved through - the transform - dropna: whether null keys should be dropped - Returns: - OrderedIR - """ - table = self._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) - - all_columns = { - column_name: table[column_name] - for column_name in { - **self._column_names, - **self._hidden_ordering_column_names, - } - } - order_by = _convert_ordering_to_table_values( - all_columns, - self._ordering.all_ordering_columns, - ) - - return self._aggregate_base( - table, - order_by=order_by, - aggregations=aggregations, - by_column_ids=by_column_ids, - dropna=dropna, - ) - - def _uniform_sampling(self, fraction: float) -> OrderedIR: - """Sampling the table on given fraction. - - .. warning:: - The row numbers of result is non-deterministic, avoid to use. - """ - table = self._to_ibis_expr( - ordering_mode="unordered", expose_hidden_cols=True, fraction=fraction - ) - columns = [table[column_name] for column_name in self._column_names] - hidden_ordering_columns = [ - table[column_name] for column_name in self._hidden_ordering_column_names - ] - return OrderedIR( - table, columns=columns, - hidden_ordering_columns=hidden_ordering_columns, - ordering=self._ordering, ) - def promote_offsets(self, col_id: str) -> OrderedIR: - """ - Convenience function to promote copy of column offsets to a value column. Can be used to reset index. - """ - # Special case: offsets already exist - ordering = self._ordering - # Case 1, already have offsets, just create column from them - if ordering.is_sequential and (ordering.total_order_col is not None): - expr_builder = self.builder() - expr_builder.columns = [ - *self.columns, - self._compile_expression( - ordering.total_order_col.scalar_expression - ).name(col_id), - ] - return expr_builder.build() - # Cannot nest analytic expressions, so reproject to cte first if needed. 
- # Also ibis cannot window literals, so need to reproject those (even though this is legal in googlesql) - # Seee: https://github.com/ibis-project/ibis/issues/9773 - can_directly_window = not any( - map( - lambda x: is_literal(x) or is_window(x), - itertools.chain(self._ibis_order, self._predicates), - ) - ) - if not can_directly_window: - return self._reproject_to_table().promote_offsets(col_id) - - window = bigframes_vendored.ibis.window(order_by=self._ibis_order) - if self._predicates: - window = window.group_by(self._reduced_predicate) - offsets = bigframes_vendored.ibis.row_number().over(window) - expr_builder = self.builder() - expr_builder.columns = [ - *self.columns, - offsets.name(col_id), - ] - # Reproject, so that offsets are just a scalar value that can be used elsewhere - expr_builder.ordering = TotalOrdering.from_offset_col(col_id) - return expr_builder.build()._reproject_to_table() - ## Methods that only work with ordering def project_window_op( self, @@ -750,7 +411,7 @@ def project_window_op( output_name: str, *, never_skip_nulls=False, - ) -> OrderedIR: + ) -> UnorderedIR: """ Creates a new expression based on this expression with unary operation applied to one column. column_name: the id of the input column present in the expression @@ -782,9 +443,10 @@ def project_window_op( never_skip_nulls=never_skip_nulls, ) - window = self._ibis_window_from_spec( - window_spec, require_total_order=expression.op.uses_total_row_ordering - ) + if expression.op.order_independent and not window_spec.row_bounded: + # notably percentile_cont does not support ordering clause + window_spec = window_spec.without_order() + window = self._ibis_window_from_spec(window_spec) bindings = {col: self._get_ibis_column(col) for col in self.column_ids} window_op = agg_compiler.compile_analytic( @@ -841,401 +503,10 @@ def project_window_op( result = self._set_or_replace_by_id(output_name, window_op) return result - def _reproject_to_table(self) -> OrderedIR: - table = self._to_ibis_expr( - ordering_mode="unordered", - expose_hidden_cols=True, - ) - columns = [table[column_name] for column_name in self._column_names] - ordering_col_ids = list( - id.sql - for id in itertools.chain.from_iterable( - ref.scalar_expression.column_references - for ref in self._ordering.all_ordering_columns - ) - ) - hidden_ordering_columns = [ - table[column_name] - for column_name in self._hidden_ordering_column_names - if column_name in ordering_col_ids - ] - return OrderedIR( - table, - columns=columns, - hidden_ordering_columns=hidden_ordering_columns, - ordering=self._ordering, - ) - - def to_sql( - self, - ordered: bool = False, - limit: Optional[int] = None, - ) -> str: - if ordered or limit: - # Need to bake ordering expressions into the selected column in order for our ordering clause builder to work. 
- baked_ir = self._bake_ordering() - sql = ibis_bigquery.Backend().compile( - baked_ir._to_ibis_expr( - ordering_mode="unordered", - expose_hidden_cols=True, - ) - ) - sql = ( - bigframes.core.compile.googlesql.Select() - .from_(sql) - .select(self.column_ids) - .sql() - ) - - # Single row frames may not have any ordering columns - if len(baked_ir._ordering.all_ordering_columns) > 0: - order_by_clause = bigframes.core.sql.ordering_clause( - baked_ir._ordering.all_ordering_columns - ) - sql += f"\n{order_by_clause}" - if limit is not None: - if not isinstance(limit, int): - raise TypeError(f"Limit param: {limit} must be an int.") - sql += f"\nLIMIT {limit}" - else: - sql = ibis_bigquery.Backend().compile( - self._to_ibis_expr( - ordering_mode="unordered", - expose_hidden_cols=False, - ) - ) - return typing.cast(str, sql) - - def raw_sql_and_schema( - self, - column_ids: typing.Sequence[str], - ) -> typing.Tuple[str, typing.Sequence[google.cloud.bigquery.SchemaField]]: - """Return sql with all hidden columns. Used to cache with ordering information. - - Also returns schema, as the extra ordering columns are determined compile-time. - """ - col_id_overrides = dict(zip(self.column_ids, column_ids)) - all_columns = (*self.column_ids, *self._hidden_ordering_column_names.keys()) - as_ibis = self._to_ibis_expr( - ordering_mode="unordered", - expose_hidden_cols=True, - ) - as_ibis = as_ibis.select(all_columns).rename(col_id_overrides) - - # Ibis will produce non-nullable schema types, but bigframes should always be nullable - fixed_ibis_schema = ibis_schema.Schema.from_tuples( - (name, dtype.copy(nullable=True)) - for (name, dtype) in as_ibis.schema().items() - ) - bq_schema = ibis_bigquery_dtatatypes.BigQuerySchema.from_ibis(fixed_ibis_schema) - return ibis_bigquery.Backend().compile(as_ibis), bq_schema - - def _to_ibis_expr( - self, - *, - expose_hidden_cols: bool = False, - fraction: Optional[float] = None, - ordering_mode: Literal["string_encoded", "unordered"], - order_col_name: Optional[str] = ORDER_ID_COLUMN, - ): - """ - Creates an Ibis table expression representing the DataFrame. - - ArrayValue objects are sorted, so the following options are available - to reflect this in the ibis expression. - - - * "string_encoded": An ordered string column is provided in output table. - * "unordered": No ordering information will be provided in output. Only - value columns are projected. - - For offset or ordered column, order_col_name can be used to assign the - output label for the ordering column. If none is specified, the default - column name will be 'bigframes_ordering_id' - - Args: - expose_hidden_cols: - If True, include the hidden ordering columns in the results. - Only compatible with `order_by` and `unordered` - ``ordering_mode``. - ordering_mode: - How to construct the Ibis expression from the ArrayValue. See - above for details. - order_col_name: - If the ordering mode outputs a single ordering or offsets - column, use this as the column name. - Returns: - An ibis expression representing the data help by the ArrayValue object. 
- """ - assert ordering_mode in ( - "string_encoded", - "unordered", - ) - if expose_hidden_cols and ordering_mode in ("ordered_col"): - raise ValueError( - f"Cannot expose hidden ordering columns with ordering_mode {ordering_mode}" - ) - - columns = list(self._columns) - columns_to_drop: list[ - str - ] = [] # Ordering/Filtering columns that will be dropped at end - - if self._reduced_predicate is not None: - columns.append(self._reduced_predicate) - # Usually drop predicate as it is will be all TRUE after filtering - if not expose_hidden_cols: - columns_to_drop.append(self._reduced_predicate.get_name()) - - order_columns = self._create_order_columns( - ordering_mode, order_col_name, expose_hidden_cols - ) - columns.extend(order_columns) - - # Special case for empty tables, since we can't create an empty - # projection. - if not columns: - return bigframes_vendored.ibis.memtable([]) - - # Make sure we don't have any unbound (deferred) columns. - table = self._table.select(columns) - - table = table.select(table[column] for column in table.columns) - base_table = table - if self._reduced_predicate is not None: - table = table.filter(base_table[PREDICATE_COLUMN]) - table = table.drop(*columns_to_drop) - if fraction is not None: - table = table.filter( - bigframes_vendored.ibis.random() < ibis_types.literal(fraction) - ) - return table - - def filter(self, predicate: ex.Expression) -> OrderedIR: - for ref in predicate.column_references: - ibis_value = self._get_ibis_column(ref.sql) - if is_window(ibis_value): - # ibis doesn't support qualify syntax, so create CTE if filtering over window expression - # https://github.com/ibis-project/ibis/issues/9775 - return self._reproject_to_table().filter(predicate) - - bindings = {col: self._get_ibis_column(col) for col in self.column_ids} - condition = op_compiler.compile_expression(predicate, bindings) - return self._filter(condition) # type: ignore - - def _filter(self, predicate_value: ibis_types.BooleanValue) -> OrderedIR: - """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - expr = self.builder() - expr.ordering = expr.ordering.with_non_sequential() - expr.predicates = [*self._predicates, predicate_value] - return expr.build() - - def _set_or_replace_by_id(self, id: str, new_value: ibis_types.Value) -> OrderedIR: - """Safely assign by id while maintaining ordering integrity.""" - # TODO: Split into explicit set and replace methods - ordering_col_ids = set( - id.sql - for id in itertools.chain.from_iterable( - col_ref.scalar_expression.column_references - for col_ref in self._ordering.ordering_value_columns - ) - ) - if id in ordering_col_ids: - return self._hide_column(id)._set_or_replace_by_id(id, new_value) - - builder = self.builder() - if id in self.column_ids: - builder.columns = [ - val if (col_id != id) else new_value.name(id) - for col_id, val in zip(self.column_ids, self._columns) - ] - else: - builder.columns = [*self.columns, new_value.name(id)] - return builder.build() - - def _select(self, values: typing.Tuple[ibis_types.Value, ...]) -> OrderedIR: - """Safely assign by id while maintaining ordering integrity.""" - # TODO: Split into explicit set and replace methods - ordering_col_ids = set( - id.sql - for id in itertools.chain.from_iterable( - [ - col_ref.scalar_expression.column_references - for col_ref in self._ordering.ordering_value_columns - ] - ) - ) - ir = self - mappings = {typing.cast(str, value.get_name()): value for value in values} - for ordering_id in 
ordering_col_ids: - # Drop case - if (ordering_id not in mappings) and (ordering_id in ir.column_ids): - # id is being dropped, hide it first - ir = ir._hide_column(ordering_id) - # Mutate case - elif (ordering_id in mappings) and not mappings[ordering_id].equals( - ir._get_any_column(ordering_id) - ): - ir = ir._hide_column(ordering_id) - - builder = ir.builder() - builder.columns = list(values) - return builder.build() - - ## Ordering specific helpers - def _get_any_column(self, key: str) -> ibis_types.Value: - """Gets the Ibis expression for a given column. Will also get hidden columns.""" - all_columns = {**self._column_names, **self._hidden_ordering_column_names} - if key not in all_columns.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, all_columns.keys() - ) - ) - return typing.cast(ibis_types.Value, all_columns[key]) - - def _get_hidden_ordering_column(self, key: str) -> ibis_types.Column: - """Gets the Ibis expression for a given hidden column.""" - if key not in self._hidden_ordering_column_names.keys(): - raise ValueError( - "Column name {} not in set of values: {}".format( - key, self._hidden_ordering_column_names.keys() - ) - ) - return typing.cast(ibis_types.Column, self._hidden_ordering_column_names[key]) - - def _hide_column(self, column_id: str) -> OrderedIR: - """Pushes columns to hidden columns list. Used to hide ordering columns that have been dropped or destructively mutated.""" - expr_builder = self.builder() - # Need to rename column as caller might be creating a new row with the same name but different values. - # Can avoid this if don't allow callers to determine ids and instead generate unique ones in this class. - new_name = ids.ColumnId( - bigframes.core.guid.generate_guid(prefix="bigframes_hidden_") - ) - expr_builder.hidden_ordering_columns = [ - *self._hidden_ordering_columns, - self._get_ibis_column(column_id).name(new_name.sql), - ] - matching_ref = next( - ref for ref in self._ordering.referenced_columns if ref.sql == column_id - ) - # allow_partial_bindings since only remapping hidden column, not all columns - expr_builder.ordering = self._ordering.remap_column_refs( - {matching_ref: new_name}, allow_partial_bindings=True - ) - return expr_builder.build() - - def _bake_ordering(self) -> OrderedIR: - """Bakes ordering expression into the selection, maybe creating hidden columns.""" - ordering_expressions = self._ordering.all_ordering_columns - new_exprs: list[OrderingExpression] = [] - new_baked_cols: list[ibis_types.Value] = [] - for expr in ordering_expressions: - if isinstance(expr.scalar_expression, ex.OpExpression): - baked_column = self._compile_expression(expr.scalar_expression).name( - bigframes.core.guid.generate_guid() - ) - new_baked_cols.append(baked_column) - new_expr = OrderingExpression( - ex.deref(baked_column.get_name()), expr.direction, expr.na_last - ) - new_exprs.append(new_expr) - elif isinstance(expr.scalar_expression, ex.DerefOp): - order_col = expr.scalar_expression.id - new_exprs.append(expr) - if order_col.sql not in self.column_ids: - new_baked_cols.append( - self._ibis_bindings[expr.scalar_expression.id.sql] - ) - - if isinstance(self._ordering, TotalOrdering): - new_ordering: RowOrdering = TotalOrdering( - tuple(new_exprs), - self._ordering.integer_encoding, - self._ordering.string_encoding, - total_ordering_columns=frozenset( - map( - ex.DerefOp, - itertools.chain.from_iterable( - col.referenced_columns for col in new_exprs - ), - ) - ), - ) - else: - new_ordering = RowOrdering( - 
tuple(new_exprs),
-                self._ordering.integer_encoding,
-                self._ordering.string_encoding,
-            )
-        return OrderedIR(
-            self._table,
-            columns=self.columns,
-            hidden_ordering_columns=tuple(new_baked_cols),
-            ordering=new_ordering,
-            predicates=self._predicates,
-        )
-
-    def _create_order_columns(
-        self,
-        ordering_mode: str,
-        order_col_name: Optional[str],
-        expose_hidden_cols: bool,
-    ) -> typing.Sequence[ibis_types.Value]:
-        # Generate offsets if current ordering id semantics are not sufficiently strict
-        if ordering_mode == "string_encoded":
-            return (self._create_string_ordering_column().name(order_col_name),)
-        elif expose_hidden_cols:
-            return self._hidden_ordering_columns
-        return ()
-
-    def _create_offset_column(self) -> ibis_types.IntegerColumn:
-        if self._ordering.total_order_col and self._ordering.is_sequential:
-            offsets = self._compile_expression(
-                self._ordering.total_order_col.scalar_expression
-            )
-            return typing.cast(ibis_types.IntegerColumn, offsets)
-        else:
-            window = bigframes_vendored.ibis.window(order_by=self._ibis_order)
-            if self._predicates:
-                window = window.group_by(self._reduced_predicate)
-            offsets = bigframes_vendored.ibis.row_number().over(window)
-            return typing.cast(ibis_types.IntegerColumn, offsets)
-
-    def _create_string_ordering_column(self) -> ibis_types.StringColumn:
-        if self._ordering.total_order_col and self._ordering.is_string_encoded:
-            string_order_ids = op_compiler.compile_expression(
-                self._ordering.total_order_col.scalar_expression, self._ibis_bindings
-            )
-            return typing.cast(ibis_types.StringColumn, string_order_ids)
-        if (
-            self._ordering.total_order_col
-            and self._ordering.integer_encoding.is_encoded
-        ):
-            # Special case: non-negative integer ordering id can be converted directly to string without regenerating row numbers
-            int_values = self._compile_expression(
-                self._ordering.total_order_col.scalar_expression
-            )
-            return encode_order_string(
-                typing.cast(ibis_types.IntegerColumn, int_values),
-            )
-        else:
-            # Have to build string from scratch
-            window = bigframes_vendored.ibis.window(order_by=self._ibis_order)
-            if self._predicates:
-                window = window.group_by(self._reduced_predicate)
-            row_nums = typing.cast(
-                ibis_types.IntegerColumn,
-                bigframes_vendored.ibis.row_number().over(window),
-            )
-            return encode_order_string(row_nums)
-
     def _compile_expression(self, expr: ex.Expression):
         return op_compiler.compile_expression(expr, self._ibis_bindings)

-    def _ibis_window_from_spec(
-        self, window_spec: WindowSpec, require_total_order: bool
-    ):
+    def _ibis_window_from_spec(self, window_spec: WindowSpec):
         group_by: typing.List[ibis_types.Value] = (
             [
                 typing.cast(
@@ -1255,15 +526,12 @@ def _ibis_window_from_spec(
         # 3. Order-dependent op (navigation functions, array_agg) or row bounds - use total row order to break ties.
         if window_spec.ordering:
             order_by = _convert_ordering_to_table_values(
-                {**self._column_names, **self._hidden_ordering_column_names},
+                self._column_names,
                 window_spec.ordering,
             )
-            if require_total_order or isinstance(window_spec.bounds, RowsWindowBounds):
-                # Some operators need an unambiguous ordering, so the table's total ordering is appended
-                order_by = tuple([*order_by, *self._ibis_order])
-        elif require_total_order or isinstance(window_spec.bounds, RowsWindowBounds):
+        elif window_spec.row_bounded:
             # If window spec has following or preceding bounds, we need to apply an unambiguous ordering.
- order_by = tuple(self._ibis_order) + raise ValueError("No ordering provided for ordered analytic function") else: # Unbound grouping window. Suitable for aggregations but not for analytic function application. order_by = None @@ -1284,30 +552,6 @@ def _ibis_window_from_spec( raise ValueError(f"unrecognized window bounds {bounds}") return window - class Builder: - def __init__( - self, - table: ibis_types.Table, - ordering: RowOrdering, - columns: Collection[ibis_types.Value] = (), - hidden_ordering_columns: Collection[ibis_types.Value] = (), - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, - ): - self.table = table - self.columns = list(columns) - self.hidden_ordering_columns = list(hidden_ordering_columns) - self.ordering = ordering - self.predicates = list(predicates) if predicates is not None else None - - def build(self) -> OrderedIR: - return OrderedIR( - table=self.table, - columns=self.columns, - hidden_ordering_columns=self.hidden_ordering_columns, - ordering=self.ordering, - predicates=self.predicates, - ) - def is_literal(column: ibis_types.Value) -> bool: # Unfortunately, Literals in ibis are not "Columns"s and therefore can't be aggregated. diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 6f47d198c5..99f2aaf15b 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -33,7 +33,6 @@ import bigframes.core.compile.schema_translator import bigframes.core.compile.single_column import bigframes.core.expression as ex -import bigframes.core.guid as guids import bigframes.core.identifiers as ids import bigframes.core.nodes as nodes import bigframes.core.ordering as bf_ordering @@ -50,23 +49,40 @@ class Compiler: # In unstrict mode, ordering from ReadTable or after joins may be ambiguous to improve query performance. 
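To make the ambiguity mentioned above concrete, here is a hedged, plain-Python sketch (invented values, not the bigframes API): with only a partial sort key, tied rows may legally come back from the engine in either order, which is exactly what appending total-order tie-breaker columns in strict mode prevents.

    # Toy illustration: both rows tie on the declared sort key "a".
    rows = [("a", 1), ("a", 0)]
    # Key-only ordering: in SQL the tie order is unspecified (Python's sort
    # just happens to be stable, so this line alone hides the problem).
    key_only = sorted(rows, key=lambda r: r[0])
    # Appending a tie-breaker (total-order) column makes the result deterministic.
    total = sorted(rows, key=lambda r: (r[0], r[1]))
    assert total == [("a", 0), ("a", 1)]
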
strict: bool = True scalar_op_compiler = compile_scalar.ScalarOpCompiler() - enable_pruning: bool = False - enable_densify_ids: bool = False def compile_sql( self, node: nodes.BigFrameNode, ordered: bool, output_ids: typing.Sequence[str] ) -> str: - node = self.set_output_names(node, output_ids) + # TODO: get rid of output_ids arg + assert len(output_ids) == len(list(node.fields)) + node = set_output_names(node, output_ids) if ordered: node, limit = rewrites.pullup_limit_from_slice(node) - ir = self.compile_ordered_ir(self._preprocess(node)) - return ir.to_sql(ordered=True, limit=limit) + node = nodes.bottom_up(node, rewrites.rewrite_slice) + node, ordering = rewrites.pull_up_order( + node, order_root=True, ordered_joins=self.strict + ) + ir = self.compile_node(node) + return ir.to_sql( + order_by=ordering.all_ordering_columns, + limit=limit, + selections=output_ids, + ) else: - ir = self.compile_unordered_ir(self._preprocess(node)) # type: ignore - return ir.to_sql() + node = nodes.bottom_up(node, rewrites.rewrite_slice) + node, _ = rewrites.pull_up_order( + node, order_root=False, ordered_joins=self.strict + ) + ir = self.compile_node(node) + return ir.to_sql(selections=output_ids) def compile_peek_sql(self, node: nodes.BigFrameNode, n_rows: int) -> str: - return self.compile_unordered_ir(self._preprocess(node)).peek_sql(n_rows) + ids = [id.sql for id in node.ids] + node = nodes.bottom_up(node, rewrites.rewrite_slice) + node, _ = rewrites.pull_up_order( + node, order_root=False, ordered_joins=self.strict + ) + return self.compile_node(node).to_sql(limit=n_rows, selections=ids) def compile_raw( self, @@ -74,98 +90,49 @@ def compile_raw( ) -> typing.Tuple[ str, typing.Sequence[google.cloud.bigquery.SchemaField], bf_ordering.RowOrdering ]: - ir = self.compile_ordered_ir(self._preprocess(node)) - sql, schema = ir.raw_sql_and_schema(column_ids=node.schema.names) - return sql, schema, ir._ordering + node = nodes.bottom_up(node, rewrites.rewrite_slice) + node, ordering = rewrites.pull_up_order(node, ordered_joins=self.strict) + ir = self.compile_node(node) + sql = ir.to_sql() + return sql, node.schema.to_bigquery(), ordering def _preprocess(self, node: nodes.BigFrameNode): - if self.enable_pruning: - used_fields = frozenset(field.id for field in node.fields) - node = node.prune(used_fields) node = nodes.bottom_up(node, rewrites.rewrite_slice) - if self.enable_densify_ids: - original_names = [id.name for id in node.ids] - node, _ = rewrites.remap_variables( - node, id_generator=ids.anonymous_serial_ids() - ) - node = self.set_output_names(node, original_names) - return node - - def set_output_names( - self, node: bigframes.core.nodes.BigFrameNode, output_ids: typing.Sequence[str] - ): - # TODO: Create specialized output operators that will handle final names - return nodes.SelectionNode( - node, - tuple( - (ex.DerefOp(old_id), ids.ColumnId(out_id)) - for old_id, out_id in zip(node.ids, output_ids) - ), + node, _ = rewrites.pull_up_order( + node, order_root=False, ordered_joins=self.strict ) - - def compile_ordered_ir(self, node: nodes.BigFrameNode) -> compiled.OrderedIR: - return typing.cast(compiled.OrderedIR, self.compile_node(node, True)) - - def compile_unordered_ir(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR: - return typing.cast(compiled.UnorderedIR, self.compile_node(node, False)) + return node # TODO: Remove cache when schema no longer requires compilation to derive schema (and therefor only compiles for execution) @functools.lru_cache(maxsize=5000) - def compile_node( - 
self, node: nodes.BigFrameNode, ordered: bool = True - ) -> compiled.UnorderedIR | compiled.OrderedIR: + def compile_node(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR: """Compile node into CompileArrayValue. Caches result.""" - return self._compile_node(node, ordered) + return self._compile_node(node) @functools.singledispatchmethod - def _compile_node( - self, node: nodes.BigFrameNode, ordered: bool = True - ) -> compiled.UnorderedIR: + def _compile_node(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR: """Defines transformation but isn't cached, always use compile_node instead""" raise ValueError(f"Can't compile unrecognized node: {node}") @_compile_node.register - def compile_join(self, node: nodes.JoinNode, ordered: bool = True): + def compile_join(self, node: nodes.JoinNode): condition_pairs = tuple( (left.id.sql, right.id.sql) for left, right in node.conditions ) - if ordered: - # In general, joins are an ordering destroying operation. - # With ordering_mode = "partial", make this explicit. In - # this case, we don't need to provide a deterministic ordering. - if self.strict: - left_ordered = self.compile_ordered_ir(node.left_child) - right_ordered = self.compile_ordered_ir(node.right_child) - return bigframes.core.compile.single_column.join_by_column_ordered( - left=left_ordered, - right=right_ordered, - type=node.type, - conditions=condition_pairs, - ) - else: - left_unordered = self.compile_unordered_ir(node.left_child) - right_unordered = self.compile_unordered_ir(node.right_child) - return bigframes.core.compile.single_column.join_by_column_unordered( - left=left_unordered, - right=right_unordered, - type=node.type, - conditions=condition_pairs, - ).as_ordered_ir() - else: - left_unordered = self.compile_unordered_ir(node.left_child) - right_unordered = self.compile_unordered_ir(node.right_child) - return bigframes.core.compile.single_column.join_by_column_unordered( - left=left_unordered, - right=right_unordered, - type=node.type, - conditions=condition_pairs, - ) + left_unordered = self.compile_node(node.left_child) + right_unordered = self.compile_node(node.right_child) + return bigframes.core.compile.single_column.join_by_column_unordered( + left=left_unordered, + right=right_unordered, + type=node.type, + conditions=condition_pairs, + ) @_compile_node.register - def compile_fromrange(self, node: nodes.FromRangeNode, ordered: bool = True): + def compile_fromrange(self, node: nodes.FromRangeNode): # Both start and end are single elements and do not inherently have an order - start = self.compile_unordered_ir(node.start) - end = self.compile_unordered_ir(node.end) + start = self.compile_node(node.start) + end = self.compile_node(node.end) start_table = start._to_ibis_expr() end_table = end._to_ibis_expr() @@ -183,36 +150,25 @@ def compile_fromrange(self, node: nodes.FromRangeNode, ordered: bool = True): .as_table() .unnest([node.output_id.sql]) ) - if ordered: - return compiled.OrderedIR( - labels, - columns=[labels[labels.columns[0]]], - ordering=bf_ordering.TotalOrdering().from_offset_col(labels.columns[0]), - ) - else: - return compiled.UnorderedIR( - labels, - columns=[labels[labels.columns[0]]], - ) + return compiled.UnorderedIR( + labels, + columns=[labels[labels.columns[0]]], + ) @_compile_node.register - def compile_readlocal(self, node: nodes.ReadLocalNode, ordered: bool = True): + def compile_readlocal(self, node: nodes.ReadLocalNode): array_as_pd = pd.read_feather( io.BytesIO(node.feather_bytes), columns=[item.source_id for item in 
node.scan_list.items], ) - ordered_ir = compiled.OrderedIR.from_pandas(array_as_pd, node.scan_list) - if ordered: - return ordered_ir - else: - return ordered_ir.to_unordered() + offsets = node.offsets_col.sql if node.offsets_col else None + return compiled.UnorderedIR.from_pandas( + array_as_pd, node.scan_list, offsets=offsets + ) @_compile_node.register - def compile_readtable(self, node: nodes.ReadTableNode, ordered: bool = True): - if ordered: - return self.compile_read_table_ordered(node.source, node.scan_list) - else: - return self.compile_read_table_unordered(node.source, node.scan_list) + def compile_readtable(self, node: nodes.ReadTableNode): + return self.compile_read_table_unordered(node.source, node.scan_list) def read_table_as_unordered_ibis( self, source: nodes.BigqueryDataSource @@ -250,140 +206,71 @@ def compile_read_table_unordered( ), ) - def compile_read_table_ordered( - self, source: nodes.BigqueryDataSource, scan_list: nodes.ScanList - ): - ibis_table = self.read_table_as_unordered_ibis(source) - if source.ordering is not None: - visible_column_mapping = { - ids.ColumnId(scan_item.source_id): scan_item.id - for scan_item in scan_list.items - } - full_mapping = { - ids.ColumnId(col.name): ids.ColumnId(guids.generate_guid()) - for col in source.ordering.referenced_columns - } - full_mapping.update(visible_column_mapping) - - ordering = source.ordering.remap_column_refs(full_mapping) - hidden_columns = tuple( - ibis_table[source_id.sql].name(out_id.sql) - for source_id, out_id in full_mapping.items() - if source_id not in visible_column_mapping - ) - else: - # In unstrict mode, don't generate total ordering from hashing as this is - # expensive (prevent removing any columns from table scan) - ordering, hidden_columns = bf_ordering.RowOrdering(), () - - return compiled.OrderedIR( - ibis_table, - columns=tuple( - bigframes.core.compile.ibis_types.ibis_value_to_canonical_type( - ibis_table[scan_item.source_id].name(scan_item.id.sql) - ) - for scan_item in scan_list.items - ), - ordering=ordering, - hidden_ordering_columns=hidden_columns, - ) - - @_compile_node.register - def compile_promote_offsets( - self, node: nodes.PromoteOffsetsNode, ordered: bool = True - ): - result = self.compile_ordered_ir(node.child).promote_offsets(node.col_id.sql) - return result if ordered else result.to_unordered() - @_compile_node.register - def compile_filter(self, node: nodes.FilterNode, ordered: bool = True): - return self.compile_node(node.child, ordered).filter(node.predicate) - - @_compile_node.register - def compile_orderby(self, node: nodes.OrderByNode, ordered: bool = True): - if ordered: - if node.is_total_order: - # more efficient, can just discard any previous ordering and get same result - return self.compile_unordered_ir(node.child).with_total_order(node.by) - else: - return self.compile_ordered_ir(node.child).order_by(node.by) - else: - return self.compile_unordered_ir(node.child) + def compile_filter(self, node: nodes.FilterNode): + return self.compile_node(node.child).filter(node.predicate) @_compile_node.register - def compile_reversed(self, node: nodes.ReversedNode, ordered: bool = True): - if ordered: - return self.compile_ordered_ir(node.child).reversed() - else: - return self.compile_unordered_ir(node.child) - - @_compile_node.register - def compile_selection(self, node: nodes.SelectionNode, ordered: bool = True): - result = self.compile_node(node.child, ordered) + def compile_selection(self, node: nodes.SelectionNode): + result = self.compile_node(node.child) selection = 
tuple((ref, id.sql) for ref, id in node.input_output_pairs) return result.selection(selection) @_compile_node.register - def compile_projection(self, node: nodes.ProjectionNode, ordered: bool = True): - result = self.compile_node(node.child, ordered) + def compile_projection(self, node: nodes.ProjectionNode): + result = self.compile_node(node.child) projections = ((expr, id.sql) for expr, id in node.assignments) return result.projection(tuple(projections)) @_compile_node.register - def compile_concat(self, node: nodes.ConcatNode, ordered: bool = True): + def compile_concat(self, node: nodes.ConcatNode): output_ids = [id.sql for id in node.output_ids] - if ordered: - compiled_ordered = [self.compile_ordered_ir(node) for node in node.children] - return concat_impl.concat_ordered(compiled_ordered, output_ids) - else: - compiled_unordered = [ - self.compile_unordered_ir(node) for node in node.children - ] - return concat_impl.concat_unordered(compiled_unordered, output_ids) + compiled_unordered = [self.compile_node(node) for node in node.children] + return concat_impl.concat_unordered(compiled_unordered, output_ids) @_compile_node.register - def compile_rowcount(self, node: nodes.RowCountNode, ordered: bool = True): - result = self.compile_unordered_ir(node.child).row_count(name=node.col_id.sql) - return result if ordered else result.to_unordered() + def compile_rowcount(self, node: nodes.RowCountNode): + result = self.compile_node(node.child).row_count(name=node.col_id.sql) + return result @_compile_node.register - def compile_aggregate(self, node: nodes.AggregateNode, ordered: bool = True): - has_ordered_aggregation_ops = any( - aggregate.op.can_order_by for aggregate, _ in node.aggregations - ) + def compile_aggregate(self, node: nodes.AggregateNode): aggs = tuple((agg, id.sql) for agg, id in node.aggregations) - if ordered and has_ordered_aggregation_ops: - return self.compile_ordered_ir(node.child).aggregate( - aggs, node.by_column_ids, node.dropna - ) - else: - result = self.compile_unordered_ir(node.child).aggregate( - aggs, node.by_column_ids, node.dropna - ) - return result if ordered else result.to_unordered() + result = self.compile_node(node.child).aggregate( + aggs, node.by_column_ids, node.dropna, order_by=node.order_by + ) + return result @_compile_node.register - def compile_window(self, node: nodes.WindowOpNode, ordered: bool = True): - result = self.compile_ordered_ir(node.child).project_window_op( + def compile_window(self, node: nodes.WindowOpNode): + result = self.compile_node(node.child).project_window_op( node.expression, node.window_spec, node.output_name.sql, never_skip_nulls=node.never_skip_nulls, ) - return result if ordered else result.to_unordered() + return result @_compile_node.register - def compile_explode(self, node: nodes.ExplodeNode, ordered: bool = True): + def compile_explode(self, node: nodes.ExplodeNode): offsets_col = node.offsets_col.sql if (node.offsets_col is not None) else None - if ordered: - return bigframes.core.compile.explode.explode_ordered( - self.compile_ordered_ir(node.child), node.column_ids, offsets_col - ) - else: - return bigframes.core.compile.explode.explode_unordered( - self.compile_unordered_ir(node.child), node.column_ids, offsets_col - ) + return bigframes.core.compile.explode.explode_unordered( + self.compile_node(node.child), node.column_ids, offsets_col + ) @_compile_node.register - def compile_random_sample(self, node: nodes.RandomSampleNode, ordered: bool = True): - return self.compile_node(node.child, 
ordered)._uniform_sampling(node.fraction) + def compile_random_sample(self, node: nodes.RandomSampleNode): + return self.compile_node(node.child)._uniform_sampling(node.fraction) + + +def set_output_names( + node: bigframes.core.nodes.BigFrameNode, output_ids: typing.Sequence[str] +): + # TODO: Create specialized output operators that will handle final names + return nodes.SelectionNode( + node, + tuple( + (ex.DerefOp(old_id), ids.ColumnId(out_id)) + for old_id, out_id in zip(node.ids, output_ids) + ), + ) diff --git a/bigframes/core/compile/concat.py b/bigframes/core/compile/concat.py index ede326d00b..742f429f54 100644 --- a/bigframes/core/compile/concat.py +++ b/bigframes/core/compile/concat.py @@ -13,21 +13,11 @@ # limitations under the License. from __future__ import annotations -import math import typing import bigframes_vendored.ibis.expr.api as ibis_api import bigframes.core.compile.compiled as compiled -import bigframes.core.expression as ex -from bigframes.core.ordering import ( - ascending_over, - reencode_order_string, - StringEncoding, - TotalOrdering, -) - -ORDER_ID_COLUMN = "bigframes_ordering_id" def concat_unordered( @@ -49,57 +39,3 @@ def concat_unordered( combined_table, columns=[combined_table[col] for col in combined_table.columns], ) - - -def concat_ordered( - items: typing.Sequence[compiled.OrderedIR], - output_ids: typing.Sequence[str], -) -> compiled.OrderedIR: - """Append together multiple ArrayValue objects.""" - if len(items) == 1: - return items[0] - - tables = [] - prefix_base = 10 - prefix_size = math.ceil(math.log(len(items), prefix_base)) - # Must normalize all ids to the same encoding size - max_encoding_size = max( - *[expression._ordering.string_encoding.length for expression in items], - ) - for i, expr in enumerate(items): - ordering_prefix = str(i).zfill(prefix_size) - renames = { - old_id: new_id for old_id, new_id in zip(expr.column_ids, output_ids) - } - table = expr._to_ibis_expr( - ordering_mode="string_encoded", - order_col_name=ORDER_ID_COLUMN, - ) - table = table.select( - [ - table[col].name(renames[col]) - if col != ORDER_ID_COLUMN - else ( - ordering_prefix - + reencode_order_string(table[ORDER_ID_COLUMN], max_encoding_size) - ).name(ORDER_ID_COLUMN) - for col in table.columns - ] - ) - tables.append(table) - combined_table = ibis_api.union(*tables) - ordering = TotalOrdering( - ordering_value_columns=tuple([ascending_over(ORDER_ID_COLUMN)]), - total_ordering_columns=frozenset([ex.deref(ORDER_ID_COLUMN)]), - string_encoding=StringEncoding(True, prefix_size + max_encoding_size), - ) - return compiled.OrderedIR( - combined_table, - columns=[ - combined_table[col] - for col in combined_table.columns - if col != ORDER_ID_COLUMN - ], - hidden_ordering_columns=[combined_table[ORDER_ID_COLUMN]], - ordering=ordering, - ) diff --git a/bigframes/core/compile/explode.py b/bigframes/core/compile/explode.py index 0dfc129810..59e3a13d02 100644 --- a/bigframes/core/compile/explode.py +++ b/bigframes/core/compile/explode.py @@ -20,9 +20,7 @@ import bigframes.core.compile.compiled as compiled import bigframes.core.expression as ex import bigframes.core.guid -import bigframes.core.identifiers as ids import bigframes.core.ordering -from bigframes.core.ordering import TotalOrdering def explode_unordered( @@ -73,79 +71,3 @@ def explode_unordered( table_w_unnest, columns=columns, # type: ignore ) - - -def explode_ordered( - input: compiled.OrderedIR, - columns: typing.Sequence[ex.DerefOp], - offsets_id: typing.Optional[str], -) -> compiled.OrderedIR: - if 
input.order_non_deterministic: - id = bigframes.core.guid.generate_guid() - return input.promote_offsets(id) - table = input._to_ibis_expr(ordering_mode="unordered", expose_hidden_cols=True) - column_ids = tuple(ref.id.sql for ref in columns) - - offset_array_id = bigframes.core.guid.generate_guid("offset_array_") - offset_array = bigframes_vendored.ibis.range( - 0, - bigframes_vendored.ibis.greatest( - 1, # We always want at least 1 element to fill in NULLs for empty arrays. - bigframes_vendored.ibis.least( - *[table[column_id].length() for column_id in column_ids] - ), - ), - 1, - ).name(offset_array_id) - table_w_offset_array = table.select( - offset_array, - *input._column_names, - *input._hidden_ordering_column_names, - ) - - unnest_offset_id = offsets_id or bigframes.core.guid.generate_guid("unnest_offset_") - unnest_offset = ( - table_w_offset_array[offset_array_id].unnest().name(unnest_offset_id) - ) - table_w_offset = table_w_offset_array.select( - unnest_offset, - *input._column_names, - *input._hidden_ordering_column_names, - ) - - unnested_columns = [ - table_w_offset[column_id][table_w_offset[unnest_offset_id]].name(column_id) - if column_id in column_ids - else table_w_offset[column_id] - for column_id in input._column_names - ] - - table_w_unnest = table_w_offset.select( - table_w_offset[unnest_offset_id], - *unnested_columns, - *input._hidden_ordering_column_names, - ) - - output_cols = tuple(input.column_ids) + ((offsets_id,) if offsets_id else ()) - columns = [table_w_unnest[column_name] for column_name in output_cols] - hidden_ordering_columns = [ - table_w_unnest[column_name] - for column_name in input._hidden_ordering_column_names - ] - if offsets_id is None: - hidden_ordering_columns.append(table_w_unnest[unnest_offset_id]) - l_mappings = {id: id for id in input._ordering.referenced_columns} - r_mappings = {ids.ColumnId(unnest_offset_id): ids.ColumnId(unnest_offset_id)} - ordering = bigframes.core.ordering.join_orderings( - input._ordering, - TotalOrdering.from_offset_col(unnest_offset_id), - l_mappings, - r_mappings, - ) - - return compiled.OrderedIR( - table_w_unnest, - columns=columns, # type: ignore - hidden_ordering_columns=hidden_ordering_columns, - ordering=ordering, - ) diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index b903f9b552..9216051d91 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -23,110 +23,6 @@ import bigframes_vendored.ibis.expr.types as ibis_types import bigframes.core.compile.compiled as compiled -import bigframes.core.guid as guids -import bigframes.core.identifiers as ids -import bigframes.core.ordering as orderings - - -def join_by_column_ordered( - left: compiled.OrderedIR, - right: compiled.OrderedIR, - conditions: Tuple[Tuple[str, str], ...], - type: Literal["inner", "outer", "left", "right", "cross"], -) -> compiled.OrderedIR: - """Join two expressions by column equality. - - Arguments: - left: Expression for left table to join. - left_column_ids: Column IDs (not label) to join by. - right: Expression for right table to join. - right_column_ids: Column IDs (not label) to join by. - how: The type of join to perform. - allow_row_identity_join (bool): - If True, allow matching by row identity. Set to False to always - perform a true JOIN in generated SQL. - Returns: - The joined expression. The resulting columns will be, in order, - first the coalesced join keys, then, all the left columns, and - finally, all the right columns. 
- """ - if type == "right": - if left.order_non_deterministic: - right = right._bake_ordering() - else: - if left.order_non_deterministic: - left = left._bake_ordering() - - # Do not reset the generator - l_value_mapping = dict(zip(left.column_ids, left.column_ids)) - r_value_mapping = dict(zip(right.column_ids, right.column_ids)) - - # hidden columns aren't necessarily unique, so need to remap to guids - l_hidden_mapping = { - id: guids.generate_guid("hidden_") for id in left._hidden_column_ids - } - r_hidden_mapping = { - id: guids.generate_guid("hidden_") for id in right._hidden_column_ids - } - - l_mapping = {**l_value_mapping, **l_hidden_mapping} - r_mapping = {**r_value_mapping, **r_hidden_mapping} - - left_table = left._to_ibis_expr( - ordering_mode="unordered", - expose_hidden_cols=True, - ) - left_table = left_table.rename({val: key for key, val in l_hidden_mapping.items()}) - right_table = right._to_ibis_expr( - ordering_mode="unordered", - expose_hidden_cols=True, - ) - right_table = right_table.rename( - {val: key for key, val in r_hidden_mapping.items()} - ) - join_conditions = [ - value_to_join_key(left_table[l_mapping[left_index]]) - == value_to_join_key(right_table[r_mapping[right_index]]) - for left_index, right_index in conditions - ] - - combined_table = ibis_api.join( - left_table, - right_table, - predicates=join_conditions, - how=type, # type: ignore - ) - - # Preserve ordering accross joins. - ordering = orderings.join_orderings( - left._ordering, - right._ordering, - {ids.ColumnId(lin): ids.ColumnId(lout) for lin, lout in l_mapping.items()}, - {ids.ColumnId(rin): ids.ColumnId(rout) for rin, rout in r_mapping.items()}, - left_order_dominates=(type != "right"), - ) - - # We could filter out the original join columns, but predicates/ordering - # might still reference them in implicit joins. - columns = [combined_table[l_mapping[col.get_name()]] for col in left.columns] + [ - combined_table[r_mapping[col.get_name()]] for col in right.columns - ] - hidden_ordering_columns = [ - *[ - combined_table[l_hidden_mapping[col.get_name()]] - for col in left._hidden_ordering_columns - ], - *[ - combined_table[r_hidden_mapping[col.get_name()]] - for col in right._hidden_ordering_columns - ], - ] - return compiled.OrderedIR( - combined_table, - columns=columns, - hidden_ordering_columns=hidden_ordering_columns, - ordering=ordering, - ) def join_by_column_unordered( @@ -167,8 +63,6 @@ def join_by_column_unordered( predicates=join_conditions, how=type, # type: ignore ) - # We could filter out the original join columns, but predicates/ordering - # might still reference them in implicit joins. columns = [combined_table[col.get_name()] for col in left.columns] + [ combined_table[col.get_name()] for col in right.columns ] diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 5fb5fb14d2..5f9fcb257e 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -683,10 +683,12 @@ def cummin(self, *args, **kwargs) -> series.Series: @validations.requires_ordering() def cumcount(self, *args, **kwargs) -> series.Series: + # TODO: Add nullary op support to implement more cleanly return ( self._apply_window_op( - agg_ops.rank_op, + agg_ops.SizeUnaryOp(), discard_name=True, + never_skip_nulls=True, ) - 1 ) @@ -758,6 +760,7 @@ def _apply_window_op( op: agg_ops.WindowOp, discard_name=False, window: typing.Optional[core.WindowSpec] = None, + never_skip_nulls: bool = False, ): """Apply window op to groupby. 
Defaults to grouped cumulative window.""" window_spec = window or window_specs.cumulative_rows( @@ -770,6 +773,7 @@ def _apply_window_op( op, result_label=label, window_spec=window_spec, + never_skip_nulls=never_skip_nulls, ) return series.Series(block.select_column(result_id)) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index d5083c3737..c800525b33 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -890,6 +890,32 @@ def remap_vars( def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): return self + def with_order_cols(self): + # Maybe the ordering should be required to always be in the scan list, and then we won't need this? + if self.source.ordering is None: + return self, orderings.RowOrdering() + + order_cols = {col.sql for col in self.source.ordering.referenced_columns} + scan_cols = {col.source_id for col in self.scan_list.items} + new_scan_cols = [ + ScanItem( + bigframes.core.ids.ColumnId.unique(), + dtype=bigframes.dtypes.convert_schema_field(field)[1], + source_id=field.name, + ) + for field in self.source.table.physical_schema + if (field.name in order_cols) and (field.name not in scan_cols) + ] + new_scan_list = ScanList(items=(*self.scan_list.items, *new_scan_cols)) + new_order = self.source.ordering.remap_column_refs( + { + bigframes.core.ids.ColumnId(item.source_id): item.id + for item in new_scan_cols + }, + allow_partial_bindings=True, + ) + return dataclasses.replace(self, scan_list=new_scan_list), new_order + @dataclasses.dataclass(frozen=True, eq=False) class CachedTableNode(ReadTableNode): @@ -1113,6 +1139,9 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.input_output_pairs) + def get_id_mapping(self) -> dict[bfet_ids.ColumnId, bfet_ids.ColumnId]: + return {ref.id: out_id for ref, out_id in self.input_output_pairs} + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: pruned_selections = ( tuple( @@ -1260,6 +1289,7 @@ class AggregateNode(UnaryNode): typing.Tuple[ex.Aggregation, bigframes.core.identifiers.ColumnId], ... ] by_column_ids: typing.Tuple[ex.DerefOp, ...] = tuple([]) + order_by: Tuple[OrderingExpression, ...] 
= () dropna: bool = True @property @@ -1308,6 +1338,12 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.aggregations) + @property + def has_ordered_ops(self) -> bool: + return not all( + aggregate.op.order_independent for aggregate, _ in self.aggregations + ) + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: by_ids = (ref.id for ref in self.by_column_ids) pruned_aggs = ( @@ -1319,7 +1355,9 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: ) consumed_ids = frozenset(itertools.chain(by_ids, agg_inputs)) pruned_child = self.child.prune(consumed_ids) - return AggregateNode(pruned_child, pruned_aggs, self.by_column_ids, self.dropna) + return AggregateNode( + pruned_child, pruned_aggs, self.by_column_ids, dropna=self.dropna + ) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] @@ -1333,8 +1371,9 @@ def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): for agg, id in self.aggregations ) new_by_ids = tuple(id.remap_column_refs(mappings) for id in self.by_column_ids) + new_order_by = tuple(part.remap_column_refs(mappings) for part in self.order_by) return dataclasses.replace( - self, by_column_ids=new_by_ids, aggregations=new_aggs + self, by_column_ids=new_by_ids, aggregations=new_aggs, order_by=new_order_by ) @@ -1348,6 +1387,10 @@ class WindowOpNode(UnaryNode): def _validate(self): """Validate the local data in the node.""" + # Since inner order and row bounds are coupled, rank ops can't be row bounded + assert ( + not self.window_spec.row_bounded + ) or self.expression.op.implicitly_inherits_order assert all(ref in self.child.ids for ref in self.expression.column_references) @property @@ -1387,6 +1430,14 @@ def added_field(self) -> Field: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.output_name,) + @property + def inherits_order(self) -> bool: + # does the op both use ordering at all? and if so, can it inherit order? 
+ op_inherits_order = ( + not self.expression.op.order_independent + ) and self.expression.op.implicitly_inherits_order + return op_inherits_order or self.window_spec.row_bounded + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: if self.output_name not in used_cols: return self.child.prune(used_cols) diff --git a/bigframes/core/ordering.py b/bigframes/core/ordering.py index 357cc8145c..2fc7573b21 100644 --- a/bigframes/core/ordering.py +++ b/bigframes/core/ordering.py @@ -16,21 +16,12 @@ from dataclasses import dataclass, field from enum import Enum -import math import typing -from typing import Mapping, Optional, Sequence, Set - -import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes -import bigframes_vendored.ibis.expr.types as ibis_types +from typing import Mapping, Optional, Sequence, Set, Union import bigframes.core.expression as expression import bigframes.core.identifiers as ids -# TODO(tbergeron): Encode more efficiently -ORDERING_ID_STRING_BASE: int = 10 -# Sufficient to store any value up to 2^63 -DEFAULT_ORDERING_ID_LENGTH: int = math.ceil(63 * math.log(2, ORDERING_ID_STRING_BASE)) - class OrderingDirection(Enum): ASC = 1 @@ -93,16 +84,6 @@ def with_reverse(self) -> OrderingExpression: # Encoding classes specify additional properties for some ordering representations -@dataclass(frozen=True) -class StringEncoding: - """String encoded order ids are fixed length and can be concat together in joins.""" - - is_encoded: bool = False - # Encoding size must be tracked in order to know what how to combine ordering ids across tables (eg how much to pad when combining different length). - # Also will be needed to determine when length is too large and need to compact ordering id with a ROW_NUMBER operation. - length: int = DEFAULT_ORDERING_ID_LENGTH - - @dataclass(frozen=True) class IntegerEncoding: """Integer encoded order ids are guaranteed non-negative.""" @@ -117,7 +98,6 @@ class RowOrdering: ordering_value_columns: typing.Tuple[OrderingExpression, ...] = () integer_encoding: IntegerEncoding = IntegerEncoding(False) - string_encoding: StringEncoding = StringEncoding(False) @property def all_ordering_columns(self) -> Sequence[OrderingExpression]: @@ -131,11 +111,6 @@ def referenced_columns(self) -> Set[ids.ColumnId]: for col in part.referenced_columns ) - @property - def is_string_encoded(self) -> bool: - """True if ordering is fully defined by a fixed length string column.""" - return self.string_encoding.is_encoded - @property def is_sequential(self) -> bool: return self.integer_encoding.is_encoded and self.integer_encoding.is_sequential @@ -207,6 +182,13 @@ def with_ordering_columns( new_ordering, ) + def join( + self, + other: RowOrdering, + ) -> RowOrdering: + joined_refs = [*self.all_ordering_columns, *other.all_ordering_columns] + return RowOrdering(tuple(joined_refs)) + def _truncate_ordering( self, order_refs: tuple[OrderingExpression, ...] 
) -> tuple[OrderingExpression, ...]: @@ -239,19 +221,20 @@ def __post_init__(self): ) @classmethod - def from_offset_col(cls, col: str) -> TotalOrdering: + def from_offset_col(cls, col: Union[ids.ColumnId, str]) -> TotalOrdering: + col_id = ids.ColumnId(col) if isinstance(col, str) else col return TotalOrdering( (ascending_over(col),), integer_encoding=IntegerEncoding(True, is_sequential=True), - total_ordering_columns=frozenset({expression.deref(col)}), + total_ordering_columns=frozenset({expression.DerefOp(col_id)}), ) @classmethod - def from_primary_key(cls, primary_key: Sequence[str]) -> TotalOrdering: + def from_primary_key(cls, primary_key: Sequence[ids.ColumnId]) -> TotalOrdering: return TotalOrdering( tuple(ascending_over(col) for col in primary_key), total_ordering_columns=frozenset( - {expression.deref(col) for col in primary_key} + {expression.DerefOp(col) for col in primary_key} ), ) @@ -342,10 +325,38 @@ def remap_column_refs( return TotalOrdering( tuple(new_value_columns), integer_encoding=self.integer_encoding, - string_encoding=self.string_encoding, total_ordering_columns=new_total_order, ) + @typing.overload + def join( + self, + other: TotalOrdering, + ) -> TotalOrdering: + ... + + @typing.overload + def join( + self, + other: RowOrdering, + ) -> RowOrdering: + ... + + def join( + self, + other: RowOrdering, + ) -> RowOrdering: + joined_refs = [*self.all_ordering_columns, *other.all_ordering_columns] + if isinstance(other, TotalOrdering): + left_total_order_cols = frozenset(self.total_ordering_columns) + right_total_order_cols = frozenset(other.total_ordering_columns) + return TotalOrdering( + ordering_value_columns=tuple(joined_refs), + total_ordering_columns=left_total_order_cols | right_total_order_cols, + ) + else: + return RowOrdering(tuple(joined_refs)) + @property def total_order_col(self) -> Optional[OrderingExpression]: """Returns column id of columns that defines total ordering, if such as column exists""" @@ -357,93 +368,18 @@ def total_order_col(self) -> Optional[OrderingExpression]: return order_ref -def encode_order_string( - order_id: ibis_types.IntegerColumn, length: int = DEFAULT_ORDERING_ID_LENGTH -) -> ibis_types.StringColumn: - """Converts an order id value to string if it is not already a string. MUST produced fixed-length strings.""" - # This is very inefficient encoding base-10 string uses only 10 characters per byte(out of 256 bit combinations) - # Furthermore, if know tighter bounds on order id are known, can produce smaller strings. - # 19 characters chosen as it can represent any positive Int64 in base-10 - # For missing values, ":" * 19 is used as it is larger than any other value this function produces, so null values will be last. 
- string_order_id = typing.cast( - ibis_types.StringValue, - order_id.cast(ibis_dtypes.string), - ).lpad(length, "0") - return typing.cast(ibis_types.StringColumn, string_order_id) - - -def reencode_order_string( - order_id: ibis_types.StringColumn, length: int -) -> ibis_types.StringColumn: - return typing.cast( - ibis_types.StringColumn, - (typing.cast(ibis_types.StringValue, order_id).lpad(length, "0")), - ) - - # Convenience functions -def ascending_over(id: str, nulls_last: bool = True) -> OrderingExpression: - return OrderingExpression(expression.deref(id), na_last=nulls_last) +def ascending_over( + id: Union[ids.ColumnId, str], nulls_last: bool = True +) -> OrderingExpression: + col_id = ids.ColumnId(id) if isinstance(id, str) else id + return OrderingExpression(expression.DerefOp(col_id), na_last=nulls_last) -def descending_over(id: str, nulls_last: bool = True) -> OrderingExpression: +def descending_over( + id: Union[ids.ColumnId, str], nulls_last: bool = True +) -> OrderingExpression: + col_id = ids.ColumnId(id) if isinstance(id, str) else id return OrderingExpression( - expression.deref(id), direction=OrderingDirection.DESC, na_last=nulls_last + expression.DerefOp(col_id), direction=OrderingDirection.DESC, na_last=nulls_last ) - - -@typing.overload -def join_orderings( - left: TotalOrdering, - right: TotalOrdering, - left_id_mapping: Mapping[ids.ColumnId, ids.ColumnId], - right_id_mapping: Mapping[ids.ColumnId, ids.ColumnId], - left_order_dominates: bool = True, -) -> TotalOrdering: - ... - - -@typing.overload -def join_orderings( - left: RowOrdering, - right: RowOrdering, - left_id_mapping: Mapping[ids.ColumnId, ids.ColumnId], - right_id_mapping: Mapping[ids.ColumnId, ids.ColumnId], - left_order_dominates: bool = True, -) -> RowOrdering: - ... 
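The module-level `join_orderings` helper removed below is superseded by the `join` methods added to `RowOrdering`/`TotalOrdering` earlier in this patch. A rough usage sketch, assuming the names land exactly as shown in this diff (column names invented):

    from bigframes.core.ordering import TotalOrdering

    # Per the new overloads, joining two total orderings yields a TotalOrdering
    # whose key set is the union of both sides; unlike the old free function,
    # callers are expected to remap column references before joining.
    left = TotalOrdering.from_offset_col("left_offsets")   # str accepted by the new signature
    right = TotalOrdering.from_offset_col("right_offsets")
    joined = left.join(right)
    assert isinstance(joined, TotalOrdering)
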
-
-
-def join_orderings(
-    left: RowOrdering,
-    right: RowOrdering,
-    left_id_mapping: Mapping[ids.ColumnId, ids.ColumnId],
-    right_id_mapping: Mapping[ids.ColumnId, ids.ColumnId],
-    left_order_dominates: bool = True,
-) -> RowOrdering:
-    left_ordering_refs = [
-        ref.remap_column_refs(left_id_mapping) for ref in left.all_ordering_columns
-    ]
-    right_ordering_refs = [
-        ref.remap_column_refs(right_id_mapping) for ref in right.all_ordering_columns
-    ]
-    if left_order_dominates:
-        joined_refs = [*left_ordering_refs, *right_ordering_refs]
-    else:
-        joined_refs = [*right_ordering_refs, *left_ordering_refs]
-
-    if isinstance(left, TotalOrdering) and isinstance(right, TotalOrdering):
-        left_total_order_cols = frozenset(
-            [left_id_mapping[ref.id] for ref in left.total_ordering_columns]
-        )
-        right_total_order_cols = frozenset(
-            [right_id_mapping[ref.id] for ref in right.total_ordering_columns]
-        )
-        return TotalOrdering(
-            ordering_value_columns=tuple(joined_refs),
-            total_ordering_columns=frozenset(
-                map(expression.DerefOp, left_total_order_cols | right_total_order_cols)
-            ),
-        )
-    else:
-        return RowOrdering(tuple(joined_refs))
diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py
index f5275239d9..9044cb25f9 100644
--- a/bigframes/core/rewrite/__init__.py
+++ b/bigframes/core/rewrite/__init__.py
@@ -15,6 +15,7 @@
 from bigframes.core.rewrite.identifiers import remap_variables
 from bigframes.core.rewrite.implicit_align import try_row_join
 from bigframes.core.rewrite.legacy_align import legacy_join_as_projection
+from bigframes.core.rewrite.order import pull_up_order
 from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice
 
 __all__ = [
@@ -23,4 +24,5 @@
     "rewrite_slice",
     "pullup_limit_from_slice",
     "remap_variables",
+    "pull_up_order",
 ]
diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py
new file mode 100644
index 0000000000..08593b7a5f
--- /dev/null
+++ b/bigframes/core/rewrite/order.py
@@ -0,0 +1,427 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import dataclasses
+import functools
+from typing import Mapping, Tuple
+
+import bigframes.core.expression
+import bigframes.core.identifiers
+import bigframes.core.nodes
+import bigframes.core.ordering
+import bigframes.core.window_spec
+import bigframes.operations
+
+
+# Makes ordering explicit in window definitions
+def pull_up_order(
+    root: bigframes.core.nodes.BigFrameNode,
+    *,
+    order_root: bool = True,
+    ordered_joins: bool = True,
+) -> Tuple[bigframes.core.nodes.BigFrameNode, bigframes.core.ordering.RowOrdering]:
+    """
+    Pull the ordering up, putting full order definition into window ops.
+
+    May create extra columns, which must be removed by callers if they want to preserve original schema.
+
+    Requires the following nodes to be removed/rewritten: SliceNode
+
+    """
+
+    @functools.cache
+    def pull_up_order_inner(
+        node: bigframes.core.nodes.BigFrameNode,
+    ) -> Tuple[bigframes.core.nodes.BigFrameNode, bigframes.core.ordering.RowOrdering]:
+        """Pull the ordering up through a tree section, returning the rewritten node and its row ordering."""
+        if isinstance(node, bigframes.core.nodes.ReversedNode):
+            child_result, child_order = pull_up_order_inner(node.child)
+            return child_result, child_order.with_reverse()
+        elif isinstance(node, bigframes.core.nodes.OrderByNode):
+            if node.is_total_order:
+                new_node = remove_order(node.child)
+            else:
+                new_node, child_order = pull_up_order_inner(node.child)
+
+            new_by = []
+            ids: list[bigframes.core.ids.ColumnId] = []
+            for part in node.by:
+                if not isinstance(
+                    part.scalar_expression, bigframes.core.expression.DerefOp
+                ):
+                    id = bigframes.core.ids.ColumnId.unique()
+                    new_node = bigframes.core.nodes.ProjectionNode(
+                        new_node, ((part.scalar_expression, id),)
+                    )
+                    new_part = bigframes.core.ordering.OrderingExpression(
+                        bigframes.core.expression.DerefOp(id),
+                        part.direction,
+                        part.na_last,
+                    )
+                    new_by.append(new_part)
+                    ids.append(id)
+                else:
+                    new_by.append(part)
+                    ids.append(part.scalar_expression.id)
+
+            if node.is_total_order:
+                new_order: bigframes.core.ordering.RowOrdering = (
+                    bigframes.core.ordering.TotalOrdering(
+                        ordering_value_columns=tuple(new_by),
+                        total_ordering_columns=frozenset(
+                            map(lambda x: bigframes.core.expression.DerefOp(x), ids)
+                        ),
+                    )
+                )
+            else:
+                assert child_order
+                new_order = child_order.with_ordering_columns(new_by)
+            return new_node, new_order
+        elif isinstance(node, bigframes.core.nodes.ProjectionNode):
+            child_result, child_order = pull_up_order_inner(node.child)
+            return node.replace_child(child_result), child_order
+        elif isinstance(node, bigframes.core.nodes.JoinNode):
+            if ordered_joins:
+                return pull_order_join(node)
+            else:
+                return (
+                    dataclasses.replace(
+                        node,
+                        left_child=remove_order_strict(node.left_child),
+                        right_child=remove_order_strict(node.right_child),
+                    ),
+                    bigframes.core.ordering.RowOrdering(),
+                )
+        elif isinstance(node, bigframes.core.nodes.ConcatNode):
+            return pull_order_concat(node)
+        elif isinstance(node, bigframes.core.nodes.FromRangeNode):
+            new_start = remove_order_strict(node.start)
+            new_end = remove_order_strict(node.end)
+
+            new_node = dataclasses.replace(node, start=new_start, end=new_end)
+            return new_node, bigframes.core.ordering.TotalOrdering.from_primary_key(
+                [node.output_id]
+            )
+        elif isinstance(node, bigframes.core.nodes.ReadLocalNode):
+            if node.offsets_col is None:
+                offsets_id = bigframes.core.ids.ColumnId.unique()
+                new_root = dataclasses.replace(node, offsets_col=offsets_id)
+                return new_root, bigframes.core.ordering.TotalOrdering.from_offset_col(
+                    offsets_id
+                )
+            else:
+                return node, bigframes.core.ordering.TotalOrdering.from_offset_col(
+                    node.offsets_col
+                )
+        elif isinstance(node, bigframes.core.nodes.ReadTableNode):
+            if node.source.ordering is not None:
+                return node.with_order_cols()
+            else:
+                # No defined ordering
+                return node, bigframes.core.ordering.RowOrdering()
+        elif isinstance(node, bigframes.core.nodes.PromoteOffsetsNode):
+            child_result, child_order = pull_up_order_inner(node.child)
+            if child_order.is_total_ordering and child_order.is_sequential:
+                # special case, we can just project the ordering
+                order_expression = child_order.total_order_col
+                assert order_expression is not None
+                order_expression.scalar_expression
+                new_node = bigframes.core.nodes.ProjectionNode(
+                    child_result,
((order_expression.scalar_expression, node.col_id),) + ) + return new_node, bigframes.core.ordering.TotalOrdering.from_offset_col( + node.col_id + ) + else: + # Otherwise we need to generate offsets + agg = bigframes.core.expression.NullaryAggregation( + bigframes.core.agg_ops.RowNumberOp() + ) + window_spec = bigframes.core.window_spec.unbound( + ordering=tuple(child_order.all_ordering_columns) + ) + new_offsets_node = bigframes.core.nodes.WindowOpNode( + child_result, agg, window_spec, node.col_id + ) + return ( + new_offsets_node, + bigframes.core.ordering.TotalOrdering.from_offset_col(node.col_id), + ) + elif isinstance(node, bigframes.core.nodes.FilterNode): + child_result, child_order = pull_up_order_inner(node.child) + return node.replace_child(child_result), child_order.with_non_sequential() + elif isinstance(node, bigframes.core.nodes.SelectionNode): + child_result, child_order = pull_up_order_inner(node.child) + selected_ids = set(ref.id for ref, _ in node.input_output_pairs) + unselected_order_cols = tuple( + col for col in child_order.referenced_columns if col not in selected_ids + ) + # Create unique ids just to be safe + new_selections = { + col: bigframes.core.ids.ColumnId.unique() + for col in unselected_order_cols + } + all_selections = ( + *node.input_output_pairs, + *( + (bigframes.core.expression.DerefOp(k), v) + for k, v in new_selections.items() + ), + ) + + new_select_node = dataclasses.replace( + node, child=child_result, input_output_pairs=all_selections + ) + new_order = child_order.remap_column_refs(new_select_node.get_id_mapping()) + return new_select_node, new_order + elif isinstance(node, bigframes.core.nodes.RowCountNode): + child_result = remove_order(node.child) + return node.replace_child( + child_result + ), bigframes.core.ordering.TotalOrdering.from_primary_key([node.col_id]) + elif isinstance(node, bigframes.core.nodes.AggregateNode): + if node.has_ordered_ops: + child_result, child_order = pull_up_order_inner(node.child) + new_order_by = child_order.with_ordering_columns(node.order_by) + new_order = bigframes.core.ordering.TotalOrdering.from_primary_key( + [ref.id for ref in node.by_column_ids] + ) + return ( + dataclasses.replace( + node, + child=child_result, + order_by=tuple(new_order_by.all_ordering_columns), + ), + new_order, + ) + else: + child_result = remove_order(node.child) + return node.replace_child( + child_result + ), bigframes.core.ordering.TotalOrdering.from_primary_key( + [ref.id for ref in node.by_column_ids] + ) + elif isinstance(node, bigframes.core.nodes.WindowOpNode): + child_result, child_order = pull_up_order_inner(node.child) + if node.inherits_order: + new_window_order = ( + *node.window_spec.ordering, + *child_order.all_ordering_columns, + ) + new_window_spec = dataclasses.replace( + node.window_spec, ordering=new_window_order + ) + else: + new_window_spec = node.window_spec + return ( + dataclasses.replace( + node, child=child_result, window_spec=new_window_spec + ), + child_order, + ) + elif isinstance(node, bigframes.core.nodes.RandomSampleNode): + child_result, child_order = pull_up_order_inner(node.child) + return node.replace_child(child_result), child_order.with_non_sequential() + elif isinstance(node, bigframes.core.nodes.ExplodeNode): + child_result, child_order = pull_up_order_inner(node.child) + if node.offsets_col is None: + offsets_id = bigframes.core.ids.ColumnId.unique() + new_explode: bigframes.core.nodes.BigFrameNode = dataclasses.replace( + node, child=child_result, offsets_col=offsets_id + ) + else: + 
offsets_id = node.offsets_col
+                new_explode = node.replace_child(child_result)
+            inner_order = bigframes.core.ordering.TotalOrdering.from_offset_col(
+                offsets_id
+            )
+            return new_explode, child_order.join(inner_order)
+        raise ValueError(f"Unexpected node: {node}")
+
+    def pull_order_concat(
+        node: bigframes.core.nodes.ConcatNode,
+    ) -> Tuple[
+        bigframes.core.nodes.BigFrameNode, bigframes.core.ordering.TotalOrdering
+    ]:
+        new_sources = []
+        for i, source in enumerate(node.child_nodes):
+            new_source, order = pull_up_order_inner(source)
+            offsets_id = bigframes.core.ids.ColumnId.unique()
+            table_id = bigframes.core.ids.ColumnId.unique()
+            if order.is_total_ordering and order.integer_encoding.is_encoded:
+                order_expression = order.total_order_col
+                assert order_expression is not None
+                new_source = bigframes.core.nodes.ProjectionNode(
+                    new_source, ((order_expression.scalar_expression, offsets_id),)
+                )
+            else:
+                agg = bigframes.core.expression.NullaryAggregation(
+                    bigframes.core.agg_ops.RowNumberOp()
+                )
+                window_spec = bigframes.core.window_spec.unbound(
+                    ordering=tuple(order.all_ordering_columns)
+                )
+                new_source = bigframes.core.nodes.WindowOpNode(
+                    new_source, agg, window_spec, offsets_id
+                )
+            new_source = bigframes.core.nodes.ProjectionNode(
+                new_source, ((bigframes.core.expression.const(i), table_id),)
+            )
+            selection = tuple(
+                (
+                    (bigframes.core.expression.DerefOp(id), id)
+                    for id in (*source.ids, table_id, offsets_id)
+                )
+            )
+            new_source = bigframes.core.nodes.SelectionNode(new_source, selection)
+            new_sources.append(new_source)
+
+        union_offsets_id = bigframes.core.ids.ColumnId.unique()
+        union_table_id = bigframes.core.ids.ColumnId.unique()
+        new_ids = (*node.output_ids, union_table_id, union_offsets_id)
+        new_node = dataclasses.replace(
+            node, children=tuple(new_sources), output_ids=new_ids
+        )
+        new_ordering = bigframes.core.ordering.TotalOrdering.from_primary_key(
+            (union_table_id, union_offsets_id)
+        )
+        return new_node, new_ordering
+
+    def pull_order_join(
+        node: bigframes.core.nodes.JoinNode,
+    ) -> Tuple[bigframes.core.nodes.BigFrameNode, bigframes.core.ordering.RowOrdering]:
+        left_child, left_order = pull_up_order_inner(node.left_child)
+        # as tree is a dag, and pull_up_order_inner is memoized, self-joins can create conflicts in new columns
+        right_child, right_order = pull_up_order_inner(node.right_child)
+        conflicts = set(left_child.ids) & set(right_child.ids)
+        if conflicts:
+            right_child, mapping = rename_cols(right_child, conflicts)
+            right_order = right_order.remap_column_refs(
+                mapping, allow_partial_bindings=True
+            )
+
+        if node.type in ("right", "outer"):
+            # left side is nullable
+            left_indicator = bigframes.core.ids.ColumnId.unique()
+            left_child = bigframes.core.nodes.ProjectionNode(
+                left_child, ((bigframes.core.expression.const(True), left_indicator),)
+            )
+            left_order = left_order.with_ordering_columns(
+                [bigframes.core.ordering.descending_over(left_indicator)]
+            )
+        if node.type in ("left", "outer"):
+            # right side is nullable
+            right_indicator = bigframes.core.ids.ColumnId.unique()
+            right_child = bigframes.core.nodes.ProjectionNode(
+                right_child, ((bigframes.core.expression.const(True), right_indicator),)
+            )
+            right_order = right_order.with_ordering_columns(
+                [bigframes.core.ordering.descending_over(right_indicator)]
+            )
+
+        new_join = dataclasses.replace(
+            node, left_child=left_child, right_child=right_child
+        )
+        new_order = (
+            left_order.join(right_order)
+            if (node.type != "right")
+            else right_order.join(left_order)
+        )
return new_join, new_order + + @functools.cache + def remove_order( + node: bigframes.core.nodes.BigFrameNode, + ) -> bigframes.core.nodes.BigFrameNode: + if isinstance( + node, (bigframes.core.nodes.OrderByNode, bigframes.core.nodes.ReversedNode) + ): + return remove_order(node.child) + elif isinstance( + node, + ( + bigframes.core.nodes.WindowOpNode, + bigframes.core.nodes.PromoteOffsetsNode, + ), + ): + if isinstance(node, bigframes.core.nodes.PromoteOffsetsNode): + node = rewrite_promote_offsets(node) + if node.inherits_order: + child_result, child_order = pull_up_order_inner(node.child) + new_window_order = ( + *node.window_spec.ordering, + *child_order.all_ordering_columns, + ) + new_window_spec = dataclasses.replace( + node.window_spec, ordering=new_window_order + ) + return dataclasses.replace( + node, child=child_result, window_spec=new_window_spec + ) + elif isinstance(node, bigframes.core.nodes.AggregateNode): + if node.has_ordered_ops: + child_result, child_order = pull_up_order_inner(node.child) + new_order_by = child_order.with_ordering_columns(node.order_by) + return dataclasses.replace( + node, + child=child_result, + order_by=tuple(new_order_by.all_ordering_columns), + ) + + return node.transform_children(remove_order) + + def remove_order_strict( + node: bigframes.core.nodes.BigFrameNode, + ) -> bigframes.core.nodes.BigFrameNode: + result = remove_order(node) + if result.ids != node.ids: + return bigframes.core.nodes.SelectionNode( + result, + tuple((bigframes.core.expression.DerefOp(id), id) for id in node.ids), + ) + return result + + return ( + pull_up_order_inner(root) + if order_root + else (remove_order(root), bigframes.core.ordering.RowOrdering()) + ) + + +def rewrite_promote_offsets( + node: bigframes.core.nodes.PromoteOffsetsNode, +) -> bigframes.core.nodes.WindowOpNode: + agg = bigframes.core.expression.NullaryAggregation( + bigframes.core.agg_ops.RowNumberOp() + ) + window_spec = bigframes.core.window_spec.unbound() + return bigframes.core.nodes.WindowOpNode(node.child, agg, window_spec, node.col_id) + + +def rename_cols( + node: bigframes.core.nodes.BigFrameNode, cols: set[bigframes.core.ids.ColumnId] +) -> Tuple[ + bigframes.core.nodes.BigFrameNode, + Mapping[bigframes.core.ids.ColumnId, bigframes.core.ids.ColumnId], +]: + mappings = dict((id, bigframes.core.ids.ColumnId.unique()) for id in cols) + + result_node = bigframes.core.nodes.SelectionNode( + node, + tuple( + (bigframes.core.expression.DerefOp(id), mappings.get(id, id)) + for id in node.ids + ), + ) + + return result_node, dict(mappings) diff --git a/bigframes/core/window_spec.py b/bigframes/core/window_spec.py index d8098f18f7..b4a3d35471 100644 --- a/bigframes/core/window_spec.py +++ b/bigframes/core/window_spec.py @@ -13,7 +13,7 @@ # limitations under the License. 
from __future__ import annotations
 
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
 import itertools
 from typing import Mapping, Optional, Set, Tuple, Union
 
@@ -181,6 +181,12 @@ def all_referenced_columns(self) -> Set[ids.ColumnId]:
         )
         return set(itertools.chain((i.id for i in self.grouping_keys), ordering_vars))
 
+    def without_order(self) -> WindowSpec:
+        """Removes ordering clause if ordering isn't required to define bounds."""
+        if self.row_bounded:
+            raise ValueError("Cannot remove order from row-bounded window")
+        return replace(self, ordering=())
+
     def remap_column_refs(
         self,
         mapping: Mapping[ids.ColumnId, ids.ColumnId],
diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py
index 365b664ee0..5f32cb980a 100644
--- a/bigframes/operations/aggregations.py
+++ b/bigframes/operations/aggregations.py
@@ -34,19 +34,22 @@ def skips_nulls(self):
         return True
 
     @property
-    def uses_total_row_ordering(self):
-        """Whether the operator needs total row ordering. (eg. lead, lag, array_agg)"""
-        return False
+    def implicitly_inherits_order(self):
+        """
+        Whether the operator implicitly inherits the underlying array order, should it exist.
 
-    @property
-    def can_order_by(self):
-        return False
+        Notably, rank operations do not want to inherit ordering. Even order-independent operations
+        may inherit order when needed for row bounds.
+        """
+        return True
 
     @property
     def order_independent(self):
         """
         True if the output of the operator does not depend on the ordering of input rows.
 
+        Aggregation functions are usually order independent, excepting ``array_agg`` and ``string_agg``.
+        Navigation functions are a notable exception: they are not order independent.
         """
         return False
 
@@ -89,12 +92,11 @@ def arguments(self) -> int:
 
     @property
     def order_independent(self):
-        """
-        True if results don't depend on the order of the input.
+        return True
 
-        Almost all aggregation functions are order independent, excepting ``array_agg`` and ``string_agg``.
-        """
-        return not self.can_order_by
+    @property
+    def uses_total_row_ordering(self):
+        return False
 
 
 @dataclasses.dataclass(frozen=True)
@@ -126,6 +128,15 @@ def output_type(self, *input_types: dtypes.ExpressionType):
         return dtypes.INT_DTYPE
 
 
+# TODO: Remove this temporary hack once nullary ops are better supported in APIs
+@dataclasses.dataclass(frozen=True)
+class SizeUnaryOp(UnaryAggregateOp):
+    name: ClassVar[str] = "size"
+
+    def output_type(self, *input_types: dtypes.ExpressionType):
+        return dtypes.INT_DTYPE
+
+
 @dataclasses.dataclass(frozen=True)
 class SumOp(UnaryAggregateOp):
     name: ClassVar[str] = "sum"
@@ -143,6 +154,10 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
 class MedianOp(UnaryAggregateOp):
     name: ClassVar[str] = "median"
 
+    @property
+    def order_independent(self) -> bool:
+        return True
+
     def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
         # These will change if median is changed to exact implementation.
if not dtypes.is_orderable(input_types[0]): @@ -161,6 +176,10 @@ class QuantileOp(UnaryAggregateOp): def name(self): return f"{int(self.q * 100)}%" + @property + def order_independent(self) -> bool: + return True + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return signatures.UNARY_REAL_NUMERIC.output_type(input_types[0]) @@ -288,8 +307,8 @@ class ArrayAggOp(UnaryAggregateOp): name: ClassVar[str] = "arrayagg" @property - def can_order_by(self): - return True + def order_independent(self): + return False @property def skips_nulls(self): @@ -335,7 +354,7 @@ def order_independent(self): @dataclasses.dataclass(frozen=True) -class QcutOp(UnaryWindowOp): +class QcutOp(UnaryWindowOp): # bucket op quantiles: typing.Union[int, typing.Tuple[float, ...]] @property @@ -392,6 +411,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return dtypes.INT_DTYPE +# TODO: Convert to NullaryWindowOp @dataclasses.dataclass(frozen=True) class RankOp(UnaryWindowOp): name: ClassVar[str] = "rank" @@ -401,15 +421,14 @@ def skips_nulls(self): return False def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - return signatures.FixedOutputType( - dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" - ).output_type(input_types[0]) + return dtypes.INT_DTYPE @property - def order_independent(self): - return True + def implicitly_inherits_order(self): + return False +# TODO: Convert to NullaryWindowOp @dataclasses.dataclass(frozen=True) class DenseRankOp(UnaryWindowOp): @property @@ -417,30 +436,20 @@ def skips_nulls(self): return False def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: - return signatures.FixedOutputType( - dtypes.is_orderable, dtypes.INT_DTYPE, "orderable" - ).output_type(input_types[0]) + return dtypes.INT_DTYPE @property - def order_independent(self): - return True + def implicitly_inherits_order(self): + return False @dataclasses.dataclass(frozen=True) class FirstOp(UnaryWindowOp): name: ClassVar[str] = "first" - @property - def uses_total_row_ordering(self): - return True - @dataclasses.dataclass(frozen=True) class FirstNonNullOp(UnaryWindowOp): - @property - def uses_total_row_ordering(self): - return True - @property def skips_nulls(self): return False @@ -450,17 +459,9 @@ def skips_nulls(self): class LastOp(UnaryWindowOp): name: ClassVar[str] = "last" - @property - def uses_total_row_ordering(self): - return True - @dataclasses.dataclass(frozen=True) class LastNonNullOp(UnaryWindowOp): - @property - def uses_total_row_ordering(self): - return True - @property def skips_nulls(self): return False @@ -470,10 +471,6 @@ def skips_nulls(self): class ShiftOp(UnaryWindowOp): periods: int - @property - def uses_total_row_ordering(self): - return True - @property def skips_nulls(self): return False @@ -483,10 +480,6 @@ def skips_nulls(self): class DiffOp(UnaryWindowOp): periods: int - @property - def uses_total_row_ordering(self): - return True - @property def skips_nulls(self): return False diff --git a/noxfile.py b/noxfile.py index 863c7b26d3..5fcf1d6cdc 100644 --- a/noxfile.py +++ b/noxfile.py @@ -172,7 +172,7 @@ def install_unittest_dependencies(session, install_test_extra, *constraints): if UNIT_TEST_EXTERNAL_DEPENDENCIES: msg = ( "'unit_test_external_dependencies' is deprecated. Instead, please " - "use 'unit_test_dependencies' or 'unit_test_local_dependencies'.", + "use 'unit_test_dependencies' or 'unit_test_local_dependencies'." 
) warnings.warn(msg, DeprecationWarning) session.install(*UNIT_TEST_EXTERNAL_DEPENDENCIES, *constraints) @@ -444,7 +444,7 @@ def cover(session): "report", "--include=bigframes/*", "--show-missing", - "--fail-under=86", + "--fail-under=85", ) # Make sure there is no dead code in our test directories. diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index aa1d6262d5..e950ddbc5a 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1854,6 +1854,7 @@ def test_groupby_window_ops(scalars_df_index, scalars_pandas_df_index, operator) pd_series = operator( scalars_pandas_df_index[col_name].groupby(scalars_pandas_df_index[group_key]) ).astype(bf_series.dtype) + pd.testing.assert_series_equal( pd_series, bf_series, diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py index 3793a09229..7d6cd6d2b4 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/bigquery/__init__.py @@ -1039,7 +1039,7 @@ def visit_InMemoryTable(self, op, *, name, schema, data): nested=True, ) array_values = [ - sge.Tuple( + sge.Struct( expressions=tuple( self.visit_Literal(None, value=value, dtype=type_) for value, type_ in zip(row, schema.types) From 2ba59e5d1f938b94226cf2aa949a6560b1183ecb Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 28 Jan 2025 13:57:49 -0800 Subject: [PATCH 07/38] chore: document permissions for experimental session.from_glob_path (#1332) --- bigframes/session/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 89ea0eee69..d512a22915 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1624,6 +1624,8 @@ def from_glob_path( self, path: str, *, connection: Optional[str] = None, name: Optional[str] = None ) -> dataframe.DataFrame: r"""Create a BigFrames DataFrame that contains a BigFrames Blob column from a global wildcard path. + This operation creates a temporary BQ Object Table under the hood and requires bigquery.connections.delegate permission or BigQuery Connection Admin role. + If you have an existing BQ Object Table, use read_gbq_object_table(). .. note:: BigFrames Blob is still under experiments. It may not work and subject to change in the future. 
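As a usage sketch of the call documented above (all names below are placeholders, not values from this patch; a glob path and a pre-provisioned BigQuery connection are assumed):

    import bigframes.pandas as bpd

    session = bpd.get_global_session()

    # from_glob_path creates a temporary BQ Object Table under the hood, so the
    # caller needs the bigquery.connections.delegate permission (or the BigQuery
    # Connection Admin role) on the connection referenced here.
    df = session.from_glob_path(
        "gs://my-bucket/images/*.png",  # placeholder bucket and glob
        connection="my-project.us.my-connection",  # placeholder connection
        name="blob_col",
    )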
From 97532c9ba02cd709d69666dd0afca5c1df8b9faf Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Tue, 28 Jan 2025 14:24:53 -0800
Subject: [PATCH 08/38] feat: support time_series_id_col in ARIMAPlus (#1282)

* manually port from shuowei-arima-plus branch; the format test does not pass yet

* I have resolved all conflicts after manual porting

* use inheritance for arima plus model, and add sql models

* resolve unexpected indent in docstring

---------

Co-authored-by: Shuowei Li
---
 bigframes/ml/base.py                      |   28 +
 bigframes/ml/core.py                      |   27 +-
 bigframes/ml/forecasting.py               |   58 +-
 tests/data/time_series.jsonl              | 1098 ++++++++++++++-------
 tests/data/time_series_schema.json        |    5 +
 tests/system/conftest.py                  |   84 +-
 tests/system/large/ml/test_forecasting.py |  101 +-
 tests/system/small/ml/conftest.py         |   18 +
 tests/system/small/ml/test_core.py        |   73 +-
 tests/system/small/ml/test_forecasting.py |  539 ++++++++--
 10 files changed, 1486 insertions(+), 545 deletions(-)

diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py
index f06de99181..c353e47f3a 100644
--- a/bigframes/ml/base.py
+++ b/bigframes/ml/base.py
@@ -165,6 +165,34 @@ def fit(
         return self._fit(X, y)
 
 
+class SupervisedTrainableWithIdColPredictor(SupervisedTrainablePredictor):
+    """Inherits from SupervisedTrainablePredictor,
+    but adds an optional id_col parameter to fit()."""
+
+    def __init__(self):
+        super().__init__()
+        self.id_col = None
+
+    def _fit(
+        self,
+        X: utils.ArrayType,
+        y: utils.ArrayType,
+        transforms=None,
+        id_col: Optional[utils.ArrayType] = None,
+    ):
+        return self
+
+    def fit(
+        self,
+        X: utils.ArrayType,
+        y: utils.ArrayType,
+        transforms=None,
+        id_col: Optional[utils.ArrayType] = None,
+    ):
+        self.id_col = id_col
+        return self._fit(X, y, transforms=transforms, id_col=self.id_col)
+
+
 class TrainableWithEvaluationPredictor(TrainablePredictor):
     """A BigQuery DataFrames ML Model base class that can be used to fit and predict outputs.
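A minimal end-to-end sketch of the id_col parameter this patch introduces (the table path below is illustrative; the column names mirror the time_series test fixture updated later in this patch):

    import bigframes.pandas as bpd
    from bigframes.ml.forecasting import ARIMAPlus

    df = bpd.read_gbq("my-project.my_dataset.time_series")  # illustrative path

    model = ARIMAPlus()
    # id_col is forwarded to BigQuery ML's TIME_SERIES_ID_COL option, so one
    # series is fitted per distinct id value.
    model.fit(df[["parsed_date"]], df[["total_visits"]], id_col=df[["id"]])

    # Forecasts then carry the id alongside forecast_timestamp/forecast_value.
    forecast = model.predict(horizon=30, confidence_level=0.9)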
diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index d038b8f4c0..ad00ed3f2c 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -181,15 +181,23 @@ def detect_anomalies( def forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame: sql = self._model_manipulation_sql_generator.ml_forecast(struct_options=options) - return self._session.read_gbq(sql, index_col="forecast_timestamp").reset_index() + timestamp_col_name = "forecast_timestamp" + index_cols = [timestamp_col_name] + first_col_name = self._session.read_gbq(sql).columns.values[0] + if timestamp_col_name != first_col_name: + index_cols.append(first_col_name) + return self._session.read_gbq(sql, index_col=index_cols).reset_index() def explain_forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame: sql = self._model_manipulation_sql_generator.ml_explain_forecast( struct_options=options ) - return self._session.read_gbq( - sql, index_col="time_series_timestamp" - ).reset_index() + timestamp_col_name = "time_series_timestamp" + index_cols = [timestamp_col_name] + first_col_name = self._session.read_gbq(sql).columns.values[0] + if timestamp_col_name != first_col_name: + index_cols.append(first_col_name) + return self._session.read_gbq(sql, index_col=index_cols).reset_index() def evaluate(self, input_data: Optional[bpd.DataFrame] = None): sql = self._model_manipulation_sql_generator.ml_evaluate( @@ -390,6 +398,7 @@ def create_time_series_model( self, X_train: bpd.DataFrame, y_train: bpd.DataFrame, + id_col: Optional[bpd.DataFrame] = None, transforms: Optional[Iterable[str]] = None, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> BqmlModel: @@ -399,13 +408,21 @@ def create_time_series_model( assert ( y_train.columns.size == 1 ), "Time stamp data input must only contain 1 column." + assert id_col is None or ( + id_col is not None and id_col.columns.size == 1 + ), "Time series id input is either None or must only contain 1 column." options = dict(options) # Cache dataframes to make sure base table is not a snapshot # cached dataframe creates a full copy, never uses snapshot - input_data = X_train.join(y_train, how="outer").cache() + input_data = X_train.join(y_train, how="outer") + if id_col is not None: + input_data = input_data.join(id_col, how="outer") + input_data = input_data.cache() options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) + if id_col is not None: + options.update({"TIME_SERIES_ID_COL": id_col.columns.tolist()[0]}) session = X_train._session model_ref = self._create_model_ref(session._anonymous_dataset) diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 4e6c5036e7..7aa8ba5a5f 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -45,7 +45,7 @@ @log_adapter.class_logger -class ARIMAPlus(base.SupervisedTrainablePredictor): +class ARIMAPlus(base.SupervisedTrainableWithIdColPredictor): """Time Series ARIMA Plus model. Args: @@ -183,18 +183,26 @@ def _fit( X: utils.ArrayType, y: utils.ArrayType, transforms: Optional[List[str]] = None, - ): + id_col: Optional[utils.ArrayType] = None, + ) -> ARIMAPlus: """Fit the model to training data. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): - A dataframe of training timestamp. 
-
-            y (bigframes.dataframe.DataFrame or bigframes.series.Series):
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series,
+                or pandas.core.frame.DataFrame or pandas.core.series.Series):
+                A dataframe or series of training timestamps.
+            y (bigframes.dataframe.DataFrame, or bigframes.series.Series,
+                or pandas.core.frame.DataFrame, or pandas.core.series.Series):
                 Target values for training.
             transforms (Optional[List[str]], default None):
                 Do not use. Internal param to be deprecated.
                 Use bigframes.ml.pipeline instead.
+            id_col (Optional[bigframes.dataframe.DataFrame]
+                or Optional[bigframes.series.Series]
+                or Optional[pandas.core.frame.DataFrame]
+                or Optional[pandas.core.series.Series]
+                or None, default None):
+                An optional dataframe or series of the training id column.
 
         Returns:
             ARIMAPlus: Fitted estimator.
@@ -202,18 +210,26 @@
         X, y = utils.batch_convert_to_dataframe(X, y)
 
         if X.columns.size != 1:
-            raise ValueError(
-                "Time series timestamp input X must only contain 1 column."
-            )
+            raise ValueError("Time series timestamp input X must only contain 1 column.")
         if y.columns.size != 1:
             raise ValueError("Time series data input y must only contain 1 column.")
 
+        if id_col is not None:
+            (id_col,) = utils.batch_convert_to_dataframe(id_col)
+
+            if id_col.columns.size != 1:
+                raise ValueError(
+                    "Time series id input id_col must only contain 1 column."
+                )
+
         self._bqml_model = self._bqml_model_factory.create_time_series_model(
             X,
             y,
+            id_col=id_col,
             transforms=transforms,
             options=self._bqml_options,
         )
+        return self
 
     def predict(
         self, X=None, *, horizon: int = 3, confidence_level: float = 0.95
@@ -237,7 +253,7 @@
         Returns:
             bigframes.dataframe.DataFrame: The predicted DataFrames. Which
-                contains 2 columns: "forecast_timestamp" and "forecast_value".
+                contains the columns "forecast_timestamp", an optional "id", and "forecast_value".
         """
         if horizon < 1 or horizon > 1000:
             raise ValueError(f"horizon must be [1, 1000], but is {horizon}.")
@@ -345,6 +361,7 @@
         self,
         X: utils.ArrayType,
         y: utils.ArrayType,
+        id_col: Optional[utils.ArrayType] = None,
    ) -> bpd.DataFrame:
         """Calculate evaluation metrics of the model.
 
@@ -355,13 +372,22 @@
            for the outputs relevant to this model type.
 
         Args:
-            X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
-                A BigQuery DataFrame only contains 1 column as
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series
+                or pandas.core.frame.DataFrame or pandas.core.series.Series):
+                A dataframe or series that contains only 1 column as
                 evaluation timestamp. The timestamp must be within the
                 horizon of the model, which by default is 1000 data points.
-            y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
-                A BigQuery DataFrame only contains 1 column as
+            y (bigframes.dataframe.DataFrame or bigframes.series.Series
+                or pandas.core.frame.DataFrame or pandas.core.series.Series):
+                A dataframe or series that contains only 1 column as
                 evaluation numeric values.
+            id_col (Optional[bigframes.dataframe.DataFrame]
+                or Optional[bigframes.series.Series]
+                or Optional[pandas.core.frame.DataFrame]
+                or Optional[pandas.core.series.Series]
+                or None, default None):
+                An optional dataframe or series containing at least 1 column as
+                the evaluation id column.
 
         Returns:
             bigframes.dataframe.DataFrame: A DataFrame as evaluation result.
@@ -371,6 +397,10 @@ def score( X, y = utils.batch_convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = X.join(y, how="outer") + if id_col is not None: + (id_col,) = utils.batch_convert_to_dataframe(id_col) + input_data = input_data.join(id_col, how="outer") + return self._bqml_model.evaluate(input_data) def summary( diff --git a/tests/data/time_series.jsonl b/tests/data/time_series.jsonl index e0f9ca7ae2..329e5a8b61 100644 --- a/tests/data/time_series.jsonl +++ b/tests/data/time_series.jsonl @@ -1,366 +1,732 @@ -{"parsed_date":"2017-07-01 00:00:00 UTC","total_visits":"2048"} -{"parsed_date":"2016-09-07 00:00:00 UTC","total_visits":"2562"} -{"parsed_date":"2016-10-25 00:00:00 UTC","total_visits":"3842"} -{"parsed_date":"2017-04-10 00:00:00 UTC","total_visits":"2563"} -{"parsed_date":"2017-01-09 00:00:00 UTC","total_visits":"2308"} -{"parsed_date":"2017-05-02 00:00:00 UTC","total_visits":"2564"} -{"parsed_date":"2016-11-11 00:00:00 UTC","total_visits":"3588"} -{"parsed_date":"2017-07-30 00:00:00 UTC","total_visits":"1799"} -{"parsed_date":"2017-06-10 00:00:00 UTC","total_visits":"1545"} -{"parsed_date":"2016-08-14 00:00:00 UTC","total_visits":"1801"} -{"parsed_date":"2017-05-14 00:00:00 UTC","total_visits":"1290"} -{"parsed_date":"2017-02-08 00:00:00 UTC","total_visits":"2570"} -{"parsed_date":"2017-06-01 00:00:00 UTC","total_visits":"2826"} -{"parsed_date":"2017-04-23 00:00:00 UTC","total_visits":"1548"} -{"parsed_date":"2016-11-04 00:00:00 UTC","total_visits":"3596"} -{"parsed_date":"2017-02-04 00:00:00 UTC","total_visits":"1549"} -{"parsed_date":"2016-12-09 00:00:00 UTC","total_visits":"2830"} -{"parsed_date":"2016-10-30 00:00:00 UTC","total_visits":"3086"} -{"parsed_date":"2017-03-28 00:00:00 UTC","total_visits":"2577"} -{"parsed_date":"2017-06-11 00:00:00 UTC","total_visits":"1555"} -{"parsed_date":"2016-12-17 00:00:00 UTC","total_visits":"2324"} -{"parsed_date":"2016-09-22 00:00:00 UTC","total_visits":"2581"} -{"parsed_date":"2017-01-29 00:00:00 UTC","total_visits":"1814"} -{"parsed_date":"2017-03-22 00:00:00 UTC","total_visits":"2582"} -{"parsed_date":"2017-02-21 00:00:00 UTC","total_visits":"2582"} -{"parsed_date":"2016-10-14 00:00:00 UTC","total_visits":"2838"} -{"parsed_date":"2017-04-27 00:00:00 UTC","total_visits":"2838"} -{"parsed_date":"2016-10-26 00:00:00 UTC","total_visits":"4375"} -{"parsed_date":"2016-08-22 00:00:00 UTC","total_visits":"2584"} -{"parsed_date":"2016-12-07 00:00:00 UTC","total_visits":"2840"} -{"parsed_date":"2017-01-20 00:00:00 UTC","total_visits":"2074"} -{"parsed_date":"2017-03-07 00:00:00 UTC","total_visits":"2586"} -{"parsed_date":"2017-05-16 00:00:00 UTC","total_visits":"3098"} -{"parsed_date":"2017-05-03 00:00:00 UTC","total_visits":"2588"} -{"parsed_date":"2017-05-01 00:00:00 UTC","total_visits":"2588"} -{"parsed_date":"2016-11-27 00:00:00 UTC","total_visits":"3356"} -{"parsed_date":"2017-04-29 00:00:00 UTC","total_visits":"1566"} -{"parsed_date":"2016-09-18 00:00:00 UTC","total_visits":"1822"} -{"parsed_date":"2017-03-23 00:00:00 UTC","total_visits":"2847"} -{"parsed_date":"2017-03-14 00:00:00 UTC","total_visits":"2338"} -{"parsed_date":"2016-12-21 00:00:00 UTC","total_visits":"2594"} -{"parsed_date":"2016-10-11 00:00:00 UTC","total_visits":"2850"} -{"parsed_date":"2017-01-24 00:00:00 UTC","total_visits":"3618"} -{"parsed_date":"2017-03-05 00:00:00 UTC","total_visits":"1827"} -{"parsed_date":"2017-01-19 00:00:00 UTC","total_visits":"2083"} -{"parsed_date":"2016-08-09 00:00:00 UTC","total_visits":"2851"} 
-{"parsed_date":"2017-04-08 00:00:00 UTC","total_visits":"1829"} -{"parsed_date":"2017-04-12 00:00:00 UTC","total_visits":"2341"} -{"parsed_date":"2016-09-29 00:00:00 UTC","total_visits":"2597"} -{"parsed_date":"2016-12-20 00:00:00 UTC","total_visits":"3110"} -{"parsed_date":"2017-01-15 00:00:00 UTC","total_visits":"1576"} -{"parsed_date":"2017-04-14 00:00:00 UTC","total_visits":"1834"} -{"parsed_date":"2017-02-28 00:00:00 UTC","total_visits":"2347"} -{"parsed_date":"2016-09-16 00:00:00 UTC","total_visits":"2603"} -{"parsed_date":"2016-10-18 00:00:00 UTC","total_visits":"3628"} -{"parsed_date":"2017-02-24 00:00:00 UTC","total_visits":"2093"} -{"parsed_date":"2017-05-17 00:00:00 UTC","total_visits":"3117"} -{"parsed_date":"2017-06-23 00:00:00 UTC","total_visits":"2095"} -{"parsed_date":"2016-11-12 00:00:00 UTC","total_visits":"3119"} -{"parsed_date":"2016-11-21 00:00:00 UTC","total_visits":"4143"} -{"parsed_date":"2017-02-27 00:00:00 UTC","total_visits":"2352"} -{"parsed_date":"2016-12-26 00:00:00 UTC","total_visits":"1586"} -{"parsed_date":"2017-04-25 00:00:00 UTC","total_visits":"2354"} -{"parsed_date":"2017-03-21 00:00:00 UTC","total_visits":"2611"} -{"parsed_date":"2016-12-22 00:00:00 UTC","total_visits":"2100"} -{"parsed_date":"2016-10-01 00:00:00 UTC","total_visits":"1589"} -{"parsed_date":"2016-09-24 00:00:00 UTC","total_visits":"1845"} -{"parsed_date":"2017-06-21 00:00:00 UTC","total_visits":"2357"} -{"parsed_date":"2016-09-02 00:00:00 UTC","total_visits":"2613"} -{"parsed_date":"2016-08-26 00:00:00 UTC","total_visits":"2359"} -{"parsed_date":"2016-10-12 00:00:00 UTC","total_visits":"2871"} -{"parsed_date":"2017-05-15 00:00:00 UTC","total_visits":"2360"} -{"parsed_date":"2017-06-12 00:00:00 UTC","total_visits":"2361"} -{"parsed_date":"2016-08-16 00:00:00 UTC","total_visits":"2873"} -{"parsed_date":"2017-04-30 00:00:00 UTC","total_visits":"1594"} -{"parsed_date":"2017-04-05 00:00:00 UTC","total_visits":"2619"} -{"parsed_date":"2016-08-12 00:00:00 UTC","total_visits":"2619"} -{"parsed_date":"2016-11-08 00:00:00 UTC","total_visits":"3899"} -{"parsed_date":"2016-08-13 00:00:00 UTC","total_visits":"1596"} -{"parsed_date":"2017-05-09 00:00:00 UTC","total_visits":"2108"} -{"parsed_date":"2017-02-23 00:00:00 UTC","total_visits":"2364"} -{"parsed_date":"2017-07-31 00:00:00 UTC","total_visits":"2620"} -{"parsed_date":"2017-06-25 00:00:00 UTC","total_visits":"1597"} -{"parsed_date":"2017-07-29 00:00:00 UTC","total_visits":"1597"} -{"parsed_date":"2016-09-17 00:00:00 UTC","total_visits":"1853"} -{"parsed_date":"2016-12-27 00:00:00 UTC","total_visits":"1855"} -{"parsed_date":"2017-05-20 00:00:00 UTC","total_visits":"1855"} -{"parsed_date":"2016-10-08 00:00:00 UTC","total_visits":"2114"} -{"parsed_date":"2016-10-27 00:00:00 UTC","total_visits":"4162"} -{"parsed_date":"2017-07-08 00:00:00 UTC","total_visits":"1859"} -{"parsed_date":"2016-08-24 00:00:00 UTC","total_visits":"2627"} -{"parsed_date":"2016-12-23 00:00:00 UTC","total_visits":"1604"} -{"parsed_date":"2017-02-02 00:00:00 UTC","total_visits":"2372"} -{"parsed_date":"2016-09-08 00:00:00 UTC","total_visits":"2628"} -{"parsed_date":"2017-04-02 00:00:00 UTC","total_visits":"1861"} -{"parsed_date":"2017-02-15 00:00:00 UTC","total_visits":"2629"} -{"parsed_date":"2017-07-05 00:00:00 UTC","total_visits":"2885"} -{"parsed_date":"2016-10-17 00:00:00 UTC","total_visits":"3397"} -{"parsed_date":"2017-02-20 00:00:00 UTC","total_visits":"2374"} -{"parsed_date":"2017-03-24 00:00:00 UTC","total_visits":"2374"} -{"parsed_date":"2017-04-20 00:00:00 
UTC","total_visits":"2374"} -{"parsed_date":"2016-11-18 00:00:00 UTC","total_visits":"3654"} -{"parsed_date":"2017-07-25 00:00:00 UTC","total_visits":"2631"} -{"parsed_date":"2016-11-13 00:00:00 UTC","total_visits":"3144"} -{"parsed_date":"2017-03-18 00:00:00 UTC","total_visits":"1610"} -{"parsed_date":"2016-08-03 00:00:00 UTC","total_visits":"2890"} -{"parsed_date":"2016-08-19 00:00:00 UTC","total_visits":"2379"} -{"parsed_date":"2017-02-14 00:00:00 UTC","total_visits":"2379"} -{"parsed_date":"2017-07-11 00:00:00 UTC","total_visits":"2635"} -{"parsed_date":"2017-04-22 00:00:00 UTC","total_visits":"1612"} -{"parsed_date":"2016-10-07 00:00:00 UTC","total_visits":"2892"} -{"parsed_date":"2016-09-05 00:00:00 UTC","total_visits":"2125"} -{"parsed_date":"2016-09-23 00:00:00 UTC","total_visits":"2381"} -{"parsed_date":"2016-11-15 00:00:00 UTC","total_visits":"4685"} -{"parsed_date":"2017-01-28 00:00:00 UTC","total_visits":"1614"} -{"parsed_date":"2017-07-14 00:00:00 UTC","total_visits":"2382"} -{"parsed_date":"2017-01-07 00:00:00 UTC","total_visits":"1615"} -{"parsed_date":"2017-04-03 00:00:00 UTC","total_visits":"2383"} -{"parsed_date":"2017-03-20 00:00:00 UTC","total_visits":"2383"} -{"parsed_date":"2016-12-18 00:00:00 UTC","total_visits":"2128"} -{"parsed_date":"2017-03-17 00:00:00 UTC","total_visits":"2129"} -{"parsed_date":"2017-05-23 00:00:00 UTC","total_visits":"2129"} -{"parsed_date":"2016-11-30 00:00:00 UTC","total_visits":"4435"} -{"parsed_date":"2017-01-01 00:00:00 UTC","total_visits":"1364"} -{"parsed_date":"2017-01-02 00:00:00 UTC","total_visits":"1620"} -{"parsed_date":"2016-09-25 00:00:00 UTC","total_visits":"1877"} -{"parsed_date":"2016-08-07 00:00:00 UTC","total_visits":"1622"} -{"parsed_date":"2016-10-09 00:00:00 UTC","total_visits":"2134"} -{"parsed_date":"2017-03-01 00:00:00 UTC","total_visits":"2390"} -{"parsed_date":"2017-01-04 00:00:00 UTC","total_visits":"2390"} -{"parsed_date":"2017-06-06 00:00:00 UTC","total_visits":"2391"} -{"parsed_date":"2017-04-18 00:00:00 UTC","total_visits":"2391"} -{"parsed_date":"2017-04-06 00:00:00 UTC","total_visits":"2647"} -{"parsed_date":"2017-01-30 00:00:00 UTC","total_visits":"2392"} -{"parsed_date":"2016-10-16 00:00:00 UTC","total_visits":"2649"} -{"parsed_date":"2016-08-04 00:00:00 UTC","total_visits":"3161"} -{"parsed_date":"2016-10-21 00:00:00 UTC","total_visits":"3419"} -{"parsed_date":"2016-08-02 00:00:00 UTC","total_visits":"2140"} -{"parsed_date":"2017-03-06 00:00:00 UTC","total_visits":"2396"} -{"parsed_date":"2016-09-13 00:00:00 UTC","total_visits":"2396"} -{"parsed_date":"2016-09-14 00:00:00 UTC","total_visits":"2652"} -{"parsed_date":"2017-04-19 00:00:00 UTC","total_visits":"2397"} -{"parsed_date":"2017-06-19 00:00:00 UTC","total_visits":"2142"} -{"parsed_date":"2016-12-13 00:00:00 UTC","total_visits":"3166"} -{"parsed_date":"2017-06-20 00:00:00 UTC","total_visits":"2143"} -{"parsed_date":"2016-10-10 00:00:00 UTC","total_visits":"2911"} -{"parsed_date":"2017-07-06 00:00:00 UTC","total_visits":"2658"} -{"parsed_date":"2017-01-03 00:00:00 UTC","total_visits":"2403"} -{"parsed_date":"2017-01-08 00:00:00 UTC","total_visits":"1637"} -{"parsed_date":"2017-02-25 00:00:00 UTC","total_visits":"1638"} -{"parsed_date":"2017-05-24 00:00:00 UTC","total_visits":"2406"} -{"parsed_date":"2016-11-22 00:00:00 UTC","total_visits":"3942"} -{"parsed_date":"2017-05-06 00:00:00 UTC","total_visits":"1383"} -{"parsed_date":"2017-07-02 00:00:00 UTC","total_visits":"1895"} -{"parsed_date":"2016-12-01 00:00:00 UTC","total_visits":"4200"} 
-{"parsed_date":"2017-03-16 00:00:00 UTC","total_visits":"2409"} -{"parsed_date":"2016-12-12 00:00:00 UTC","total_visits":"3433"} -{"parsed_date":"2016-12-25 00:00:00 UTC","total_visits":"1386"} -{"parsed_date":"2017-02-26 00:00:00 UTC","total_visits":"1643"} -{"parsed_date":"2017-04-28 00:00:00 UTC","total_visits":"2411"} -{"parsed_date":"2016-08-11 00:00:00 UTC","total_visits":"2667"} -{"parsed_date":"2017-07-20 00:00:00 UTC","total_visits":"2668"} -{"parsed_date":"2017-05-21 00:00:00 UTC","total_visits":"1645"} -{"parsed_date":"2017-06-17 00:00:00 UTC","total_visits":"1391"} -{"parsed_date":"2016-12-29 00:00:00 UTC","total_visits":"1647"} -{"parsed_date":"2017-07-17 00:00:00 UTC","total_visits":"2671"} -{"parsed_date":"2017-01-16 00:00:00 UTC","total_visits":"1906"} -{"parsed_date":"2017-03-03 00:00:00 UTC","total_visits":"2162"} -{"parsed_date":"2016-11-14 00:00:00 UTC","total_visits":"4466"} -{"parsed_date":"2016-08-30 00:00:00 UTC","total_visits":"2675"} -{"parsed_date":"2016-08-27 00:00:00 UTC","total_visits":"1654"} -{"parsed_date":"2017-02-09 00:00:00 UTC","total_visits":"2678"} -{"parsed_date":"2017-06-03 00:00:00 UTC","total_visits":"1399"} -{"parsed_date":"2017-05-07 00:00:00 UTC","total_visits":"1400"} -{"parsed_date":"2016-11-02 00:00:00 UTC","total_visits":"3960"} -{"parsed_date":"2016-12-15 00:00:00 UTC","total_visits":"2937"} -{"parsed_date":"2017-04-01 00:00:00 UTC","total_visits":"2170"} -{"parsed_date":"2017-07-21 00:00:00 UTC","total_visits":"2427"} -{"parsed_date":"2016-08-06 00:00:00 UTC","total_visits":"1663"} -{"parsed_date":"2016-09-01 00:00:00 UTC","total_visits":"2687"} -{"parsed_date":"2017-06-28 00:00:00 UTC","total_visits":"2687"} -{"parsed_date":"2016-08-20 00:00:00 UTC","total_visits":"1664"} -{"parsed_date":"2017-04-26 00:00:00 UTC","total_visits":"4224"} -{"parsed_date":"2017-07-09 00:00:00 UTC","total_visits":"1921"} -{"parsed_date":"2017-07-28 00:00:00 UTC","total_visits":"2433"} -{"parsed_date":"2016-09-19 00:00:00 UTC","total_visits":"2689"} -{"parsed_date":"2017-07-24 00:00:00 UTC","total_visits":"2436"} -{"parsed_date":"2017-06-13 00:00:00 UTC","total_visits":"2181"} -{"parsed_date":"2016-09-15 00:00:00 UTC","total_visits":"2949"} -{"parsed_date":"2017-02-03 00:00:00 UTC","total_visits":"2182"} -{"parsed_date":"2016-09-10 00:00:00 UTC","total_visits":"1671"} -{"parsed_date":"2017-06-09 00:00:00 UTC","total_visits":"1927"} -{"parsed_date":"2017-01-11 00:00:00 UTC","total_visits":"2185"} -{"parsed_date":"2017-02-19 00:00:00 UTC","total_visits":"2187"} -{"parsed_date":"2017-01-17 00:00:00 UTC","total_visits":"2443"} -{"parsed_date":"2017-05-12 00:00:00 UTC","total_visits":"1932"} -{"parsed_date":"2016-12-16 00:00:00 UTC","total_visits":"2956"} -{"parsed_date":"2017-02-01 00:00:00 UTC","total_visits":"2445"} -{"parsed_date":"2016-11-26 00:00:00 UTC","total_visits":"3213"} -{"parsed_date":"2017-06-02 00:00:00 UTC","total_visits":"2190"} -{"parsed_date":"2016-08-05 00:00:00 UTC","total_visits":"2702"} -{"parsed_date":"2016-11-01 00:00:00 UTC","total_visits":"3728"} -{"parsed_date":"2017-01-05 00:00:00 UTC","total_visits":"2193"} -{"parsed_date":"2017-03-08 00:00:00 UTC","total_visits":"2449"} -{"parsed_date":"2016-08-28 00:00:00 UTC","total_visits":"1682"} -{"parsed_date":"2017-07-04 00:00:00 UTC","total_visits":"1938"} -{"parsed_date":"2017-03-10 00:00:00 UTC","total_visits":"2194"} -{"parsed_date":"2017-07-07 00:00:00 UTC","total_visits":"2450"} -{"parsed_date":"2016-10-29 00:00:00 UTC","total_visits":"2964"} -{"parsed_date":"2016-10-13 00:00:00 
UTC","total_visits":"2964"} -{"parsed_date":"2016-12-04 00:00:00 UTC","total_visits":"3220"} -{"parsed_date":"2017-01-21 00:00:00 UTC","total_visits":"1685"} -{"parsed_date":"2017-06-29 00:00:00 UTC","total_visits":"2709"} -{"parsed_date":"2016-08-29 00:00:00 UTC","total_visits":"2454"} -{"parsed_date":"2016-12-19 00:00:00 UTC","total_visits":"3222"} -{"parsed_date":"2017-05-30 00:00:00 UTC","total_visits":"2199"} -{"parsed_date":"2017-02-10 00:00:00 UTC","total_visits":"2199"} -{"parsed_date":"2016-08-31 00:00:00 UTC","total_visits":"3223"} -{"parsed_date":"2017-06-18 00:00:00 UTC","total_visits":"1432"} -{"parsed_date":"2017-01-12 00:00:00 UTC","total_visits":"2203"} -{"parsed_date":"2017-05-18 00:00:00 UTC","total_visits":"2715"} -{"parsed_date":"2016-10-23 00:00:00 UTC","total_visits":"2971"} -{"parsed_date":"2016-09-04 00:00:00 UTC","total_visits":"1692"} -{"parsed_date":"2016-12-10 00:00:00 UTC","total_visits":"2207"} -{"parsed_date":"2016-12-11 00:00:00 UTC","total_visits":"2208"} -{"parsed_date":"2017-04-11 00:00:00 UTC","total_visits":"2464"} -{"parsed_date":"2016-09-21 00:00:00 UTC","total_visits":"2720"} -{"parsed_date":"2016-11-06 00:00:00 UTC","total_visits":"3232"} -{"parsed_date":"2017-01-26 00:00:00 UTC","total_visits":"2209"} -{"parsed_date":"2016-09-12 00:00:00 UTC","total_visits":"2465"} -{"parsed_date":"2017-04-21 00:00:00 UTC","total_visits":"2210"} -{"parsed_date":"2017-01-06 00:00:00 UTC","total_visits":"2210"} -{"parsed_date":"2017-04-04 00:00:00 UTC","total_visits":"2978"} -{"parsed_date":"2017-01-22 00:00:00 UTC","total_visits":"1700"} -{"parsed_date":"2017-07-26 00:00:00 UTC","total_visits":"2725"} -{"parsed_date":"2016-08-18 00:00:00 UTC","total_visits":"2725"} -{"parsed_date":"2016-09-27 00:00:00 UTC","total_visits":"2727"} -{"parsed_date":"2016-12-02 00:00:00 UTC","total_visits":"3751"} -{"parsed_date":"2017-05-05 00:00:00 UTC","total_visits":"1960"} -{"parsed_date":"2016-11-19 00:00:00 UTC","total_visits":"2984"} -{"parsed_date":"2016-11-09 00:00:00 UTC","total_visits":"3752"} -{"parsed_date":"2016-12-05 00:00:00 UTC","total_visits":"4265"} -{"parsed_date":"2017-05-11 00:00:00 UTC","total_visits":"2218"} -{"parsed_date":"2017-01-25 00:00:00 UTC","total_visits":"2986"} -{"parsed_date":"2017-03-11 00:00:00 UTC","total_visits":"1707"} -{"parsed_date":"2017-03-30 00:00:00 UTC","total_visits":"2731"} -{"parsed_date":"2016-10-20 00:00:00 UTC","total_visits":"3755"} -{"parsed_date":"2017-02-07 00:00:00 UTC","total_visits":"2476"} -{"parsed_date":"2017-02-22 00:00:00 UTC","total_visits":"2477"} -{"parsed_date":"2017-07-23 00:00:00 UTC","total_visits":"1966"} -{"parsed_date":"2016-11-03 00:00:00 UTC","total_visits":"4014"} -{"parsed_date":"2016-08-01 00:00:00 UTC","total_visits":"1711"} -{"parsed_date":"2017-01-13 00:00:00 UTC","total_visits":"1967"} -{"parsed_date":"2017-05-19 00:00:00 UTC","total_visits":"2223"} -{"parsed_date":"2016-11-20 00:00:00 UTC","total_visits":"3247"} -{"parsed_date":"2016-11-25 00:00:00 UTC","total_visits":"3759"} -{"parsed_date":"2017-03-25 00:00:00 UTC","total_visits":"1712"} -{"parsed_date":"2017-01-27 00:00:00 UTC","total_visits":"1969"} -{"parsed_date":"2017-06-26 00:00:00 UTC","total_visits":"2226"} -{"parsed_date":"2017-05-25 00:00:00 UTC","total_visits":"2228"} -{"parsed_date":"2017-01-31 00:00:00 UTC","total_visits":"2229"} -{"parsed_date":"2017-07-13 00:00:00 UTC","total_visits":"2741"} -{"parsed_date":"2017-03-15 00:00:00 UTC","total_visits":"2486"} -{"parsed_date":"2017-05-28 00:00:00 UTC","total_visits":"1463"} 
-{"parsed_date":"2017-03-09 00:00:00 UTC","total_visits":"2231"} -{"parsed_date":"2017-07-15 00:00:00 UTC","total_visits":"1721"} -{"parsed_date":"2016-11-24 00:00:00 UTC","total_visits":"3770"} -{"parsed_date":"2016-10-05 00:00:00 UTC","total_visits":"3770"} -{"parsed_date":"2016-12-31 00:00:00 UTC","total_visits":"1211"} -{"parsed_date":"2016-10-02 00:00:00 UTC","total_visits":"1724"} -{"parsed_date":"2017-07-22 00:00:00 UTC","total_visits":"1724"} -{"parsed_date":"2016-09-11 00:00:00 UTC","total_visits":"1725"} -{"parsed_date":"2017-06-15 00:00:00 UTC","total_visits":"2237"} -{"parsed_date":"2017-06-05 00:00:00 UTC","total_visits":"2493"} -{"parsed_date":"2017-02-06 00:00:00 UTC","total_visits":"2238"} -{"parsed_date":"2016-10-15 00:00:00 UTC","total_visits":"2495"} -{"parsed_date":"2016-08-21 00:00:00 UTC","total_visits":"1730"} -{"parsed_date":"2016-08-23 00:00:00 UTC","total_visits":"2754"} -{"parsed_date":"2017-06-30 00:00:00 UTC","total_visits":"2499"} -{"parsed_date":"2017-01-18 00:00:00 UTC","total_visits":"2245"} -{"parsed_date":"2016-08-10 00:00:00 UTC","total_visits":"2757"} -{"parsed_date":"2016-12-08 00:00:00 UTC","total_visits":"3013"} -{"parsed_date":"2016-11-28 00:00:00 UTC","total_visits":"4807"} -{"parsed_date":"2017-05-22 00:00:00 UTC","total_visits":"2248"} -{"parsed_date":"2016-09-20 00:00:00 UTC","total_visits":"2760"} -{"parsed_date":"2016-10-06 00:00:00 UTC","total_visits":"3016"} -{"parsed_date":"2016-09-06 00:00:00 UTC","total_visits":"2508"} -{"parsed_date":"2016-09-03 00:00:00 UTC","total_visits":"1741"} -{"parsed_date":"2016-12-06 00:00:00 UTC","total_visits":"3021"} -{"parsed_date":"2016-12-24 00:00:00 UTC","total_visits":"1231"} -{"parsed_date":"2016-10-28 00:00:00 UTC","total_visits":"3791"} -{"parsed_date":"2016-12-30 00:00:00 UTC","total_visits":"1232"} -{"parsed_date":"2017-05-29 00:00:00 UTC","total_visits":"1745"} -{"parsed_date":"2017-07-10 00:00:00 UTC","total_visits":"2769"} -{"parsed_date":"2017-06-22 00:00:00 UTC","total_visits":"2258"} -{"parsed_date":"2017-07-19 00:00:00 UTC","total_visits":"2514"} -{"parsed_date":"2016-10-03 00:00:00 UTC","total_visits":"2514"} -{"parsed_date":"2017-06-14 00:00:00 UTC","total_visits":"2517"} -{"parsed_date":"2016-10-22 00:00:00 UTC","total_visits":"3029"} -{"parsed_date":"2017-01-23 00:00:00 UTC","total_visits":"2262"} -{"parsed_date":"2017-04-24 00:00:00 UTC","total_visits":"2263"} -{"parsed_date":"2016-11-10 00:00:00 UTC","total_visits":"4055"} -{"parsed_date":"2016-09-26 00:00:00 UTC","total_visits":"2776"} -{"parsed_date":"2016-10-19 00:00:00 UTC","total_visits":"3544"} -{"parsed_date":"2017-03-04 00:00:00 UTC","total_visits":"1753"} -{"parsed_date":"2017-05-26 00:00:00 UTC","total_visits":"2009"} -{"parsed_date":"2017-02-13 00:00:00 UTC","total_visits":"2266"} -{"parsed_date":"2017-02-18 00:00:00 UTC","total_visits":"1755"} -{"parsed_date":"2017-03-02 00:00:00 UTC","total_visits":"2267"} -{"parsed_date":"2017-03-31 00:00:00 UTC","total_visits":"2268"} -{"parsed_date":"2017-01-10 00:00:00 UTC","total_visits":"2268"} -{"parsed_date":"2017-03-29 00:00:00 UTC","total_visits":"2525"} -{"parsed_date":"2017-03-27 00:00:00 UTC","total_visits":"2525"} -{"parsed_date":"2016-11-23 00:00:00 UTC","total_visits":"3805"} -{"parsed_date":"2017-05-27 00:00:00 UTC","total_visits":"1502"} -{"parsed_date":"2016-10-24 00:00:00 UTC","total_visits":"4063"} -{"parsed_date":"2016-12-14 00:00:00 UTC","total_visits":"3040"} -{"parsed_date":"2017-02-11 00:00:00 UTC","total_visits":"1761"} -{"parsed_date":"2017-07-27 00:00:00 
UTC","total_visits":"2529"} -{"parsed_date":"2017-02-17 00:00:00 UTC","total_visits":"2785"} -{"parsed_date":"2017-04-15 00:00:00 UTC","total_visits":"1506"} -{"parsed_date":"2016-11-05 00:00:00 UTC","total_visits":"3042"} -{"parsed_date":"2016-10-04 00:00:00 UTC","total_visits":"4322"} -{"parsed_date":"2017-05-13 00:00:00 UTC","total_visits":"1251"} -{"parsed_date":"2017-04-16 00:00:00 UTC","total_visits":"1507"} -{"parsed_date":"2016-12-28 00:00:00 UTC","total_visits":"1763"} -{"parsed_date":"2016-08-15 00:00:00 UTC","total_visits":"3043"} -{"parsed_date":"2016-12-03 00:00:00 UTC","total_visits":"3044"} -{"parsed_date":"2017-06-27 00:00:00 UTC","total_visits":"2789"} -{"parsed_date":"2017-06-24 00:00:00 UTC","total_visits":"1510"} -{"parsed_date":"2017-07-16 00:00:00 UTC","total_visits":"1766"} -{"parsed_date":"2017-04-09 00:00:00 UTC","total_visits":"1766"} -{"parsed_date":"2017-06-07 00:00:00 UTC","total_visits":"2279"} -{"parsed_date":"2017-04-17 00:00:00 UTC","total_visits":"2279"} -{"parsed_date":"2016-09-28 00:00:00 UTC","total_visits":"2535"} -{"parsed_date":"2017-03-26 00:00:00 UTC","total_visits":"1768"} -{"parsed_date":"2017-05-10 00:00:00 UTC","total_visits":"2024"} -{"parsed_date":"2017-06-08 00:00:00 UTC","total_visits":"2280"} -{"parsed_date":"2017-05-08 00:00:00 UTC","total_visits":"2025"} -{"parsed_date":"2017-03-13 00:00:00 UTC","total_visits":"2537"} -{"parsed_date":"2016-11-17 00:00:00 UTC","total_visits":"4074"} -{"parsed_date":"2016-08-25 00:00:00 UTC","total_visits":"2539"} -{"parsed_date":"2017-02-16 00:00:00 UTC","total_visits":"2539"} -{"parsed_date":"2017-06-16 00:00:00 UTC","total_visits":"2028"} -{"parsed_date":"2016-11-16 00:00:00 UTC","total_visits":"4334"} -{"parsed_date":"2016-08-17 00:00:00 UTC","total_visits":"2799"} -{"parsed_date":"2017-03-19 00:00:00 UTC","total_visits":"1776"} -{"parsed_date":"2016-11-29 00:00:00 UTC","total_visits":"4337"} -{"parsed_date":"2017-02-05 00:00:00 UTC","total_visits":"1522"} -{"parsed_date":"2016-10-31 00:00:00 UTC","total_visits":"3827"} -{"parsed_date":"2017-05-31 00:00:00 UTC","total_visits":"2292"} -{"parsed_date":"2017-07-18 00:00:00 UTC","total_visits":"2804"} -{"parsed_date":"2017-03-12 00:00:00 UTC","total_visits":"1781"} -{"parsed_date":"2016-09-09 00:00:00 UTC","total_visits":"2549"} -{"parsed_date":"2017-01-14 00:00:00 UTC","total_visits":"1526"} -{"parsed_date":"2017-05-04 00:00:00 UTC","total_visits":"2806"} -{"parsed_date":"2016-11-07 00:00:00 UTC","total_visits":"3832"} -{"parsed_date":"2017-04-07 00:00:00 UTC","total_visits":"2297"} -{"parsed_date":"2017-07-12 00:00:00 UTC","total_visits":"2554"} -{"parsed_date":"2017-04-13 00:00:00 UTC","total_visits":"2300"} -{"parsed_date":"2017-08-01 00:00:00 UTC","total_visits":"2556"} -{"parsed_date":"2017-06-04 00:00:00 UTC","total_visits":"1534"} -{"parsed_date":"2017-02-12 00:00:00 UTC","total_visits":"1790"} -{"parsed_date":"2017-07-03 00:00:00 UTC","total_visits":"2046"} -{"parsed_date":"2016-09-30 00:00:00 UTC","total_visits":"2303"} -{"parsed_date":"2016-08-08 00:00:00 UTC","total_visits":"2815"} +{"parsed_date":"2017-07-01 00:00:00 UTC","id":"1","total_visits":"2048"} +{"parsed_date":"2016-09-07 00:00:00 UTC","id":"1","total_visits":"2562"} +{"parsed_date":"2016-10-25 00:00:00 UTC","id":"1","total_visits":"3842"} +{"parsed_date":"2017-04-10 00:00:00 UTC","id":"1","total_visits":"2563"} +{"parsed_date":"2017-01-09 00:00:00 UTC","id":"1","total_visits":"2308"} +{"parsed_date":"2017-05-02 00:00:00 UTC","id":"1","total_visits":"2564"} +{"parsed_date":"2016-11-11 
00:00:00 UTC","id":"1","total_visits":"3588"} +{"parsed_date":"2017-07-30 00:00:00 UTC","id":"1","total_visits":"1799"} +{"parsed_date":"2017-06-10 00:00:00 UTC","id":"1","total_visits":"1545"} +{"parsed_date":"2016-08-14 00:00:00 UTC","id":"1","total_visits":"1801"} +{"parsed_date":"2017-05-14 00:00:00 UTC","id":"1","total_visits":"1290"} +{"parsed_date":"2017-02-08 00:00:00 UTC","id":"1","total_visits":"2570"} +{"parsed_date":"2017-06-01 00:00:00 UTC","id":"1","total_visits":"2826"} +{"parsed_date":"2017-04-23 00:00:00 UTC","id":"1","total_visits":"1548"} +{"parsed_date":"2016-11-04 00:00:00 UTC","id":"1","total_visits":"3596"} +{"parsed_date":"2017-02-04 00:00:00 UTC","id":"1","total_visits":"1549"} +{"parsed_date":"2016-12-09 00:00:00 UTC","id":"1","total_visits":"2830"} +{"parsed_date":"2016-10-30 00:00:00 UTC","id":"1","total_visits":"3086"} +{"parsed_date":"2017-03-28 00:00:00 UTC","id":"1","total_visits":"2577"} +{"parsed_date":"2017-06-11 00:00:00 UTC","id":"1","total_visits":"1555"} +{"parsed_date":"2016-12-17 00:00:00 UTC","id":"1","total_visits":"2324"} +{"parsed_date":"2016-09-22 00:00:00 UTC","id":"1","total_visits":"2581"} +{"parsed_date":"2017-01-29 00:00:00 UTC","id":"1","total_visits":"1814"} +{"parsed_date":"2017-03-22 00:00:00 UTC","id":"1","total_visits":"2582"} +{"parsed_date":"2017-02-21 00:00:00 UTC","id":"1","total_visits":"2582"} +{"parsed_date":"2016-10-14 00:00:00 UTC","id":"1","total_visits":"2838"} +{"parsed_date":"2017-04-27 00:00:00 UTC","id":"1","total_visits":"2838"} +{"parsed_date":"2016-10-26 00:00:00 UTC","id":"1","total_visits":"4375"} +{"parsed_date":"2016-08-22 00:00:00 UTC","id":"1","total_visits":"2584"} +{"parsed_date":"2016-12-07 00:00:00 UTC","id":"1","total_visits":"2840"} +{"parsed_date":"2017-01-20 00:00:00 UTC","id":"1","total_visits":"2074"} +{"parsed_date":"2017-03-07 00:00:00 UTC","id":"1","total_visits":"2586"} +{"parsed_date":"2017-05-16 00:00:00 UTC","id":"1","total_visits":"3098"} +{"parsed_date":"2017-05-03 00:00:00 UTC","id":"1","total_visits":"2588"} +{"parsed_date":"2017-05-01 00:00:00 UTC","id":"1","total_visits":"2588"} +{"parsed_date":"2016-11-27 00:00:00 UTC","id":"1","total_visits":"3356"} +{"parsed_date":"2017-04-29 00:00:00 UTC","id":"1","total_visits":"1566"} +{"parsed_date":"2016-09-18 00:00:00 UTC","id":"1","total_visits":"1822"} +{"parsed_date":"2017-03-23 00:00:00 UTC","id":"1","total_visits":"2847"} +{"parsed_date":"2017-03-14 00:00:00 UTC","id":"1","total_visits":"2338"} +{"parsed_date":"2016-12-21 00:00:00 UTC","id":"1","total_visits":"2594"} +{"parsed_date":"2016-10-11 00:00:00 UTC","id":"1","total_visits":"2850"} +{"parsed_date":"2017-01-24 00:00:00 UTC","id":"1","total_visits":"3618"} +{"parsed_date":"2017-03-05 00:00:00 UTC","id":"1","total_visits":"1827"} +{"parsed_date":"2017-01-19 00:00:00 UTC","id":"1","total_visits":"2083"} +{"parsed_date":"2016-08-09 00:00:00 UTC","id":"1","total_visits":"2851"} +{"parsed_date":"2017-04-08 00:00:00 UTC","id":"1","total_visits":"1829"} +{"parsed_date":"2017-04-12 00:00:00 UTC","id":"1","total_visits":"2341"} +{"parsed_date":"2016-09-29 00:00:00 UTC","id":"1","total_visits":"2597"} +{"parsed_date":"2016-12-20 00:00:00 UTC","id":"1","total_visits":"3110"} +{"parsed_date":"2017-01-15 00:00:00 UTC","id":"1","total_visits":"1576"} +{"parsed_date":"2017-04-14 00:00:00 UTC","id":"1","total_visits":"1834"} +{"parsed_date":"2017-02-28 00:00:00 UTC","id":"1","total_visits":"2347"} +{"parsed_date":"2016-09-16 00:00:00 UTC","id":"1","total_visits":"2603"} +{"parsed_date":"2016-10-18 
00:00:00 UTC","id":"1","total_visits":"3628"} +{"parsed_date":"2017-02-24 00:00:00 UTC","id":"1","total_visits":"2093"} +{"parsed_date":"2017-05-17 00:00:00 UTC","id":"1","total_visits":"3117"} +{"parsed_date":"2017-06-23 00:00:00 UTC","id":"1","total_visits":"2095"} +{"parsed_date":"2016-11-12 00:00:00 UTC","id":"1","total_visits":"3119"} +{"parsed_date":"2016-11-21 00:00:00 UTC","id":"1","total_visits":"4143"} +{"parsed_date":"2017-02-27 00:00:00 UTC","id":"1","total_visits":"2352"} +{"parsed_date":"2016-12-26 00:00:00 UTC","id":"1","total_visits":"1586"} +{"parsed_date":"2017-04-25 00:00:00 UTC","id":"1","total_visits":"2354"} +{"parsed_date":"2017-03-21 00:00:00 UTC","id":"1","total_visits":"2611"} +{"parsed_date":"2016-12-22 00:00:00 UTC","id":"1","total_visits":"2100"} +{"parsed_date":"2016-10-01 00:00:00 UTC","id":"1","total_visits":"1589"} +{"parsed_date":"2016-09-24 00:00:00 UTC","id":"1","total_visits":"1845"} +{"parsed_date":"2017-06-21 00:00:00 UTC","id":"1","total_visits":"2357"} +{"parsed_date":"2016-09-02 00:00:00 UTC","id":"1","total_visits":"2613"} +{"parsed_date":"2016-08-26 00:00:00 UTC","id":"1","total_visits":"2359"} +{"parsed_date":"2016-10-12 00:00:00 UTC","id":"1","total_visits":"2871"} +{"parsed_date":"2017-05-15 00:00:00 UTC","id":"1","total_visits":"2360"} +{"parsed_date":"2017-06-12 00:00:00 UTC","id":"1","total_visits":"2361"} +{"parsed_date":"2016-08-16 00:00:00 UTC","id":"1","total_visits":"2873"} +{"parsed_date":"2017-04-30 00:00:00 UTC","id":"1","total_visits":"1594"} +{"parsed_date":"2017-04-05 00:00:00 UTC","id":"1","total_visits":"2619"} +{"parsed_date":"2016-08-12 00:00:00 UTC","id":"1","total_visits":"2619"} +{"parsed_date":"2016-11-08 00:00:00 UTC","id":"1","total_visits":"3899"} +{"parsed_date":"2016-08-13 00:00:00 UTC","id":"1","total_visits":"1596"} +{"parsed_date":"2017-05-09 00:00:00 UTC","id":"1","total_visits":"2108"} +{"parsed_date":"2017-02-23 00:00:00 UTC","id":"1","total_visits":"2364"} +{"parsed_date":"2017-07-31 00:00:00 UTC","id":"1","total_visits":"2620"} +{"parsed_date":"2017-06-25 00:00:00 UTC","id":"1","total_visits":"1597"} +{"parsed_date":"2017-07-29 00:00:00 UTC","id":"1","total_visits":"1597"} +{"parsed_date":"2016-09-17 00:00:00 UTC","id":"1","total_visits":"1853"} +{"parsed_date":"2016-12-27 00:00:00 UTC","id":"1","total_visits":"1855"} +{"parsed_date":"2017-05-20 00:00:00 UTC","id":"1","total_visits":"1855"} +{"parsed_date":"2016-10-08 00:00:00 UTC","id":"1","total_visits":"2114"} +{"parsed_date":"2016-10-27 00:00:00 UTC","id":"1","total_visits":"4162"} +{"parsed_date":"2017-07-08 00:00:00 UTC","id":"1","total_visits":"1859"} +{"parsed_date":"2016-08-24 00:00:00 UTC","id":"1","total_visits":"2627"} +{"parsed_date":"2016-12-23 00:00:00 UTC","id":"1","total_visits":"1604"} +{"parsed_date":"2017-02-02 00:00:00 UTC","id":"1","total_visits":"2372"} +{"parsed_date":"2016-09-08 00:00:00 UTC","id":"1","total_visits":"2628"} +{"parsed_date":"2017-04-02 00:00:00 UTC","id":"1","total_visits":"1861"} +{"parsed_date":"2017-02-15 00:00:00 UTC","id":"1","total_visits":"2629"} +{"parsed_date":"2017-07-05 00:00:00 UTC","id":"1","total_visits":"2885"} +{"parsed_date":"2016-10-17 00:00:00 UTC","id":"1","total_visits":"3397"} +{"parsed_date":"2017-02-20 00:00:00 UTC","id":"1","total_visits":"2374"} +{"parsed_date":"2017-03-24 00:00:00 UTC","id":"1","total_visits":"2374"} +{"parsed_date":"2017-04-20 00:00:00 UTC","id":"1","total_visits":"2374"} +{"parsed_date":"2016-11-18 00:00:00 UTC","id":"1","total_visits":"3654"} +{"parsed_date":"2017-07-25 
00:00:00 UTC","id":"1","total_visits":"2631"} +{"parsed_date":"2016-11-13 00:00:00 UTC","id":"1","total_visits":"3144"} +{"parsed_date":"2017-03-18 00:00:00 UTC","id":"1","total_visits":"1610"} +{"parsed_date":"2016-08-03 00:00:00 UTC","id":"1","total_visits":"2890"} +{"parsed_date":"2016-08-19 00:00:00 UTC","id":"1","total_visits":"2379"} +{"parsed_date":"2017-02-14 00:00:00 UTC","id":"1","total_visits":"2379"} +{"parsed_date":"2017-07-11 00:00:00 UTC","id":"1","total_visits":"2635"} +{"parsed_date":"2017-04-22 00:00:00 UTC","id":"1","total_visits":"1612"} +{"parsed_date":"2016-10-07 00:00:00 UTC","id":"1","total_visits":"2892"} +{"parsed_date":"2016-09-05 00:00:00 UTC","id":"1","total_visits":"2125"} +{"parsed_date":"2016-09-23 00:00:00 UTC","id":"1","total_visits":"2381"} +{"parsed_date":"2016-11-15 00:00:00 UTC","id":"1","total_visits":"4685"} +{"parsed_date":"2017-01-28 00:00:00 UTC","id":"1","total_visits":"1614"} +{"parsed_date":"2017-07-14 00:00:00 UTC","id":"1","total_visits":"2382"} +{"parsed_date":"2017-01-07 00:00:00 UTC","id":"1","total_visits":"1615"} +{"parsed_date":"2017-04-03 00:00:00 UTC","id":"1","total_visits":"2383"} +{"parsed_date":"2017-03-20 00:00:00 UTC","id":"1","total_visits":"2383"} +{"parsed_date":"2016-12-18 00:00:00 UTC","id":"1","total_visits":"2128"} +{"parsed_date":"2017-03-17 00:00:00 UTC","id":"1","total_visits":"2129"} +{"parsed_date":"2017-05-23 00:00:00 UTC","id":"1","total_visits":"2129"} +{"parsed_date":"2016-11-30 00:00:00 UTC","id":"1","total_visits":"4435"} +{"parsed_date":"2017-01-01 00:00:00 UTC","id":"1","total_visits":"1364"} +{"parsed_date":"2017-01-02 00:00:00 UTC","id":"1","total_visits":"1620"} +{"parsed_date":"2016-09-25 00:00:00 UTC","id":"1","total_visits":"1877"} +{"parsed_date":"2016-08-07 00:00:00 UTC","id":"1","total_visits":"1622"} +{"parsed_date":"2016-10-09 00:00:00 UTC","id":"1","total_visits":"2134"} +{"parsed_date":"2017-03-01 00:00:00 UTC","id":"1","total_visits":"2390"} +{"parsed_date":"2017-01-04 00:00:00 UTC","id":"1","total_visits":"2390"} +{"parsed_date":"2017-06-06 00:00:00 UTC","id":"1","total_visits":"2391"} +{"parsed_date":"2017-04-18 00:00:00 UTC","id":"1","total_visits":"2391"} +{"parsed_date":"2017-04-06 00:00:00 UTC","id":"1","total_visits":"2647"} +{"parsed_date":"2017-01-30 00:00:00 UTC","id":"1","total_visits":"2392"} +{"parsed_date":"2016-10-16 00:00:00 UTC","id":"1","total_visits":"2649"} +{"parsed_date":"2016-08-04 00:00:00 UTC","id":"1","total_visits":"3161"} +{"parsed_date":"2016-10-21 00:00:00 UTC","id":"1","total_visits":"3419"} +{"parsed_date":"2016-08-02 00:00:00 UTC","id":"1","total_visits":"2140"} +{"parsed_date":"2017-03-06 00:00:00 UTC","id":"1","total_visits":"2396"} +{"parsed_date":"2016-09-13 00:00:00 UTC","id":"1","total_visits":"2396"} +{"parsed_date":"2016-09-14 00:00:00 UTC","id":"1","total_visits":"2652"} +{"parsed_date":"2017-04-19 00:00:00 UTC","id":"1","total_visits":"2397"} +{"parsed_date":"2017-06-19 00:00:00 UTC","id":"1","total_visits":"2142"} +{"parsed_date":"2016-12-13 00:00:00 UTC","id":"1","total_visits":"3166"} +{"parsed_date":"2017-06-20 00:00:00 UTC","id":"1","total_visits":"2143"} +{"parsed_date":"2016-10-10 00:00:00 UTC","id":"1","total_visits":"2911"} +{"parsed_date":"2017-07-06 00:00:00 UTC","id":"1","total_visits":"2658"} +{"parsed_date":"2017-01-03 00:00:00 UTC","id":"1","total_visits":"2403"} +{"parsed_date":"2017-01-08 00:00:00 UTC","id":"1","total_visits":"1637"} +{"parsed_date":"2017-02-25 00:00:00 UTC","id":"1","total_visits":"1638"} +{"parsed_date":"2017-05-24 
00:00:00 UTC","id":"1","total_visits":"2406"} +{"parsed_date":"2016-11-22 00:00:00 UTC","id":"1","total_visits":"3942"} +{"parsed_date":"2017-05-06 00:00:00 UTC","id":"1","total_visits":"1383"} +{"parsed_date":"2017-07-02 00:00:00 UTC","id":"1","total_visits":"1895"} +{"parsed_date":"2016-12-01 00:00:00 UTC","id":"1","total_visits":"4200"} +{"parsed_date":"2017-03-16 00:00:00 UTC","id":"1","total_visits":"2409"} +{"parsed_date":"2016-12-12 00:00:00 UTC","id":"1","total_visits":"3433"} +{"parsed_date":"2016-12-25 00:00:00 UTC","id":"1","total_visits":"1386"} +{"parsed_date":"2017-02-26 00:00:00 UTC","id":"1","total_visits":"1643"} +{"parsed_date":"2017-04-28 00:00:00 UTC","id":"1","total_visits":"2411"} +{"parsed_date":"2016-08-11 00:00:00 UTC","id":"1","total_visits":"2667"} +{"parsed_date":"2017-07-20 00:00:00 UTC","id":"1","total_visits":"2668"} +{"parsed_date":"2017-05-21 00:00:00 UTC","id":"1","total_visits":"1645"} +{"parsed_date":"2017-06-17 00:00:00 UTC","id":"1","total_visits":"1391"} +{"parsed_date":"2016-12-29 00:00:00 UTC","id":"1","total_visits":"1647"} +{"parsed_date":"2017-07-17 00:00:00 UTC","id":"1","total_visits":"2671"} +{"parsed_date":"2017-01-16 00:00:00 UTC","id":"1","total_visits":"1906"} +{"parsed_date":"2017-03-03 00:00:00 UTC","id":"1","total_visits":"2162"} +{"parsed_date":"2016-11-14 00:00:00 UTC","id":"1","total_visits":"4466"} +{"parsed_date":"2016-08-30 00:00:00 UTC","id":"1","total_visits":"2675"} +{"parsed_date":"2016-08-27 00:00:00 UTC","id":"1","total_visits":"1654"} +{"parsed_date":"2017-02-09 00:00:00 UTC","id":"1","total_visits":"2678"} +{"parsed_date":"2017-06-03 00:00:00 UTC","id":"1","total_visits":"1399"} +{"parsed_date":"2017-05-07 00:00:00 UTC","id":"1","total_visits":"1400"} +{"parsed_date":"2016-11-02 00:00:00 UTC","id":"1","total_visits":"3960"} +{"parsed_date":"2016-12-15 00:00:00 UTC","id":"1","total_visits":"2937"} +{"parsed_date":"2017-04-01 00:00:00 UTC","id":"1","total_visits":"2170"} +{"parsed_date":"2017-07-21 00:00:00 UTC","id":"1","total_visits":"2427"} +{"parsed_date":"2016-08-06 00:00:00 UTC","id":"1","total_visits":"1663"} +{"parsed_date":"2016-09-01 00:00:00 UTC","id":"1","total_visits":"2687"} +{"parsed_date":"2017-06-28 00:00:00 UTC","id":"1","total_visits":"2687"} +{"parsed_date":"2016-08-20 00:00:00 UTC","id":"1","total_visits":"1664"} +{"parsed_date":"2017-04-26 00:00:00 UTC","id":"1","total_visits":"4224"} +{"parsed_date":"2017-07-09 00:00:00 UTC","id":"1","total_visits":"1921"} +{"parsed_date":"2017-07-28 00:00:00 UTC","id":"1","total_visits":"2433"} +{"parsed_date":"2016-09-19 00:00:00 UTC","id":"1","total_visits":"2689"} +{"parsed_date":"2017-07-24 00:00:00 UTC","id":"1","total_visits":"2436"} +{"parsed_date":"2017-06-13 00:00:00 UTC","id":"1","total_visits":"2181"} +{"parsed_date":"2016-09-15 00:00:00 UTC","id":"1","total_visits":"2949"} +{"parsed_date":"2017-02-03 00:00:00 UTC","id":"1","total_visits":"2182"} +{"parsed_date":"2016-09-10 00:00:00 UTC","id":"1","total_visits":"1671"} +{"parsed_date":"2017-06-09 00:00:00 UTC","id":"1","total_visits":"1927"} +{"parsed_date":"2017-01-11 00:00:00 UTC","id":"1","total_visits":"2185"} +{"parsed_date":"2017-02-19 00:00:00 UTC","id":"1","total_visits":"2187"} +{"parsed_date":"2017-01-17 00:00:00 UTC","id":"1","total_visits":"2443"} +{"parsed_date":"2017-05-12 00:00:00 UTC","id":"1","total_visits":"1932"} +{"parsed_date":"2016-12-16 00:00:00 UTC","id":"1","total_visits":"2956"} +{"parsed_date":"2017-02-01 00:00:00 UTC","id":"1","total_visits":"2445"} +{"parsed_date":"2016-11-26 
00:00:00 UTC","id":"1","total_visits":"3213"} +{"parsed_date":"2017-06-02 00:00:00 UTC","id":"1","total_visits":"2190"} +{"parsed_date":"2016-08-05 00:00:00 UTC","id":"1","total_visits":"2702"} +{"parsed_date":"2016-11-01 00:00:00 UTC","id":"1","total_visits":"3728"} +{"parsed_date":"2017-01-05 00:00:00 UTC","id":"1","total_visits":"2193"} +{"parsed_date":"2017-03-08 00:00:00 UTC","id":"1","total_visits":"2449"} +{"parsed_date":"2016-08-28 00:00:00 UTC","id":"1","total_visits":"1682"} +{"parsed_date":"2017-07-04 00:00:00 UTC","id":"1","total_visits":"1938"} +{"parsed_date":"2017-03-10 00:00:00 UTC","id":"1","total_visits":"2194"} +{"parsed_date":"2017-07-07 00:00:00 UTC","id":"1","total_visits":"2450"} +{"parsed_date":"2016-10-29 00:00:00 UTC","id":"1","total_visits":"2964"} +{"parsed_date":"2016-10-13 00:00:00 UTC","id":"1","total_visits":"2964"} +{"parsed_date":"2016-12-04 00:00:00 UTC","id":"1","total_visits":"3220"} +{"parsed_date":"2017-01-21 00:00:00 UTC","id":"1","total_visits":"1685"} +{"parsed_date":"2017-06-29 00:00:00 UTC","id":"1","total_visits":"2709"} +{"parsed_date":"2016-08-29 00:00:00 UTC","id":"1","total_visits":"2454"} +{"parsed_date":"2016-12-19 00:00:00 UTC","id":"1","total_visits":"3222"} +{"parsed_date":"2017-05-30 00:00:00 UTC","id":"1","total_visits":"2199"} +{"parsed_date":"2017-02-10 00:00:00 UTC","id":"1","total_visits":"2199"} +{"parsed_date":"2016-08-31 00:00:00 UTC","id":"1","total_visits":"3223"} +{"parsed_date":"2017-06-18 00:00:00 UTC","id":"1","total_visits":"1432"} +{"parsed_date":"2017-01-12 00:00:00 UTC","id":"1","total_visits":"2203"} +{"parsed_date":"2017-05-18 00:00:00 UTC","id":"1","total_visits":"2715"} +{"parsed_date":"2016-10-23 00:00:00 UTC","id":"1","total_visits":"2971"} +{"parsed_date":"2016-09-04 00:00:00 UTC","id":"1","total_visits":"1692"} +{"parsed_date":"2016-12-10 00:00:00 UTC","id":"1","total_visits":"2207"} +{"parsed_date":"2016-12-11 00:00:00 UTC","id":"1","total_visits":"2208"} +{"parsed_date":"2017-04-11 00:00:00 UTC","id":"1","total_visits":"2464"} +{"parsed_date":"2016-09-21 00:00:00 UTC","id":"1","total_visits":"2720"} +{"parsed_date":"2016-11-06 00:00:00 UTC","id":"1","total_visits":"3232"} +{"parsed_date":"2017-01-26 00:00:00 UTC","id":"1","total_visits":"2209"} +{"parsed_date":"2016-09-12 00:00:00 UTC","id":"1","total_visits":"2465"} +{"parsed_date":"2017-04-21 00:00:00 UTC","id":"1","total_visits":"2210"} +{"parsed_date":"2017-01-06 00:00:00 UTC","id":"1","total_visits":"2210"} +{"parsed_date":"2017-04-04 00:00:00 UTC","id":"1","total_visits":"2978"} +{"parsed_date":"2017-01-22 00:00:00 UTC","id":"1","total_visits":"1700"} +{"parsed_date":"2017-07-26 00:00:00 UTC","id":"1","total_visits":"2725"} +{"parsed_date":"2016-08-18 00:00:00 UTC","id":"1","total_visits":"2725"} +{"parsed_date":"2016-09-27 00:00:00 UTC","id":"1","total_visits":"2727"} +{"parsed_date":"2016-12-02 00:00:00 UTC","id":"1","total_visits":"3751"} +{"parsed_date":"2017-05-05 00:00:00 UTC","id":"1","total_visits":"1960"} +{"parsed_date":"2016-11-19 00:00:00 UTC","id":"1","total_visits":"2984"} +{"parsed_date":"2016-11-09 00:00:00 UTC","id":"1","total_visits":"3752"} +{"parsed_date":"2016-12-05 00:00:00 UTC","id":"1","total_visits":"4265"} +{"parsed_date":"2017-05-11 00:00:00 UTC","id":"1","total_visits":"2218"} +{"parsed_date":"2017-01-25 00:00:00 UTC","id":"1","total_visits":"2986"} +{"parsed_date":"2017-03-11 00:00:00 UTC","id":"1","total_visits":"1707"} +{"parsed_date":"2017-03-30 00:00:00 UTC","id":"1","total_visits":"2731"} +{"parsed_date":"2016-10-20 
00:00:00 UTC","id":"1","total_visits":"3755"} +{"parsed_date":"2017-02-07 00:00:00 UTC","id":"1","total_visits":"2476"} +{"parsed_date":"2017-02-22 00:00:00 UTC","id":"1","total_visits":"2477"} +{"parsed_date":"2017-07-23 00:00:00 UTC","id":"1","total_visits":"1966"} +{"parsed_date":"2016-11-03 00:00:00 UTC","id":"1","total_visits":"4014"} +{"parsed_date":"2016-08-01 00:00:00 UTC","id":"1","total_visits":"1711"} +{"parsed_date":"2017-01-13 00:00:00 UTC","id":"1","total_visits":"1967"} +{"parsed_date":"2017-05-19 00:00:00 UTC","id":"1","total_visits":"2223"} +{"parsed_date":"2016-11-20 00:00:00 UTC","id":"1","total_visits":"3247"} +{"parsed_date":"2016-11-25 00:00:00 UTC","id":"1","total_visits":"3759"} +{"parsed_date":"2017-03-25 00:00:00 UTC","id":"1","total_visits":"1712"} +{"parsed_date":"2017-01-27 00:00:00 UTC","id":"1","total_visits":"1969"} +{"parsed_date":"2017-06-26 00:00:00 UTC","id":"1","total_visits":"2226"} +{"parsed_date":"2017-05-25 00:00:00 UTC","id":"1","total_visits":"2228"} +{"parsed_date":"2017-01-31 00:00:00 UTC","id":"1","total_visits":"2229"} +{"parsed_date":"2017-07-13 00:00:00 UTC","id":"1","total_visits":"2741"} +{"parsed_date":"2017-03-15 00:00:00 UTC","id":"1","total_visits":"2486"} +{"parsed_date":"2017-05-28 00:00:00 UTC","id":"1","total_visits":"1463"} +{"parsed_date":"2017-03-09 00:00:00 UTC","id":"1","total_visits":"2231"} +{"parsed_date":"2017-07-15 00:00:00 UTC","id":"1","total_visits":"1721"} +{"parsed_date":"2016-11-24 00:00:00 UTC","id":"1","total_visits":"3770"} +{"parsed_date":"2016-10-05 00:00:00 UTC","id":"1","total_visits":"3770"} +{"parsed_date":"2016-12-31 00:00:00 UTC","id":"1","total_visits":"1211"} +{"parsed_date":"2016-10-02 00:00:00 UTC","id":"1","total_visits":"1724"} +{"parsed_date":"2017-07-22 00:00:00 UTC","id":"1","total_visits":"1724"} +{"parsed_date":"2016-09-11 00:00:00 UTC","id":"1","total_visits":"1725"} +{"parsed_date":"2017-06-15 00:00:00 UTC","id":"1","total_visits":"2237"} +{"parsed_date":"2017-06-05 00:00:00 UTC","id":"1","total_visits":"2493"} +{"parsed_date":"2017-02-06 00:00:00 UTC","id":"1","total_visits":"2238"} +{"parsed_date":"2016-10-15 00:00:00 UTC","id":"1","total_visits":"2495"} +{"parsed_date":"2016-08-21 00:00:00 UTC","id":"1","total_visits":"1730"} +{"parsed_date":"2016-08-23 00:00:00 UTC","id":"1","total_visits":"2754"} +{"parsed_date":"2017-06-30 00:00:00 UTC","id":"1","total_visits":"2499"} +{"parsed_date":"2017-01-18 00:00:00 UTC","id":"1","total_visits":"2245"} +{"parsed_date":"2016-08-10 00:00:00 UTC","id":"1","total_visits":"2757"} +{"parsed_date":"2016-12-08 00:00:00 UTC","id":"1","total_visits":"3013"} +{"parsed_date":"2016-11-28 00:00:00 UTC","id":"1","total_visits":"4807"} +{"parsed_date":"2017-05-22 00:00:00 UTC","id":"1","total_visits":"2248"} +{"parsed_date":"2016-09-20 00:00:00 UTC","id":"1","total_visits":"2760"} +{"parsed_date":"2016-10-06 00:00:00 UTC","id":"1","total_visits":"3016"} +{"parsed_date":"2016-09-06 00:00:00 UTC","id":"1","total_visits":"2508"} +{"parsed_date":"2016-09-03 00:00:00 UTC","id":"1","total_visits":"1741"} +{"parsed_date":"2016-12-06 00:00:00 UTC","id":"1","total_visits":"3021"} +{"parsed_date":"2016-12-24 00:00:00 UTC","id":"1","total_visits":"1231"} +{"parsed_date":"2016-10-28 00:00:00 UTC","id":"1","total_visits":"3791"} +{"parsed_date":"2016-12-30 00:00:00 UTC","id":"1","total_visits":"1232"} +{"parsed_date":"2017-05-29 00:00:00 UTC","id":"1","total_visits":"1745"} +{"parsed_date":"2017-07-10 00:00:00 UTC","id":"1","total_visits":"2769"} +{"parsed_date":"2017-06-22 
00:00:00 UTC","id":"1","total_visits":"2258"} +{"parsed_date":"2017-07-19 00:00:00 UTC","id":"1","total_visits":"2514"} +{"parsed_date":"2016-10-03 00:00:00 UTC","id":"1","total_visits":"2514"} +{"parsed_date":"2017-06-14 00:00:00 UTC","id":"1","total_visits":"2517"} +{"parsed_date":"2016-10-22 00:00:00 UTC","id":"1","total_visits":"3029"} +{"parsed_date":"2017-01-23 00:00:00 UTC","id":"1","total_visits":"2262"} +{"parsed_date":"2017-04-24 00:00:00 UTC","id":"1","total_visits":"2263"} +{"parsed_date":"2016-11-10 00:00:00 UTC","id":"1","total_visits":"4055"} +{"parsed_date":"2016-09-26 00:00:00 UTC","id":"1","total_visits":"2776"} +{"parsed_date":"2016-10-19 00:00:00 UTC","id":"1","total_visits":"3544"} +{"parsed_date":"2017-03-04 00:00:00 UTC","id":"1","total_visits":"1753"} +{"parsed_date":"2017-05-26 00:00:00 UTC","id":"1","total_visits":"2009"} +{"parsed_date":"2017-02-13 00:00:00 UTC","id":"1","total_visits":"2266"} +{"parsed_date":"2017-02-18 00:00:00 UTC","id":"1","total_visits":"1755"} +{"parsed_date":"2017-03-02 00:00:00 UTC","id":"1","total_visits":"2267"} +{"parsed_date":"2017-03-31 00:00:00 UTC","id":"1","total_visits":"2268"} +{"parsed_date":"2017-01-10 00:00:00 UTC","id":"1","total_visits":"2268"} +{"parsed_date":"2017-03-29 00:00:00 UTC","id":"1","total_visits":"2525"} +{"parsed_date":"2017-03-27 00:00:00 UTC","id":"1","total_visits":"2525"} +{"parsed_date":"2016-11-23 00:00:00 UTC","id":"1","total_visits":"3805"} +{"parsed_date":"2017-05-27 00:00:00 UTC","id":"1","total_visits":"1502"} +{"parsed_date":"2016-10-24 00:00:00 UTC","id":"1","total_visits":"4063"} +{"parsed_date":"2016-12-14 00:00:00 UTC","id":"1","total_visits":"3040"} +{"parsed_date":"2017-02-11 00:00:00 UTC","id":"1","total_visits":"1761"} +{"parsed_date":"2017-07-27 00:00:00 UTC","id":"1","total_visits":"2529"} +{"parsed_date":"2017-02-17 00:00:00 UTC","id":"1","total_visits":"2785"} +{"parsed_date":"2017-04-15 00:00:00 UTC","id":"1","total_visits":"1506"} +{"parsed_date":"2016-11-05 00:00:00 UTC","id":"1","total_visits":"3042"} +{"parsed_date":"2016-10-04 00:00:00 UTC","id":"1","total_visits":"4322"} +{"parsed_date":"2017-05-13 00:00:00 UTC","id":"1","total_visits":"1251"} +{"parsed_date":"2017-04-16 00:00:00 UTC","id":"1","total_visits":"1507"} +{"parsed_date":"2016-12-28 00:00:00 UTC","id":"1","total_visits":"1763"} +{"parsed_date":"2016-08-15 00:00:00 UTC","id":"1","total_visits":"3043"} +{"parsed_date":"2016-12-03 00:00:00 UTC","id":"1","total_visits":"3044"} +{"parsed_date":"2017-06-27 00:00:00 UTC","id":"1","total_visits":"2789"} +{"parsed_date":"2017-06-24 00:00:00 UTC","id":"1","total_visits":"1510"} +{"parsed_date":"2017-07-16 00:00:00 UTC","id":"1","total_visits":"1766"} +{"parsed_date":"2017-04-09 00:00:00 UTC","id":"1","total_visits":"1766"} +{"parsed_date":"2017-06-07 00:00:00 UTC","id":"1","total_visits":"2279"} +{"parsed_date":"2017-04-17 00:00:00 UTC","id":"1","total_visits":"2279"} +{"parsed_date":"2016-09-28 00:00:00 UTC","id":"1","total_visits":"2535"} +{"parsed_date":"2017-03-26 00:00:00 UTC","id":"1","total_visits":"1768"} +{"parsed_date":"2017-05-10 00:00:00 UTC","id":"1","total_visits":"2024"} +{"parsed_date":"2017-06-08 00:00:00 UTC","id":"1","total_visits":"2280"} +{"parsed_date":"2017-05-08 00:00:00 UTC","id":"1","total_visits":"2025"} +{"parsed_date":"2017-03-13 00:00:00 UTC","id":"1","total_visits":"2537"} +{"parsed_date":"2016-11-17 00:00:00 UTC","id":"1","total_visits":"4074"} +{"parsed_date":"2016-08-25 00:00:00 UTC","id":"1","total_visits":"2539"} +{"parsed_date":"2017-02-16 
00:00:00 UTC","id":"1","total_visits":"2539"} +{"parsed_date":"2017-06-16 00:00:00 UTC","id":"1","total_visits":"2028"} +{"parsed_date":"2016-11-16 00:00:00 UTC","id":"1","total_visits":"4334"} +{"parsed_date":"2016-08-17 00:00:00 UTC","id":"1","total_visits":"2800"} +{"parsed_date":"2017-03-19 00:00:00 UTC","id":"1","total_visits":"1776"} +{"parsed_date":"2016-11-29 00:00:00 UTC","id":"1","total_visits":"4337"} +{"parsed_date":"2017-02-05 00:00:00 UTC","id":"1","total_visits":"1522"} +{"parsed_date":"2016-10-31 00:00:00 UTC","id":"1","total_visits":"3827"} +{"parsed_date":"2017-05-31 00:00:00 UTC","id":"1","total_visits":"2292"} +{"parsed_date":"2017-07-18 00:00:00 UTC","id":"1","total_visits":"2804"} +{"parsed_date":"2017-03-12 00:00:00 UTC","id":"1","total_visits":"1781"} +{"parsed_date":"2016-09-09 00:00:00 UTC","id":"1","total_visits":"2549"} +{"parsed_date":"2017-01-14 00:00:00 UTC","id":"1","total_visits":"1526"} +{"parsed_date":"2017-05-04 00:00:00 UTC","id":"1","total_visits":"2806"} +{"parsed_date":"2016-11-07 00:00:00 UTC","id":"1","total_visits":"3832"} +{"parsed_date":"2017-04-07 00:00:00 UTC","id":"1","total_visits":"2297"} +{"parsed_date":"2017-07-12 00:00:00 UTC","id":"1","total_visits":"2554"} +{"parsed_date":"2017-04-13 00:00:00 UTC","id":"1","total_visits":"2300"} +{"parsed_date":"2017-08-01 00:00:00 UTC","id":"1","total_visits":"2556"} +{"parsed_date":"2017-06-04 00:00:00 UTC","id":"1","total_visits":"1534"} +{"parsed_date":"2017-02-12 00:00:00 UTC","id":"1","total_visits":"1790"} +{"parsed_date":"2017-07-03 00:00:00 UTC","id":"1","total_visits":"2046"} +{"parsed_date":"2016-09-30 00:00:00 UTC","id":"1","total_visits":"2303"} +{"parsed_date":"2016-08-08 00:00:00 UTC","id":"1","total_visits":"2815"} +{"parsed_date":"2017-07-01 00:00:00 UTC","id":"2","total_visits":"2048"} +{"parsed_date":"2016-09-07 00:00:00 UTC","id":"2","total_visits":"2562"} +{"parsed_date":"2016-10-25 00:00:00 UTC","id":"2","total_visits":"3842"} +{"parsed_date":"2017-04-10 00:00:00 UTC","id":"2","total_visits":"2563"} +{"parsed_date":"2017-01-09 00:00:00 UTC","id":"2","total_visits":"2308"} +{"parsed_date":"2017-05-02 00:00:00 UTC","id":"2","total_visits":"2564"} +{"parsed_date":"2016-11-11 00:00:00 UTC","id":"2","total_visits":"3588"} +{"parsed_date":"2017-07-30 00:00:00 UTC","id":"2","total_visits":"1799"} +{"parsed_date":"2017-06-10 00:00:00 UTC","id":"2","total_visits":"1545"} +{"parsed_date":"2016-08-14 00:00:00 UTC","id":"2","total_visits":"1801"} +{"parsed_date":"2017-05-14 00:00:00 UTC","id":"2","total_visits":"1290"} +{"parsed_date":"2017-02-08 00:00:00 UTC","id":"2","total_visits":"2570"} +{"parsed_date":"2017-06-01 00:00:00 UTC","id":"2","total_visits":"2826"} +{"parsed_date":"2017-04-23 00:00:00 UTC","id":"2","total_visits":"1548"} +{"parsed_date":"2016-11-04 00:00:00 UTC","id":"2","total_visits":"3596"} +{"parsed_date":"2017-02-04 00:00:00 UTC","id":"2","total_visits":"1549"} +{"parsed_date":"2016-12-09 00:00:00 UTC","id":"2","total_visits":"2830"} +{"parsed_date":"2016-10-30 00:00:00 UTC","id":"2","total_visits":"3086"} +{"parsed_date":"2017-03-28 00:00:00 UTC","id":"2","total_visits":"2577"} +{"parsed_date":"2017-06-11 00:00:00 UTC","id":"2","total_visits":"1555"} +{"parsed_date":"2016-12-17 00:00:00 UTC","id":"2","total_visits":"2324"} +{"parsed_date":"2016-09-22 00:00:00 UTC","id":"2","total_visits":"2581"} +{"parsed_date":"2017-01-29 00:00:00 UTC","id":"2","total_visits":"1814"} +{"parsed_date":"2017-03-22 00:00:00 UTC","id":"2","total_visits":"2582"} +{"parsed_date":"2017-02-21 
00:00:00 UTC","id":"2","total_visits":"2582"} +{"parsed_date":"2016-10-14 00:00:00 UTC","id":"2","total_visits":"2838"} +{"parsed_date":"2017-04-27 00:00:00 UTC","id":"2","total_visits":"2838"} +{"parsed_date":"2016-10-26 00:00:00 UTC","id":"2","total_visits":"4375"} +{"parsed_date":"2016-08-22 00:00:00 UTC","id":"2","total_visits":"2584"} +{"parsed_date":"2016-12-07 00:00:00 UTC","id":"2","total_visits":"2840"} +{"parsed_date":"2017-01-20 00:00:00 UTC","id":"2","total_visits":"2074"} +{"parsed_date":"2017-03-07 00:00:00 UTC","id":"2","total_visits":"2586"} +{"parsed_date":"2017-05-16 00:00:00 UTC","id":"2","total_visits":"3098"} +{"parsed_date":"2017-05-03 00:00:00 UTC","id":"2","total_visits":"2588"} +{"parsed_date":"2017-05-01 00:00:00 UTC","id":"2","total_visits":"2588"} +{"parsed_date":"2016-11-27 00:00:00 UTC","id":"2","total_visits":"3356"} +{"parsed_date":"2017-04-29 00:00:00 UTC","id":"2","total_visits":"1566"} +{"parsed_date":"2016-09-18 00:00:00 UTC","id":"2","total_visits":"1822"} +{"parsed_date":"2017-03-23 00:00:00 UTC","id":"2","total_visits":"2847"} +{"parsed_date":"2017-03-14 00:00:00 UTC","id":"2","total_visits":"2338"} +{"parsed_date":"2016-12-21 00:00:00 UTC","id":"2","total_visits":"2594"} +{"parsed_date":"2016-10-11 00:00:00 UTC","id":"2","total_visits":"2850"} +{"parsed_date":"2017-01-24 00:00:00 UTC","id":"2","total_visits":"3618"} +{"parsed_date":"2017-03-05 00:00:00 UTC","id":"2","total_visits":"1827"} +{"parsed_date":"2017-01-19 00:00:00 UTC","id":"2","total_visits":"2083"} +{"parsed_date":"2016-08-09 00:00:00 UTC","id":"2","total_visits":"2851"} +{"parsed_date":"2017-04-08 00:00:00 UTC","id":"2","total_visits":"1829"} +{"parsed_date":"2017-04-12 00:00:00 UTC","id":"2","total_visits":"2341"} +{"parsed_date":"2016-09-29 00:00:00 UTC","id":"2","total_visits":"2597"} +{"parsed_date":"2016-12-20 00:00:00 UTC","id":"2","total_visits":"3110"} +{"parsed_date":"2017-01-15 00:00:00 UTC","id":"2","total_visits":"1576"} +{"parsed_date":"2017-04-14 00:00:00 UTC","id":"2","total_visits":"1834"} +{"parsed_date":"2017-02-28 00:00:00 UTC","id":"2","total_visits":"2347"} +{"parsed_date":"2016-09-16 00:00:00 UTC","id":"2","total_visits":"2603"} +{"parsed_date":"2016-10-18 00:00:00 UTC","id":"2","total_visits":"3628"} +{"parsed_date":"2017-02-24 00:00:00 UTC","id":"2","total_visits":"2093"} +{"parsed_date":"2017-05-17 00:00:00 UTC","id":"2","total_visits":"3117"} +{"parsed_date":"2017-06-23 00:00:00 UTC","id":"2","total_visits":"2095"} +{"parsed_date":"2016-11-12 00:00:00 UTC","id":"2","total_visits":"3119"} +{"parsed_date":"2016-11-21 00:00:00 UTC","id":"2","total_visits":"4143"} +{"parsed_date":"2017-02-27 00:00:00 UTC","id":"2","total_visits":"2352"} +{"parsed_date":"2016-12-26 00:00:00 UTC","id":"2","total_visits":"1586"} +{"parsed_date":"2017-04-25 00:00:00 UTC","id":"2","total_visits":"2354"} +{"parsed_date":"2017-03-21 00:00:00 UTC","id":"2","total_visits":"2611"} +{"parsed_date":"2016-12-22 00:00:00 UTC","id":"2","total_visits":"2100"} +{"parsed_date":"2016-10-01 00:00:00 UTC","id":"2","total_visits":"1589"} +{"parsed_date":"2016-09-24 00:00:00 UTC","id":"2","total_visits":"1845"} +{"parsed_date":"2017-06-21 00:00:00 UTC","id":"2","total_visits":"2357"} +{"parsed_date":"2016-09-02 00:00:00 UTC","id":"2","total_visits":"2613"} +{"parsed_date":"2016-08-26 00:00:00 UTC","id":"2","total_visits":"2359"} +{"parsed_date":"2016-10-12 00:00:00 UTC","id":"2","total_visits":"2871"} +{"parsed_date":"2017-05-15 00:00:00 UTC","id":"2","total_visits":"2360"} +{"parsed_date":"2017-06-12 
00:00:00 UTC","id":"2","total_visits":"2361"} +{"parsed_date":"2016-08-16 00:00:00 UTC","id":"2","total_visits":"2873"} +{"parsed_date":"2017-04-30 00:00:00 UTC","id":"2","total_visits":"1594"} +{"parsed_date":"2017-04-05 00:00:00 UTC","id":"2","total_visits":"2619"} +{"parsed_date":"2016-08-12 00:00:00 UTC","id":"2","total_visits":"2619"} +{"parsed_date":"2016-11-08 00:00:00 UTC","id":"2","total_visits":"3899"} +{"parsed_date":"2016-08-13 00:00:00 UTC","id":"2","total_visits":"1596"} +{"parsed_date":"2017-05-09 00:00:00 UTC","id":"2","total_visits":"2108"} +{"parsed_date":"2017-02-23 00:00:00 UTC","id":"2","total_visits":"2364"} +{"parsed_date":"2017-07-31 00:00:00 UTC","id":"2","total_visits":"2620"} +{"parsed_date":"2017-06-25 00:00:00 UTC","id":"2","total_visits":"1597"} +{"parsed_date":"2017-07-29 00:00:00 UTC","id":"2","total_visits":"1597"} +{"parsed_date":"2016-09-17 00:00:00 UTC","id":"2","total_visits":"1853"} +{"parsed_date":"2016-12-27 00:00:00 UTC","id":"2","total_visits":"1855"} +{"parsed_date":"2017-05-20 00:00:00 UTC","id":"2","total_visits":"1855"} +{"parsed_date":"2016-10-08 00:00:00 UTC","id":"2","total_visits":"2114"} +{"parsed_date":"2016-10-27 00:00:00 UTC","id":"2","total_visits":"4162"} +{"parsed_date":"2017-07-08 00:00:00 UTC","id":"2","total_visits":"1859"} +{"parsed_date":"2016-08-24 00:00:00 UTC","id":"2","total_visits":"2627"} +{"parsed_date":"2016-12-23 00:00:00 UTC","id":"2","total_visits":"1604"} +{"parsed_date":"2017-02-02 00:00:00 UTC","id":"2","total_visits":"2372"} +{"parsed_date":"2016-09-08 00:00:00 UTC","id":"2","total_visits":"2628"} +{"parsed_date":"2017-04-02 00:00:00 UTC","id":"2","total_visits":"1861"} +{"parsed_date":"2017-02-15 00:00:00 UTC","id":"2","total_visits":"2629"} +{"parsed_date":"2017-07-05 00:00:00 UTC","id":"2","total_visits":"2885"} +{"parsed_date":"2016-10-17 00:00:00 UTC","id":"2","total_visits":"3397"} +{"parsed_date":"2017-02-20 00:00:00 UTC","id":"2","total_visits":"2374"} +{"parsed_date":"2017-03-24 00:00:00 UTC","id":"2","total_visits":"2374"} +{"parsed_date":"2017-04-20 00:00:00 UTC","id":"2","total_visits":"2374"} +{"parsed_date":"2016-11-18 00:00:00 UTC","id":"2","total_visits":"3654"} +{"parsed_date":"2017-07-25 00:00:00 UTC","id":"2","total_visits":"2631"} +{"parsed_date":"2016-11-13 00:00:00 UTC","id":"2","total_visits":"3144"} +{"parsed_date":"2017-03-18 00:00:00 UTC","id":"2","total_visits":"1610"} +{"parsed_date":"2016-08-03 00:00:00 UTC","id":"2","total_visits":"2890"} +{"parsed_date":"2016-08-19 00:00:00 UTC","id":"2","total_visits":"2379"} +{"parsed_date":"2017-02-14 00:00:00 UTC","id":"2","total_visits":"2379"} +{"parsed_date":"2017-07-11 00:00:00 UTC","id":"2","total_visits":"2635"} +{"parsed_date":"2017-04-22 00:00:00 UTC","id":"2","total_visits":"1612"} +{"parsed_date":"2016-10-07 00:00:00 UTC","id":"2","total_visits":"2892"} +{"parsed_date":"2016-09-05 00:00:00 UTC","id":"2","total_visits":"2125"} +{"parsed_date":"2016-09-23 00:00:00 UTC","id":"2","total_visits":"2381"} +{"parsed_date":"2016-11-15 00:00:00 UTC","id":"2","total_visits":"4685"} +{"parsed_date":"2017-01-28 00:00:00 UTC","id":"2","total_visits":"1614"} +{"parsed_date":"2017-07-14 00:00:00 UTC","id":"2","total_visits":"2382"} +{"parsed_date":"2017-01-07 00:00:00 UTC","id":"2","total_visits":"1615"} +{"parsed_date":"2017-04-03 00:00:00 UTC","id":"2","total_visits":"2383"} +{"parsed_date":"2017-03-20 00:00:00 UTC","id":"2","total_visits":"2383"} +{"parsed_date":"2016-12-18 00:00:00 UTC","id":"2","total_visits":"2128"} +{"parsed_date":"2017-03-17 
00:00:00 UTC","id":"2","total_visits":"2129"} +{"parsed_date":"2017-05-23 00:00:00 UTC","id":"2","total_visits":"2129"} +{"parsed_date":"2016-11-30 00:00:00 UTC","id":"2","total_visits":"4435"} +{"parsed_date":"2017-01-01 00:00:00 UTC","id":"2","total_visits":"1364"} +{"parsed_date":"2017-01-02 00:00:00 UTC","id":"2","total_visits":"1620"} +{"parsed_date":"2016-09-25 00:00:00 UTC","id":"2","total_visits":"1877"} +{"parsed_date":"2016-08-07 00:00:00 UTC","id":"2","total_visits":"1622"} +{"parsed_date":"2016-10-09 00:00:00 UTC","id":"2","total_visits":"2134"} +{"parsed_date":"2017-03-01 00:00:00 UTC","id":"2","total_visits":"2390"} +{"parsed_date":"2017-01-04 00:00:00 UTC","id":"2","total_visits":"2390"} +{"parsed_date":"2017-06-06 00:00:00 UTC","id":"2","total_visits":"2391"} +{"parsed_date":"2017-04-18 00:00:00 UTC","id":"2","total_visits":"2391"} +{"parsed_date":"2017-04-06 00:00:00 UTC","id":"2","total_visits":"2647"} +{"parsed_date":"2017-01-30 00:00:00 UTC","id":"2","total_visits":"2392"} +{"parsed_date":"2016-10-16 00:00:00 UTC","id":"2","total_visits":"2649"} +{"parsed_date":"2016-08-04 00:00:00 UTC","id":"2","total_visits":"3161"} +{"parsed_date":"2016-10-21 00:00:00 UTC","id":"2","total_visits":"3419"} +{"parsed_date":"2016-08-02 00:00:00 UTC","id":"2","total_visits":"2140"} +{"parsed_date":"2017-03-06 00:00:00 UTC","id":"2","total_visits":"2396"} +{"parsed_date":"2016-09-13 00:00:00 UTC","id":"2","total_visits":"2396"} +{"parsed_date":"2016-09-14 00:00:00 UTC","id":"2","total_visits":"2652"} +{"parsed_date":"2017-04-19 00:00:00 UTC","id":"2","total_visits":"2397"} +{"parsed_date":"2017-06-19 00:00:00 UTC","id":"2","total_visits":"2142"} +{"parsed_date":"2016-12-13 00:00:00 UTC","id":"2","total_visits":"3166"} +{"parsed_date":"2017-06-20 00:00:00 UTC","id":"2","total_visits":"2143"} +{"parsed_date":"2016-10-10 00:00:00 UTC","id":"2","total_visits":"2911"} +{"parsed_date":"2017-07-06 00:00:00 UTC","id":"2","total_visits":"2658"} +{"parsed_date":"2017-01-03 00:00:00 UTC","id":"2","total_visits":"2403"} +{"parsed_date":"2017-01-08 00:00:00 UTC","id":"2","total_visits":"1637"} +{"parsed_date":"2017-02-25 00:00:00 UTC","id":"2","total_visits":"1638"} +{"parsed_date":"2017-05-24 00:00:00 UTC","id":"2","total_visits":"2406"} +{"parsed_date":"2016-11-22 00:00:00 UTC","id":"2","total_visits":"3942"} +{"parsed_date":"2017-05-06 00:00:00 UTC","id":"2","total_visits":"1383"} +{"parsed_date":"2017-07-02 00:00:00 UTC","id":"2","total_visits":"1895"} +{"parsed_date":"2016-12-01 00:00:00 UTC","id":"2","total_visits":"4200"} +{"parsed_date":"2017-03-16 00:00:00 UTC","id":"2","total_visits":"2409"} +{"parsed_date":"2016-12-12 00:00:00 UTC","id":"2","total_visits":"3433"} +{"parsed_date":"2016-12-25 00:00:00 UTC","id":"2","total_visits":"1386"} +{"parsed_date":"2017-02-26 00:00:00 UTC","id":"2","total_visits":"1643"} +{"parsed_date":"2017-04-28 00:00:00 UTC","id":"2","total_visits":"2411"} +{"parsed_date":"2016-08-11 00:00:00 UTC","id":"2","total_visits":"2667"} +{"parsed_date":"2017-07-20 00:00:00 UTC","id":"2","total_visits":"2668"} +{"parsed_date":"2017-05-21 00:00:00 UTC","id":"2","total_visits":"1645"} +{"parsed_date":"2017-06-17 00:00:00 UTC","id":"2","total_visits":"1391"} +{"parsed_date":"2016-12-29 00:00:00 UTC","id":"2","total_visits":"1647"} +{"parsed_date":"2017-07-17 00:00:00 UTC","id":"2","total_visits":"2671"} +{"parsed_date":"2017-01-16 00:00:00 UTC","id":"2","total_visits":"1906"} +{"parsed_date":"2017-03-03 00:00:00 UTC","id":"2","total_visits":"2162"} +{"parsed_date":"2016-11-14 
00:00:00 UTC","id":"2","total_visits":"4466"} +{"parsed_date":"2016-08-30 00:00:00 UTC","id":"2","total_visits":"2675"} +{"parsed_date":"2016-08-27 00:00:00 UTC","id":"2","total_visits":"1654"} +{"parsed_date":"2017-02-09 00:00:00 UTC","id":"2","total_visits":"2678"} +{"parsed_date":"2017-06-03 00:00:00 UTC","id":"2","total_visits":"1399"} +{"parsed_date":"2017-05-07 00:00:00 UTC","id":"2","total_visits":"1400"} +{"parsed_date":"2016-11-02 00:00:00 UTC","id":"2","total_visits":"3960"} +{"parsed_date":"2016-12-15 00:00:00 UTC","id":"2","total_visits":"2937"} +{"parsed_date":"2017-04-01 00:00:00 UTC","id":"2","total_visits":"2170"} +{"parsed_date":"2017-07-21 00:00:00 UTC","id":"2","total_visits":"2427"} +{"parsed_date":"2016-08-06 00:00:00 UTC","id":"2","total_visits":"1663"} +{"parsed_date":"2016-09-01 00:00:00 UTC","id":"2","total_visits":"2687"} +{"parsed_date":"2017-06-28 00:00:00 UTC","id":"2","total_visits":"2687"} +{"parsed_date":"2016-08-20 00:00:00 UTC","id":"2","total_visits":"1664"} +{"parsed_date":"2017-04-26 00:00:00 UTC","id":"2","total_visits":"4224"} +{"parsed_date":"2017-07-09 00:00:00 UTC","id":"2","total_visits":"1921"} +{"parsed_date":"2017-07-28 00:00:00 UTC","id":"2","total_visits":"2433"} +{"parsed_date":"2016-09-19 00:00:00 UTC","id":"2","total_visits":"2689"} +{"parsed_date":"2017-07-24 00:00:00 UTC","id":"2","total_visits":"2436"} +{"parsed_date":"2017-06-13 00:00:00 UTC","id":"2","total_visits":"2181"} +{"parsed_date":"2016-09-15 00:00:00 UTC","id":"2","total_visits":"2949"} +{"parsed_date":"2017-02-03 00:00:00 UTC","id":"2","total_visits":"2182"} +{"parsed_date":"2016-09-10 00:00:00 UTC","id":"2","total_visits":"1671"} +{"parsed_date":"2017-06-09 00:00:00 UTC","id":"2","total_visits":"1927"} +{"parsed_date":"2017-01-11 00:00:00 UTC","id":"2","total_visits":"2185"} +{"parsed_date":"2017-02-19 00:00:00 UTC","id":"2","total_visits":"2187"} +{"parsed_date":"2017-01-17 00:00:00 UTC","id":"2","total_visits":"2443"} +{"parsed_date":"2017-05-12 00:00:00 UTC","id":"2","total_visits":"1932"} +{"parsed_date":"2016-12-16 00:00:00 UTC","id":"2","total_visits":"2956"} +{"parsed_date":"2017-02-01 00:00:00 UTC","id":"2","total_visits":"2445"} +{"parsed_date":"2016-11-26 00:00:00 UTC","id":"2","total_visits":"3213"} +{"parsed_date":"2017-06-02 00:00:00 UTC","id":"2","total_visits":"2190"} +{"parsed_date":"2016-08-05 00:00:00 UTC","id":"2","total_visits":"2702"} +{"parsed_date":"2016-11-01 00:00:00 UTC","id":"2","total_visits":"3728"} +{"parsed_date":"2017-01-05 00:00:00 UTC","id":"2","total_visits":"2193"} +{"parsed_date":"2017-03-08 00:00:00 UTC","id":"2","total_visits":"2449"} +{"parsed_date":"2016-08-28 00:00:00 UTC","id":"2","total_visits":"1682"} +{"parsed_date":"2017-07-04 00:00:00 UTC","id":"2","total_visits":"1938"} +{"parsed_date":"2017-03-10 00:00:00 UTC","id":"2","total_visits":"2194"} +{"parsed_date":"2017-07-07 00:00:00 UTC","id":"2","total_visits":"2450"} +{"parsed_date":"2016-10-29 00:00:00 UTC","id":"2","total_visits":"2964"} +{"parsed_date":"2016-10-13 00:00:00 UTC","id":"2","total_visits":"2964"} +{"parsed_date":"2016-12-04 00:00:00 UTC","id":"2","total_visits":"3220"} +{"parsed_date":"2017-01-21 00:00:00 UTC","id":"2","total_visits":"1685"} +{"parsed_date":"2017-06-29 00:00:00 UTC","id":"2","total_visits":"2709"} +{"parsed_date":"2016-08-29 00:00:00 UTC","id":"2","total_visits":"2454"} +{"parsed_date":"2016-12-19 00:00:00 UTC","id":"2","total_visits":"3222"} +{"parsed_date":"2017-05-30 00:00:00 UTC","id":"2","total_visits":"2199"} +{"parsed_date":"2017-02-10 
00:00:00 UTC","id":"2","total_visits":"2199"} +{"parsed_date":"2016-08-31 00:00:00 UTC","id":"2","total_visits":"3223"} +{"parsed_date":"2017-06-18 00:00:00 UTC","id":"2","total_visits":"1432"} +{"parsed_date":"2017-01-12 00:00:00 UTC","id":"2","total_visits":"2203"} +{"parsed_date":"2017-05-18 00:00:00 UTC","id":"2","total_visits":"2715"} +{"parsed_date":"2016-10-23 00:00:00 UTC","id":"2","total_visits":"2971"} +{"parsed_date":"2016-09-04 00:00:00 UTC","id":"2","total_visits":"1692"} +{"parsed_date":"2016-12-10 00:00:00 UTC","id":"2","total_visits":"2207"} +{"parsed_date":"2016-12-11 00:00:00 UTC","id":"2","total_visits":"2208"} +{"parsed_date":"2017-04-11 00:00:00 UTC","id":"2","total_visits":"2464"} +{"parsed_date":"2016-09-21 00:00:00 UTC","id":"2","total_visits":"2720"} +{"parsed_date":"2016-11-06 00:00:00 UTC","id":"2","total_visits":"3232"} +{"parsed_date":"2017-01-26 00:00:00 UTC","id":"2","total_visits":"2209"} +{"parsed_date":"2016-09-12 00:00:00 UTC","id":"2","total_visits":"2465"} +{"parsed_date":"2017-04-21 00:00:00 UTC","id":"2","total_visits":"2210"} +{"parsed_date":"2017-01-06 00:00:00 UTC","id":"2","total_visits":"2210"} +{"parsed_date":"2017-04-04 00:00:00 UTC","id":"2","total_visits":"2978"} +{"parsed_date":"2017-01-22 00:00:00 UTC","id":"2","total_visits":"1700"} +{"parsed_date":"2017-07-26 00:00:00 UTC","id":"2","total_visits":"2725"} +{"parsed_date":"2016-08-18 00:00:00 UTC","id":"2","total_visits":"2725"} +{"parsed_date":"2016-09-27 00:00:00 UTC","id":"2","total_visits":"2727"} +{"parsed_date":"2016-12-02 00:00:00 UTC","id":"2","total_visits":"3751"} +{"parsed_date":"2017-05-05 00:00:00 UTC","id":"2","total_visits":"1960"} +{"parsed_date":"2016-11-19 00:00:00 UTC","id":"2","total_visits":"2984"} +{"parsed_date":"2016-11-09 00:00:00 UTC","id":"2","total_visits":"3752"} +{"parsed_date":"2016-12-05 00:00:00 UTC","id":"2","total_visits":"4265"} +{"parsed_date":"2017-05-11 00:00:00 UTC","id":"2","total_visits":"2218"} +{"parsed_date":"2017-01-25 00:00:00 UTC","id":"2","total_visits":"2986"} +{"parsed_date":"2017-03-11 00:00:00 UTC","id":"2","total_visits":"1707"} +{"parsed_date":"2017-03-30 00:00:00 UTC","id":"2","total_visits":"2731"} +{"parsed_date":"2016-10-20 00:00:00 UTC","id":"2","total_visits":"3755"} +{"parsed_date":"2017-02-07 00:00:00 UTC","id":"2","total_visits":"2476"} +{"parsed_date":"2017-02-22 00:00:00 UTC","id":"2","total_visits":"2477"} +{"parsed_date":"2017-07-23 00:00:00 UTC","id":"2","total_visits":"1966"} +{"parsed_date":"2016-11-03 00:00:00 UTC","id":"2","total_visits":"4014"} +{"parsed_date":"2016-08-01 00:00:00 UTC","id":"2","total_visits":"1711"} +{"parsed_date":"2017-01-13 00:00:00 UTC","id":"2","total_visits":"1967"} +{"parsed_date":"2017-05-19 00:00:00 UTC","id":"2","total_visits":"2223"} +{"parsed_date":"2016-11-20 00:00:00 UTC","id":"2","total_visits":"3247"} +{"parsed_date":"2016-11-25 00:00:00 UTC","id":"2","total_visits":"3759"} +{"parsed_date":"2017-03-25 00:00:00 UTC","id":"2","total_visits":"1712"} +{"parsed_date":"2017-01-27 00:00:00 UTC","id":"2","total_visits":"1969"} +{"parsed_date":"2017-06-26 00:00:00 UTC","id":"2","total_visits":"2226"} +{"parsed_date":"2017-05-25 00:00:00 UTC","id":"2","total_visits":"2228"} +{"parsed_date":"2017-01-31 00:00:00 UTC","id":"2","total_visits":"2229"} +{"parsed_date":"2017-07-13 00:00:00 UTC","id":"2","total_visits":"2741"} +{"parsed_date":"2017-03-15 00:00:00 UTC","id":"2","total_visits":"2486"} +{"parsed_date":"2017-05-28 00:00:00 UTC","id":"2","total_visits":"1463"} +{"parsed_date":"2017-03-09 
00:00:00 UTC","id":"2","total_visits":"2231"} +{"parsed_date":"2017-07-15 00:00:00 UTC","id":"2","total_visits":"1721"} +{"parsed_date":"2016-11-24 00:00:00 UTC","id":"2","total_visits":"3770"} +{"parsed_date":"2016-10-05 00:00:00 UTC","id":"2","total_visits":"3770"} +{"parsed_date":"2016-12-31 00:00:00 UTC","id":"2","total_visits":"1211"} +{"parsed_date":"2016-10-02 00:00:00 UTC","id":"2","total_visits":"1724"} +{"parsed_date":"2017-07-22 00:00:00 UTC","id":"2","total_visits":"1724"} +{"parsed_date":"2016-09-11 00:00:00 UTC","id":"2","total_visits":"1725"} +{"parsed_date":"2017-06-15 00:00:00 UTC","id":"2","total_visits":"2237"} +{"parsed_date":"2017-06-05 00:00:00 UTC","id":"2","total_visits":"2493"} +{"parsed_date":"2017-02-06 00:00:00 UTC","id":"2","total_visits":"2238"} +{"parsed_date":"2016-10-15 00:00:00 UTC","id":"2","total_visits":"2495"} +{"parsed_date":"2016-08-21 00:00:00 UTC","id":"2","total_visits":"1730"} +{"parsed_date":"2016-08-23 00:00:00 UTC","id":"2","total_visits":"2754"} +{"parsed_date":"2017-06-30 00:00:00 UTC","id":"2","total_visits":"2499"} +{"parsed_date":"2017-01-18 00:00:00 UTC","id":"2","total_visits":"2245"} +{"parsed_date":"2016-08-10 00:00:00 UTC","id":"2","total_visits":"2757"} +{"parsed_date":"2016-12-08 00:00:00 UTC","id":"2","total_visits":"3013"} +{"parsed_date":"2016-11-28 00:00:00 UTC","id":"2","total_visits":"4807"} +{"parsed_date":"2017-05-22 00:00:00 UTC","id":"2","total_visits":"2248"} +{"parsed_date":"2016-09-20 00:00:00 UTC","id":"2","total_visits":"2760"} +{"parsed_date":"2016-10-06 00:00:00 UTC","id":"2","total_visits":"3016"} +{"parsed_date":"2016-09-06 00:00:00 UTC","id":"2","total_visits":"2508"} +{"parsed_date":"2016-09-03 00:00:00 UTC","id":"2","total_visits":"1741"} +{"parsed_date":"2016-12-06 00:00:00 UTC","id":"2","total_visits":"3021"} +{"parsed_date":"2016-12-24 00:00:00 UTC","id":"2","total_visits":"1231"} +{"parsed_date":"2016-10-28 00:00:00 UTC","id":"2","total_visits":"3791"} +{"parsed_date":"2016-12-30 00:00:00 UTC","id":"2","total_visits":"1232"} +{"parsed_date":"2017-05-29 00:00:00 UTC","id":"2","total_visits":"1745"} +{"parsed_date":"2017-07-10 00:00:00 UTC","id":"2","total_visits":"2769"} +{"parsed_date":"2017-06-22 00:00:00 UTC","id":"2","total_visits":"2258"} +{"parsed_date":"2017-07-19 00:00:00 UTC","id":"2","total_visits":"2514"} +{"parsed_date":"2016-10-03 00:00:00 UTC","id":"2","total_visits":"2514"} +{"parsed_date":"2017-06-14 00:00:00 UTC","id":"2","total_visits":"2517"} +{"parsed_date":"2016-10-22 00:00:00 UTC","id":"2","total_visits":"3029"} +{"parsed_date":"2017-01-23 00:00:00 UTC","id":"2","total_visits":"2262"} +{"parsed_date":"2017-04-24 00:00:00 UTC","id":"2","total_visits":"2263"} +{"parsed_date":"2016-11-10 00:00:00 UTC","id":"2","total_visits":"4055"} +{"parsed_date":"2016-09-26 00:00:00 UTC","id":"2","total_visits":"2776"} +{"parsed_date":"2016-10-19 00:00:00 UTC","id":"2","total_visits":"3544"} +{"parsed_date":"2017-03-04 00:00:00 UTC","id":"2","total_visits":"1753"} +{"parsed_date":"2017-05-26 00:00:00 UTC","id":"2","total_visits":"2009"} +{"parsed_date":"2017-02-13 00:00:00 UTC","id":"2","total_visits":"2266"} +{"parsed_date":"2017-02-18 00:00:00 UTC","id":"2","total_visits":"1755"} +{"parsed_date":"2017-03-02 00:00:00 UTC","id":"2","total_visits":"2267"} +{"parsed_date":"2017-03-31 00:00:00 UTC","id":"2","total_visits":"2268"} +{"parsed_date":"2017-01-10 00:00:00 UTC","id":"2","total_visits":"2268"} +{"parsed_date":"2017-03-29 00:00:00 UTC","id":"2","total_visits":"2525"} +{"parsed_date":"2017-03-27 
00:00:00 UTC","id":"2","total_visits":"2525"} +{"parsed_date":"2016-11-23 00:00:00 UTC","id":"2","total_visits":"3805"} +{"parsed_date":"2017-05-27 00:00:00 UTC","id":"2","total_visits":"1502"} +{"parsed_date":"2016-10-24 00:00:00 UTC","id":"2","total_visits":"4063"} +{"parsed_date":"2016-12-14 00:00:00 UTC","id":"2","total_visits":"3040"} +{"parsed_date":"2017-02-11 00:00:00 UTC","id":"2","total_visits":"1761"} +{"parsed_date":"2017-07-27 00:00:00 UTC","id":"2","total_visits":"2529"} +{"parsed_date":"2017-02-17 00:00:00 UTC","id":"2","total_visits":"2785"} +{"parsed_date":"2017-04-15 00:00:00 UTC","id":"2","total_visits":"1506"} +{"parsed_date":"2016-11-05 00:00:00 UTC","id":"2","total_visits":"3042"} +{"parsed_date":"2016-10-04 00:00:00 UTC","id":"2","total_visits":"4322"} +{"parsed_date":"2017-05-13 00:00:00 UTC","id":"2","total_visits":"1251"} +{"parsed_date":"2017-04-16 00:00:00 UTC","id":"2","total_visits":"1507"} +{"parsed_date":"2016-12-28 00:00:00 UTC","id":"2","total_visits":"1763"} +{"parsed_date":"2016-08-15 00:00:00 UTC","id":"2","total_visits":"3043"} +{"parsed_date":"2016-12-03 00:00:00 UTC","id":"2","total_visits":"3044"} +{"parsed_date":"2017-06-27 00:00:00 UTC","id":"2","total_visits":"2789"} +{"parsed_date":"2017-06-24 00:00:00 UTC","id":"2","total_visits":"1510"} +{"parsed_date":"2017-07-16 00:00:00 UTC","id":"2","total_visits":"1766"} +{"parsed_date":"2017-04-09 00:00:00 UTC","id":"2","total_visits":"1766"} +{"parsed_date":"2017-06-07 00:00:00 UTC","id":"2","total_visits":"2279"} +{"parsed_date":"2017-04-17 00:00:00 UTC","id":"2","total_visits":"2279"} +{"parsed_date":"2016-09-28 00:00:00 UTC","id":"2","total_visits":"2535"} +{"parsed_date":"2017-03-26 00:00:00 UTC","id":"2","total_visits":"1768"} +{"parsed_date":"2017-05-10 00:00:00 UTC","id":"2","total_visits":"2024"} +{"parsed_date":"2017-06-08 00:00:00 UTC","id":"2","total_visits":"2280"} +{"parsed_date":"2017-05-08 00:00:00 UTC","id":"2","total_visits":"2025"} +{"parsed_date":"2017-03-13 00:00:00 UTC","id":"2","total_visits":"2537"} +{"parsed_date":"2016-11-17 00:00:00 UTC","id":"2","total_visits":"4074"} +{"parsed_date":"2016-08-25 00:00:00 UTC","id":"2","total_visits":"2539"} +{"parsed_date":"2017-02-16 00:00:00 UTC","id":"2","total_visits":"2539"} +{"parsed_date":"2017-06-16 00:00:00 UTC","id":"2","total_visits":"2028"} +{"parsed_date":"2016-11-16 00:00:00 UTC","id":"2","total_visits":"4334"} +{"parsed_date":"2016-08-17 00:00:00 UTC","id":"2","total_visits":"2800"} +{"parsed_date":"2017-03-19 00:00:00 UTC","id":"2","total_visits":"1776"} +{"parsed_date":"2016-11-29 00:00:00 UTC","id":"2","total_visits":"4337"} +{"parsed_date":"2017-02-05 00:00:00 UTC","id":"2","total_visits":"1522"} +{"parsed_date":"2016-10-31 00:00:00 UTC","id":"2","total_visits":"3827"} +{"parsed_date":"2017-05-31 00:00:00 UTC","id":"2","total_visits":"2292"} +{"parsed_date":"2017-07-18 00:00:00 UTC","id":"2","total_visits":"2804"} +{"parsed_date":"2017-03-12 00:00:00 UTC","id":"2","total_visits":"1781"} +{"parsed_date":"2016-09-09 00:00:00 UTC","id":"2","total_visits":"2549"} +{"parsed_date":"2017-01-14 00:00:00 UTC","id":"2","total_visits":"1526"} +{"parsed_date":"2017-05-04 00:00:00 UTC","id":"2","total_visits":"2806"} +{"parsed_date":"2016-11-07 00:00:00 UTC","id":"2","total_visits":"3832"} +{"parsed_date":"2017-04-07 00:00:00 UTC","id":"2","total_visits":"2297"} +{"parsed_date":"2017-07-12 00:00:00 UTC","id":"2","total_visits":"2554"} +{"parsed_date":"2017-04-13 00:00:00 UTC","id":"2","total_visits":"2300"} +{"parsed_date":"2017-08-01 
00:00:00 UTC","id":"2","total_visits":"2556"} +{"parsed_date":"2017-06-04 00:00:00 UTC","id":"2","total_visits":"1534"} +{"parsed_date":"2017-02-12 00:00:00 UTC","id":"2","total_visits":"1790"} +{"parsed_date":"2017-07-03 00:00:00 UTC","id":"2","total_visits":"2046"} +{"parsed_date":"2016-09-30 00:00:00 UTC","id":"2","total_visits":"2303"} +{"parsed_date":"2016-08-08 00:00:00 UTC","id":"2","total_visits":"2815"} diff --git a/tests/data/time_series_schema.json b/tests/data/time_series_schema.json index 857595b9e6..35473dc0e3 100644 --- a/tests/data/time_series_schema.json +++ b/tests/data/time_series_schema.json @@ -4,6 +4,11 @@ "name": "parsed_date", "type": "TIMESTAMP" }, + { + "mode": "NULLABLE", + "name": "id", + "type": "STRING" + }, { "mode": "NULLABLE", "name": "total_visits", diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 251b9da4ac..29234bc4ef 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -772,6 +772,31 @@ def new_time_series_df(session, new_time_series_pandas_df): return session.read_pandas(new_time_series_pandas_df) +@pytest.fixture(scope="session") +def new_time_series_pandas_df_w_id(): + """Additional data matching the time series dataset. The values are dummy ones used to basically check the prediction scores.""" + utc = pytz.utc + return pd.DataFrame( + { + "parsed_date": [ + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + ], + "id": ["1", "2", "1", "2", "1", "2"], + "total_visits": [2500, 2500, 2500, 2500, 2500, 2500], + } + ) + + +@pytest.fixture(scope="session") +def new_time_series_df_w_id(session, new_time_series_pandas_df_w_id): + return session.read_pandas(new_time_series_pandas_df_w_id) + + @pytest.fixture(scope="session") def penguins_pandas_df_default_index() -> pd.DataFrame: """Consistently ordered pandas dataframe for penguins test data""" @@ -1015,12 +1040,34 @@ def penguins_xgbregressor_model_name( return model_name +def _get_or_create_arima_plus_model( + session: bigframes.Session, dataset_id_permanent, sql +) -> str: + """Internal helper to compute a model name by hasing the given SQL. + attempst to retreive the model, create it if not exist. + retursn the fully qualitifed model""" + + # We use the SQL hash as the name to ensure the model is regenerated if this fixture is edited + model_name = f"{dataset_id_permanent}.time_series_arima_plus_{hashlib.md5(sql.encode()).hexdigest()}" + sql = sql.replace("$model_name", model_name) + try: + session.bqclient.get_model(model_name) + except google.cloud.exceptions.NotFound: + logging.info( + "time_series_arima_plus_model fixture was not found in the permanent dataset, regenerating it..." + ) + session.bqclient.query(sql).result() + finally: + return model_name + + @pytest.fixture(scope="session") def time_series_arima_plus_model_name( session: bigframes.Session, dataset_id_permanent, time_series_table_id ) -> str: """Provides a pretrained model as a test fixture that is cached across test runs. - This lets us run system tests without having to wait for a model.fit(...)""" + This lets us run system tests without having to wait for a model.fit(...). 
+ This version does not include time_series_id_col.""" sql = f""" CREATE OR REPLACE MODEL `$model_name` OPTIONS ( @@ -1028,21 +1075,30 @@ def time_series_arima_plus_model_name( time_series_timestamp_col = 'parsed_date', time_series_data_col = 'total_visits' ) AS SELECT - * + parsed_date, + total_visits FROM `{time_series_table_id}`""" - # We use the SQL hash as the name to ensure the model is regenerated if this fixture is edited - model_name = f"{dataset_id_permanent}.time_series_arima_plus_{hashlib.md5(sql.encode()).hexdigest()}" - sql = sql.replace("$model_name", model_name) + return _get_or_create_arima_plus_model(session, dataset_id_permanent, sql) - try: - session.bqclient.get_model(model_name) - except google.cloud.exceptions.NotFound: - logging.info( - "time_series_arima_plus_model fixture was not found in the permanent dataset, regenerating it..." - ) - session.bqclient.query(sql).result() - finally: - return model_name + +@pytest.fixture(scope="session") +def time_series_arima_plus_model_name_w_id( + session: bigframes.Session, dataset_id_permanent, time_series_table_id +) -> str: + """Provides a pretrained model as a test fixture that is cached across test runs. + This lets us run system tests without having to wait for a model.fit(...). + This version includes time_series_id_col.""" + sql = f""" +CREATE OR REPLACE MODEL `$model_name` +OPTIONS ( + model_type='ARIMA_PLUS', + time_series_timestamp_col = 'parsed_date', + time_series_data_col = 'total_visits', + time_series_id_col = 'id' +) AS SELECT + * +FROM `{time_series_table_id}`""" + return _get_or_create_arima_plus_model(session, dataset_id_permanent, sql) @pytest.fixture(scope="session") diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index bb53305b94..7c070fd200 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -33,38 +33,65 @@ ] -@pytest.fixture(scope="module") -def arima_model(time_series_df_default_index): +def _fit_arima_model(time_series_df_default_index): model = forecasting.ARIMAPlus() X_train = time_series_df_default_index["parsed_date"] y_train = time_series_df_default_index[["total_visits"]] + return model, X_train, y_train + + +@pytest.fixture(scope="module") +def arima_model(time_series_df_default_index): + model, X_train, y_train = _fit_arima_model(time_series_df_default_index) model.fit(X_train, y_train) return model +@pytest.fixture(scope="module") +def arima_model_w_id(time_series_df_default_index): + model, X_train, y_train = _fit_arima_model(time_series_df_default_index) + id_cols = time_series_df_default_index[["id"]] + model.fit(X_train, y_train, id_col=id_cols) + return model + + +@pytest.mark.parametrize("id_col_name", [None, "id"]) def test_arima_plus_model_fit_score( dataset_id, new_time_series_df, + new_time_series_df_w_id, arima_model, + arima_model_w_id, + id_col_name, ): - - result = arima_model.score( - new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]] - ).to_pandas() + curr_model = arima_model_w_id if id_col_name else arima_model + if id_col_name: + result = curr_model.score( + new_time_series_df_w_id[["parsed_date"]], + new_time_series_df_w_id[["total_visits"]], + id_col=new_time_series_df_w_id[[id_col_name]], + ).to_pandas() + else: + result = curr_model.score( + new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]] + ).to_pandas() + expected_columns = [ + "mean_absolute_error", + "mean_squared_error", + "root_mean_squared_error", + 
"mean_absolute_percentage_error", + "symmetric_mean_absolute_percentage_error", + ] + if id_col_name: + expected_columns.insert(0, id_col_name) utils.check_pandas_df_schema_and_index( result, - columns=[ - "mean_absolute_error", - "mean_squared_error", - "root_mean_squared_error", - "mean_absolute_percentage_error", - "symmetric_mean_absolute_percentage_error", - ], - index=1, + columns=expected_columns, + index=2 if id_col_name else 1, ) # save, load to ensure configuration was kept - reloaded_model = arima_model.to_gbq( + reloaded_model = curr_model.to_gbq( f"{dataset_id}.temp_arima_plus_model", replace=True ) assert ( @@ -72,14 +99,22 @@ def test_arima_plus_model_fit_score( ) -def test_arima_plus_model_fit_summary(dataset_id, arima_model): - result = arima_model.summary().to_pandas() +@pytest.mark.parametrize("id_col_name", [None, "id"]) +def test_arima_plus_model_fit_summary( + dataset_id, arima_model, arima_model_w_id, id_col_name +): + curr_model = arima_model_w_id if id_col_name else arima_model + result = curr_model.summary().to_pandas() + expected_columns = ( + [id_col_name] + ARIMA_EVALUATE_OUTPUT_COL + if id_col_name + else ARIMA_EVALUATE_OUTPUT_COL + ) utils.check_pandas_df_schema_and_index( - result, columns=ARIMA_EVALUATE_OUTPUT_COL, index=1 + result, columns=expected_columns, index=2 if id_col_name else 1 ) - # save, load to ensure configuration was kept - reloaded_model = arima_model.to_gbq( + reloaded_model = curr_model.to_gbq( f"{dataset_id}.temp_arima_plus_model", replace=True ) assert ( @@ -87,17 +122,29 @@ def test_arima_plus_model_fit_summary(dataset_id, arima_model): ) -def test_arima_coefficients(arima_model): - result = arima_model.coef_.to_pandas() +@pytest.mark.parametrize("id_col_name", [None, "id"]) +def test_arima_coefficients(arima_model, arima_model_w_id, id_col_name): + result = ( + arima_model_w_id.coef_.to_pandas() + if id_col_name + else arima_model.coef_.to_pandas() + ) expected_columns = [ "ar_coefficients", "ma_coefficients", "intercept_or_drift", ] - utils.check_pandas_df_schema_and_index(result, columns=expected_columns, index=1) + if id_col_name: + expected_columns.insert(0, id_col_name) + utils.check_pandas_df_schema_and_index( + result, columns=expected_columns, index=2 if id_col_name else 1 + ) -def test_arima_plus_model_fit_params(time_series_df_default_index, dataset_id): +@pytest.mark.parametrize("id_col_name", [None, "id"]) +def test_arima_plus_model_fit_params( + time_series_df_default_index, dataset_id, id_col_name +): model = forecasting.ARIMAPlus( horizon=100, auto_arima=True, @@ -115,7 +162,11 @@ def test_arima_plus_model_fit_params(time_series_df_default_index, dataset_id): X_train = time_series_df_default_index[["parsed_date"]] y_train = time_series_df_default_index["total_visits"] - model.fit(X_train, y_train) + if id_col_name is None: + model.fit(X_train, y_train) + else: + id_cols = time_series_df_default_index[[id_col_name]] + model.fit(X_train, y_train, id_col=id_cols) # save, load to ensure configuration was kept reloaded_model = model.to_gbq(f"{dataset_id}.temp_arima_plus_model", replace=True) diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index c1643776a5..0e8489c513 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -304,6 +304,14 @@ def time_series_bqml_arima_plus_model( return core.BqmlModel(session, model) +@pytest.fixture(scope="session") +def time_series_bqml_arima_plus_model_w_id( + session, time_series_arima_plus_model_name_w_id +) -> 
core.BqmlModel: + model = session.bqclient.get_model(time_series_arima_plus_model_name_w_id) + return core.BqmlModel(session, model) + + @pytest.fixture(scope="session") def time_series_arima_plus_model( session, time_series_arima_plus_model_name @@ -314,6 +322,16 @@ def time_series_arima_plus_model( ) +@pytest.fixture(scope="session") +def time_series_arima_plus_model_w_id( + session, time_series_arima_plus_model_name_w_id +) -> forecasting.ARIMAPlus: + return cast( + forecasting.ARIMAPlus, + session.read_gbq_model(time_series_arima_plus_model_name_w_id), + ) + + @pytest.fixture(scope="session") def imported_tensorflow_model_path() -> str: return "gs://cloud-training-demos/txtclass/export/exporter/1549825580/*" diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 2a2e68b230..1c2591b90a 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -410,22 +410,65 @@ def test_model_generate_text( ) -def test_model_forecast(time_series_bqml_arima_plus_model: core.BqmlModel): +@pytest.mark.parametrize("id_col_name", [None, "id"]) +def test_model_forecast( + time_series_bqml_arima_plus_model: core.BqmlModel, + time_series_bqml_arima_plus_model_w_id: core.BqmlModel, + id_col_name, +): utc = pytz.utc - forecast = time_series_bqml_arima_plus_model.forecast( - {"horizon": 4, "confidence_level": 0.8} - ).to_pandas()[["forecast_timestamp", "forecast_value"]] - expected = pd.DataFrame( - { - "forecast_timestamp": [ - datetime(2017, 8, 2, tzinfo=utc), - datetime(2017, 8, 3, tzinfo=utc), - datetime(2017, 8, 4, tzinfo=utc), - datetime(2017, 8, 5, tzinfo=utc), - ], - "forecast_value": [2724.472284, 2593.368389, 2353.613034, 1781.623071], - } - ) + forecast_cols = ["forecast_timestamp", "forecast_value"] + if id_col_name: + forecast_cols.insert(0, id_col_name) + + forecast = ( + time_series_bqml_arima_plus_model_w_id.forecast( + {"horizon": 4, "confidence_level": 0.8} + ) + if id_col_name + else time_series_bqml_arima_plus_model.forecast( + {"horizon": 4, "confidence_level": 0.8} + ) + ).to_pandas()[forecast_cols] + if id_col_name: + expected = pd.DataFrame( + { + "id": ["1", "2", "1", "2", "1", "2", "1", "2"], + "forecast_timestamp": [ + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + datetime(2017, 8, 5, tzinfo=utc), + datetime(2017, 8, 5, tzinfo=utc), + ], + "forecast_value": [ + 2634.796023, + 2634.796023, + 2621.332462, + 2621.332462, + 2396.095463, + 2396.095463, + 1742.878278, + 1742.878278, + ], + } + ) + expected["id"] = expected["id"].astype("string[pyarrow]") + else: + expected = pd.DataFrame( + { + "forecast_timestamp": [ + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + datetime(2017, 8, 5, tzinfo=utc), + ], + "forecast_value": [2634.796023, 2621.332462, 2396.095463, 1742.878278], + } + ) expected["forecast_value"] = expected["forecast_value"].astype(pd.Float64Dtype()) expected["forecast_timestamp"] = expected["forecast_timestamp"].astype( pd.ArrowDtype(pa.timestamp("us", tz="UTC")) diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py index 1b3a650388..d1b6b18fbe 100644 --- a/tests/system/small/ml/test_forecasting.py +++ b/tests/system/small/ml/test_forecasting.py @@ -16,6 +16,7 @@ import pandas as pd import pyarrow as pa +import pytest import pytz from 
bigframes.ml import forecasting @@ -35,23 +36,66 @@ ] +@pytest.mark.parametrize("id_col_name", [None, "id"]) def test_arima_plus_predict_default( time_series_arima_plus_model: forecasting.ARIMAPlus, + time_series_arima_plus_model_w_id: forecasting.ARIMAPlus, + id_col_name, ): utc = pytz.utc - predictions = time_series_arima_plus_model.predict().to_pandas() - assert predictions.shape == (3, 8) - result = predictions[["forecast_timestamp", "forecast_value"]] - expected = pd.DataFrame( - { - "forecast_timestamp": [ - datetime(2017, 8, 2, tzinfo=utc), - datetime(2017, 8, 3, tzinfo=utc), - datetime(2017, 8, 4, tzinfo=utc), - ], - "forecast_value": [2724.472284, 2593.368389, 2353.613034], - } + predictions = ( + ( + time_series_arima_plus_model_w_id + if id_col_name + else time_series_arima_plus_model + ) + .predict() + .to_pandas() ) + assert predictions.shape == ((6, 9) if id_col_name else (3, 8)) + result = predictions[["forecast_timestamp", "forecast_value"]] + if id_col_name: + result["id"] = predictions[["id"]] + result = result[["id", "forecast_timestamp", "forecast_value"]] + + if id_col_name: + expected = pd.DataFrame( + { + "id": ["1", "2", "1", "2", "1", "2"], + "forecast_timestamp": [ + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + ], + "forecast_value": [ + 2634.796023, + 2634.796023, + 2621.332461, + 2621.332461, + 2396.095462, + 2396.095462, + ], + } + ) + expected["id"] = expected["id"].astype("string[pyarrow]") + else: + expected = pd.DataFrame( + { + "forecast_timestamp": [ + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + ], + "forecast_value": [ + 2634.796023, + 2621.332461, + 2396.095462, + ], + } + ) expected["forecast_value"] = expected["forecast_value"].astype(pd.Float64Dtype()) expected["forecast_timestamp"] = expected["forecast_timestamp"].astype( pd.ArrowDtype(pa.timestamp("us", tz="UTC")) @@ -65,27 +109,69 @@ def test_arima_plus_predict_default( ) +@pytest.mark.parametrize("id_col_name", [None, "id"]) def test_arima_plus_predict_explain_default( time_series_arima_plus_model: forecasting.ARIMAPlus, + time_series_arima_plus_model_w_id: forecasting.ARIMAPlus, + id_col_name, ): utc = pytz.utc - predictions = time_series_arima_plus_model.predict_explain().to_pandas() - assert predictions.shape[0] == 369 + predictions = ( + ( + time_series_arima_plus_model_w_id + if id_col_name + else time_series_arima_plus_model + ) + .predict_explain() + .to_pandas() + ) + assert predictions.shape[0] == (738 if id_col_name else 369) predictions = predictions[ predictions["time_series_type"] == "forecast" ].reset_index(drop=True) - assert predictions.shape[0] == 3 + assert predictions.shape[0] == (6 if id_col_name else 3) result = predictions[["time_series_timestamp", "time_series_data"]] - expected = pd.DataFrame( - { - "time_series_timestamp": [ - datetime(2017, 8, 2, tzinfo=utc), - datetime(2017, 8, 3, tzinfo=utc), - datetime(2017, 8, 4, tzinfo=utc), - ], - "time_series_data": [2727.693349, 2595.290749, 2370.86767], - } - ) + if id_col_name: + result["id"] = predictions[["id"]] + result = result[["id", "time_series_timestamp", "time_series_data"]] + if id_col_name: + expected = pd.DataFrame( + { + "id": ["1", "2", "1", "2", "1", "2"], + "time_series_timestamp": [ + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 3, 
tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + ], + "time_series_data": [ + 2634.796023, + 2634.796023, + 2621.332461, + 2621.332461, + 2396.095462, + 2396.095462, + ], + } + ) + expected["id"] = expected["id"].astype("string[pyarrow]") + else: + expected = pd.DataFrame( + { + "time_series_timestamp": [ + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + ], + "time_series_data": [ + 2634.796023, + 2621.332461, + 2396.095462, + ], + } + ) expected["time_series_data"] = expected["time_series_data"].astype( pd.Float64Dtype() ) @@ -101,24 +187,72 @@ def test_arima_plus_predict_explain_default( ) -def test_arima_plus_predict_params(time_series_arima_plus_model: forecasting.ARIMAPlus): +@pytest.mark.parametrize("id_col_name", [None, "id"]) +def test_arima_plus_predict_params( + time_series_arima_plus_model: forecasting.ARIMAPlus, + time_series_arima_plus_model_w_id: forecasting.ARIMAPlus, + id_col_name, +): utc = pytz.utc - predictions = time_series_arima_plus_model.predict( - horizon=4, confidence_level=0.9 - ).to_pandas() - assert predictions.shape == (4, 8) - result = predictions[["forecast_timestamp", "forecast_value"]] - expected = pd.DataFrame( - { - "forecast_timestamp": [ - datetime(2017, 8, 2, tzinfo=utc), - datetime(2017, 8, 3, tzinfo=utc), - datetime(2017, 8, 4, tzinfo=utc), - datetime(2017, 8, 5, tzinfo=utc), - ], - "forecast_value": [2724.472284, 2593.368389, 2353.613034, 1781.623071], - } + predictions = ( + ( + time_series_arima_plus_model_w_id + if id_col_name + else time_series_arima_plus_model + ) + .predict(horizon=4, confidence_level=0.9) + .to_pandas() ) + assert predictions.shape == ((8, 9) if id_col_name else (4, 8)) + result = predictions[["forecast_timestamp", "forecast_value"]] + if id_col_name: + result["id"] = predictions[["id"]] + result = result[["id", "forecast_timestamp", "forecast_value"]] + + if id_col_name: + expected = pd.DataFrame( + { + "id": ["1", "2", "1", "2", "1", "2", "1", "2"], + "forecast_timestamp": [ + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + datetime(2017, 8, 5, tzinfo=utc), + datetime(2017, 8, 5, tzinfo=utc), + ], + "forecast_value": [ + 2634.796023, + 2634.796023, + 2621.332461, + 2621.332461, + 2396.095462, + 2396.095462, + 1781.623071, + 1781.623071, + ], + } + ) + expected["id"] = expected["id"].astype("string[pyarrow]") + else: + expected = pd.DataFrame( + { + "forecast_timestamp": [ + datetime(2017, 8, 2, tzinfo=utc), + datetime(2017, 8, 3, tzinfo=utc), + datetime(2017, 8, 4, tzinfo=utc), + datetime(2017, 8, 5, tzinfo=utc), + ], + "forecast_value": [ + 2634.796023, + 2621.332461, + 2396.095462, + 1781.623071, + ], + } + ) expected["forecast_value"] = expected["forecast_value"].astype(pd.Float64Dtype()) expected["forecast_timestamp"] = expected["forecast_timestamp"].astype( pd.ArrowDtype(pa.timestamp("us", tz="UTC")) @@ -132,12 +266,21 @@ def test_arima_plus_predict_params(time_series_arima_plus_model: forecasting.ARI ) +@pytest.mark.parametrize("id_col_name", [None, "id"]) def test_arima_plus_predict_explain_params( time_series_arima_plus_model: forecasting.ARIMAPlus, + time_series_arima_plus_model_w_id: forecasting.ARIMAPlus, + id_col_name, ): - predictions = time_series_arima_plus_model.predict_explain( - horizon=4, confidence_level=0.9 
- ).to_pandas() + predictions = ( + ( + time_series_arima_plus_model_w_id + if id_col_name + else time_series_arima_plus_model + ) + .predict_explain(horizon=4, confidence_level=0.9) + .to_pandas() + ) assert predictions.shape[0] >= 1 prediction_columns = set(predictions.columns) expected_columns = { @@ -156,24 +299,70 @@ def test_arima_plus_predict_explain_params( "seasonal_period_daily", "holiday_effect", } + if id_col_name: + expected_columns.add("id") assert expected_columns <= prediction_columns +@pytest.mark.parametrize("id_col_name", [None, "id"]) def test_arima_plus_detect_anomalies( - time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df + time_series_arima_plus_model: forecasting.ARIMAPlus, + time_series_arima_plus_model_w_id: forecasting.ARIMAPlus, + new_time_series_df, + new_time_series_df_w_id, + id_col_name, ): - anomalies = time_series_arima_plus_model.detect_anomalies( - new_time_series_df - ).to_pandas() - - expected = pd.DataFrame( - { - "is_anomaly": [False, False, False], - "lower_bound": [2349.301736, 2153.614829, 1849.040192], - "upper_bound": [3099.642833, 3033.12195, 2858.185876], - "anomaly_probability": [0.757824, 0.322559, 0.43011], - }, + anomalies = ( + ( + time_series_arima_plus_model_w_id + if id_col_name + else time_series_arima_plus_model + ) + .detect_anomalies( + new_time_series_df_w_id if id_col_name else new_time_series_df + ) + .to_pandas() ) + + if id_col_name: + expected = pd.DataFrame( + { + "is_anomaly": [False, False, False, False, False, False], + "lower_bound": [ + 2229.930578, + 2229.930578, + 2149.645455, + 2149.645455, + 1892.873256, + 1892.873256, + ], + "upper_bound": [ + 3039.6614686, + 3039.6614686, + 3093.019467, + 3093.019467, + 2899.317669, + 2899.317669, + ], + "anomaly_probability": [ + 0.48545926, + 0.48545926, + 0.3856835, + 0.3856835, + 0.314156, + 0.314156, + ], + }, + ) + else: + expected = pd.DataFrame( + { + "is_anomaly": [False, False, False], + "lower_bound": [2229.930578, 2149.645455, 1892.873256], + "upper_bound": [3039.6614686, 3093.019467, 2899.317669], + "anomaly_probability": [0.48545926, 0.3856835, 0.314156], + }, + ) pd.testing.assert_frame_equal( anomalies[["is_anomaly", "lower_bound", "upper_bound", "anomaly_probability"]], expected, @@ -183,21 +372,65 @@ def test_arima_plus_detect_anomalies( ) +@pytest.mark.parametrize("id_col_name", [None, "id"]) def test_arima_plus_detect_anomalies_params( - time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df + time_series_arima_plus_model: forecasting.ARIMAPlus, + time_series_arima_plus_model_w_id: forecasting.ARIMAPlus, + new_time_series_df, + new_time_series_df_w_id, + id_col_name, ): - anomalies = time_series_arima_plus_model.detect_anomalies( - new_time_series_df, anomaly_prob_threshold=0.7 - ).to_pandas() - - expected = pd.DataFrame( - { - "is_anomaly": [True, False, False], - "lower_bound": [2525.5363, 2360.1870, 2086.0609], - "upper_bound": [2923.408256, 2826.54981, 2621.165188], - "anomaly_probability": [0.757824, 0.322559, 0.43011], - }, + anomalies = ( + ( + time_series_arima_plus_model_w_id + if id_col_name + else time_series_arima_plus_model + ) + .detect_anomalies( + new_time_series_df_w_id if id_col_name else new_time_series_df, + anomaly_prob_threshold=0.7, + ) + .to_pandas() ) + if id_col_name: + expected = pd.DataFrame( + { + "is_anomaly": [False, False, False, False, False, False], + "lower_bound": [ + 2420.11419, + 2420.11419, + 2360.1870, + 2360.1870, + 2086.0609, + 2086.0609, + ], + "upper_bound": [ + 2849.47785, + 
2849.47785, + 2826.54981, + 2826.54981, + 2621.165188, + 2621.165188, + ], + "anomaly_probability": [ + 0.485459, + 0.485459, + 0.385683, + 0.385683, + 0.314156, + 0.314156, + ], + }, + ) + else: + expected = pd.DataFrame( + { + "is_anomaly": [False, False, False], + "lower_bound": [2420.11419, 2360.1870, 2086.0609], + "upper_bound": [2849.47785, 2826.54981, 2621.165188], + "anomaly_probability": [0.485459, 0.385683, 0.314156], + }, + ) pd.testing.assert_frame_equal( anomalies[["is_anomaly", "lower_bound", "upper_bound", "anomaly_probability"]], expected, @@ -207,22 +440,49 @@ def test_arima_plus_detect_anomalies_params( ) +@pytest.mark.parametrize("id_col_name", [None, "id"]) def test_arima_plus_score( - time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df + time_series_arima_plus_model: forecasting.ARIMAPlus, + time_series_arima_plus_model_w_id: forecasting.ARIMAPlus, + new_time_series_df, + new_time_series_df_w_id, + id_col_name, ): - result = time_series_arima_plus_model.score( - new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]] - ).to_pandas() - expected = pd.DataFrame( - { - "mean_absolute_error": [154.742547], - "mean_squared_error": [26844.868855], - "root_mean_squared_error": [163.844038], - "mean_absolute_percentage_error": [6.189702], - "symmetric_mean_absolute_percentage_error": [6.097155], - }, - dtype="Float64", - ) + if id_col_name: + result = time_series_arima_plus_model_w_id.score( + new_time_series_df_w_id[["parsed_date"]], + new_time_series_df_w_id[["total_visits"]], + new_time_series_df_w_id[["id"]], + ).to_pandas() + else: + result = time_series_arima_plus_model.score( + new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]] + ).to_pandas() + if id_col_name: + expected = pd.DataFrame( + { + "id": ["2", "1"], + "mean_absolute_error": [120.011007, 120.011007], + "mean_squared_error": [14562.562359, 14562.562359], + "root_mean_squared_error": [120.675442, 120.675442], + "mean_absolute_percentage_error": [4.80044, 4.80044], + "symmetric_mean_absolute_percentage_error": [4.744332, 4.744332], + }, + dtype="Float64", + ) + expected["id"] = expected["id"].astype(str).str.replace(r"\.0$", "", regex=True) + expected["id"] = expected["id"].astype("string[pyarrow]") + else: + expected = pd.DataFrame( + { + "mean_absolute_error": [120.0110074], + "mean_squared_error": [14562.5623594], + "root_mean_squared_error": [120.675442], + "mean_absolute_percentage_error": [4.80044], + "symmetric_mean_absolute_percentage_error": [4.744332], + }, + dtype="Float64", + ) pd.testing.assert_frame_equal( result, expected, @@ -231,38 +491,91 @@ def test_arima_plus_score( ) -def test_arima_plus_summary(time_series_arima_plus_model: forecasting.ARIMAPlus): - result = time_series_arima_plus_model.summary() - assert result.shape == (1, 12) - assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) +@pytest.mark.parametrize("id_col_name", [None, "id"]) +def test_arima_plus_summary( + time_series_arima_plus_model: forecasting.ARIMAPlus, + time_series_arima_plus_model_w_id: forecasting.ARIMAPlus, + id_col_name, +): + result = ( + time_series_arima_plus_model_w_id + if id_col_name + else time_series_arima_plus_model + ).summary() + assert result.shape == ((2, 13) if id_col_name else (1, 12)) + expected_columns = ( + [id_col_name] + ARIMA_EVALUATE_OUTPUT_COL + if id_col_name + else ARIMA_EVALUATE_OUTPUT_COL + ) + assert all(column in result.columns for column in expected_columns) +@pytest.mark.parametrize("id_col_name", [None, 
"id"]) def test_arima_plus_summary_show_all_candidates( time_series_arima_plus_model: forecasting.ARIMAPlus, + time_series_arima_plus_model_w_id: forecasting.ARIMAPlus, + id_col_name, ): - result = time_series_arima_plus_model.summary( + result = ( + time_series_arima_plus_model_w_id + if id_col_name + else time_series_arima_plus_model + ).summary( show_all_candidate_models=True, ) assert result.shape[0] > 1 - assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) + expected_columns = ( + [id_col_name] + ARIMA_EVALUATE_OUTPUT_COL + if id_col_name + else ARIMA_EVALUATE_OUTPUT_COL + ) + assert all(column in result.columns for column in expected_columns) +@pytest.mark.parametrize("id_col_name", [None, "id"]) def test_arima_plus_score_series( - time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df + time_series_arima_plus_model: forecasting.ARIMAPlus, + time_series_arima_plus_model_w_id: forecasting.ARIMAPlus, + new_time_series_df, + new_time_series_df_w_id, + id_col_name, ): - result = time_series_arima_plus_model.score( - new_time_series_df["parsed_date"], new_time_series_df["total_visits"] - ).to_pandas() - expected = pd.DataFrame( - { - "mean_absolute_error": [154.742547], - "mean_squared_error": [26844.868855], - "root_mean_squared_error": [163.844038], - "mean_absolute_percentage_error": [6.189702], - "symmetric_mean_absolute_percentage_error": [6.097155], - }, - dtype="Float64", - ) + if id_col_name: + result = time_series_arima_plus_model_w_id.score( + new_time_series_df_w_id["parsed_date"], + new_time_series_df_w_id["total_visits"], + new_time_series_df_w_id["id"], + ).to_pandas() + else: + result = time_series_arima_plus_model.score( + new_time_series_df["parsed_date"], new_time_series_df["total_visits"] + ).to_pandas() + if id_col_name: + expected = pd.DataFrame( + { + "id": ["2", "1"], + "mean_absolute_error": [120.011007, 120.011007], + "mean_squared_error": [14562.562359, 14562.562359], + "root_mean_squared_error": [120.675442, 120.675442], + "mean_absolute_percentage_error": [4.80044, 4.80044], + "symmetric_mean_absolute_percentage_error": [4.744332, 4.744332], + }, + dtype="Float64", + ) + expected["id"] = expected["id"].astype(str).str.replace(r"\.0$", "", regex=True) + expected["id"] = expected["id"].astype("string[pyarrow]") + else: + expected = pd.DataFrame( + { + "mean_absolute_error": [120.0110074], + "mean_squared_error": [14562.5623594], + "root_mean_squared_error": [120.675442], + "mean_absolute_percentage_error": [4.80044], + "symmetric_mean_absolute_percentage_error": [4.744332], + }, + dtype="Float64", + ) pd.testing.assert_frame_equal( result, expected, @@ -271,7 +584,21 @@ def test_arima_plus_score_series( ) -def test_arima_plus_summary_series(time_series_arima_plus_model: forecasting.ARIMAPlus): - result = time_series_arima_plus_model.summary() - assert result.shape == (1, 12) - assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) +@pytest.mark.parametrize("id_col_name", [None, "id"]) +def test_arima_plus_summary_series( + time_series_arima_plus_model: forecasting.ARIMAPlus, + time_series_arima_plus_model_w_id: forecasting.ARIMAPlus, + id_col_name, +): + result = ( + time_series_arima_plus_model_w_id + if id_col_name + else time_series_arima_plus_model + ).summary() + assert result.shape == ((2, 13) if id_col_name else (1, 12)) + expected_columns = ( + [id_col_name] + ARIMA_EVALUATE_OUTPUT_COL + if id_col_name + else ARIMA_EVALUATE_OUTPUT_COL + ) + assert all(column in result.columns for column in 
expected_columns)


From d8ab7728ecd184aea60480cf8cdf052e9499b66f Mon Sep 17 00:00:00 2001
From: Shenyang Cai
Date: Tue, 28 Jan 2025 15:25:34 -0800
Subject: [PATCH 09/38] chore: fix ibis reference in bigframes_vendored (#1333)

---
 third_party/bigframes_vendored/ibis/expr/types/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/bigframes_vendored/ibis/expr/types/core.py b/third_party/bigframes_vendored/ibis/expr/types/core.py
index 9e1853fe52..9685e4ddca 100644
--- a/third_party/bigframes_vendored/ibis/expr/types/core.py
+++ b/third_party/bigframes_vendored/ibis/expr/types/core.py
@@ -79,7 +79,7 @@ def _interactive_repr(self) -> str:
         return capture.get().rstrip()
 
     def __repr__(self) -> str:
-        if ibis.options.interactive:
+        if bigframes_vendored.ibis.options.interactive:
             return self._interactive_repr()
         else:
             return self._noninteractive_repr()

From c5eac015417e8933b5a4aec03e39b3d72f0b6aee Mon Sep 17 00:00:00 2001
From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com>
Date: Tue, 28 Jan 2025 16:42:23 -0800
Subject: [PATCH 10/38] chore: fix experimental blob to create connections if
 not exist (#1334)

---
 bigframes/ml/llm.py             | 92 +++++----------------------------
 bigframes/ml/remote.py          | 28 ++--------
 bigframes/operations/strings.py |  8 +--
 bigframes/session/__init__.py   | 30 +++++++++--
 4 files changed, 44 insertions(+), 114 deletions(-)

diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py
index bdefc793f9..d2e97a7608 100644
--- a/bigframes/ml/llm.py
+++ b/bigframes/ml/llm.py
@@ -643,37 +643,16 @@ def __init__(
     ):
         self.model_name = model_name
         self.session = session or global_session.get_global_session()
-        self._bq_connection_manager = self.session.bqconnectionmanager
-
-        connection_name = connection_name or self.session._bq_connection
-        self.connection_name = clients.resolve_full_bq_connection_name(
-            connection_name,
-            default_project=self.session._project,
-            default_location=self.session._location,
-        )
+        self.connection_name = connection_name
 
         self._bqml_model_factory = globals.bqml_model_factory()
         self._bqml_model: core.BqmlModel = self._create_bqml_model()
 
     def _create_bqml_model(self):
        # Parse and create connection if needed.
-        if not self.connection_name:
-            raise ValueError(
-                "Must provide connection_name, either in constructor or through session options."
-            )
-
-        if self._bq_connection_manager:
-            connection_name_parts = self.connection_name.split(".")
-            if len(connection_name_parts) != 3:
-                raise ValueError(
-                    f"connection_name must be of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>, got {self.connection_name}."
-                )
-            self._bq_connection_manager.create_bq_connection(
-                project_id=connection_name_parts[0],
-                location=connection_name_parts[1],
-                connection_id=connection_name_parts[2],
-                iam_role="aiplatform.user",
-            )
+        self.connection_name = self.session._create_bq_connection(
+            connection=self.connection_name, iam_role="aiplatform.user"
+        )
 
         if self.model_name not in _TEXT_EMBEDDING_ENDPOINTS:
             msg = _MODEL_NOT_SUPPORTED_WARNING.format(
@@ -828,37 +807,16 @@ def __init__(
         self.model_name = model_name
         self.session = session or global_session.get_global_session()
         self.max_iterations = max_iterations
-        self._bq_connection_manager = self.session.bqconnectionmanager
-
-        connection_name = connection_name or self.session._bq_connection
-        self.connection_name = clients.resolve_full_bq_connection_name(
-            connection_name,
-            default_project=self.session._project,
-            default_location=self.session._location,
-        )
+        self.connection_name = connection_name
 
         self._bqml_model_factory = globals.bqml_model_factory()
         self._bqml_model: core.BqmlModel = self._create_bqml_model()
 
     def _create_bqml_model(self):
         # Parse and create connection if needed.
-        if not self.connection_name:
-            raise ValueError(
-                "Must provide connection_name, either in constructor or through session options."
-            )
-
-        if self._bq_connection_manager:
-            connection_name_parts = self.connection_name.split(".")
-            if len(connection_name_parts) != 3:
-                raise ValueError(
-                    f"connection_name must be of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>, got {self.connection_name}."
-                )
-            self._bq_connection_manager.create_bq_connection(
-                project_id=connection_name_parts[0],
-                location=connection_name_parts[1],
-                connection_id=connection_name_parts[2],
-                iam_role="aiplatform.user",
-            )
+        self.connection_name = self.session._create_bq_connection(
+            connection=self.connection_name, iam_role="aiplatform.user"
+        )
 
         if self.model_name not in _GEMINI_ENDPOINTS:
             msg = _MODEL_NOT_SUPPORTED_WARNING.format(
@@ -953,10 +911,7 @@ def fit(
             options["prompt_col"] = X.columns.tolist()[0]
 
         self._bqml_model = self._bqml_model_factory.create_llm_remote_model(
-            X,
-            y,
-            options=options,
-            connection_name=self.connection_name,
+            X, y, options=options, connection_name=cast(str, self.connection_name)
         )
        return self
 
@@ -1179,37 +1134,16 @@ def __init__(
    ):
         self.model_name = model_name
         self.session = session or global_session.get_global_session()
-        self._bq_connection_manager = self.session.bqconnectionmanager
-
-        connection_name = connection_name or self.session._bq_connection
-        self.connection_name = clients.resolve_full_bq_connection_name(
-            connection_name,
-            default_project=self.session._project,
-            default_location=self.session._location,
-        )
+        self.connection_name = connection_name
 
         self._bqml_model_factory = globals.bqml_model_factory()
         self._bqml_model: core.BqmlModel = self._create_bqml_model()
 
     def _create_bqml_model(self):
         # Parse and create connection if needed.
-        if not self.connection_name:
-            raise ValueError(
-                "Must provide connection_name, either in constructor or through session options."
-            )
-
-        if self._bq_connection_manager:
-            connection_name_parts = self.connection_name.split(".")
-            if len(connection_name_parts) != 3:
-                raise ValueError(
-                    f"connection_name must be of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>, got {self.connection_name}."
-                )
-            self._bq_connection_manager.create_bq_connection(
-                project_id=connection_name_parts[0],
-                location=connection_name_parts[1],
-                connection_id=connection_name_parts[2],
-                iam_role="aiplatform.user",
-            )
+        self.connection_name = self.session._create_bq_connection(
+            connection=self.connection_name, iam_role="aiplatform.user"
+        )
 
         if self.model_name not in _CLAUDE_3_ENDPOINTS:
             msg = _MODEL_NOT_SUPPORTED_WARNING.format(
diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py
index 21a3a50421..6ee6840656 100644
--- a/bigframes/ml/remote.py
+++ b/bigframes/ml/remote.py
@@ -19,7 +19,6 @@
 from typing import Mapping, Optional
 import warnings
 
-from bigframes import clients
 from bigframes.core import global_session, log_adapter
 import bigframes.dataframe
 from bigframes.ml import base, core, globals, utils
@@ -63,35 +62,16 @@ def __init__(
         self.session = session or global_session.get_global_session()
         self._bq_connection_manager = self.session.bqconnectionmanager
 
-        connection_name = connection_name or self.session._bq_connection
-        self.connection_name = clients.resolve_full_bq_connection_name(
-            connection_name,
-            default_project=self.session._project,
-            default_location=self.session._location,
-        )
+        self.connection_name = connection_name
 
         self._bqml_model_factory = globals.bqml_model_factory()
         self._bqml_model: core.BqmlModel = self._create_bqml_model()
 
     def _create_bqml_model(self):
         # Parse and create connection if needed.
-        if not self.connection_name:
-            raise ValueError(
-                "Must provide connection_name, either in constructor or through session options."
-            )
-
-        if self._bq_connection_manager:
-            connection_name_parts = self.connection_name.split(".")
-            if len(connection_name_parts) != 3:
-                raise ValueError(
-                    f"connection_name must be of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>, got {self.connection_name}."
-                )
-            self._bq_connection_manager.create_bq_connection(
-                project_id=connection_name_parts[0],
-                location=connection_name_parts[1],
-                connection_id=connection_name_parts[2],
-                iam_role="aiplatform.user",
-            )
+        self.connection_name = self.session._create_bq_connection(
+            connection=self.connection_name, iam_role="aiplatform.user"
+        )
 
         options = {
             "endpoint": self.endpoint,
diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py
index 2b094064cd..46d4344499 100644
--- a/bigframes/operations/strings.py
+++ b/bigframes/operations/strings.py
@@ -20,7 +20,6 @@
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.strings.accessor as vendorstr
 
-from bigframes import clients
 from bigframes.core import log_adapter
 import bigframes.dataframe as df
 import bigframes.operations as ops
@@ -306,11 +305,8 @@ def to_blob(self, connection: Optional[str] = None) -> series.Series:
             raise NotImplementedError()
 
         session = self._block.session
-        connection = connection or session._bq_connection
-        connection = clients.resolve_full_bq_connection_name(
-            connection,
-            default_project=session._project,
-            default_location=session._location,
+        connection = session._create_bq_connection(
+            connection=connection, iam_role="storage.objectUser"
         )
 
         return self._apply_binary_op(connection, ops.obj_make_ref_op)
diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
index d512a22915..693a036734 100644
--- a/bigframes/session/__init__.py
+++ b/bigframes/session/__init__.py
@@ -1647,11 +1647,8 @@ def from_glob_path(
             raise NotImplementedError()
 
         # TODO(garrettwu): switch to pseudocolumn when b/374988109 is done.
- connection = connection or self._bq_connection - connection = bigframes.clients.resolve_full_bq_connection_name( - connection, - default_project=self._project, - default_location=self._location, + connection = self._create_bq_connection( + connection=connection, iam_role="storage.objectUser" ) table = self._create_object_table(path, connection) @@ -1659,6 +1656,29 @@ def from_glob_path( s = self.read_gbq(table)["uri"].str.to_blob(connection) return s.rename(name).to_frame() + def _create_bq_connection( + self, iam_role: str, *, connection: Optional[str] = None + ) -> str: + """Create the connection with the session settings and try to attach iam role to the connection SA. + If any of project, location or connection isn't specified, use the session defaults. Returns fully-qualified connection name.""" + connection = self._bq_connection if not connection else connection + connection = bigframes.clients.resolve_full_bq_connection_name( + connection_name=connection, + default_project=self._project, + default_location=self._location, + ) + connection_parts = connection.split(".") + assert len(connection_parts) == 3 + + self.bqconnectionmanager.create_bq_connection( + project_id=connection_parts[0], + location=connection_parts[1], + connection_id=connection_parts[2], + iam_role=iam_role, + ) + + return connection + def read_gbq_object_table( self, object_table: str, *, name: Optional[str] = None ) -> dataframe.DataFrame: From e2382b2a98522b89d9d552017a27734e7196d4a8 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 28 Jan 2025 22:06:13 -0800 Subject: [PATCH 11/38] chore: fix experimental blob ObjRefDtype (#1335) --- bigframes/dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 4db124134a..016444032f 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -79,7 +79,7 @@ ), pa.field( "details", - pa.large_string(), # JSON + db_dtypes.JSONArrowType(), ), ) ) From c22126b846db428d21c0f5cbd2d439ecc56365b2 Mon Sep 17 00:00:00 2001 From: rey-esp Date: Wed, 29 Jan 2025 11:26:02 -0600 Subject: [PATCH 12/38] docs: add snippet to explain the univariate model's forecast result in the Forecast a single time series with a univariate model tutorial (#1272) * docs: add snippet to explain the univariate model's forecast result * docs: add snippet to explain the univariate model's forecast result --- ..._multiple_timeseries_forecasting_model_test.py | 1 + ...te_single_timeseries_forecasting_model_test.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/samples/snippets/create_multiple_timeseries_forecasting_model_test.py b/samples/snippets/create_multiple_timeseries_forecasting_model_test.py index e414fdea9c..b749c37d50 100644 --- a/samples/snippets/create_multiple_timeseries_forecasting_model_test.py +++ b/samples/snippets/create_multiple_timeseries_forecasting_model_test.py @@ -17,6 +17,7 @@ def test_multiple_timeseries_forecasting_model(random_model_id: str) -> None: your_model_id = random_model_id # [START bigquery_dataframes_bqml_arima_multiple_step_2_visualize] + import bigframes.pandas as bpd df = bpd.read_gbq("bigquery-public-data.new_york.citibike_trips") diff --git a/samples/snippets/create_single_timeseries_forecasting_model_test.py b/samples/snippets/create_single_timeseries_forecasting_model_test.py index 60b8d13149..9965da2817 100644 --- a/samples/snippets/create_single_timeseries_forecasting_model_test.py +++ 
b/samples/snippets/create_single_timeseries_forecasting_model_test.py
@@ -104,7 +104,22 @@ def test_create_single_timeseries() -> None:
     #   25 2017-08-27 00:00:00+00:00 1853.735689 410.596551 0.8 1327.233216 2380.238162 1327.233216 2380.238162
     #    1 2017-08-03 00:00:00+00:00 2621.33159 241.093355 0.8 2312.180802 2930.482379 2312.180802 2930.482379
     # [END bigquery_dataframes_single_timeseries_forecasting_model_tutorial_forecast]
+
+    # [START bigquery_dataframes_single_timeseries_forecasting_model_tutorial_explain_forecast]
+    ex_pred = model.predict_explain(horizon=30, confidence_level=0.8)
+
+    print(ex_pred.head(4))
+    # Expected output:
+    #   time_series_timestamp time_series_type time_series_data time_series_adjusted_data standard_error confidence_level prediction_interval_lower_bound prediction_interval_upper_bound trend seasonal_period_yearly seasonal_period_quarterly seasonal_period_monthly seasonal_period_weekly seasonal_period_daily holiday_effect spikes_and_dips step_changes residual
+    # 0 2016-08-01 00:00:00+00:00 history 1711.0 505.716474 206.939556 0.0 169.611938 1205.283526 336.104536
+    # 1 2016-08-02 00:00:00+00:00 history 2140.0 623.137701 206.939556 336.104428 287.033273 1205.283526 311.578773
+    # 2 2016-08-03 00:00:00+00:00 history 2890.0 1008.655091 206.939556 563.514213 445.140878 1205.283526 676.061383
+    # 3 2016-08-04 00:00:00+00:00 history 3161.0 1389.40959 206.939556 986.317236 403.092354 1205.283526 566.306884
+    # 4 2016-08-05 00:00:00+00:00 history 2702.0 1394.395741 206.939556 1248.707386 145.688355 1205.283526 102.320733
+    # 5 2016-08-06 00:00:00+00:00 history 1663.0 437.09243 206.939556 1188.59004 -751.49761 1205.283526 20.624044
+    # [END bigquery_dataframes_single_timeseries_forecasting_model_tutorial_explain_forecast]
     assert coef is not None
+    assert ex_pred is not None
     assert summary is not None
     assert model is not None
     assert parsed_date is not None

From 533db9685d159de2bc76307b0e0add676bd679a0 Mon Sep 17 00:00:00 2001
From: Arwa Sharif <146148342+arwas11@users.noreply.github.com>
Date: Wed, 29 Jan 2025 12:04:57 -0600
Subject: [PATCH 13/38] deps: Add support for Python 3.13 for everything but
 remote functions (#1307)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* chore: Add support for Python 3.13
* update CONTRIBUTING.rst with the new version
* skip sklearn import
* skip api coverage tests because of sklearn NameError
* skip remote function tests
* skip all remote function tests and doctests
* update doctest python version to 3.12 in noxfile
* Update noxfile.py
* fix the system_pre-release error
* skip remote functions notebooks on 3.13 tests
* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Tim Sweña (Swast)
Co-authored-by: Owl Bot
---
 .github/workflows/unittest.yml                 |  2 +-
 CONTRIBUTING.rst                               | 10 +--
 bigframes/core/expression.py                   | 13 ++++
 bigframes/session/__init__.py                  |  2 +-
 .../getting_started_bq_dataframes.ipynb        | 14 ++++
 notebooks/location/regionalized.ipynb          | 14 ++++
 .../remote_functions/remote_function.ipynb     | 15 ++++
 .../remote_function_usecases.ipynb             | 14 ++++
 .../remote_function_vertex_claude_model.ipynb  | 14 ++++
 noxfile.py                                     | 70 +++++++++++--------
 owlbot.py                                      |  4 +-
 scripts/test_publish_api_coverage.py           |  8 +++
 setup.py                                       |  1 +
 tests/system/large/test_remote_function.py     |  7 ++
 .../bigframes_vendored/pandas/core/series.py   |  4 +-
 15 files changed, 154 insertions(+), 38 deletions(-)

diff --git 
a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 8659d83d82..a7805de447 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-22.04 strategy: matrix: - python: ['3.9', '3.10', '3.11', '3.12'] + python: ['3.9', '3.10', '3.11', '3.12', '3.13'] steps: - name: Checkout uses: actions/checkout@v4 diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 8d68e4fc27..5374e7e377 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -22,7 +22,7 @@ In order to add a feature: documentation. - The feature must work fully on the following CPython versions: - 3.9, 3.10, 3.11 and 3.12 on both UNIX and Windows. + 3.9, 3.10, 3.11, 3.12 and 3.13 on both UNIX and Windows. - The feature must not add unnecessary dependencies (where "unnecessary" is of course subjective, but new dependencies should @@ -72,7 +72,7 @@ We use `nox `__ to instrument our tests. - To run a single unit test:: - $ nox -s unit-3.12 -- -k + $ nox -s unit-3.13 -- -k .. note:: @@ -143,12 +143,12 @@ Running System Tests $ nox -s system # Run a single system test - $ nox -s system-3.12 -- -k + $ nox -s system-3.13 -- -k .. note:: - System tests are only configured to run under Python 3.9, 3.11 and 3.12. + System tests are only configured to run under Python 3.9, 3.11, 3.12 and 3.13. For expediency, we do not run them in older versions of Python 3. This alone will not run the tests. You'll need to change some local @@ -262,11 +262,13 @@ We support: - `Python 3.10`_ - `Python 3.11`_ - `Python 3.12`_ +- `Python 3.13`_ .. _Python 3.9: https://docs.python.org/3.9/ .. _Python 3.10: https://docs.python.org/3.10/ .. _Python 3.11: https://docs.python.org/3.11/ .. _Python 3.12: https://docs.python.org/3.12/ +.. _Python 3.13: https://docs.python.org/3.13/ Supported versions can be found in our ``noxfile.py`` `config`_. diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py index 9173bebfc4..3ffccc94ac 100644 --- a/bigframes/core/expression.py +++ b/bigframes/core/expression.py @@ -20,6 +20,8 @@ import typing from typing import Mapping, TypeVar, Union +import pandas as pd + import bigframes.core.identifiers as ids import bigframes.dtypes as dtypes import bigframes.operations @@ -253,6 +255,17 @@ def is_bijective(self) -> bool: # () <-> value return True + def __eq__(self, other): + if not isinstance(other, ScalarConstantExpression): + return False + + # With python 3.13 and the pre-release version of pandas, + # NA == NA is NA instead of True + if pd.isna(self.value) and pd.isna(other.value): # type: ignore + return self.dtype == other.dtype + + return self.value == other.value and self.dtype == other.dtype + @dataclasses.dataclass(frozen=True) class UnboundVariableExpression(Expression): diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 693a036734..0977c48ea7 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1482,7 +1482,7 @@ def read_gbq_function( 2 TestCad$123456Str dtype: string - Another use case is to define your own remote funtion and use it later. + Another use case is to define your own remote function and use it later. 
For example, define the remote function: >>> @bpd.remote_function() diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index 38ce75cc25..c5deeef1c5 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -1448,6 +1448,20 @@ "Running your own Python functions (or being able to bring your packages) and using them at scale is a challenge many data scientists face. BigQuery DataFrames makes it easy to deploy [remote functions](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.pandas#bigframes_pandas_remote_function) that run scalar Python functions at BigQuery scale. These functions are persisted as [BigQuery remote functions](https://cloud.google.com/bigquery/docs/remote-functions) that you can then re-use." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "# Python 3.13 is not yet a supported runtime for remote functions.\n", + "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n", + "if sys.version_info >= (3, 13, 0):\n", + " sys.exit(0)" + ] + }, { "cell_type": "markdown", "metadata": { diff --git a/notebooks/location/regionalized.ipynb b/notebooks/location/regionalized.ipynb index 5a8239a42a..1b138c6a66 100644 --- a/notebooks/location/regionalized.ipynb +++ b/notebooks/location/regionalized.ipynb @@ -1339,6 +1339,20 @@ "# Using the Remote Functions" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "# Python 3.13 is not yet a supported runtime for remote functions.\n", + "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n", + "if sys.version_info >= (3, 13, 0):\n", + " sys.exit(0)" + ] + }, { "attachments": {}, "cell_type": "markdown", diff --git a/notebooks/remote_functions/remote_function.ipynb b/notebooks/remote_functions/remote_function.ipynb index 1c1048d356..2114311e10 100644 --- a/notebooks/remote_functions/remote_function.ipynb +++ b/notebooks/remote_functions/remote_function.ipynb @@ -1,5 +1,20 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "bcff4fc4", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "# Python 3.13 is not yet a supported runtime for remote functions.\n", + "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n", + "if sys.version_info >= (3, 13, 0):\n", + " sys.exit(0)" + ] + }, { "cell_type": "code", "execution_count": 19, diff --git a/notebooks/remote_functions/remote_function_usecases.ipynb b/notebooks/remote_functions/remote_function_usecases.ipynb index b897def4e8..d4dde6e6b1 100644 --- a/notebooks/remote_functions/remote_function_usecases.ipynb +++ b/notebooks/remote_functions/remote_function_usecases.ipynb @@ -21,6 +21,20 @@ "# limitations under the License." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "# Python 3.13 is not yet a supported runtime for remote functions.\n", + "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n", + "if sys.version_info >= (3, 13, 0):\n", + " sys.exit(0)" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb index 641a30e104..78f0d27474 100644 --- a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb +++ b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb @@ -28,6 +28,20 @@ "" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "# Python 3.13 is not yet a supported runtime for remote functions.\n", + "# See: https://cloud.google.com/functions/docs/runtime-support#python for the supported runtimes.\n", + "if sys.version_info >= (3, 13, 0):\n", + " sys.exit(0)" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/noxfile.py b/noxfile.py index 5fcf1d6cdc..38e5fab1a6 100644 --- a/noxfile.py +++ b/noxfile.py @@ -24,7 +24,6 @@ import shutil import time from typing import Dict, List -import warnings import nox import nox.sessions @@ -32,6 +31,9 @@ BLACK_VERSION = "black==22.3.0" ISORT_VERSION = "isort==5.12.0" +# TODO: switch to 3.13 once remote functions / cloud run adds a runtime for it (internal issue 333742751) +LATEST_FULLY_SUPPORTED_PYTHON = "3.12" + # pytest-retry is not yet compatible with pytest 8.x. # https://github.com/str0zzapreti/pytest-retry/issues/32 PYTEST_VERSION = "pytest<8.0.0dev" @@ -47,7 +49,7 @@ DEFAULT_PYTHON_VERSION = "3.10" -UNIT_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12"] +UNIT_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13"] UNIT_TEST_STANDARD_DEPENDENCIES = [ "mock", "asyncmock", @@ -57,7 +59,6 @@ "pytest-asyncio", "pytest-mock", ] -UNIT_TEST_EXTERNAL_DEPENDENCIES: List[str] = [] UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = [] UNIT_TEST_DEPENDENCIES: List[str] = [] UNIT_TEST_EXTRAS: List[str] = [] @@ -65,7 +66,7 @@ # There are 4 different ibis-framework 9.x versions we want to test against. # 3.10 is needed for Windows tests. -SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12"] +SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.12", "3.13"] SYSTEM_TEST_STANDARD_DEPENDENCIES = [ "jinja2", "mock", @@ -169,14 +170,6 @@ def install_unittest_dependencies(session, install_test_extra, *constraints): standard_deps = UNIT_TEST_STANDARD_DEPENDENCIES + UNIT_TEST_DEPENDENCIES session.install(*standard_deps, *constraints) - if UNIT_TEST_EXTERNAL_DEPENDENCIES: - msg = ( - "'unit_test_external_dependencies' is deprecated. Instead, please " - "use 'unit_test_dependencies' or 'unit_test_local_dependencies'." 
- ) - warnings.warn(msg, DeprecationWarning) - session.install(*UNIT_TEST_EXTERNAL_DEPENDENCIES, *constraints) - if UNIT_TEST_LOCAL_DEPENDENCIES: session.install(*UNIT_TEST_LOCAL_DEPENDENCIES, *constraints) @@ -375,7 +368,7 @@ def system(session: nox.sessions.Session): ) -@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS[-1]) +@nox.session(python=LATEST_FULLY_SUPPORTED_PYTHON) def system_noextras(session: nox.sessions.Session): """Run the system test suite.""" run_system( @@ -386,7 +379,7 @@ def system_noextras(session: nox.sessions.Session): ) -@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS[-1]) +@nox.session(python=LATEST_FULLY_SUPPORTED_PYTHON) def doctest(session: nox.sessions.Session): """Run the system test suite.""" run_system( @@ -762,6 +755,20 @@ def notebook(session: nox.Session): "notebooks/apps/synthetic_data_generation.ipynb", ] + # TODO: remove exception for Python 3.13 cloud run adds a runtime for it (internal issue 333742751) + # TODO: remove exception for Python 3.13 if nbmake adds support for + # sys.exit(0) or pytest.skip(...). + # See: https://github.com/treebeardtech/nbmake/issues/134 + if session.python == "3.13": + denylist.extend( + [ + "notebooks/getting_started/getting_started_bq_dataframes.ipynb", + "notebooks/remote_functions/remote_function_usecases.ipynb", + "notebooks/remote_functions/remote_function_vertex_claude_model.ipynb", + "notebooks/remote_functions/remote_function.ipynb", + ] + ) + # Convert each Path notebook object to a string using a list comprehension. notebooks = [str(nb) for nb in notebooks_list] @@ -769,20 +776,27 @@ def notebook(session: nox.Session): notebooks = list(filter(lambda nb: nb not in denylist, notebooks)) # Regionalized notebooks - notebooks_reg = { - "regionalized.ipynb": [ - "asia-southeast1", - "eu", - "europe-west4", - "southamerica-west1", - "us", - "us-central1", - ] - } - notebooks_reg = { - os.path.join("notebooks/location", nb): regions - for nb, regions in notebooks_reg.items() - } + # TODO: remove exception for Python 3.13 cloud run adds a runtime for it (internal issue 333742751) + # TODO: remove exception for Python 3.13 if nbmake adds support for + # sys.exit(0) or pytest.skip(...). + # See: https://github.com/treebeardtech/nbmake/issues/134 + if session.python == "3.13": + notebooks_reg = {} + else: + notebooks_reg = { + "regionalized.ipynb": [ + "asia-southeast1", + "eu", + "europe-west4", + "southamerica-west1", + "us", + "us-central1", + ] + } + notebooks_reg = { + os.path.join("notebooks/location", nb): regions + for nb, regions in notebooks_reg.items() + } # The pytest --nbmake exits silently with "no tests ran" message if # one of the notebook paths supplied does not exist. 
Let's make sure that diff --git a/owlbot.py b/owlbot.py index 5de70bcad6..10fc47ebd7 100644 --- a/owlbot.py +++ b/owlbot.py @@ -31,8 +31,8 @@ # ---------------------------------------------------------------------------- templated_files = common.py_library( default_python_version="3.10", - unit_test_python_versions=["3.9", "3.10", "3.11", "3.12"], - system_test_python_versions=["3.9", "3.11", "3.12"], + unit_test_python_versions=["3.9", "3.10", "3.11", "3.12", "3.13"], + system_test_python_versions=["3.9", "3.11", "3.12", "3.13"], cov_level=35, intersphinx_dependencies={ "pandas": "https://pandas.pydata.org/pandas-docs/stable/", diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py index 0b87563482..034a266177 100644 --- a/scripts/test_publish_api_coverage.py +++ b/scripts/test_publish_api_coverage.py @@ -25,6 +25,10 @@ def api_coverage_df(): return publish_api_coverage.build_api_coverage_table("my_bf_ver", "my_release_ver") +@pytest.mark.skipif( + sys.version_info >= (3, 13), + reason="Issues with installing sklearn for this test in python 3.13", +) def test_api_coverage_produces_expected_schema(api_coverage_df): if sys.version.split(".")[:2] == ["3", "9"]: pytest.skip( @@ -54,6 +58,10 @@ def test_api_coverage_produces_expected_schema(api_coverage_df): ) +@pytest.mark.skipif( + sys.version_info >= (3, 13), + reason="Issues with installing sklearn for this test in python 3.13", +) def test_api_coverage_produces_missing_parameters(api_coverage_df): """Make sure at least some functions have reported missing parameters.""" assert (api_coverage_df["missing_parameters"].str.len() > 0).any() diff --git a/setup.py b/setup.py index 047da2348c..4386177a5e 100644 --- a/setup.py +++ b/setup.py @@ -126,6 +126,7 @@ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Operating System :: OS Independent", "Topic :: Internet", ], diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index f226143b50..54ba0549a0 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -18,6 +18,7 @@ import math # must keep this at top level to test udf referring global import import os.path import shutil +import sys import tempfile import textwrap @@ -47,6 +48,12 @@ _team_euler = "Team Euler" +pytestmark = pytest.mark.skipif( + sys.version_info >= (3, 13), + reason="Runtime 'python313' is not supported yet. Skip for now.", +) + + def cleanup_remote_function_assets( bigquery_client, cloudfunctions_client, remote_udf, ignore_failures=True ): diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 8b9a76d441..c7d395d276 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -5053,13 +5053,13 @@ def mask(self, cond, other): with corresponding value from other. If cond is callable, it is computed on the Series/DataFrame and should return boolean Series/DataFrame or array. The callable must not change input - Series/DataFrame (though pandas doesn’t check it). + Series/DataFrame (though pandas doesn't check it). other (scalar, Series/DataFrame, or callable): Entries where cond is True are replaced with corresponding value from other. If other is callable, it is computed on the Series/DataFrame and should return scalar or Series/DataFrame. 
The callable must not change input Series/DataFrame (though pandas - doesn’t check it). If not specified, entries will be filled with + doesn't check it). If not specified, entries will be filled with the corresponding NULL value (np.nan for numpy dtypes, pd.NA for extension dtypes). From db087b0bfe4b3ba965682d620079c923e098e362 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 29 Jan 2025 11:42:29 -0800 Subject: [PATCH 14/38] perf: Improve isin performance (#1203) --- bigframes/core/__init__.py | 12 ++ bigframes/core/blocks.py | 16 +-- bigframes/core/compile/compiler.py | 12 ++ bigframes/core/compile/isin.py | 71 ++++++++++ bigframes/core/nodes.py | 161 ++++++++++++++++++++++- bigframes/core/rewrite/implicit_align.py | 117 ++++++++++------ bigframes/core/rewrite/legacy_align.py | 2 +- bigframes/core/rewrite/order.py | 9 ++ tests/system/small/test_series.py | 45 +++++++ 9 files changed, 388 insertions(+), 57 deletions(-) create mode 100644 bigframes/core/compile/isin.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index f573a5bbb3..5f64bf68dd 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -417,6 +417,18 @@ def project_window_op( output_name, ) + def isin( + self, other: ArrayValue, lcol: str, rcol: str + ) -> typing.Tuple[ArrayValue, str]: + node = nodes.InNode( + self.node, + other.node, + ex.deref(lcol), + ex.deref(rcol), + indicator_col=ids.ColumnId.unique(), + ) + return ArrayValue(node), node.indicator_col.name + def relational_join( self, other: ArrayValue, diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index e6d0480114..382be72340 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2036,23 +2036,15 @@ def isin(self, other: Block): return block def _isin_inner(self: Block, col: str, unique_values: core.ArrayValue) -> Block: - unique_values, const = unique_values.create_constant( - True, dtype=bigframes.dtypes.BOOL_DTYPE - ) - expr, (l_map, r_map) = self._expr.relational_join( - unique_values, ((col, unique_values.column_ids[0]),), type="left" - ) - expr, matches = expr.project_to_id(ops.notnull_op.as_expr(r_map[const])) + expr, matches = self._expr.isin(unique_values, col, unique_values.column_ids[0]) - new_index_cols = tuple(l_map[idx_col] for idx_col in self.index_columns) new_value_cols = tuple( - l_map[val_col] if val_col != col else matches - for val_col in self.value_columns + val_col if val_col != col else matches for val_col in self.value_columns ) - expr = expr.select_columns((*new_index_cols, *new_value_cols)) + expr = expr.select_columns((*self.index_columns, *new_value_cols)) return Block( expr, - index_columns=new_index_cols, + index_columns=self.index_columns, column_labels=self.column_labels, index_labels=self._index_labels, ) diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 99f2aaf15b..0d047b366e 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -28,6 +28,7 @@ import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.explode import bigframes.core.compile.ibis_types +import bigframes.core.compile.isin import bigframes.core.compile.scalar_op_compiler import bigframes.core.compile.scalar_op_compiler as compile_scalar import bigframes.core.compile.schema_translator @@ -128,6 +129,17 @@ def compile_join(self, node: nodes.JoinNode): conditions=condition_pairs, ) + @_compile_node.register + def compile_isin(self, node: nodes.InNode): + left_unordered = 
self.compile_node(node.left_child) + right_unordered = self.compile_node(node.right_child) + return bigframes.core.compile.isin.isin_unordered( + left=left_unordered, + right=right_unordered, + indicator_col=node.indicator_col.sql, + conditions=(node.left_col.id.sql, node.right_col.id.sql), + ) + @_compile_node.register def compile_fromrange(self, node: nodes.FromRangeNode): # Both start and end are single elements and do not inherently have an order diff --git a/bigframes/core/compile/isin.py b/bigframes/core/compile/isin.py new file mode 100644 index 0000000000..29acf9e284 --- /dev/null +++ b/bigframes/core/compile/isin.py @@ -0,0 +1,71 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helpers to join ArrayValue objects.""" + +from __future__ import annotations + +import itertools +from typing import Tuple + +import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes +import bigframes_vendored.ibis.expr.types as ibis_types + +import bigframes.core.compile.compiled as compiled + + +def isin_unordered( + left: compiled.UnorderedIR, + right: compiled.UnorderedIR, + indicator_col: str, + conditions: Tuple[str, str], +) -> compiled.UnorderedIR: + """Join two expressions by column equality. + + Arguments: + left: Expression for left table to join. + right: Expression for right table to join. + conditions: Id pairs to compare + Returns: + The joined expression. + """ + left_table = left._to_ibis_expr() + right_table = right._to_ibis_expr() + new_column = ( + value_to_join_key(left_table[conditions[0]]) + .isin(value_to_join_key(right_table[conditions[1]])) + .name(indicator_col) + ) + + columns = tuple( + itertools.chain( + (left_table[col.get_name()] for col in left.columns), (new_column,) + ) + ) + + return compiled.UnorderedIR( + left_table, + columns=columns, + ) + + +def value_to_join_key(value: ibis_types.Value): + """Converts nullable values to non-null string SQL will not match null keys together - but pandas does.""" + if not value.type().is_string(): + value = value.cast(ibis_dtypes.str) + return ( + value.fill_null(ibis_types.literal("$NULL_SENTINEL$")) + if hasattr(value, "fill_null") + else value.fillna(ibis_types.literal("$NULL_SENTINEL$")) + ) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index c800525b33..085d52daa6 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -208,6 +208,12 @@ def explicitly_ordered(self) -> bool: """ ... + @functools.cached_property + def height(self) -> int: + if len(self.child_nodes) == 0: + return 0 + return max(child.height for child in self.child_nodes) + 1 + @functools.cached_property def total_variables(self) -> int: return self.variables_introduced + sum( @@ -284,6 +290,34 @@ def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: return self.transform_children(lambda x: x.prune(used_cols)) +class AdditiveNode: + """Definition of additive - if you drop added_fields, you end up with the descendent. + + .. 
code-block:: text + + AdditiveNode (fields: a, b, c; added_fields: c) + | + | additive_base + V + BigFrameNode (fields: a, b) + + """ + + @property + @abc.abstractmethod + def added_fields(self) -> Tuple[Field, ...]: + ... + + @property + @abc.abstractmethod + def additive_base(self) -> BigFrameNode: + ... + + @abc.abstractmethod + def replace_additive_base(self, BigFrameNode): + ... + + @dataclasses.dataclass(frozen=True, eq=False) class UnaryNode(BigFrameNode): child: BigFrameNode @@ -381,6 +415,106 @@ def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): return self +@dataclasses.dataclass(frozen=True, eq=False) +class InNode(BigFrameNode, AdditiveNode): + """ + Special Join Type that only returns rows from the left side, as well as adding a bool column indicating whether a match exists on the right side. + + Modelled separately from join node, as this operation preserves row identity. + """ + + left_child: BigFrameNode + right_child: BigFrameNode + left_col: ex.DerefOp + right_col: ex.DerefOp + indicator_col: bfet_ids.ColumnId + + def _validate(self): + assert not ( + set(self.left_child.ids) & set(self.right_child.ids) + ), "Join ids collide" + + @property + def row_preserving(self) -> bool: + return False + + @property + def non_local(self) -> bool: + return True + + @property + def child_nodes(self) -> typing.Sequence[BigFrameNode]: + return (self.left_child, self.right_child) + + @property + def order_ambiguous(self) -> bool: + return False + + @property + def explicitly_ordered(self) -> bool: + # Preserves left ordering always + return True + + @property + def added_fields(self) -> Tuple[Field, ...]: + return (Field(self.indicator_col, bigframes.dtypes.BOOL_DTYPE),) + + @property + def fields(self) -> Iterable[Field]: + return itertools.chain( + self.left_child.fields, + self.added_fields, + ) + + @functools.cached_property + def variables_introduced(self) -> int: + """Defines the number of variables generated by the current node. 
Used to estimate query planning complexity.""" + return 1 + + @property + def joins(self) -> bool: + return True + + @property + def row_count(self) -> Optional[int]: + return self.left_child.row_count + + @property + def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + return (self.indicator_col,) + + @property + def additive_base(self) -> BigFrameNode: + return self.left_child + + def replace_additive_base(self, node: BigFrameNode): + return dataclasses.replace(self, left_child=node) + + def transform_children( + self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> BigFrameNode: + transformed = dataclasses.replace( + self, left_child=t(self.left_child), right_child=t(self.right_child) + ) + if self == transformed: + # reusing existing object speeds up eq, and saves a small amount of memory + return self + return transformed + + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + return self + + def remap_vars( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> BigFrameNode: + return dataclasses.replace( + self, indicator_col=mappings.get(self.indicator_col, self.indicator_col) + ) + + def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + return dataclasses.replace(self, left_col=self.left_col.remap_column_refs(mappings, allow_partial_bindings=True), right_col=self.right_col.remap_column_refs(mappings, allow_partial_bindings=True)) # type: ignore + + @dataclasses.dataclass(frozen=True, eq=False) class JoinNode(BigFrameNode): left_child: BigFrameNode @@ -926,7 +1060,7 @@ class CachedTableNode(ReadTableNode): # Unary nodes @dataclasses.dataclass(frozen=True, eq=False) -class PromoteOffsetsNode(UnaryNode): +class PromoteOffsetsNode(UnaryNode, AdditiveNode): col_id: bigframes.core.identifiers.ColumnId @property @@ -959,6 +1093,13 @@ def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: def added_fields(self) -> Tuple[Field, ...]: return (Field(self.col_id, bigframes.dtypes.INT_DTYPE),) + @property + def additive_base(self) -> BigFrameNode: + return self.child + + def replace_additive_base(self, node: BigFrameNode): + return dataclasses.replace(self, child=node) + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: if self.col_id not in used_cols: return self.child.prune(used_cols) @@ -1171,7 +1312,7 @@ def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): @dataclasses.dataclass(frozen=True, eq=False) -class ProjectionNode(UnaryNode): +class ProjectionNode(UnaryNode, AdditiveNode): """Assigns new variables (without modifying existing ones)""" assignments: typing.Tuple[ @@ -1212,6 +1353,13 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.assignments) + @property + def additive_base(self) -> BigFrameNode: + return self.child + + def replace_additive_base(self, node: BigFrameNode): + return dataclasses.replace(self, child=node) + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: pruned_assignments = tuple(i for i in self.assignments if i[1] in used_cols) if len(pruned_assignments) == 0: @@ -1378,7 +1526,7 @@ def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): @dataclasses.dataclass(frozen=True, eq=False) -class WindowOpNode(UnaryNode): +class WindowOpNode(UnaryNode, AdditiveNode): expression: ex.Aggregation window_spec: window.WindowSpec output_name: bigframes.core.identifiers.ColumnId @@ -1438,6 +1586,13 @@ def inherits_order(self) -> bool: ) and 
self.expression.op.implicitly_inherits_order return op_inherits_order or self.window_spec.row_bounded + @property + def additive_base(self) -> BigFrameNode: + return self.child + + def replace_additive_base(self, node: BigFrameNode): + return dataclasses.replace(self, child=node) + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: if self.output_name not in used_cols: return self.child.prune(used_cols) diff --git a/bigframes/core/rewrite/implicit_align.py b/bigframes/core/rewrite/implicit_align.py index 41cc1ce82a..1b864fb919 100644 --- a/bigframes/core/rewrite/implicit_align.py +++ b/bigframes/core/rewrite/implicit_align.py @@ -14,7 +14,8 @@ from __future__ import annotations import dataclasses -from typing import Iterable, Optional, Tuple +import itertools +from typing import cast, Optional, Sequence, Set, Tuple import bigframes.core.expression import bigframes.core.guid @@ -24,16 +25,13 @@ import bigframes.core.window_spec import bigframes.operations.aggregations -# Additive nodes leave existing columns completely intact, and only add new columns to the end -ADDITIVE_NODES = ( - bigframes.core.nodes.ProjectionNode, - bigframes.core.nodes.WindowOpNode, - bigframes.core.nodes.PromoteOffsetsNode, -) # Combination of selects and additive nodes can be merged as an explicit keyless "row join" ALIGNABLE_NODES = ( - *ADDITIVE_NODES, bigframes.core.nodes.SelectionNode, + bigframes.core.nodes.ProjectionNode, + bigframes.core.nodes.WindowOpNode, + bigframes.core.nodes.PromoteOffsetsNode, + bigframes.core.nodes.InNode, ) @@ -73,6 +71,7 @@ def get_expression_spec( ( bigframes.core.nodes.WindowOpNode, bigframes.core.nodes.PromoteOffsetsNode, + bigframes.core.nodes.InNode, ), ): if set(expression.column_references).isdisjoint( @@ -85,7 +84,7 @@ def get_expression_spec( return ExpressionSpec(expression, curr_node) else: return ExpressionSpec(expression, curr_node) - curr_node = curr_node.child + curr_node = curr_node.child_nodes[0] def try_row_join( @@ -95,7 +94,7 @@ def try_row_join( ) -> Optional[bigframes.core.nodes.BigFrameNode]: """Joins the two nodes""" divergent_node = first_shared_descendent( - l_node, r_node, descendable_types=ALIGNABLE_NODES + {l_node, r_node}, descendable_types=ALIGNABLE_NODES ) if divergent_node is None: return None @@ -124,11 +123,11 @@ def _linearize_trees( # base case: append tree does not have any divergent nodes to linearize if append_tree == divergent_node: return base_tree - else: - assert isinstance(append_tree, ADDITIVE_NODES) - return append_tree.replace_child( - _linearize_trees(base_tree, append_tree.child) - ) + + assert isinstance(append_tree, bigframes.core.nodes.AdditiveNode) + return append_tree.replace_additive_base( + _linearize_trees(base_tree, append_tree.additive_base) + ) merged_node = _linearize_trees(l_node, r_node) return bigframes.core.nodes.SelectionNode(merged_node, combined_selection) @@ -161,13 +160,40 @@ def pull_up_selection( (bigframes.core.expression.DerefOp(field.id), field.id) for field in node.fields ) - assert isinstance(node, (bigframes.core.nodes.SelectionNode, *ADDITIVE_NODES)) - child_node, child_selections = pull_up_selection( - node.child, stop, rename_vars=rename_vars - ) - mapping = {out: ref.id for ref, out in child_selections} - if isinstance(node, ADDITIVE_NODES): - new_node: bigframes.core.nodes.BigFrameNode = node.replace_child(child_node) + # InNode needs special handling, as its a binary node, but row identity is from left side only. 
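+    # Only the left input carries row identity here: the selection is pulled
+    # up through node.left_child alone, the boolean indicator column is
+    # appended to the pulled-up selection list, and the right (subquery)
+    # input is reused unchanged.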
+ # TODO: Merge code with unary op paths + if isinstance(node, bigframes.core.nodes.InNode): + child_node, child_selections = pull_up_selection( + node.left_child, stop=stop, rename_vars=rename_vars + ) + mapping = {out: ref.id for ref, out in child_selections} + + new_in_node: bigframes.core.nodes.InNode = dataclasses.replace( + node, left_child=child_node + ) + new_in_node = new_in_node.remap_refs(mapping) + if rename_vars: + new_in_node = cast( + bigframes.core.nodes.InNode, + new_in_node.remap_vars( + {node.indicator_col: bigframes.core.identifiers.ColumnId.unique()} + ), + ) + added_selection = ( + bigframes.core.expression.DerefOp(new_in_node.indicator_col), + node.indicator_col, + ) + new_selection = (*child_selections, added_selection) + return new_in_node, new_selection + + if isinstance(node, bigframes.core.nodes.AdditiveNode): + child_node, child_selections = pull_up_selection( + node.additive_base, stop, rename_vars=rename_vars + ) + mapping = {out: ref.id for ref, out in child_selections} + new_node: bigframes.core.nodes.BigFrameNode = node.replace_additive_base( + child_node + ) new_node = new_node.remap_refs(mapping) if rename_vars: var_renames = { @@ -177,7 +203,7 @@ def pull_up_selection( new_node = new_node.remap_vars(var_renames) else: var_renames = {} - assert isinstance(new_node, ADDITIVE_NODES) + assert isinstance(new_node, bigframes.core.nodes.AdditiveNode) added_selections = ( ( bigframes.core.expression.DerefOp(var_renames.get(field.id, field.id)), @@ -188,6 +214,10 @@ def pull_up_selection( new_selection = (*child_selections, *added_selections) return new_node, new_selection elif isinstance(node, bigframes.core.nodes.SelectionNode): + child_node, child_selections = pull_up_selection( + node.child, stop, rename_vars=rename_vars + ) + mapping = {out: ref.id for ref, out in child_selections} new_selection = tuple( ( bigframes.core.expression.DerefOp(mapping[ref.id]), @@ -201,26 +231,31 @@ def pull_up_selection( ## Traversal helpers def first_shared_descendent( - left: bigframes.core.nodes.BigFrameNode, - right: bigframes.core.nodes.BigFrameNode, - descendable_types: Tuple[type[bigframes.core.nodes.UnaryNode], ...], + roots: Set[bigframes.core.nodes.BigFrameNode], + descendable_types: Tuple[type[bigframes.core.nodes.BigFrameNode], ...], ) -> Optional[bigframes.core.nodes.BigFrameNode]: - l_path = tuple(descend(left, descendable_types)) - r_path = tuple(descend(right, descendable_types)) - if l_path[-1] != r_path[-1]: + if not roots: return None + if len(roots) == 1: + return next(iter(roots)) - for l_node, r_node in zip(l_path[-len(r_path) :], r_path[-len(l_path) :]): - if l_node == r_node: - return l_node - # should be impossible, as l_path[-1] == r_path[-1] - raise ValueError() + min_height = min(root.height for root in roots) + def descend( + root: bigframes.core.nodes.BigFrameNode, + ) -> Sequence[bigframes.core.nodes.BigFrameNode]: + # Special case to not descend into right side of IsInNode + if isinstance(root, bigframes.core.nodes.AdditiveNode): + return (root.additive_base,) + return root.child_nodes -def descend( - root: bigframes.core.nodes.BigFrameNode, - descendable_types: Tuple[type[bigframes.core.nodes.UnaryNode], ...], -) -> Iterable[bigframes.core.nodes.BigFrameNode]: - yield root - if isinstance(root, descendable_types): - yield from descend(root.child, descendable_types) + roots_to_descend = set(root for root in roots if root.height > min_height) + if not roots_to_descend: + roots_to_descend = roots + if any(not isinstance(root, descendable_types) 
for root in roots_to_descend): + return None + as_is = roots - roots_to_descend + descended = set( + itertools.chain.from_iterable(descend(root) for root in roots_to_descend) + ) + return first_shared_descendent(as_is.union(descended), descendable_types) diff --git a/bigframes/core/rewrite/legacy_align.py b/bigframes/core/rewrite/legacy_align.py index a671f34bd4..05641130fb 100644 --- a/bigframes/core/rewrite/legacy_align.py +++ b/bigframes/core/rewrite/legacy_align.py @@ -361,5 +361,5 @@ def common_selection_root( ) -> Optional[nodes.BigFrameNode]: """Find common subtree between join subtrees""" return bigframes.core.rewrite.implicit_align.first_shared_descendent( - l_tree, r_tree, descendable_types=LEGACY_REWRITER_NODES + {l_tree, r_tree}, descendable_types=LEGACY_REWRITER_NODES ) diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py index 08593b7a5f..3f8c409b76 100644 --- a/bigframes/core/rewrite/order.py +++ b/bigframes/core/rewrite/order.py @@ -160,6 +160,15 @@ def pull_up_order_inner( elif isinstance(node, bigframes.core.nodes.FilterNode): child_result, child_order = pull_up_order_inner(node.child) return node.replace_child(child_result), child_order.with_non_sequential() + elif isinstance(node, bigframes.core.nodes.InNode): + child_result, child_order = pull_up_order_inner(node.left_child) + subquery_result = remove_order_strict(node.right_child) + return ( + dataclasses.replace( + node, left_child=child_result, right_child=subquery_result + ), + child_order, + ) elif isinstance(node, bigframes.core.nodes.SelectionNode): child_result, child_order = pull_up_order_inner(node.child) selected_ids = set(ref.id for ref, _ in node.input_output_pairs) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index e950ddbc5a..ac9d878432 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1241,6 +1241,51 @@ def test_isin_bigframes_values(scalars_dfs, col_name, test_set, session): ) +@pytest.mark.parametrize( + ( + "col_name", + "test_set", + ), + [ + ( + "int64_col", + [314159, 2.0, 3, pd.NA], + ), + ( + "int64_col", + [2, 55555, 4], + ), + ( + "float64_col", + [-123.456, 1.25, pd.NA], + ), + ( + "int64_too", + [1, 2, pd.NA], + ), + ( + "string_col", + ["Hello, World!", "Hi", "こんにちは"], + ), + ], +) +def test_isin_bigframes_values_as_predicate( + scalars_dfs_maybe_ordered, col_name, test_set +): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + bf_predicate = scalars_df[col_name].isin( + series.Series(test_set, session=scalars_df._session) + ) + bf_result = scalars_df[bf_predicate].to_pandas() + pd_predicate = scalars_pandas_df[col_name].isin(test_set) + pd_result = scalars_pandas_df[pd_predicate] + + pd.testing.assert_frame_equal( + pd_result.reset_index(), + bf_result.reset_index(), + ) + + def test_isnull(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "float64_col" From ef4f491ebefea4b75a5c5fe2c8242d4996977cab Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 29 Jan 2025 11:59:42 -0800 Subject: [PATCH 15/38] chore: fix experimental blob repr without content_type (#1337) --- bigframes/dataframe.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index fec53dbf01..a58edf1962 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -768,11 +768,11 @@ def _repr_html_(self) -> str: def obj_ref_rt_to_html(obj_ref_rt) -> str: obj_ref_rt_json = 
json.loads(obj_ref_rt) + gcs_metadata = obj_ref_rt_json["objectref"]["details"][ + "gcs_metadata" + ] content_type = typing.cast( - str, - obj_ref_rt_json["objectref"]["details"]["gcs_metadata"][ - "content_type" - ], + str, gcs_metadata.get("content_type", "") ) if content_type.startswith("image"): url = obj_ref_rt_json["access_urls"]["read_url"] From 3fef147eca899726a88466611c50df218eff2b47 Mon Sep 17 00:00:00 2001 From: jialuoo Date: Wed, 29 Jan 2025 13:59:31 -0800 Subject: [PATCH 16/38] refactor: rename the bigframes function classes (#1323) * refactor: rename the bigframes function classes * resolve comments * quick fix * fix naming convention --- bigframes/functions/_function_client.py | 4 +++- bigframes/functions/_function_session.py | 16 +++++++-------- bigframes/functions/_utils.py | 26 ++++++++++++------------ bigframes/functions/function.py | 6 +++--- bigframes/session/__init__.py | 2 +- 5 files changed, 28 insertions(+), 26 deletions(-) diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index 104119a510..f5001ff909 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -54,10 +54,12 @@ ) -class RemoteFunctionClient: +class FunctionClient: # Wait time (in seconds) for an IAM binding to take effect after creation _iam_wait_seconds = 120 + # TODO(b/392707725): Convert all necessary parameters for cloud function + # deployment into method parameters. def __init__( self, gcp_project_id, diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index 00626a252f..a0518978a3 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -55,18 +55,18 @@ from . import _function_client, _utils -class RemoteFunctionSession: - """Session to manage remote functions.""" +class FunctionSession: + """Session to manage bigframes functions.""" def __init__(self): - # Session level mapping of remote function artifacts + # Session level mapping of function artifacts self._temp_artifacts: Dict[str, str] = dict() # Lock to synchronize the update of the session artifacts self._artifacts_lock = threading.Lock() def _update_temp_artifacts(self, bqrf_routine: str, gcf_path: str): - """Update remote function artifacts in the current session.""" + """Update function artifacts in the current session.""" with self._artifacts_lock: self._temp_artifacts[bqrf_routine] = gcf_path @@ -76,11 +76,11 @@ def clean_up( gcfclient: functions_v2.FunctionServiceClient, session_id: str, ): - """Delete remote function artifacts in the current session.""" + """Delete function artifacts in the current session.""" with self._artifacts_lock: for bqrf_routine, gcf_path in self._temp_artifacts.items(): - # Let's accept the possibility that the remote function may have - # been deleted directly by the user + # Let's accept the possibility that the function may have been + # deleted directly by the user bqclient.delete_routine(bqrf_routine, not_found_ok=True) # Let's accept the possibility that the cloud function may have @@ -467,7 +467,7 @@ def wrapper(func): signature, input_types, output_type # type: ignore ) - remote_function_client = _function_client.RemoteFunctionClient( + remote_function_client = _function_client.FunctionClient( dataset_ref.project, cloud_function_region, cloud_functions_client, diff --git a/bigframes/functions/_utils.py b/bigframes/functions/_utils.py index 591da01dd0..f1f8c97e7f 100644 --- a/bigframes/functions/_utils.py +++ 
b/bigframes/functions/_utils.py
@@ -30,8 +30,8 @@
 import bigframes.core.compile.ibis_types
 import bigframes.dtypes
 
-# Naming convention for the remote function artifacts
-_BIGFRAMES_REMOTE_FUNCTION_PREFIX = "bigframes"
+# Naming convention for the function artifacts
+_BIGFRAMES_FUNCTION_PREFIX = "bigframes"
 _BQ_FUNCTION_NAME_SEPERATOR = "_"
 _GCF_FUNCTION_NAME_SEPERATOR = "-"
 
@@ -66,10 +66,10 @@ def _get_updated_package_requirements(
 ):
     requirements = [f"cloudpickle=={cloudpickle.__version__}"]
     if is_row_processor:
-        # bigframes remote function will send an entire row of data as json,
-        # which would be converted to a pandas series and processed
-        # Ensure numpy versions match to avoid unpickling problems. See
-        # internal issue b/347934471.
+        # bigframes function will send an entire row of data as json, which
+        # would be converted to a pandas series and processed. Ensure numpy
+        # versions match to avoid unpickling problems. See internal issue
+        # b/347934471.
         requirements.append(f"numpy=={numpy.__version__}")
         requirements.append(f"pandas=={pandas.__version__}")
         requirements.append(f"pyarrow=={pyarrow.__version__}")
@@ -94,14 +94,14 @@ def _clean_up_by_session_id(
     point in time.
     """
 
-    # First clean up the BQ remote functions and then the underlying
-    # cloud functions, so that at no point we are left with a remote function
-    # that is pointing to a cloud function that does not exist
+    # First clean up the BQ remote functions and then the underlying cloud
+    # functions, so that at no point we are left with a remote function that is
+    # pointing to a cloud function that does not exist
     endpoints_to_be_deleted: Set[str] = set()
     match_prefix = "".join(
         [
-            _BIGFRAMES_REMOTE_FUNCTION_PREFIX,
+            _BIGFRAMES_FUNCTION_PREFIX,
             _BQ_FUNCTION_NAME_SEPERATOR,
             session_id,
             _BQ_FUNCTION_NAME_SEPERATOR,
@@ -176,7 +176,7 @@ def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> s
 def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None):
     "Get a name for the cloud function for the given user defined function."
-    parts = [_BIGFRAMES_REMOTE_FUNCTION_PREFIX]
+    parts = [_BIGFRAMES_FUNCTION_PREFIX]
     if session_id:
         parts.append(session_id)
     parts.append(function_hash)
@@ -186,8 +186,8 @@ def get_cloud_function_name(function_hash, session_id=None, uniq_suffix=None):
 
 def get_remote_function_name(function_hash, session_id, uniq_suffix=None):
-    "Get a name for the BQ remote function for the given user defined function."
+    "Get a name for the remote function for the given user defined function." 
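+    # The resulting name looks like "bigframes_<session_id>_<function_hash>",
+    # with "_<uniq_suffix>" appended when provided (parts are joined by
+    # _BQ_FUNCTION_NAME_SEPERATOR, i.e. "_").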
+ parts = [_BIGFRAMES_FUNCTION_PREFIX, session_id, function_hash] if uniq_suffix: parts.append(uniq_suffix) return _BQ_FUNCTION_NAME_SEPERATOR.join(parts) diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index 57df8f9407..ef2c81a953 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -120,11 +120,11 @@ def get_routine_reference( def remote_function(*args, **kwargs): - remote_function_session = bff_session.RemoteFunctionSession() + remote_function_session = bff_session.FunctionSession() return remote_function_session.remote_function(*args, **kwargs) -remote_function.__doc__ = bff_session.RemoteFunctionSession.remote_function.__doc__ +remote_function.__doc__ = bff_session.FunctionSession.remote_function.__doc__ def read_gbq_function( @@ -174,7 +174,7 @@ def read_gbq_function( # The name "args" conflicts with the Ibis operator, so we use # non-standard names for the arguments here. def func(*bigframes_args, **bigframes_kwargs): - f"""Remote function {str(routine_ref)}.""" + f"""Bigframes function {str(routine_ref)}.""" nonlocal node # type: ignore expr = node(*bigframes_args, **bigframes_kwargs) # type: ignore diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 0977c48ea7..95d7b1aa2c 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -245,7 +245,7 @@ def __init__( ) self._metrics = bigframes.session.metrics.ExecutionMetrics() - self._function_session = bff_session.RemoteFunctionSession() + self._function_session = bff_session.FunctionSession() self._temp_storage_manager = ( bigframes.session.temp_storage.TemporaryGbqStorageManager( self._clients_provider.bqclient, From 3e1fa995b7132edfbd7ff51b813921c512a858aa Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Wed, 29 Jan 2025 16:18:41 -0800 Subject: [PATCH 17/38] chore: update tpch q20 (#1339) --- third_party/bigframes_vendored/tpch/queries/q20.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/third_party/bigframes_vendored/tpch/queries/q20.py b/third_party/bigframes_vendored/tpch/queries/q20.py index fc36dd8b82..e5958e96d5 100644 --- a/third_party/bigframes_vendored/tpch/queries/q20.py +++ b/third_party/bigframes_vendored/tpch/queries/q20.py @@ -44,8 +44,6 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): filtered_parts = part[part["P_NAME"].str.startswith(var4)] - if not session._strictly_ordered: - filtered_parts = filtered_parts[["P_PARTKEY"]].sort_values(by=["P_PARTKEY"]) filtered_parts = filtered_parts["P_PARTKEY"].unique(keep_order=False).to_frame() joined_parts = filtered_parts.merge( partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY" From c831c4709843ce5b356b27ee98ecc99f2844542f Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Wed, 29 Jan 2025 17:48:54 -0800 Subject: [PATCH 18/38] test: skip checking llm tuning score labels (#1340) --- tests/system/small/ml/test_llm.py | 2 -- tests/system/utils.py | 8 +++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 29f504443a..90d5e9f1d7 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -868,7 +868,6 @@ def test_llm_palm_score_params(llm_fine_tune_df_default_index): "label", "evaluation_status", ], - index=6, ) @@ -928,7 +927,6 @@ def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index, model_name) "label", 
"evaluation_status", ], - index=6, ) diff --git a/tests/system/utils.py b/tests/system/utils.py index 7c12c8033a..0772468085 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -298,7 +298,7 @@ def assert_pandas_df_equal_pca(actual, expected, **kwargs): def check_pandas_df_schema_and_index( pd_df: pd.DataFrame, columns: Iterable, - index: Union[int, Iterable], + index: Optional[Union[int, Iterable]] = None, col_exact: bool = True, ): """Check pandas df schema and index. But not the values. @@ -306,7 +306,7 @@ def check_pandas_df_schema_and_index( Args: pd_df: the input pandas df columns: target columns to check with - index: int or Iterable. If int, only check the length (index size) of the df. If Iterable, check index values match + index: int or Iterable or None, default None. If int, only check the length (index size) of the df. If Iterable, check index values match. If None, skip checking index. col_exact: If True, check the columns param are exact match. Otherwise only check the df contains all of those columns """ if col_exact: @@ -314,7 +314,9 @@ def check_pandas_df_schema_and_index( else: assert set(columns) <= set(pd_df.columns) - if isinstance(index, int): + if index is None: + pass + elif isinstance(index, int): assert len(pd_df) == index elif isinstance(index, Iterable): assert list(pd_df.index) == list(index) From 7527d3048ab2e392ea6766e1913746f436cffabe Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 29 Jan 2025 22:13:13 -0800 Subject: [PATCH 19/38] refactor: Remove filter deferment from IR (#1336) --- bigframes/core/compile/compiled.py | 137 +++--------------- .../ibis/backends/sql/compilers/base.py | 4 +- 2 files changed, 20 insertions(+), 121 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index a55307e0a4..896f99b9de 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -16,7 +16,7 @@ import functools import itertools import typing -from typing import Collection, Optional, Sequence +from typing import Optional, Sequence import bigframes_vendored.ibis import bigframes_vendored.ibis.backends.bigquery.backend as ibis_bigquery @@ -38,9 +38,6 @@ import bigframes.dtypes import bigframes.operations.aggregations as agg_ops -PREDICATE_COLUMN = "bigframes_predicate" - - op_compiler = op_compilers.scalar_op_compiler @@ -50,11 +47,8 @@ def __init__( self, table: ibis_types.Table, columns: Sequence[ibis_types.Value], - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, ): self._table = table - # Deferred predicates probably no longer needed? - self._predicates = tuple(predicates) if predicates is not None else () # Allow creating a DataFrame directly from an Ibis table expression. # TODO(swast): Validate that each column references the same table (or # no table for literal values). @@ -69,17 +63,6 @@ def __init__( # dictionary mapping names to column values. self._column_names = {column.get_name(): column for column in self._columns} - def builder(self): - """Creates a mutable builder for expressions.""" - # Since ArrayValue is intended to be immutable (immutability offers - # potential opportunities for caching, though we might need to introduce - # more node types for that to be useful), we create a builder class. 
- return UnorderedIR.Builder( - self._table, - columns=self._columns, - predicates=self._predicates, - ) - def to_sql( self, *, @@ -118,15 +101,6 @@ def columns(self) -> typing.Tuple[ibis_types.Value, ...]: def column_ids(self) -> typing.Sequence[str]: return tuple(self._column_names.keys()) - @property - def _reduced_predicate(self) -> typing.Optional[ibis_types.BooleanValue]: - """Returns the frame's predicates as an equivalent boolean value, useful where a single predicate value is preferred.""" - return ( - _reduce_predicate_list(self._predicates).name(PREDICATE_COLUMN) - if self._predicates - else None - ) - @property def _ibis_bindings(self) -> dict[str, ibis_types.Value]: return {col: self._get_ibis_column(col) for col in self.column_ids} @@ -141,9 +115,7 @@ def projection( op_compiler.compile_expression(expression, bindings).name(id) for expression, id in expression_id_pairs ] - builder = self.builder() - builder.columns = tuple([*self._columns, *new_values]) - return builder.build() + return UnorderedIR(self._table, (*self._columns, *new_values)) def selection( self, @@ -155,9 +127,7 @@ def selection( op_compiler.compile_expression(input, bindings).name(id) for input, id in input_output_pairs ] - builder = self.builder() - builder.columns = tuple(values) - return builder.build() + return UnorderedIR(self._table, tuple(values)) def _get_ibis_column(self, key: str) -> ibis_types.Value: """Gets the Ibis expression for a given column.""" @@ -192,7 +162,6 @@ def row_count(self, name: str) -> UnorderedIR: def _to_ibis_expr( self, *, - expose_hidden_cols: bool = False, fraction: Optional[float] = None, ): """ @@ -206,26 +175,12 @@ def _to_ibis_expr( An ibis expression representing the data help by the ArrayValue object. """ columns = list(self._columns) - columns_to_drop: list[ - str - ] = [] # Ordering/Filtering columns that will be dropped at end - - if self._reduced_predicate is not None: - columns.append(self._reduced_predicate) - # Usually drop predicate as it is will be all TRUE after filtering - if not expose_hidden_cols: - columns_to_drop.append(self._reduced_predicate.get_name()) - # Special case for empty tables, since we can't create an empty # projection. 
if not columns: return bigframes_vendored.ibis.memtable([]) table = self._table.select(columns) - base_table = table - if self._reduced_predicate is not None: - table = table.filter(base_table[PREDICATE_COLUMN]) - table = table.drop(*columns_to_drop) if fraction is not None: table = table.filter( bigframes_vendored.ibis.random() < ibis_types.literal(fraction) @@ -233,22 +188,12 @@ def _to_ibis_expr( return table def filter(self, predicate: ex.Expression) -> UnorderedIR: - for ref in predicate.column_references: - ibis_value = self._get_ibis_column(ref.sql) - if is_window(ibis_value): - # ibis doesn't support qualify syntax, so create CTE if filtering over window expression - # https://github.com/ibis-project/ibis/issues/9775 - return self._reproject_to_table().filter(predicate) - - bindings = {col: self._get_ibis_column(col) for col in self.column_ids} - condition = op_compiler.compile_expression(predicate, bindings) - return self._filter(condition) # type:ignore - - def _filter(self, predicate_value: ibis_types.BooleanValue) -> UnorderedIR: - """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - expr = self.builder() - expr.predicates = [*self._predicates, predicate_value] - return expr.build() + table = self._to_ibis_expr() + condition = op_compiler.compile_expression(predicate, table) + table = table.filter(condition) + return UnorderedIR( + table, tuple(table[column_name] for column_name in self._column_names) + ) def aggregate( self, @@ -279,18 +224,18 @@ def aggregate( for aggregate, col_out in aggregations } if by_column_ids: + if dropna: + table = table.filter( + [table[ref.id.sql].notnull() for ref in by_column_ids] + ) result = table.group_by((ref.id.sql for ref in by_column_ids)).aggregate( **stats ) - columns = tuple(result[key] for key in result.columns) - expr = UnorderedIR(result, columns=columns) - if dropna: - for ref in by_column_ids: - expr = expr._filter(expr._compile_expression(ref).notnull()) - return expr + return UnorderedIR( + result, columns=tuple(result[key] for key in result.columns) + ) else: - aggregates = {**stats} - result = table.aggregate(**aggregates) + result = table.aggregate(**stats) return UnorderedIR( result, columns=[result[col_id] for col_id in [*stats.keys()]], @@ -310,19 +255,6 @@ def _uniform_sampling(self, fraction: float) -> UnorderedIR: ) ## Helpers - def _set_or_replace_by_id( - self, id: str, new_value: ibis_types.Value - ) -> UnorderedIR: - builder = self.builder() - if id in self.column_ids: - builder.columns = [ - val if (col_id != id) else new_value.name(id) - for col_id, val in zip(self.column_ids, self._columns) - ] - else: - builder.columns = [*self.columns, new_value.name(id)] - return builder.build() - def _reproject_to_table(self) -> UnorderedIR: """ Internal operators that projects the internal representation into a @@ -338,24 +270,6 @@ def _reproject_to_table(self) -> UnorderedIR: columns=columns, ) - class Builder: - def __init__( - self, - table: ibis_types.Table, - columns: Collection[ibis_types.Value] = (), - predicates: Optional[Collection[ibis_types.BooleanValue]] = None, - ): - self.table = table - self.columns = list(columns) - self.predicates = list(predicates) if predicates is not None else None - - def build(self) -> UnorderedIR: - return UnorderedIR( - table=self.table, - columns=self.columns, - predicates=self.predicates, - ) - @classmethod def from_pandas( cls, @@ -500,8 +414,7 @@ def project_window_op( case_statement = 
case_statement.else_(window_op).end() # type: ignore window_op = case_statement # type: ignore - result = self._set_or_replace_by_id(output_name, window_op) - return result + return UnorderedIR(self._table, (*self.columns, window_op.name(output_name))) def _compile_expression(self, expr: ex.Expression): return op_compiler.compile_expression(expr, self._ibis_bindings) @@ -517,8 +430,6 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec): if window_spec.grouping_keys else [] ) - if self._reduced_predicate is not None: - group_by.append(self._reduced_predicate) # Construct ordering. There are basically 3 main cases # 1. Order-independent op (aggregation, cut, rank) with unbound window - no ordering clause needed @@ -569,18 +480,6 @@ def is_window(column: ibis_types.Value) -> bool: return any(isinstance(op, ibis_ops.WindowFunction) for op in matches) -def _reduce_predicate_list( - predicate_list: typing.Collection[ibis_types.BooleanValue], -) -> ibis_types.BooleanValue: - """Converts a list of predicates BooleanValues into a single BooleanValue.""" - if len(predicate_list) == 0: - raise ValueError("Cannot reduce empty list of predicates") - if len(predicate_list) == 1: - (item,) = predicate_list - return item - return functools.reduce(lambda acc, pred: acc.__and__(pred), predicate_list) - - def _convert_ordering_to_table_values( value_lookup: typing.Mapping[str, ibis_types.Value], ordering_columns: typing.Sequence[OrderingExpression], diff --git a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py index ccd4a57e11..d1ab36c41a 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/compilers/base.py @@ -432,7 +432,7 @@ class SQLGlotCompiler(abc.ABC): ops.IntervalSubtract, ) - NEEDS_PARENS = BINARY_INFIX_OPS + (ops.IsNull,) + NEEDS_PARENS = BINARY_INFIX_OPS + (ops.IsNull, ops.NotNull) # Constructed dynamically in `__init_subclass__` from their respective # UPPERCASE values to handle inheritance, do not modify directly here. 
@@ -1022,7 +1022,7 @@ def visit_IsNull(self, op, *, arg):
         return arg.is_(NULL)
 
     def visit_NotNull(self, op, *, arg):
-        return arg.is_(sg.not_(NULL, copy=False))
+        return self._add_parens(op, arg).is_(sg.not_(NULL, copy=False))
 
     def visit_InValues(self, op, *, value, options):
         return value.isin(*options)

From ce7d92f9decd11de2235348d9fa478ef8a049d84 Mon Sep 17 00:00:00 2001
From: Huan Chen <142538604+Genesis929@users.noreply.github.com>
Date: Thu, 30 Jan 2025 10:02:02 -0800
Subject: [PATCH 20/38] chore: remove 10t benchmark temporarily (#1341)

---
 tests/benchmark/tpch/config.jsonl | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/benchmark/tpch/config.jsonl b/tests/benchmark/tpch/config.jsonl
index e6f7a444f6..779b0fe2d7 100644
--- a/tests/benchmark/tpch/config.jsonl
+++ b/tests/benchmark/tpch/config.jsonl
@@ -6,5 +6,3 @@
 {"benchmark_suffix": "100g_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0100g", "ordered": false}
 {"benchmark_suffix": "1t_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0001t", "ordered": true}
 {"benchmark_suffix": "1t_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0001t", "ordered": false}
-{"benchmark_suffix": "10t_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0010t", "ordered": true}
-{"benchmark_suffix": "10t_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0010t", "ordered": false}

From 05f83d18d276091a1549dbba1f2baf8c91c8c37e Mon Sep 17 00:00:00 2001
From: Arwa Sharif <146148342+arwas11@users.noreply.github.com>
Date: Thu, 30 Jan 2025 13:03:47 -0600
Subject: [PATCH 21/38] docs: Add `GeoSeries` docs (#1327)

* docs: Add GeoSeries docs

* update geoseries notebook

* update docs to point to geopandas and notebook to explain more about geoseries

* correct geoseries path in docs and update notebook data

* update notebook to resolve comments
---
 .../bigframes.geopandas/geoseries.rst         |   2 +-
 docs/templates/toc.yml                        |   4 +
 notebooks/geo/geoseries.ipynb                 | 371 ++++++++++++++++++
 3 files changed, 376 insertions(+), 1 deletion(-)
 create mode 100644 notebooks/geo/geoseries.ipynb

diff --git a/docs/reference/bigframes.geopandas/geoseries.rst b/docs/reference/bigframes.geopandas/geoseries.rst
index 1819613955..91e853b1f8 100644
--- a/docs/reference/bigframes.geopandas/geoseries.rst
+++ b/docs/reference/bigframes.geopandas/geoseries.rst
@@ -11,7 +11,7 @@ GeoSeries
 Series
 ------
 
-.. autoclass:: bigframes.geopandas.geoseries.GeoSeries
+.. 
autoclass:: bigframes.geopandas.GeoSeries :members: :inherited-members: :undoc-members: diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 47d9e97d7a..c17a1788df 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -207,6 +207,10 @@ - name: BigQuery built-in functions uid: bigframes.bigquery name: bigframes.bigquery + - items: + - name: GeoSeries + uid: bigframes.geopandas + name: bigframes.geopandas - items: - name: Overview uid: bigframes.streaming diff --git a/notebooks/geo/geoseries.ipynb b/notebooks/geo/geoseries.ipynb new file mode 100644 index 0000000000..160d19ce91 --- /dev/null +++ b/notebooks/geo/geoseries.ipynb @@ -0,0 +1,371 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analyzing a GEOGRAPHY column with `bigframes.geopandas.GeoSeries`" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes\n", + "import bigframes.geopandas\n", + "import bigframes.pandas as bpd\n", + "import shapely\n", + "bpd.options.display.progress_bar = None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the Counties table from the Census Bureau US Boundaries dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/session/_io/bigquery/read_gbq_table.py:274: DefaultIndexWarning: Table 'bigquery-public-data.geo_us_boundaries.counties' is clustered and/or partitioned, but BigQuery DataFrames was not able to find a suitable index. To avoid this warning, set at least one of: `index_col` or `filters`.\n", + " warnings.warn(msg, category=bfe.DefaultIndexWarning)\n" + ] + } + ], + "source": [ + "df = bpd.read_gbq(\"bigquery-public-data.geo_us_boundaries.counties\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a series from the int_point_geom column" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "point_geom_series = df['int_point_geom']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The `GeoSeries` constructor accepts local data or a `bigframes.pandas.Series` object." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Create a GeoSeries from local data with `Peek`" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "54 POINT (-93.47523 45.00612)\n", + "256 POINT (-89.60507 42.67552)\n", + "266 POINT (-104.11408 39.31516)\n", + "485 POINT (-91.23193 32.34688)\n", + "765 POINT (-83.42808 38.20427)\n", + "Name: int_point_geom, dtype: geometry" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "five_geo_points = point_geom_series.peek(n = 5)\n", + "five_geo_points" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Convert the five geo points to `bigframes.gopandas.GeoSeries`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Note: TypeError is raised if the GEOGRAPHY column contains geometry type other than `Point`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT (-86.87338 38.37334)\n", + "1 POINT (-118.48037 46.25461)\n", + "2 POINT (-92.5617 32.30429)\n", + "3 POINT (-83.46189 39.55525)\n", + "4 POINT (-119.46779 47.21363)\n", + "dtype: geometry" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geo_points = bigframes.geopandas.GeoSeries(\n", + " [point for point in five_geo_points]\n", + ")\n", + "geo_points" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Retrieve the x (longitude) and y (latitude) from the GeoSeries with `.x` and `.y`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `.x`" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 -86.873385\n", + "1 -118.48037\n", + "2 -92.5617\n", + "3 -83.461893\n", + "4 -119.467788\n", + "dtype: Float64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geo_points.x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `.y`" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 38.373344\n", + "1 46.254606\n", + "2 32.30429\n", + "3 39.555246\n", + "4 47.213633\n", + "dtype: Float64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geo_points.y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Alternatively, use the `.geo` accessor to access GeoSeries methods from a `bigframes.pandas.Series` object." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `geo.x`" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 -101.298265\n", + "1 -99.111085\n", + "2 -66.58687\n", + "3 -102.601791\n", + "4 -71.578625\n", + "5 -88.961529\n", + "6 -87.492986\n", + "7 -82.422666\n", + "8 -100.208166\n", + "9 -85.815939\n", + "10 -101.681133\n", + "11 -119.516659\n", + "12 -89.398306\n", + "13 -107.78848\n", + "14 -91.159306\n", + "15 -113.887042\n", + "16 -83.470416\n", + "17 -98.520146\n", + "18 -83.911718\n", + "19 -87.321865\n", + "20 -91.727626\n", + "21 -93.466093\n", + "22 -101.143324\n", + "23 -78.657634\n", + "24 -94.272323\n", + "dtype: Float64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "point_geom_series.geo.x" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### `geo.y`" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 46.710819\n", + "1 29.353661\n", + "2 18.211152\n", + "3 38.835646\n", + "4 41.869768\n", + "5 39.860237\n", + "6 36.892059\n", + "7 38.143642\n", + "8 34.524623\n", + "9 30.862007\n", + "10 40.180165\n", + "11 46.228125\n", + "12 36.054196\n", + "13 38.154731\n", + "14 38.761902\n", + "15 44.928506\n", + "16 30.447232\n", + "17 29.448671\n", + "18 42.602532\n", + "19 34.529776\n", + "20 33.957675\n", + "21 42.037538\n", + "22 29.875285\n", + "23 36.299884\n", + "24 44.821657\n", + "dtype: Float64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "point_geom_series.geo.y" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From deb015dc1276549519d51363501355272f8976d8 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 30 Jan 2025 11:08:16 -0800 Subject: [PATCH 22/38] feat: Add Series.keys() (#1342) --- bigframes/series.py | 4 ++++ tests/system/small/test_series.py | 7 +++++++ .../bigframes_vendored/pandas/core/series.py | 19 +++++++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/bigframes/series.py b/bigframes/series.py index 8a0aaf8d59..6c0bf8cebb 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -168,6 +168,10 @@ def values(self) -> numpy.ndarray: def index(self) -> indexes.Index: return indexes.Index.from_frame(self) + @validations.requires_index + def keys(self) -> indexes.Index: + return self.index + @property def query_job(self) -> Optional[bigquery.QueryJob]: """BigQuery job metadata for the most recent query. 
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index ac9d878432..b3faefc5f7 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -228,6 +228,13 @@ def test_series_construct_geodata(): ) +def test_series_keys(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_col"].keys().to_pandas() + pd_result = scalars_pandas_df["int64_col"].keys() + pd.testing.assert_index_equal(bf_result, pd_result) + + @pytest.mark.parametrize( ["data", "index"], [ diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index c7d395d276..57f7dfbb79 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -424,6 +424,25 @@ def __repr__(self) -> str: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def keys(self): + """ + Return alias for index. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3], index=[0, 1, 2]) + >>> s.keys() + Index([0, 1, 2], dtype='Int64') + + Returns: + Index: + Index of the Series. + """ + return self.index + # ---------------------------------------------------------------------- # IO methods (to / from other formats) From b26e13570f198ec4d252590a8c07253624db667a Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 30 Jan 2025 12:14:04 -0800 Subject: [PATCH 23/38] feat: Support python type as astype arg (#1316) --- bigframes/core/blocks.py | 10 +- bigframes/core/compile/ibis_types.py | 32 +---- bigframes/core/indexes/base.py | 6 +- bigframes/core/local_data.py | 6 + bigframes/dataframe.py | 18 +-- bigframes/dtypes.py | 114 ++++++++++++++---- bigframes/operations/base.py | 3 +- bigframes/operations/generic_ops.py | 11 +- bigframes/series.py | 1 + bigframes/session/__init__.py | 1 + tests/system/small/test_dataframe.py | 16 ++- tests/system/small/test_index.py | 6 + tests/system/small/test_series.py | 11 ++ tests/unit/core/test_dtypes.py | 6 +- tests/unit/core/test_expression.py | 2 +- .../bigframes_vendored/pandas/core/generic.py | 2 +- .../pandas/core/indexes/base.py | 2 +- 17 files changed, 160 insertions(+), 87 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 382be72340..43f605dc03 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -707,7 +707,7 @@ def split( # Create an ordering col and convert to string block, ordering_col = block.promote_offsets() block, string_ordering_col = block.apply_unary_op( - ordering_col, ops.AsTypeOp(to_type="string[pyarrow]") + ordering_col, ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE) ) # Apply hash method to sum col and order by it. 
@@ -1479,7 +1479,9 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: expr, new_col = expr.project_to_id( expression=ops.add_op.as_expr( ex.const(prefix), - ops.AsTypeOp(to_type="string").as_expr(index_col), + ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr( + index_col + ), ), ) new_index_cols.append(new_col) @@ -1502,7 +1504,9 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: for index_col in self._index_columns: expr, new_col = expr.project_to_id( expression=ops.add_op.as_expr( - ops.AsTypeOp(to_type="string").as_expr(index_col), + ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr( + index_col + ), ex.const(suffix), ), ) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index e5d637e426..8a55f6775d 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -13,9 +13,8 @@ # limitations under the License. from __future__ import annotations -import textwrap import typing -from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union +from typing import cast, Dict, Iterable, Optional, Tuple, Union import bigframes_vendored.constants as constants import bigframes_vendored.ibis @@ -28,7 +27,6 @@ import db_dtypes # type: ignore import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery -import numpy as np import pandas as pd import pyarrow as pa @@ -228,9 +226,7 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: def bigframes_dtype_to_ibis_dtype( - bigframes_dtype: Union[ - bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[Any] - ] + bigframes_dtype: bigframes.dtypes.Dtype, ) -> ibis_dtypes.DataType: """Converts a BigQuery DataFrames supported dtype to an Ibis dtype. @@ -244,11 +240,6 @@ def bigframes_dtype_to_ibis_dtype( Raises: ValueError: If passed a dtype not supported by BigQuery DataFrames. """ - if str(bigframes_dtype) in bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES: - bigframes_dtype = bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[ - cast(bigframes.dtypes.DtypeString, str(bigframes_dtype)) - ] - if bigframes_dtype in BIGFRAMES_TO_IBIS.keys(): return BIGFRAMES_TO_IBIS[bigframes_dtype] @@ -256,24 +247,7 @@ def bigframes_dtype_to_ibis_dtype( return _arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype) else: - raise ValueError( - textwrap.dedent( - f""" - Unexpected data type {bigframes_dtype}. The following - str dtypes are supppted: 'boolean','Float64','Int64', - 'int64[pyarrow]','string','string[pyarrow]', - 'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]', - 'date32[day][pyarrow]','time64[us][pyarrow]'. - The following pandas.ExtensionDtype are supported: - pandas.BooleanDtype(), pandas.Float64Dtype(), - pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"), - pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")), - pd.ArrowDtype(pa.timestamp("us")), - pd.ArrowDtype(pa.timestamp("us", tz="UTC")). 
- {constants.FEEDBACK_LINK} - """ - ) - ) + raise ValueError(f"Datatype has no ibis type mapping: {bigframes_dtype}") def ibis_dtype_to_bigframes_dtype( diff --git a/bigframes/core/indexes/base.py b/bigframes/core/indexes/base.py index da0daf027a..6ad0973262 100644 --- a/bigframes/core/indexes/base.py +++ b/bigframes/core/indexes/base.py @@ -78,7 +78,8 @@ def __new__( if name is not None: index.name = name if dtype is not None: - index = index.astype(dtype) + bf_dtype = bigframes.dtypes.bigframes_type(dtype) + index = index.astype(bf_dtype) block = index._block elif isinstance(data, pandas.Index): pd_df = pandas.DataFrame(index=data) @@ -310,7 +311,7 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"): def astype( self, - dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], + dtype, *, errors: Literal["raise", "null"] = "raise", ) -> Index: @@ -318,6 +319,7 @@ def astype( raise ValueError("Argument 'errors' must be one of 'raise' or 'null'") if self.nlevels > 1: raise TypeError("Multiindex does not support 'astype'") + dtype = bigframes.dtypes.bigframes_type(dtype) return self._apply_unary_expr( ops.AsTypeOp(to_type=dtype, safe=(errors == "null")).as_expr( ex.free_var("arg") diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index 573562cefa..f665948be2 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -59,6 +59,12 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType: if pa.types.is_time64(type): # This is potentially lossy, but BigFrames doesn't support ns return pa.time64("us") + if pa.types.is_decimal128(type): + return pa.decimal128(38, 9) + if pa.types.is_decimal256(type): + return pa.decimal256(76, 38) + if pa.types.is_dictionary(type): + return arrow_type_replacements(type.value_type) if pa.types.is_large_string(type): # simple string type can handle the largest strings needed return pa.string() diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index a58edf1962..d9d3d431fd 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -180,7 +180,8 @@ def __init__( if columns: block = block.select_columns(list(columns)) # type:ignore if dtype: - block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=dtype)) + bf_dtype = bigframes.dtypes.bigframes_type(dtype) + block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) self._block = block else: @@ -368,6 +369,7 @@ def astype( dtype: Union[ bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, + type, dict[str, Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype]], ], *, @@ -378,23 +380,15 @@ def astype( safe_cast = errors == "null" - # Type strings check - if dtype in bigframes.dtypes.DTYPE_STRINGS: - return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast)) - - # Type instances check - if type(dtype) in bigframes.dtypes.DTYPES: - return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast)) - if isinstance(dtype, dict): result = self.copy() for col, to_type in dtype.items(): result[col] = result[col].astype(to_type) return result - raise TypeError( - f"Invalid type {type(dtype)} for dtype input. 
{constants.FEEDBACK_LINK}" - ) + dtype = bigframes.dtypes.bigframes_type(dtype) + + return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast)) def _to_sql_query( self, include_index: bool, enable_cache: bool = True diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 016444032f..a8d9d60366 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -17,6 +17,7 @@ from dataclasses import dataclass import datetime import decimal +import textwrap import typing from typing import Any, Dict, List, Literal, Union @@ -422,7 +423,7 @@ def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: return DEFAULT_DTYPE # No other types matched. - raise ValueError( + raise TypeError( f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}" ) @@ -447,7 +448,7 @@ def bigframes_dtype_to_arrow_dtype( if pa.types.is_struct(bigframes_dtype.pyarrow_dtype): return bigframes_dtype.pyarrow_dtype else: - raise ValueError( + raise TypeError( f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}" ) @@ -474,7 +475,7 @@ def bigframes_dtype_to_literal( if isinstance(bigframes_dtype, gpd.array.GeometryDtype): return shapely.Point((0, 0)) - raise ValueError( + raise TypeError( f"No literal conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}" ) @@ -515,11 +516,91 @@ def arrow_type_to_literal( if pa.types.is_time(arrow_type): return datetime.time(1, 1, 1) - raise ValueError( + raise TypeError( f"No literal conversion for {arrow_type}. {constants.FEEDBACK_LINK}" ) +def bigframes_type(dtype) -> Dtype: + """Convert type object to canoncial bigframes dtype.""" + if _is_bigframes_dtype(dtype): + return dtype + elif isinstance(dtype, str): + return _dtype_from_string(dtype) + elif isinstance(dtype, type): + return _infer_dtype_from_python_type(dtype) + elif isinstance(dtype, pa.DataType): + return arrow_dtype_to_bigframes_dtype(dtype) + else: + raise TypeError( + f"Cannot infer supported datatype for: {dtype}. {constants.FEEDBACK_LINK}" + ) + + +def _is_bigframes_dtype(dtype) -> bool: + """True iff dtyps is a canonical bigframes dtype""" + # have to be quite strict, as pyarrow dtypes equal their string form, and we don't consider that a canonical form. + if (type(dtype), dtype) in set( + (type(item.dtype), item.dtype) for item in SIMPLE_TYPES + ): + return True + if isinstance(dtype, pd.ArrowDtype): + try: + _ = arrow_dtype_to_bigframes_dtype(dtype.pyarrow_dtype) + return True + except TypeError: + return False + return False + + +def _infer_dtype_from_python_type(type: type) -> Dtype: + if issubclass(type, (bool, np.bool_)): + return BOOL_DTYPE + if issubclass(type, (int, np.integer)): + return INT_DTYPE + if issubclass(type, (float, np.floating)): + return FLOAT_DTYPE + if issubclass(type, decimal.Decimal): + return NUMERIC_DTYPE + if issubclass(type, (str, np.str_)): + return STRING_DTYPE + if issubclass(type, (bytes, np.bytes_)): + return BYTES_DTYPE + if issubclass(type, datetime.date): + return DATE_DTYPE + if issubclass(type, datetime.time): + return TIME_DTYPE + else: + raise TypeError( + f"No matching datatype for python type: {type}. {constants.FEEDBACK_LINK}" + ) + + +def _dtype_from_string(dtype_string: str) -> typing.Optional[Dtype]: + if str(dtype_string) in BIGFRAMES_STRING_TO_BIGFRAMES: + return BIGFRAMES_STRING_TO_BIGFRAMES[ + typing.cast(DtypeString, str(dtype_string)) + ] + raise TypeError( + textwrap.dedent( + f""" + Unexpected data type string {dtype_string}. 
The following
+            dtypes are supported: 'boolean','Float64','Int64',
+            'int64[pyarrow]','string','string[pyarrow]',
+            'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
+            'date32[day][pyarrow]','time64[us][pyarrow]'.
+            The following pandas.ExtensionDtype are supported:
+            pandas.BooleanDtype(), pandas.Float64Dtype(),
+            pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
+            pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
+            pd.ArrowDtype(pa.timestamp("us")),
+            pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
+            {constants.FEEDBACK_LINK}
+            """
+        )
+    )
+
+
 def infer_literal_type(literal) -> typing.Optional[Dtype]:
     # Maybe also normalize literal to canonical python representation to remove this burden from compilers?
     if pd.api.types.is_list_like(literal):
@@ -539,30 +620,17 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]:
         return pd.ArrowDtype(pa.struct(fields))
     if pd.isna(literal):
         return None  # Null value without a definite type
-    if isinstance(literal, (bool, np.bool_)):
-        return BOOL_DTYPE
-    if isinstance(literal, (int, np.integer)):
-        return INT_DTYPE
-    if isinstance(literal, (float, np.floating)):
-        return FLOAT_DTYPE
-    if isinstance(literal, decimal.Decimal):
-        return NUMERIC_DTYPE
-    if isinstance(literal, (str, np.str_)):
-        return STRING_DTYPE
-    if isinstance(literal, (bytes, np.bytes_)):
-        return BYTES_DTYPE
     # Make sure to check datetime before date as datetimes are also dates
     if isinstance(literal, (datetime.datetime, pd.Timestamp)):
         if literal.tzinfo is not None:
             return TIMESTAMP_DTYPE
         else:
             return DATETIME_DTYPE
-    if isinstance(literal, datetime.date):
-        return DATE_DTYPE
-    if isinstance(literal, datetime.time):
-        return TIME_DTYPE
+    from_python_type = _infer_dtype_from_python_type(type(literal))
+    if from_python_type is not None:
+        return from_python_type
     else:
-        raise ValueError(f"Unable to infer type for value: {literal}")
+        raise TypeError(f"Unable to infer type for value: {literal}")
 
 def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]:
@@ -602,7 +670,7 @@ def convert_schema_field(
             return field.name, pd.ArrowDtype(pa_type)
         return field.name, _TK_TO_BIGFRAMES[field.field_type]
     else:
-        raise ValueError(f"Cannot handle type: {field.field_type}")
+        raise TypeError(f"Cannot handle type: {field.field_type}")
 
 def convert_to_schema_field(
@@ -636,7 +704,7 @@ def convert_to_schema_field(
         if bigframes_dtype.pyarrow_dtype == pa.duration("us"):
             # Timedeltas are represented as integers in microseconds.
             return google.cloud.bigquery.SchemaField(name, "INTEGER")
-    raise ValueError(
+    raise TypeError(
        f"No arrow conversion for {bigframes_dtype}. 
{constants.FEEDBACK_LINK}" ) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index f6e8223aa0..75db2f48e9 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -87,7 +87,8 @@ def __init__( if name is not None: data.name = name if dtype is not None: - data = data.astype(dtype) + bf_dtype = bigframes.dtypes.bigframes_type(dtype) + data = data.astype(bf_dtype) else: # local dict-like data data = read_pandas_func(pd.Series(data, name=name, dtype=dtype)) # type: ignore data_block = data._block diff --git a/bigframes/operations/generic_ops.py b/bigframes/operations/generic_ops.py index ef7e1f5cea..b90a43b091 100644 --- a/bigframes/operations/generic_ops.py +++ b/bigframes/operations/generic_ops.py @@ -16,8 +16,6 @@ import functools import typing -import pyarrow as pa - from bigframes import dtypes from bigframes.operations import base_ops import bigframes.operations.type as op_typing @@ -56,17 +54,10 @@ class AsTypeOp(base_ops.UnaryOp): name: typing.ClassVar[str] = "astype" # TODO: Convert strings to dtype earlier - to_type: typing.Union[dtypes.DtypeString, dtypes.Dtype] + to_type: dtypes.Dtype safe: bool = False def output_type(self, *input_types): - # TODO: We should do this conversion earlier - if self.to_type == pa.string(): - return dtypes.STRING_DTYPE - if isinstance(self.to_type, str): - return dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[ - typing.cast(dtypes.DtypeString, self.to_type) - ] return self.to_type diff --git a/bigframes/series.py b/bigframes/series.py index 6c0bf8cebb..706c0f4f09 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -366,6 +366,7 @@ def astype( ) -> Series: if errors not in ["raise", "null"]: raise ValueError("Argument 'errors' must be one of 'raise' or 'null'") + dtype = bigframes.dtypes.bigframes_type(dtype) return self._apply_unary_op( bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null")) ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 95d7b1aa2c..24963bdcbc 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -806,6 +806,7 @@ def _read_pandas_inline( pa.ArrowInvalid, # Thrown by arrow for unsupported types, such as geo. pa.ArrowTypeError, # Thrown by arrow for types without mapping (geo). 
ValueError,  # Thrown by ibis for some unhandled types
+            TypeError,  # Not all types handleable by local code path
         ) as exc:
             if should_raise:
                 raise ValueError(
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 4266cdba88..aa038c62d8 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -5315,7 +5315,7 @@ def test__resample_start_time(rule, origin, data):
         ),
     ],
 )
-def test_astype(scalars_dfs, dtype):
+def test_df_astype(scalars_dfs, dtype):
     bf_df, pd_df = scalars_dfs
     target_cols = ["bool_col", "int64_col"]
     bf_df = bf_df[target_cols]
@@ -5327,6 +5327,20 @@ def test_df_astype(scalars_dfs, dtype):
     pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
 
+def test_df_astype_python_types(scalars_dfs):
+    bf_df, pd_df = scalars_dfs
+    target_cols = ["bool_col", "int64_col"]
+    bf_df = bf_df[target_cols]
+    pd_df = pd_df[target_cols]
+
+    bf_result = bf_df.astype({"bool_col": str, "int64_col": float}).to_pandas()
+    pd_result = pd_df.astype(
+        {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()}
+    )
+
+    pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+
 def test_astype_invalid_type_fail(scalars_dfs):
     bf_df, _ = scalars_dfs
 
diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py
index cdf4fa6511..4d01bc5ee9 100644
--- a/tests/system/small/test_index.py
+++ b/tests/system/small/test_index.py
@@ -123,6 +123,12 @@ def test_index_astype(scalars_df_index, scalars_pandas_df_index):
     pd.testing.assert_index_equal(bf_result, pd_result)
 
+def test_index_astype_python(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.set_index("int64_col").index.astype(float).to_pandas()
+    pd_result = scalars_pandas_df_index.set_index("int64_col").index.astype("Float64")
+    pd.testing.assert_index_equal(bf_result, pd_result)
+
 def test_index_astype_error_error(session):
     input = pd.Index(["hello", "world", "3.11", "4000"])
     with pytest.raises(ValueError):
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index b3faefc5f7..fb48bf58b4 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -3308,6 +3308,17 @@ def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type, erro
     pd.testing.assert_series_equal(bf_result, pd_result)
 
+def test_series_astype_python(session):
+    input = pd.Series(["hello", "world", "3.11", "4000"])
+    expected = pd.Series(
+        [None, None, 3.11, 4000],
+        dtype="Float64",
+        index=pd.Index([0, 1, 2, 3], dtype="Int64"),
+    )
+    result = session.read_pandas(input).astype(float, errors="null").to_pandas()
+    pd.testing.assert_series_equal(result, expected)
+
 def test_astype_safe(session):
     input = pd.Series(["hello", "world", "3.11", "4000"])
     exepcted = pd.Series(
diff --git a/tests/unit/core/test_dtypes.py b/tests/unit/core/test_dtypes.py
index e1fac624d7..3d420de51f 100644
--- a/tests/unit/core/test_dtypes.py
+++ b/tests/unit/core/test_dtypes.py
@@ -219,20 +219,20 @@ def test_bigframes_dtype_converts(ibis_dtype, bigframes_dtype):
 def test_bigframes_string_dtype_converts(ibis_dtype, bigframes_dtype_str):
     """Test all the Ibis data types needed to read BigQuery tables"""
     result = bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(
-        bigframes_dtype_str
+        bigframes.dtypes.bigframes_type(bigframes_dtype_str)
     )
     assert result == ibis_dtype
 
 def test_unsupported_dtype_raises_unexpected_datatype():
     """Incompatible dtypes should fail when passed into 
BigQuery DataFrames""" - with pytest.raises(ValueError, match="Unexpected data type"): + with pytest.raises(ValueError, match="Datatype has no ibis type mapping"): bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(np.float32) def test_unsupported_dtype_str_raises_unexpected_datatype(): """Incompatible dtypes should fail when passed into BigQuery DataFrames""" - with pytest.raises(ValueError, match="Unexpected data type"): + with pytest.raises(ValueError, match="Datatype has no ibis type mapping"): bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype("int64") diff --git a/tests/unit/core/test_expression.py b/tests/unit/core/test_expression.py index 72e200f007..ab6402a909 100644 --- a/tests/unit/core/test_expression.py +++ b/tests/unit/core/test_expression.py @@ -47,7 +47,7 @@ def test_expression_dtype_where(): def test_expression_dtype_astype(): - expression = ops.AsTypeOp("Int64").as_expr(ex.const(3.14159)) + expression = ops.AsTypeOp(dtypes.INT_DTYPE).as_expr(ex.const(3.14159)) result = expression.output_type({}) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 83a24f7a9c..9dae802b6e 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -165,7 +165,7 @@ def astype(self, dtype): dtype: Int64 Args: - dtype (str or pandas.ExtensionDtype): + dtype (str, data type or pandas.ExtensionDtype): A dtype supported by BigQuery DataFrame include ``'boolean'``, ``'Float64'``, ``'Int64'``, ``'int64\\[pyarrow\\]'``, ``'string'``, ``'string\\[pyarrow\\]'``, diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index c48c07424d..59504ee68c 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -445,7 +445,7 @@ def astype(self, dtype): Args: - dtype (str or pandas.ExtensionDtype): + dtype (str, data type, or pandas.ExtensionDtype): A dtype supported by BigQuery DataFrame include ``'boolean'``, ``'Float64'``, ``'Int64'``, ``'int64\\[pyarrow\\]'``, ``'string'``, ``'string\\[pyarrow\\]'``, From 6408f84254f69ebf8abb8194c1148b1ca95750a6 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 30 Jan 2025 14:36:16 -0800 Subject: [PATCH 24/38] chore: add type checks for unix epoch conversions (#1343) --- bigframes/operations/datetime_ops.py | 18 ++++++++++------ tests/system/small/bigquery/test_datetime.py | 22 ++++++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/bigframes/operations/datetime_ops.py b/bigframes/operations/datetime_ops.py index 3ee8a00141..5086de27d3 100644 --- a/bigframes/operations/datetime_ops.py +++ b/bigframes/operations/datetime_ops.py @@ -43,7 +43,7 @@ class ToDatetimeOp(base_ops.UnaryOp): format: typing.Optional[str] = None unit: typing.Optional[str] = None - def output_type(self, *input_types): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: if input_types[0] not in ( dtypes.FLOAT_DTYPE, dtypes.INT_DTYPE, @@ -59,7 +59,7 @@ class ToTimestampOp(base_ops.UnaryOp): format: typing.Optional[str] = None unit: typing.Optional[str] = None - def output_type(self, *input_types): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: # Must be numeric or string if input_types[0] not in ( dtypes.FLOAT_DTYPE, @@ -75,7 +75,7 @@ class StrftimeOp(base_ops.UnaryOp): 
name: typing.ClassVar[str] = "strftime" date_format: str - def output_type(self, *input_types): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: return dtypes.STRING_DTYPE @@ -83,7 +83,9 @@ def output_type(self, *input_types): class UnixSeconds(base_ops.UnaryOp): name: typing.ClassVar[str] = "unix_seconds" - def output_type(self, *input_types): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is not dtypes.TIMESTAMP_DTYPE: + raise TypeError("expected timestamp input") return dtypes.INT_DTYPE @@ -91,7 +93,9 @@ def output_type(self, *input_types): class UnixMillis(base_ops.UnaryOp): name: typing.ClassVar[str] = "unix_millis" - def output_type(self, *input_types): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is not dtypes.TIMESTAMP_DTYPE: + raise TypeError("expected timestamp input") return dtypes.INT_DTYPE @@ -99,5 +103,7 @@ def output_type(self, *input_types): class UnixMicros(base_ops.UnaryOp): name: typing.ClassVar[str] = "unix_micros" - def output_type(self, *input_types): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is not dtypes.TIMESTAMP_DTYPE: + raise TypeError("expected timestamp input") return dtypes.INT_DTYPE diff --git a/tests/system/small/bigquery/test_datetime.py b/tests/system/small/bigquery/test_datetime.py index 984e75ac10..b839031263 100644 --- a/tests/system/small/bigquery/test_datetime.py +++ b/tests/system/small/bigquery/test_datetime.py @@ -15,6 +15,7 @@ import typing import pandas as pd +import pytest from bigframes import bigquery @@ -32,6 +33,13 @@ def test_unix_seconds(scalars_dfs): pd.testing.assert_series_equal(actual_res, expected_res) +def test_unix_seconds_incorrect_input_type_raise_error(scalars_dfs): + df, _ = scalars_dfs + + with pytest.raises(TypeError): + bigquery.unix_seconds(df["string_col"]) + + def test_unix_millis(scalars_dfs): bigframes_df, pandas_df = scalars_dfs @@ -45,6 +53,13 @@ def test_unix_millis(scalars_dfs): pd.testing.assert_series_equal(actual_res, expected_res) +def test_unix_millis_incorrect_input_type_raise_error(scalars_dfs): + df, _ = scalars_dfs + + with pytest.raises(TypeError): + bigquery.unix_millis(df["string_col"]) + + def test_unix_micros(scalars_dfs): bigframes_df, pandas_df = scalars_dfs @@ -58,6 +73,13 @@ def test_unix_micros(scalars_dfs): pd.testing.assert_series_equal(actual_res, expected_res) +def test_unix_micros_incorrect_input_type_raise_error(scalars_dfs): + df, _ = scalars_dfs + + with pytest.raises(TypeError): + bigquery.unix_micros(df["string_col"]) + + def _to_unix_epoch( ts: pd.Timestamp, unit: typing.Literal["s", "ms", "us"] ) -> typing.Optional[int]: From e86a4da6b802bf389ffd825308844e8b811f158a Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Thu, 30 Jan 2025 17:40:19 -0800 Subject: [PATCH 25/38] chore: add a new metric (#1345) * chore: add a new metric * update print --- bigframes/session/metrics.py | 23 +++++++++++++++++------ scripts/run_and_publish_benchmark.py | 21 +++++++++++++++++++-- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py index 352cd0d892..33bcd7fbf5 100644 --- a/bigframes/session/metrics.py +++ b/bigframes/session/metrics.py @@ -30,28 +30,31 @@ class ExecutionMetrics: slot_millis: int = 0 bytes_processed: int = 0 execution_secs: float = 0 + query_char_count: 
int = 0 def count_job_stats(self, query_job: bq_job.QueryJob): stats = get_performance_stats(query_job) if stats is not None: - bytes_processed, slot_millis, execution_secs = stats + bytes_processed, slot_millis, execution_secs, query_char_count = stats self.execution_count += 1 self.bytes_processed += bytes_processed self.slot_millis += slot_millis self.execution_secs += execution_secs + self.query_char_count += query_char_count if LOGGING_NAME_ENV_VAR in os.environ: # when running notebooks via pytest nbmake - write_stats_to_disk(bytes_processed, slot_millis, execution_secs) + write_stats_to_disk( + bytes_processed, slot_millis, execution_secs, query_char_count + ) def get_performance_stats( query_job: bigquery.QueryJob, -) -> Optional[Tuple[int, int, float]]: +) -> Optional[Tuple[int, int, float, int]]: """Parse the query job for performance stats. Return None if the stats do not reflect real work done in bigquery. """ - if ( query_job.configuration.dry_run or query_job.created is None @@ -68,12 +71,13 @@ def get_performance_stats( return None # filter out mocks execution_secs = (query_job.ended - query_job.created).total_seconds() + query_char_count = len(query_job.query) - return bytes_processed, slot_millis, execution_secs + return bytes_processed, slot_millis, execution_secs, query_char_count def write_stats_to_disk( - bytes_processed: int, slot_millis: int, exec_seconds: Optional[float] + bytes_processed: int, slot_millis: int, exec_seconds: float, query_char_count: int ): """For pytest runs only, log information about the query job to a file in order to create a performance report. @@ -103,3 +107,10 @@ def write_stats_to_disk( ) with open(exec_time_file, "a") as f: f.write(str(exec_seconds) + "\n") + + # store length of query + query_char_count_file = os.path.join( + current_directory, test_name + ".query_char_count" + ) + with open(query_char_count_file, "a") as f: + f.write(str(query_char_count) + "\n") diff --git a/scripts/run_and_publish_benchmark.py b/scripts/run_and_publish_benchmark.py index 8b55493770..28605a8155 100644 --- a/scripts/run_and_publish_benchmark.py +++ b/scripts/run_and_publish_benchmark.py @@ -88,6 +88,8 @@ def collect_benchmark_result( millis_files = sorted(path.rglob("*.slotmillis")) bq_seconds_files = sorted(path.rglob("*.bq_exec_time_seconds")) local_seconds_files = sorted(path.rglob("*.local_exec_time_seconds")) + query_char_count_files = sorted(path.rglob("*.query_char_count")) + error_files = sorted(path.rglob("*.error")) if not ( @@ -95,15 +97,18 @@ def collect_benchmark_result( == len(millis_files) == len(local_seconds_files) == len(bq_seconds_files) + == len(query_char_count_files) ): raise ValueError( - "Mismatch in the number of report files for bytes, millis, and seconds." + "Mismatch in the number of report files for bytes, millis, seconds and query char count." 
         )
 
     for idx in range(len(bytes_files)):
         bytes_file = bytes_files[idx]
         millis_file = millis_files[idx]
         bq_seconds_file = bq_seconds_files[idx]
+        query_char_count_file = query_char_count_files[idx]
+
         filename = bytes_file.relative_to(path).with_suffix("")
         if filename != millis_file.relative_to(path).with_suffix(
@@ -136,12 +141,17 @@
             lines = file.read().splitlines()
             bq_seconds = sum(float(line) for line in lines) / iterations
 
+        with open(query_char_count_file, "r") as file:
+            lines = file.read().splitlines()
+            query_char_count = sum(int(line) for line in lines) / iterations
+
         results_dict[str(filename)] = [
             query_count,
             total_bytes,
             total_slot_millis,
             local_seconds,
             bq_seconds,
+            query_char_count,
         ]
     finally:
         for files_to_remove in (
@@ -149,6 +159,7 @@
             path.rglob("*.slotmillis"),
             path.rglob("*.local_exec_time_seconds"),
             path.rglob("*.bq_exec_time_seconds"),
+            path.rglob("*.query_char_count"),
             path.rglob("*.error"),
         ):
             for log_file in files_to_remove:
@@ -160,6 +171,7 @@
         "Slot_Millis",
         "Local_Execution_Time_Sec",
         "BigQuery_Execution_Time_Sec",
+        "Query_Char_Count",
     ]
 
     benchmark_metrics = pd.DataFrame.from_dict(
@@ -182,15 +194,19 @@
         )
         print(
             f"{index} - query count: {row['Query_Count']},"
+            f" query char count: {row['Query_Char_Count']},",
             f" bytes processed sum: {row['Bytes_Processed']},"
             f" slot millis sum: {row['Slot_Millis']},"
             f" local execution time: {formatted_local_exec_time} seconds,"
-            f" bigquery execution time: {round(row['BigQuery_Execution_Time_Sec'], 1)} seconds"
+            f" bigquery execution time: {round(row['BigQuery_Execution_Time_Sec'], 1)} seconds",
         )
 
     geometric_mean_queries = geometric_mean_excluding_zeros(
         benchmark_metrics["Query_Count"]
     )
+    geometric_mean_query_char_count = geometric_mean_excluding_zeros(
+        benchmark_metrics["Query_Char_Count"]
+    )
    geometric_mean_bytes = geometric_mean_excluding_zeros(
         benchmark_metrics["Bytes_Processed"]
     )
@@ -206,6 +222,7 @@
     print(
         f"---Geometric mean of queries: {geometric_mean_queries}, "
+        f"Geometric mean of query char counts: {geometric_mean_query_char_count}, "
         f"Geometric mean of bytes processed: {geometric_mean_bytes}, "
         f"Geometric mean of slot millis: {geometric_mean_slot_millis}, "
         f"Geometric mean of local execution time: {geometric_mean_local_seconds} seconds, "

From f2d526445da7dae29c49c8d6dacdfee7d2fa9d79 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?=
Date: Mon, 3 Feb 2025 12:13:49 -0600
Subject: [PATCH 26/38] fix: exclude `DataFrame` and `Series` `__call__` from
 unimplemented API metrics (#1351)

* fix: add feedback link to `DataFrame` and `Series` `__call__` error

* revert __call__ add to unimplemented tracking
---
 bigframes/core/log_adapter.py       |  4 +++-
 tests/unit/core/test_log_adapter.py | 26 +++++++++++++++++++++++---
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/bigframes/core/log_adapter.py b/bigframes/core/log_adapter.py
index d234d9be28..714a522183 100644
--- a/bigframes/core/log_adapter.py
+++ b/bigframes/core/log_adapter.py
@@ -78,7 +78,9 @@ def submit_pandas_labels(
     else:
         return
 
-    if hasattr(cls, method_name):
+    # Omit __call__, because it's not implemented on the actual instances of
+    # DataFrame/Series, only as the constructor.
+    if method_name != "__call__" and hasattr(cls, method_name):
         method = getattr(cls, method_name)
     else:
         return
diff --git a/tests/unit/core/test_log_adapter.py b/tests/unit/core/test_log_adapter.py
index d183f4479e..6bc9c91f3a 100644
--- a/tests/unit/core/test_log_adapter.py
+++ b/tests/unit/core/test_log_adapter.py
@@ -157,11 +157,31 @@ def test_submit_pandas_labels_without_valid_params_for_param_logging(mock_bqclie
     mock_bqclient.query.assert_not_called()
 
 
-def test_submit_pandas_labels_with_internal_method(mock_bqclient):
+@pytest.mark.parametrize(
+    ("class_name", "method_name"),
+    (
+        ("Series", "_repr_latex_"),
+        (
+            "DataFrame",
+            # __call__ should be excluded.
+            # It's implemented on the pd.DataFrame class but not pd.DataFrame instances.
+            "__call__",
+        ),
+        (
+            "Series",
+            # __call__ should be excluded.
+            # It's implemented on the pd.Series class but not pd.Series instances.
+            "__call__",
+        ),
+    ),
+)
+def test_submit_pandas_labels_with_internal_method(
+    mock_bqclient, class_name, method_name
+):
     log_adapter.submit_pandas_labels(
         mock_bqclient,
-        "Series",
-        "_repr_latex_",
+        class_name,
+        method_name,
         task=log_adapter.PANDAS_API_TRACKING_TASK,
     )
     mock_bqclient.query.assert_not_called()

From 417de3a449e5d0748831b502f4f5b9fb9ba38714 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?=
Date: Mon, 3 Feb 2025 12:41:12 -0600
Subject: [PATCH 27/38] fix: make `DataFrame` `__getattr__` and `__setattr__`
 more robust to subclassing (#1352)

* fix: make `DataFrame` `__getattr__` and `__setattr__` more robust to
subclassing

* use _block as an initialized indicator
---
 bigframes/dataframe.py       | 57 ++++++++++++++++++++++++------------
 tests/unit/test_dataframe.py |  9 ++++++
 2 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index d9d3d431fd..6308dcc8da 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -118,6 +118,8 @@ def __init__(
     ):
         global bigframes
 
+        self._query_job: Optional[bigquery.QueryJob] = None
+
         if copy is not None and not copy:
             raise ValueError(
                 f"DataFrame constructor only supports copy=True. {constants.FEEDBACK_LINK}"
@@ -182,7 +184,6 @@ def __init__(
             if dtype:
                 bf_dtype = bigframes.dtypes.bigframes_type(dtype)
                 block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
-            self._block = block
         else:
             import bigframes.pandas
@@ -194,10 +195,14 @@ def __init__(
                 dtype=dtype,  # type:ignore
             )
             if session:
-                self._block = session.read_pandas(pd_dataframe)._get_block()
+                block = session.read_pandas(pd_dataframe)._get_block()
             else:
-                self._block = bigframes.pandas.read_pandas(pd_dataframe)._get_block()
-        self._query_job: Optional[bigquery.QueryJob] = None
+                block = bigframes.pandas.read_pandas(pd_dataframe)._get_block()
+
+        # We use _block as an indicator in __getattr__ and __setattr__ to see
+        # if the object is fully initialized, so make sure we set the _block
+        # attribute last.
+        self._block = block
         self._block.session._register_object(self)
 
     def __dir__(self):
@@ -625,13 +630,17 @@ def _getitem_bool_series(self, key: bigframes.series.Series) -> DataFrame:
         return DataFrame(block)
 
     def __getattr__(self, key: str):
-        # Protect against recursion errors with uninitialized DataFrame
-        # objects. See:
+        # To allow subclasses to set private attributes before the class is
+        # fully initialized, protect against recursion errors with
+        # uninitialized DataFrame objects. Note: this comes with the downside
+        # that columns with a leading `_` won't be treated as columns.
+ # + # See: # https://github.com/googleapis/python-bigquery-dataframes/issues/728 # and # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html if key == "_block": - raise AttributeError("_block") + raise AttributeError(key) if key in self._block.column_labels: return self.__getitem__(key) @@ -651,26 +660,36 @@ def __getattr__(self, key: str): raise AttributeError(key) def __setattr__(self, key: str, value): - if key in ["_block", "_query_job"]: + if key == "_block": + object.__setattr__(self, key, value) + return + + # To allow subclasses to set private attributes before the class is + # fully initialized, assume anything set before `_block` is initialized + # is a regular attribute. + if not hasattr(self, "_block"): object.__setattr__(self, key, value) return - # Can this be removed??? + + # If someone has a column named the same as a normal attribute + # (e.g. index), we want to set the normal attribute, not the column. + # To do that, check if there is a normal attribute by using + # __getattribute__ (not __getattr__, because that includes columns). + # If that returns a value without raising, then we know this is a + # normal attribute and we should prefer that. try: - # boring attributes go through boring old path object.__getattribute__(self, key) return object.__setattr__(self, key, value) except AttributeError: pass - # if this fails, go on to more involved attribute setting - # (note that this matches __getattr__, above). - try: - if key in self.columns: - self[key] = value - else: - object.__setattr__(self, key, value) - # Can this be removed? - except (AttributeError, TypeError): + # If we made it here, then we know that it's not a regular attribute + # already, so it might be a column to update. Note: we don't allow + # adding new columns using __setattr__, only __setitem__, that way we + # can still add regular new attributes. + if key in self._block.column_labels: + self[key] = value + else: object.__setattr__(self, key, value) def __repr__(self) -> str: diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 560c0cf0f4..a6ad5e3821 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -41,6 +41,15 @@ def test_dataframe_repr_with_uninitialized_object(): assert "DataFrame" in got +def test_dataframe_setattr_with_uninitialized_object(): + """Ensures DataFrame can be subclassed without trying to set attributes as columns.""" + # Avoid calling __init__ since it might be called later in a subclass. + # https://stackoverflow.com/a/6384982/101923 + dataframe = bigframes.dataframe.DataFrame.__new__(bigframes.dataframe.DataFrame) + dataframe.lineage = "my-test-value" + assert dataframe.lineage == "my-test-value" # Should just be a regular attribute. 
+ + def test_dataframe_to_gbq_invalid_destination(monkeypatch: pytest.MonkeyPatch): dataframe = resources.create_dataframe(monkeypatch) From 012081af9ef825ced96ec1e772b9646cbe09d9a1 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 3 Feb 2025 11:24:06 -0800 Subject: [PATCH 28/38] perf: Prevent inlining of remote ops (#1347) --- bigframes/core/compile/compiled.py | 14 +++++++++---- bigframes/core/expression.py | 21 ++++++++++++++++++- bigframes/operations/base_ops.py | 9 ++++++++ bigframes/operations/remote_function_ops.py | 12 +++++++++++ .../ibis/backends/sql/rewrites.py | 2 ++ 5 files changed, 53 insertions(+), 5 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 896f99b9de..906bdb1f0d 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -110,12 +110,19 @@ def projection( expression_id_pairs: typing.Tuple[typing.Tuple[ex.Expression, str], ...], ) -> UnorderedIR: """Apply an expression to the ArrayValue and assign the output to a column.""" + cannot_inline = any(expr.expensive for expr, _ in expression_id_pairs) + bindings = {col: self._get_ibis_column(col) for col in self.column_ids} new_values = [ op_compiler.compile_expression(expression, bindings).name(id) for expression, id in expression_id_pairs ] - return UnorderedIR(self._table, (*self._columns, *new_values)) + result = UnorderedIR(self._table, (*self._columns, *new_values)) + if cannot_inline: + return result._reproject_to_table() + else: + # Cheap ops can defer "SELECT" and inline into later ops + return result def selection( self, @@ -174,13 +181,12 @@ def _to_ibis_expr( Returns: An ibis expression representing the data help by the ArrayValue object. """ - columns = list(self._columns) # Special case for empty tables, since we can't create an empty # projection. 
-        if not columns:
+        if not self._columns:
             return bigframes_vendored.ibis.memtable([])
-        table = self._table.select(columns)
+        table = self._table.select(self._columns)
         if fraction is not None:
             table = table.filter(
                 bigframes_vendored.ibis.random() < ibis_types.literal(fraction)
diff --git a/bigframes/core/expression.py b/bigframes/core/expression.py
index 3ffccc94ac..8621d5d915 100644
--- a/bigframes/core/expression.py
+++ b/bigframes/core/expression.py
@@ -18,7 +18,7 @@
 import dataclasses
 import itertools
 import typing
-from typing import Mapping, TypeVar, Union
+from typing import Generator, Mapping, TypeVar, Union
 
 import pandas as pd
 
@@ -155,6 +155,16 @@ class Expression(abc.ABC):
     def free_variables(self) -> typing.Tuple[str, ...]:
         return ()
 
+    @property
+    def children(self) -> typing.Tuple[Expression, ...]:
+        return ()
+
+    @property
+    def expensive(self) -> bool:
+        return any(
+            isinstance(ex, OpExpression) and ex.op.expensive for ex in self.walk()
+        )
+
     @property
     @abc.abstractmethod
     def column_references(self) -> typing.Tuple[ids.ColumnId, ...]:
@@ -216,6 +226,11 @@ def is_identity(self) -> bool:
         """True for identity operation that does not transform input."""
         return False
 
+    def walk(self) -> Generator[Expression, None, None]:
+        yield self
+        for child in self.children:
+            yield from child.walk()
+
 
 @dataclasses.dataclass(frozen=True)
 class ScalarConstantExpression(Expression):
@@ -389,6 +404,10 @@ def free_variables(self) -> typing.Tuple[str, ...]:
 
     def is_const(self) -> bool:
         return all(child.is_const for child in self.inputs)
 
+    @property
+    def children(self):
+        return self.inputs
+
     def output_type(
         self, input_types: dict[ids.ColumnId, dtypes.ExpressionType]
     ) -> dtypes.ExpressionType:
diff --git a/bigframes/operations/base_ops.py b/bigframes/operations/base_ops.py
index 0308283ad4..fc92ffe760 100644
--- a/bigframes/operations/base_ops.py
+++ b/bigframes/operations/base_ops.py
@@ -48,6 +48,11 @@ def deterministic(self) -> bool:
         """Whether the operation is deterministic" (given deterministic inputs)"""
         ...
 
+    @property
+    def expensive(self) -> bool:
+        """Whether the operation is expensive to calculate. Such ops shouldn't be inlined if referenced in multiple places."""
+        ...
+ @dataclasses.dataclass(frozen=True) class ScalarOp: @@ -73,6 +78,10 @@ def deterministic(self) -> bool: """Whether the operation is deterministic" (given deterministic inputs)""" return True + @property + def expensive(self) -> bool: + return False + @dataclasses.dataclass(frozen=True) class NaryOp(ScalarOp): diff --git a/bigframes/operations/remote_function_ops.py b/bigframes/operations/remote_function_ops.py index 0bced56f8d..5b738c0bb5 100644 --- a/bigframes/operations/remote_function_ops.py +++ b/bigframes/operations/remote_function_ops.py @@ -25,6 +25,10 @@ class RemoteFunctionOp(base_ops.UnaryOp): func: typing.Callable apply_on_null: bool + @property + def expensive(self) -> bool: + return True + def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method if hasattr(self.func, "output_dtype"): @@ -45,6 +49,10 @@ class BinaryRemoteFunctionOp(base_ops.BinaryOp): name: typing.ClassVar[str] = "binary_remote_function" func: typing.Callable + @property + def expensive(self) -> bool: + return True + def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method if hasattr(self.func, "output_dtype"): @@ -65,6 +73,10 @@ class NaryRemoteFunctionOp(base_ops.NaryOp): name: typing.ClassVar[str] = "nary_remote_function" func: typing.Callable + @property + def expensive(self) -> bool: + return True + def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method if hasattr(self.func, "output_dtype"): diff --git a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py index b2ef6a15d3..652f04757b 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py @@ -245,6 +245,8 @@ def merge_select_select(_, **kwargs): ops.InSubquery, ops.Unnest, ops.Impure, + # This is used for remote functions, which we don't want to copy + ops.ScalarUDF, ) if _.find_below(blocking, filter=ops.Value): return _ From 5f18add3db3879d692940ca9577111a5eaab2ef6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 3 Feb 2025 14:03:08 -0600 Subject: [PATCH 29/38] chore: reduce number of versions notebooks are tested on (#1353) These tests frequently encounter quota issues. Reducing the number of versions we test with should help alleviate this. IMO, the most important versions to test with are the ones for Google's hosted notebook environments. --- noxfile.py | 52 ++++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/noxfile.py b/noxfile.py index 38e5fab1a6..5b38d92c60 100644 --- a/noxfile.py +++ b/noxfile.py @@ -34,6 +34,16 @@ # TODO: switch to 3.13 once remote functions / cloud run adds a runtime for it (internal issue 333742751) LATEST_FULLY_SUPPORTED_PYTHON = "3.12" +# Notebook tests should match colab and BQ Studio. +# Check with import sys; sys.version_info +# on a fresh notebook runtime. +COLAB_AND_BQ_STUDIO_PYTHON_VERSIONS = [ + # BQ Studio + "3.10", + # colab.research.google.com + "3.11", +] + # pytest-retry is not yet compatible with pytest 8.x. 
# https://github.com/str0zzapreti/pytest-retry/issues/32 PYTEST_VERSION = "pytest<8.0.0dev" @@ -64,8 +74,9 @@ UNIT_TEST_EXTRAS: List[str] = [] UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {"3.12": ["polars"]} -# There are 4 different ibis-framework 9.x versions we want to test against. -# 3.10 is needed for Windows tests. +# 3.10 is needed for Windows tests as it is the only version installed in the +# bigframes-windows container image. For more information, search +# bigframes/windows-docker, internally. SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.10", "3.12", "3.13"] SYSTEM_TEST_STANDARD_DEPENDENCIES = [ "jinja2", @@ -698,7 +709,7 @@ def system_prerelease(session: nox.sessions.Session): ) -@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS) +@nox.session(python=COLAB_AND_BQ_STUDIO_PYTHON_VERSIONS) def notebook(session: nox.Session): google_cloud_project = os.getenv("GOOGLE_CLOUD_PROJECT") if not google_cloud_project: @@ -776,27 +787,20 @@ def notebook(session: nox.Session): notebooks = list(filter(lambda nb: nb not in denylist, notebooks)) # Regionalized notebooks - # TODO: remove exception for Python 3.13 cloud run adds a runtime for it (internal issue 333742751) - # TODO: remove exception for Python 3.13 if nbmake adds support for - # sys.exit(0) or pytest.skip(...). - # See: https://github.com/treebeardtech/nbmake/issues/134 - if session.python == "3.13": - notebooks_reg = {} - else: - notebooks_reg = { - "regionalized.ipynb": [ - "asia-southeast1", - "eu", - "europe-west4", - "southamerica-west1", - "us", - "us-central1", - ] - } - notebooks_reg = { - os.path.join("notebooks/location", nb): regions - for nb, regions in notebooks_reg.items() - } + notebooks_reg = { + "regionalized.ipynb": [ + "asia-southeast1", + "eu", + "europe-west4", + "southamerica-west1", + "us", + "us-central1", + ] + } + notebooks_reg = { + os.path.join("notebooks/location", nb): regions + for nb, regions in notebooks_reg.items() + } # The pytest --nbmake exits silently with "no tests ran" message if # one of the notebook paths supplied does not exist. 
Let's make sure that From ab9229e670d5a12f8615c0f8d70ae5bf00eb1e70 Mon Sep 17 00:00:00 2001 From: jialuoo Date: Mon, 3 Feb 2025 12:19:53 -0800 Subject: [PATCH 30/38] refactor: reorganize bigframes function tests (#1338) --- tests/system/large/{ => functions}/test_remote_function.py | 0 tests/system/small/{ => functions}/test_remote_function.py | 0 tests/unit/{ => functions}/test_remote_function.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename tests/system/large/{ => functions}/test_remote_function.py (100%) rename tests/system/small/{ => functions}/test_remote_function.py (100%) rename tests/unit/{ => functions}/test_remote_function.py (100%) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/functions/test_remote_function.py similarity index 100% rename from tests/system/large/test_remote_function.py rename to tests/system/large/functions/test_remote_function.py diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/functions/test_remote_function.py similarity index 100% rename from tests/system/small/test_remote_function.py rename to tests/system/small/functions/test_remote_function.py diff --git a/tests/unit/test_remote_function.py b/tests/unit/functions/test_remote_function.py similarity index 100% rename from tests/unit/test_remote_function.py rename to tests/unit/functions/test_remote_function.py From f433ecf2863d443147118b82f66a32e818090790 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 3 Feb 2025 13:50:52 -0800 Subject: [PATCH 31/38] chore: avoid bq storage API deadlock by setting max_results. (#1355) --- scripts/tpch_result_verify.py | 2 +- third_party/bigframes_vendored/tpch/queries/q1.py | 2 +- third_party/bigframes_vendored/tpch/queries/q10.py | 2 +- third_party/bigframes_vendored/tpch/queries/q11.py | 2 +- third_party/bigframes_vendored/tpch/queries/q12.py | 2 +- third_party/bigframes_vendored/tpch/queries/q13.py | 2 +- third_party/bigframes_vendored/tpch/queries/q14.py | 2 +- third_party/bigframes_vendored/tpch/queries/q15.py | 2 +- third_party/bigframes_vendored/tpch/queries/q16.py | 2 +- third_party/bigframes_vendored/tpch/queries/q17.py | 2 +- third_party/bigframes_vendored/tpch/queries/q18.py | 2 +- third_party/bigframes_vendored/tpch/queries/q2.py | 2 +- third_party/bigframes_vendored/tpch/queries/q20.py | 2 +- third_party/bigframes_vendored/tpch/queries/q21.py | 2 +- third_party/bigframes_vendored/tpch/queries/q22.py | 2 +- third_party/bigframes_vendored/tpch/queries/q3.py | 2 +- third_party/bigframes_vendored/tpch/queries/q4.py | 2 +- third_party/bigframes_vendored/tpch/queries/q5.py | 2 +- third_party/bigframes_vendored/tpch/queries/q6.py | 2 +- third_party/bigframes_vendored/tpch/queries/q7.py | 2 +- third_party/bigframes_vendored/tpch/queries/q8.py | 2 +- third_party/bigframes_vendored/tpch/queries/q9.py | 2 +- 22 files changed, 22 insertions(+), 22 deletions(-) diff --git a/scripts/tpch_result_verify.py b/scripts/tpch_result_verify.py index c16d7cdc84..ae01f2031c 100644 --- a/scripts/tpch_result_verify.py +++ b/scripts/tpch_result_verify.py @@ -772,7 +772,7 @@ def verify(query_num=None): file_content = file.read() file_content = re.sub( - r"next\((\w+)\.to_pandas_batches\(\)\)", + r"next\((\w+)\.to_pandas_batches\((.*?)\)\)", r"return \1.to_pandas()", file_content, ) diff --git a/third_party/bigframes_vendored/tpch/queries/q1.py b/third_party/bigframes_vendored/tpch/queries/q1.py index a3d61bce6f..f533776e85 100644 --- 
a/third_party/bigframes_vendored/tpch/queries/q1.py +++ b/third_party/bigframes_vendored/tpch/queries/q1.py @@ -39,4 +39,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ["L_RETURNFLAG", "L_LINESTATUS"] ) - next(result.to_pandas_batches()) + next(result.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q10.py b/third_party/bigframes_vendored/tpch/queries/q10.py index 41165e1ba2..8c0d93dc26 100644 --- a/third_party/bigframes_vendored/tpch/queries/q10.py +++ b/third_party/bigframes_vendored/tpch/queries/q10.py @@ -76,4 +76,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): .head(20) ) - next(q_final.to_pandas_batches()) + next(q_final.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q11.py b/third_party/bigframes_vendored/tpch/queries/q11.py index 223bc8aee8..e4b628e9e6 100644 --- a/third_party/bigframes_vendored/tpch/queries/q11.py +++ b/third_party/bigframes_vendored/tpch/queries/q11.py @@ -43,4 +43,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): result_df = result_df.sort_values(by="VALUE", ascending=False) - next(result_df.to_pandas_batches()) + next(result_df.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q12.py b/third_party/bigframes_vendored/tpch/queries/q12.py index 4a8aca9228..1bc22f1167 100644 --- a/third_party/bigframes_vendored/tpch/queries/q12.py +++ b/third_party/bigframes_vendored/tpch/queries/q12.py @@ -46,4 +46,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): agg_results = typing.cast(bpd.DataFrame, agg_results).sort_values("L_SHIPMODE") - next(agg_results.to_pandas_batches()) + next(agg_results.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q13.py b/third_party/bigframes_vendored/tpch/queries/q13.py index 3a69e44c50..8201a1191d 100644 --- a/third_party/bigframes_vendored/tpch/queries/q13.py +++ b/third_party/bigframes_vendored/tpch/queries/q13.py @@ -34,4 +34,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ["CUSTDIST", "C_COUNT"], ascending=[False, False] ) - next(q_final.to_pandas_batches()) + next(q_final.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q14.py b/third_party/bigframes_vendored/tpch/queries/q14.py index 36b5e569cb..f3b747219b 100644 --- a/third_party/bigframes_vendored/tpch/queries/q14.py +++ b/third_party/bigframes_vendored/tpch/queries/q14.py @@ -42,4 +42,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): .to_frame(name="PROMO_REVENUE") ) - next(promo_revenue_percent.to_pandas_batches()) + next(promo_revenue_percent.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q15.py b/third_party/bigframes_vendored/tpch/queries/q15.py index 7e73935160..1cba0ca4bc 100644 --- a/third_party/bigframes_vendored/tpch/queries/q15.py +++ b/third_party/bigframes_vendored/tpch/queries/q15.py @@ -50,4 +50,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): q_final = max_revenue_suppliers[ ["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE", "TOTAL_REVENUE"] ].sort_values("S_SUPPKEY") - next(q_final.to_pandas_batches()) + next(q_final.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q16.py b/third_party/bigframes_vendored/tpch/queries/q16.py index 2559d7ace6..a02dcef5dc 100644 --- 
a/third_party/bigframes_vendored/tpch/queries/q16.py +++ b/third_party/bigframes_vendored/tpch/queries/q16.py @@ -47,4 +47,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ascending=[False, True, True, True], ) - next(q_final.to_pandas_batches()) + next(q_final.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q17.py b/third_party/bigframes_vendored/tpch/queries/q17.py index 62c66acad8..e6a87dc482 100644 --- a/third_party/bigframes_vendored/tpch/queries/q17.py +++ b/third_party/bigframes_vendored/tpch/queries/q17.py @@ -37,4 +37,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): (q_final[["L_EXTENDEDPRICE"]].sum() / 7.0).round(2).to_frame(name="AVG_YEARLY") ) - next(q_final.to_pandas_batches()) + next(q_final.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q18.py b/third_party/bigframes_vendored/tpch/queries/q18.py index ac42613b17..c6802e6808 100644 --- a/third_party/bigframes_vendored/tpch/queries/q18.py +++ b/third_party/bigframes_vendored/tpch/queries/q18.py @@ -48,4 +48,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ) q_final = final_result.head(100) - next(q_final.to_pandas_batches()) + next(q_final.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q2.py b/third_party/bigframes_vendored/tpch/queries/q2.py index 5a745db6fb..e154e8ae98 100644 --- a/third_party/bigframes_vendored/tpch/queries/q2.py +++ b/third_party/bigframes_vendored/tpch/queries/q2.py @@ -59,4 +59,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ) result_df = sort.head(100) - next(result_df.to_pandas_batches()) + next(result_df.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q20.py b/third_party/bigframes_vendored/tpch/queries/q20.py index e5958e96d5..5c2d8d391f 100644 --- a/third_party/bigframes_vendored/tpch/queries/q20.py +++ b/third_party/bigframes_vendored/tpch/queries/q20.py @@ -59,4 +59,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): final_result = final_filtered.merge(q3, left_on="PS_SUPPKEY", right_on="S_SUPPKEY") final_result = final_result[["S_NAME", "S_ADDRESS"]].sort_values(by="S_NAME") - next(final_result.to_pandas_batches()) + next(final_result.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q21.py b/third_party/bigframes_vendored/tpch/queries/q21.py index 37fc2e75d1..c27aab0e69 100644 --- a/third_party/bigframes_vendored/tpch/queries/q21.py +++ b/third_party/bigframes_vendored/tpch/queries/q21.py @@ -56,4 +56,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): by=["NUMWAIT", "S_NAME"], ascending=[False, True] ).head(100) - next(q_final.to_pandas_batches()) + next(q_final.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q22.py b/third_party/bigframes_vendored/tpch/queries/q22.py index e593b7beac..153ef63c5d 100644 --- a/third_party/bigframes_vendored/tpch/queries/q22.py +++ b/third_party/bigframes_vendored/tpch/queries/q22.py @@ -52,4 +52,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): result = result.sort_values(by="CNTRYCODE") - next(result.to_pandas_batches()) + next(result.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q3.py b/third_party/bigframes_vendored/tpch/queries/q3.py index 9fb089fcef..60d181a603 100644 --- 
a/third_party/bigframes_vendored/tpch/queries/q3.py +++ b/third_party/bigframes_vendored/tpch/queries/q3.py @@ -39,4 +39,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): sorted_sel = sel.sort_values(by=["REVENUE", "O_ORDERDATE"], ascending=[False, True]) result_df = sorted_sel.head(10) - next(result_df.to_pandas_batches()) + next(result_df.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q4.py b/third_party/bigframes_vendored/tpch/queries/q4.py index bc91aa1ada..3782a7273f 100644 --- a/third_party/bigframes_vendored/tpch/queries/q4.py +++ b/third_party/bigframes_vendored/tpch/queries/q4.py @@ -32,4 +32,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): agg = gb.agg(ORDER_COUNT=bpd.NamedAgg(column="L_ORDERKEY", aggfunc="count")) result_df = typing.cast(bpd.DataFrame, agg).sort_values(["O_ORDERPRIORITY"]) - next(result_df.to_pandas_batches()) + next(result_df.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q5.py b/third_party/bigframes_vendored/tpch/queries/q5.py index 4b0c522b37..406df79a5a 100644 --- a/third_party/bigframes_vendored/tpch/queries/q5.py +++ b/third_party/bigframes_vendored/tpch/queries/q5.py @@ -52,4 +52,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): gb = jn5.groupby("N_NAME", as_index=False)["REVENUE"].sum() result_df = gb.sort_values("REVENUE", ascending=False) - next(result_df.to_pandas_batches()) + next(result_df.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q6.py b/third_party/bigframes_vendored/tpch/queries/q6.py index 2e5272073b..8fe067bafe 100644 --- a/third_party/bigframes_vendored/tpch/queries/q6.py +++ b/third_party/bigframes_vendored/tpch/queries/q6.py @@ -27,4 +27,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): .to_frame() ) - next(result_df.to_pandas_batches()) + next(result_df.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q7.py b/third_party/bigframes_vendored/tpch/queries/q7.py index 7325166871..81cdda8788 100644 --- a/third_party/bigframes_vendored/tpch/queries/q7.py +++ b/third_party/bigframes_vendored/tpch/queries/q7.py @@ -60,4 +60,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): result_df = typing.cast(bpd.DataFrame, agg).sort_values( ["SUPP_NATION", "CUST_NATION", "L_YEAR"] ) - next(result_df.to_pandas_batches()) + next(result_df.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q8.py b/third_party/bigframes_vendored/tpch/queries/q8.py index 0dfe2c1208..67e1af1241 100644 --- a/third_party/bigframes_vendored/tpch/queries/q8.py +++ b/third_party/bigframes_vendored/tpch/queries/q8.py @@ -69,4 +69,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): sums["MKT_SHARE"] = (sums["numerator"] / sums["denominator"]).round(2) result_df = sums["MKT_SHARE"].sort_index().rename("MKT_SHARE").reset_index() - next(result_df.to_pandas_batches()) + next(result_df.to_pandas_batches(max_results=1500)) diff --git a/third_party/bigframes_vendored/tpch/queries/q9.py b/third_party/bigframes_vendored/tpch/queries/q9.py index cd95fa8b56..6af33f7569 100644 --- a/third_party/bigframes_vendored/tpch/queries/q9.py +++ b/third_party/bigframes_vendored/tpch/queries/q9.py @@ -65,4 +65,4 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): ["NATION", "O_YEAR"], ascending=[True, False] ) - 
next(q_final.to_pandas_batches()) + next(q_final.to_pandas_batches(max_results=1500)) From 3c4abf24ea186e98f629b6f83c0f3e36dc0571c6 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Mon, 3 Feb 2025 14:46:10 -0800 Subject: [PATCH 32/38] perf: Fall back to ordering by bq pk when possible (#1350) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * perf: Fall back to ordering by bq pk when possible * use pk before index, fix unit test * Apply suggestions from code review * Update bigframes/session/_io/bigquery/read_gbq_table.py * fix null index case --------- Co-authored-by: Tim Sweña (Swast) --- .../session/_io/bigquery/read_gbq_table.py | 77 ++++--------------- bigframes/session/loader.py | 6 +- tests/unit/session/test_read_gbq_table.py | 21 +++-- tests/unit/session/test_session.py | 6 +- 4 files changed, 35 insertions(+), 75 deletions(-) diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index ac9523243e..ed68762ee8 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -152,24 +152,28 @@ def validate_table( return False -def are_index_cols_unique( +def infer_unique_columns( bqclient: bigquery.Client, table: bigquery.table.Table, index_cols: List[str], api_name: str, metadata_only: bool = False, -) -> bool: - if len(index_cols) == 0: - return False +) -> Tuple[str, ...]: + """Return a set of columns that can provide a unique row key or empty if none can be inferred. + + Note: primary keys are not enforced, but these are assumed to be unique + by the query engine, so we make the same assumption here. + """ # If index_cols contain the primary_keys, the query engine assumes they are # provide a unique index. - primary_keys = frozenset(_get_primary_keys(table)) - if (len(primary_keys) > 0) and primary_keys <= frozenset(index_cols): - return True + primary_keys = tuple(_get_primary_keys(table)) + if (len(primary_keys) > 0) and frozenset(primary_keys) <= frozenset(index_cols): + # Essentially, just reordering the primary key to match the index col order + return tuple(index_col for index_col in index_cols if index_col in primary_keys) - if metadata_only: + if primary_keys or metadata_only or (not index_cols): # Sometimes not worth scanning data to check uniqueness - return False + return primary_keys # TODO(b/337925142): Avoid a "SELECT *" subquery here by ensuring # table_expression only selects just index_cols. is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table.reference) @@ -178,7 +182,9 @@ def are_index_cols_unique( results = bqclient.query_and_wait(is_unique_sql, job_config=job_config) row = next(iter(results)) - return row["total_count"] == row["distinct_count"] + if row["total_count"] == row["distinct_count"]: + return tuple(index_cols) + return () def _get_primary_keys( @@ -279,54 +285,3 @@ def get_index_cols( index_cols = primary_keys return index_cols - - -def get_time_travel_datetime_and_table_metadata( - bqclient: bigquery.Client, - table_ref: bigquery.TableReference, - *, - api_name: str, - cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]], - use_cache: bool = True, -) -> Tuple[datetime.datetime, bigquery.Table]: - cached_table = cache.get(table_ref) - if use_cache and cached_table is not None: - snapshot_timestamp, _ = cached_table - - # Cache hit could be unexpected. See internal issue 329545805. 
- # Raise a warning with more information about how to avoid the - # problems with the cache. - msg = ( - f"Reading cached table from {snapshot_timestamp} to avoid " - "incompatibilies with previous reads of this table. To read " - "the latest version, set `use_cache=False` or close the " - "current session with Session.close() or " - "bigframes.pandas.close_session()." - ) - # There are many layers before we get to (possibly) the user's code: - # pandas.read_gbq_table - # -> with_default_session - # -> Session.read_gbq_table - # -> _read_gbq_table - # -> _get_snapshot_sql_and_primary_key - # -> get_snapshot_datetime_and_table_metadata - warnings.warn(msg, stacklevel=7) - return cached_table - - # TODO(swast): It's possible that the table metadata is changed between now - # and when we run the CURRENT_TIMESTAMP() query to see when we can time - # travel to. Find a way to fetch the table metadata and BQ's current time - # atomically. - table = bqclient.get_table(table_ref) - - job_config = bigquery.QueryJobConfig() - job_config.labels["bigframes-api"] = api_name - snapshot_timestamp = list( - bqclient.query( - "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`", - job_config=job_config, - ).result() - )[0][0] - cached_table = (snapshot_timestamp, table) - cache[table_ref] = cached_table - return cached_table diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 43faae37c3..0f6ea4afff 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -424,7 +424,7 @@ def read_gbq_table( # in the query that checks for index uniqueness. # TODO(b/338065601): Provide a way to assume uniqueness and avoid this # check. - is_index_unique = bf_read_gbq_table.are_index_cols_unique( + primary_key = bf_read_gbq_table.infer_unique_columns( bqclient=self._bqclient, table=table, index_cols=index_cols, @@ -440,12 +440,12 @@ def read_gbq_table( schema=schema, predicate=filter_str, at_time=time_travel_timestamp if enable_snapshot else None, - primary_key=index_cols if is_index_unique else (), + primary_key=primary_key, session=self._session, ) # if we don't have a unique index, we order by row hash if we are in strict mode if self._force_total_order: - if not is_index_unique: + if not primary_key: array_value = array_value.order_by( [ bigframes.core.ordering.OrderingExpression( diff --git a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py index 6933957e53..8f01820fd3 100644 --- a/tests/unit/session/test_read_gbq_table.py +++ b/tests/unit/session/test_read_gbq_table.py @@ -27,8 +27,13 @@ @pytest.mark.parametrize( ("index_cols", "primary_keys", "values_distinct", "expected"), ( - (["col1", "col2"], ["col1", "col2", "col3"], False, False), - (["col1", "col2", "col3"], ["col1", "col2", "col3"], True, True), + (["col1", "col2"], ["col1", "col2", "col3"], False, ("col1", "col2", "col3")), + ( + ["col1", "col2", "col3"], + ["col1", "col2", "col3"], + True, + ("col1", "col2", "col3"), + ), ( ["col2", "col3", "col1"], [ @@ -36,14 +41,14 @@ "col2", ], True, - True, + ("col2", "col3"), ), - (["col1", "col2"], [], False, False), - ([], ["col1", "col2", "col3"], False, False), - ([], [], False, False), + (["col1", "col2"], [], False, ()), + ([], ["col1", "col2", "col3"], False, ("col1", "col2", "col3")), + ([], [], False, ()), ), ) -def test_are_index_cols_unique(index_cols, primary_keys, values_distinct, expected): +def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expected): """If a primary key is set on the table, we use that 
as the index column by default, no error should be raised in this case. @@ -87,6 +92,6 @@ def test_are_index_cols_unique(index_cols, primary_keys, values_distinct, expect ) table._properties["location"] = session._location - result = bf_read_gbq_table.are_index_cols_unique(bqclient, table, index_cols, "") + result = bf_read_gbq_table.infer_unique_columns(bqclient, table, index_cols, "") assert result == expected diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 210fc5d633..13531acbea 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -217,10 +217,10 @@ def test_read_gbq_cached_table(): table, ) - session.bqclient.get_table.return_value = table - session.bqclient.query_and_wait.return_value = ( - {"total_count": 3, "distinct_count": 2}, + session.bqclient.query_and_wait = mock.MagicMock( + return_value=({"total_count": 3, "distinct_count": 2},) ) + session.bqclient.get_table.return_value = table with pytest.warns(UserWarning, match=re.escape("use_cache=False")): df = session.read_gbq("my-project.my_dataset.my_table") From 866ba9efb54f11c1fc2ced0d7995fff86277b049 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 3 Feb 2025 16:27:21 -0800 Subject: [PATCH 33/38] chore: support timedeltas for read_pandas() (#1349) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: support timedeltas for read_pandas() * fix format * fix mypy error * centralize timedelta to microsecs replacement logic * fix format * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * remove redundant imports * polish todo comment * update timdelta to microsecond conversion algo * update python doc --------- Co-authored-by: Owl Bot --- bigframes/core/compile/compiler.py | 5 +++ bigframes/core/local_data.py | 3 ++ bigframes/core/schema.py | 8 +++-- bigframes/core/utils.py | 27 ++++++++++++++ bigframes/dtypes.py | 5 +++ bigframes/session/__init__.py | 1 + bigframes/session/_io/pandas.py | 6 +++- bigframes/session/loader.py | 18 ++++++++-- tests/system/small/test_session.py | 56 ++++++++++++++++++++++++++++++ 9 files changed, 123 insertions(+), 6 deletions(-) diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 0d047b366e..a72ca47190 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -24,6 +24,7 @@ import google.cloud.bigquery import pandas as pd +from bigframes.core import utils import bigframes.core.compile.compiled as compiled import bigframes.core.compile.concat as concat_impl import bigframes.core.compile.explode @@ -173,6 +174,10 @@ def compile_readlocal(self, node: nodes.ReadLocalNode): io.BytesIO(node.feather_bytes), columns=[item.source_id for item in node.scan_list.items], ) + + # Convert timedeltas to microseconds for compatibility with BigQuery + _ = utils.replace_timedeltas_with_micros(array_as_pd) + offsets = node.offsets_col.sql if node.offsets_col else None return compiled.UnorderedIR.from_pandas( array_as_pd, node.scan_list, offsets=offsets diff --git a/bigframes/core/local_data.py b/bigframes/core/local_data.py index f665948be2..d891e385d5 100644 --- a/bigframes/core/local_data.py +++ b/bigframes/core/local_data.py @@ -59,6 +59,9 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType: if pa.types.is_time64(type): # This is potentially lossy, but BigFrames doesn't support ns return pa.time64("us") + if 
pa.types.is_duration(type): + # This is potentially lossy, but BigFrames doesn't support ns + return pa.duration("us") if pa.types.is_decimal128(type): return pa.decimal128(38, 9) if pa.types.is_decimal256(type): diff --git a/bigframes/core/schema.py b/bigframes/core/schema.py index 2b49f81d85..e3808dfffd 100644 --- a/bigframes/core/schema.py +++ b/bigframes/core/schema.py @@ -38,9 +38,13 @@ class ArraySchema: items: typing.Tuple[SchemaItem, ...] @classmethod - def from_bq_table(cls, table: google.cloud.bigquery.Table): + def from_bq_table( + cls, + table: google.cloud.bigquery.Table, + column_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = {}, + ): items = tuple( - SchemaItem(name, dtype) + SchemaItem(name, column_type_overrides.get(name, dtype)) for name, dtype in bigframes.dtypes.bf_type_from_type_kind( table.schema ).items() diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index f9ca6cb5f0..7cb2ec7535 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -19,6 +19,7 @@ import bigframes_vendored.pandas.io.common as vendored_pandas_io_common import pandas as pd +import pandas.api.types as pdtypes import typing_extensions import bigframes.exceptions as bfe @@ -184,3 +185,29 @@ def wrapper(*args, **kwargs): return wrapper return decorator + + +def timedelta_to_micros(td: pd.Timedelta) -> int: + # td.value returns total nanoseconds. + return td.value // 1000 + + +def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]: + """ + Replaces in-place timedeltas to integer values in microseconds. Nanosecond part is ignored. + + Returns: + The names of updated columns + """ + updated_columns = [] + + for col in dataframe.columns: + if pdtypes.is_timedelta64_dtype(dataframe[col].dtype): + dataframe[col] = dataframe[col].apply(timedelta_to_micros) + updated_columns.append(col) + + if pdtypes.is_timedelta64_dtype(dataframe.index.dtype): + dataframe.index = dataframe.index.map(timedelta_to_micros) + updated_columns.append(dataframe.index.name) + + return updated_columns diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index a8d9d60366..8b1ca3b0c8 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -409,11 +409,16 @@ def dtype_for_etype(etype: ExpressionType) -> Dtype: def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: if arrow_dtype in _ARROW_TO_BIGFRAMES: return _ARROW_TO_BIGFRAMES[arrow_dtype] + if pa.types.is_list(arrow_dtype): return pd.ArrowDtype(arrow_dtype) + if pa.types.is_struct(arrow_dtype): return pd.ArrowDtype(arrow_dtype) + if pa.types.is_duration(arrow_dtype): + return pd.ArrowDtype(arrow_dtype) + # BigFrames doesn't distinguish between string and large_string because the # largest string (2 GB) is already larger than the largest BigQuery row. 
if pa.types.is_string(arrow_dtype) or pa.types.is_large_string(arrow_dtype): diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 24963bdcbc..c8c44be40b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -120,6 +120,7 @@ pandas.ArrowDtype(pa.timestamp("us", tz="UTC")), pandas.ArrowDtype(pa.decimal128(38, 9)), pandas.ArrowDtype(pa.decimal256(76, 38)), + pandas.ArrowDtype(pa.duration("us")), ) diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py index 301e1c4ebb..532a909430 100644 --- a/bigframes/session/_io/pandas.py +++ b/bigframes/session/_io/pandas.py @@ -14,7 +14,7 @@ from __future__ import annotations import dataclasses -from typing import Collection, Union +from typing import Collection, List, Union import bigframes_vendored.constants as constants import db_dtypes # type: ignore @@ -38,6 +38,7 @@ class DataFrameAndLabels: column_labels: Collection index_labels: Collection ordering_col: str + timedelta_cols: List[str] def _arrow_to_pandas_arrowdtype( @@ -163,9 +164,12 @@ def pandas_to_bq_compatible(pandas_dataframe: pandas.DataFrame) -> DataFrameAndL pandas_dataframe_copy.columns = pandas.Index(new_col_ids) pandas_dataframe_copy[ordering_col] = np.arange(pandas_dataframe_copy.shape[0]) + timedelta_cols = utils.replace_timedeltas_with_micros(pandas_dataframe_copy) + return DataFrameAndLabels( df=pandas_dataframe_copy, column_labels=col_labels, index_labels=idx_labels, ordering_col=ordering_col, + timedelta_cols=timedelta_cols, ) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 0f6ea4afff..ba693696c3 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -176,10 +176,16 @@ def read_pandas_load_job( self._start_generic_job(load_job) destination_table = self._bqclient.get_table(load_table_destination) + col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = { + col: bigframes.dtypes.TIMEDETLA_DTYPE + for col in df_and_labels.timedelta_cols + } array_value = core.ArrayValue.from_table( table=destination_table, - # TODO: Generate this directly from original pandas df. - schema=schemata.ArraySchema.from_bq_table(destination_table), + # TODO (b/394156190): Generate this directly from original pandas df. + schema=schemata.ArraySchema.from_bq_table( + destination_table, col_type_overrides + ), session=self._session, offsets_col=ordering_col, ).drop_columns([ordering_col]) @@ -229,10 +235,16 @@ def read_pandas_streaming( f"Problem loading at least one row from DataFrame: {errors}. {constants.FEEDBACK_LINK}" ) + col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = { + col: bigframes.dtypes.TIMEDETLA_DTYPE + for col in df_and_labels.timedelta_cols + } array_value = ( core.ArrayValue.from_table( table=destination_table, - schema=schemata.ArraySchema.from_bq_table(destination_table), + schema=schemata.ArraySchema.from_bq_table( + destination_table, col_type_overrides + ), session=self._session, # Don't set the offsets column because we want to group by it. 
) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index e95509e033..a4acb72117 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -691,6 +691,62 @@ def test_read_pandas_tokyo( assert len(expected) == result.total_rows +@pytest.mark.parametrize( + "write_engine", + ["default", "bigquery_inline", "bigquery_load", "bigquery_streaming"], +) +def test_read_pandas_timedelta_dataframes(session, write_engine): + expected_df = pd.DataFrame({"my_col": pd.to_timedelta([1, 2, 3], unit="d")}) + + actual_result = ( + session.read_pandas(expected_df, write_engine=write_engine) + .to_pandas() + .astype("timedelta64[ns]") + ) + + if write_engine == "bigquery_streaming": + expected_df.index = pd.Index([pd.NA] * 3, dtype="Int64") + pd.testing.assert_frame_equal(actual_result, expected_df, check_index_type=False) + + +@pytest.mark.parametrize( + "write_engine", + ["default", "bigquery_inline", "bigquery_load", "bigquery_streaming"], +) +def test_read_pandas_timedelta_series(session, write_engine): + expected_series = pd.Series(pd.to_timedelta([1, 2, 3], unit="d")) + + actual_result = ( + session.read_pandas(expected_series, write_engine=write_engine) + .to_pandas() + .astype("timedelta64[ns]") + ) + + if write_engine == "bigquery_streaming": + expected_series.index = pd.Index([pd.NA] * 3, dtype="Int64") + pd.testing.assert_series_equal( + actual_result, expected_series, check_index_type=False + ) + + +@pytest.mark.parametrize( + "write_engine", + ["default", "bigquery_inline", "bigquery_load"], +) +def test_read_pandas_timedelta_index(session, write_engine): + expected_index = pd.to_timedelta( + [1, 2, 3], unit="d" + ) # to_timedelta returns an index + + actual_result = ( + session.read_pandas(expected_index, write_engine=write_engine) + .to_pandas() + .astype("timedelta64[ns]") + ) + + pd.testing.assert_index_equal(actual_result, expected_index) + + @utils.skip_legacy_pandas @pytest.mark.parametrize( ("write_engine",), From 1716106d4187e9ca34f0ceac91cb65455a24a002 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 3 Feb 2025 19:21:12 -0800 Subject: [PATCH 34/38] test: update small `remote_function` tests path (#1357) --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 5b38d92c60..b851bf160d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -698,7 +698,7 @@ def system_prerelease(session: nox.sessions.Session): # This would mean that we will only rely on the standard remote function # tests. small_remote_function_tests = os.path.join( - small_tests_dir, "test_remote_function.py" + small_tests_dir, "functions", "test_remote_function.py" ) assert os.path.exists(small_remote_function_tests) From 23624efc53d748eb74f1d988ba12151d11289647 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Tue, 4 Feb 2025 10:00:21 -0800 Subject: [PATCH 35/38] chore: update benchmark verify code. 
(#1356) --- scripts/tpch_result_verify.py | 717 +----------------- .../bigframes_vendored/tpch/TPC-EULA.txt | 320 ++++++++ .../tpch/sql_queries/q1.sql | 21 + .../tpch/sql_queries/q10.sql | 32 + .../tpch/sql_queries/q11.sql | 27 + .../tpch/sql_queries/q12.sql | 28 + .../tpch/sql_queries/q13.sql | 18 + .../tpch/sql_queries/q14.sql | 13 + .../tpch/sql_queries/q15.sql | 26 + .../tpch/sql_queries/q16.sql | 30 + .../tpch/sql_queries/q17.sql | 17 + .../tpch/sql_queries/q18.sql | 33 + .../tpch/sql_queries/q19.sql | 35 + .../tpch/sql_queries/q2.sql | 44 ++ .../tpch/sql_queries/q20.sql | 37 + .../tpch/sql_queries/q21.sql | 40 + .../tpch/sql_queries/q22.sql | 36 + .../tpch/sql_queries/q3.sql | 23 + .../tpch/sql_queries/q4.sql | 21 + .../tpch/sql_queries/q5.sql | 24 + .../tpch/sql_queries/q6.sql | 9 + .../tpch/sql_queries/q7.sql | 50 ++ .../tpch/sql_queries/q8.sql | 39 + .../tpch/sql_queries/q9.sql | 32 + 24 files changed, 971 insertions(+), 701 deletions(-) create mode 100644 third_party/bigframes_vendored/tpch/TPC-EULA.txt create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q1.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q10.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q11.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q12.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q13.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q14.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q15.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q16.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q17.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q18.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q19.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q2.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q20.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q21.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q22.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q3.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q4.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q5.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q6.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q7.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q8.sql create mode 100644 third_party/bigframes_vendored/tpch/sql_queries/q9.sql diff --git a/scripts/tpch_result_verify.py b/scripts/tpch_result_verify.py index ae01f2031c..0c932f6eac 100644 --- a/scripts/tpch_result_verify.py +++ b/scripts/tpch_result_verify.py @@ -24,705 +24,16 @@ project_id = "bigframes-dev-perf" dataset_id = "tpch_0001g" -line_item_ds = f"bigframes-dev-perf.{dataset_id}.LINEITEM" -region_ds = f"bigframes-dev-perf.{dataset_id}.REGION" -nation_ds = f"bigframes-dev-perf.{dataset_id}.NATION" -supplier_ds = f"bigframes-dev-perf.{dataset_id}.SUPPLIER" -part_ds = f"bigframes-dev-perf.{dataset_id}.PART" -part_supp_ds = f"bigframes-dev-perf.{dataset_id}.PARTSUPP" -customer_ds = f"bigframes-dev-perf.{dataset_id}.CUSTOMER" -orders_ds = f"bigframes-dev-perf.{dataset_id}.ORDERS" - -q1_query = f""" - select - l_returnflag, - l_linestatus, - sum(l_quantity) as sum_qty, - sum(l_extendedprice) as sum_base_price, - sum(l_extendedprice * (1 - 
l_discount)) as sum_disc_price, - sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, - avg(l_quantity) as avg_qty, - avg(l_extendedprice) as avg_price, - avg(l_discount) as avg_disc, - count(*) as count_order - from - {line_item_ds} - where - l_shipdate <= '1998-09-02' - group by - l_returnflag, - l_linestatus - order by - l_returnflag, - l_linestatus -""" - -q2_query = f""" - select - s_acctbal, - s_name, - n_name, - p_partkey, - p_mfgr, - s_address, - s_phone, - s_comment - from - {part_ds}, - {supplier_ds}, - {part_supp_ds}, - {nation_ds}, - {region_ds} - where - p_partkey = ps_partkey - and s_suppkey = ps_suppkey - and p_size = 15 - and p_type like '%BRASS' - and s_nationkey = n_nationkey - and n_regionkey = r_regionkey - and r_name = 'EUROPE' - and ps_supplycost = ( - select - min(ps_supplycost) - from - {part_supp_ds}, - {supplier_ds}, - {nation_ds}, - {region_ds} - where - p_partkey = ps_partkey - and s_suppkey = ps_suppkey - and s_nationkey = n_nationkey - and n_regionkey = r_regionkey - and r_name = 'EUROPE' - ) - order by - s_acctbal desc, - n_name, - s_name, - p_partkey - limit 100 -""" - -q3_query = f""" - select - l_orderkey, - sum(l_extendedprice * (1 - l_discount)) as revenue, - o_orderdate, - o_shippriority - from - {customer_ds}, - {orders_ds}, - {line_item_ds} - where - c_mktsegment = 'BUILDING' - and c_custkey = o_custkey - and l_orderkey = o_orderkey - and o_orderdate < '1995-03-15' - and l_shipdate > '1995-03-15' - group by - l_orderkey, - o_orderdate, - o_shippriority - order by - revenue desc, - o_orderdate - limit 10 -""" - -q4_query = f""" - select - o_orderpriority, - count(*) as order_count - from - {orders_ds} - where - o_orderdate >= date '1993-07-01' - and o_orderdate < date '1993-10-01' - and exists ( - select - * - from - {line_item_ds} - where - l_orderkey = o_orderkey - and l_commitdate < l_receiptdate - ) - group by - o_orderpriority - order by - o_orderpriority -""" - -q5_query = f""" - select - n_name, - sum(l_extendedprice * (1 - l_discount)) as revenue - from - {customer_ds}, - {orders_ds}, - {line_item_ds}, - {supplier_ds}, - {nation_ds}, - {region_ds} - where - c_custkey = o_custkey - and l_orderkey = o_orderkey - and l_suppkey = s_suppkey - and c_nationkey = s_nationkey - and s_nationkey = n_nationkey - and n_regionkey = r_regionkey - and r_name = 'ASIA' - and o_orderdate >= date '1994-01-01' - and o_orderdate < date '1995-01-01' - group by - n_name - order by - revenue desc -""" - -q6_query = f""" - select - sum(l_extendedprice * l_discount) as revenue - from - {line_item_ds} - where - l_shipdate >= date '1994-01-01' - and l_shipdate < date '1994-01-01' + interval '1' year - and l_discount between .05 and .07 - and l_quantity < 24 -""" - -q7_query = f""" - select - supp_nation, - cust_nation, - l_year, - sum(volume) as revenue - from - ( - select - n1.n_name as supp_nation, - n2.n_name as cust_nation, - EXTRACT(YEAR FROM l_shipdate) as l_year, - l_extendedprice * (1 - l_discount) as volume - from - {supplier_ds}, - {line_item_ds}, - {orders_ds}, - {customer_ds}, - {nation_ds} n1, - {nation_ds} n2 - where - s_suppkey = l_suppkey - and o_orderkey = l_orderkey - and c_custkey = o_custkey - and s_nationkey = n1.n_nationkey - and c_nationkey = n2.n_nationkey - and ( - (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') - or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') - ) - and l_shipdate between date '1995-01-01' and date '1996-12-31' - ) as shipping - group by - supp_nation, - cust_nation, - l_year - order by - supp_nation, - 
cust_nation, - l_year -""" - -q8_query = f""" - select - o_year, - round( - sum(case - when nation = 'BRAZIL' then volume - else 0 - end) / sum(volume) - , 2) as mkt_share - from - ( - select - extract(year from o_orderdate) as o_year, - l_extendedprice * (1 - l_discount) as volume, - n2.n_name as nation - from - {part_ds}, - {supplier_ds}, - {line_item_ds}, - {orders_ds}, - {customer_ds}, - {nation_ds} n1, - {nation_ds} n2, - {region_ds} - where - p_partkey = l_partkey - and s_suppkey = l_suppkey - and l_orderkey = o_orderkey - and o_custkey = c_custkey - and c_nationkey = n1.n_nationkey - and n1.n_regionkey = r_regionkey - and r_name = 'AMERICA' - and s_nationkey = n2.n_nationkey - and o_orderdate between date '1995-01-01' and date '1996-12-31' - and p_type = 'ECONOMY ANODIZED STEEL' - ) as all_nations - group by - o_year - order by - o_year -""" - -q9_query = f""" - select - nation, - o_year, - round(sum(amount), 2) as sum_profit - from - ( - select - n_name as nation, - EXTRACT(YEAR FROM o_orderdate) as o_year, - l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount - from - {part_ds}, - {supplier_ds}, - {line_item_ds}, - {part_supp_ds}, - {orders_ds}, - {nation_ds} - where - s_suppkey = l_suppkey - and ps_suppkey = l_suppkey - and ps_partkey = l_partkey - and p_partkey = l_partkey - and o_orderkey = l_orderkey - and s_nationkey = n_nationkey - and p_name like '%green%' - ) as profit - group by - nation, - o_year - order by - nation, - o_year desc -""" - -q10_query = f""" - select - c_custkey, - c_name, - round(sum(l_extendedprice * (1 - l_discount)), 2) as revenue, - c_acctbal, - n_name, - c_address, - c_phone, - c_comment - from - {customer_ds}, - {orders_ds}, - {line_item_ds}, - {nation_ds} - where - c_custkey = o_custkey - and l_orderkey = o_orderkey - and o_orderdate >= date '1993-10-01' - and o_orderdate < date '1993-10-01' + interval '3' month - and l_returnflag = 'R' - and c_nationkey = n_nationkey - group by - c_custkey, - c_name, - c_acctbal, - c_phone, - n_name, - c_address, - c_comment - order by - revenue desc - limit 20 -""" - -q11_query = f""" - select - ps_partkey, - round(sum(ps_supplycost * ps_availqty), 2) as value - from - {part_supp_ds}, - {supplier_ds}, - {nation_ds} - where - ps_suppkey = s_suppkey - and s_nationkey = n_nationkey - and n_name = 'GERMANY' - group by - ps_partkey having - sum(ps_supplycost * ps_availqty) > ( - select - sum(ps_supplycost * ps_availqty) * 0.0001 - from - {part_supp_ds}, - {supplier_ds}, - {nation_ds} - where - ps_suppkey = s_suppkey - and s_nationkey = n_nationkey - and n_name = 'GERMANY' - ) - order by - value desc -""" - -q12_query = f""" - select - l_shipmode, - sum(case - when o_orderpriority = '1-URGENT' - or o_orderpriority = '2-HIGH' - then 1 - else 0 - end) as high_line_count, - sum(case - when o_orderpriority <> '1-URGENT' - and o_orderpriority <> '2-HIGH' - then 1 - else 0 - end) as low_line_count - from - {orders_ds}, - {line_item_ds} - where - o_orderkey = l_orderkey - and l_shipmode in ('MAIL', 'SHIP') - and l_commitdate < l_receiptdate - and l_shipdate < l_commitdate - and l_receiptdate >= date '1994-01-01' - and l_receiptdate < date '1994-01-01' + interval '1' year - group by - l_shipmode - order by - l_shipmode -""" - -q13_query = f""" - SELECT - c_count, COUNT(*) AS custdist - FROM ( - SELECT - c_custkey, - COUNT(o_orderkey) AS c_count - FROM - {customer_ds} LEFT OUTER JOIN {orders_ds} ON - c_custkey = o_custkey - AND o_comment NOT LIKE '%special%requests%' - GROUP BY - c_custkey - ) AS c_orders - 
GROUP BY - c_count - ORDER BY - custdist DESC, - c_count DESC -""" - -q14_query = f""" - select - round(100.00 * sum(case - when p_type like 'PROMO%' - then l_extendedprice * (1 - l_discount) - else 0 - end) / sum(l_extendedprice * (1 - l_discount)), 2) as promo_revenue - from - {line_item_ds}, - {part_ds} - where - l_partkey = p_partkey - and l_shipdate >= date '1995-09-01' - and l_shipdate < date '1995-09-01' + interval '1' month -""" - -q15_query = f""" - WITH revenue AS ( - SELECT - l_suppkey AS supplier_no, - SUM(l_extendedprice * (1 - l_discount)) AS total_revenue - FROM - {line_item_ds} - WHERE - l_shipdate >= DATE '1996-01-01' - AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' month - GROUP BY - l_suppkey - ) - SELECT - s.s_suppkey, - s.s_name, - s.s_address, - s.s_phone, - r.total_revenue - FROM - {supplier_ds} s - JOIN - revenue r ON s.s_suppkey = r.supplier_no - WHERE - r.total_revenue = (SELECT MAX(total_revenue) FROM revenue) - ORDER BY - s.s_suppkey; -""" - -q16_query = f""" - select - p_brand, - p_type, - p_size, - count(distinct ps_suppkey) as supplier_cnt - from - {part_supp_ds}, - {part_ds} - where - p_partkey = ps_partkey - and p_brand <> 'Brand#45' - and p_type not like 'MEDIUM POLISHED%' - and p_size in (49, 14, 23, 45, 19, 3, 36, 9) - and ps_suppkey not in ( - select - s_suppkey - from - {supplier_ds} - where - s_comment like '%Customer%Complaints%' - ) - group by - p_brand, - p_type, - p_size - order by - supplier_cnt desc, - p_brand, - p_type, - p_size -""" - -q17_query = f""" - select - round(sum(l_extendedprice) / 7.0, 2) as avg_yearly - from - {line_item_ds}, - {part_ds} - where - p_partkey = l_partkey - and p_brand = 'Brand#23' - and p_container = 'MED BOX' - and l_quantity < ( - select - 0.2 * avg(l_quantity) - from - {line_item_ds} - where - l_partkey = p_partkey - ) -""" - -q18_query = f""" - select - c_name, - c_custkey, - o_orderkey, - o_orderdate as o_orderdat, - o_totalprice, - sum(l_quantity) as col6 - from - {customer_ds}, - {orders_ds}, - {line_item_ds} - where - o_orderkey in ( - select - l_orderkey - from - {line_item_ds} - group by - l_orderkey having - sum(l_quantity) > 300 - ) - and c_custkey = o_custkey - and o_orderkey = l_orderkey - group by - c_name, - c_custkey, - o_orderkey, - o_orderdate, - o_totalprice - order by - o_totalprice desc, - o_orderdate - limit 100 -""" - -q19_query = f""" - select - round(sum(l_extendedprice* (1 - l_discount)), 2) as revenue - from - {line_item_ds}, - {part_ds} - where - ( - p_partkey = l_partkey - and p_brand = 'Brand#12' - and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') - and l_quantity >= 1 and l_quantity <= 1 + 10 - and p_size between 1 and 5 - and l_shipmode in ('AIR', 'AIR REG') - and l_shipinstruct = 'DELIVER IN PERSON' - ) - or - ( - p_partkey = l_partkey - and p_brand = 'Brand#23' - and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') - and l_quantity >= 10 and l_quantity <= 20 - and p_size between 1 and 10 - and l_shipmode in ('AIR', 'AIR REG') - and l_shipinstruct = 'DELIVER IN PERSON' - ) - or - ( - p_partkey = l_partkey - and p_brand = 'Brand#34' - and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') - and l_quantity >= 20 and l_quantity <= 30 - and p_size between 1 and 15 - and l_shipmode in ('AIR', 'AIR REG') - and l_shipinstruct = 'DELIVER IN PERSON' - ) -""" - -q20_query = f""" - select - s_name, - s_address - from - {supplier_ds}, - {nation_ds} - where - s_suppkey in ( - select - ps_suppkey - from - {part_supp_ds} - where - ps_partkey in ( - select - 
p_partkey - from - {part_ds} - where - p_name like 'forest%' - ) - and ps_availqty > ( - select - 0.5 * sum(l_quantity) - from - {line_item_ds} - where - l_partkey = ps_partkey - and l_suppkey = ps_suppkey - and l_shipdate >= date '1994-01-01' - and l_shipdate < date '1994-01-01' + interval '1' year - ) - ) - and s_nationkey = n_nationkey - and n_name = 'CANADA' - order by - s_name -""" - - -q21_query = f""" - select - s_name, - count(*) as numwait - from - {supplier_ds}, - {line_item_ds} l1, - {orders_ds}, - {nation_ds} - where - s_suppkey = l1.l_suppkey - and o_orderkey = l1.l_orderkey - and o_orderstatus = 'F' - and l1.l_receiptdate > l1.l_commitdate - and exists ( - select - * - from - {line_item_ds} l2 - where - l2.l_orderkey = l1.l_orderkey - and l2.l_suppkey <> l1.l_suppkey - ) - and not exists ( - select - * - from - {line_item_ds} l3 - where - l3.l_orderkey = l1.l_orderkey - and l3.l_suppkey <> l1.l_suppkey - and l3.l_receiptdate > l3.l_commitdate - ) - and s_nationkey = n_nationkey - and n_name = 'SAUDI ARABIA' - group by - s_name - order by - numwait desc, - s_name - limit 100 -""" - -q22_query = f""" - select - cntrycode, - count(*) as numcust, - sum(c_acctbal) as totacctbal - from ( - select - SUBSTR(c_phone, 1, 2) AS cntrycode, - c_acctbal - from - {customer_ds} - where - SUBSTR(c_phone, 1, 2) in - ('13', '31', '23', '29', '30', '18', '17') - and c_acctbal > ( - select - avg(c_acctbal) - from - {customer_ds} - where - c_acctbal > 0.00 - and SUBSTR(c_phone, 1, 2) in - ('13', '31', '23', '29', '30', '18', '17') - ) - and not exists ( - select - * - from - {orders_ds} - where - o_custkey = c_custkey - ) - ) as custsale - group by - cntrycode - order by - cntrycode -""" +dataset = { + "line_item_ds": f"bigframes-dev-perf.{dataset_id}.LINEITEM", + "region_ds": f"bigframes-dev-perf.{dataset_id}.REGION", + "nation_ds": f"bigframes-dev-perf.{dataset_id}.NATION", + "supplier_ds": f"bigframes-dev-perf.{dataset_id}.SUPPLIER", + "part_ds": f"bigframes-dev-perf.{dataset_id}.PART", + "part_supp_ds": f"bigframes-dev-perf.{dataset_id}.PARTSUPP", + "customer_ds": f"bigframes-dev-perf.{dataset_id}.CUSTOMER", + "orders_ds": f"bigframes-dev-perf.{dataset_id}.ORDERS", +} def _execute_query(query): @@ -764,8 +75,12 @@ def verify(query_num=None): for i in tqdm(range_iter, desc="Processing queries"): if query_num is not None and i != query_num: continue - query_var_name = f"q{i}_query" - sql_query = globals().get(query_var_name, "Query not defined") + + # Execute SQL: + sql_file_path = f"third_party/bigframes_vendored/tpch/sql_queries/q{i}.sql" + with open(sql_file_path, "r") as f: + sql_query = f.read() + sql_query = sql_query.format(**dataset) file_path = f"third_party/bigframes_vendored/tpch/queries/q{i}.py" if os.path.exists(file_path): with open(file_path, "r") as file: diff --git a/third_party/bigframes_vendored/tpch/TPC-EULA.txt b/third_party/bigframes_vendored/tpch/TPC-EULA.txt new file mode 100644 index 0000000000..feed8c4973 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/TPC-EULA.txt @@ -0,0 +1,320 @@ +END USER LICENSE AGREEMENT +VERSION 2.2 + +READ THE TERMS AND CONDITIONS OF THIS AGREEMENT ("AGREEMENT") CAREFULLY +BEFORE INSTALLING OR USING THE ACCOMPANYING SOFTWARE. BY INSTALLING OR +USING THE SOFTWARE OR RELATED DOCUMENTATION, YOU AGREE TO BE BOUND BY +THE TERMS OF THIS AGREEMENT. IF YOU DO NOT AGREE TO THE TERMS OF THIS +AGREEMENT, DO NOT INSTALL OR USE THE SOFTWARE. 
IF YOU ARE ACCESSING THE +SOFTWARE ON BEHALF OF YOUR ORGANIZATION, YOU REPRESENT AND WARRANT THAT +YOU HAVE SUFFICIENT AUTHORITY TO BIND YOUR ORGANIZATION TO THIS +AGREEMENT. + +USE AND RE-EXPORT OF THE SOFTWARE IS SUBJECT TO THE UNITED STATES EXPORT +CONTROL ADMINISTRATION REGULATIONS. THE SOFTWARE MAY NOT BE USED BY +UNLICENSED PERSONS OR ENTITIES, AND MAY NOT BE RE- EXPORTED TO ANOTHER +COUNTRY. SEE EXPORT ASSURANCE (CLAUSE 13) OF THIS LICENSE. + +This is a legal agreement between you (or, if you are accessing the +software on behalf of your organization, your organization) ("You" or +"User") and the Transaction Processing Performance Council ("TPC"). This +Agreement states the terms and conditions upon which TPC offers to +license the Software, including, but not limited to, the source code, +scripts, executable programs, drivers, libraries and data files +associated with such programs, and modifications thereof (the +"Software"), and online, electronic or printed documentation +("Documentation," together with the Software, "Materials"). + +LICENSE + +1. Definitions + +"Executive Summary" shall mean a short summary of a TPC Benchmark Result +that shows the configuration, primary metrics, performance data, and +pricing details. The exact requirements for the Executive Summary are +defined in each TPC Benchmark Standard. +"Full Disclosure Report (FDR)" shall mean a document that describes The +TPC Benchmark Result in sufficient detail such that the Result could be +recreated. The exact requirements for the FDR are defined in each TPC +Benchmark Standard. +"TPC Benchmark Result (Result)" shall mean a performance test submitted +to the TPC attested to meet the requirements of a TPC Benchmark Standard +at the time of submission. A Result is documented by an Executive +Summary and, if required, a FDR. +"TPC Benchmark Standard" shall mean a TPC Benchmark Specification and +any associated code or binaries approved by the TPC. The various TPC +Benchmark Standards can be found at +http://www.tpc.org/information/current_specifications.asp. +"TPC Policies" shall mean the guiding principles for how the TPC +conducts its operations and business. The current TPC Policies can be +found at http://www.tpc.org/information/current_specifications.asp. + +2. Ownership. The Materials are licensed, not sold, to You for use only +under the terms of this Agreement. As between You and TPC (and, to the +extent applicable, its licensors), TPC retains all rights, title and +interest to and ownership of the Materials and reserves all rights not +expressly granted to You. + +3. License Grant. Subject to Your compliance in all material respects +with the terms and conditions of this Agreement, TPC grants You a +restricted, non-exclusive, revocable license to install and use the +Materials, but only as expressly permitted herein. You may only use the +Software on computer systems under Your direct control. You may download +multiple copies of the Materials and make verbatim copies of the +original of the Software so long as Your use of such copies complies +with the terms of this Agreement. +a. Use by Individual. If You are accessing the Materials as an +individual, only You (as an individual) may access and use the +Materials. +b. Use by Organization. If You are accessing the Materials on behalf of +Your organization, only You and those within Your organization may use +the Materials. Your organization must identify a contact person to TPC +and conduct communications with TPC through that contact person. + +4. 
Restrictions. The following restrictions apply to all use of the +Materials by You. +a. General: You may not: +(1) use, copy, print, modify, adapt, create derivative works from, +market, deliver, rent, lease, sublicense, make, have made, assign, +pledge, transfer, sell, offer to sell, import, reproduce, distribute, +publicly perform, publicly display or otherwise grant rights to the +Materials, or any copy thereof, in whole or in part, except as expressly +permitted under this Agreement; or +(2) use the Materials in any way that does not comply with all +applicable laws and regulations. +b. Modification: You may modify the Software. +c. Public Disclosure: You may not publicly disclose any performance +results produced while using the Software except in the following +circumstances: +(1) as part of a TPC Benchmark Result. For purposes of this Agreement, a +"TPC Benchmark Result" is a performance test submitted to the TPC, +documented by a Full Disclosure Report and Executive Summary, claiming +to meet the requirements of an official TPC Benchmark Standard. You +agree that TPC Benchmark Results may only be published in accordance +with the TPC Policies. viewable at http: //www.tpc.org +(2) as part of an academic or research effort that does not imply or +state a marketing position +(3) any other use of the Software, provided that any performance results +must be clearly identified as not being comparable to TPC Benchmark +Results unless specifically authorized by TPC. + +5. License Modification. Requests for modification of this license shall +be addressed to info@tpc.org. You may not remove or modify this license +without permission. + +6. Copyright. The Materials are owned by TPC and/or its licensors, and +are protected by United States copyright laws and international treaty +provisions. You may not remove the copyright notice from the original or +any copy of the Materials, and You must apply the notice if You extract +part of the Materials not bearing a notice. + +7. Use of Name. You acknowledge and agree that TPC owns all trademark +and trade name rights in the names, trademarks and logos used by TPC in +the Materials. User shall preserve any notices regarding such ownership. +User may only use such names, trademarks and logos in accordance with +the usage guidelines specified by the TPC Policies. + +8. Merger or Integration. Any portion of the Materials merged into or +integrated with other software or documentation will continue to be +subject to the terms and conditions of this Agreement. + +9. Limited Grants of Sublicense. You may distribute the Software as +provided or as modified as permitted under clause 4 b. of this +Agreement, provided You comply with all of the terms of this Agreement +and the following conditions: + +a. If You distribute any portion of the Software in its original form +You may do so only under this Agreement by including a complete copy of +this Agreement with Your distribution, and if You distribute the +Software in modified form, You may only do so under a license that at a +minimum provides all of the protections and conditions of use contained +within this Agreement; + +b. You must include on each copy of the Software that You distribute the +following legend in all caps, at the top of the label and license, and +in a font not less than 12 point and no less prominent than any other +printing: "THE TPC SOFTWARE IS AVAILABLE WITHOUT CHARGE FROM TPC."; + +c. 
You must retain all copyright, patent, trademark, and attribution +notices that are present in the Software; and + +d. You may not charge a fee for the distribution of this Software, +including any modifications permitted under clause 4.b. + +10. Term and Termination. +a. Term. The license granted to You is effective until terminated. +b. Termination. +(1) By You. You may terminate this Agreement at any time by returning +the Materials (including any portions or copies thereof) to TPC or +providing written notice to the TPC that all copies of the Materials +within Your custody or control have been deleted or destroyed. +(2) By TPC. In the event You materially fail to comply with any term or +condition of this Agreement, and You fail to remedy such non-compliance +within 30 days after the receipt of notice to that effect, then TPC +shall have the right to terminate this Agreement immediately upon +written notice at the end of such 30-day period. +c. Effect of Termination. Termination of this Agreement in accordance +with this clause 10 will not terminate the rights of end users +sublicensed by You pursuant to this Agreement. Moreover, upon +termination and at TPC's written request, You agree to either (1) return +the Materials (including any portions or copies thereof) to TPC or (2) +immediately destroy all copies of the Materials within Your custody or +control and inform the TPC of the destruction of the Materials. Upon +termination, TPC may also enforce any rights provided by law. The +provisions of this Agreement that protect the proprietary rights of TPC +and its Licensors will continue in force after termination. + +11. No Warranty; Materials Provided "As Is". TO THE MAXIMUM EXTENT +PERMITTED BY APPLICABLE LAW, THE MATERIALS ARE PROVIDED "AS IS" AND WITH +ALL FAULTS, AND TPC (AND ITS LICENSORS) AND THE AUTHORS AND DEVELOPERS +OF THE MATERIALS HEREBY DISCLAIM ALL WARRANTIES, REPRESENTATIONS AND +CONDITIONS, EITHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, BUT NOT +LIMITED TO, ANY IMPLIED WARRANTIES, DUTIES OR CONDITIONS RELATING TO +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, ACCURACY OR +COMPLETENESS OF RESPONSES, RESULTS, WORKMANLIKE EFFORT, LACK OF VIRUSES, +LACK OF NEGLIGENCE, TITLE, QUIET ENJOYMENT, QUIET POSSESSION, +CORRESPONDENCE TO DESCRIPTION OR NONINFRINGEMENT. USER RECOGNIZES THAT +THE MATERIALS ARE THE RESULT OF A COOPERATIVE, NON-PROFIT EFFORT AND +THAT TPC DOES NOT CONDUCT A TYPICAL BUSINESS. USER ACCEPTS THE MATERIALS +"AS IS" AND WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. + +Without limitation, TPC (and its licensors) do not warrant that the +functions contained in the Software or Materials will meet Your +requirements or that the operation of the Software will be +uninterrupted, error-free or free from malicious code. For purposes of +this paragraph, "malicious code" means any program code designed to +contaminate other computer programs or computer data, consume computer +resources, modify, destroy, record, or transmit data, or in some other +fashion usurp the normal operation of the computer, computer system, or +computer network, including viruses, Trojan horses, droppers, worms, +logic bombs, and the like. TPC (and its licensors) shall not be liable +for the accuracy of any information provided by TPC or third-party +technical support personnel, or any damages caused, either directly or +indirectly, by acts taken or omissions made by You as a result of such +technical support. 
+ +You assume full responsibility for the selection of the Materials to +achieve Your intended results, and for the installation, use and results +obtained from the Materials. You also assume the entire risk as it +applies to the quality and performance of the Materials. Should the +Materials prove defective, You (and not TPC) assume the entire liability +of any and all necessary servicing, repair or correction. Some +countries/states do not allow the exclusion of implied warranties, so +the above exclusion may not apply to You. TPC (and its licensors) +further disclaims all warranties of any kind if the Materials were +customized, repackaged or altered in any way by any party other than TPC +(or its licensors). + +12. Disclaimer of Liability. TPC (and its licensors) assumes no +liability with respect to the Materials, including liability for +infringement of intellectual property rights, negligence, or any other +liability. TPC is not aware of any infringement of copyright or patent +that may result from its grant of rights to User of the Materials. If +User receives any notice of infringement, such notice shall be +immediately communicated to TPC who will have sole discretion to take +action to evaluate the claim and, if practicable, modify the Materials +as necessary to avoid infringement. In the event that TPC determines +that the Materials cannot be modified to avoid such infringement (or any +other infringement claim communicated to TPC), TPC may terminate this +Agreement immediately. User shall suspend use of the Materials until +modifications to avoid claims of infringement have been completed. User +waives any claim against TPC in the event of such infringement claims by +others. + +13. Export Assurance. Use and re-export of the Materials and related +technical information is subject to the Export Administration +Regulations (EAR) of the United States Department of Commerce. User +hereby agrees that User (a) assumes responsibility for compliance with +the EAR in its use of the Materials and technical information, and (b) +will not export, re-export, or otherwise disclose directly or +indirectly, the Materials, technical data, or any direct product of the +Materials or technical data in violation of the EAR. + +14. Limitation of Remedies And Damages. IN NO EVENT WILL TPC OR ITS +LICENSORS OR LICENSEE BE LIABLE FOR ANY INDIRECT, INCIDENTAL, SPECIAL OR +CONSEQUENTIAL DAMAGES OR FOR ANY LOST PROFITS, LOST SAVINGS, LOST +REVENUES OR LOST DATA ARISING FROM OR RELATING TO THE MATERIALS OR THIS +AGREEMENT, EVEN IF TPC OR ITS LICENSORS OR LICENSEE HAVE BEEN ADVISED OF +THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL TPC'S OR ITS +LICENSORS' LIABILITY OR DAMAGES TO YOU OR ANY OTHER PERSON EVER EXCEED +U.S. ONE HUNDRED DOLLARS (US $100), REGARDLESS OF THE FORM OF THE CLAIM. +IN NO EVENT WILL LICENSEE'S LIABILITY OR DAMAGES TO TPC OR ANY OTHER +PERSON EVER EXCEED $1,000,000, REGARDLESS OF THE FORM OF THE CLAIM. Some +countries/states do not allow the limitation or exclusion of liability +for incidental or consequential damages, so the above limitation or +exclusion may not apply to You. + +15. U.S. Government Restricted Rights. All Software and related +documentation are provided with restricted rights. Use, duplication or +disclosure by the U.S. Government is subject to restrictions as set +forth in subdivision (b)(3)(ii) of the Rights in Technical Data and +Computer Software Clause at 252.227-7013. 
If You are using the Software +outside of the United States, You will comply with the applicable local +laws of Your country, U.S. export control law, and the English version +of this Agreement. + +16. Contractor/Manufacturer. The Contractor/Manufacturer for the +Software is: + +Transaction Processing Performance Council +572B Ruger Street, P.O. Box 29920 +San Francisco, CA 94129 + +17. General. This Agreement is binding on You as well as Your employees, +employers, contractors and agents, and on any successors and assignees. +This Agreement is governed by the laws of the State of California +(except to the extent federal law governs copyrights and trademarks) +without respect to any provisions of California law that would cause +application of the law of another state or country. The parties agree +that the United Nations Convention on Contracts for the International +Sale of Goods will not govern this Agreement. This Agreement is the +entire agreement between us regarding the subject matter hereof and +supersedes any other understandings or agreements with respect to the +Materials or the subject matter hereof. If any provision of this +Agreement is deemed invalid or unenforceable by any court having +jurisdiction, that particular provision will be deemed modified to the +extent necessary to make the provision valid and enforceable, and the +remaining provisions will remain in full force and effect. + +SPECIAL PROVISIONS APPLICABLE TO THE EUROPEAN UNION + +If You acquired the Materials in the European Union (EU), the following +provisions also apply to You. If there is any inconsistency between the +terms of the Software License Agreement set out earlier and the +following provisions, the following provisions shall take precedence. + +1. Distribution. You may sublicense modifications of the Software +covered in this Agreement if they meet the requirements of clause 9 +above. + +2. Limited Warranty. EXCEPT AS STATED EARLIER IN THIS AGREEMENT, AND AS +PROVIDED UNDER THE HEADING "STATUTORY RIGHTS", THE SOFTWARE IS PROVIDED +AS-IS WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED WARRANTIES, NONINFRINGEMENT, +OR CONDITIONS OF MERCHANTABILITY, QUALITY AND FITNESS FOR A PARTICULAR +PURPOSE. + +3. Limitation of Remedy and Damages. THE LIMITATIONS OF REMEDIES AND +DAMAGES IN THE SOFTWARE LICENSE AGREEMENT SHALL NOT APPLY TO PERSONAL +INJURY (INCLUDING DEATH) TO ANY PERSON CAUSED BY TPC'S NEGLIGENCE AND +ARE SUBJECT TO THE PROVISION SET OUT UNDER THE HEADING "STATUTORY +RIGHTS". + +4. Statutory Rights: Irish law provides that certain conditions and +warranties may be implied in contracts for the sale of goods and in +contracts for the supply of services. Such conditions and warranties are +hereby excluded, to the extent such exclusion, in the context of this +transaction, is lawful under Irish law. Conversely, such conditions and +warranties, insofar as they may not be lawfully excluded, shall apply. +Accordingly nothing in this Agreement shall prejudice any rights that +You may enjoy by virtue of Sections 12, 13, 14 or 15 of the Irish Sale +of Goods Act 1893 (as amended). + +5. General. This Agreement is governed by the laws of the Republic of +Ireland. The local language version of this agreement shall apply to +Materials acquired in the EU. 
This Agreement is the entire agreement +between us with respect to the subject matter hereof and You agree that +TPC will not have any liability for any untrue statement or +representation made by it, its agents or anyone else (whether innocently +or negligently) upon which You relied upon entering this Agreement, +unless such untrue statement or representation was made fraudulently. diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q1.sql b/third_party/bigframes_vendored/tpch/sql_queries/q1.sql new file mode 100644 index 0000000000..c359614583 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q1.sql @@ -0,0 +1,21 @@ +select + l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order +from + {line_item_ds} +where + l_shipdate <= '1998-09-02' +group by + l_returnflag, + l_linestatus +order by + l_returnflag, + l_linestatus diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q10.sql b/third_party/bigframes_vendored/tpch/sql_queries/q10.sql new file mode 100644 index 0000000000..c07aa0b4c9 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q10.sql @@ -0,0 +1,32 @@ +select + c_custkey, + c_name, + round(sum(l_extendedprice * (1 - l_discount)), 2) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +from + {customer_ds}, + {orders_ds}, + {line_item_ds}, + {nation_ds} +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1993-10-01' + and o_orderdate < date '1993-10-01' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc +limit 20 diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q11.sql b/third_party/bigframes_vendored/tpch/sql_queries/q11.sql new file mode 100644 index 0000000000..08c4560423 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q11.sql @@ -0,0 +1,27 @@ +select + ps_partkey, + round(sum(ps_supplycost * ps_availqty), 2) as value +from + {part_supp_ds}, + {supplier_ds}, + {nation_ds} +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001 + from + {part_supp_ds}, + {supplier_ds}, + {nation_ds} + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' + ) + order by + value desc diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q12.sql b/third_party/bigframes_vendored/tpch/sql_queries/q12.sql new file mode 100644 index 0000000000..cb97f1fb3c --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q12.sql @@ -0,0 +1,28 @@ +select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count +from + {orders_ds}, + {line_item_ds} +where + o_orderkey = l_orderkey + and l_shipmode in ('MAIL', 'SHIP') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1994-01-01' + and 
l_receiptdate < date '1994-01-01' + interval '1' year +group by + l_shipmode +order by + l_shipmode diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q13.sql b/third_party/bigframes_vendored/tpch/sql_queries/q13.sql new file mode 100644 index 0000000000..d1616f5360 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q13.sql @@ -0,0 +1,18 @@ +SELECT + c_count, COUNT(*) AS custdist +FROM ( + SELECT + c_custkey, + COUNT(o_orderkey) AS c_count + FROM + {customer_ds} LEFT OUTER JOIN {orders_ds} ON + c_custkey = o_custkey + AND o_comment NOT LIKE '%special%requests%' + GROUP BY + c_custkey +) AS c_orders +GROUP BY + c_count +ORDER BY + custdist DESC, + c_count DESC diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q14.sql b/third_party/bigframes_vendored/tpch/sql_queries/q14.sql new file mode 100644 index 0000000000..1620ab9762 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q14.sql @@ -0,0 +1,13 @@ +select + round(100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)), 2) as promo_revenue +from + {line_item_ds}, + {part_ds} +where + l_partkey = p_partkey + and l_shipdate >= date '1995-09-01' + and l_shipdate < date '1995-09-01' + interval '1' month diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q15.sql b/third_party/bigframes_vendored/tpch/sql_queries/q15.sql new file mode 100644 index 0000000000..cbf77827bc --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q15.sql @@ -0,0 +1,26 @@ +WITH revenue AS ( + SELECT + l_suppkey AS supplier_no, + SUM(l_extendedprice * (1 - l_discount)) AS total_revenue + FROM + {line_item_ds} + WHERE + l_shipdate >= DATE '1996-01-01' + AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' month + GROUP BY + l_suppkey +) +SELECT + s.s_suppkey, + s.s_name, + s.s_address, + s.s_phone, + r.total_revenue +FROM + {supplier_ds} s +JOIN + revenue r ON s.s_suppkey = r.supplier_no +WHERE + r.total_revenue = (SELECT MAX(total_revenue) FROM revenue) +ORDER BY + s.s_suppkey; diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q16.sql b/third_party/bigframes_vendored/tpch/sql_queries/q16.sql new file mode 100644 index 0000000000..193c8e462d --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q16.sql @@ -0,0 +1,30 @@ +select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + {part_supp_ds}, + {part_ds} +where + p_partkey = ps_partkey + and p_brand <> 'Brand#45' + and p_type not like 'MEDIUM POLISHED%' + and p_size in (49, 14, 23, 45, 19, 3, 36, 9) + and ps_suppkey not in ( + select + s_suppkey + from + {supplier_ds} + where + s_comment like '%Customer%Complaints%' + ) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q17.sql b/third_party/bigframes_vendored/tpch/sql_queries/q17.sql new file mode 100644 index 0000000000..390ecdd33b --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q17.sql @@ -0,0 +1,17 @@ +select + round(sum(l_extendedprice) / 7.0, 2) as avg_yearly +from + {line_item_ds}, + {part_ds} +where + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container = 'MED BOX' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + {line_item_ds} + where + l_partkey = p_partkey + ) diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q18.sql b/third_party/bigframes_vendored/tpch/sql_queries/q18.sql new 
file mode 100644 index 0000000000..4a98abafb9 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q18.sql @@ -0,0 +1,33 @@ +select + c_name, + c_custkey, + o_orderkey, + o_orderdate as o_orderdat, + o_totalprice, + sum(l_quantity) as col6 +from + {customer_ds}, + {orders_ds}, + {line_item_ds} +where + o_orderkey in ( + select + l_orderkey + from + {line_item_ds} + group by + l_orderkey having + sum(l_quantity) > 300 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate +limit 100 diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q19.sql b/third_party/bigframes_vendored/tpch/sql_queries/q19.sql new file mode 100644 index 0000000000..30b41ff3ff --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q19.sql @@ -0,0 +1,35 @@ +select + round(sum(l_extendedprice* (1 - l_discount)), 2) as revenue +from + {line_item_ds}, + {part_ds} +where + ( + p_partkey = l_partkey + and p_brand = 'Brand#12' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 1 and l_quantity <= 1 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 10 and l_quantity <= 20 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#34' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 20 and l_quantity <= 30 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q2.sql b/third_party/bigframes_vendored/tpch/sql_queries/q2.sql new file mode 100644 index 0000000000..082e1e7f53 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q2.sql @@ -0,0 +1,44 @@ +select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + {part_ds}, + {supplier_ds}, + {part_supp_ds}, + {nation_ds}, + {region_ds} +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 15 + and p_type like '%BRASS' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + and ps_supplycost = ( + select + min(ps_supplycost) + from + {part_supp_ds}, + {supplier_ds}, + {nation_ds}, + {region_ds} + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + ) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey +limit 100 diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q20.sql b/third_party/bigframes_vendored/tpch/sql_queries/q20.sql new file mode 100644 index 0000000000..03348e82b8 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q20.sql @@ -0,0 +1,37 @@ +select + s_name, + s_address +from + {supplier_ds}, + {nation_ds} +where + s_suppkey in ( + select + ps_suppkey + from + {part_supp_ds} + where + ps_partkey in ( + select + p_partkey + from + {part_ds} + where + p_name like 'forest%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + {line_item_ds} + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date '1994-01-01' + and 
l_shipdate < date '1994-01-01' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = 'CANADA' +order by + s_name diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q21.sql b/third_party/bigframes_vendored/tpch/sql_queries/q21.sql new file mode 100644 index 0000000000..444d127469 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q21.sql @@ -0,0 +1,40 @@ +select + s_name, + count(*) as numwait +from + {supplier_ds}, + {line_item_ds} l1, + {orders_ds}, + {nation_ds} +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + {line_item_ds} l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + {line_item_ds} l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' +group by + s_name +order by + numwait desc, + s_name +limit 100 diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q22.sql b/third_party/bigframes_vendored/tpch/sql_queries/q22.sql new file mode 100644 index 0000000000..a1e1b2a253 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q22.sql @@ -0,0 +1,36 @@ +select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from ( + select + SUBSTR(c_phone, 1, 2) AS cntrycode, + c_acctbal + from + {customer_ds} + where + SUBSTR(c_phone, 1, 2) in + ('13', '31', '23', '29', '30', '18', '17') + and c_acctbal > ( + select + avg(c_acctbal) + from + {customer_ds} + where + c_acctbal > 0.00 + and SUBSTR(c_phone, 1, 2) in + ('13', '31', '23', '29', '30', '18', '17') + ) + and not exists ( + select + * + from + {orders_ds} + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q3.sql b/third_party/bigframes_vendored/tpch/sql_queries/q3.sql new file mode 100644 index 0000000000..69a40b8ef7 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q3.sql @@ -0,0 +1,23 @@ +select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority +from + {customer_ds}, + {orders_ds}, + {line_item_ds} +where + c_mktsegment = 'BUILDING' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < '1995-03-15' + and l_shipdate > '1995-03-15' +group by + l_orderkey, + o_orderdate, + o_shippriority +order by + revenue desc, + o_orderdate +limit 10 diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q4.sql b/third_party/bigframes_vendored/tpch/sql_queries/q4.sql new file mode 100644 index 0000000000..57204e8d70 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q4.sql @@ -0,0 +1,21 @@ +select + o_orderpriority, + count(*) as order_count +from + {orders_ds} +where + o_orderdate >= date '1993-07-01' + and o_orderdate < date '1993-10-01' + and exists ( + select + * + from + {line_item_ds} + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) +group by + o_orderpriority +order by + o_orderpriority diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q5.sql b/third_party/bigframes_vendored/tpch/sql_queries/q5.sql new file mode 100644 index 0000000000..78dbb96ffa --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q5.sql @@ -0,0 +1,24 @@ +select + n_name, + sum(l_extendedprice * (1 - l_discount)) 
as revenue +from + {customer_ds}, + {orders_ds}, + {line_item_ds}, + {supplier_ds}, + {nation_ds}, + {region_ds} +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + and o_orderdate >= date '1994-01-01' + and o_orderdate < date '1995-01-01' +group by + n_name +order by + revenue desc diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q6.sql b/third_party/bigframes_vendored/tpch/sql_queries/q6.sql new file mode 100644 index 0000000000..0ea158332e --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q6.sql @@ -0,0 +1,9 @@ +select + sum(l_extendedprice * l_discount) as revenue +from + {line_item_ds} +where + l_shipdate >= date '1994-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year + and l_discount between .05 and .07 + and l_quantity < 24 diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q7.sql b/third_party/bigframes_vendored/tpch/sql_queries/q7.sql new file mode 100644 index 0000000000..002e89e4a0 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q7.sql @@ -0,0 +1,50 @@ +select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + EXTRACT( + YEAR + FROM + l_shipdate + ) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + {supplier_ds}, + {line_item_ds}, + {orders_ds}, + {customer_ds}, + {nation_ds} n1, + {nation_ds} n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + ( + n1.n_name = 'FRANCE' + and n2.n_name = 'GERMANY' + ) + or ( + n1.n_name = 'GERMANY' + and n2.n_name = 'FRANCE' + ) + ) + and l_shipdate between date '1995-01-01' + and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q8.sql b/third_party/bigframes_vendored/tpch/sql_queries/q8.sql new file mode 100644 index 0000000000..d4d1ddd275 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q8.sql @@ -0,0 +1,39 @@ +select + o_year, + round( + sum(case + when nation = 'BRAZIL' then volume + else 0 + end) / sum(volume) + , 2) as mkt_share +from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + {part_ds}, + {supplier_ds}, + {line_item_ds}, + {orders_ds}, + {customer_ds}, + {nation_ds} n1, + {nation_ds} n2, + {region_ds} + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'AMERICA' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = 'ECONOMY ANODIZED STEEL' + ) as all_nations +group by + o_year +order by + o_year diff --git a/third_party/bigframes_vendored/tpch/sql_queries/q9.sql b/third_party/bigframes_vendored/tpch/sql_queries/q9.sql new file mode 100644 index 0000000000..fcc3e19400 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/sql_queries/q9.sql @@ -0,0 +1,32 @@ +select + nation, + o_year, + round(sum(amount), 2) as sum_profit +from + ( + select + n_name as nation, + EXTRACT(YEAR FROM o_orderdate) as o_year, + l_extendedprice * (1 - 
l_discount) - ps_supplycost * l_quantity as amount + from + {part_ds}, + {supplier_ds}, + {line_item_ds}, + {part_supp_ds}, + {orders_ds}, + {nation_ds} + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%green%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc From aafb5be3e9c50f477fca2a1ebb5338194672913f Mon Sep 17 00:00:00 2001 From: Adam Dupaski Date: Tue, 4 Feb 2025 19:42:30 +0100 Subject: [PATCH 36/38] docs: add link to DataFrames intro to improve SEO (#1176) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tim Sweña (Swast) --- README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 70041c7c8e..185c50c14a 100644 --- a/README.rst +++ b/README.rst @@ -30,7 +30,8 @@ Documentation Getting started with BigQuery DataFrames ---------------------------------------- -Try the `BigQuery DataFrames quickstart `_ +Read `Introduction to BigQuery DataFrames `_ +and try the `BigQuery DataFrames quickstart `_ to get up and running in just a few minutes. From 5cd0cf257687f242c1895b26ca27ef5455ef7871 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Tue, 4 Feb 2025 13:04:14 -0800 Subject: [PATCH 37/38] chore: update q11 (#1359) --- third_party/bigframes_vendored/tpch/queries/q11.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/third_party/bigframes_vendored/tpch/queries/q11.py b/third_party/bigframes_vendored/tpch/queries/q11.py index e4b628e9e6..365aa12eb9 100644 --- a/third_party/bigframes_vendored/tpch/queries/q11.py +++ b/third_party/bigframes_vendored/tpch/queries/q11.py @@ -18,20 +18,20 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session): index_col=bigframes.enums.DefaultIndexKind.NULL, ) - merged_df = partsupp.merge(supplier, left_on="PS_SUPPKEY", right_on="S_SUPPKEY") - merged_df = merged_df.merge(nation, left_on="S_NATIONKEY", right_on="N_NATIONKEY") + nation = nation[nation["N_NAME"] == "GERMANY"] - filtered_df = merged_df[merged_df["N_NAME"] == "GERMANY"] + merged_df = nation.merge(supplier, left_on="N_NATIONKEY", right_on="S_NATIONKEY") + merged_df = merged_df.merge(partsupp, left_on="S_SUPPKEY", right_on="PS_SUPPKEY") - filtered_df["VALUE"] = filtered_df["PS_SUPPLYCOST"] * filtered_df["PS_AVAILQTY"] - grouped = filtered_df.groupby("PS_PARTKEY", as_index=False).agg( + merged_df["VALUE"] = merged_df["PS_SUPPLYCOST"] * merged_df["PS_AVAILQTY"] + grouped = merged_df.groupby("PS_PARTKEY", as_index=False).agg( VALUE=bpd.NamedAgg(column="VALUE", aggfunc="sum") ) grouped["VALUE"] = grouped["VALUE"].round(2) total_value = ( - (filtered_df["PS_SUPPLYCOST"] * filtered_df["PS_AVAILQTY"]).to_frame().sum() + (merged_df["PS_SUPPLYCOST"] * merged_df["PS_AVAILQTY"]).to_frame().sum() ) threshold = (total_value * 0.0001).rename("THRESHOLD") From 9a21f25fc8a86ed8c53f91dc726e6438f92e15b5 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 4 Feb 2025 13:12:01 -0800 Subject: [PATCH 38/38] chore(main): release 1.35.0 (#1331) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 35 +++++++++++++++++++++++ bigframes/version.py | 2 +- third_party/bigframes_vendored/version.py | 2 +- 3 files changed, 37 insertions(+), 
From 9a21f25fc8a86ed8c53f91dc726e6438f92e15b5 Mon Sep 17 00:00:00 2001
From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com>
Date: Tue, 4 Feb 2025 13:12:01 -0800
Subject: [PATCH 38/38] chore(main): release 1.35.0 (#1331)

Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com>
---
 CHANGELOG.md                              | 35 +++++++++++++++++++++++
 bigframes/version.py                      |  2 +-
 third_party/bigframes_vendored/version.py |  2 +-
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 886e4f8921..af87cae3b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,41 @@
 
 [1]: https://pypi.org/project/bigframes/#history
 
+## [1.35.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.34.0...v1.35.0) (2025-02-04)
+
+
+### Features
+
+* Add Series.keys() ([#1342](https://github.com/googleapis/python-bigquery-dataframes/issues/1342)) ([deb015d](https://github.com/googleapis/python-bigquery-dataframes/commit/deb015dc1276549519d51363501355272f8976d8))
+* Allow `case_when` to change dtypes if case list contains the condition `(True, some_default_value)` ([#1311](https://github.com/googleapis/python-bigquery-dataframes/issues/1311)) ([5c2a2c6](https://github.com/googleapis/python-bigquery-dataframes/commit/5c2a2c6086be20cba7da08ecd37899699aab518f))
+* Support python type as astype arg ([#1316](https://github.com/googleapis/python-bigquery-dataframes/issues/1316)) ([b26e135](https://github.com/googleapis/python-bigquery-dataframes/commit/b26e13570f198ec4d252590a8c07253624db667a))
+* Support time_series_id_col in ARIMAPlus ([#1282](https://github.com/googleapis/python-bigquery-dataframes/issues/1282)) ([97532c9](https://github.com/googleapis/python-bigquery-dataframes/commit/97532c9ba02cd709d69666dd0afca5c1df8b9faf))
+
+
+### Bug Fixes
+
+* Exclude `DataFrame` and `Series` `__call__` from unimplemented API metrics ([#1351](https://github.com/googleapis/python-bigquery-dataframes/issues/1351)) ([f2d5264](https://github.com/googleapis/python-bigquery-dataframes/commit/f2d526445da7dae29c49c8d6dacdfee7d2fa9d79))
+* Make `DataFrame` `__getattr__` and `__setattr__` more robust to subclassing ([#1352](https://github.com/googleapis/python-bigquery-dataframes/issues/1352)) ([417de3a](https://github.com/googleapis/python-bigquery-dataframes/commit/417de3a449e5d0748831b502f4f5b9fb9ba38714))
+
+
+### Performance Improvements
+
+* Fall back to ordering by bq pk when possible ([#1350](https://github.com/googleapis/python-bigquery-dataframes/issues/1350)) ([3c4abf2](https://github.com/googleapis/python-bigquery-dataframes/commit/3c4abf24ea186e98f629b6f83c0f3e36dc0571c6))
+* Improve isin performance ([#1203](https://github.com/googleapis/python-bigquery-dataframes/issues/1203)) ([db087b0](https://github.com/googleapis/python-bigquery-dataframes/commit/db087b0bfe4b3ba965682d620079c923e098e362))
+* Prevent inlining of remote ops ([#1347](https://github.com/googleapis/python-bigquery-dataframes/issues/1347)) ([012081a](https://github.com/googleapis/python-bigquery-dataframes/commit/012081af9ef825ced96ec1e772b9646cbe09d9a1))
+
+
+### Dependencies
+
+* Add support for Python 3.13 for everything but remote functions ([#1307](https://github.com/googleapis/python-bigquery-dataframes/issues/1307)) ([533db96](https://github.com/googleapis/python-bigquery-dataframes/commit/533db9685d159de2bc76307b0e0add676bd679a0))
+
+
+### Documentation
+
+* Add `GeoSeries` docs ([#1327](https://github.com/googleapis/python-bigquery-dataframes/issues/1327)) ([05f83d1](https://github.com/googleapis/python-bigquery-dataframes/commit/05f83d18d276091a1549dbba1f2baf8c91c8c37e))
+* Add link to DataFrames intro to improve SEO ([#1176](https://github.com/googleapis/python-bigquery-dataframes/issues/1176)) ([aafb5be](https://github.com/googleapis/python-bigquery-dataframes/commit/aafb5be3e9c50f477fca2a1ebb5338194672913f))
+* Add snippet to explain the univariate model's forecast result in the Forecast a single time series with a univariate model tutorial ([#1272](https://github.com/googleapis/python-bigquery-dataframes/issues/1272)) ([c22126b](https://github.com/googleapis/python-bigquery-dataframes/commit/c22126b846db428d21c0f5cbd2d439ecc56365b2))
+
 ## [1.34.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.33.0...v1.34.0) (2025-01-27)
diff --git a/bigframes/version.py b/bigframes/version.py
index 1fef294cef..d9b9875805 100644
--- a/bigframes/version.py
+++ b/bigframes/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.34.0"
+__version__ = "1.35.0"
diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py
index 1fef294cef..d9b9875805 100644
--- a/third_party/bigframes_vendored/version.py
+++ b/third_party/bigframes_vendored/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.34.0"
+__version__ = "1.35.0"
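Two of the 1.35.0 features recorded in the changelog above lend themselves to a quick illustration. A hedged sketch against the bigframes.pandas API (it assumes an authenticated session with a billing project configured; the semantics are inferred from the changelog entries and the pandas methods they mirror, not from the patches themselves):

```python
import bigframes.pandas as bpd

df = bpd.DataFrame({"a": [1, 2, 3]})

# Series.keys() (#1342): in pandas, keys() is an alias for the index,
# so the bigframes version is assumed to behave the same way.
index = df["a"].keys()

# Python type as astype arg (#1316): pass `float` directly instead of
# a dtype string such as "Float64".
as_float = df["a"].astype(float)
```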