From 16b9bcfa65056d1da18481962249d7048b276b9b Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Tue, 24 Oct 2023 00:14:53 +0000 Subject: [PATCH 1/7] docs: link to ML.EVALUATE BQML page for score() methods --- bigframes/ml/ensemble.py | 10 ++++++++++ bigframes/ml/forecasting.py | 5 +++++ third_party/bigframes_vendored/sklearn/base.py | 12 +++++++++++- .../bigframes_vendored/sklearn/cluster/_kmeans.py | 8 ++++++-- .../bigframes_vendored/sklearn/decomposition/_pca.py | 7 ++++++- 5 files changed, 38 insertions(+), 4 deletions(-) diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 113ad872b5..764f00ed12 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -507,6 +507,11 @@ def score( ): """Calculate evaluation metrics of the model. + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluating model metrics. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation data. @@ -676,6 +681,11 @@ def score( ): """Calculate evaluation metrics of the model. + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluating model metrics. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame as evaluation data. diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 8a6de1dd81..b88518a843 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -112,6 +112,11 @@ def score( ) -> bpd.DataFrame: """Calculate evaluation metrics of the model. + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluating model metrics. 
+ Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): A BigQuery DataFrame only contains 1 column as diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index 42868ce51f..4d039be60d 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -85,6 +85,11 @@ def score(self, X, y): which is a harsh metric since you require for each sample that each label set be correctly predicted. + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluating model metrics. + Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): DataFrame of shape (n_samples, n_features). Test samples. @@ -105,7 +110,12 @@ class RegressorMixin: _estimator_type = "regressor" def score(self, X, y): - """Return the evaluation metrics of the model. + """Calculate evaluation metrics of the model. + + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluating model metrics. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index ece62dc147..7b22bb4560 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -12,7 +12,6 @@ # License: BSD 3 clause from abc import ABC -from typing import List, Optional from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -83,7 +82,12 @@ def score( X, y=None, ): - """Metrics of the model. + """Calculate evaluation metrics of the model. + + .. 
note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluating model metrics. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 97fee5a501..0326a10c2d 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -55,7 +55,12 @@ def fit(self, X, y=None): raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def score(self, X=None, y=None): - """Return the metrics of the model. + """Calculate evaluation metrics of the model. + + .. note:: + + We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) + for evaluating model metrics. Args: X (default None): From 10c30cb47fa28970ff80e6c4e5a0ba3c4ba10d16 Mon Sep 17 00:00:00 2001 From: Henry Solberg Date: Wed, 25 Oct 2023 15:52:16 -0700 Subject: [PATCH 2/7] test: allow for alternative PCA solutions in tests (#143) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- tests/system/large/ml/test_decomposition.py | 4 ++-- tests/system/large/ml/test_pipeline.py | 14 ++++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/system/large/ml/test_decomposition.py b/tests/system/large/ml/test_decomposition.py index 460f07b816..a7049d4c18 100644 --- a/tests/system/large/ml/test_decomposition.py +++ b/tests/system/large/ml/test_decomposition.py @@ -67,8 +67,8 @@ def test_decomposition_configure_fit_score_predict( index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) pd.testing.assert_frame_equal( - result.sort_index(), - expected, + abs(result.sort_index()), # results may differ by a minus sign + abs(expected), check_exact=False, rtol=0.1, ) diff --git a/tests/system/large/ml/test_pipeline.py b/tests/system/large/ml/test_pipeline.py index 9294740dd6..6874a9f301 100644 --- a/tests/system/large/ml/test_pipeline.py +++ b/tests/system/large/ml/test_pipeline.py @@ -431,10 +431,16 @@ def test_pipeline_PCA_fit_score_predict(session, penguins_df_default_index): index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) pd.testing.assert_frame_equal( - predictions[ - ["principal_component_1", "principal_component_2", "principal_component_3"] - ], - expected, + abs( # results may differ by a minus sign + predictions[ + [ + "principal_component_1", + "principal_component_2", + "principal_component_3", + ] + ] + ), + abs(expected), check_exact=False, rtol=0.1, ) From 5ab92f054d9dd370ecc95f44a8685ffa61b0a798 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 26 Oct 2023 00:16:15 +0000 Subject: [PATCH 3/7] ci: Disable presubmit LLM tests temporarily (#144) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- tests/system/small/ml/test_llm.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index b7257dde1b..a801c36c83 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -26,6 +26,9 @@ def test_create_text_generator_model(palm2_text_generator_model): assert palm2_text_generator_model._bqml_model is not None +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_session(bq_connection, llm_text_pandas_df): import bigframes.pandas as bpd @@ -48,6 +51,9 @@ def test_create_text_generator_model_default_session(bq_connection, llm_text_pan assert all(series.str.len() > 20) +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_connection(llm_text_pandas_df): from bigframes import _config @@ -74,6 +80,9 @@ def test_create_text_generator_model_default_connection(llm_text_pandas_df): # Marked as flaky only because BQML LLM is in preview, the service only has limited capacity, not stable enough. 
+@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_default_params_success( palm2_text_generator_model, llm_text_df @@ -85,6 +94,9 @@ def test_text_generator_predict_default_params_success( assert all(series.str.len() > 20) +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_series_default_params_success( palm2_text_generator_model, llm_text_df @@ -96,6 +108,9 @@ def test_text_generator_predict_series_default_params_success( assert all(series.str.len() > 20) +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_arbitrary_col_label_success( palm2_text_generator_model, llm_text_df @@ -108,6 +123,9 @@ def test_text_generator_predict_arbitrary_col_label_success( assert all(series.str.len() > 20) +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_with_params_success( palm2_text_generator_model, llm_text_df @@ -139,6 +157,9 @@ def test_create_text_embedding_generator_model_defaults(bq_connection): assert model._bqml_model is not None +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." 
+) @pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df @@ -152,6 +173,9 @@ def test_embedding_generator_predict_success( assert value.size == 768 +@pytest.mark.skip( + reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." +) @pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_series_success( palm2_embedding_generator_model, llm_text_df From b66d1a1be33ff8c32f9b11ef87a0aaf2809a889f Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Wed, 25 Oct 2023 19:39:22 -0700 Subject: [PATCH 4/7] chore(main): release 0.11.0 (#126) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 20 ++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d9f63d4c6..93ebadb56f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,26 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.11.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.10.0...v0.11.0) (2023-10-26) + + +### Features + +* Add back `reset_session` as an alias for `close_session` ([#124](https://github.com/googleapis/python-bigquery-dataframes/issues/124)) ([694a85a](https://github.com/googleapis/python-bigquery-dataframes/commit/694a85a0ef90d838700014a204d72b23362db1d8)) +* Change `query` parameter to `query_or_table` in `read_gbq` ([#127](https://github.com/googleapis/python-bigquery-dataframes/issues/127)) ([f9bb3c4](https://github.com/googleapis/python-bigquery-dataframes/commit/f9bb3c4bc88c5ba2be6f17e12a0ec4f482ce161f)) + + +### Bug Fixes + +* Expose `bigframes.pandas.reset_session` as a public API ([#128](https://github.com/googleapis/python-bigquery-dataframes/issues/128)) 
([b17e1f4](https://github.com/googleapis/python-bigquery-dataframes/commit/b17e1f43cd0f7567bc5b59b0e916cd20528312b3)) +* Use series's own session in series.reindex listlike case ([#135](https://github.com/googleapis/python-bigquery-dataframes/issues/135)) ([95bff3f](https://github.com/googleapis/python-bigquery-dataframes/commit/95bff3f1902bc09dc3310798a42df8ffd31ed8ee)) + + +### Documentation + +* Add runnable code samples for DataFrames I/O methods and property ([#129](https://github.com/googleapis/python-bigquery-dataframes/issues/129)) ([6fea8ef](https://github.com/googleapis/python-bigquery-dataframes/commit/6fea8efac35871985677ebeb948a576e64a1ffa4)) +* Add runnable code samples for reading methods ([#125](https://github.com/googleapis/python-bigquery-dataframes/issues/125)) ([a669919](https://github.com/googleapis/python-bigquery-dataframes/commit/a669919ff25b56156bd70ccd816a0bf19adb48aa)) + ## [0.10.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.9.0...v0.10.0) (2023-10-19) diff --git a/bigframes/version.py b/bigframes/version.py index 7a37ebd220..18edfa5615 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.10.0" +__version__ = "0.11.0" From e80abdb70b5d154648cba7eeb38393958ab533ad Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 26 Oct 2023 05:12:13 +0000 Subject: [PATCH 5/7] Revert "ci: Disable presubmit LLM tests temporarily (#144)" (#148) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 1641aff37d601b47e0bc4f25ff148be4f718bd1a, which was merged due to automerge label while still being discussed. Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- tests/system/small/ml/test_llm.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index a801c36c83..b7257dde1b 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -26,9 +26,6 @@ def test_create_text_generator_model(palm2_text_generator_model): assert palm2_text_generator_model._bqml_model is not None -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_session(bq_connection, llm_text_pandas_df): import bigframes.pandas as bpd @@ -51,9 +48,6 @@ def test_create_text_generator_model_default_session(bq_connection, llm_text_pan assert all(series.str.len() > 20) -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_create_text_generator_model_default_connection(llm_text_pandas_df): from bigframes import _config @@ -80,9 +74,6 @@ def test_create_text_generator_model_default_connection(llm_text_pandas_df): # Marked as flaky only because BQML LLM is in preview, the service only has limited capacity, not stable enough. 
-@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_default_params_success( palm2_text_generator_model, llm_text_df @@ -94,9 +85,6 @@ def test_text_generator_predict_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_series_default_params_success( palm2_text_generator_model, llm_text_df @@ -108,9 +96,6 @@ def test_text_generator_predict_series_default_params_success( assert all(series.str.len() > 20) -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_arbitrary_col_label_success( palm2_text_generator_model, llm_text_df @@ -123,9 +108,6 @@ def test_text_generator_predict_arbitrary_col_label_success( assert all(series.str.len() > 20) -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_with_params_success( palm2_text_generator_model, llm_text_df @@ -157,9 +139,6 @@ def test_create_text_embedding_generator_model_defaults(bq_connection): assert model._bqml_model is not None -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." 
-) @pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df @@ -173,9 +152,6 @@ def test_embedding_generator_predict_success( assert value.size == 768 -@pytest.mark.skip( - reason="Temporarily disable to validate the hypothesis that LLM capacity is causing the presubmit tests to take long to run." -) @pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_series_success( palm2_embedding_generator_model, llm_text_df From ef41bc7e5d774a4231fadbaae5831aea82fa9f01 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 26 Oct 2023 11:48:14 -0500 Subject: [PATCH 6/7] refactor: make `to_pandas()` call `to_arrow()` and use local dtypes in DataFrame construction (#132) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Towards internal issue 280662868 🦕 --- bigframes/core/blocks.py | 41 +--- bigframes/core/indexes/index.py | 3 +- bigframes/dtypes.py | 6 + bigframes/session/__init__.py | 10 +- bigframes/session/_io/pandas.py | 77 +++++++ tests/system/small/test_dataframe.py | 10 - tests/system/small/test_series.py | 48 ++++- tests/unit/session/test_io_pandas.py | 296 +++++++++++++++++++++++++++ tests/unit/test_dtypes.py | 57 +++--- 9 files changed, 457 insertions(+), 91 deletions(-) create mode 100644 bigframes/session/_io/pandas.py create mode 100644 tests/unit/session/test_io_pandas.py diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 046d2b3a44..eab4645477 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -28,11 +28,8 @@ from typing import Iterable, List, Optional, Sequence, Tuple import warnings -import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery -import numpy import pandas as pd -import pyarrow as pa # type: ignore import bigframes.constants as constants import bigframes.core as core @@ -46,6 +43,7 @@ import bigframes.dtypes import bigframes.operations as ops import 
bigframes.operations.aggregations as agg_ops +import bigframes.session._io.pandas import third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common # Type constraint for wherever column labels are used @@ -372,34 +370,11 @@ def reorder_levels(self, ids: typing.Sequence[str]): level_names = [self.col_id_to_index_name[index_id] for index_id in ids] return Block(self.expr, ids, self.column_labels, level_names) - @classmethod - def _to_dataframe( - cls, result, schema: typing.Mapping[str, bigframes.dtypes.Dtype] - ) -> pd.DataFrame: + def _to_dataframe(self, result) -> pd.DataFrame: """Convert BigQuery data to pandas DataFrame with specific dtypes.""" - dtypes = bigframes.dtypes.to_pandas_dtypes_overrides(result.schema) - df = result.to_dataframe( - dtypes=dtypes, - bool_dtype=pd.BooleanDtype(), - int_dtype=pd.Int64Dtype(), - float_dtype=pd.Float64Dtype(), - string_dtype=pd.StringDtype(storage="pyarrow"), - date_dtype=pd.ArrowDtype(pa.date32()), - datetime_dtype=pd.ArrowDtype(pa.timestamp("us")), - time_dtype=pd.ArrowDtype(pa.time64("us")), - timestamp_dtype=pd.ArrowDtype(pa.timestamp("us", tz="UTC")), - ) - - # Convert Geography column from StringDType to GeometryDtype. - for column_name, dtype in schema.items(): - if dtype == gpd.array.GeometryDtype(): - df[column_name] = gpd.GeoSeries.from_wkt( - # https://github.com/geopandas/geopandas/issues/1879 - df[column_name].replace({numpy.nan: None}), - # BigQuery geography type is based on the WGS84 reference ellipsoid. 
- crs="EPSG:4326", - ) - return df + dtypes = dict(zip(self.index_columns, self.index_dtypes)) + dtypes.update(zip(self.value_columns, self.dtypes)) + return self._expr._session._rows_to_dataframe(result, dtypes) def to_pandas( self, @@ -480,8 +455,7 @@ def _compute_and_count( if sampling_method == _HEAD: total_rows = int(results_iterator.total_rows * fraction) results_iterator.max_results = total_rows - schema = dict(zip(self.value_columns, self.dtypes)) - df = self._to_dataframe(results_iterator, schema) + df = self._to_dataframe(results_iterator) if self.index_columns: df.set_index(list(self.index_columns), inplace=True) @@ -510,8 +484,7 @@ def _compute_and_count( ) else: total_rows = results_iterator.total_rows - schema = dict(zip(self.value_columns, self.dtypes)) - df = self._to_dataframe(results_iterator, schema) + df = self._to_dataframe(results_iterator) if self.index_columns: df.set_index(list(self.index_columns), inplace=True) diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 677bb8529c..b9ffdff21e 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -399,9 +399,10 @@ def to_pandas(self) -> pandas.Index: """Executes deferred operations and downloads the results.""" # Project down to only the index column. So the query can be cached to visualize other data. 
index_columns = list(self._block.index_columns) + dtypes = dict(zip(index_columns, self.dtypes)) expr = self._expr.select_columns(index_columns) results, _ = expr.start_query() - df = expr._session._rows_to_dataframe(results) + df = expr._session._rows_to_dataframe(results, dtypes) df = df.set_index(index_columns) index = df.index index.names = list(self._block._index_labels) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index da221a95ac..079f0cc27a 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -169,6 +169,10 @@ def ibis_dtype_to_bigframes_dtype( if isinstance(ibis_dtype, ibis_dtypes.Struct): return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) + # BigQuery only supports integers of size 64 bits. + if isinstance(ibis_dtype, ibis_dtypes.Integer): + return pd.Int64Dtype() + if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] elif isinstance(ibis_dtype, ibis_dtypes.Null): @@ -372,6 +376,8 @@ def cast_ibis_value( ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64), ibis_dtypes.string: (ibis_dtypes.int64, ibis_dtypes.float64), ibis_dtypes.date: (), + ibis_dtypes.Decimal(precision=38, scale=9): (ibis_dtypes.float64,), + ibis_dtypes.Decimal(precision=76, scale=38): (ibis_dtypes.float64,), ibis_dtypes.time: (), ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),), ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,), diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index db9c5a353c..af1f70d54d 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1515,14 +1515,10 @@ def _get_table_size(self, destination_table): return table.num_bytes def _rows_to_dataframe( - self, row_iterator: bigquery.table.RowIterator + self, row_iterator: bigquery.table.RowIterator, dtypes: Dict ) -> pandas.DataFrame: - return row_iterator.to_dataframe( - bool_dtype=pandas.BooleanDtype(), - int_dtype=pandas.Int64Dtype(), - float_dtype=pandas.Float64Dtype(), - 
string_dtype=pandas.StringDtype(storage="pyarrow"), - ) + arrow_table = row_iterator.to_arrow() + return bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) def _start_generic_job(self, job: formatting_helpers.GenericJob): if bigframes.options.display.progress_bar is not None: diff --git a/bigframes/session/_io/pandas.py b/bigframes/session/_io/pandas.py new file mode 100644 index 0000000000..163127b546 --- /dev/null +++ b/bigframes/session/_io/pandas.py @@ -0,0 +1,77 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, Union + +import geopandas # type: ignore +import pandas +import pandas.arrays +import pyarrow # type: ignore +import pyarrow.compute # type: ignore + +import bigframes.constants + + +def arrow_to_pandas( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict +): + if len(dtypes) != arrow_table.num_columns: + raise ValueError( + f"Number of types {len(dtypes)} doesn't match number of columns " + f"{arrow_table.num_columns}. {bigframes.constants.FEEDBACK_LINK}" + ) + + serieses = {} + for field, column in zip(arrow_table.schema, arrow_table): + dtype = dtypes[field.name] + + if dtype == geopandas.array.GeometryDtype(): + series = geopandas.GeoSeries.from_wkt( + column, + # BigQuery geography type is based on the WGS84 reference ellipsoid. + crs="EPSG:4326", + ) + elif dtype == pandas.Float64Dtype(): + # Preserve NA/NaN distinction. 
Note: This is currently needed, even if we use + # nullable Float64Dtype in the types_mapper. See: + # https://github.com/pandas-dev/pandas/issues/55668 + # Regarding type: ignore, this class has been public at this + # location since pandas 1.2.0. See: + # https://pandas.pydata.org/docs/dev/reference/api/pandas.arrays.FloatingArray.html + pd_array = pandas.arrays.FloatingArray( # type: ignore + column.to_numpy(), + pyarrow.compute.is_null(column).to_numpy(), + ) + series = pandas.Series(pd_array, dtype=dtype) + elif dtype == pandas.Int64Dtype(): + # Avoid out-of-bounds errors in Pandas 1.5.x, which incorrectly + # casts to float64 in an intermediate step. + pd_array = pandas.arrays.IntegerArray( + pyarrow.compute.fill_null(column, 0).to_numpy(), + pyarrow.compute.is_null(column).to_numpy(), + ) + series = pandas.Series(pd_array, dtype=dtype) + elif isinstance(dtype, pandas.ArrowDtype): + # Avoid conversion logic if we are backing the pandas Series by the + # arrow array. + series = pandas.Series( + pandas.arrays.ArrowExtensionArray(column), # type: ignore + dtype=dtype, + ) + else: + series = column.to_pandas(types_mapper=lambda _: dtype) + + serieses[field.name] = series + + return pandas.DataFrame(serieses) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 19e50eb06d..84e8def83b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -2046,16 +2046,6 @@ def test__dir__with_rename(scalars_dfs): def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index.iloc[start:stop:step] - - # Pandas may assign non-object dtype to empty series and series index - # dtypes of empty columns are a known area of divergence from pandas - for column in pd_result.columns: - if ( - pd_result[column].empty and column != "geography_col" - ): # for empty geography_col, bigframes 
assigns non-object dtype - pd_result[column] = pd_result[column].astype("object") - pd_result.index = pd_result.index.astype("object") - pd.testing.assert_frame_equal( bf_result, pd_result, diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index bd9edbb1ca..c9510290b6 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -575,7 +575,15 @@ def test_series_int_int_operators_series(scalars_dfs, operator): ) def test_mods(scalars_dfs, col_x, col_y, method): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = getattr(scalars_df[col_x], method)(scalars_df[col_y]).to_pandas() + x_bf = scalars_df[col_x] + y_bf = scalars_df[col_y] + bf_series = getattr(x_bf, method)(y_bf) + # BigQuery's mod functions return [BIG]NUMERIC values unless both arguments are integers. + # https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#mod + if x_bf.dtype == pd.Int64Dtype() and y_bf.dtype == pd.Int64Dtype(): + bf_result = bf_series.to_pandas() + else: + bf_result = bf_series.astype("Float64").to_pandas() pd_result = getattr(scalars_pandas_df[col_x], method)(scalars_pandas_df[col_y]) pd.testing.assert_series_equal(pd_result, bf_result) @@ -620,8 +628,20 @@ def test_divmods_series(scalars_dfs, col_x, col_y, method): pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)( scalars_pandas_df[col_y] ) - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. 
+ if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) @pytest.mark.parametrize( @@ -649,8 +669,20 @@ def test_divmods_scalars(scalars_dfs, col_x, other, method): scalars_df, scalars_pandas_df = scalars_dfs bf_div_result, bf_mod_result = getattr(scalars_df[col_x], method)(other) pd_div_result, pd_mod_result = getattr(scalars_pandas_df[col_x], method)(other) - pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) - pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. + if bf_div_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_div_result, bf_div_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_div_result, bf_div_result.astype("Float64").to_pandas() + ) + + if bf_mod_result.dtype == pd.Int64Dtype(): + pd.testing.assert_series_equal(pd_mod_result, bf_mod_result.to_pandas()) + else: + pd.testing.assert_series_equal( + pd_mod_result, bf_mod_result.astype("Float64").to_pandas() + ) @pytest.mark.parametrize( @@ -1941,12 +1973,6 @@ def test_iloc_nested(scalars_df_index, scalars_pandas_df_index): def test_series_iloc(scalars_df_index, scalars_pandas_df_index, start, stop, step): bf_result = scalars_df_index["string_col"].iloc[start:stop:step].to_pandas() pd_result = scalars_pandas_df_index["string_col"].iloc[start:stop:step] - - # Pandas may assign non-object dtype to empty series and series index - if pd_result.empty: - pd_result = pd_result.astype("object") - pd_result.index = pd_result.index.astype("object") - 
pd.testing.assert_series_equal( bf_result, pd_result, diff --git a/tests/unit/session/test_io_pandas.py b/tests/unit/session/test_io_pandas.py new file mode 100644 index 0000000000..8b95977ec3 --- /dev/null +++ b/tests/unit/session/test_io_pandas.py @@ -0,0 +1,296 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +from typing import Dict, Union + +import geopandas # type: ignore +import numpy +import pandas +import pandas.arrays +import pandas.testing +import pyarrow # type: ignore +import pytest + +import bigframes.session._io.pandas + + +@pytest.mark.parametrize( + ("arrow_table", "dtypes", "expected"), + ( + pytest.param( + pyarrow.Table.from_pydict({}), + {}, + pandas.DataFrame(), + id="empty-df", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": pyarrow.array([None, None, None], type=pyarrow.bool_()), + "float": pyarrow.array([None, None, None], type=pyarrow.float64()), + "int": pyarrow.array([None, None, None], type=pyarrow.int64()), + "string": pyarrow.array([None, None, None], type=pyarrow.string()), + "time": pyarrow.array( + [None, None, None], type=pyarrow.time64("us") + ), + } + ), + { + "bool": "boolean", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + "time": pandas.ArrowDtype(pyarrow.time64("us")), + }, + pandas.DataFrame( + { + "bool": pandas.Series([None, None, None], dtype="boolean"), + "float": pandas.Series( + pandas.arrays.FloatingArray( 
# type: ignore + numpy.array( + [float("nan"), float("nan"), float("nan")], + dtype="float64", + ), + numpy.array([True, True, True], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [None, None, None], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + [None, None, None], dtype="string[pyarrow]" + ), + "time": pandas.Series( + [ + None, + None, + None, + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + } + ), + id="nulls-df", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "date": pyarrow.array( + [ + datetime.date(2023, 8, 29), + None, + datetime.date(2024, 4, 9), + datetime.date(1, 1, 1), + ], + type=pyarrow.date32(), + ), + "datetime": pyarrow.array( + [ + datetime.datetime(2023, 8, 29), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + ], + type=pyarrow.timestamp("us"), + ), + "string": ["123", None, "abc", "xyz"], + "time": pyarrow.array( + [ + datetime.time(0, 0, 0, 1), + datetime.time(12, 0, 0), + None, + datetime.time(23, 59, 59, 999999), + ], + type=pyarrow.time64("us"), + ), + "timestamp": pyarrow.array( + [ + datetime.datetime(2023, 8, 29), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + ], + type=pyarrow.timestamp("us", datetime.timezone.utc), + ), + } + ), + { + "date": pandas.ArrowDtype(pyarrow.date32()), + "datetime": pandas.ArrowDtype(pyarrow.timestamp("us")), + "string": "string[pyarrow]", + "time": pandas.ArrowDtype(pyarrow.time64("us")), + "timestamp": pandas.ArrowDtype( + pyarrow.timestamp("us", datetime.timezone.utc) + ), + }, + pandas.DataFrame( + { + "date": pandas.Series( + [ + datetime.date(2023, 8, 29), + None, + datetime.date(2024, 4, 9), + datetime.date(1, 1, 1), + ], + dtype=pandas.ArrowDtype(pyarrow.date32()), + ), + "datetime": pandas.Series( + [ + datetime.datetime(2023, 8, 29), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + ], 
+ dtype=pandas.ArrowDtype(pyarrow.timestamp("us")), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + "time": pandas.Series( + [ + datetime.time(0, 0, 0, 1), + datetime.time(12, 0, 0), + None, + datetime.time(23, 59, 59, 999999), + ], + dtype=pandas.ArrowDtype(pyarrow.time64("us")), + ), + "timestamp": pandas.Series( + [ + datetime.datetime(2023, 8, 29), + datetime.datetime(1, 1, 1, 0, 0, 0, 1), + None, + datetime.datetime(2024, 4, 9, 23, 59, 59), + ], + dtype=pandas.ArrowDtype( + pyarrow.timestamp("us", datetime.timezone.utc) + ), + ), + } + ), + id="arrow-dtypes", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "bool": [True, None, True, False], + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pyarrow.array( + [1.0, None, float("nan"), -1.0], + type=pyarrow.float64(), + ), + "int": pyarrow.array( + [1, None, -1, 2**63 - 1], + type=pyarrow.int64(), + ), + "string": ["123", None, "abc", "xyz"], + } + ), + { + "bool": "boolean", + "bytes": "object", + "float": pandas.Float64Dtype(), + "int": pandas.Int64Dtype(), + "string": "string[pyarrow]", + }, + pandas.DataFrame( + { + "bool": pandas.Series([True, None, True, False], dtype="boolean"), + "bytes": [b"123", None, b"abc", b"xyz"], + "float": pandas.Series( + pandas.arrays.FloatingArray( # type: ignore + numpy.array( + [1.0, float("nan"), float("nan"), -1.0], dtype="float64" + ), + numpy.array([False, True, False, False], dtype="bool"), + ), + dtype=pandas.Float64Dtype(), + ), + "int": pandas.Series( + [1, None, -1, 2**63 - 1], + dtype=pandas.Int64Dtype(), + ), + "string": pandas.Series( + ["123", None, "abc", "xyz"], dtype="string[pyarrow]" + ), + } + ), + id="scalar-dtypes", + ), + pytest.param( + pyarrow.Table.from_pydict( + { + "geocol": [ + "POINT(32 210)", + None, + "LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)", + ] + } + ), + {"geocol": geopandas.array.GeometryDtype()}, + pandas.DataFrame( + { + "geocol": geopandas.GeoSeries.from_wkt( + ["POINT(32 210)", None, 
"LINESTRING(1 1, 2 1, 3.1 2.88, 3 -3)"], + crs="EPSG:4326", + ), + } + ), + id="geography-dtype", + ), + ), +) +def test_arrow_to_pandas( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], + dtypes: Dict, + expected: pandas.DataFrame, +): + actual = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) + pandas.testing.assert_series_equal(actual.dtypes, expected.dtypes) + + # assert_frame_equal is converting to numpy internally, which causes some + # loss of precision with the extreme values in this test. + for column in actual.columns: + assert tuple( + (index, value) if (value is pandas.NA or value == value) else (index, "nan") + for index, value in actual[column].items() + ) == tuple( + (index, value) if (value is pandas.NA or value == value) else (index, "nan") + for index, value in expected[column].items() + ) + + +@pytest.mark.parametrize( + ("arrow_table", "dtypes"), + ( + pytest.param( + pyarrow.Table.from_pydict({"col1": [1], "col2": [2]}), + {"col1": "Int64"}, + id="too-few-dtypes", + ), + pytest.param( + pyarrow.RecordBatch.from_pydict({"col1": [1]}), + {"col1": "Int64", "col2": "string[pyarrow]"}, + id="too-many-dtypes", + ), + ), +) +def test_arrow_to_pandas_wrong_size_dtypes( + arrow_table: Union[pyarrow.Table, pyarrow.RecordBatch], dtypes: Dict +): + with pytest.raises(ValueError, match=f"Number of types {len(dtypes)}"): + bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 3baff2e1f5..6ceaaf911b 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -29,41 +29,42 @@ # TODO(bmil): Add ARRAY, INTERVAL, STRUCT to cover all the standard # BigQuery data types as they appear in Ibis: # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types - (ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), np.dtype("O")), - (ibis_dtypes.boolean, pd.BooleanDtype()), - (ibis_dtypes.binary, np.dtype("O")), - 
(ibis_dtypes.date, pd.ArrowDtype(pa.date32())), - (ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us"))), - (ibis_dtypes.float64, pd.Float64Dtype()), - ( + pytest.param( + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), + np.dtype("O"), + id="bignumeric", + ), + pytest.param(ibis_dtypes.boolean, pd.BooleanDtype(), id="bool"), + pytest.param(ibis_dtypes.binary, np.dtype("O"), id="bytes"), + pytest.param(ibis_dtypes.date, pd.ArrowDtype(pa.date32()), id="date"), + pytest.param( + ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us")), id="datetime" + ), + pytest.param(ibis_dtypes.float64, pd.Float64Dtype(), id="float"), + pytest.param( ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True), gpd.array.GeometryDtype(), + id="geography", ), - (ibis_dtypes.int64, pd.Int64Dtype()), - (ibis_dtypes.json, np.dtype("O")), - (ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), np.dtype("O")), - (ibis_dtypes.string, pd.StringDtype(storage="pyarrow")), - (ibis_dtypes.time, pd.ArrowDtype(pa.time64("us"))), - ( + pytest.param(ibis_dtypes.int8, pd.Int64Dtype(), id="int8-as-int64"), + pytest.param(ibis_dtypes.int64, pd.Int64Dtype(), id="int64"), + # TODO(tswast): custom dtype (or at least string dtype) for JSON objects + pytest.param(ibis_dtypes.json, np.dtype("O"), id="json"), + pytest.param( + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), + np.dtype("O"), + id="numeric", + ), + pytest.param( + ibis_dtypes.string, pd.StringDtype(storage="pyarrow"), id="string" + ), + pytest.param(ibis_dtypes.time, pd.ArrowDtype(pa.time64("us")), id="time"), + pytest.param( ibis_dtypes.Timestamp(timezone="UTC"), pd.ArrowDtype(pa.timestamp("us", tz="UTC")), # type: ignore + id="timestamp", ), ], - ids=[ - "bignumeric", - "bool", - "bytes", - "date", - "datetime", - "float", - "geography", - "int64", - "json", - "numeric", - "string", - "time", - "timestamp", - ], ) def test_ibis_dtype_converts(ibis_dtype, bigframes_dtype): """Test all the Ibis 
data types needed to read BigQuery tables""" From ef97071ae1b0115edcbbb915ac6840ae5bd00db3 Mon Sep 17 00:00:00 2001 From: Ashley Xu Date: Thu, 26 Oct 2023 17:08:51 +0000 Subject: [PATCH 7/7] fix: address comment --- bigframes/ml/ensemble.py | 10 ++++++---- bigframes/ml/forecasting.py | 5 +++-- third_party/bigframes_vendored/sklearn/base.py | 10 ++++++---- .../bigframes_vendored/sklearn/cluster/_kmeans.py | 5 +++-- .../bigframes_vendored/sklearn/decomposition/_pca.py | 5 +++-- 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 764f00ed12..19ca8608ff 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -509,8 +509,9 @@ def score( .. note:: - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. + Output matches that of the BigQuery ML.EVALUATE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#regression_models + for the outputs relevant to this model type. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): @@ -683,8 +684,9 @@ def score( .. note:: - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. + Output matches that of the BigQuery ML.EVALUATE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#classification_models + for the outputs relevant to this model type. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index b88518a843..8e309d5e73 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -114,8 +114,9 @@ def score( ..
note:: - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. + Output matches that of the BigQuery ML.EVALUATE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#time_series_models + for the outputs relevant to this model type. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index 4d039be60d..768328e552 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -87,8 +87,9 @@ def score(self, X, y): .. note:: - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. + Output matches that of the BigQuery ML.EVALUATE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#classification_models + for the outputs relevant to this model type. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): @@ -114,8 +115,9 @@ def score(self, X, y): .. note:: - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. + Output matches that of the BigQuery ML.EVALUATE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#regression_models + for the outputs relevant to this model type.
Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index 7b22bb4560..5369d3662d 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -86,8 +86,9 @@ def score( .. note:: - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. + Output matches that of the BigQuery ML.EVALUATE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#k-means_models + for the outputs relevant to this model type. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 0326a10c2d..011ecc06dd 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -59,8 +59,9 @@ def score(self, X=None, y=None): .. note:: - We're using BigQuery ML.EVALUATE function (https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate) - for evaluateing model metrics. + Output matches that of the BigQuery ML.EVALUATE function. + See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#pca_models + for the outputs relevant to this model type. Args: X (default None):