deps: remove scikit-learn and sqlalchemy as required dependencies #1296


Merged: 25 commits, Feb 21, 2025

bigframes/ml/metrics/_metrics.py (7 changes: 3 additions & 4 deletions)
@@ -25,7 +25,6 @@
import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression
import numpy as np
import pandas as pd
import sklearn.metrics as sklearn_metrics # type: ignore

from bigframes.ml import utils
import bigframes.pandas as bpd
@@ -176,9 +175,9 @@ def auc(
) -> float:
x_series, y_series = utils.batch_convert_to_series(x, y)

# TODO(b/286410053) Support ML exceptions and error handling.
auc = sklearn_metrics.auc(x_series.to_pandas(), y_series.to_pandas())
return auc
x_pandas = x_series.to_pandas()
y_pandas = y_series.to_pandas()
return vendored_metrics_ranking.auc(x_pandas, y_pandas)


auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc)
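
With this change, bigframes.ml.metrics.auc computes the area with the vendored trapezoidal implementation instead of scikit-learn, so the call works without the optional extra installed. A minimal usage sketch, assuming an active BigQuery DataFrames session; the data values are illustrative and not taken from this PR:

import pandas as pd

import bigframes.pandas as bpd
from bigframes.ml import metrics

# x must be monotonic (increasing or decreasing) for a trapezoidal AUC.
df = bpd.read_pandas(pd.DataFrame({"x": [0.0, 0.5, 1.0], "y": [0.0, 0.75, 1.0]}))
# No scikit-learn import is needed for this call anymore.
print(metrics.auc(df["x"], df["y"]))  # 0.625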

noxfile.py (20 changes: 14 additions & 6 deletions)
@@ -72,7 +72,9 @@
UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = []
UNIT_TEST_DEPENDENCIES: List[str] = []
UNIT_TEST_EXTRAS: List[str] = []
UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {"3.12": ["polars"]}
UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {
"3.12": ["polars", "scikit-learn"],
}

# 3.10 is needed for Windows tests as it is the only version installed in the
# bigframes-windows container image. For more information, search
@@ -96,8 +98,13 @@
]
SYSTEM_TEST_LOCAL_DEPENDENCIES: List[str] = []
SYSTEM_TEST_DEPENDENCIES: List[str] = []
SYSTEM_TEST_EXTRAS: List[str] = ["tests"]
SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {}
SYSTEM_TEST_EXTRAS: List[str] = []
SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {
"3.9": ["tests"],
"3.10": ["tests"],
"3.12": ["tests", "scikit-learn"],
"3.13": ["tests"],
}

LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME"

@@ -468,8 +475,7 @@ def cover(session):
@nox.session(python=DEFAULT_PYTHON_VERSION)
def docs(session):
"""Build the docs for this library."""

session.install("-e", ".")
session.install("-e", ".[scikit-learn]")

Review comments on the line above:

Collaborator:
I'm very concerned if we need to install scikit-learn just to build the docs. Please remove.

Suggested change:
- session.install("-e", ".[scikit-learn]")
+ session.install("-e", ".")

Collaborator (author):
Not sure if it's possible to remove this, as we are calling publish_api_coverage.py, and one of the things it does is go through the sklearn APIs and generate a coverage report.

Collaborator:
Good point. Thanks for clarifying.

session.install(
# We need to pin to specific versions of the `sphinxcontrib-*` packages
# which still support sphinx 4.x.
@@ -510,7 +516,7 @@ def docs(session):
def docfx(session):
"""Build the docfx yaml files for this library."""

session.install("-e", ".")
session.install("-e", ".[scikit-learn]")

Review comments on the line above:

Collaborator:
I'm very concerned if we need to install scikit-learn just to build the docs. Please remove.

Suggested change:
- session.install("-e", ".[scikit-learn]")
+ session.install("-e", ".")

Collaborator (author):
Not sure if it's possible to remove this, as we are calling publish_api_coverage.py, and one of the things it does is go through the sklearn APIs and generate a coverage report.

session.install(
# We need to pin to specific versions of the `sphinxcontrib-*` packages
# which still support sphinx 4.x.
@@ -652,6 +658,8 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=(
if match.group(1) not in already_installed
]

print(already_installed)

# We use --no-deps to ensure that pre-release versions aren't overwritten
# by the version ranges in setup.py.
session.install(*deps)
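
As discussed in the review thread above, the docs and docfx sessions keep the scikit-learn extra because publish_api_coverage.py walks the sklearn API surface to build a coverage report. Purely as an illustration of that kind of introspection, and not the actual contents of scripts/publish_api_coverage.py, such a walk might look roughly like:

import inspect

import sklearn.linear_model  # available in the docs session via ".[scikit-learn]"

# Collect the public sklearn classes whose coverage would be reported on.
public_classes = [
    name
    for name, obj in inspect.getmembers(sklearn.linear_model, inspect.isclass)
    if not name.startswith("_")
]
print(f"{len(public_classes)} sklearn.linear_model classes to check for coverage")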

scripts/test_publish_api_coverage.py (2 changes: 2 additions & 0 deletions)
@@ -19,6 +19,8 @@

from . import publish_api_coverage

pytest.importorskip("sklearn")


@pytest.fixture
def api_coverage_df():
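
pytest.importorskip skips a test when the named module cannot be imported. Placed at module level, as here, it skips the whole file; the system and unit tests below call it inside individual test functions instead, so only the sklearn comparison tests are skipped when the extra is absent. A minimal standalone sketch of both placements (hypothetical test, not part of this PR):

import pytest

# Module level: the entire test file is skipped if sklearn is not installed.
sklearn = pytest.importorskip("sklearn")


def test_accuracy_matches_expected():
    # Function level: only this test is skipped if the submodule is missing.
    sklearn_metrics = pytest.importorskip("sklearn.metrics")
    assert sklearn_metrics.accuracy_score([1, 0], [1, 0]) == 1.0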

setup.py (11 changes: 8 additions & 3 deletions)
@@ -55,8 +55,6 @@
"pyarrow >=10.0.1",
"pydata-google-auth >=1.8.2",
"requests >=2.27.1",
"scikit-learn >=1.2.2",
"sqlalchemy >=1.4,<3.0dev",
"sqlglot >=23.6.3",
"tabulate >=0.9",
"ipywidgets >=7.7.1",
@@ -77,8 +75,15 @@
"tests": [],
# used for local engine, which is only needed for unit tests at present.
"polars": ["polars >= 1.7.0"],
"scikit-learn": ["scikit-learn>=1.2.2"],
# Packages required for basic development flow.
"dev": ["pytest", "pytest-mock", "pre-commit", "nox", "google-cloud-testutils"],
"dev": [
"pytest",
"pytest-mock",
"pre-commit",
"nox",
"google-cloud-testutils",
],
}
extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values()))))

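
With scikit-learn and sqlalchemy out of install_requires, scikit-learn becomes an opt-in extra (pip install "bigframes[scikit-learn]", or everything via "bigframes[all]"), and code that wants it has to tolerate its absence. A minimal sketch of the guarded-import pattern this implies; illustrative only, not code from this PR:

# Works whether or not the scikit-learn extra is installed.
try:
    import sklearn.metrics as sklearn_metrics  # type: ignore
except ImportError:
    sklearn_metrics = None

if sklearn_metrics is not None:
    print(sklearn_metrics.auc([0.0, 1.0], [0.0, 1.0]))  # 0.5
else:
    print("scikit-learn not installed; skipping the comparison")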

testing/constraints-3.9.txt (1 change: 0 additions & 1 deletion)
@@ -20,7 +20,6 @@ pyarrow==10.0.1
pydata-google-auth==1.8.2
requests==2.27.1
scikit-learn==1.2.2
sqlalchemy==1.4
sqlglot==23.6.3
tabulate==0.9
ipywidgets==7.7.1

tests/system/small/ml/test_metrics.py (14 changes: 13 additions & 1 deletion)
@@ -17,7 +17,6 @@
import numpy as np
import pandas as pd
import pytest
import sklearn.metrics as sklearn_metrics # type: ignore

import bigframes
from bigframes.ml import metrics
@@ -66,6 +65,7 @@ def test_r2_score_force_finite(session):


def test_r2_score_ok_fit_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]})

df = session.read_pandas(pd_df)
@@ -113,6 +113,7 @@ def test_accuracy_score_not_normailze(session):


def test_accuracy_score_fit_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]})

df = session.read_pandas(pd_df)
@@ -203,6 +204,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session):


def test_roc_curve_binary_classification_prediction_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1],
@@ -294,6 +296,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session):


def test_roc_curve_binary_classification_decision_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
# Instead of operating on probabilities, assume a 70% decision threshold
# has been applied, and operate on the final output
y_score = [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45]
@@ -420,6 +423,7 @@ def test_roc_auc_score_returns_expected(session):


def test_roc_auc_score_returns_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1],
@@ -525,6 +529,7 @@ def test_confusion_matrix_column_index(session):


def test_confusion_matrix_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [2, 3, 3, 3, 4, 1],
@@ -543,6 +548,7 @@ def test_confusion_matrix_matches_sklearn(session):


def test_confusion_matrix_str_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"],
@@ -603,6 +609,7 @@ def test_recall_score(session):


def test_recall_score_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [2, 0, 2, 2, 0, 1],
@@ -620,6 +627,7 @@ def test_recall_score_matches_sklearn(session):


def test_recall_score_str_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"],
@@ -673,6 +681,7 @@ def test_precision_score(session):


def test_precision_score_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [2, 0, 2, 2, 0, 1],
@@ -695,6 +704,7 @@ def test_precision_score_matches_sklearn(session):


def test_precision_score_str_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"],
@@ -752,6 +762,7 @@ def test_f1_score(session):


def test_f1_score_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [2, 0, 2, 2, 0, 1],
@@ -769,6 +780,7 @@ def test_f1_score_matches_sklearn(session):


def test_f1_score_str_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"],

tests/system/small/test_series.py (2 changes: 2 additions & 0 deletions)
@@ -641,6 +641,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict):
),
)
def test_series_interpolate(method):
pytest.importorskip("scipy")

values = [None, 1, 2, None, None, 16, None]
index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8]
pd_series = pd.Series(values, index)

tests/unit/ml/test_api_primitives.py (5 changes: 2 additions & 3 deletions)
@@ -13,8 +13,6 @@
# limitations under the License.

import pytest
import sklearn.decomposition as sklearn_decomposition # type: ignore
import sklearn.linear_model as sklearn_linear_model # type: ignore

import bigframes.ml.decomposition
import bigframes.ml.linear_model
@@ -35,8 +33,9 @@ def test_base_estimator_repr():
assert pca_estimator.__repr__() == "PCA(n_components=7)"


@pytest.mark.skipif(sklearn_linear_model is None, reason="requires sklearn")
def test_base_estimator_repr_matches_sklearn():
sklearn_decomposition = pytest.importorskip("sklearn.decomposition")
sklearn_linear_model = pytest.importorskip("sklearn.linear_model")
estimator = bigframes.ml.linear_model.LinearRegression()
sklearn_estimator = sklearn_linear_model.LinearRegression()
assert estimator.__repr__() == sklearn_estimator.__repr__()

tests/unit/ml/test_compose.py (4 changes: 2 additions & 2 deletions)
@@ -15,8 +15,6 @@

from google.cloud import bigquery
import pytest
import sklearn.compose as sklearn_compose # type: ignore
import sklearn.preprocessing as sklearn_preprocessing # type: ignore

from bigframes.ml import compose, preprocessing
from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer
@@ -119,6 +117,8 @@ def test_columntransformer_repr():


def test_columntransformer_repr_matches_sklearn():
sklearn_compose = pytest.importorskip("sklearn.compose")
sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing")
bf_column_transformer = compose.ColumnTransformer(
[
(

tests/unit/ml/test_pipeline.py (9 changes: 4 additions & 5 deletions)
@@ -13,10 +13,6 @@
# limitations under the License.

import pytest
import sklearn.compose as sklearn_compose # type: ignore
import sklearn.linear_model as sklearn_linear_model # type: ignore
import sklearn.pipeline as sklearn_pipeline # type: ignore
import sklearn.preprocessing as sklearn_preprocessing # type: ignore

from bigframes.ml import compose, forecasting, linear_model, pipeline, preprocessing

@@ -57,8 +53,11 @@ def test_pipeline_repr():
)


@pytest.mark.skipif(sklearn_pipeline is None, reason="requires sklearn")
def test_pipeline_repr_matches_sklearn():
sklearn_compose = pytest.importorskip("sklearn.compose")
sklearn_linear_model = pytest.importorskip("sklearn.linear_model")
sklearn_pipeline = pytest.importorskip("sklearn.pipeline")
sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing")
bf_pl = pipeline.Pipeline(
[
(
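
Dropping the module-level sklearn imports also removes the old @pytest.mark.skipif(sklearn_pipeline is None, ...) decorators: once the import is gone, the name the decorator referenced would raise a NameError at collection time, and while the import was present the condition could never be true, so the skip never fired. Calling pytest.importorskip inside the test body replaces both. A minimal sketch with a hypothetical test name:

import pytest


def test_pipeline_repr_matches_sklearn_sketch():
    # Skips, rather than errors, when the optional dependency is absent.
    sklearn_pipeline = pytest.importorskip("sklearn.pipeline")
    assert hasattr(sklearn_pipeline, "Pipeline")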

third_party/bigframes_vendored/sklearn/metrics/_ranking.py (20 changes: 19 additions & 1 deletion)
@@ -16,6 +16,8 @@
# Michal Karbownik <michakarbownik@gmail.com>
# License: BSD 3 clause

import numpy as np

from bigframes import constants


@@ -60,7 +62,23 @@ def auc(x, y) -> float:
Returns:
float: Area Under the Curve.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
if len(x) < 2:
raise ValueError(
f"At least 2 points are needed to compute area under curve, but x.shape = {len(x)}"
)

if x.is_monotonic_decreasing:
d = -1
elif x.is_monotonic_increasing:
d = 1
else:
raise ValueError(f"x is neither increasing nor decreasing : {x}.")

if hasattr(np, "trapezoid"):
# new in numpy 2.0
return d * np.trapezoid(y, x)
# np.trapz has been deprecated in 2.0
return d * np.trapz(y, x) # type: ignore


def roc_auc_score(y_true, y_score) -> float:
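
The vendored auc above implements the trapezoidal rule directly, preferring np.trapezoid on NumPy 2.0+ and falling back to the deprecated np.trapz on older releases. A small standalone check of the behavior the new code describes, with illustrative values:

import pandas as pd

import bigframes_vendored.sklearn.metrics._ranking as vendored_metrics_ranking

x = pd.Series([0.0, 0.5, 1.0])   # must be monotonic, per the ValueError above
y = pd.Series([0.0, 0.75, 1.0])
print(vendored_metrics_ranking.auc(x, y))              # 0.625
print(vendored_metrics_ranking.auc(x[::-1], y[::-1]))  # 0.625 as well; the d = -1 branch flips the sign back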