deps: remove scikit-learn and sqlalchemy as required dependencies #1296


Merged: 25 commits, Feb 21, 2025

bigframes/ml/metrics/_metrics.py (7 changes: 3 additions & 4 deletions)
@@ -25,7 +25,6 @@
import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression
import numpy as np
import pandas as pd
import sklearn.metrics as sklearn_metrics # type: ignore

from bigframes.ml import utils
import bigframes.pandas as bpd
@@ -176,9 +175,9 @@ def auc(
) -> float:
x_series, y_series = utils.batch_convert_to_series(x, y)

# TODO(b/286410053) Support ML exceptions and error handling.
auc = sklearn_metrics.auc(x_series.to_pandas(), y_series.to_pandas())
return auc
x_pandas = x_series.to_pandas()
y_pandas = y_series.to_pandas()
return vendored_metrics_ranking.auc(x_pandas, y_pandas)


auc.__doc__ = inspect.getdoc(vendored_metrics_ranking.auc)
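
With this change, bigframes.ml.metrics.auc computes the area with the vendored trapezoidal implementation instead of scikit-learn, so the call works without the optional extra installed. A minimal usage sketch, assuming an active BigQuery DataFrames session; the data values are illustrative and not taken from this PR:

import pandas as pd

import bigframes.pandas as bpd
from bigframes.ml import metrics

# x must be monotonic (increasing or decreasing) for a trapezoidal AUC.
df = bpd.read_pandas(pd.DataFrame({"x": [0.0, 0.5, 1.0], "y": [0.0, 0.75, 1.0]}))
# No scikit-learn import is needed for this call anymore.
print(metrics.auc(df["x"], df["y"]))  # 0.625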

noxfile.py (20 changes: 14 additions & 6 deletions)
@@ -72,7 +72,9 @@
UNIT_TEST_LOCAL_DEPENDENCIES: List[str] = []
UNIT_TEST_DEPENDENCIES: List[str] = []
UNIT_TEST_EXTRAS: List[str] = []
UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {"3.12": ["polars"]}
UNIT_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {
"3.12": ["polars", "scikit-learn"],
}

# 3.10 is needed for Windows tests as it is the only version installed in the
# bigframes-windows container image. For more information, search
@@ -96,8 +98,13 @@
]
SYSTEM_TEST_LOCAL_DEPENDENCIES: List[str] = []
SYSTEM_TEST_DEPENDENCIES: List[str] = []
SYSTEM_TEST_EXTRAS: List[str] = ["tests"]
SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {}
SYSTEM_TEST_EXTRAS: List[str] = []
SYSTEM_TEST_EXTRAS_BY_PYTHON: Dict[str, List[str]] = {
"3.9": ["tests"],
"3.10": ["tests"],
"3.12": ["tests", "scikit-learn"],
"3.13": ["tests"],
}

LOGGING_NAME_ENV_VAR = "BIGFRAMES_PERFORMANCE_LOG_NAME"

@@ -468,8 +475,7 @@ def cover(session):
@nox.session(python=DEFAULT_PYTHON_VERSION)
def docs(session):
"""Build the docs for this library."""

session.install("-e", ".")
session.install("-e", ".[scikit-learn]")

Review comments on the line above:

Collaborator:
I'm very concerned if we need to install scikit-learn just to build the docs. Please remove.

Suggested change:
- session.install("-e", ".[scikit-learn]")
+ session.install("-e", ".")

Collaborator (author):
Not sure if it's possible to remove this, as we are calling publish_api_coverage.py, and one of the things it does is go through the sklearn APIs and generate a coverage report.

Collaborator:
Good point. Thanks for clarifying.

session.install(
# We need to pin to specific versions of the `sphinxcontrib-*` packages
# which still support sphinx 4.x.
@@ -510,7 +516,7 @@ def docs(session):
def docfx(session):
"""Build the docfx yaml files for this library."""

session.install("-e", ".")
session.install("-e", ".[scikit-learn]")

Review comments on the line above:

Collaborator:
I'm very concerned if we need to install scikit-learn just to build the docs. Please remove.

Suggested change:
- session.install("-e", ".[scikit-learn]")
+ session.install("-e", ".")

Collaborator (author):
Not sure if it's possible to remove this, as we are calling publish_api_coverage.py, and one of the things it does is go through the sklearn APIs and generate a coverage report.

session.install(
# We need to pin to specific versions of the `sphinxcontrib-*` packages
# which still support sphinx 4.x.
@@ -652,6 +658,8 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=(
if match.group(1) not in already_installed
]

print(already_installed)

# We use --no-deps to ensure that pre-release versions aren't overwritten
# by the version ranges in setup.py.
session.install(*deps)
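
As discussed in the review thread above, the docs and docfx sessions keep the scikit-learn extra because publish_api_coverage.py walks the sklearn API surface to build a coverage report. Purely as an illustration of that kind of introspection, and not the actual contents of scripts/publish_api_coverage.py, such a walk might look roughly like:

import inspect

import sklearn.linear_model  # available in the docs session via ".[scikit-learn]"

# Collect the public sklearn classes whose coverage would be reported on.
public_classes = [
    name
    for name, obj in inspect.getmembers(sklearn.linear_model, inspect.isclass)
    if not name.startswith("_")
]
print(f"{len(public_classes)} sklearn.linear_model classes to check for coverage")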

scripts/test_publish_api_coverage.py (2 changes: 2 additions & 0 deletions)
@@ -19,6 +19,8 @@

from . import publish_api_coverage

pytest.importorskip("sklearn")


@pytest.fixture
def api_coverage_df():
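
pytest.importorskip skips a test when the named module cannot be imported. Placed at module level, as here, it skips the whole file; the system and unit tests below call it inside individual test functions instead, so only the sklearn comparison tests are skipped when the extra is absent. A minimal standalone sketch of both placements (hypothetical test, not part of this PR):

import pytest

# Module level: the entire test file is skipped if sklearn is not installed.
sklearn = pytest.importorskip("sklearn")


def test_accuracy_matches_expected():
    # Function level: only this test is skipped if the submodule is missing.
    sklearn_metrics = pytest.importorskip("sklearn.metrics")
    assert sklearn_metrics.accuracy_score([1, 0], [1, 0]) == 1.0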

setup.py (11 changes: 8 additions & 3 deletions)
@@ -55,8 +55,6 @@
"pyarrow >=10.0.1",
"pydata-google-auth >=1.8.2",
"requests >=2.27.1",
"scikit-learn >=1.2.2",
"sqlalchemy >=1.4,<3.0dev",
"sqlglot >=23.6.3",
"tabulate >=0.9",
"ipywidgets >=7.7.1",
@@ -77,8 +75,15 @@
"tests": [],
# used for local engine, which is only needed for unit tests at present.
"polars": ["polars >= 1.7.0"],
"scikit-learn": ["scikit-learn>=1.2.2"],
# Packages required for basic development flow.
"dev": ["pytest", "pytest-mock", "pre-commit", "nox", "google-cloud-testutils"],
"dev": [
"pytest",
"pytest-mock",
"pre-commit",
"nox",
"google-cloud-testutils",
],
}
extras["all"] = list(sorted(frozenset(itertools.chain.from_iterable(extras.values()))))

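
With scikit-learn and sqlalchemy out of install_requires, scikit-learn becomes an opt-in extra (pip install "bigframes[scikit-learn]", or everything via "bigframes[all]"), and code that wants it has to tolerate its absence. A minimal sketch of the guarded-import pattern this implies; illustrative only, not code from this PR:

# Works whether or not the scikit-learn extra is installed.
try:
    import sklearn.metrics as sklearn_metrics  # type: ignore
except ImportError:
    sklearn_metrics = None

if sklearn_metrics is not None:
    print(sklearn_metrics.auc([0.0, 1.0], [0.0, 1.0]))  # 0.5
else:
    print("scikit-learn not installed; skipping the comparison")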

testing/constraints-3.9.txt (1 change: 0 additions & 1 deletion)
@@ -20,7 +20,6 @@ pyarrow==10.0.1
pydata-google-auth==1.8.2
requests==2.27.1
scikit-learn==1.2.2
sqlalchemy==1.4
sqlglot==23.6.3
tabulate==0.9
ipywidgets==7.7.1

tests/system/small/ml/test_metrics.py (14 changes: 13 additions & 1 deletion)
@@ -17,7 +17,6 @@
import numpy as np
import pandas as pd
import pytest
import sklearn.metrics as sklearn_metrics # type: ignore

import bigframes
from bigframes.ml import metrics
@@ -66,6 +65,7 @@ def test_r2_score_force_finite(session):


def test_r2_score_ok_fit_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]})

df = session.read_pandas(pd_df)
@@ -113,6 +113,7 @@ def test_accuracy_score_not_normailze(session):


def test_accuracy_score_fit_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame({"y_true": [1, 2, 3, 4, 5], "y_pred": [2, 3, 4, 3, 6]})

df = session.read_pandas(pd_df)
@@ -203,6 +204,7 @@ def test_roc_curve_binary_classification_prediction_returns_expected(session):


def test_roc_curve_binary_classification_prediction_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1],
@@ -294,6 +296,7 @@ def test_roc_curve_binary_classification_decision_returns_expected(session):


def test_roc_curve_binary_classification_decision_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
# Instead of operating on probabilities, assume a 70% decision threshold
# has been applied, and operate on the final output
y_score = [0.1, 0.4, 0.35, 0.8, 0.65, 0.9, 0.5, 0.3, 0.6, 0.45]
@@ -420,6 +423,7 @@ def test_roc_auc_score_returns_expected(session):


def test_roc_auc_score_returns_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [0, 0, 1, 1, 0, 1, 0, 1, 1, 1],
@@ -525,6 +529,7 @@ def test_confusion_matrix_column_index(session):


def test_confusion_matrix_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [2, 3, 3, 3, 4, 1],
@@ -543,6 +548,7 @@ def test_confusion_matrix_matches_sklearn(session):


def test_confusion_matrix_str_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"],
@@ -603,6 +609,7 @@ def test_recall_score(session):


def test_recall_score_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [2, 0, 2, 2, 0, 1],
@@ -620,6 +627,7 @@ def test_recall_score_matches_sklearn(session):


def test_recall_score_str_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"],
@@ -673,6 +681,7 @@ def test_precision_score(session):


def test_precision_score_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [2, 0, 2, 2, 0, 1],
@@ -695,6 +704,7 @@ def test_precision_score_matches_sklearn(session):


def test_precision_score_str_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"],
@@ -752,6 +762,7 @@ def test_f1_score(session):


def test_f1_score_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": [2, 0, 2, 2, 0, 1],
@@ -769,6 +780,7 @@ def test_f1_score_matches_sklearn(session):


def test_f1_score_str_matches_sklearn(session):
sklearn_metrics = pytest.importorskip("sklearn.metrics")
pd_df = pd.DataFrame(
{
"y_true": ["cat", "ant", "cat", "cat", "ant", "bird"],

tests/system/small/test_series.py (2 changes: 2 additions & 0 deletions)
@@ -641,6 +641,8 @@ def test_series_replace_dict(scalars_dfs, replacement_dict):
),
)
def test_series_interpolate(method):
pytest.importorskip("scipy")

values = [None, 1, 2, None, None, 16, None]
index = [-3.2, 11.4, 3.56, 4, 4.32, 5.55, 76.8]
pd_series = pd.Series(values, index)

tests/unit/ml/test_api_primitives.py (5 changes: 2 additions & 3 deletions)
@@ -13,8 +13,6 @@
# limitations under the License.

import pytest
import sklearn.decomposition as sklearn_decomposition # type: ignore
import sklearn.linear_model as sklearn_linear_model # type: ignore

import bigframes.ml.decomposition
import bigframes.ml.linear_model
@@ -35,8 +33,9 @@ def test_base_estimator_repr():
assert pca_estimator.__repr__() == "PCA(n_components=7)"


@pytest.mark.skipif(sklearn_linear_model is None, reason="requires sklearn")
def test_base_estimator_repr_matches_sklearn():
sklearn_decomposition = pytest.importorskip("sklearn.decomposition")
sklearn_linear_model = pytest.importorskip("sklearn.linear_model")
estimator = bigframes.ml.linear_model.LinearRegression()
sklearn_estimator = sklearn_linear_model.LinearRegression()
assert estimator.__repr__() == sklearn_estimator.__repr__()

tests/unit/ml/test_compose.py (4 changes: 2 additions & 2 deletions)
@@ -15,8 +15,6 @@

from google.cloud import bigquery
import pytest
import sklearn.compose as sklearn_compose # type: ignore
import sklearn.preprocessing as sklearn_preprocessing # type: ignore

from bigframes.ml import compose, preprocessing
from bigframes.ml.compose import ColumnTransformer, SQLScalarColumnTransformer
@@ -119,6 +117,8 @@ def test_columntransformer_repr():


def test_columntransformer_repr_matches_sklearn():
sklearn_compose = pytest.importorskip("sklearn.compose")
sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing")
bf_column_transformer = compose.ColumnTransformer(
[
(

tests/unit/ml/test_pipeline.py (9 changes: 4 additions & 5 deletions)
@@ -13,10 +13,6 @@
# limitations under the License.

import pytest
import sklearn.compose as sklearn_compose # type: ignore
import sklearn.linear_model as sklearn_linear_model # type: ignore
import sklearn.pipeline as sklearn_pipeline # type: ignore
import sklearn.preprocessing as sklearn_preprocessing # type: ignore

from bigframes.ml import compose, forecasting, linear_model, pipeline, preprocessing

@@ -57,8 +53,11 @@ def test_pipeline_repr():
)


@pytest.mark.skipif(sklearn_pipeline is None, reason="requires sklearn")
def test_pipeline_repr_matches_sklearn():
sklearn_compose = pytest.importorskip("sklearn.compose")
sklearn_linear_model = pytest.importorskip("sklearn.linear_model")
sklearn_pipeline = pytest.importorskip("sklearn.pipeline")
sklearn_preprocessing = pytest.importorskip("sklearn.preprocessing")
bf_pl = pipeline.Pipeline(
[
(
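
Dropping the module-level sklearn imports also removes the old @pytest.mark.skipif(sklearn_pipeline is None, ...) decorators: once the import is gone, the name the decorator referenced would raise a NameError at collection time, and while the import was present the condition could never be true, so the skip never fired. Calling pytest.importorskip inside the test body replaces both. A minimal sketch with a hypothetical test name:

import pytest


def test_pipeline_repr_matches_sklearn_sketch():
    # Skips, rather than errors, when the optional dependency is absent.
    sklearn_pipeline = pytest.importorskip("sklearn.pipeline")
    assert hasattr(sklearn_pipeline, "Pipeline")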

third_party/bigframes_vendored/sklearn/metrics/_ranking.py (20 changes: 19 additions & 1 deletion)
@@ -16,6 +16,8 @@
# Michal Karbownik <michakarbownik@gmail.com>
# License: BSD 3 clause

import numpy as np

from bigframes import constants


@@ -60,7 +62,23 @@ def auc(x, y) -> float:
Returns:
float: Area Under the Curve.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
if len(x) < 2:
raise ValueError(
f"At least 2 points are needed to compute area under curve, but x.shape = {len(x)}"
)

if x.is_monotonic_decreasing:
d = -1
elif x.is_monotonic_increasing:
d = 1
else:
raise ValueError(f"x is neither increasing nor decreasing : {x}.")

if hasattr(np, "trapezoid"):
# new in numpy 2.0
return d * np.trapezoid(y, x)
# np.trapz has been deprecated in 2.0
return d * np.trapz(y, x) # type: ignore


def roc_auc_score(y_true, y_score) -> float:
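
The vendored auc above implements the trapezoidal rule directly, preferring np.trapezoid on NumPy 2.0+ and falling back to the deprecated np.trapz on older releases. A small standalone check of the behavior the new code describes, with illustrative values:

import pandas as pd

import bigframes_vendored.sklearn.metrics._ranking as vendored_metrics_ranking

x = pd.Series([0.0, 0.5, 1.0])   # must be monotonic, per the ValueError above
y = pd.Series([0.0, 0.75, 1.0])
print(vendored_metrics_ranking.auc(x, y))              # 0.625
print(vendored_metrics_ranking.auc(x[::-1], y[::-1]))  # 0.625 as well; the d = -1 branch flips the sign back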