diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index b480847ed1f45..9622e85c39718 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -2401,18 +2401,40 @@ def _weights(dist):
     "nn_model",
     [
         neighbors.KNeighborsClassifier(n_neighbors=10),
-        neighbors.RadiusNeighborsClassifier(radius=5.0),
+        neighbors.RadiusNeighborsClassifier(),
     ],
 )
-def test_neighbor_classifiers_loocv(nn_model):
-    """Check that `predict` and related functions work fine with X=None"""
-    X, y = datasets.make_blobs(n_samples=500, centers=5, n_features=2, random_state=0)
+@pytest.mark.parametrize("algorithm", ALGORITHMS)
+def test_neighbor_classifiers_loocv(nn_model, algorithm):
+    """Check that `predict` and related functions work fine with X=None
+
+    Calling predict with X=None computes a prediction for each training point
+    from the labels of its neighbors (without the label of the data point
+    being predicted upon). This is therefore mathematically equivalent to
+    leave-one-out cross-validation without having to do any retraining
+    (rebuilding a KD-tree or Ball-tree index) or any data reshuffling.
+    """
+    X, y = datasets.make_blobs(n_samples=15, centers=5, n_features=2, random_state=0)
+
+    nn_model = clone(nn_model).set_params(algorithm=algorithm)
+
+    # Set the radius for RadiusNeighborsClassifier to some percentile of the
+    # empirical pairwise distances to avoid trivial test cases and warnings for
+    # predictions with no neighbors within the radius.
+    if "radius" in nn_model.get_params():
+        dists = pairwise_distances(X).ravel()
+        dists = dists[dists > 0]
+        nn_model.set_params(radius=np.percentile(dists, 80))
 
     loocv = cross_val_score(nn_model, X, y, cv=LeaveOneOut())
     nn_model.fit(X, y)
 
-    assert np.all(loocv == (nn_model.predict(None) == y))
-    assert np.mean(loocv) == nn_model.score(None, y)
+    assert_allclose(loocv, nn_model.predict(None) == y)
+    assert np.mean(loocv) == pytest.approx(nn_model.score(None, y))
+
+    # Evaluating `nn_model` on its "training" set should lead to a higher
+    # accuracy value than leaving out each data point in turn because the
+    # former can overfit while the latter cannot by construction.
     assert nn_model.score(None, y) < nn_model.score(X, y)
 
 
@@ -2420,16 +2442,32 @@ def test_neighbor_classifiers_loocv(nn_model):
     "nn_model",
     [
         neighbors.KNeighborsRegressor(n_neighbors=10),
-        neighbors.RadiusNeighborsRegressor(radius=0.5),
+        neighbors.RadiusNeighborsRegressor(),
     ],
 )
-def test_neighbor_regressors_loocv(nn_model):
+@pytest.mark.parametrize("algorithm", ALGORITHMS)
+def test_neighbor_regressors_loocv(nn_model, algorithm):
     """Check that `predict` and related functions work fine with X=None"""
-    X, y = datasets.load_diabetes(return_X_y=True)
+    X, y = datasets.make_regression(n_samples=15, n_features=2, random_state=0)
 
     # Only checking cross_val_predict and not cross_val_score because
-    # cross_val_score does not work with LeaveOneOut() for a regressor
+    # cross_val_score does not work with LeaveOneOut() for a regressor: the
+    # default score method implements the R2 score, which is not well defined
+    # for a single data point.
+    #
+    # TODO: if score is refactored to evaluate models for other scoring
+    # functions, then this test can be extended to check cross_val_score as
+    # well.
+    nn_model = clone(nn_model).set_params(algorithm=algorithm)
+
+    # Set the radius for RadiusNeighborsRegressor to some percentile of the
+    # empirical pairwise distances to avoid trivial test cases and warnings for
+    # predictions with no neighbors within the radius.
+    if "radius" in nn_model.get_params():
+        dists = pairwise_distances(X).ravel()
+        dists = dists[dists > 0]
+        nn_model.set_params(radius=np.percentile(dists, 80))
+
     loocv = cross_val_predict(nn_model, X, y, cv=LeaveOneOut())
     nn_model.fit(X, y)
-
-    assert np.all(loocv == nn_model.predict(None))
+    assert_allclose(loocv, nn_model.predict(None))
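For context, the equivalence that the new docstring describes can be exercised directly. The following standalone snippet (not part of the patch) is a minimal sketch, assuming a scikit-learn version that includes the X=None support these tests exercise:

    import numpy as np
    from sklearn import datasets, neighbors
    from sklearn.model_selection import LeaveOneOut, cross_val_predict

    X, y = datasets.make_blobs(n_samples=15, centers=5, n_features=2, random_state=0)
    model = neighbors.KNeighborsClassifier(n_neighbors=10).fit(X, y)

    # predict(None) classifies each training point from its *other* neighbors,
    # so it should match explicit leave-one-out predictions without refitting.
    loocv_pred = cross_val_predict(model, X, y, cv=LeaveOneOut())
    np.testing.assert_array_equal(loocv_pred, model.predict(None))

Here cross_val_predict clones and refits the estimator once per sample, whereas predict(None) reuses the single index built by fit; the tests in the patch assert that the two routes yield identical predictions.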