ENH Make KNeighborsClassifier.predict handle X=None #30047

Merged · 8 commits · Oct 18, 2024
@@ -0,0 +1,6 @@
+- Make `predict`, `predict_proba`, and `score` of
+  :class:`neighbors.KNeighborsClassifier` and
+  :class:`neighbors.RadiusNeighborsClassifier` accept `X=None` as input. In this
+  case predictions for all training set points are returned, and points are not
+  considered their own neighbors.
+  :pr:`30047` by :user:`Dmitry Kobak <dkobak>`.
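
Note: a minimal usage sketch of the behavior this changelog entry describes (assumes a scikit-learn build that includes this PR; the dataset and hyperparameters are arbitrary illustrations):

```python
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier

X, y = make_blobs(n_samples=100, centers=3, random_state=0)
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)

# With X=None, predictions are made for every training point, and each
# point is excluded from its own neighborhood (leave-one-out behavior).
loo_pred = knn.predict(None)   # shape (100,)
loo_acc = knn.score(None, y)   # leave-one-out accuracy
```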
96 changes: 85 additions & 11 deletions — sklearn/neighbors/_classification.py
@@ -244,8 +244,10 @@ def predict(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_queries, n_features), \
-                or (n_queries, n_indexed) if metric == 'precomputed'
-            Test samples.
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.

         Returns
         -------
@@ -281,7 +283,7 @@ def predict(self, X):
             classes_ = [self.classes_]

         n_outputs = len(classes_)
-        n_queries = _num_samples(X)
+        n_queries = _num_samples(self._fit_X if X is None else X)
         weights = _get_weights(neigh_dist, self.weights)
         if weights is not None and _all_with_any_reduction_axis_1(weights, value=0):
             raise ValueError(
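
Note: the `X is None` dispatch leans on the existing `kneighbors(X=None)` convention, where neighbors are computed for every indexed point and each point is excluded from its own neighbor set. A minimal sketch of that pre-existing behavior:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.array([[0.0], [1.0], [2.0]])
nn = NearestNeighbors(n_neighbors=1).fit(X)

# With X=None, each indexed point is excluded from its own neighbor set,
# so its nearest neighbor is always some *other* training point.
dist, ind = nn.kneighbors(None)
print(ind.ravel())  # e.g. [1, 0, 1]
```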
@@ -311,8 +313,10 @@ def predict_proba(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_queries, n_features), \
-                or (n_queries, n_indexed) if metric == 'precomputed'
-            Test samples.
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.

         Returns
         -------
@@ -375,7 +379,7 @@ def predict_proba(self, X):
             _y = self._y.reshape((-1, 1))
             classes_ = [self.classes_]

-        n_queries = _num_samples(X)
+        n_queries = _num_samples(self._fit_X if X is None else X)

         weights = _get_weights(neigh_dist, self.weights)
         if weights is None:
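
Note: `predict_proba` follows the same pattern; with `X=None` it returns leave-one-out class probabilities for every training point. A brief sketch on arbitrary toy data:

```python
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier

X, y = make_blobs(n_samples=100, centers=3, random_state=0)
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)

# Row i holds class probabilities for X[i], estimated from its
# neighbors with X[i] itself excluded.
proba = knn.predict_proba(None)
print(proba.shape)  # (100, 3)
```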
@@ -408,6 +412,39 @@ def predict_proba(self, X):

         return probabilities

+    # This function is defined here only to modify the parent docstring
+    # and add information about X=None
+    def score(self, X, y, sample_weight=None):
+        """
+        Return the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features), or None
+            Test samples. If `None`, predictions for all indexed points are
+            used; in this case, points are not considered their own
+            neighbors. This means that `knn.fit(X, y).score(None, y)`
+            implicitly performs a leave-one-out cross-validation procedure
+            and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())`
+            but typically much faster.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            True labels for `X`.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
+        """
+        return super().score(X, y, sample_weight)

     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.classifier_tags.multi_label = True
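
Note: a sketch verifying the equivalence claimed in the docstring above (assuming deterministic neighbor tie-breaking, both paths should agree):

```python
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X, y = make_blobs(n_samples=200, centers=3, random_state=0)
knn = KNeighborsClassifier(n_neighbors=5)

# Leave-one-out accuracy the slow way: one fit per held-out sample.
loo_acc = cross_val_score(knn, X, y, cv=LeaveOneOut()).mean()

# The fast way: a single fit, then score with X=None.
fast_acc = knn.fit(X, y).score(None, y)
assert np.isclose(loo_acc, fast_acc)
```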
@@ -692,8 +729,10 @@ def predict(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_queries, n_features), \
-                or (n_queries, n_indexed) if metric == 'precomputed'
-            Test samples.
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.

         Returns
         -------
@@ -734,8 +773,10 @@ def predict_proba(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_queries, n_features), \
-                or (n_queries, n_indexed) if metric == 'precomputed'
-            Test samples.
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.

         Returns
         -------
@@ -745,7 +786,7 @@ def predict_proba(self, X):
             by lexicographic order.
         """
         check_is_fitted(self, "_fit_method")
-        n_queries = _num_samples(X)
+        n_queries = _num_samples(self._fit_X if X is None else X)

         metric, metric_kwargs = _adjusted_metric(
             metric=self.metric, metric_kwargs=self.metric_params, p=self.p
@@ -846,6 +887,39 @@ def predict_proba(self, X):

         return probabilities

+    # This function is defined here only to modify the parent docstring
+    # and add information about X=None
+    def score(self, X, y, sample_weight=None):
+        """
+        Return the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features), or None
+            Test samples. If `None`, predictions for all indexed points are
+            used; in this case, points are not considered their own
+            neighbors. This means that `knn.fit(X, y).score(None, y)`
+            implicitly performs a leave-one-out cross-validation procedure
+            and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())`
+            but typically much faster.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            True labels for `X`.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
+        """
+        return super().score(X, y, sample_weight)

     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.classifier_tags.multi_label = True
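Note: the same shortcut applies to the radius-based classifier; a small sketch (the radius value is an arbitrary choice for this toy data, mirroring the test added below):

```python
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.neighbors import RadiusNeighborsClassifier

X, y = make_blobs(n_samples=500, centers=5, n_features=2, random_state=0)
rnc = RadiusNeighborsClassifier(radius=5.0)

# LOOCV the slow way, then the fast way via score(None, y).
loo_acc = cross_val_score(rnc, X, y, cv=LeaveOneOut()).mean()
assert np.isclose(loo_acc, rnc.fit(X, y).score(None, y))
```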
12 changes: 8 additions & 4 deletions — sklearn/neighbors/_regression.py
@@ -234,8 +234,10 @@ def predict(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_queries, n_features), \
-                or (n_queries, n_indexed) if metric == 'precomputed'
-            Test samples.
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.

         Returns
         -------
@@ -464,8 +466,10 @@ def predict(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_queries, n_features), \
-                or (n_queries, n_indexed) if metric == 'precomputed'
-            Test samples.
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.

         Returns
         -------
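Note: the regressors gain the same `X=None` path for `predict`. A sketch mirroring the test added below (leave-one-out predictions from a single fit):

```python
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.neighbors import KNeighborsRegressor

X, y = load_diabetes(return_X_y=True)
reg = KNeighborsRegressor(n_neighbors=10).fit(X, y)

# Each training point is predicted from its neighbors, excluding itself.
loo_pred = reg.predict(None)

# Matches the much slower per-sample refitting approach.
slow = cross_val_predict(KNeighborsRegressor(n_neighbors=10), X, y, cv=LeaveOneOut())
assert np.allclose(loo_pred, slow)
```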
45 changes: 44 additions & 1 deletion — sklearn/neighbors/tests/test_neighbors.py
@@ -24,7 +24,12 @@
     assert_compatible_argkmin_results,
     assert_compatible_radius_results,
 )
-from sklearn.model_selection import cross_val_score, train_test_split
+from sklearn.model_selection import (
+    LeaveOneOut,
+    cross_val_predict,
+    cross_val_score,
+    train_test_split,
+)
 from sklearn.neighbors import (
     VALID_METRICS_SPARSE,
     KNeighborsRegressor,
@@ -2390,3 +2395,41 @@ def _weights(dist):

     with pytest.raises(ValueError, match=msg):
         est.predict_proba([[1.1, 1.1]])
+
+
+@pytest.mark.parametrize(
+    "nn_model",
+    [
+        neighbors.KNeighborsClassifier(n_neighbors=10),
+        neighbors.RadiusNeighborsClassifier(radius=5.0),
+    ],
+)
+def test_neighbor_classifiers_loocv(nn_model):
+    """Check that `predict` and related functions work fine with X=None"""
+    X, y = datasets.make_blobs(n_samples=500, centers=5, n_features=2, random_state=0)
+
+    loocv = cross_val_score(nn_model, X, y, cv=LeaveOneOut())
+    nn_model.fit(X, y)
+
+    assert np.all(loocv == (nn_model.predict(None) == y))
+    assert np.mean(loocv) == nn_model.score(None, y)
+    assert nn_model.score(None, y) < nn_model.score(X, y)
+
+
+@pytest.mark.parametrize(
+    "nn_model",
+    [
+        neighbors.KNeighborsRegressor(n_neighbors=10),
+        neighbors.RadiusNeighborsRegressor(radius=0.5),
+    ],
+)
+def test_neighbor_regressors_loocv(nn_model):
+    """Check that `predict` and related functions work fine with X=None"""
+    X, y = datasets.load_diabetes(return_X_y=True)
+
+    # Only checking cross_val_predict and not cross_val_score because
+    # cross_val_score does not work with LeaveOneOut() for a regressor
+    loocv = cross_val_predict(nn_model, X, y, cv=LeaveOneOut())
+    nn_model.fit(X, y)
+
+    assert np.all(loocv == nn_model.predict(None))
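
Note: as the comment in the test explains, `cross_val_score` with `LeaveOneOut()` fails for regressors because R² is undefined on a single test sample. A hedged sketch of how an overall leave-one-out R² can still be computed from the pooled predictions:

```python
from sklearn.datasets import load_diabetes
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor

X, y = load_diabetes(return_X_y=True)
reg = KNeighborsRegressor(n_neighbors=10).fit(X, y)

# Pool the leave-one-out predictions, then compute a single R^2 over them.
loo_pred = reg.predict(None)
print(r2_score(y, loo_pred))
```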
Morty Proxy This is a proxified and sanitized view of the page, visit original site.