ENH Make KNeighborsClassifier.predict handle X=None #30047

Merged · 8 commits · Oct 18, 2024
@@ -0,0 +1,6 @@
+- Make `predict`, `predict_proba`, and `score` of
+  :class:`neighbors.KNeighborsClassifier` and
+  :class:`neighbors.RadiusNeighborsClassifier` accept `X=None` as input. In this
+  case predictions for all training set points are returned, and points are not
+  considered their own neighbors.
+  :pr:`30047` by :user:`Dmitry Kobak <dkobak>`.
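
Note: a minimal usage sketch of the behavior this changelog entry describes (assumes a scikit-learn build that includes this PR; the dataset and hyperparameters are arbitrary illustrations):

```python
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier

X, y = make_blobs(n_samples=100, centers=3, random_state=0)
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)

# With X=None, predictions are made for every training point, and each
# point is excluded from its own neighborhood (leave-one-out behavior).
loo_pred = knn.predict(None)   # shape (100,)
loo_acc = knn.score(None, y)   # leave-one-out accuracy
```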
96 changes: 85 additions & 11 deletions — sklearn/neighbors/_classification.py
@@ -244,8 +244,10 @@ def predict(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_queries, n_features), \
-                or (n_queries, n_indexed) if metric == 'precomputed'
-            Test samples.
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.

         Returns
         -------
@@ -281,7 +283,7 @@ def predict(self, X):
             classes_ = [self.classes_]

         n_outputs = len(classes_)
-        n_queries = _num_samples(X)
+        n_queries = _num_samples(self._fit_X if X is None else X)
         weights = _get_weights(neigh_dist, self.weights)
         if weights is not None and _all_with_any_reduction_axis_1(weights, value=0):
             raise ValueError(
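
Note: the `X is None` dispatch leans on the existing `kneighbors(X=None)` convention, where neighbors are computed for every indexed point and each point is excluded from its own neighbor set. A minimal sketch of that pre-existing behavior:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.array([[0.0], [1.0], [2.0]])
nn = NearestNeighbors(n_neighbors=1).fit(X)

# With X=None, each indexed point is excluded from its own neighbor set,
# so its nearest neighbor is always some *other* training point.
dist, ind = nn.kneighbors(None)
print(ind.ravel())  # e.g. [1, 0, 1]
```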
@@ -311,8 +313,10 @@ def predict_proba(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_queries, n_features), \
-                or (n_queries, n_indexed) if metric == 'precomputed'
-            Test samples.
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.

         Returns
         -------
@@ -375,7 +379,7 @@ def predict_proba(self, X):
             _y = self._y.reshape((-1, 1))
             classes_ = [self.classes_]

-        n_queries = _num_samples(X)
+        n_queries = _num_samples(self._fit_X if X is None else X)

         weights = _get_weights(neigh_dist, self.weights)
         if weights is None:
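
Note: `predict_proba` follows the same pattern; with `X=None` it returns leave-one-out class probabilities for every training point. A brief sketch on arbitrary toy data:

```python
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier

X, y = make_blobs(n_samples=100, centers=3, random_state=0)
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)

# Row i holds class probabilities for X[i], estimated from its
# neighbors with X[i] itself excluded.
proba = knn.predict_proba(None)
print(proba.shape)  # (100, 3)
```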
@@ -408,6 +412,39 @@ def predict_proba(self, X):

         return probabilities

+    # This function is defined here only to modify the parent docstring
+    # and add information about X=None
+    def score(self, X, y, sample_weight=None):
+        """
+        Return the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features), or None
+            Test samples. If `None`, predictions for all indexed points are
+            used; in this case, points are not considered their own
+            neighbors. This means that `knn.fit(X, y).score(None, y)`
+            implicitly performs a leave-one-out cross-validation procedure
+            and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())`
+            but typically much faster.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            True labels for `X`.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
+        """
+        return super().score(X, y, sample_weight)

     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.classifier_tags.multi_label = True
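
Note: a sketch verifying the equivalence claimed in the docstring above (assuming deterministic neighbor tie-breaking, both paths should agree):

```python
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X, y = make_blobs(n_samples=200, centers=3, random_state=0)
knn = KNeighborsClassifier(n_neighbors=5)

# Leave-one-out accuracy the slow way: one fit per held-out sample.
loo_acc = cross_val_score(knn, X, y, cv=LeaveOneOut()).mean()

# The fast way: a single fit, then score with X=None.
fast_acc = knn.fit(X, y).score(None, y)
assert np.isclose(loo_acc, fast_acc)
```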
@@ -692,8 +729,10 @@ def predict(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_queries, n_features), \
-                or (n_queries, n_indexed) if metric == 'precomputed'
-            Test samples.
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.

         Returns
         -------
@@ -734,8 +773,10 @@ def predict_proba(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_queries, n_features), \
-                or (n_queries, n_indexed) if metric == 'precomputed'
-            Test samples.
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.

         Returns
         -------
@@ -745,7 +786,7 @@ def predict_proba(self, X):
             by lexicographic order.
         """
         check_is_fitted(self, "_fit_method")
-        n_queries = _num_samples(X)
+        n_queries = _num_samples(self._fit_X if X is None else X)

         metric, metric_kwargs = _adjusted_metric(
             metric=self.metric, metric_kwargs=self.metric_params, p=self.p
@@ -846,6 +887,39 @@ def predict_proba(self, X):

         return probabilities

+    # This function is defined here only to modify the parent docstring
+    # and add information about X=None
+    def score(self, X, y, sample_weight=None):
+        """
+        Return the mean accuracy on the given test data and labels.
+
+        In multi-label classification, this is the subset accuracy
+        which is a harsh metric since you require for each sample that
+        each label set be correctly predicted.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features), or None
+            Test samples. If `None`, predictions for all indexed points are
+            used; in this case, points are not considered their own
+            neighbors. This means that `knn.fit(X, y).score(None, y)`
+            implicitly performs a leave-one-out cross-validation procedure
+            and is equivalent to `cross_val_score(knn, X, y, cv=LeaveOneOut())`
+            but typically much faster.
+
+        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
+            True labels for `X`.
+
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights.
+
+        Returns
+        -------
+        score : float
+            Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
+        """
+        return super().score(X, y, sample_weight)

     def __sklearn_tags__(self):
         tags = super().__sklearn_tags__()
         tags.classifier_tags.multi_label = True
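Note: the same shortcut applies to the radius-based classifier; a small sketch (the radius value is an arbitrary choice for this toy data, mirroring the test added below):

```python
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.neighbors import RadiusNeighborsClassifier

X, y = make_blobs(n_samples=500, centers=5, n_features=2, random_state=0)
rnc = RadiusNeighborsClassifier(radius=5.0)

# LOOCV the slow way, then the fast way via score(None, y).
loo_acc = cross_val_score(rnc, X, y, cv=LeaveOneOut()).mean()
assert np.isclose(loo_acc, rnc.fit(X, y).score(None, y))
```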
12 changes: 8 additions & 4 deletions — sklearn/neighbors/_regression.py
@@ -234,8 +234,10 @@ def predict(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_queries, n_features), \
-                or (n_queries, n_indexed) if metric == 'precomputed'
-            Test samples.
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.

         Returns
         -------
@@ -464,8 +466,10 @@ def predict(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_queries, n_features), \
-                or (n_queries, n_indexed) if metric == 'precomputed'
-            Test samples.
+                or (n_queries, n_indexed) if metric == 'precomputed', or None
+            Test samples. If `None`, predictions for all indexed points are
+            returned; in this case, points are not considered their own
+            neighbors.

         Returns
         -------
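Note: the regressors gain the same `X=None` path for `predict`. A sketch mirroring the test added below (leave-one-out predictions from a single fit):

```python
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.neighbors import KNeighborsRegressor

X, y = load_diabetes(return_X_y=True)
reg = KNeighborsRegressor(n_neighbors=10).fit(X, y)

# Each training point is predicted from its neighbors, excluding itself.
loo_pred = reg.predict(None)

# Matches the much slower per-sample refitting approach.
slow = cross_val_predict(KNeighborsRegressor(n_neighbors=10), X, y, cv=LeaveOneOut())
assert np.allclose(loo_pred, slow)
```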
45 changes: 44 additions & 1 deletion — sklearn/neighbors/tests/test_neighbors.py
@@ -24,7 +24,12 @@
     assert_compatible_argkmin_results,
     assert_compatible_radius_results,
 )
-from sklearn.model_selection import cross_val_score, train_test_split
+from sklearn.model_selection import (
+    LeaveOneOut,
+    cross_val_predict,
+    cross_val_score,
+    train_test_split,
+)
 from sklearn.neighbors import (
     VALID_METRICS_SPARSE,
     KNeighborsRegressor,
@@ -2390,3 +2395,41 @@ def _weights(dist):

     with pytest.raises(ValueError, match=msg):
         est.predict_proba([[1.1, 1.1]])
+
+
+@pytest.mark.parametrize(
+    "nn_model",
+    [
+        neighbors.KNeighborsClassifier(n_neighbors=10),
+        neighbors.RadiusNeighborsClassifier(radius=5.0),
+    ],
+)
+def test_neighbor_classifiers_loocv(nn_model):
+    """Check that `predict` and related functions work fine with X=None"""
+    X, y = datasets.make_blobs(n_samples=500, centers=5, n_features=2, random_state=0)
+
+    loocv = cross_val_score(nn_model, X, y, cv=LeaveOneOut())
+    nn_model.fit(X, y)
+
+    assert np.all(loocv == (nn_model.predict(None) == y))
+    assert np.mean(loocv) == nn_model.score(None, y)
+    assert nn_model.score(None, y) < nn_model.score(X, y)
+
+
+@pytest.mark.parametrize(
+    "nn_model",
+    [
+        neighbors.KNeighborsRegressor(n_neighbors=10),
+        neighbors.RadiusNeighborsRegressor(radius=0.5),
+    ],
+)
+def test_neighbor_regressors_loocv(nn_model):
+    """Check that `predict` and related functions work fine with X=None"""
+    X, y = datasets.load_diabetes(return_X_y=True)
+
+    # Only checking cross_val_predict and not cross_val_score because
+    # cross_val_score does not work with LeaveOneOut() for a regressor
+    loocv = cross_val_predict(nn_model, X, y, cv=LeaveOneOut())
+    nn_model.fit(X, y)
+
+    assert np.all(loocv == nn_model.predict(None))
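
Note: as the comment in the test explains, `cross_val_score` with `LeaveOneOut()` fails for regressors because R² is undefined on a single test sample. A hedged sketch of how an overall leave-one-out R² can still be computed from the pooled predictions:

```python
from sklearn.datasets import load_diabetes
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor

X, y = load_diabetes(return_X_y=True)
reg = KNeighborsRegressor(n_neighbors=10).fit(X, y)

# Pool the leave-one-out predictions, then compute a single R^2 over them.
loo_pred = reg.predict(None)
print(r2_score(y, loo_pred))
```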
Morty Proxy This is a proxified and sanitized view of the page, visit original site.