Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 07c0688

Browse filesBrowse files
OmarManzoorOmar Salmanjjerphan
authored andcommitted
PERF Implement PairwiseDistancesReduction backend for RadiusNeighbors.predict_proba (scikit-learn#26828)
Co-authored-by: Omar Salman <omar.salman@arbisoft> Co-authored-by: Julien Jerphanion <git@jjerphan.xyz>
1 parent bce20d9 commit 07c0688
Copy full SHA for 07c0688

File tree

9 files changed

+572
-2
lines changed
Filter options

9 files changed

+572
-2
lines changed

‎.gitignore

Copy file name to clipboardExpand all lines: .gitignore
+1Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
9999
sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
100100
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
101101
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
102+
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx
102103
sklearn/neighbors/_ball_tree.pyx
103104
sklearn/neighbors/_binary_tree.pxi
104105
sklearn/neighbors/_kd_tree.pyx

‎doc/whats_new/v1.4.rst

Copy file name to clipboardExpand all lines: doc/whats_new/v1.4.rst
+5Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,11 @@ Changelog
280280
:class:`metric.DistanceMetric` objects.
281281
:pr:`26267` by :user:`Meekail Zain <micky774>`
282282

283+
- |Efficiency| The performance of :meth:`neighbors.RadiusNeighborsClassifier.predict`
284+
and of :meth:`neighbors.RadiusNeighborsClassifier.predict_proba` has been improved
285+
when `radius` is large and `algorithm="brute"` with non-Euclidean metrics.
286+
:pr:`26828` by :user:`Omar Salman <OmarManzoor>`.
287+
283288
:mod:`sklearn.preprocessing`
284289
............................
285290

‎setup.cfg

Copy file name to clipboardExpand all lines: setup.cfg
+1Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ ignore =
5353
sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
5454
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
5555
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
56+
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx
5657
sklearn/neighbors/_ball_tree.pyx
5758
sklearn/neighbors/_binary_tree.pxi
5859
sklearn/neighbors/_kd_tree.pyx

‎setup.py

Copy file name to clipboardExpand all lines: setup.py
+6Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,12 @@ def check_package_status(package, min_version):
295295
"include_np": True,
296296
"extra_compile_args": ["-std=c++11"],
297297
},
298+
{
299+
"sources": ["_radius_neighbors_classmode.pyx.tp"],
300+
"language": "c++",
301+
"include_np": True,
302+
"extra_compile_args": ["-std=c++11"],
303+
},
298304
],
299305
"preprocessing": [
300306
{"sources": ["_csr_polynomial_expansion.pyx"]},

‎sklearn/metrics/_pairwise_distances_reduction/__init__.py

Copy file name to clipboardExpand all lines: sklearn/metrics/_pairwise_distances_reduction/__init__.py
+2Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@
9191
ArgKminClassMode,
9292
BaseDistancesReductionDispatcher,
9393
RadiusNeighbors,
94+
RadiusNeighborsClassMode,
9495
sqeuclidean_row_norms,
9596
)
9697

@@ -99,5 +100,6 @@
99100
"ArgKmin",
100101
"RadiusNeighbors",
101102
"ArgKminClassMode",
103+
"RadiusNeighborsClassMode",
102104
"sqeuclidean_row_norms",
103105
]

‎sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py

Copy file name to clipboardExpand all lines: sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
+154Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
RadiusNeighbors32,
2424
RadiusNeighbors64,
2525
)
26+
from ._radius_neighbors_classmode import (
27+
RadiusNeighborsClassMode32,
28+
RadiusNeighborsClassMode64,
29+
)
2630

2731

2832
def sqeuclidean_row_norms(X, num_threads):
@@ -597,3 +601,153 @@ def compute(
597601
"Only float64 or float32 datasets pairs are supported at this time, "
598602
f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
599603
)
604+
605+
606+
class RadiusNeighborsClassMode(BaseDistancesReductionDispatcher):
607+
"""Compute radius-based class modes of row vectors of X using the
608+
those of Y.
609+
610+
For each row-vector X[i] of the queries X, find all the indices j of
611+
row-vectors in Y such that:
612+
613+
dist(X[i], Y[j]) <= radius
614+
615+
RadiusNeighborsClassMode is typically used to perform bruteforce
616+
radius neighbors queries when the weighted mode of the labels for
617+
the nearest neighbors within the specified radius are required,
618+
such as in `predict` methods.
619+
620+
This class is not meant to be instantiated, one should only use
621+
its :meth:`compute` classmethod which handles allocation and
622+
deallocation consistently.
623+
"""
624+
625+
@classmethod
626+
def valid_metrics(cls) -> List[str]:
627+
excluded = {
628+
# Euclidean is technically usable for RadiusNeighborsClassMode
629+
# but it would not be competitive.
630+
# TODO: implement Euclidean specialization using GEMM.
631+
"euclidean",
632+
"sqeuclidean",
633+
}
634+
return sorted(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded)
635+
636+
@classmethod
637+
def compute(
638+
cls,
639+
X,
640+
Y,
641+
radius,
642+
weights,
643+
Y_labels,
644+
unique_Y_labels,
645+
outlier_label,
646+
metric="euclidean",
647+
chunk_size=None,
648+
metric_kwargs=None,
649+
strategy=None,
650+
):
651+
"""Return the results of the reduction for the given arguments.
652+
Parameters
653+
----------
654+
X : ndarray of shape (n_samples_X, n_features)
655+
The input array to be labelled.
656+
Y : ndarray of shape (n_samples_Y, n_features)
657+
The input array whose class membership is provided through
658+
the `Y_labels` parameter.
659+
radius : float
660+
The radius defining the neighborhood.
661+
weights : ndarray
662+
The weights applied to the `Y_labels` when computing the
663+
weighted mode of the labels.
664+
Y_labels : ndarray
665+
An array containing the index of the class membership of the
666+
associated samples in `Y`. This is used in labeling `X`.
667+
unique_Y_labels : ndarray
668+
An array containing all unique class labels.
669+
outlier_label : int, default=None
670+
Label for outlier samples (samples with no neighbors in given
671+
radius). In the default case when the value is None if any
672+
outlier is detected, a ValueError will be raised. The outlier
673+
label should be selected from among the unique 'Y' labels. If
674+
it is specified with a different value a warning will be raised
675+
and all class probabilities of outliers will be assigned to be 0.
676+
metric : str, default='euclidean'
677+
The distance metric to use. For a list of available metrics, see
678+
the documentation of :class:`~sklearn.metrics.DistanceMetric`.
679+
Currently does not support `'precomputed'`.
680+
chunk_size : int, default=None,
681+
The number of vectors per chunk. If None (default) looks-up in
682+
scikit-learn configuration for `pairwise_dist_chunk_size`,
683+
and use 256 if it is not set.
684+
metric_kwargs : dict, default=None
685+
Keyword arguments to pass to specified metric function.
686+
strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
687+
The chunking strategy defining which dataset parallelization are made on.
688+
For both strategies the computations happens with two nested loops,
689+
respectively on chunks of X and chunks of Y.
690+
Strategies differs on which loop (outer or inner) is made to run
691+
in parallel with the Cython `prange` construct:
692+
- 'parallel_on_X' dispatches chunks of X uniformly on threads.
693+
Each thread then iterates on all the chunks of Y. This strategy is
694+
embarrassingly parallel and comes with no datastructures
695+
synchronisation.
696+
- 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
697+
Each thread processes all the chunks of X in turn. This strategy is
698+
a sequence of embarrassingly parallel subtasks (the inner loop on Y
699+
chunks) with intermediate datastructures synchronisation at each
700+
iteration of the sequential outer loop on X chunks.
701+
- 'auto' relies on a simple heuristic to choose between
702+
'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
703+
'parallel_on_X' is usually the most efficient strategy.
704+
When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
705+
brings more opportunity for parallelism and is therefore more efficient
706+
despite the synchronization step at each iteration of the outer loop
707+
on chunks of `X`.
708+
- None (default) looks-up in scikit-learn configuration for
709+
`pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
710+
Returns
711+
-------
712+
probabilities : ndarray of shape (n_samples_X, n_classes)
713+
An array containing the class probabilities for each sample.
714+
"""
715+
if weights not in {"uniform", "distance"}:
716+
raise ValueError(
717+
"Only the 'uniform' or 'distance' weights options are supported"
718+
f" at this time. Got: {weights=}."
719+
)
720+
if X.dtype == Y.dtype == np.float64:
721+
return RadiusNeighborsClassMode64.compute(
722+
X=X,
723+
Y=Y,
724+
radius=radius,
725+
weights=weights,
726+
Y_labels=np.array(Y_labels, dtype=np.intp),
727+
unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
728+
outlier_label=outlier_label,
729+
metric=metric,
730+
chunk_size=chunk_size,
731+
metric_kwargs=metric_kwargs,
732+
strategy=strategy,
733+
)
734+
735+
if X.dtype == Y.dtype == np.float32:
736+
return RadiusNeighborsClassMode32.compute(
737+
X=X,
738+
Y=Y,
739+
radius=radius,
740+
weights=weights,
741+
Y_labels=np.array(Y_labels, dtype=np.intp),
742+
unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
743+
outlier_label=outlier_label,
744+
metric=metric,
745+
chunk_size=chunk_size,
746+
metric_kwargs=metric_kwargs,
747+
strategy=strategy,
748+
)
749+
750+
raise ValueError(
751+
"Only float64 or float32 datasets pairs are supported at this time, "
752+
f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
753+
)

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.