23
23
RadiusNeighbors32 ,
24
24
RadiusNeighbors64 ,
25
25
)
26
+ from ._radius_neighbors_classmode import (
27
+ RadiusNeighborsClassMode32 ,
28
+ RadiusNeighborsClassMode64 ,
29
+ )
26
30
27
31
28
32
def sqeuclidean_row_norms (X , num_threads ):
@@ -597,3 +601,153 @@ def compute(
597
601
"Only float64 or float32 datasets pairs are supported at this time, "
598
602
f"got: X.dtype={ X .dtype } and Y.dtype={ Y .dtype } ."
599
603
)
604
+
605
+
606
class RadiusNeighborsClassMode(BaseDistancesReductionDispatcher):
    """Compute radius-based class modes of row vectors of X using those of Y.

    For each row-vector X[i] of the queries X, find all the indices j of
    row-vectors in Y such that:

                        dist(X[i], Y[j]) <= radius

    RadiusNeighborsClassMode is typically used to perform bruteforce
    radius neighbors queries when the weighted mode of the labels for
    the nearest neighbors within the specified radius are required,
    such as in `predict` methods.

    This class is not meant to be instantiated, one should only use
    its :meth:`compute` classmethod which handles allocation and
    deallocation consistently.
    """

    @classmethod
    def valid_metrics(cls) -> List[str]:
        """Return the sorted names of the metrics supported by this dispatcher.

        These are the metrics reported by
        `BaseDistancesReductionDispatcher.valid_metrics()` minus
        `'euclidean'` and `'sqeuclidean'`.
        """
        excluded = {
            # Euclidean is technically usable for RadiusNeighborsClassMode
            # but it would not be competitive.
            # TODO: implement Euclidean specialization using GEMM.
            "euclidean",
            "sqeuclidean",
        }
        return sorted(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded)

    @classmethod
    def compute(
        cls,
        X,
        Y,
        radius,
        weights,
        Y_labels,
        unique_Y_labels,
        outlier_label,
        metric="euclidean",
        chunk_size=None,
        metric_kwargs=None,
        strategy=None,
    ):
        """Return the results of the reduction for the given arguments.

        The call is forwarded to the float64 or float32 implementation
        depending on the (common) dtype of `X` and `Y`.

        Parameters
        ----------
        X : ndarray of shape (n_samples_X, n_features)
            The input array to be labelled.

        Y : ndarray of shape (n_samples_Y, n_features)
            The input array whose class membership is provided through
            the `Y_labels` parameter.

        radius : float
            The radius defining the neighborhood.

        weights : {'uniform', 'distance'}
            The weighting scheme applied to the `Y_labels` when computing
            the weighted mode of the labels. Any other value is rejected.

        Y_labels : array-like
            An array containing the index of the class membership of the
            associated samples in `Y`. This is used in labeling `X`.
            It is converted to an array of dtype `np.intp` before dispatch.

        unique_Y_labels : array-like
            An array containing all unique class labels. It is converted to
            an array of dtype `np.intp` before dispatch.

        outlier_label : int or None
            Label for outlier samples (samples with no neighbors in given
            radius). When the value is None, if any outlier is detected, a
            ValueError will be raised. The outlier label should be selected
            from among the unique 'Y' labels. If it is specified with a
            different value a warning will be raised and all class
            probabilities of outliers will be assigned to be 0.

        metric : str, default='euclidean'
            The distance metric to use. For a list of available metrics, see
            the documentation of :class:`~sklearn.metrics.DistanceMetric`.
            Currently does not support `'precomputed'`.
            Note that :meth:`valid_metrics` excludes `'euclidean'` and
            `'sqeuclidean'`, and that this method does not itself validate
            `metric`: it is passed through unchanged.

        chunk_size : int, default=None
            The number of vectors per chunk. If None (default) looks-up in
            scikit-learn configuration for `pairwise_dist_chunk_size`,
            and use 256 if it is not set.

        metric_kwargs : dict, default=None
            Keyword arguments to pass to specified metric function.

        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
            The chunking strategy defining which dataset parallelization are
            made on.

            For both strategies the computations happens with two nested loops,
            respectively on chunks of X and chunks of Y.
            Strategies differs on which loop (outer or inner) is made to run
            in parallel with the Cython `prange` construct:

              - 'parallel_on_X' dispatches chunks of X uniformly on threads.
                Each thread then iterates on all the chunks of Y. This
                strategy is embarrassingly parallel and comes with no
                datastructures synchronisation.

              - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
                Each thread processes all the chunks of X in turn. This
                strategy is a sequence of embarrassingly parallel subtasks
                (the inner loop on Y chunks) with intermediate datastructures
                synchronisation at each iteration of the sequential outer
                loop on X chunks.

              - 'auto' relies on a simple heuristic to choose between
                'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is
                large enough, 'parallel_on_X' is usually the most efficient
                strategy. When `X.shape[0]` is small but `Y.shape[0]` is
                large, 'parallel_on_Y' brings more opportunity for
                parallelism and is therefore more efficient despite the
                synchronization step at each iteration of the outer loop
                on chunks of `X`.

              - None (default) looks-up in scikit-learn configuration for
                `pairwise_dist_parallel_strategy`, and use 'auto' if it is
                not set.

        Returns
        -------
        probabilities : ndarray of shape (n_samples_X, n_classes)
            An array containing the class probabilities for each sample.

        Raises
        ------
        ValueError
            If `weights` is neither `'uniform'` nor `'distance'`, or if `X`
            and `Y` are not both float64 or both float32.
        """
        # Check the type first: an unhashable value (e.g. a list or an
        # ndarray) would otherwise make the set-membership test raise
        # TypeError instead of the intended ValueError.
        if not (isinstance(weights, str) and weights in {"uniform", "distance"}):
            raise ValueError(
                "Only the 'uniform' or 'distance' weights options are supported"
                f" at this time. Got: {weights=}."
            )

        # Select the dtype-specialised backend; both datasets must share
        # the same floating point precision.
        if X.dtype == Y.dtype == np.float64:
            implementation = RadiusNeighborsClassMode64
        elif X.dtype == Y.dtype == np.float32:
            implementation = RadiusNeighborsClassMode32
        else:
            raise ValueError(
                "Only float64 or float32 datasets pairs are supported at this time, "
                f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
            )

        return implementation.compute(
            X=X,
            Y=Y,
            radius=radius,
            weights=weights,
            # The backends receive the labels as `np.intp` arrays.
            Y_labels=np.array(Y_labels, dtype=np.intp),
            unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
            outlier_label=outlier_label,
            metric=metric,
            chunk_size=chunk_size,
            metric_kwargs=metric_kwargs,
            strategy=strategy,
        )
0 commit comments