Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 708c668

Browse filesBrowse files
committed
Added docstring for random_state_ attribute
1 parent 8291d00 commit 708c668
Copy full SHA for 708c668

File tree

1 file changed

+204
-0
lines changed
Filter options

1 file changed

+204
-0
lines changed

‎sklearn/neighbors/_subsampled.py

Copy file name to clipboard
+204Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
"""Subsampled neighbors transformer"""
2+
3+
# Author: Jennifer Jang <j.jang42@gmail.com>
4+
# Heinrich Jiang <heinrichj@google.com>
5+
#
6+
# License: BSD 3 clause
7+
8+
import numpy as np
9+
10+
from ..metrics.pairwise import paired_distances, PAIRED_DISTANCES
11+
from ..utils.random import check_random_state
12+
from ..base import TransformerMixin, BaseEstimator
13+
from ..utils.validation import check_is_fitted
14+
15+
16+
class SubsampledNeighborsTransformer(TransformerMixin, BaseEstimator):
17+
"""Compute subsampled sparse distance matrix of neighboring points in X.
18+
19+
Parameters
20+
----------
21+
22+
s : float
23+
Sampling probability.
24+
25+
eps : float, default=None
26+
Neighborhood radius. Pairs of points which are at most eps apart are
27+
considered neighbors. If None, radius is assumed to be infinity.
28+
29+
metric : string or callable, default='euclidean'
30+
Input to paired_distances function. Can be string specified
31+
in PAIRED_DISTANCES, including "euclidean", "manhattan", or
32+
"cosine." Alternatively, can be a callable function, which should
33+
take two arrays from X as input and return a value indicating
34+
the distance between them.
35+
36+
random_seed : int, default=None
37+
Seeds the random sampling of lists of vertices.
38+
39+
Attributes
40+
----------
41+
fit_X_ : array-like of shape (n_train, n_features)
42+
Training set
43+
44+
n_train_ : int
45+
Number of training samples
46+
47+
random_state_ : numpy.RandomState
48+
Pseudo random number generator object used for sampling.
49+
50+
References
51+
----------
52+
- Faster DBSCAN via subsampled similarity queries, 2020
53+
Heinrich Jiang, Jennifer Jang, Jakub Łącki
54+
https://arxiv.org/abs/2006.06743
55+
56+
Notes
57+
-----
58+
Each pair of points is sampled uniformly with probability s.
59+
"""
60+
61+
def __init__(self, s=0.1, eps=None, metric='euclidean', random_state=None):
62+
self.s = s
63+
self.eps = eps
64+
self.metric = metric
65+
self.random_state = random_state
66+
self._check_parameters()
67+
68+
def _check_parameters(self):
69+
if self.s < 0:
70+
raise ValueError("Sampling rate needs to be non-negative: %s" %
71+
self.s)
72+
73+
if self.eps is not None and self.eps <= 0:
74+
raise ValueError("Epsilon needs to be positive: %s" % self.eps)
75+
76+
if self.metric not in PAIRED_DISTANCES and not callable(self.metric):
77+
raise ValueError('Unknown distance %s' % self.metric)
78+
79+
return self
80+
81+
def fit(self, X, Y=None):
82+
83+
self.fit_X_ = self._validate_data(X, accept_sparse='csr')
84+
self.n_train_ = self.fit_X_.shape[0]
85+
self.random_state_ = check_random_state(self.random_state)
86+
87+
return self
88+
89+
def transform(self, X):
90+
"""Transform data into a subsampled graph of neighbors.
91+
92+
Parameters
93+
----------
94+
X : array-like of shape (n, n_features)
95+
Sample data.
96+
97+
Returns
98+
-------
99+
neighborhood : sparse matrix of shape (n, n)
100+
Sparse matrix where the i-jth value is equal to the distance
101+
between X[i] and fit_X[j] for randomly sampled pairs of neighbors.
102+
The matrix is of CSR format.
103+
"""
104+
105+
check_is_fitted(self)
106+
107+
return self.subsampled_neighbors(X)
108+
109+
def subsampled_neighbors(self, X):
110+
"""Compute the subsampled sparse distance matrix of the neighboring
111+
points of X in fit_X.
112+
113+
Parameters
114+
----------
115+
X : array-like of shape (n, n_features)
116+
Sample data.
117+
118+
s : float
119+
Sampling probability.
120+
121+
eps : float, default=None
122+
Neighborhood radius. Pairs of points which are at most eps apart
123+
are considered neighbors. If not given, radius is assumed to be
124+
infinity.
125+
126+
metric : string or callable, default='euclidean'
127+
Input to paired_distances function. Can be string specified
128+
in PAIRED_DISTANCES, including "euclidean", "manhattan", or
129+
"cosine." Alternatively, can be a callable function, which should
130+
take two arrays from X as input and return a value indicating
131+
the distance between them.
132+
133+
random_state : int or numpy.RandomState, default=None
134+
A pseudo random number generator object or a seed for it if int.
135+
See :term: `Glossary <random_state>`.
136+
137+
Returns
138+
-------
139+
neighborhood : sparse matrix of shape (n, n)
140+
Sparse matrix where the i-jth value is equal to the distance
141+
between X[i] and fit_X[j] for randomly sampled pairs of neighbors.
142+
The matrix is of CSR format.
143+
"""
144+
145+
from scipy.sparse import csr_matrix
146+
147+
X = self._validate_data(X, accept_sparse='csr')
148+
149+
n, d = X.shape
150+
n_neighbors = int(self.s * self.n_train_)
151+
152+
# Sample edges
153+
rows = np.repeat(np.arange(n), n_neighbors)
154+
cols = self.random_state_.randint(self.n_train_, size=n * n_neighbors)
155+
156+
# No edges sampled
157+
if n_neighbors < 1:
158+
return csr_matrix((n, self.n_train_))
159+
160+
distances = paired_distances(X[rows], self.fit_X_[cols], metric=self.metric)
161+
162+
# Keep only neighbors within epsilon-neighborhood
163+
if self.eps is not None:
164+
eps_neighb = np.where(distances <= self.eps)
165+
rows = rows[eps_neighb]
166+
cols = cols[eps_neighb]
167+
distances = distances[eps_neighb]
168+
169+
line_changes = np.bincount(rows + 1).cumsum()
170+
is_dupe = np.zeros(rows.shape[0], dtype=bool)
171+
172+
# Loop over each row in our neighborhood graph
173+
for start, stop in zip(line_changes, line_changes[1:]):
174+
# Sort each row by distance
175+
dist_order = np.argsort(distances[start:stop], kind='mergesort')
176+
distances[start:stop] = distances[start:stop][dist_order]
177+
cols[start:stop] = cols[start:stop][dist_order]
178+
179+
# Sort column indices and label duplicates
180+
# When consecutive elements in sorted array are equal,
181+
# it means there is a duplicate
182+
col_order = np.argsort(cols[start:stop], kind='mergesort')
183+
cols_tmp = cols[start:stop][col_order]
184+
is_dupe[start:stop][col_order[1:]] = cols_tmp[:-1] == cols_tmp[1:]
185+
186+
# Dedupe
187+
rows = rows[~is_dupe]
188+
cols = cols[~is_dupe]
189+
distances = distances[~is_dupe]
190+
191+
indptr = np.bincount(rows + 1, minlength=n + 1).cumsum()
192+
193+
neighborhood = csr_matrix((distances, cols, indptr),
194+
shape=(n, self.n_train_))
195+
196+
return neighborhood
197+
198+
def _more_tags(self):
199+
return {
200+
'_xfail_checks': {
201+
'check_methods_subset_invariance':
202+
'Fails for the transform method'
203+
}
204+
}

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.