@@ -14,18 +14,15 @@

 import numpy as np
 cimport numpy as np
-cimport cython
 from cython cimport floating
+from cython.parallel cimport prange
 from libc.math cimport sqrt

 from ..utils.extmath import row_norms


 np.import_array()

-ctypedef np.float64_t DOUBLE
-ctypedef np.int32_t INT
-

 # Number of samples per data chunk defined as a global constant.
 CHUNK_SIZE = 256
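For context: `prange` (imported above) is Cython's OpenMP-backed parallel loop, and an in-place `+=` on a scalar inside a `prange` body is compiled as a per-thread reduction. A minimal sketch of that pattern, assuming only that the `cython` package is installed; run uncompiled (pure-Python mode), `prange` degrades to a serial `range` and the parallel keywords are ignored:

    import numpy as np
    from cython.parallel import prange  # falls back to range() when uncompiled

    def sum_reduction_sketch(dists, n_threads=4):
        # Compiled with OpenMP, `total += ...` becomes a reduction variable
        # summed across threads at the end of the loop; uncompiled, this is
        # an ordinary serial accumulation.
        total = 0.0
        for i in prange(dists.shape[0], num_threads=n_threads,
                        schedule='static'):
            total += dists[i]
        return total

    print(sum_reduction_sketch(np.array([1.0, 2.0, 3.0])))  # 6.0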
@@ -103,7 +100,8 @@ cpdef floating _inertia_dense(
         np.ndarray[floating, ndim=2, mode='c'] X,  # IN
         floating[::1] sample_weight,               # IN
         floating[:, ::1] centers,                  # IN
-        int[::1] labels):                          # IN
+        int[::1] labels,                           # IN
+        int n_threads):
     """Compute inertia for dense input data

     Sum of squared distance between each sample and its assigned center.
@@ -116,7 +114,8 @@ cpdef floating _inertia_dense(
         floating sq_dist = 0.0
         floating inertia = 0.0

-    for i in range(n_samples):
+    for i in prange(n_samples, nogil=True, num_threads=n_threads,
+                    schedule='static'):
         j = labels[i]
         sq_dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
                                          n_features, True)
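For reference, the weighted inertia this loop accumulates, sum_i w_i * ||x_i - c_{label_i}||^2, can be written as a short serial NumPy check; a sketch for verification only, not the library's code path, and all names here are local to the example:

    import numpy as np

    def inertia_dense_reference(X, sample_weight, centers, labels):
        # Weighted sum of squared distances from each sample to its
        # assigned center: the objective k-means minimizes.
        diff = X - centers[labels]
        return float(np.sum(sample_weight * np.einsum('ij,ij->i', diff, diff)))

Such a reference is handy for testing the parallel kernel: both should agree to floating-point tolerance regardless of n_threads.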
@@ -129,7 +128,8 @@ cpdef floating _inertia_sparse(
         X,                            # IN
         floating[::1] sample_weight,  # IN
         floating[:, ::1] centers,     # IN
-        int[::1] labels):             # IN
+        int[::1] labels,              # IN
+        int n_threads):
     """Compute inertia for sparse input data

     Sum of squared distance between each sample and its assigned center.
@@ -148,7 +148,8 @@ cpdef floating _inertia_sparse(

         floating[::1] centers_squared_norms = row_norms(centers, squared=True)

-    for i in range(n_samples):
+    for i in prange(n_samples, nogil=True, num_threads=n_threads,
+                    schedule='static'):
         j = labels[i]
         sq_dist = _euclidean_sparse_dense(
             X_data[X_indptr[i]: X_indptr[i + 1]],
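The sparse path never densifies X: with `centers_squared_norms` precomputed above, the squared distance can be expanded as ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2, so only the stored nonzeros of each CSR row are ever touched. A serial SciPy sketch of the same expansion, for illustration only; the function name is local to the example:

    import numpy as np
    import scipy.sparse as sp

    def inertia_sparse_reference(X, sample_weight, centers, labels):
        # ||x - c||^2 = ||x||^2 - 2 * <x, c> + ||c||^2, using only the
        # CSR nonzeros for the first two terms.
        X = sp.csr_matrix(X)
        x_sq = np.asarray(X.multiply(X).sum(axis=1)).ravel()
        c_sq = np.einsum('ij,ij->i', centers, centers)
        cross = np.asarray(X.multiply(centers[labels]).sum(axis=1)).ravel()
        return float(np.sum(sample_weight * (x_sq - 2 * cross + c_sq[labels])))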
@@ -286,104 +287,3 @@ cdef void _center_shift(
     for j in range(n_clusters):
         center_shift[j] = _euclidean_dense_dense(
             &centers_new[j, 0], &centers_old[j, 0], n_features, False)
-
-
-def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] sample_weight,
-                           np.ndarray[floating, ndim=1] x_squared_norms,
-                           np.ndarray[floating, ndim=2] centers,
-                           np.ndarray[floating, ndim=1] weight_sums,
-                           np.ndarray[INT, ndim=1] nearest_center,
-                           np.ndarray[floating, ndim=1] old_center,
-                           int compute_squared_diff):
-    """Incremental update of the centers for sparse MiniBatchKMeans.
-
-    Parameters
-    ----------
-
-    X : CSR matrix, dtype float
-        The complete (pre-allocated) training set as a CSR matrix.
-
-    centers : array, shape (n_clusters, n_features)
-        The cluster centers.
-
-    weight_sums : array, shape (n_clusters,)
-        The vector in which we keep track of the accumulated weight of each
-        cluster.
-
-    Returns
-    -------
-    inertia : float
-        The inertia of the batch prior to centers update, i.e. the sum
-        of squared distances to the closest center for each sample. This
-        is the objective function being minimized by the k-means algorithm.
-
-    squared_diff : float
-        The sum of squared update (squared norm of the centers position
-        change). If compute_squared_diff is 0, this computation is skipped and
-        0.0 is returned instead.
-
-    Both squared_diff and inertia are commonly used to monitor the convergence
-    of the algorithm.
-    """
-    cdef:
-        np.ndarray[floating, ndim=1] X_data = X.data
-        np.ndarray[int, ndim=1] X_indices = X.indices
-        np.ndarray[int, ndim=1] X_indptr = X.indptr
-        unsigned int n_samples = X.shape[0]
-        unsigned int n_clusters = centers.shape[0]
-        unsigned int n_features = centers.shape[1]
-
-        unsigned int sample_idx, center_idx, feature_idx
-        unsigned int k
-        DOUBLE old_weight_sum, new_weight_sum
-        DOUBLE center_diff
-        DOUBLE squared_diff = 0.0
-
-    # move centers to the mean of both old and newly assigned samples
-    for center_idx in range(n_clusters):
-        old_weight_sum = weight_sums[center_idx]
-        new_weight_sum = old_weight_sum
-
-        # count the number of samples assigned to this center
-        for sample_idx in range(n_samples):
-            if nearest_center[sample_idx] == center_idx:
-                new_weight_sum += sample_weight[sample_idx]
-
-        if new_weight_sum == old_weight_sum:
-            # no new sample: leave this center as it stands
-            continue
-
-        # rescale the old center to reflect its previously accumulated weight
-        # with regards to the new data that will be incrementally contributed
-        if compute_squared_diff:
-            old_center[:] = centers[center_idx]
-        centers[center_idx] *= old_weight_sum
-
-        # iterate over samples assigned to this cluster to move the center
-        # location by inplace summation
-        for sample_idx in range(n_samples):
-            if nearest_center[sample_idx] != center_idx:
-                continue
-
-            # inplace sum with new samples that are members of this cluster
-            # and update of the incremental squared difference update of the
-            # center position
-            for k in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]):
-                centers[center_idx, X_indices[k]] += X_data[k]
-
-        # inplace rescale center with updated count
-        if new_weight_sum > old_weight_sum:
-            # update the count statistics for this center
-            weight_sums[center_idx] = new_weight_sum
-
-            # re-scale the updated center with the total new counts
-            centers[center_idx] /= new_weight_sum
-
-            # update the incremental computation of the squared total
-            # centers position change
-            if compute_squared_diff:
-                for feature_idx in range(n_features):
-                    squared_diff += (old_center[feature_idx]
-                                     - centers[center_idx, feature_idx]) ** 2
-
-    return squared_diff
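For reference, the update rule the removed helper implemented: each center is first scaled by its accumulated weight, the raw feature values of newly assigned samples are summed in over the CSR nonzeros, and the result is divided by the new total weight; `squared_diff` tracks the squared norm of the resulting center motion. A dense NumPy sketch of that rule, illustration only (the function and variable names are my own, and this is not the code that supersedes the helper):

    import numpy as np

    def minibatch_update_reference(X_batch, sample_weight, centers,
                                   weight_sums, nearest_center):
        # Incremental weighted-mean update per center, mirroring the
        # deleted _mini_batch_update_csr logic on a dense batch.
        squared_diff = 0.0
        for j in range(centers.shape[0]):
            mask = nearest_center == j
            added = sample_weight[mask].sum()
            if added == 0:
                continue  # no new samples: leave this center as it stands
            old_center = centers[j].copy()
            new_weight = weight_sums[j] + added
            # rescale by the old weight, add the new samples, renormalize
            centers[j] = (weight_sums[j] * centers[j]
                          + X_batch[mask].sum(axis=0)) / new_weight
            weight_sums[j] = new_weight
            squared_diff += float(((old_center - centers[j]) ** 2).sum())
        return squared_diff

As in the original, `squared_diff` (center movement) and the batch inertia are the two signals used to monitor MiniBatchKMeans convergence.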