scikit-learn
diff --git a/‎doc/whats_new/v1.0.rst
Copy file name to clipboardExpand all lines: doc/whats_new/v1.0.rst
+20-2Lines changed: 20 additions & 2 deletions b/‎doc/whats_new/v1.0.rst
Copy file name to clipboardExpand all lines: doc/whats_new/v1.0.rst
+20-2Lines changed: 20 additions & 2 deletions
diff --git a/‎sklearn/cluster/_k_means_fast.pxd renamed to ‎sklearn/cluster/_k_means_common.pxd
Copy file name to clipboard b/‎sklearn/cluster/_k_means_fast.pxd renamed to ‎sklearn/cluster/_k_means_common.pxd
Copy file name to clipboard
diff --git a/‎sklearn/cluster/_k_means_fast.pyx renamed to ‎sklearn/cluster/_k_means_common.pyx
Copy file name to clipboardExpand all lines: sklearn/cluster/_k_means_common.pyx
+9-109Lines changed: 9 additions & 109 deletions b/‎sklearn/cluster/_k_means_fast.pyx renamed to ‎sklearn/cluster/_k_means_common.pyx
Copy file name to clipboardExpand all lines: sklearn/cluster/_k_means_common.pyx
+9-109Lines changed: 9 additions & 109 deletions
diff --git a/‎sklearn/cluster/_k_means_elkan.pyx
Copy file name to clipboardExpand all lines: sklearn/cluster/_k_means_elkan.pyx
+7-7Lines changed: 7 additions & 7 deletions b/‎sklearn/cluster/_k_means_elkan.pyx
Copy file name to clipboardExpand all lines: sklearn/cluster/_k_means_elkan.pyx
+7-7Lines changed: 7 additions & 7 deletions
diff --git a/‎sklearn/cluster/_k_means_lloyd.pyx
Copy file name to clipboardExpand all lines: sklearn/cluster/_k_means_lloyd.pyx
+5-5Lines changed: 5 additions & 5 deletions b/‎sklearn/cluster/_k_means_lloyd.pyx
Copy file name to clipboardExpand all lines: sklearn/cluster/_k_means_lloyd.pyx
+5-5Lines changed: 5 additions & 5 deletions
@@ -95,12 +95,30 @@ Changelog
   in multicore settings. :pr:`19052` by
   :user:`Yusuke Nagasaka <YusukeNagasaka>`.
 
-- |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are
-  deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_.
+- |Efficiency| :class:`cluster.MiniBatchKMeans` is now faster in multicore
+  settings. :pr:`17622` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
+- |Fix| Fixed a bug in :class:`cluster.MiniBatchKMeans` where the sample
+  weights were partially ignored when the input is sparse. :pr:`17622` by
+  :user:`Jérémie du Boisberranger <jeremiedbb>`.
 
+- |Fix| Improved convergence detection based on center change in
+  :class:`cluster.MiniBatchKMeans` which was almost never achievable.
+  :pr:`17622` by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+  
 - |Fix| :class:`cluster.AgglomerativeClustering` now supports readonly
   memory-mapped datasets. :pr:`19883` by
   :user:`Julien Jerphanion <jjerphan>`.
 
+- |API| :class:`cluster.Birch` attributes, `fit_` and `partial_fit_`, are
+  deprecated and will be removed in 1.2. :pr:`19297` by `Thomas Fan`_.
+
+- |API| The default value for the `batch_size` parameter of
+  :class:`cluster.MiniBatchKMeans` was changed from 100 to 1024 for efficiency
+  reasons. The `n_iter_` attribute of :class:`cluster.MiniBatchKMeans` now
+  reports the number of started epochs and the `n_steps_` attribute reports
+  the number of mini batches processed. :pr:`17622` by
+  :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 :mod:`sklearn.compose`
 ......................
 
 
@@ -14,18 +14,15 @@
 
 import numpy as np
 cimport numpy as np
-cimport cython
 from cython cimport floating
+from cython.parallel cimport prange
 from libc.math cimport sqrt
 
 from ..utils.extmath import row_norms
 
 
 np.import_array()
 
-ctypedef np.float64_t DOUBLE
-ctypedef np.int32_t INT
-
 
 # Number of samples per data chunk defined as a global constant.
 CHUNK_SIZE = 256
@@ -103,7 +100,8 @@ cpdef floating _inertia_dense(
         np.ndarray[floating, ndim=2, mode='c'] X,  # IN
         floating[::1] sample_weight,               # IN
         floating[:, ::1] centers,                  # IN
-        int[::1] labels):                          # IN
+        int[::1] labels,                           # IN
+        int n_threads):
     """Compute inertia for dense input data
 
     Sum of squared distance between each sample and its assigned center.
@@ -116,7 +114,8 @@ cpdef floating _inertia_dense(
         floating sq_dist = 0.0
         floating inertia = 0.0
 
-    for i in range(n_samples):
+    for i in prange(n_samples, nogil=True, num_threads=n_threads,
+                    schedule='static'):
         j = labels[i]
         sq_dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
                                          n_features, True)
@@ -129,7 +128,8 @@ cpdef floating _inertia_sparse(
         X,                            # IN
         floating[::1] sample_weight,  # IN
         floating[:, ::1] centers,     # IN
-        int[::1] labels):             # IN
+        int[::1] labels,              # IN
+        int n_threads):
     """Compute inertia for sparse input data
 
     Sum of squared distance between each sample and its assigned center.
@@ -148,7 +148,8 @@ cpdef floating _inertia_sparse(
 
         floating[::1] centers_squared_norms = row_norms(centers, squared=True)
 
-    for i in range(n_samples):
+    for i in prange(n_samples, nogil=True, num_threads=n_threads,
+                    schedule='static'):
         j = labels[i]
         sq_dist = _euclidean_sparse_dense(
             X_data[X_indptr[i]: X_indptr[i + 1]],
@@ -286,104 +287,3 @@ cdef void _center_shift(
     for j in range(n_clusters):
         center_shift[j] = _euclidean_dense_dense(
             &centers_new[j, 0], &centers_old[j, 0], n_features, False)
-
-
-def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] sample_weight,
-                           np.ndarray[floating, ndim=1] x_squared_norms,
-                           np.ndarray[floating, ndim=2] centers,
-                           np.ndarray[floating, ndim=1] weight_sums,
-                           np.ndarray[INT, ndim=1] nearest_center,
-                           np.ndarray[floating, ndim=1] old_center,
-                           int compute_squared_diff):
-    """Incremental update of the centers for sparse MiniBatchKMeans.
-
-    Parameters
-    ----------
-
-    X : CSR matrix, dtype float
-        The complete (pre allocated) training set as a CSR matrix.
-
-    centers : array, shape (n_clusters, n_features)
-        The cluster centers
-
-    counts : array, shape (n_clusters,)
-         The vector in which we keep track of the numbers of elements in a
-         cluster
-
-    Returns
-    -------
-    inertia : float
-        The inertia of the batch prior to centers update, i.e. the sum
-        of squared distances to the closest center for each sample. This
-        is the objective function being minimized by the k-means algorithm.
-
-    squared_diff : float
-        The sum of squared update (squared norm of the centers position
-        change). If compute_squared_diff is 0, this computation is skipped and
-        0.0 is returned instead.
-
-    Both squared diff and inertia are commonly used to monitor the convergence
-    of the algorithm.
-    """
-    cdef:
-        np.ndarray[floating, ndim=1] X_data = X.data
-        np.ndarray[int, ndim=1] X_indices = X.indices
-        np.ndarray[int, ndim=1] X_indptr = X.indptr
-        unsigned int n_samples = X.shape[0]
-        unsigned int n_clusters = centers.shape[0]
-        unsigned int n_features = centers.shape[1]
-
-        unsigned int sample_idx, center_idx, feature_idx
-        unsigned int k
-        DOUBLE old_weight_sum, new_weight_sum
-        DOUBLE center_diff
-        DOUBLE squared_diff = 0.0
-
-    # move centers to the mean of both old and newly assigned samples
-    for center_idx in range(n_clusters):
-        old_weight_sum = weight_sums[center_idx]
-        new_weight_sum = old_weight_sum
-
-        # count the number of samples assigned to this center
-        for sample_idx in range(n_samples):
-            if nearest_center[sample_idx] == center_idx:
-                new_weight_sum += sample_weight[sample_idx]
-
-        if new_weight_sum == old_weight_sum:
-            # no new sample: leave this center as it stands
-            continue
-
-        # rescale the old center to reflect it previous accumulated weight
-        # with regards to the new data that will be incrementally contributed
-        if compute_squared_diff:
-            old_center[:] = centers[center_idx]
-        centers[center_idx] *= old_weight_sum
-
-        # iterate of over samples assigned to this cluster to move the center
-        # location by inplace summation
-        for sample_idx in range(n_samples):
-            if nearest_center[sample_idx] != center_idx:
-                continue
-
-            # inplace sum with new samples that are members of this cluster
-            # and update of the incremental squared difference update of the
-            # center position
-            for k in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]):
-                centers[center_idx, X_indices[k]] += X_data[k]
-
-        # inplace rescale center with updated count
-        if new_weight_sum > old_weight_sum:
-            # update the count statistics for this center
-            weight_sums[center_idx] = new_weight_sum
-
-            # re-scale the updated center with the total new counts
-            centers[center_idx] /= new_weight_sum
-
-            # update the incremental computation of the squared total
-            # centers position change
-            if compute_squared_diff:
-                for feature_idx in range(n_features):
-                    squared_diff += (old_center[feature_idx]
-                                     - centers[center_idx, feature_idx]) ** 2
-
-    return squared_diff
@@ -18,13 +18,13 @@ from libc.stdlib cimport calloc, free
 from libc.string cimport memset, memcpy
 
 from ..utils.extmath import row_norms
-from ._k_means_fast import CHUNK_SIZE
-from ._k_means_fast cimport _relocate_empty_clusters_dense
-from ._k_means_fast cimport _relocate_empty_clusters_sparse
-from ._k_means_fast cimport _euclidean_dense_dense
-from ._k_means_fast cimport _euclidean_sparse_dense
-from ._k_means_fast cimport _average_centers
-from ._k_means_fast cimport _center_shift
+from ._k_means_common import CHUNK_SIZE
+from ._k_means_common cimport _relocate_empty_clusters_dense
+from ._k_means_common cimport _relocate_empty_clusters_sparse
+from ._k_means_common cimport _euclidean_dense_dense
+from ._k_means_common cimport _euclidean_sparse_dense
+from ._k_means_common cimport _average_centers
+from ._k_means_common cimport _center_shift
 
 
 np.import_array()
 
@@ -11,16 +11,16 @@ cimport numpy as np
 from cython cimport floating
 from cython.parallel import prange, parallel
 from libc.stdlib cimport malloc, calloc, free
-from libc.string cimport memset, memcpy
+from libc.string cimport memset
 from libc.float cimport DBL_MAX, FLT_MAX
 
 from ..utils.extmath import row_norms
 from ..utils._cython_blas cimport _gemm
 from ..utils._cython_blas cimport RowMajor, Trans, NoTrans
-from ._k_means_fast import CHUNK_SIZE
-from ._k_means_fast cimport _relocate_empty_clusters_dense
-from ._k_means_fast cimport _relocate_empty_clusters_sparse
-from ._k_means_fast cimport _average_centers, _center_shift
+from ._k_means_common import CHUNK_SIZE
+from ._k_means_common cimport _relocate_empty_clusters_dense
+from ._k_means_common cimport _relocate_empty_clusters_sparse
+from ._k_means_common cimport _average_centers, _center_shift
 
 
 np.import_array()