Commit 11f9742

[ENH] Create public kmeans_plusplus including index output (#17937)

1 parent 0e6d415 · commit 11f9742

7 files changed (+229, -21 lines)

doc/modules/classes.rst

1 addition, 0 deletions

@@ -124,6 +124,7 @@ Functions
    cluster.dbscan
    cluster.estimate_bandwidth
    cluster.k_means
+   cluster.kmeans_plusplus
    cluster.mean_shift
    cluster.spectral_clustering
    cluster.ward_tree

doc/modules/clustering.rst

5 additions, 1 deletion

@@ -197,7 +197,11 @@ initializations of the centroids. One method to help address this issue is the
 k-means++ initialization scheme, which has been implemented in scikit-learn
 (use the ``init='k-means++'`` parameter). This initializes the centroids to be
 (generally) distant from each other, leading to provably better results than
-random initialization, as shown in the reference.
+random initialization, as shown in the reference.
+
+K-means++ can also be called independently to select seeds for other
+clustering algorithms, see :func:`sklearn.cluster.kmeans_plusplus` for details
+and example usage.
 
 The algorithm supports sample weights, which can be given by a parameter
 ``sample_weight``. This allows to assign more weight to some samples when
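
For orientation only, a minimal sketch (not part of the commit) of the independent-seeding workflow the added paragraph describes. The make_blobs data and the downstream KMeans estimator are illustrative assumptions; kmeans_plusplus and the array-valued init parameter are the documented API:

    # Sketch: pick seeds with kmeans_plusplus, then hand them to a clusterer.
    # make_blobs and KMeans are illustrative choices, not part of the diff.
    from sklearn.cluster import KMeans, kmeans_plusplus
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=500, centers=3, random_state=0)

    # Select three seeds only; no clustering happens at this point.
    centers, indices = kmeans_plusplus(X, n_clusters=3, random_state=0)

    # Reuse the seeds as an explicit init; n_init=1 since the init is fixed.
    km = KMeans(n_clusters=3, init=centers, n_init=1).fit(X)
    print(km.cluster_centers_)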

doc/whats_new/v0.24.rst

4 additions, 0 deletions

@@ -97,6 +97,10 @@ Changelog
   `init_size_`, are deprecated and will be removed in 0.26. :pr:`17864` by
   :user:`Jérémie du Boisberranger <jeremiedbb>`.
 
+- |Enhancement| Added :func:`cluster.kmeans_plusplus` as public function.
+  Initialization by KMeans++ can now be called separately to generate
+  initial cluster centroids. :pr:`17937` by :user:`g-walsh`
+
 :mod:`sklearn.compose`
 ......................
 

examples/cluster/plot_kmeans_plusplus.py (new file)
45 additions, 0 deletions

@@ -0,0 +1,45 @@
+"""
+===========================================================
+An example of K-Means++ initialization
+===========================================================
+
+An example to show the output of the :func:`sklearn.cluster.kmeans_plusplus`
+function for generating initial seeds for clustering.
+
+K-Means++ is used as the default initialization for :ref:`k_means`.
+
+"""
+print(__doc__)
+
+from sklearn.cluster import kmeans_plusplus
+from sklearn.datasets import make_blobs
+import matplotlib.pyplot as plt
+
+# Generate sample data
+n_samples = 4000
+n_components = 4
+
+X, y_true = make_blobs(n_samples=n_samples,
+                       centers=n_components,
+                       cluster_std=0.60,
+                       random_state=0)
+X = X[:, ::-1]
+
+# Calculate seeds from k-means++
+centers_init, indices = kmeans_plusplus(X, n_clusters=4,
+                                        random_state=0)
+
+# Plot init seeds alongside sample data
+plt.figure(1)
+colors = ['#4EACC5', '#FF9C34', '#4E9A06', 'm']
+
+for k, col in enumerate(colors):
+    cluster_data = y_true == k
+    plt.scatter(X[cluster_data, 0], X[cluster_data, 1],
+                c=col, marker='.', s=10)
+
+plt.scatter(centers_init[:, 0], centers_init[:, 1], c='b', s=50)
+plt.title("K-Means++ Initialization")
+plt.xticks([])
+plt.yticks([])
+plt.show()
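
A side note on the docstring's claim that K-Means++ is the default initialization for KMeans: a short hedged check of that claim, with the make_blobs call repeated so the snippet stands alone:

    # Sketch: the explicit kmeans_plusplus call above mirrors what
    # KMeans.fit() already does, since its default init is 'k-means++'.
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=4000, centers=4, cluster_std=0.60,
                      random_state=0)

    km = KMeans(n_clusters=4, random_state=0)
    assert km.get_params()["init"] == "k-means++"  # default scheme
    labels = km.fit_predict(X)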

sklearn/cluster/__init__.py

2 additions, 1 deletion

@@ -9,7 +9,7 @@
 from ._affinity_propagation import affinity_propagation, AffinityPropagation
 from ._agglomerative import (ward_tree, AgglomerativeClustering,
                              linkage_tree, FeatureAgglomeration)
-from ._kmeans import k_means, KMeans, MiniBatchKMeans
+from ._kmeans import k_means, KMeans, MiniBatchKMeans, kmeans_plusplus
 from ._dbscan import dbscan, DBSCAN
 from ._optics import (OPTICS, cluster_optics_dbscan, compute_optics_graph,
                       cluster_optics_xi)
@@ -34,6 +34,7 @@
     'estimate_bandwidth',
     'get_bin_seeds',
     'k_means',
+    'kmeans_plusplus',
     'linkage_tree',
     'mean_shift',
     'spectral_clustering',

sklearn/cluster/_kmeans.py

114 additions, 18 deletions

@@ -47,14 +47,15 @@
 # Initialization heuristic
 
 
-def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
-    """Init n_clusters seeds according to k-means++
+def _kmeans_plusplus(X, n_clusters, x_squared_norms,
+                     random_state, n_local_trials=None):
+    """Computational component for initialization of n_clusters by
+    k-means++. Prior validation of data is assumed.
 
     Parameters
     ----------
     X : {ndarray, sparse matrix} of shape (n_samples, n_features)
-        The data to pick seeds for. To avoid memory copy, the input data
-        should be double precision (dtype=np.float64).
+        The data to pick seeds for.
 
     n_clusters : int
         The number of seeds to choose.
@@ -72,35 +73,34 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
         Set to None to make the number of trials depend logarithmically
         on the number of seeds (2+log(k)); this is the default.
 
-    Notes
-    -----
-    Selects initial cluster centers for k-mean clustering in a smart way
-    to speed up convergence. see: Arthur, D. and Vassilvitskii, S.
-    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
-    on Discrete algorithms. 2007
+    Returns
+    -------
+    centers : ndarray of shape (n_clusters, n_features)
+        The initial centers for k-means.
 
-    Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip,
-    which is the implementation used in the aforementioned paper.
+    indices : ndarray of shape (n_clusters,)
+        The index location of the chosen centers in the data array X. For a
+        given index and center, X[index] = center.
     """
     n_samples, n_features = X.shape
 
     centers = np.empty((n_clusters, n_features), dtype=X.dtype)
 
-    assert x_squared_norms is not None, 'x_squared_norms None in _k_init'
-
     # Set the number of local seeding trials if none is given
     if n_local_trials is None:
         # This is what Arthur/Vassilvitskii tried, but did not report
         # specific results for other than mentioning in the conclusion
         # that it helped.
         n_local_trials = 2 + int(np.log(n_clusters))
 
-    # Pick first center randomly
+    # Pick first center randomly and track index of point
    center_id = random_state.randint(n_samples)
+    indices = np.full(n_clusters, -1, dtype=int)
     if sp.issparse(X):
         centers[0] = X[center_id].toarray()
     else:
         centers[0] = X[center_id]
+    indices[0] = center_id
 
     # Initialize list of closest distances and calculate current potential
     closest_dist_sq = euclidean_distances(
@@ -139,8 +139,9 @@ def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
             centers[c] = X[best_candidate].toarray()
         else:
             centers[c] = X[best_candidate]
+        indices[c] = best_candidate
 
-    return centers
+    return centers, indices
 
 
 ###############################################################################
@@ -936,8 +937,9 @@ def _init_centroids(self, X, x_squared_norms, init, random_state,
         n_samples = X.shape[0]
 
         if isinstance(init, str) and init == 'k-means++':
-            centers = _k_init(X, n_clusters, random_state=random_state,
-                              x_squared_norms=x_squared_norms)
+            centers, _ = _kmeans_plusplus(X, n_clusters,
+                                          random_state=random_state,
+                                          x_squared_norms=x_squared_norms)
         elif isinstance(init, str) and init == 'random':
             seeds = random_state.permutation(n_samples)[:n_clusters]
             centers = X[seeds]
@@ -1925,3 +1927,97 @@ def _more_tags(self):
                 'zero sample_weight is not equivalent to removing samples',
             }
         }
+
+
+def kmeans_plusplus(X, n_clusters, *, x_squared_norms=None,
+                    random_state=None, n_local_trials=None):
+    """Init n_clusters seeds according to k-means++
+
+    .. versionadded:: 0.24
+
+    Parameters
+    ----------
+    X : {array-like, sparse matrix} of shape (n_samples, n_features)
+        The data to pick seeds from.
+
+    n_clusters : int
+        The number of centroids to initialize.
+
+    x_squared_norms : array-like of shape (n_samples,), default=None
+        Squared Euclidean norm of each data point.
+
+    random_state : int or RandomState instance, default=None
+        Determines random number generation for centroid initialization. Pass
+        an int for reproducible output across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
+    n_local_trials : int, default=None
+        The number of seeding trials for each center (except the first),
+        of which the one reducing inertia the most is greedily chosen.
+        Set to None to make the number of trials depend logarithmically
+        on the number of seeds (2+log(k)).
+
+    Returns
+    -------
+    centers : ndarray of shape (n_clusters, n_features)
+        The initial centers for k-means.
+
+    indices : ndarray of shape (n_clusters,)
+        The index location of the chosen centers in the data array X. For a
+        given index and center, X[index] = center.
+
+    Notes
+    -----
+    Selects initial cluster centers for k-means clustering in a smart way
+    to speed up convergence. See: Arthur, D. and Vassilvitskii, S.
+    "k-means++: the advantages of careful seeding". ACM-SIAM symposium
+    on Discrete algorithms. 2007
+
+    Examples
+    --------
+
+    >>> from sklearn.cluster import kmeans_plusplus
+    >>> import numpy as np
+    >>> X = np.array([[1, 2], [1, 4], [1, 0],
+    ...               [10, 2], [10, 4], [10, 0]])
+    >>> centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0)
+    >>> centers
+    array([[10,  4],
+           [ 1,  0]])
+    >>> indices
+    array([4, 2])
+    """
+
+    # Check data
+    check_array(X, accept_sparse='csr',
+                dtype=[np.float64, np.float32])
+
+    if X.shape[0] < n_clusters:
+        raise ValueError(f"n_samples={X.shape[0]} should be >= "
+                         f"n_clusters={n_clusters}.")
+
+    # Check parameters
+    if x_squared_norms is None:
+        x_squared_norms = row_norms(X, squared=True)
+    else:
+        x_squared_norms = check_array(x_squared_norms,
+                                      dtype=X.dtype,
+                                      ensure_2d=False)
+
+    if x_squared_norms.shape[0] != X.shape[0]:
+        raise ValueError(
+            f"The length of x_squared_norms {x_squared_norms.shape[0]} should "
+            f"be equal to the length of n_samples {X.shape[0]}.")
+
+    if n_local_trials is not None and n_local_trials < 1:
+        raise ValueError(
+            f"n_local_trials is set to {n_local_trials} but should be an "
+            f"integer value greater than zero.")
+
+    random_state = check_random_state(random_state)
+
+    # Call private k-means++
+    centers, indices = _kmeans_plusplus(X, n_clusters, x_squared_norms,
+                                        random_state, n_local_trials)
+
+    return centers, indices
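
To make the new function's documented contracts concrete, a hedged sketch (not part of the commit): X[indices] equals centers, and a precomputed x_squared_norms gives the same result as the default None, which is what the new test_kmeans_plusplus_norms below also verifies:

    # Sketch of the documented contracts of kmeans_plusplus; the toy X
    # reuses the docstring example, cast to float64 as an assumption.
    import numpy as np
    from sklearn.cluster import kmeans_plusplus
    from sklearn.utils.extmath import row_norms

    X = np.array([[1, 2], [1, 4], [1, 0],
                  [10, 2], [10, 4], [10, 0]], dtype=np.float64)

    centers, indices = kmeans_plusplus(X, n_clusters=2, random_state=0)
    assert np.allclose(X[indices], centers)   # X[index] = center

    norms = row_norms(X, squared=True)        # optional precomputation
    centers2, _ = kmeans_plusplus(X, n_clusters=2, random_state=0,
                                  x_squared_norms=norms)
    assert np.allclose(centers, centers2)     # same seeds either way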

sklearn/cluster/tests/test_k_means.py

58 additions, 1 deletion

@@ -20,7 +20,7 @@
 from sklearn.metrics import pairwise_distances
 from sklearn.metrics import pairwise_distances_argmin
 from sklearn.metrics.cluster import v_measure_score
-from sklearn.cluster import KMeans, k_means
+from sklearn.cluster import KMeans, k_means, kmeans_plusplus
 from sklearn.cluster import MiniBatchKMeans
 from sklearn.cluster._kmeans import _labels_inertia
 from sklearn.cluster._kmeans import _mini_batch_step
@@ -1030,3 +1030,60 @@ def test_minibatch_kmeans_wrong_params(param, match):
     # are passed for the MiniBatchKMeans specific parameters
     with pytest.raises(ValueError, match=match):
         MiniBatchKMeans(**param).fit(X)
+
+
+@pytest.mark.parametrize("param, match", [
+    ({"n_local_trials": 0},
+     r"n_local_trials is set to 0 but should be an "
+     r"integer value greater than zero"),
+    ({"x_squared_norms": X[:2]},
+     r"The length of x_squared_norms .* should "
+     r"be equal to the length of n_samples")]
+)
+def test_kmeans_plusplus_wrong_params(param, match):
+    with pytest.raises(ValueError, match=match):
+        kmeans_plusplus(X, n_clusters, **param)
+
+
+@pytest.mark.parametrize("data", [X, X_csr])
+@pytest.mark.parametrize("dtype", [np.float64, np.float32])
+def test_kmeans_plusplus_output(data, dtype):
+    # Check for the correct number of seeds and all positive values
+    data = data.astype(dtype)
+    centers, indices = kmeans_plusplus(data, n_clusters)
+
+    # Check there are the correct number of indices and that all indices are
+    # positive and within the number of samples
+    assert indices.shape[0] == n_clusters
+    assert (indices >= 0).all()
+    assert (indices <= data.shape[0]).all()
+
+    # Check for the correct number of seeds and that they are bound by the data
+    assert centers.shape[0] == n_clusters
+    assert (centers.max(axis=0) <= data.max(axis=0)).all()
+    assert (centers.min(axis=0) >= data.min(axis=0)).all()
+
+    # Check that indices correspond to reported centers
+    # Use X for comparison rather than data; test still works against centers
+    # calculated with sparse data.
+    assert_allclose(X[indices].astype(dtype), centers)
+
+
+@pytest.mark.parametrize("x_squared_norms", [row_norms(X, squared=True), None])
+def test_kmeans_plusplus_norms(x_squared_norms):
+    # Check that defining x_squared_norms returns the same as default=None.
+    centers, indices = kmeans_plusplus(X, n_clusters,
+                                       x_squared_norms=x_squared_norms)
+
+    assert_allclose(X[indices], centers)
+
+
+def test_kmeans_plusplus_dataorder():
+    # Check that memory layout does not affect result
+    centers_c, _ = kmeans_plusplus(X, n_clusters, random_state=0)
+
+    X_fortran = np.asfortranarray(X)
+
+    centers_fortran, _ = kmeans_plusplus(X_fortran, n_clusters, random_state=0)
+
+    assert_allclose(centers_c, centers_fortran)
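
These tests lean on module-level fixtures (X, X_csr, n_clusters, row_norms, assert_allclose) defined near the top of test_k_means.py and not shown in this hunk. A hedged sketch of plausible stand-ins for running the new tests in isolation; the exact values in the real file differ:

    # Assumed stand-ins for the module-level test fixtures; the real
    # definitions live at the top of test_k_means.py.
    import numpy as np
    import scipy.sparse as sp
    from numpy.testing import assert_allclose
    from sklearn.datasets import make_blobs
    from sklearn.utils.extmath import row_norms

    n_clusters = 3                     # assumed cluster count
    X, _ = make_blobs(n_samples=100, centers=n_clusters, random_state=42)
    X_csr = sp.csr_matrix(X)           # same data in sparse CSR form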
