[feat] enable the n_jobs for mutual info regression and classifier #21409

Status: Open. Wants to merge 6 commits into base: main.
doc/whats_new/v1.1.rst (9 additions, 0 deletions)

@@ -68,6 +68,15 @@ Changelog
error when 'min_df' or 'max_df' are floating-point numbers greater than 1.
:pr:`20752` by :user:`Alek Lefebvre <AlekLefebvre>`.

:mod:`sklearn.feature_selection`
.................................

- |Enhancement| Added an ``n_jobs`` parameter to
  :func:`feature_selection.mutual_info_regression` and
  :func:`feature_selection.mutual_info_classif` to allow parallel
  processing of the nearest neighbors searches.
:pr:`21409` by :user:`Bingo Li <Bingoko>`.

:mod:`sklearn.impute`
.....................

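For orientation, here is a quick usage sketch of the new parameter. It assumes this PR's branch is installed (released scikit-learn does not accept n_jobs in these functions), and the data below is illustrative only.

import numpy as np
from sklearn.feature_selection import mutual_info_regression

rng = np.random.RandomState(0)
X = rng.rand(1000, 20)
y = X[:, 0] + 0.1 * rng.randn(1000)  # only feature 0 is informative

# n_jobs=-1 uses all processors for the underlying neighbors searches.
mi = mutual_info_regression(X, y, n_neighbors=3, n_jobs=-1, random_state=0)
print(int(np.argmax(mi)))  # expected: 0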
sklearn/feature_selection/_mutual_info.py (76 additions, 13 deletions)

@@ -14,7 +14,7 @@
from ..utils.multiclass import check_classification_targets


def _compute_mi_cc(x, y, n_neighbors):
def _compute_mi_cc(x, y, n_neighbors, n_jobs):
"""Compute mutual information between two continuous variables.

Parameters
@@ -26,6 +26,12 @@ def _compute_mi_cc(x, y, n_neighbors):
n_neighbors : int
Number of nearest neighbors to search for each point, see [1]_.

n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.

Returns
-------
mi : float
@@ -51,7 +57,7 @@ def _compute_mi_cc(x, y, n_neighbors):
xy = np.hstack((x, y))

# Here we rely on NearestNeighbors to select the fastest algorithm.
nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors)
nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors, n_jobs=n_jobs)

nn.fit(xy)
radius = nn.kneighbors()[0]
@@ -77,7 +83,7 @@ def _compute_mi_cc(x, y, n_neighbors):
return max(0, mi)
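For context, _compute_mi_cc implements the Kraskov-Stögbauer-Grassberger (KSG) k-NN estimator, and n_jobs is simply forwarded to the neighbors searches. Below is a minimal self-contained sketch of the same idea; the helper name ksg_mi_sketch is hypothetical and the numerics are simplified relative to the actual implementation.

import numpy as np
from scipy.special import digamma
from sklearn.neighbors import KDTree, NearestNeighbors

def ksg_mi_sketch(x, y, n_neighbors=3, n_jobs=None):
    x, y = x.reshape(-1, 1), y.reshape(-1, 1)
    n_samples = len(x)
    # Distance to the k-th neighbor in the joint (x, y) space, using the
    # Chebyshev (max-coordinate) metric as in the KSG paper.
    nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors, n_jobs=n_jobs)
    nn.fit(np.hstack((x, y)))
    radius = np.nextafter(nn.kneighbors()[0][:, -1], 0)
    # Count marginal neighbors strictly inside that radius (minus self).
    nx = KDTree(x, metric="chebyshev").query_radius(x, radius, count_only=True) - 1
    ny = KDTree(y, metric="chebyshev").query_radius(y, radius, count_only=True) - 1
    mi = (digamma(n_samples) + digamma(n_neighbors)
          - np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1)))
    return max(0.0, mi)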


def _compute_mi_cd(c, d, n_neighbors):
def _compute_mi_cd(c, d, n_neighbors, n_jobs):
"""Compute mutual information between continuous and discrete variables.

Parameters
@@ -91,6 +97,12 @@ def _compute_mi_cd(c, d, n_neighbors):
n_neighbors : int
Number of nearest neighbors to search for each point, see [1]_.

n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.

Returns
-------
mi : float
@@ -115,7 +127,7 @@ def _compute_mi_cd(c, d, n_neighbors):
radius = np.empty(n_samples)
label_counts = np.empty(n_samples)
k_all = np.empty(n_samples)
nn = NearestNeighbors()
nn = NearestNeighbors(n_jobs=n_jobs)
for label in np.unique(d):
mask = d == label
count = np.sum(mask)
@@ -150,7 +162,7 @@ def _compute_mi_cd(c, d, n_neighbors):
return max(0, mi)


def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3):
def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3, n_jobs=None):
"""Compute mutual information between two variables.

This is a simple wrapper which selects a proper function to call based on
@@ -159,11 +171,11 @@ def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3):
if x_discrete and y_discrete:
return mutual_info_score(x, y)
elif x_discrete and not y_discrete:
return _compute_mi_cd(y, x, n_neighbors)
return _compute_mi_cd(y, x, n_neighbors, n_jobs)
elif not x_discrete and y_discrete:
return _compute_mi_cd(x, y, n_neighbors)
return _compute_mi_cd(x, y, n_neighbors, n_jobs)
else:
return _compute_mi_cc(x, y, n_neighbors)
return _compute_mi_cc(x, y, n_neighbors, n_jobs)


def _iterate_columns(X, columns=None):
@@ -202,6 +214,7 @@ def _estimate_mi(
discrete_features="auto",
discrete_target=False,
n_neighbors=3,
n_jobs=None,
copy=True,
random_state=None,
):
@@ -230,6 +243,12 @@
see [1]_ and [2]_. Higher values reduce variance of the estimation, but
could introduce a bias.

n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.

copy : bool, default=True
Whether to make a copy of the given data. If set to False, the initial
data will be overwritten.
@@ -298,15 +317,22 @@ def _estimate_mi(
y += 1e-10 * np.maximum(1, np.mean(np.abs(y))) * rng.randn(n_samples)

mi = [
_compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
_compute_mi(x, y, discrete_feature, discrete_target, n_neighbors, n_jobs)
for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
]

return np.array(mi)
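Note that the per-feature loop above stays sequential; the new parallelism lives inside each neighbors query. A hypothetical alternative, not what this PR does, would be to parallelize across features with joblib:

from joblib import Parallel, delayed

# Illustrative only: feature-level parallelism instead of (or on top of)
# the per-search n_jobs that this PR threads through.
mi = Parallel(n_jobs=n_jobs)(
    delayed(_compute_mi)(x, y, discrete_feature, discrete_target, n_neighbors)
    for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
)
return np.array(mi)

The two approaches could in principle be combined, though nesting joblib workers over parallel searches needs care to avoid oversubscription.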


def mutual_info_regression(
X, y, *, discrete_features="auto", n_neighbors=3, copy=True, random_state=None
X,
y,
*,
discrete_features="auto",
n_neighbors=3,
n_jobs=None,
copy=True,
random_state=None,
):
"""Estimate mutual information for a continuous target variable.

@@ -342,6 +368,12 @@ def mutual_info_regression(
see [2]_ and [3]_. Higher values reduce variance of the estimation, but
could introduce a bias.

n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.

copy : bool, default=True
Whether to make a copy of the given data. If set to False, the initial
data will be overwritten.
@@ -381,11 +413,27 @@ def mutual_info_regression(
.. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy
of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
"""
return _estimate_mi(X, y, discrete_features, False, n_neighbors, copy, random_state)
return _estimate_mi(
X=X,
y=y,
discrete_features=discrete_features,
discrete_target=False,
n_neighbors=n_neighbors,
n_jobs=n_jobs,
copy=copy,
random_state=random_state,
)


def mutual_info_classif(
X, y, *, discrete_features="auto", n_neighbors=3, copy=True, random_state=None
X,
y,
*,
discrete_features="auto",
n_neighbors=3,
n_jobs=None,
copy=True,
random_state=None,
):
"""Estimate mutual information for a discrete target variable.

@@ -421,6 +469,12 @@ def mutual_info_classif(
see [2]_ and [3]_. Higher values reduce variance of the estimation, but
could introduce a bias.

n_jobs : int, default=None
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.

copy : bool, default=True
Whether to make a copy of the given data. If set to False, the initial
data will be overwritten.
@@ -461,4 +515,13 @@ def mutual_info_classif(
of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
"""
check_classification_targets(y)
return _estimate_mi(X, y, discrete_features, True, n_neighbors, copy, random_state)
return _estimate_mi(
X=X,
y=y,
discrete_features=discrete_features,
discrete_target=True,
n_neighbors=n_neighbors,
n_jobs=n_jobs,
copy=copy,
random_state=random_state,
)
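A matching usage sketch for the classification variant (again assuming this branch is installed; the dataset is illustrative):

from sklearn.datasets import make_classification
from sklearn.feature_selection import mutual_info_classif

X, y = make_classification(
    n_samples=500, n_features=10, n_informative=3, random_state=0
)
# Two workers for the neighbors searches behind the MI estimates.
mi = mutual_info_classif(X, y, n_neighbors=3, n_jobs=2, random_state=0)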
sklearn/feature_selection/tests/test_mutual_info.py (36 additions, 0 deletions)

@@ -19,6 +19,7 @@ def test_compute_mi_dd():
I_xy = H_x + H_y - H_xy

assert_almost_equal(_compute_mi(x, y, True, True), I_xy)
assert_almost_equal(_compute_mi(x, y, True, True, n_jobs=2), I_xy)


def test_compute_mi_cc():
@@ -53,6 +54,10 @@ def test_compute_mi_cc():
I_computed = _compute_mi(x, y, False, False, n_neighbors)
assert_almost_equal(I_computed, I_theory, 1)

for n_neighbors in [3, 5, 7]:
I_computed_ = _compute_mi(x, y, False, False, n_neighbors, n_jobs=2)
assert_almost_equal(I_computed_, I_theory, 1)


def test_compute_mi_cd():
# To test define a joint distribution as follows:
@@ -90,6 +95,10 @@ def test_compute_mi_cd():
I_computed = _compute_mi(x, y, True, False, n_neighbors)
assert_almost_equal(I_computed, I_theory, 1)

for n_neighbors in [3, 5, 7]:
I_computed_ = _compute_mi(x, y, True, False, n_neighbors, n_jobs=2)
assert_almost_equal(I_computed_, I_theory, 1)


def test_compute_mi_cd_unique_label():
# Test that adding unique label doesn't change MI.
@@ -102,12 +111,15 @@ def test_compute_mi_cd_unique_label():
y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask))

mi_1 = _compute_mi(x, y, True, False)
mi_1_ = _compute_mi(x, y, True, False, n_jobs=2)

x = np.hstack((x, 2))
y = np.hstack((y, 10))
mi_2 = _compute_mi(x, y, True, False)
mi_2_ = _compute_mi(x, y, True, False, n_jobs=2)

assert mi_1 == mi_2
assert mi_1_ == mi_2_


# We are going to test that feature ordering by MI matches our expectations.
@@ -139,6 +151,9 @@ def test_mutual_info_regression():
mi = mutual_info_regression(X, y, random_state=0)
assert_array_equal(np.argsort(-mi), np.array([1, 2, 0]))

mi_ = mutual_info_regression(X, y, random_state=0, n_jobs=2)
assert_array_equal(np.argsort(-mi_), np.array([1, 2, 0]))


def test_mutual_info_classif_mixed():
# Here the target is discrete and there are two continuous and one
@@ -163,6 +178,27 @@ def test_mutual_info_classif_mixed():
# The MI should be the same
assert mi_nn[2] == mi[2]

mi_ = mutual_info_classif(
X, y, discrete_features=[2], n_neighbors=3, random_state=0, n_jobs=2
)
assert_array_equal(np.argsort(-mi_), [2, 0, 1])
for n_neighbors in [5, 7, 9]:
mi_nn_ = mutual_info_classif(
X,
y,
discrete_features=[2],
n_neighbors=n_neighbors,
random_state=0,
n_jobs=2,
)
# Check that the continuous values have a higher MI with greater
# n_neighbors
assert mi_nn_[0] > mi_[0]
assert mi_nn_[1] > mi_[1]
# The n_neighbors should not have any effect on the discrete value
# The MI should be the same
assert mi_nn_[2] == mi_[2]
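A possible consolidation for these tests (a reviewer sketch, not part of this PR): parametrize over n_jobs and assert that the estimates are invariant, since n_jobs should only affect runtime. The test name and data below are illustrative.

import numpy as np
import pytest
from numpy.testing import assert_allclose
from sklearn.feature_selection import mutual_info_regression

@pytest.mark.parametrize("n_jobs", [None, 1, 2, -1])
def test_mutual_info_regression_n_jobs_invariance(n_jobs):
    # n_jobs must change only the runtime, never the estimates.
    rng = np.random.RandomState(0)
    X = rng.rand(100, 3)
    y = X[:, 0] + 0.1 * rng.randn(100)
    ref = mutual_info_regression(X, y, random_state=0)
    mi = mutual_info_regression(X, y, random_state=0, n_jobs=n_jobs)
    assert_allclose(mi, ref)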


def test_mutual_info_options():
X = np.array([[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]], dtype=float)