scikit-learn · jeremiedbb · Mar 14, 2022 · Nov 17, 2021 · Nov 17, 2021 · Nov 18, 2021
diff --git a/doc/modules/random_projection.rst b/doc/modules/random_projection.rst
@@ -160,3 +160,42 @@ projection transformer::
   In Proceedings of the 12th ACM SIGKDD international conference on
   Knowledge discovery and data mining (KDD '06). ACM, New York, NY, USA,
   287-296.
+
+
+.. _random_projection_inverse_transform:
+
+Inverse Transform
+=================
+The random projection transformers have a ``compute_inverse_components`` parameter. When
+set to True, after creating the random ``components_`` matrix during fitting,
+the transformer computes the pseudo-inverse of this matrix and stores it as
+``inverse_components_``. The ``inverse_components_`` matrix has shape
+:math:`n_{features} \times n_{components}`, and it is always a dense matrix,
+regardless of whether the components matrix is sparse or dense. So depending on
+the number of features and components, it may use a lot of memory.
+
+When the ``inverse_transform`` method is called, it computes the product of the
+input ``X`` and the transpose of the inverse components. If the inverse components have
+been computed during fit, they are reused at each call to ``inverse_transform``.
+Otherwise they are recomputed each time, which can be costly. The result is always
+dense, even if ``X`` is sparse.
+
+Here is a small code example which illustrates how to use the inverse transform
+feature::
+
+  >>> import numpy as np
+  >>> from sklearn.random_projection import SparseRandomProjection
+  >>> X = np.random.rand(100, 10000)
+  >>> transformer = SparseRandomProjection(
+  ...   compute_inverse_components=True
+  ... )
+  ...
+  >>> X_new = transformer.fit_transform(X)
+  >>> X_new.shape
+  (100, 3947)
+  >>> X_new_inversed = transformer.inverse_transform(X_new)
+  >>> X_new_inversed.shape
+  (100, 10000)
+  >>> X_new_again = transformer.transform(X_new_inversed)
+  >>> np.allclose(X_new, X_new_again)
+  True
diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
@@ -791,6 +791,14 @@ Changelog
  :class:`random_projection.GaussianRandomProjection` preserves dtype for
  `numpy.float32`. :pr:`22114` by :user:`Takeshi Oura <takoika>`.

+- |Enhancement| Adds an :meth:`inverse_transform` method and a
+  `compute_inverse_components` parameter to all transformers in the
+  :mod:`~sklearn.random_projection` module:
+  :class:`~sklearn.random_projection.GaussianRandomProjection` and
+  :class:`~sklearn.random_projection.SparseRandomProjection`. When the parameter is set
+  to True, the pseudo-inverse of the components is computed during `fit` and stored as
+  `inverse_components_`. :pr:`21701` by :user:`Aurélien Geron <ageron>`.
+
 - |API| Adds :term:`get_feature_names_out` to all transformers in the
  :mod:`~sklearn.random_projection` module:
  :class:`~sklearn.random_projection.GaussianRandomProjection` and

diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py
@@ -31,6 +31,7 @@
 from abc import ABCMeta, abstractmethod

 import numpy as np
+from scipy import linalg
 import scipy.sparse as sp

 from .base import BaseEstimator, TransformerMixin
@@ -39,10 +40,9 @@
 from .utils import check_random_state
 from .utils.extmath import safe_sparse_dot
 from .utils.random import sample_without_replacement
-from .utils.validation import check_is_fitted
+from .utils.validation import check_array, check_is_fitted
 from .exceptions import DataDimensionalityWarning

-
 __all__ = [
    "SparseRandomProjection",
    "GaussianRandomProjection",
@@ -302,11 +302,18 @@ class BaseRandomProjection(

    @abstractmethod
    def __init__(
-        self, n_components="auto", *, eps=0.1, dense_output=False, random_state=None
+        self,
+        n_components="auto",
+        *,
+        eps=0.1,
+        dense_output=False,
+        compute_inverse_components=False,
+        random_state=None,
    ):
        self.n_components = n_components
        self.eps = eps
        self.dense_output = dense_output
+        self.compute_inverse_components = compute_inverse_components
        self.random_state = random_state

    @abstractmethod
@@ -323,12 +330,18 @@ def _make_random_matrix(self, n_components, n_features):

        Returns
        -------
-        components : {ndarray, sparse matrix} of shape \
-                (n_components, n_features)
+        components : {ndarray, sparse matrix} of shape (n_components, n_features)
            The generated random matrix. Sparse matrix will be of CSR format.

        """

+    def _compute_inverse_components(self):
+        """Compute the pseudo-inverse of the (densified) components."""
+        components = self.components_
+        if sp.issparse(components):
+            components = components.toarray()
+        return linalg.pinv(components, check_finite=False)
+
    def fit(self, X, y=None):
        """Generate a sparse random projection matrix.

@@ -399,6 +412,9 @@ def fit(self, X, y=None):
            " not the proper shape."
        )

+        if self.compute_inverse_components:
+            self.inverse_components_ = self._compute_inverse_components()
+
        return self

    def transform(self, X):
@@ -437,6 +453,35 @@ def _n_features_out(self):
        """
        return self.n_components

+    def inverse_transform(self, X):
+        """Project data back to its original space.
+
+        Returns an array X_original whose transform would be X. Note that even
+        if X is sparse, X_original is dense: this may use a lot of RAM.
+
+        If `compute_inverse_components` is False, the pseudo-inverse of the components
+        is computed during each call to `inverse_transform`, which can be costly.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_components)
+            Data to be transformed back.
+
+        Returns
+        -------
+        X_original : ndarray of shape (n_samples, n_features)
+            Reconstructed data.
+        """
+        check_is_fitted(self)
+
+        X = check_array(X, dtype=[np.float64, np.float32], accept_sparse=("csr", "csc"))
+
+        if self.compute_inverse_components:
+            return X @ self.inverse_components_.T
+
+        inverse_components = self._compute_inverse_components()
+        return X @ inverse_components.T
+
    def _more_tags(self):
        return {
            "preserves_dtype": [np.float64, np.float32],
@@ -474,6 +519,11 @@ class GaussianRandomProjection(BaseRandomProjection):
        Smaller values lead to better embedding and higher number of
        dimensions (n_components) in the target projection space.

+    compute_inverse_components : bool, default=False
+        Learn the inverse transform by computing the pseudo-inverse of the
+        components during fit. Note that computing the pseudo-inverse does not
+        scale well to large matrices.
+
    random_state : int, RandomState instance or None, default=None
        Controls the pseudo random number generator used to generate the
        projection matrix at fit time.
@@ -488,6 +538,12 @@ class GaussianRandomProjection(BaseRandomProjection):
    components_ : ndarray of shape (n_components, n_features)
        Random matrix used for the projection.

+    inverse_components_ : ndarray of shape (n_features, n_components)
+        Pseudo-inverse of the components, only computed if
+        `compute_inverse_components` is True.
+
+        .. versionadded:: 1.1
+
    n_features_in_ : int
        Number of features seen during :term:`fit`.

@@ -516,11 +572,19 @@ class GaussianRandomProjection(BaseRandomProjection):
    (25, 2759)
    """

-    def __init__(self, n_components="auto", *, eps=0.1, random_state=None):
+    def __init__(
+        self,
+        n_components="auto",
+        *,
+        eps=0.1,
+        compute_inverse_components=False,
+        random_state=None,
+    ):
        super().__init__(
            n_components=n_components,
            eps=eps,
            dense_output=True,
+            compute_inverse_components=compute_inverse_components,
            random_state=random_state,
        )

@@ -610,6 +674,14 @@ class SparseRandomProjection(BaseRandomProjection):
        If False, the projected data uses a sparse representation if
        the input is sparse.

+    compute_inverse_components : bool, default=False
+        Learn the inverse transform by computing the pseudo-inverse of the
+        components during fit. Note that the pseudo-inverse is always a dense
+        array, even if the training data was sparse. This means that it might be
+        necessary to call `inverse_transform` on a small batch of samples at a
+        time to avoid exhausting the available memory on the host. Moreover,
+        computing the pseudo-inverse does not scale well to large matrices.
+
    random_state : int, RandomState instance or None, default=None
        Controls the pseudo random number generator used to generate the
        projection matrix at fit time.
@@ -625,6 +697,12 @@ class SparseRandomProjection(BaseRandomProjection):
        Random matrix used for the projection. Sparse matrix will be of CSR
        format.

+    inverse_components_ : ndarray of shape (n_features, n_components)
+        Pseudo-inverse of the components, only computed if
+        `compute_inverse_components` is True.
+
+        .. versionadded:: 1.1
+
    density_ : float in range 0.0 - 1.0
        Concrete density computed from when density = "auto".

@@ -676,12 +754,14 @@ def __init__(
        density="auto",
        eps=0.1,
        dense_output=False,
+        compute_inverse_components=False,
        random_state=None,
    ):
        super().__init__(
            n_components=n_components,
            eps=eps,
            dense_output=dense_output,
+            compute_inverse_components=compute_inverse_components,
            random_state=random_state,
        )


diff --git a/sklearn/tests/test_random_projection.py b/sklearn/tests/test_random_projection.py
@@ -1,5 +1,6 @@
 import functools
 from typing import List, Any
+import warnings

 import numpy as np
 import scipy.sparse as sp
@@ -31,8 +32,8 @@

 # Make some random data with uniformly located non zero entries with
 # Gaussian distributed values
-def make_sparse_random_data(n_samples, n_features, n_nonzeros):
-    rng = np.random.RandomState(0)
+def make_sparse_random_data(n_samples, n_features, n_nonzeros, random_state=0):
+    rng = np.random.RandomState(random_state)
    data_coo = sp.coo_matrix(
        (
            rng.randn(n_nonzeros),
@@ -377,6 +378,57 @@ def test_random_projection_feature_names_out(random_projection_cls):
    assert_array_equal(names_out, expected_names_out)


+@pytest.mark.parametrize("n_samples", (2, 9, 10, 11, 1000))
+@pytest.mark.parametrize("n_features", (2, 9, 10, 11, 1000))
+@pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
+@pytest.mark.parametrize("compute_inverse_components", [True, False])
+def test_inverse_transform(
+    n_samples,
+    n_features,
+    random_projection_cls,
+    compute_inverse_components,
+    global_random_seed,
+):
+    n_components = 10
+
+    random_projection = random_projection_cls(
+        n_components=n_components,
+        compute_inverse_components=compute_inverse_components,
+        random_state=global_random_seed,
+    )
+
+    X_dense, X_csr = make_sparse_random_data(
+        n_samples,
+        n_features,
+        n_samples * n_features // 100 + 1,
+        random_state=global_random_seed,
+    )
+
+    for X in [X_dense, X_csr]:
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                message=(
+                    "The number of components is higher than the number of features"
+                ),
+                category=DataDimensionalityWarning,
+            )
+            projected = random_projection.fit_transform(X)
+
+        if compute_inverse_components:
+            assert hasattr(random_projection, "inverse_components_")
+            inv_components = random_projection.inverse_components_
+            assert inv_components.shape == (n_features, n_components)
+
+        projected_back = random_projection.inverse_transform(projected)
+        assert projected_back.shape == X.shape
+
+        projected_again = random_projection.transform(projected_back)
+        if hasattr(projected, "toarray"):
+            projected = projected.toarray()
+        assert_allclose(projected, projected_again, rtol=1e-7, atol=1e-10)
+
+
 @pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
 @pytest.mark.parametrize(
    "input_dtype, expected_dtype",