From b6d42e493f7685db6e596e1d0d208f17fb22e731 Mon Sep 17 00:00:00 2001
From: Kei Ishikawa
Date: Sat, 24 Apr 2021 18:48:06 +0200
Subject: [PATCH 1/9] add inverse_transform to Nystroem

---
 sklearn/kernel_approximation.py | 115 ++++++++++++++++++++++++++++++--
 1 file changed, 111 insertions(+), 4 deletions(-)

diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py
index ca02aac3e982c..a729d36d14cfc 100644
--- a/sklearn/kernel_approximation.py
+++ b/sklearn/kernel_approximation.py
@@ -12,6 +12,7 @@
 import numpy as np
 import scipy.sparse as sp
+from scipy import linalg
 from scipy.linalg import svd
 try:
     from scipy.fft import fft, ifft
@@ -20,6 +21,7 @@
 from .base import BaseEstimator
 from .base import TransformerMixin
+from .exceptions import NotFittedError
 from .utils import check_random_state, as_float_array
 from .utils.extmath import safe_sparse_dot
 from .utils.validation import check_is_fitted
@@ -730,8 +732,8 @@ class Nystroem(TransformerMixin, BaseEstimator):
     """
     @_deprecate_positional_args
     def __init__(self, kernel="rbf", *, gamma=None, coef0=None, degree=None,
-                 kernel_params=None, n_components=100, random_state=None,
-                 n_jobs=None):
+                 kernel_params=None, n_components=100, alpha=1.0,
+                 fit_inverse_transform=False, random_state=None, n_jobs=None):

         self.kernel = kernel
         self.gamma = gamma
@@ -739,9 +741,61 @@ def __init__(self, kernel="rbf", *, gamma=None, coef0=None, degree=None,
         self.degree = degree
         self.kernel_params = kernel_params
         self.n_components = n_components
+        self.alpha = alpha
+        self.fit_inverse_transform = fit_inverse_transform
         self.random_state = random_state
         self.n_jobs = n_jobs

+    def _fit_inverse_transform(self, X_transformed, X):
+        if hasattr(X, "tocsr"):
+            raise NotImplementedError("Inverse transform not implemented for "
+                                      "sparse matrices!")
+
+        # from sklearn.decomposition import PCA
+        # self.pca = PCA(n_components=20)
+        # X_transformed = self.pca.fit_transform(X_transformed)
+
+        # n_samples = X_transformed.shape[0]
+        # Taken from KernelPCA. Tempolarily left for debug
+        # K = pairwise_kernels(X_transformed,
+        #                      metric=self.kernel,
+        #                      filter_params=True,
+        #                      n_jobs=self.n_jobs,
+        #                      **self._get_kernel_params())
+        # K.flat[::n_samples + 1] += self.alpha
+        #self.dual_coef_ = linalg.solve(K, X, sym_pos=True, overwrite_a=True)
+
+        # Let I and T be the indices of inducing points and training points.
+        Kii = pairwise_kernels(X_transformed[self.basis_indices_],  # K_{I, I}
+                               metric=self.kernel,
+                               filter_params=True,
+                               n_jobs=self.n_jobs,
+                               **self._get_kernel_params())
+        Kit = pairwise_kernels(X_transformed[self.basis_indices_],  # K_{I, T}
+                               X_transformed,
+                               metric=self.kernel,
+                               filter_params=True,
+                               n_jobs=self.n_jobs,
+                               **self._get_kernel_params())
+
+        # By the Woodbury matrix identity, the following formula (in LaTeX) holds:
+        # \tilde{K}_{new, T} (\tilde{K}_{T, T} + \alpha I)^{-1} y
+        # = K_{new, I} K_{I, I}^{-1}K_{I, T}
+        # \left( \frac{1}{\alpha} I - \frac{1}{\alpha}K_{T, I}(\alpha K_{I, I}
+        # + K_{I, T}K_{T, I})^{-1}K_{I, T} \right )y
+        # = K_{new, I} \left( \alpha \cdot K_{I, I}\right)^{-1}
+        # \left\{K_{I, T}y - K_{I, T} K_{T, I}(\alpha K_{I, I}
+        # + K_{I, T}K_{T, I})^{-1}K_{I, T}y \right\}
+
+        A = np.dot(Kit, X)
+        B = np.dot(Kit, Kit.T)
+        # TODO(kstoneriv3): The following line often results in a warning like
+        # "Ill-conditioned matrix (rcond=1.80677e-18)".
+ C = linalg.solve(self.alpha * Kii + B, A, sym_pos=True) + D = A - np.dot(B, C) + self.dual_coef_ = linalg.solve(self.alpha * Kii, D, sym_pos=True, overwrite_a=True) + self.X_transformed_fit_ = X_transformed[self.basis_indices_] + def fit(self, X, y=None): """Fit estimator to data. @@ -778,11 +832,17 @@ def fit(self, X, y=None): **self._get_kernel_params()) # sqrt of kernel matrix on basis vectors - U, S, V = svd(basis_kernel) + U, S, V = svd(basis_kernel) # TODO(kstoneriv3): Why not np.linalg.eigh() ? S = np.maximum(S, 1e-12) self.normalization_ = np.dot(U / np.sqrt(S), V) self.components_ = basis - self.component_indices_ = inds + self.components_indices_ = basis_inds + self.basis_indices_ = basis_inds + + if self.fit_inverse_transform: + X_transformed = self.transform(X) + self._fit_inverse_transform(X_transformed, X) + return self def transform(self, X): @@ -812,6 +872,53 @@ def transform(self, X): **kernel_params) return np.dot(embedded, self.normalization_.T) + def inverse_transform(self, X): + """Transform X back to original space. + + ``inverse_transform`` approximates the inverse transformation using + a learned pre-image. The pre-image is learned by kernel ridge + regression of the original data on their low-dimensional representation + vectors. + + .. note: + :meth:`~sklearn.decomposition.fit` internally uses a centered + kernel. As the centered kernel no longer contains the information + of the mean of kernel features, such information is not taken into + account in reconstruction. + + .. note:: + When users want to compute inverse transformation for 'linear' + kernel, it is recommended that they use + :class:`~sklearn.decomposition.PCA` instead. Unlike + :class:`~sklearn.decomposition.PCA`, + :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform`` + does not reconstruct the mean of data when 'linear' kernel is used + due to the use of centered kernel. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_components) + + Returns + ------- + X_new : ndarray of shape (n_samples, n_features) + + References + ---------- + "Learning to Find Pre-Images", G BakIr et al, 2004. 
+ """ + if not self.fit_inverse_transform: + raise NotFittedError("The fit_inverse_transform parameter was not" + " set to True when instantiating and hence " + "the inverse transform is not available.") + # X = self.pca.transform(X) + K = pairwise_kernels(X, self.X_transformed_fit_, + metric=self.kernel, + filter_params=True, + n_jobs=self.n_jobs, + **self._get_kernel_params()) + return np.dot(K, self.dual_coef_) + def _get_kernel_params(self): params = self.kernel_params if params is None: From 0ee5ee8fe4bc5409406a559cb0c35a393adc88e6 Mon Sep 17 00:00:00 2001 From: Kei Ishikawa Date: Sat, 24 Apr 2021 20:18:30 +0200 Subject: [PATCH 2/9] add a test --- sklearn/kernel_approximation.py | 16 +++++++++------ sklearn/tests/test_kernel_approximation.py | 23 +++++++++++++++++++++- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index a729d36d14cfc..c48d0f837ef55 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -779,6 +779,7 @@ def _fit_inverse_transform(self, X_transformed, X): **self._get_kernel_params()) # By Woodbury matrix identity, following formula(typed in latex) holds: + # # \tilde{K}_{new, T} (\tilde{K}_{T, T} + \alpha I)^{-1} y # = K_{new, I} K_{I, I}^{-1}K_{I, T} # \left( \frac{1}{\alpha} I - \frac{1}{\alpha}K_{T, I}(\alpha K_{I, I} @@ -786,14 +787,17 @@ def _fit_inverse_transform(self, X_transformed, X): # = K_{new, I} \left( \alpha \cdot K_{I, I}\right)^{-1} # \left\{K_{I, T}y - K_{I, T} K_{T, I}(\alpha K_{I, I} # + K_{I, T}K_{T, I})^{-1}K_{I, T}y \right\} + # + # where \tilde{K} implies the kernel matrix approximated by Nystrom approximation - A = np.dot(Kit, X) - B = np.dot(Kit, Kit.T) + A = Kit @ X + B = Kit @ Kit.T # TODO(kstoneriv3): The following line often results in warning like - # "Ill-conditioned matrix (rcond=1.80677e-18)". - C = linalg.solve(self.alpha * Kii + B, A, sym_pos=True) - D = A - np.dot(B, C) - self.dual_coef_ = linalg.solve(self.alpha * Kii, D, sym_pos=True, overwrite_a=True) + # "Ill-conditioned matrix (rcond=1.80677e-18)". So we use pinv instead. + # C = linalg.solve(self.alpha * Kii + B, A, sym_pos=True) + C = linalg.pinvh(self.alpha * Kii + B) @ A + D = A - B @ C + self.dual_coef_ = linalg.pinvh(self.alpha * Kii) @ D self.X_transformed_fit_ = X_transformed[self.basis_indices_] def fit(self, X, y=None): diff --git a/sklearn/tests/test_kernel_approximation.py b/sklearn/tests/test_kernel_approximation.py index cfd9c9671fc4d..d60eba06cff03 100644 --- a/sklearn/tests/test_kernel_approximation.py +++ b/sklearn/tests/test_kernel_approximation.py @@ -7,12 +7,13 @@ from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.metrics.pairwise import kernel_metrics +from sklearn.datasets import make_blobs from sklearn.kernel_approximation import RBFSampler from sklearn.kernel_approximation import AdditiveChi2Sampler from sklearn.kernel_approximation import SkewedChi2Sampler from sklearn.kernel_approximation import Nystroem from sklearn.kernel_approximation import PolynomialCountSketch +from sklearn.metrics.pairwise import kernel_metrics from sklearn.metrics.pairwise import polynomial_kernel, rbf_kernel, chi2_kernel # generate data @@ -332,3 +333,23 @@ def test_nystroem_precomputed_kernel(): **param) with pytest.raises(ValueError, match=msg): ny.fit(K) + +def test_nystroem_inverse_transform_reconstruction(): + # Test if the reconstruction is a good approximation. 
+ # Note that in general it is not possible to get an arbitrarily good + # reconstruction because of kernel centering that does not + # preserve all the information of the original data. + n = 100 + d = 2 + rng = np.random.RandomState(0) + X = rng.randn(n * d).reshape(n, d) + Y = rng.randn(n * d).reshape(n, d) + + nystroem = Nystroem( + n_components=10, kernel='rbf', fit_inverse_transform=True, + alpha=2e-3, gamma=6e-2, random_state=0 + ) + nystroem.fit(X) + Y_trans = nystroem.fit_transform(Y) + Y_reconst = nystroem.inverse_transform(Y_trans) + assert np.linalg.norm(Y - Y_reconst) / np.linalg.norm(Y) < 1e-1 From e15f31223aec10cf6c7fd8abec4fb1a7dbfab321 Mon Sep 17 00:00:00 2001 From: Kei Ishikawa Date: Sat, 24 Apr 2021 20:24:46 +0200 Subject: [PATCH 3/9] update --- sklearn/tests/test_kernel_approximation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_kernel_approximation.py b/sklearn/tests/test_kernel_approximation.py index d60eba06cff03..388eb7c3bfe05 100644 --- a/sklearn/tests/test_kernel_approximation.py +++ b/sklearn/tests/test_kernel_approximation.py @@ -346,8 +346,8 @@ def test_nystroem_inverse_transform_reconstruction(): Y = rng.randn(n * d).reshape(n, d) nystroem = Nystroem( - n_components=10, kernel='rbf', fit_inverse_transform=True, - alpha=2e-3, gamma=6e-2, random_state=0 + n_components=40, kernel='rbf', fit_inverse_transform=True, + alpha=3e-3, gamma=4e-1, random_state=0 ) nystroem.fit(X) Y_trans = nystroem.fit_transform(Y) From 6daf70ea1090590dd87ff7ea4719ed60bfc4ca28 Mon Sep 17 00:00:00 2001 From: Kei Ishikawa Date: Sat, 24 Apr 2021 20:40:27 +0200 Subject: [PATCH 4/9] update --- sklearn/kernel_approximation.py | 38 ++++++--------------------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index c48d0f837ef55..f7a6e250197f1 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -685,6 +685,9 @@ class Nystroem(TransformerMixin, BaseEstimator): Attributes ---------- + basis_indices_: ndarray of shape (n_basis) + Indices of ``basis`` in the training set. + components_ : ndarray of shape (n_components, n_features) Subset of training points used to construct the feature map. @@ -750,21 +753,7 @@ def _fit_inverse_transform(self, X_transformed, X): if hasattr(X, "tocsr"): raise NotImplementedError("Inverse transform not implemented for " "sparse matrices!") - - # from sklearn.decomposition import PCA - # self.pca = PCA(n_components=20) - # X_transformed = self.pca.fit_transform(X_transformed) - - # n_samples = X_transformed.shape[0] - # Taken from KernelPCA. Tempolarily left for debug - # K = pairwise_kernels(X_transformed, - # metric=self.kernel, - # filter_params=True, - # n_jobs=self.n_jobs, - # **self._get_kernel_params()) - # K.flat[::n_samples + 1] += self.alpha - #self.dual_coef_ = linalg.solve(K, X, sym_pos=True, overwrite_a=True) - + # Let I and T be the indices of inducing points and training points. Kii = pairwise_kernels(X_transformed[self.basis_indices_], # K_{I, I} metric=self.kernel, @@ -882,22 +871,8 @@ def inverse_transform(self, X): ``inverse_transform`` approximates the inverse transformation using a learned pre-image. The pre-image is learned by kernel ridge regression of the original data on their low-dimensional representation - vectors. - - .. note: - :meth:`~sklearn.decomposition.fit` internally uses a centered - kernel. 
As the centered kernel no longer contains the information - of the mean of kernel features, such information is not taken into - account in reconstruction. - - .. note:: - When users want to compute inverse transformation for 'linear' - kernel, it is recommended that they use - :class:`~sklearn.decomposition.PCA` instead. Unlike - :class:`~sklearn.decomposition.PCA`, - :class:`~sklearn.decomposition.KernelPCA`'s ``inverse_transform`` - does not reconstruct the mean of data when 'linear' kernel is used - due to the use of centered kernel. + vectors. For the efficiency of kernel ridge regression, the kernel for + regression is a Nystrom approximation of the original kernel. Parameters ---------- @@ -915,7 +890,6 @@ def inverse_transform(self, X): raise NotFittedError("The fit_inverse_transform parameter was not" " set to True when instantiating and hence " "the inverse transform is not available.") - # X = self.pca.transform(X) K = pairwise_kernels(X, self.X_transformed_fit_, metric=self.kernel, filter_params=True, From 15a4215b20769d31b3859ee3a753efaa7bb6ed01 Mon Sep 17 00:00:00 2001 From: Kei Ishikawa Date: Sat, 24 Apr 2021 20:42:05 +0200 Subject: [PATCH 5/9] update --- sklearn/kernel_approximation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index f7a6e250197f1..021e0d0b94fc3 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -829,7 +829,7 @@ def fit(self, X, y=None): S = np.maximum(S, 1e-12) self.normalization_ = np.dot(U / np.sqrt(S), V) self.components_ = basis - self.components_indices_ = basis_inds + self.component_indices_ = basis_inds self.basis_indices_ = basis_inds if self.fit_inverse_transform: From ea347f5a0bb7d52319d52b37eab6fe2aeb6bb10e Mon Sep 17 00:00:00 2001 From: Kei Ishikawa Date: Sat, 24 Apr 2021 20:43:07 +0200 Subject: [PATCH 6/9] update --- sklearn/tests/test_kernel_approximation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/tests/test_kernel_approximation.py b/sklearn/tests/test_kernel_approximation.py index 388eb7c3bfe05..5bba7f7d55c76 100644 --- a/sklearn/tests/test_kernel_approximation.py +++ b/sklearn/tests/test_kernel_approximation.py @@ -336,9 +336,6 @@ def test_nystroem_precomputed_kernel(): def test_nystroem_inverse_transform_reconstruction(): # Test if the reconstruction is a good approximation. - # Note that in general it is not possible to get an arbitrarily good - # reconstruction because of kernel centering that does not - # preserve all the information of the original data. n = 100 d = 2 rng = np.random.RandomState(0) From 46776fab8db79d350612a699030dd5bdf87de677 Mon Sep 17 00:00:00 2001 From: Kei Ishikawa Date: Sat, 24 Apr 2021 20:52:37 +0200 Subject: [PATCH 7/9] add change log --- doc/whats_new/v1.0.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 3b3884e68e185..ee14f97461f68 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -219,6 +219,13 @@ Changelog :func:`~sklearn.inspection.permutation_importance`. :pr:`19411` by :user:`Simona Maggio `. +:mod:`sklearn.kernel_approximation` +......................... + +- |Enhancement| Add + :meth:`~sklearn.kernel_approximation.Nystroem.inverse_transform`. + :pr:`19971` by :user:`Kei Ishikawa `. + :mod:`sklearn.linear_model` ........................... 
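Before the linter and documentation fixes below, it is worth double-checking the Woodbury rewrite that patches 1 and 2 base `dual_coef_` on. The following standalone sketch is not part of any commit in this series; it assumes only NumPy and scikit-learn's public `rbf_kernel` helper, and confirms numerically that the small-matrix expression built in `_fit_inverse_transform` agrees with a direct kernel ridge solve against the full Nystroem-approximated kernel:

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
n, m, alpha, gamma = 50, 10, 1e-2, 0.5
Xt = rng.randn(n, 3)                       # training points (indices T)
Xnew = rng.randn(5, 3)                     # new query points
Xi = Xt[rng.choice(n, m, replace=False)]   # inducing points (indices I)
y = rng.randn(n, 2)                        # regression targets

Kii = rbf_kernel(Xi, Xi, gamma=gamma)      # K_{I, I}
Kit = rbf_kernel(Xi, Xt, gamma=gamma)      # K_{I, T}
Kni = rbf_kernel(Xnew, Xi, gamma=gamma)    # K_{new, I}

# Left-hand side: ridge solve against the full n x n Nystroem kernel, O(n^3).
Ktt = Kit.T @ np.linalg.solve(Kii, Kit)    # \tilde{K}_{T, T}
Knt = Kni @ np.linalg.solve(Kii, Kit)      # \tilde{K}_{new, T}
lhs = Knt @ np.linalg.solve(Ktt + alpha * np.eye(n), y)

# Right-hand side: the O(m^3) form computed in _fit_inverse_transform.
# NOTE: patch 2 switches to linalg.pinvh for ill-conditioned cases;
# np.linalg.solve is fine for this well-conditioned toy problem.
A = Kit @ y
B = Kit @ Kit.T
C = np.linalg.solve(alpha * Kii + B, A)
D = A - B @ C
dual_coef = np.linalg.solve(alpha * Kii, D)
rhs = Kni @ dual_coef

assert np.allclose(lhs, rhs)

The two sides agree to machine precision, and the right-hand side only ever factorizes m x m matrices, which is what keeps fitting the pre-image cheap when n_components is much smaller than n_samples.
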
From 12270b93063de23507f7a2c0d45c4ed3a5a4af4e Mon Sep 17 00:00:00 2001
From: Kei Ishikawa
Date: Sat, 24 Apr 2021 20:56:55 +0200
Subject: [PATCH 8/9] fix linter

---
 sklearn/kernel_approximation.py            | 11 ++++++-----
 sklearn/tests/test_kernel_approximation.py |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py
index 021e0d0b94fc3..41a3eda5183da 100644
--- a/sklearn/kernel_approximation.py
+++ b/sklearn/kernel_approximation.py
@@ -753,7 +753,7 @@ def _fit_inverse_transform(self, X_transformed, X):
         if hasattr(X, "tocsr"):
             raise NotImplementedError("Inverse transform not implemented for "
                                       "sparse matrices!")
-
+
         # Let I and T be the indices of inducing points and training points.
         Kii = pairwise_kernels(X_transformed[self.basis_indices_],  # K_{I, I}
                                metric=self.kernel,
@@ -766,7 +766,7 @@ def _fit_inverse_transform(self, X_transformed, X):
                                filter_params=True,
                                n_jobs=self.n_jobs,
                                **self._get_kernel_params())
-
+
         # By the Woodbury matrix identity, the following formula (in LaTeX) holds:
         #
         # \tilde{K}_{new, T} (\tilde{K}_{T, T} + \alpha I)^{-1} y
@@ -777,7 +777,8 @@ def _fit_inverse_transform(self, X_transformed, X):
         # \left\{K_{I, T}y - K_{I, T} K_{T, I}(\alpha K_{I, I}
         # + K_{I, T}K_{T, I})^{-1}K_{I, T}y \right\}
         #
-        # where \tilde{K} denotes the kernel matrix approximated by Nystrom approximation
+        # where \tilde{K} denotes the kernel matrix approximated by
+        # Nystrom approximation.

         A = Kit @ X
         B = Kit @ Kit.T
@@ -825,7 +826,7 @@ def fit(self, X, y=None):
                               **self._get_kernel_params())

         # sqrt of kernel matrix on basis vectors
-        U, S, V = svd(basis_kernel)  # TODO(kstoneriv3): Why not np.linalg.eigh() ?
+        U, S, V = svd(basis_kernel)  # TODO(kstoneriv3): Why not linalg.eigh()?
         S = np.maximum(S, 1e-12)
         self.normalization_ = np.dot(U / np.sqrt(S), V)
         self.components_ = basis
@@ -871,7 +872,7 @@ def inverse_transform(self, X):
         ``inverse_transform`` approximates the inverse transformation using
         a learned pre-image. The pre-image is learned by kernel ridge
         regression of the original data on their low-dimensional representation
-        vectors. For the efficiency of kernel ridge regression, the kernel for
+        vectors. For the efficiency of kernel ridge regression, the kernel for
         regression is a Nystrom approximation of the original kernel.

         Parameters
diff --git a/sklearn/tests/test_kernel_approximation.py b/sklearn/tests/test_kernel_approximation.py
index 5bba7f7d55c76..24838cf486d08 100644
--- a/sklearn/tests/test_kernel_approximation.py
+++ b/sklearn/tests/test_kernel_approximation.py
@@ -7,7 +7,6 @@
 from sklearn.utils._testing import assert_array_equal
 from sklearn.utils._testing import assert_array_almost_equal

-from sklearn.datasets import make_blobs
 from sklearn.kernel_approximation import RBFSampler
 from sklearn.kernel_approximation import AdditiveChi2Sampler
 from sklearn.kernel_approximation import SkewedChi2Sampler
@@ -333,6 +332,7 @@ def test_nystroem_precomputed_kernel():
     with pytest.raises(ValueError, match=msg):
         ny.fit(K)

+
 def test_nystroem_inverse_transform_reconstruction():
     # Test if the reconstruction is a good approximation.
n = 100 From 7a010e69cfba4ece8b161386609fce60ce4cb1b6 Mon Sep 17 00:00:00 2001 From: Kei Ishikawa Date: Sat, 24 Apr 2021 21:31:43 +0200 Subject: [PATCH 9/9] fix documents --- doc/whats_new/v1.0.rst | 2 +- sklearn/kernel_approximation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ee14f97461f68..2df7c9b847fc8 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -220,7 +220,7 @@ Changelog :pr:`19411` by :user:`Simona Maggio `. :mod:`sklearn.kernel_approximation` -......................... +................................... - |Enhancement| Add :meth:`~sklearn.kernel_approximation.Nystroem.inverse_transform`. diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index 41a3eda5183da..67467dfa49f9f 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -869,7 +869,7 @@ def transform(self, X): def inverse_transform(self, X): """Transform X back to original space. - ``inverse_transform`` approximates the inverse transformation using + This method approximates the inverse transformation using a learned pre-image. The pre-image is learned by kernel ridge regression of the original data on their low-dimensional representation vectors. For the efficiency of kernel ridge regression, the kernel for
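
Taken together, the series leaves `Nystroem` with the workflow sketched below. This is a usage sketch, not library code: it mirrors the final version of `test_nystroem_inverse_transform_reconstruction` (same kernel and parameter values) and runs only with this branch applied, since `fit_inverse_transform`, `alpha`, and `inverse_transform` do not exist on `Nystroem` in released scikit-learn:

import numpy as np
from sklearn.kernel_approximation import Nystroem

rng = np.random.RandomState(0)
X = rng.randn(100, 2)

# fit_inverse_transform and alpha are the parameters added by this series.
nystroem = Nystroem(n_components=40, kernel='rbf', gamma=4e-1,
                    fit_inverse_transform=True, alpha=3e-3, random_state=0)
X_trans = nystroem.fit_transform(X)              # explicit kernel feature map
X_reconst = nystroem.inverse_transform(X_trans)  # kernel ridge pre-image
print(np.linalg.norm(X - X_reconst) / np.linalg.norm(X))

As in the test, the relative reconstruction error should stay below 1e-1; like KernelPCA's pre-image, reconstruction quality is sensitive to the choice of `alpha` and `gamma`.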