diff --git a/doc/conf.py b/doc/conf.py index f749b188b3274..7f8b011746852 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -477,10 +477,28 @@ def add_js_css_files(app, pagename, templatename, context, doctree): "auto_examples/model_selection/plot_nested_cross_validation_iris" ), "auto_examples/linear_model/plot_lasso_lars": ( - "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path" + "auto_examples/linear_model/plot_regularization" ), "auto_examples/linear_model/plot_lasso_coordinate_descent_path": ( - "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path" + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_ridge_coeffs": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_ridge_path": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_sgd_penalties": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_logistic_path": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_logistic_l1_l2_sparsity": ( + "auto_examples/linear_model/plot_regularization" ), "auto_examples/cluster/plot_color_quantization": ( "auto_examples/cluster/plot_face_compress" diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py deleted file mode 100644 index 44ae64c4c2811..0000000000000 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ /dev/null @@ -1,136 +0,0 @@ -""" -======================================== -Lasso, Lasso-LARS, and Elastic Net paths -======================================== - -This example shows how to compute the "paths" of coefficients along the Lasso, -Lasso-LARS, and Elastic Net regularization paths. In other words, it shows the -relationship between the regularization parameter (alpha) and the coefficients. - -Lasso and Lasso-LARS impose a sparsity constraint on the coefficients, -encouraging some of them to be zero. Elastic Net is a generalization of -Lasso that adds an L2 penalty term to the L1 penalty term. This allows for -some coefficients to be non-zero while still encouraging sparsity. - -Lasso and Elastic Net use a coordinate descent method to compute the paths, while -Lasso-LARS uses the LARS algorithm to compute the paths. - -The paths are computed using :func:`~sklearn.linear_model.lasso_path`, -:func:`~sklearn.linear_model.lars_path`, and :func:`~sklearn.linear_model.enet_path`. - -The results show different comparison plots: - -- Compare Lasso and Lasso-LARS -- Compare Lasso and Elastic Net -- Compare Lasso with positive Lasso -- Compare LARS and Positive LARS -- Compare Elastic Net and positive Elastic Net - -Each plot shows how the model coefficients vary as the regularization strength changes, -offering insight into the behavior of these models -under different constraints. 
-""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -from itertools import cycle - -import matplotlib.pyplot as plt - -from sklearn.datasets import load_diabetes -from sklearn.linear_model import enet_path, lars_path, lasso_path - -X, y = load_diabetes(return_X_y=True) -X /= X.std(axis=0) # Standardize data (easier to set the l1_ratio parameter) - -# Compute paths - -eps = 5e-3 # the smaller it is the longer is the path - -print("Computing regularization path using the lasso...") -alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps) - -print("Computing regularization path using the positive lasso...") -alphas_positive_lasso, coefs_positive_lasso, _ = lasso_path( - X, y, eps=eps, positive=True -) - -print("Computing regularization path using the LARS...") -alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso") - -print("Computing regularization path using the positive LARS...") -alphas_positive_lars, _, coefs_positive_lars = lars_path( - X, y, method="lasso", positive=True -) - -print("Computing regularization path using the elastic net...") -alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=0.8) - -print("Computing regularization path using the positive elastic net...") -alphas_positive_enet, coefs_positive_enet, _ = enet_path( - X, y, eps=eps, l1_ratio=0.8, positive=True -) - -# Display results - -plt.figure(1) -colors = cycle(["b", "r", "g", "c", "k"]) -for coef_lasso, coef_lars, c in zip(coefs_lasso, coefs_lars, colors): - l1 = plt.semilogx(alphas_lasso, coef_lasso, c=c) - l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c=c) - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso and LARS Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") -plt.axis("tight") - -plt.figure(2) -colors = cycle(["b", "r", "g", "c", "k"]) -for coef_l, coef_e, c in zip(coefs_lasso, coefs_enet, colors): - l1 = plt.semilogx(alphas_lasso, coef_l, c=c) - l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c=c) - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso and Elastic-Net Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") -plt.axis("tight") - - -plt.figure(3) -for coef_l, coef_pl, c in zip(coefs_lasso, coefs_positive_lasso, colors): - l1 = plt.semilogy(alphas_lasso, coef_l, c=c) - l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c=c) - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso and positive Lasso") -plt.legend((l1[-1], l2[-1]), ("Lasso", "positive Lasso"), loc="lower right") -plt.axis("tight") - - -plt.figure(4) -colors = cycle(["b", "r", "g", "c", "k"]) -for coef_lars, coef_positive_lars, c in zip(coefs_lars, coefs_positive_lars, colors): - l1 = plt.semilogx(alphas_lars, coef_lars, c=c) - l2 = plt.semilogx(alphas_positive_lars, coef_positive_lars, linestyle="--", c=c) - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("LARS and Positive LARS") -plt.legend((l1[-1], l2[-1]), ("LARS", "Positive LARS"), loc="lower right") -plt.axis("tight") - -plt.figure(5) -for coef_e, coef_pe, c in zip(coefs_enet, coefs_positive_enet, colors): - l1 = plt.semilogx(alphas_enet, coef_e, c=c) - l2 = plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c=c) - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Elastic-Net and positive Elastic-Net") -plt.legend((l1[-1], l2[-1]), ("Elastic-Net", "positive Elastic-Net"), loc="lower right") -plt.axis("tight") -plt.show() diff --git 
a/examples/linear_model/plot_logistic_l1_l2_sparsity.py b/examples/linear_model/plot_logistic_l1_l2_sparsity.py deleted file mode 100644 index f642dfade5db8..0000000000000 --- a/examples/linear_model/plot_logistic_l1_l2_sparsity.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -============================================== -L1 Penalty and Sparsity in Logistic Regression -============================================== - -Comparison of the sparsity (percentage of zero coefficients) of solutions when -L1, L2 and Elastic-Net penalty are used for different values of C. We can see -that large values of C give more freedom to the model. Conversely, smaller -values of C constrain the model more. In the L1 penalty case, this leads to -sparser solutions. As expected, the Elastic-Net penalty sparsity is between -that of L1 and L2. - -We classify 8x8 images of digits into two classes: 0-4 against 5-9. -The visualization shows coefficients of the models for varying C. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -from sklearn import datasets -from sklearn.linear_model import LogisticRegression -from sklearn.preprocessing import StandardScaler - -X, y = datasets.load_digits(return_X_y=True) - -X = StandardScaler().fit_transform(X) - -# classify small against large digits -y = (y > 4).astype(int) - -l1_ratio = 0.5 # L1 weight in the Elastic-Net regularization - -fig, axes = plt.subplots(3, 3) - -# Set regularization parameter -for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): - # Increase tolerance for short training time - clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") - clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") - clf_en_LR = LogisticRegression( - C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 - ) - clf_l1_LR.fit(X, y) - clf_l2_LR.fit(X, y) - clf_en_LR.fit(X, y) - - coef_l1_LR = clf_l1_LR.coef_.ravel() - coef_l2_LR = clf_l2_LR.coef_.ravel() - coef_en_LR = clf_en_LR.coef_.ravel() - - # coef_l1_LR contains zeros due to the - # L1 sparsity inducing norm - - sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100 - sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100 - sparsity_en_LR = np.mean(coef_en_LR == 0) * 100 - - print(f"C={C:.2f}") - print(f"{'Sparsity with L1 penalty:':<40} {sparsity_l1_LR:.2f}%") - print(f"{'Sparsity with Elastic-Net penalty:':<40} {sparsity_en_LR:.2f}%") - print(f"{'Sparsity with L2 penalty:':<40} {sparsity_l2_LR:.2f}%") - print(f"{'Score with L1 penalty:':<40} {clf_l1_LR.score(X, y):.2f}") - print(f"{'Score with Elastic-Net penalty:':<40} {clf_en_LR.score(X, y):.2f}") - print(f"{'Score with L2 penalty:':<40} {clf_l2_LR.score(X, y):.2f}") - - if i == 0: - axes_row[0].set_title("L1 penalty") - axes_row[1].set_title("Elastic-Net\nl1_ratio = %s" % l1_ratio) - axes_row[2].set_title("L2 penalty") - - for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): - ax.imshow( - np.abs(coefs.reshape(8, 8)), - interpolation="nearest", - cmap="binary", - vmax=1, - vmin=0, - ) - ax.set_xticks(()) - ax.set_yticks(()) - - axes_row[0].set_ylabel(f"C = {C}") - -plt.show() diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py deleted file mode 100644 index 46608f683740e..0000000000000 --- a/examples/linear_model/plot_logistic_path.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -============================================== -Regularization path of L1- Logistic Regression 
-============================================== - - -Train l1-penalized logistic regression models on a binary classification -problem derived from the Iris dataset. - -The models are ordered from strongest regularized to least regularized. The 4 -coefficients of the models are collected and plotted as a "regularization -path": on the left-hand side of the figure (strong regularizers), all the -coefficients are exactly 0. When regularization gets progressively looser, -coefficients can get non-zero values one after the other. - -Here we choose the liblinear solver because it can efficiently optimize for the -Logistic Regression loss with a non-smooth, sparsity inducing l1 penalty. - -Also note that we set a low value for the tolerance to make sure that the model -has converged before collecting the coefficients. - -We also use warm_start=True which means that the coefficients of the models are -reused to initialize the next model fit to speed-up the computation of the -full-path. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -# %% -# Load data -# --------- - -from sklearn import datasets - -iris = datasets.load_iris() -X = iris.data -y = iris.target -feature_names = iris.feature_names - -# %% -# Here we remove the third class to make the problem a binary classification -X = X[y != 2] -y = y[y != 2] - -# %% -# Compute regularization path -# --------------------------- - -import numpy as np - -from sklearn.linear_model import LogisticRegression -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.svm import l1_min_c - -cs = l1_min_c(X, y, loss="log") * np.logspace(0, 1, 16) - -# %% -# Create a pipeline with `StandardScaler` and `LogisticRegression`, to normalize -# the data before fitting a linear model, in order to speed-up convergence and -# make the coefficients comparable. Also, as a side effect, since the data is now -# centered around 0, we don't need to fit an intercept. -clf = make_pipeline( - StandardScaler(), - LogisticRegression( - penalty="l1", - solver="liblinear", - tol=1e-6, - max_iter=int(1e6), - warm_start=True, - fit_intercept=False, - ), -) -coefs_ = [] -for c in cs: - clf.set_params(logisticregression__C=c) - clf.fit(X, y) - coefs_.append(clf["logisticregression"].coef_.ravel().copy()) - -coefs_ = np.array(coefs_) - -# %% -# Plot regularization path -# ------------------------ - -import matplotlib.pyplot as plt - -# Colorblind-friendly palette (IBM Color Blind Safe palette) -colors = ["#648FFF", "#785EF0", "#DC267F", "#FE6100"] - -plt.figure(figsize=(10, 6)) -for i in range(coefs_.shape[1]): - plt.semilogx(cs, coefs_[:, i], marker="o", color=colors[i], label=feature_names[i]) - -ymin, ymax = plt.ylim() -plt.xlabel("C") -plt.ylabel("Coefficients") -plt.title("Logistic Regression Path") -plt.legend() -plt.axis("tight") -plt.show() diff --git a/examples/linear_model/plot_regularization.py b/examples/linear_model/plot_regularization.py new file mode 100644 index 0000000000000..610a59b3736dd --- /dev/null +++ b/examples/linear_model/plot_regularization.py @@ -0,0 +1,461 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +""" +================================================= +Regularization in Linear and Logistic Regressions +================================================= + +This example explores regularization techniques for linear and logistic regression +in both regression and classification tasks. 
It demonstrates how the
+regularization parameter :math:`\\alpha` can be adjusted to control the magnitude
+of the trained coefficients :math:`w` and reduce overfitting.
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.datasets import make_classification, make_regression
+from sklearn.linear_model import (
+    LogisticRegression,
+    Ridge,
+    enet_path,
+    lars_path,
+    lasso_path,
+)
+from sklearn.metrics import mean_squared_error
+from sklearn.svm import l1_min_c
+
+# %%
+#
+# Regularization in Linear Regression
+# -----------------------------------
+#
+# When performing linear regression on a given dataset
+# :math:`(X, y)`, regularization terms can be added to
+# control the model's complexity and mitigate overfitting.
+# Scikit-learn provides the following regularization techniques:
+#
+# - :class:`~sklearn.linear_model.Lasso`
+# - :class:`~sklearn.linear_model.LassoLars`
+# - :class:`~sklearn.linear_model.Ridge`
+# - :class:`~sklearn.linear_model.ElasticNet` (with `l1_ratio=0.5`
+#   in this example)
+#
+# Mathematically, these models are fitted by minimizing a penalized
+# least-squares objective:
+#
+# .. math::
+#
+#    \min_{w} \frac{1}{2n_{\operatorname{sample}}}
+#    \Vert Xw - y \Vert^2_2 +
+#    \left\{
+#    \begin{array}{cl}
+#    \alpha \Vert w \Vert_1 & \mbox{Lasso(-LARS)} \\
+#    \alpha \Vert w \Vert_2^2 & \mbox{Ridge} \\
+#    \frac{\alpha}{2} \Vert w \Vert_1 +
+#    \frac{\alpha}{4} \Vert w \Vert^2_2 & \mbox{Elastic Net} \\
+#    \end{array}
+#    \right.
+#
+# Thus, the Lasso model (resp. Ridge model) includes the :math:`\ell^1`-norm
+# (resp. :math:`\ell^2`-norm) of the regression
+# coefficients in the penalty, while the Elastic Net model
+# incorporates both :math:`\ell^1`- and :math:`\ell^2`-norms.
+#
+# We can interpret the :math:`\ell^p`-norm penalties as imposing different
+# geometries on the coefficients. This is illustrated by plotting the unit circles
+#
+# .. math::
+#
+#    \left\{
+#    \begin{array}{cl}
+#    \Vert w \Vert_1 &=1 \\
+#    \Vert w \Vert_2^2 &=1 \\
+#    0.5 \Vert w \Vert_1 + 0.25 \Vert w \Vert_2^2 &= 1 \\
+#    \end{array}
+#    \right.
+#
+# in :math:`\mathbb{R}^2`:
+line = np.linspace(-1.2, 1.2, 1001)
+xx, yy = np.meshgrid(line, line)
+
+l1 = np.abs(xx) + np.abs(yy)
+l2 = xx**2 + yy**2
+elastic_net = 0.5 * l1 + 0.25 * l2
+
+plt.figure()
+ax = plt.gca()
+
+l1_contour = plt.contour(xx, yy, l1, levels=[1], colors="#0072B2")
+l2_contour = plt.contour(xx, yy, l2, levels=[1], colors="#D55E00")
+elastic_net_contour = plt.contour(xx, yy, elastic_net, levels=[1], colors="#009E73")
+
+ax.set_aspect("equal")
+ax.spines["left"].set_position("center")
+ax.spines["right"].set_color("none")
+ax.spines["bottom"].set_position("center")
+ax.spines["top"].set_color("none")
+
+plt.clabel(l1_contour, inline=1, fmt={1.0: r"$\ell^1$"}, manual=[(-1, -1)])
+plt.clabel(l2_contour, inline=1, fmt={1.0: r"$\ell^2$"}, manual=[(-1, -1)])
+plt.clabel(
+    elastic_net_contour,
+    inline=1,
+    fmt={1.0: "Elastic Net"},
+    manual=[(1, -2)],
+)
+
+plt.title(r"Unit Circles in $\mathbb{R}^2$")
+
+plt.tight_layout()
+_ = plt.show()
+
+# %%
+# Algebraically, the solution of this optimization problem depends
+# on :math:`\alpha`. For example, in Lasso, a large :math:`\alpha` gives
+# the penalty term more weight, which in turn forces the norm
+# :math:`\Vert w \Vert_1`
+# to stay small. Conversely, a smaller :math:`\alpha` allows the norm
+# :math:`\Vert w \Vert_1`
+# to grow larger.
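A minimal, self-contained sketch of this effect (the toy dataset and the two
`alpha` values below are illustrative choices, not part of the example above):
fit :class:`~sklearn.linear_model.Lasso` at a weak and at a strong regularization
strength and compare the resulting coefficients::

    import numpy as np

    from sklearn.datasets import make_regression
    from sklearn.linear_model import Lasso

    X_demo, y_demo = make_regression(n_samples=200, n_features=10, random_state=0)

    for alpha in (0.1, 10.0):
        coef = Lasso(alpha=alpha).fit(X_demo, y_demo).coef_
        print(
            f"alpha={alpha:>4}: ||w||_1 = {np.abs(coef).sum():8.2f}, "
            f"non-zero coefficients = {np.count_nonzero(coef)}"
        )

A larger `alpha` should yield a smaller :math:`\Vert w \Vert_1` and, typically,
more coefficients that are exactly zero.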
+#
+# This suggests that the regression coefficients :math:`w` evolve as
+# :math:`\alpha` increases, and we are interested in how
+# :math:`w` behaves across a range of :math:`\alpha` values. This is known
+# as the **regularization path**: a list of :math:`w` values corresponding to
+# different :math:`\alpha` values, ranging from small to large.
+#
+# In this example, we plot the regularization paths to show how the magnitudes of
+# the coefficients change as the regularization parameter :math:`\alpha` increases.
+# This demonstrates how model complexity varies with :math:`\alpha`. We then compare
+# the trained coefficients with the true coefficients used to generate the training set,
+# illustrating how regularization helps mitigate overfitting.
+#
+# Creating a Noise-free Regression Dataset
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# We generate a toy dataset with 400 samples and 10 features, suitable for
+# regression analysis. Since the data is noise-free in this example,
+# we can expect our regression model to recover the true coefficients `w` exactly.
+
+X, y, w = make_regression(n_samples=400, n_features=10, coef=True, random_state=42)
+
+# %%
+#
+# Impact of Regularization Parameter on Model Complexity
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# Lasso(-LARS) and Elastic Net Models
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Scikit-learn provides the following functions to compute multiple
+# :math:`w` values for various :math:`\alpha` values efficiently:
+#
+# - :func:`~sklearn.linear_model.lasso_path`
+# - :func:`~sklearn.linear_model.lars_path`
+# - :func:`~sklearn.linear_model.enet_path` with `l1_ratio=0.5`
+#
+
+eps = 3e-4  # the smaller it is, the longer the path
+
+alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps)
+
+alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=0.5)
+
+# %%
+# The :func:`~sklearn.linear_model.lasso_path` and
+# :func:`~sklearn.linear_model.enet_path` functions compute
+# :math:`w` with **coordinate descent**: for each entry of :math:`w`,
+# the function solves for its optimal value while keeping the others
+# fixed. Since the algorithm iterates until convergence, Lasso does not
+# run in a fixed number of steps determined by the dataset's size,
+# which can make it take longer to run.
+# In contrast, the Lasso-LARS model computes the Lasso solution in a bounded
+# number of steps.
+
+alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso")
+
+# %%
+# The Lasso-LARS model uses the **Least Angle Regression (LARS)** algorithm
+# (see [1]_ Algorithm 3.2 on page 74) to compute the Lasso solution in
+# :math:`\min \left\{
+# n_{\operatorname{sample}}-1, n_{\operatorname{feature}}
+# \right\}`
+# steps. This provides an efficient algorithm for computing the entire Lasso path, and
+# is implemented as :class:`~sklearn.linear_model.LassoLars`
+# and :func:`~sklearn.linear_model.lars_path`.
+#
+# Ridge Model
+# ~~~~~~~~~~~
+#
+# Next, we compute the coefficients for the Ridge model using the :math:`\alpha`
+# values from Elastic Net:
+
+coefs_ridge = []
+for a in alphas_enet:
+    ridge = Ridge(alpha=a)
+    ridge.fit(X, y)
+    coefs_ridge.append(ridge.coef_)
+
+coefs_ridge = np.asarray(coefs_ridge)
+
+# %%
+# Plotting the Regularization Paths
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# We now visualize the regularization paths for this dataset.
+# Each model is represented by 10 curves, one per feature in the
+# dataset.
Each curve shows how a particular coefficient :math:`w_i` changes as +# :math:`\alpha` increases. + +model_names = ["Lasso", "Lasso-LARS", "Elastic Net", "Ridge"] + +model_coefficients = [coefs_lasso.T, coefs_lars.T, coefs_enet.T, coefs_ridge] + +model_alphas = [alphas_lasso, alphas_lars, alphas_enet, alphas_enet] + +fig, axes = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(25, 10)) + +for i in range(len(model_names)): + for j in range(len(model_names)): + if i == j: + axes[i, i].semilogx(model_alphas[i], model_coefficients[i], c="#0072B2") + + axes[i, i].set_title(f"{model_names[i]} Paths", fontsize=14) + + elif j < i: + l1 = axes[i, j].semilogx( + model_alphas[i], model_coefficients[i], c="#0072B2" + ) + + l2 = axes[i, j].semilogx( + model_alphas[j], model_coefficients[j], linestyle="--", c="#D55E00" + ) + + axes[i, j].set_title( + f"{model_names[j]} vs {model_names[i]} Paths", fontsize=14 + ) + + axes[i, j].legend( + (l1[-1], l2[-1]), + (f"{model_names[i]}", f"{model_names[j]}"), + loc="upper right", + ) + + else: + fig.delaxes(axes[i, j]) + +fig.text(0.5, 0.02, r"$\alpha$", fontsize=18, ha="center") +fig.text(0, 0.5, "Coefficients", fontsize=18, va="center", rotation=90) + +fig.suptitle( + "Comparing Regularization Paths: Lasso(-LARS), Ridge, and Elastic Net", fontsize=20 +) + +fig.tight_layout(pad=3.0) +_ = plt.show() + +# %% +# +# * In the "Lasso vs Lasso-LARS Paths" visual, +# the Lasso and Lasso-LARS paths appear identical towards the end +# because both models solve the same constrained problem. +# However, Lasso-LARS reaches the solution faster than Lasso. +# +# * The "Lasso vs Elastic-Net Paths" visual is more notable. +# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. +# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards +# the end. +# +# * In the "Lasso(-LARS) vs Ridge Paths" and "Elastic Net vs Ridge Paths" visuals, the +# Ridge model focuses on shrinking all coefficients uniformly, rather than setting +# some to exactly zero. As a result, the Ridge model retains all features after +# training, unlike the Lasso(-LARS) or Elastic Net models. +# +# This demonstrates how different regularization techniques govern +# the model's complexity: +# +# 1. the :math:`\ell^1`-norm constraint encourages sparsity in the solution. +# +# 2. the :math:`\ell^2`-norm constraint focuses on shrinkage of the magnitude +# of the solution. +# +# 3. the Elastic Net constraint provides a balanced compromise. +# +# Mitigating Overfitting with Regularization +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Recall that the true coefficient `w` refers to the coefficients of the linear model +# used to generate the training dataset. In this section, we compare the trained +# coefficients of Lasso(-LARS), Ridge, and Elastic Net with `w` to demonstrate how +# regularization can mitigate overfitting. This is achieved by computing the +# :func:`~sklearn.metrics.mean_squared_error` (MSE) between the true and trained +# coefficients. 
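Before computing the MSE below, a quick supplementary check (not part of the
original example; it reuses the `coefs_lasso`, `coefs_enet` and `coefs_ridge`
arrays computed above) makes the sparsity-versus-shrinkage contrast concrete by
counting, for each model, the largest number of exactly-zero coefficients reached
anywhere along its path::

    n_features = coefs_lasso.shape[0]

    # Per-alpha counts of coefficients that are exactly zero.
    zeros_lasso = (coefs_lasso == 0).sum(axis=0)
    zeros_enet = (coefs_enet == 0).sum(axis=0)
    zeros_ridge = (coefs_ridge == 0).sum(axis=1)

    print(f"Lasso:       up to {zeros_lasso.max()} / {n_features} zero coefficients")
    print(f"Elastic Net: up to {zeros_enet.max()} / {n_features} zero coefficients")
    print(f"Ridge:       up to {zeros_ridge.max()} / {n_features} zero coefficients")

With an :math:`\ell^1` term in the penalty, the count is expected to reach the full
number of features at the strongest regularization, whereas Ridge typically keeps
every coefficient non-zero.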
+
+lasso_mse, lars_mse, enet_mse, ridge_mse = [], [], [], []
+
+for coef_lasso, coef_enet, coef_ridge in zip(coefs_lasso.T, coefs_enet.T, coefs_ridge):
+    lasso_mse.append(mean_squared_error(coef_lasso, w))
+    enet_mse.append(mean_squared_error(coef_enet, w))
+    ridge_mse.append(mean_squared_error(coef_ridge, w))
+
+for coef_lars in coefs_lars.T:
+    lars_mse.append(mean_squared_error(coef_lars, w))
+
+lasso_mse = np.asarray(lasso_mse)
+lars_mse = np.asarray(lars_mse)
+enet_mse = np.asarray(enet_mse)
+ridge_mse = np.asarray(ridge_mse)
+
+# %%
+#
+# The idea is that a smaller MSE between the true and trained coefficients implies
+# greater similarity between them: the trained model recovers the pattern that
+# generated the training data well. With noisy data, however, coefficients that
+# follow the training data too closely also capture the noise, and the resulting
+# model may not perform well on unseen data. This is essentially the
+# overfitting problem.
+#
+# The following visualization shows how the MSE changes for each trained
+# model as the regularization parameter :math:`\alpha` increases.
+
+plt.figure()
+l1 = plt.semilogx(alphas_lasso, lasso_mse.T, c="#0072B2")
+l2 = plt.semilogx(alphas_lars, lars_mse.T, c="#D55E00")
+l3 = plt.semilogx(alphas_enet, enet_mse.T, c="#009E73")
+l4 = plt.semilogx(alphas_enet, ridge_mse, c="#F0E442")
+
+plt.xlabel(r"$\alpha$")
+plt.ylabel("Mean Squared Error")
+plt.title("Coefficient Error Across Regularization Strengths")
+plt.legend(
+    (l1[-1], l2[-1], l3[-1], l4[-1]),
+    ("Lasso", "Lasso-LARS", "Elastic Net", "Ridge"),
+    loc="upper left",
+)
+
+plt.axis("tight")
+_ = plt.show()
+
+# %%
+#
+# In the visualization, for small values of :math:`\alpha`, since our synthetic data is
+# noise-free, the trained coefficients of Lasso(-LARS), Ridge, and Elastic Net are
+# similar to the true coefficients `w` (with MSE close to 0). This indicates that the
+# models capture the intricate details of the training data well.
+#
+# As :math:`\alpha` increases, the MSE also increases. Stronger regularization can
+# improve the models' ability to generalize to unseen data (e.g., if the data were
+# noisy), but it also risks degrading model performance if the regularization
+# becomes too strong.
+#
+# Regularization in Logistic Regression
+# -------------------------------------
+#
+# Regularization can also be applied to Logistic Regression when working on
+# classification tasks. scikit-learn's :class:`~sklearn.linear_model.LogisticRegression`
+# enables users to apply regularization using the `penalty` parameter:
+#
+# * `l1`: :math:`\ell^1`-regularization, similar to the Lasso model
+# * `l2`: :math:`\ell^2`-regularization, similar to the Ridge model
+# * `elasticnet`: Combined with the `l1_ratio` parameter for a mix of :math:`\ell^1`
+#   and :math:`\ell^2`
+#
+# Additionally, the `C` parameter is the inverse of the regularization strength:
+# smaller values of `C` apply stronger regularization.
+#
+# We demonstrate the effect of regularization by creating a synthetic classification
+# dataset.
+#
+
+X, y = make_classification(
+    n_samples=400,
+    n_features=64,
+    n_informative=64,
+    n_redundant=0,
+    n_classes=2,
+    n_clusters_per_class=1,
+    random_state=42,
+)
+
+# %%
+#
+# In this synthetic binary classification dataset, there are 400 samples,
+# each with 64 features. This toy dataset is noise-free to maintain consistency with
+# our earlier regression example.
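As a supplementary illustration (not part of the original example; the two `C`
values below are arbitrary), fitting an :math:`\ell^1`-penalized
:class:`~sklearn.linear_model.LogisticRegression` on this dataset at a weak and at
a strong regularization strength shows how a small `C` drives coefficients to
exactly zero, which the discussion that follows makes precise::

    for C in (100.0, 0.01):
        clf = LogisticRegression(C=C, penalty="l1", solver="saga", tol=0.01)
        clf.fit(X, y)
        sparsity = np.mean(clf.coef_ == 0) * 100
        print(f"C={C:>6}: {sparsity:.1f}% of coefficients are exactly zero")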
+# +# As noted in the regression example, :math:`\ell^1`-regularization may set some +# coefficients exactly to zero. For extreme values of `C`, the trained coefficients +# may even become the zero vector. To address this, scikit-learn provides the +# :func:`~sklearn.svm.l1_min_c` function, which computes the minimum value of the +# regularization strength `C` at which the model begins to learn meaningful patterns +# (i.e., some coefficients become non-zero). +# + +cs = l1_min_c(X, y, loss="log") * np.logspace(0, 10, 16) + +# %% +# +# We now plot heatmaps to represent the sparsity for each `penalty` and each value +# of `C`. +# + +l1_ratio = 0.5 # l1 weight in the Elastic-Net regularization + +fig, axes = plt.subplots(3, 3) + +# Set regularization parameter +for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): + # Increase tolerance for short training time + clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") + clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") + clf_en_LR = LogisticRegression( + C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 + ) + clf_l1_LR.fit(X, y) + clf_l2_LR.fit(X, y) + clf_en_LR.fit(X, y) + + coef_l1_LR = clf_l1_LR.coef_.ravel() + coef_l2_LR = clf_l2_LR.coef_.ravel() + coef_en_LR = clf_en_LR.coef_.ravel() + + sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100 + sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100 + sparsity_en_LR = np.mean(coef_en_LR == 0) * 100 + + if i == 0: + axes_row[0].set_title(r"$\ell^1$ penalty") + axes_row[1].set_title(f"Elastic-Net\n {l1_ratio = }") + axes_row[2].set_title(r"$\ell^2$ penalty") + + for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): + ax.imshow( + np.abs(coefs.reshape(8, 8)), + interpolation="nearest", + cmap="binary", + vmax=1, + vmin=0, + ) + ax.set_xticks(()) + ax.set_yticks(()) + + axes_row[0].set_ylabel(f"C = {C:.2f}") + +_ = plt.show() +# %% +# +# Each heatmap organizes the 64 coefficients (the number of features in our synthetic +# classification dataset) into an 8×8 grid. It is constructed by taking the absolute +# values of the coefficients and displaying them in a black-and-white scale, where +# lower values appear white and higher values appear black. +# +# We can see that larger values of `C` (i.e., weaker regularization) give the model +# more freedom, while smaller values of `C` impose stronger constraints, leading to +# increased sparsity. As expected, the Elastic-Net penalty results in a level of +# sparsity between that of :math:`\ell^1` and :math:`\ell^2`. +# +# .. rubric:: References +# +# .. [1] Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical +# Learning: Data Mining, Inference, and Prediction. New York, +# NY: Springer New York. diff --git a/examples/linear_model/plot_ridge_coeffs.py b/examples/linear_model/plot_ridge_coeffs.py deleted file mode 100644 index 1ad7962f8bfa3..0000000000000 --- a/examples/linear_model/plot_ridge_coeffs.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -========================================================= -Ridge coefficients as a function of the L2 Regularization -========================================================= - -A model that overfits learns the training data too well, capturing both the -underlying patterns and the noise in the data. However, when applied to unseen -data, the learned associations may not hold. 
We normally detect this when we -apply our trained predictions to the test data and see the statistical -performance drop significantly compared to the training data. - -One way to overcome overfitting is through regularization, which can be done by -penalizing large weights (coefficients) in linear models, forcing the model to -shrink all coefficients. Regularization reduces a model's reliance on specific -information obtained from the training samples. - -This example illustrates how L2 regularization in a -:class:`~sklearn.linear_model.Ridge` regression affects a model's performance by -adding a penalty term to the loss that increases with the coefficients -:math:`\\beta`. - -The regularized loss function is given by: :math:`\\mathcal{L}(X, y, \\beta) = -\\| y - X \\beta \\|^{2}_{2} + \\alpha \\| \\beta \\|^{2}_{2}` - -where :math:`X` is the input data, :math:`y` is the target variable, -:math:`\\beta` is the vector of coefficients associated with the features, and -:math:`\\alpha` is the regularization strength. - -The regularized loss function aims to balance the trade-off between accurately -predicting the training set and to prevent overfitting. - -In this regularized loss, the left-hand side (e.g. :math:`\\|y - -X\\beta\\|^{2}_{2}`) measures the squared difference between the actual target -variable, :math:`y`, and the predicted values. Minimizing this term alone could -lead to overfitting, as the model may become too complex and sensitive to noise -in the training data. - -To address overfitting, Ridge regularization adds a constraint, called a penalty -term, (:math:`\\alpha \\| \\beta\\|^{2}_{2}`) to the loss function. This penalty -term is the sum of the squares of the model's coefficients, multiplied by the -regularization strength :math:`\\alpha`. By introducing this constraint, Ridge -regularization discourages any single coefficient :math:`\\beta_{i}` from taking -an excessively large value and encourages smaller and more evenly distributed -coefficients. Higher values of :math:`\\alpha` force the coefficients towards -zero. However, an excessively high :math:`\\alpha` can result in an underfit -model that fails to capture important patterns in the data. - -Therefore, the regularized loss function combines the prediction accuracy term -and the penalty term. By adjusting the regularization strength, practitioners -can fine-tune the degree of constraint imposed on the weights, training a model -capable of generalizing well to unseen data while avoiding overfitting. -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -# %% -# Purpose of this example -# ----------------------- -# For the purpose of showing how Ridge regularization works, we will create a -# non-noisy data set. Then we will train a regularized model on a range of -# regularization strengths (:math:`\alpha`) and plot how the trained -# coefficients and the mean squared error between those and the original values -# behave as functions of the regularization strength. -# -# Creating a non-noisy data set -# ***************************** -# We make a toy data set with 100 samples and 10 features, that's suitable to -# detect regression. Out of the 10 features, 8 are informative and contribute to -# the regression, while the remaining 2 features do not have any effect on the -# target variable (their true coefficients are 0). Please note that in this -# example the data is non-noisy, hence we can expect our regression model to -# recover exactly the true coefficients w. 
-from sklearn.datasets import make_regression - -X, y, w = make_regression( - n_samples=100, n_features=10, n_informative=8, coef=True, random_state=1 -) - -# Obtain the true coefficients -print(f"The true coefficient of this regression problem are:\n{w}") - -# %% -# Training the Ridge Regressor -# **************************** -# We use :class:`~sklearn.linear_model.Ridge`, a linear model with L2 -# regularization. We train several models, each with a different value for the -# model parameter `alpha`, which is a positive constant that multiplies the -# penalty term, controlling the regularization strength. For each trained model -# we then compute the error between the true coefficients `w` and the -# coefficients found by the model `clf`. We store the identified coefficients -# and the calculated errors for the corresponding coefficients in lists, which -# makes it convenient for us to plot them. -import numpy as np - -from sklearn.linear_model import Ridge -from sklearn.metrics import mean_squared_error - -clf = Ridge() - -# Generate values for `alpha` that are evenly distributed on a logarithmic scale -alphas = np.logspace(-3, 4, 200) -coefs = [] -errors_coefs = [] - -# Train the model with different regularisation strengths -for a in alphas: - clf.set_params(alpha=a).fit(X, y) - coefs.append(clf.coef_) - errors_coefs.append(mean_squared_error(clf.coef_, w)) - -# %% -# Plotting trained Coefficients and Mean Squared Errors -# ***************************************************** -# We now plot the 10 different regularized coefficients as a function of the -# regularization parameter `alpha` where each color represents a different -# coefficient. -# -# On the right-hand-side, we plot how the errors of the coefficients from the -# estimator change as a function of regularization. -import matplotlib.pyplot as plt -import pandas as pd - -alphas = pd.Index(alphas, name="alpha") -coefs = pd.DataFrame(coefs, index=alphas, columns=[f"Feature {i}" for i in range(10)]) -errors = pd.Series(errors_coefs, index=alphas, name="Mean squared error") - -fig, axs = plt.subplots(1, 2, figsize=(20, 6)) - -coefs.plot( - ax=axs[0], - logx=True, - title="Ridge coefficients as a function of the regularization strength", -) -axs[0].set_ylabel("Ridge coefficient values") -errors.plot( - ax=axs[1], - logx=True, - title="Coefficient error as a function of the regularization strength", -) -_ = axs[1].set_ylabel("Mean squared error") -# %% -# Interpreting the plots -# ********************** -# The plot on the left-hand side shows how the regularization strength (`alpha`) -# affects the Ridge regression coefficients. Smaller values of `alpha` (weak -# regularization), allow the coefficients to closely resemble the true -# coefficients (`w`) used to generate the data set. This is because no -# additional noise was added to our artificial data set. As `alpha` increases, -# the coefficients shrink towards zero, gradually reducing the impact of the -# features that were formerly more significant. -# -# The right-hand side plot shows the mean squared error (MSE) between the -# coefficients found by the model and the true coefficients (`w`). It provides a -# measure that relates to how exact our ridge model is in comparison to the true -# generative model. A low error means that it found coefficients closer to the -# ones of the true generative model. 
In this case, since our toy data set was -# non-noisy, we can see that the least regularized model retrieves coefficients -# closest to the true coefficients (`w`) (error is close to 0). -# -# When `alpha` is small, the model captures the intricate details of the -# training data, whether those were caused by noise or by actual information. As -# `alpha` increases, the highest coefficients shrink more rapidly, rendering -# their corresponding features less influential in the training process. This -# can enhance a model's ability to generalize to unseen data (if there was a lot -# of noise to capture), but it also poses the risk of losing performance if the -# regularization becomes too strong compared to the amount of noise the data -# contained (as in this example). -# -# In real-world scenarios where data typically includes noise, selecting an -# appropriate `alpha` value becomes crucial in striking a balance between an -# overfitting and an underfitting model. -# -# Here, we saw that :class:`~sklearn.linear_model.Ridge` adds a penalty to the -# coefficients to fight overfitting. Another problem that occurs is linked to -# the presence of outliers in the training dataset. An outlier is a data point -# that differs significantly from other observations. Concretely, these outliers -# impact the left-hand side term of the loss function that we showed earlier. -# Some other linear models are formulated to be robust to outliers such as the -# :class:`~sklearn.linear_model.HuberRegressor`. You can learn more about it in -# the :ref:`sphx_glr_auto_examples_linear_model_plot_huber_vs_ridge.py` example. diff --git a/examples/linear_model/plot_ridge_path.py b/examples/linear_model/plot_ridge_path.py deleted file mode 100644 index d3c19acd9e18c..0000000000000 --- a/examples/linear_model/plot_ridge_path.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -=========================================================== -Plot Ridge coefficients as a function of the regularization -=========================================================== - -Shows the effect of collinearity in the coefficients of an estimator. - -.. currentmodule:: sklearn.linear_model - -:class:`Ridge` Regression is the estimator used in this example. -Each color represents a different feature of the -coefficient vector, and this is displayed as a function of the -regularization parameter. - -This example also shows the usefulness of applying Ridge regression -to highly ill-conditioned matrices. For such matrices, a slight -change in the target variable can cause huge variances in the -calculated weights. In such cases, it is useful to set a certain -regularization (alpha) to reduce this variation (noise). - -When alpha is very large, the regularization effect dominates the -squared loss function and the coefficients tend to zero. -At the end of the path, as alpha tends toward zero -and the solution tends towards the ordinary least squares, coefficients -exhibit big oscillations. In practise it is necessary to tune alpha -in such a way that a balance is maintained between both. 
- -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -from sklearn import linear_model - -# X is the 10x10 Hilbert matrix -X = 1.0 / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis]) -y = np.ones(10) - -# %% -# Compute paths -# ------------- - -n_alphas = 200 -alphas = np.logspace(-10, -2, n_alphas) - -coefs = [] -for a in alphas: - ridge = linear_model.Ridge(alpha=a, fit_intercept=False) - ridge.fit(X, y) - coefs.append(ridge.coef_) - -# %% -# Display results -# --------------- - -ax = plt.gca() - -ax.plot(alphas, coefs) -ax.set_xscale("log") -ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis -plt.xlabel("alpha") -plt.ylabel("weights") -plt.title("Ridge coefficients as a function of the regularization") -plt.axis("tight") -plt.show() diff --git a/examples/linear_model/plot_sgd_penalties.py b/examples/linear_model/plot_sgd_penalties.py deleted file mode 100644 index 6f8830b52fe7a..0000000000000 --- a/examples/linear_model/plot_sgd_penalties.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -============== -SGD: Penalties -============== - -Contours of where the penalty is equal to 1 -for the three penalties L1, L2 and elastic-net. - -All of the above are supported by :class:`~sklearn.linear_model.SGDClassifier` -and :class:`~sklearn.linear_model.SGDRegressor`. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -l1_color = "navy" -l2_color = "c" -elastic_net_color = "darkorange" - -line = np.linspace(-1.5, 1.5, 1001) -xx, yy = np.meshgrid(line, line) - -l2 = xx**2 + yy**2 -l1 = np.abs(xx) + np.abs(yy) -rho = 0.5 -elastic_net = rho * l1 + (1 - rho) * l2 - -plt.figure(figsize=(10, 10), dpi=100) -ax = plt.gca() - -elastic_net_contour = plt.contour( - xx, yy, elastic_net, levels=[1], colors=elastic_net_color -) -l2_contour = plt.contour(xx, yy, l2, levels=[1], colors=l2_color) -l1_contour = plt.contour(xx, yy, l1, levels=[1], colors=l1_color) -ax.set_aspect("equal") -ax.spines["left"].set_position("center") -ax.spines["right"].set_color("none") -ax.spines["bottom"].set_position("center") -ax.spines["top"].set_color("none") - -plt.clabel( - elastic_net_contour, - inline=1, - fontsize=18, - fmt={1.0: "elastic-net"}, - manual=[(-1, -1)], -) -plt.clabel(l2_contour, inline=1, fontsize=18, fmt={1.0: "L2"}, manual=[(-1, -1)]) -plt.clabel(l1_contour, inline=1, fontsize=18, fmt={1.0: "L1"}, manual=[(-1, -1)]) - -plt.tight_layout() -plt.show() diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index b98cf08925910..5c43c8b04ec20 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -319,8 +319,8 @@ def lasso_path( Notes ----- For an example, see - :ref:`examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py - `. + :ref:`examples/linear_model/plot_regularization.py + `. To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a Fortran-contiguous numpy array. @@ -524,8 +524,8 @@ def enet_path( Notes ----- For an example, see - :ref:`examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py - `. + :ref:`examples/linear_model/plot_regularization.py + `. Examples --------