diff --git a/doc/conf.py b/doc/conf.py index f749b188b3274..7f8b011746852 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -477,10 +477,28 @@ def add_js_css_files(app, pagename, templatename, context, doctree): "auto_examples/model_selection/plot_nested_cross_validation_iris" ), "auto_examples/linear_model/plot_lasso_lars": ( - "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path" + "auto_examples/linear_model/plot_regularization" ), "auto_examples/linear_model/plot_lasso_coordinate_descent_path": ( - "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path" + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_ridge_coeffs": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_ridge_path": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_sgd_penalties": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_logistic_path": ( + "auto_examples/linear_model/plot_regularization" + ), + "auto_examples/linear_model/plot_logistic_l1_l2_sparsity": ( + "auto_examples/linear_model/plot_regularization" ), "auto_examples/cluster/plot_color_quantization": ( "auto_examples/cluster/plot_face_compress" diff --git a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py b/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py deleted file mode 100644 index 44ae64c4c2811..0000000000000 --- a/examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py +++ /dev/null @@ -1,136 +0,0 @@ -""" -======================================== -Lasso, Lasso-LARS, and Elastic Net paths -======================================== - -This example shows how to compute the "paths" of coefficients along the Lasso, -Lasso-LARS, and Elastic Net regularization paths. In other words, it shows the -relationship between the regularization parameter (alpha) and the coefficients. - -Lasso and Lasso-LARS impose a sparsity constraint on the coefficients, -encouraging some of them to be zero. Elastic Net is a generalization of -Lasso that adds an L2 penalty term to the L1 penalty term. This allows for -some coefficients to be non-zero while still encouraging sparsity. - -Lasso and Elastic Net use a coordinate descent method to compute the paths, while -Lasso-LARS uses the LARS algorithm to compute the paths. - -The paths are computed using :func:`~sklearn.linear_model.lasso_path`, -:func:`~sklearn.linear_model.lars_path`, and :func:`~sklearn.linear_model.enet_path`. - -The results show different comparison plots: - -- Compare Lasso and Lasso-LARS -- Compare Lasso and Elastic Net -- Compare Lasso with positive Lasso -- Compare LARS and Positive LARS -- Compare Elastic Net and positive Elastic Net - -Each plot shows how the model coefficients vary as the regularization strength changes, -offering insight into the behavior of these models -under different constraints. 
-""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -from itertools import cycle - -import matplotlib.pyplot as plt - -from sklearn.datasets import load_diabetes -from sklearn.linear_model import enet_path, lars_path, lasso_path - -X, y = load_diabetes(return_X_y=True) -X /= X.std(axis=0) # Standardize data (easier to set the l1_ratio parameter) - -# Compute paths - -eps = 5e-3 # the smaller it is the longer is the path - -print("Computing regularization path using the lasso...") -alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps) - -print("Computing regularization path using the positive lasso...") -alphas_positive_lasso, coefs_positive_lasso, _ = lasso_path( - X, y, eps=eps, positive=True -) - -print("Computing regularization path using the LARS...") -alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso") - -print("Computing regularization path using the positive LARS...") -alphas_positive_lars, _, coefs_positive_lars = lars_path( - X, y, method="lasso", positive=True -) - -print("Computing regularization path using the elastic net...") -alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=0.8) - -print("Computing regularization path using the positive elastic net...") -alphas_positive_enet, coefs_positive_enet, _ = enet_path( - X, y, eps=eps, l1_ratio=0.8, positive=True -) - -# Display results - -plt.figure(1) -colors = cycle(["b", "r", "g", "c", "k"]) -for coef_lasso, coef_lars, c in zip(coefs_lasso, coefs_lars, colors): - l1 = plt.semilogx(alphas_lasso, coef_lasso, c=c) - l2 = plt.semilogx(alphas_lars, coef_lars, linestyle="--", c=c) - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso and LARS Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "LARS"), loc="lower right") -plt.axis("tight") - -plt.figure(2) -colors = cycle(["b", "r", "g", "c", "k"]) -for coef_l, coef_e, c in zip(coefs_lasso, coefs_enet, colors): - l1 = plt.semilogx(alphas_lasso, coef_l, c=c) - l2 = plt.semilogx(alphas_enet, coef_e, linestyle="--", c=c) - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso and Elastic-Net Paths") -plt.legend((l1[-1], l2[-1]), ("Lasso", "Elastic-Net"), loc="lower right") -plt.axis("tight") - - -plt.figure(3) -for coef_l, coef_pl, c in zip(coefs_lasso, coefs_positive_lasso, colors): - l1 = plt.semilogy(alphas_lasso, coef_l, c=c) - l2 = plt.semilogy(alphas_positive_lasso, coef_pl, linestyle="--", c=c) - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Lasso and positive Lasso") -plt.legend((l1[-1], l2[-1]), ("Lasso", "positive Lasso"), loc="lower right") -plt.axis("tight") - - -plt.figure(4) -colors = cycle(["b", "r", "g", "c", "k"]) -for coef_lars, coef_positive_lars, c in zip(coefs_lars, coefs_positive_lars, colors): - l1 = plt.semilogx(alphas_lars, coef_lars, c=c) - l2 = plt.semilogx(alphas_positive_lars, coef_positive_lars, linestyle="--", c=c) - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("LARS and Positive LARS") -plt.legend((l1[-1], l2[-1]), ("LARS", "Positive LARS"), loc="lower right") -plt.axis("tight") - -plt.figure(5) -for coef_e, coef_pe, c in zip(coefs_enet, coefs_positive_enet, colors): - l1 = plt.semilogx(alphas_enet, coef_e, c=c) - l2 = plt.semilogx(alphas_positive_enet, coef_pe, linestyle="--", c=c) - -plt.xlabel("alpha") -plt.ylabel("coefficients") -plt.title("Elastic-Net and positive Elastic-Net") -plt.legend((l1[-1], l2[-1]), ("Elastic-Net", "positive Elastic-Net"), loc="lower right") -plt.axis("tight") -plt.show() diff --git 
a/examples/linear_model/plot_logistic_l1_l2_sparsity.py b/examples/linear_model/plot_logistic_l1_l2_sparsity.py deleted file mode 100644 index f642dfade5db8..0000000000000 --- a/examples/linear_model/plot_logistic_l1_l2_sparsity.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -============================================== -L1 Penalty and Sparsity in Logistic Regression -============================================== - -Comparison of the sparsity (percentage of zero coefficients) of solutions when -L1, L2 and Elastic-Net penalty are used for different values of C. We can see -that large values of C give more freedom to the model. Conversely, smaller -values of C constrain the model more. In the L1 penalty case, this leads to -sparser solutions. As expected, the Elastic-Net penalty sparsity is between -that of L1 and L2. - -We classify 8x8 images of digits into two classes: 0-4 against 5-9. -The visualization shows coefficients of the models for varying C. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -from sklearn import datasets -from sklearn.linear_model import LogisticRegression -from sklearn.preprocessing import StandardScaler - -X, y = datasets.load_digits(return_X_y=True) - -X = StandardScaler().fit_transform(X) - -# classify small against large digits -y = (y > 4).astype(int) - -l1_ratio = 0.5 # L1 weight in the Elastic-Net regularization - -fig, axes = plt.subplots(3, 3) - -# Set regularization parameter -for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): - # Increase tolerance for short training time - clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") - clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") - clf_en_LR = LogisticRegression( - C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 - ) - clf_l1_LR.fit(X, y) - clf_l2_LR.fit(X, y) - clf_en_LR.fit(X, y) - - coef_l1_LR = clf_l1_LR.coef_.ravel() - coef_l2_LR = clf_l2_LR.coef_.ravel() - coef_en_LR = clf_en_LR.coef_.ravel() - - # coef_l1_LR contains zeros due to the - # L1 sparsity inducing norm - - sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100 - sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100 - sparsity_en_LR = np.mean(coef_en_LR == 0) * 100 - - print(f"C={C:.2f}") - print(f"{'Sparsity with L1 penalty:':<40} {sparsity_l1_LR:.2f}%") - print(f"{'Sparsity with Elastic-Net penalty:':<40} {sparsity_en_LR:.2f}%") - print(f"{'Sparsity with L2 penalty:':<40} {sparsity_l2_LR:.2f}%") - print(f"{'Score with L1 penalty:':<40} {clf_l1_LR.score(X, y):.2f}") - print(f"{'Score with Elastic-Net penalty:':<40} {clf_en_LR.score(X, y):.2f}") - print(f"{'Score with L2 penalty:':<40} {clf_l2_LR.score(X, y):.2f}") - - if i == 0: - axes_row[0].set_title("L1 penalty") - axes_row[1].set_title("Elastic-Net\nl1_ratio = %s" % l1_ratio) - axes_row[2].set_title("L2 penalty") - - for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): - ax.imshow( - np.abs(coefs.reshape(8, 8)), - interpolation="nearest", - cmap="binary", - vmax=1, - vmin=0, - ) - ax.set_xticks(()) - ax.set_yticks(()) - - axes_row[0].set_ylabel(f"C = {C}") - -plt.show() diff --git a/examples/linear_model/plot_logistic_path.py b/examples/linear_model/plot_logistic_path.py deleted file mode 100644 index 46608f683740e..0000000000000 --- a/examples/linear_model/plot_logistic_path.py +++ /dev/null @@ -1,103 +0,0 @@ -""" -============================================== -Regularization path of L1- Logistic Regression 
-============================================== - - -Train l1-penalized logistic regression models on a binary classification -problem derived from the Iris dataset. - -The models are ordered from strongest regularized to least regularized. The 4 -coefficients of the models are collected and plotted as a "regularization -path": on the left-hand side of the figure (strong regularizers), all the -coefficients are exactly 0. When regularization gets progressively looser, -coefficients can get non-zero values one after the other. - -Here we choose the liblinear solver because it can efficiently optimize for the -Logistic Regression loss with a non-smooth, sparsity inducing l1 penalty. - -Also note that we set a low value for the tolerance to make sure that the model -has converged before collecting the coefficients. - -We also use warm_start=True which means that the coefficients of the models are -reused to initialize the next model fit to speed-up the computation of the -full-path. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -# %% -# Load data -# --------- - -from sklearn import datasets - -iris = datasets.load_iris() -X = iris.data -y = iris.target -feature_names = iris.feature_names - -# %% -# Here we remove the third class to make the problem a binary classification -X = X[y != 2] -y = y[y != 2] - -# %% -# Compute regularization path -# --------------------------- - -import numpy as np - -from sklearn.linear_model import LogisticRegression -from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler -from sklearn.svm import l1_min_c - -cs = l1_min_c(X, y, loss="log") * np.logspace(0, 1, 16) - -# %% -# Create a pipeline with `StandardScaler` and `LogisticRegression`, to normalize -# the data before fitting a linear model, in order to speed-up convergence and -# make the coefficients comparable. Also, as a side effect, since the data is now -# centered around 0, we don't need to fit an intercept. -clf = make_pipeline( - StandardScaler(), - LogisticRegression( - penalty="l1", - solver="liblinear", - tol=1e-6, - max_iter=int(1e6), - warm_start=True, - fit_intercept=False, - ), -) -coefs_ = [] -for c in cs: - clf.set_params(logisticregression__C=c) - clf.fit(X, y) - coefs_.append(clf["logisticregression"].coef_.ravel().copy()) - -coefs_ = np.array(coefs_) - -# %% -# Plot regularization path -# ------------------------ - -import matplotlib.pyplot as plt - -# Colorblind-friendly palette (IBM Color Blind Safe palette) -colors = ["#648FFF", "#785EF0", "#DC267F", "#FE6100"] - -plt.figure(figsize=(10, 6)) -for i in range(coefs_.shape[1]): - plt.semilogx(cs, coefs_[:, i], marker="o", color=colors[i], label=feature_names[i]) - -ymin, ymax = plt.ylim() -plt.xlabel("C") -plt.ylabel("Coefficients") -plt.title("Logistic Regression Path") -plt.legend() -plt.axis("tight") -plt.show() diff --git a/examples/linear_model/plot_regularization.py b/examples/linear_model/plot_regularization.py new file mode 100644 index 0000000000000..610a59b3736dd --- /dev/null +++ b/examples/linear_model/plot_regularization.py @@ -0,0 +1,461 @@ +# Authors: The scikit-learn developers +# SPDX-License-Identifier: BSD-3-Clause + +""" +================================================= +Regularization in Linear and Logistic Regressions +================================================= + +This example explores regularization techniques for linear and logistic regression +in both regression and classification tasks. 
It demonstrates how the
+regularization parameter :math:`\\alpha` can be adjusted to control the magnitude
+of the trained coefficients :math:`w` and reduce overfitting.
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.datasets import make_classification, make_regression
+from sklearn.linear_model import (
+    LogisticRegression,
+    Ridge,
+    enet_path,
+    lars_path,
+    lasso_path,
+)
+from sklearn.metrics import mean_squared_error
+from sklearn.svm import l1_min_c
+
+# %%
+#
+# Regularization in Linear Regression
+# -----------------------------------
+#
+# When performing linear regression on a given dataset
+# :math:`(X, y)`, regularization terms can be added to
+# control the model's complexity and mitigate overfitting.
+# Scikit-learn provides the following regularization techniques:
+#
+# - :class:`~sklearn.linear_model.Lasso`
+# - :class:`~sklearn.linear_model.LassoLars`
+# - :class:`~sklearn.linear_model.Ridge`
+# - :class:`~sklearn.linear_model.ElasticNet` (with `l1_ratio=0.5`
+#   in this example)
+#
+# Mathematically, these models are fitted by minimizing a penalized
+# least-squares objective:
+#
+# .. math::
+#
+#    \min_{w} \frac{1}{2n_{\operatorname{sample}}}
+#    \Vert Xw - y \Vert^2_2 +
+#    \left\{
+#    \begin{array}{cl}
+#    \alpha \Vert w \Vert_1 & \mbox{Lasso(-LARS)} \\
+#    \alpha \Vert w \Vert_2^2 & \mbox{Ridge} \\
+#    \frac{\alpha}{2} \Vert w \Vert_1 +
+#    \frac{\alpha}{4} \Vert w \Vert^2_2 & \mbox{Elastic Net} \\
+#    \end{array}
+#    \right.
+#
+# Thus, the Lasso model (resp. Ridge model) includes the :math:`\ell^1`-norm
+# (resp. :math:`\ell^2`-norm) of the regression
+# coefficients in the penalty, while the Elastic Net model
+# incorporates both :math:`\ell^1`- and :math:`\ell^2`-norms.
+#
+# We can interpret the :math:`\ell^p`-norm penalties as imposing different
+# geometries on the coefficients. This is illustrated by plotting the unit circles
+#
+# .. math::
+#
+#    \left\{
+#    \begin{array}{cl}
+#    \Vert w \Vert_1 &=1 \\
+#    \Vert w \Vert_2^2 &=1 \\
+#    0.5 \Vert w \Vert_1 + 0.25 \Vert w \Vert_2^2 &= 1 \\
+#    \end{array}
+#    \right.
+#
+# in :math:`\mathbb{R}^2`:
+line = np.linspace(-1.2, 1.2, 1001)
+xx, yy = np.meshgrid(line, line)
+
+l1 = np.abs(xx) + np.abs(yy)
+l2 = xx**2 + yy**2
+elastic_net = 0.5 * l1 + 0.25 * l2
+
+plt.figure()
+ax = plt.gca()
+
+l1_contour = plt.contour(xx, yy, l1, levels=[1], colors="#0072B2")
+l2_contour = plt.contour(xx, yy, l2, levels=[1], colors="#D55E00")
+elastic_net_contour = plt.contour(xx, yy, elastic_net, levels=[1], colors="#009E73")
+
+ax.set_aspect("equal")
+ax.spines["left"].set_position("center")
+ax.spines["right"].set_color("none")
+ax.spines["bottom"].set_position("center")
+ax.spines["top"].set_color("none")
+
+plt.clabel(l1_contour, inline=1, fmt={1.0: r"$\ell^1$"}, manual=[(-1, -1)])
+plt.clabel(l2_contour, inline=1, fmt={1.0: r"$\ell^2$"}, manual=[(-1, -1)])
+plt.clabel(
+    elastic_net_contour,
+    inline=1,
+    fmt={1.0: "Elastic Net"},
+    manual=[(1, -2)],
+)
+
+plt.title(r"Unit Circles in $\mathbb{R}^2$")
+
+plt.tight_layout()
+_ = plt.show()
+
+# %%
+# Algebraically, the solution of this optimization problem depends
+# on :math:`\alpha`. For example, in Lasso, a large :math:`\alpha` gives
+# the penalty term more weight, which in turn forces the norm
+# :math:`\Vert w \Vert_1`
+# to stay small. Conversely, a smaller :math:`\alpha` allows the norm
+# :math:`\Vert w \Vert_1`
+# to grow larger.
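A minimal, self-contained sketch of this effect (the toy dataset and the two
`alpha` values below are illustrative choices, not part of the example above):
fit :class:`~sklearn.linear_model.Lasso` at a weak and at a strong regularization
strength and compare the resulting coefficients::

    import numpy as np

    from sklearn.datasets import make_regression
    from sklearn.linear_model import Lasso

    X_demo, y_demo = make_regression(n_samples=200, n_features=10, random_state=0)

    for alpha in (0.1, 10.0):
        coef = Lasso(alpha=alpha).fit(X_demo, y_demo).coef_
        print(
            f"alpha={alpha:>4}: ||w||_1 = {np.abs(coef).sum():8.2f}, "
            f"non-zero coefficients = {np.count_nonzero(coef)}"
        )

A larger `alpha` should yield a smaller :math:`\Vert w \Vert_1` and, typically,
more coefficients that are exactly zero.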
+#
+# This suggests that the regression coefficients :math:`w` evolve as
+# :math:`\alpha` increases, and we are interested in how
+# :math:`w` behaves across a range of :math:`\alpha` values. This is known
+# as the **regularization path**: a list of :math:`w` values corresponding to
+# different :math:`\alpha` values, ranging from small to large.
+#
+# In this example, we plot the regularization paths to show how the magnitudes of
+# the coefficients change as the regularization parameter :math:`\alpha` increases.
+# This demonstrates how model complexity varies with :math:`\alpha`. We then compare
+# the trained coefficients with the true coefficients used to generate the training set,
+# illustrating how regularization helps mitigate overfitting.
+#
+# Creating a Noise-free Regression Dataset
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# We generate a toy dataset with 400 samples and 10 features, suitable for
+# regression analysis. Since the data is noise-free in this example,
+# we can expect our regression model to recover the true coefficients `w` exactly.
+
+X, y, w = make_regression(n_samples=400, n_features=10, coef=True, random_state=42)
+
+# %%
+#
+# Impact of Regularization Parameter on Model Complexity
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# Lasso(-LARS) and Elastic Net Models
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# Scikit-learn provides the following functions to compute multiple
+# :math:`w` values for various :math:`\alpha` values efficiently:
+#
+# - :func:`~sklearn.linear_model.lasso_path`
+# - :func:`~sklearn.linear_model.lars_path`
+# - :func:`~sklearn.linear_model.enet_path` with `l1_ratio=0.5`
+#
+
+eps = 3e-4  # the smaller it is, the longer the path
+
+alphas_lasso, coefs_lasso, _ = lasso_path(X, y, eps=eps)
+
+alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=0.5)
+
+# %%
+# The :func:`~sklearn.linear_model.lasso_path` and
+# :func:`~sklearn.linear_model.enet_path` functions compute
+# :math:`w` with **coordinate descent**: for each entry of :math:`w`,
+# the function solves for its optimal value while keeping the others
+# fixed. Since the algorithm iterates until convergence, Lasso does not
+# run in a fixed number of steps determined by the dataset's size,
+# which can make it take longer to run.
+# In contrast, the Lasso-LARS model computes the Lasso solution in a bounded
+# number of steps.
+
+alphas_lars, _, coefs_lars = lars_path(X, y, method="lasso")
+
+# %%
+# The Lasso-LARS model uses the **Least Angle Regression (LARS)** algorithm
+# (see [1]_ Algorithm 3.2 on page 74) to compute the Lasso solution in
+# :math:`\min \left\{
+# n_{\operatorname{sample}}-1, n_{\operatorname{feature}}
+# \right\}`
+# steps. This provides an efficient algorithm for computing the entire Lasso path, and
+# is implemented as :class:`~sklearn.linear_model.LassoLars`
+# and :func:`~sklearn.linear_model.lars_path`.
+#
+# Ridge Model
+# ~~~~~~~~~~~
+#
+# Next, we compute the coefficients for the Ridge model using the :math:`\alpha`
+# values from Elastic Net:
+
+coefs_ridge = []
+for a in alphas_enet:
+    ridge = Ridge(alpha=a)
+    ridge.fit(X, y)
+    coefs_ridge.append(ridge.coef_)
+
+coefs_ridge = np.asarray(coefs_ridge)
+
+# %%
+# Plotting the Regularization Paths
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#
+# We now visualize the regularization paths for this dataset.
+# Each model is represented by 10 curves, one per feature in the
+# dataset.
Each curve shows how a particular coefficient :math:`w_i` changes as +# :math:`\alpha` increases. + +model_names = ["Lasso", "Lasso-LARS", "Elastic Net", "Ridge"] + +model_coefficients = [coefs_lasso.T, coefs_lars.T, coefs_enet.T, coefs_ridge] + +model_alphas = [alphas_lasso, alphas_lars, alphas_enet, alphas_enet] + +fig, axes = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(25, 10)) + +for i in range(len(model_names)): + for j in range(len(model_names)): + if i == j: + axes[i, i].semilogx(model_alphas[i], model_coefficients[i], c="#0072B2") + + axes[i, i].set_title(f"{model_names[i]} Paths", fontsize=14) + + elif j < i: + l1 = axes[i, j].semilogx( + model_alphas[i], model_coefficients[i], c="#0072B2" + ) + + l2 = axes[i, j].semilogx( + model_alphas[j], model_coefficients[j], linestyle="--", c="#D55E00" + ) + + axes[i, j].set_title( + f"{model_names[j]} vs {model_names[i]} Paths", fontsize=14 + ) + + axes[i, j].legend( + (l1[-1], l2[-1]), + (f"{model_names[i]}", f"{model_names[j]}"), + loc="upper right", + ) + + else: + fig.delaxes(axes[i, j]) + +fig.text(0.5, 0.02, r"$\alpha$", fontsize=18, ha="center") +fig.text(0, 0.5, "Coefficients", fontsize=18, va="center", rotation=90) + +fig.suptitle( + "Comparing Regularization Paths: Lasso(-LARS), Ridge, and Elastic Net", fontsize=20 +) + +fig.tight_layout(pad=3.0) +_ = plt.show() + +# %% +# +# * In the "Lasso vs Lasso-LARS Paths" visual, +# the Lasso and Lasso-LARS paths appear identical towards the end +# because both models solve the same constrained problem. +# However, Lasso-LARS reaches the solution faster than Lasso. +# +# * The "Lasso vs Elastic-Net Paths" visual is more notable. +# Elastic Net's coefficients tend to have smaller absolute values than those of Lasso. +# Additionally, Elastic Net maintains more non-zero coefficients than Lasso towards +# the end. +# +# * In the "Lasso(-LARS) vs Ridge Paths" and "Elastic Net vs Ridge Paths" visuals, the +# Ridge model focuses on shrinking all coefficients uniformly, rather than setting +# some to exactly zero. As a result, the Ridge model retains all features after +# training, unlike the Lasso(-LARS) or Elastic Net models. +# +# This demonstrates how different regularization techniques govern +# the model's complexity: +# +# 1. the :math:`\ell^1`-norm constraint encourages sparsity in the solution. +# +# 2. the :math:`\ell^2`-norm constraint focuses on shrinkage of the magnitude +# of the solution. +# +# 3. the Elastic Net constraint provides a balanced compromise. +# +# Mitigating Overfitting with Regularization +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Recall that the true coefficient `w` refers to the coefficients of the linear model +# used to generate the training dataset. In this section, we compare the trained +# coefficients of Lasso(-LARS), Ridge, and Elastic Net with `w` to demonstrate how +# regularization can mitigate overfitting. This is achieved by computing the +# :func:`~sklearn.metrics.mean_squared_error` (MSE) between the true and trained +# coefficients. 
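Before computing the MSE below, a quick supplementary check (not part of the
original example; it reuses the `coefs_lasso`, `coefs_enet` and `coefs_ridge`
arrays computed above) makes the sparsity-versus-shrinkage contrast concrete by
counting, for each model, the largest number of exactly-zero coefficients reached
anywhere along its path::

    n_features = coefs_lasso.shape[0]

    # Per-alpha counts of coefficients that are exactly zero.
    zeros_lasso = (coefs_lasso == 0).sum(axis=0)
    zeros_enet = (coefs_enet == 0).sum(axis=0)
    zeros_ridge = (coefs_ridge == 0).sum(axis=1)

    print(f"Lasso:       up to {zeros_lasso.max()} / {n_features} zero coefficients")
    print(f"Elastic Net: up to {zeros_enet.max()} / {n_features} zero coefficients")
    print(f"Ridge:       up to {zeros_ridge.max()} / {n_features} zero coefficients")

With an :math:`\ell^1` term in the penalty, the count is expected to reach the full
number of features at the strongest regularization, whereas Ridge typically keeps
every coefficient non-zero.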
+
+lasso_mse, lars_mse, enet_mse, ridge_mse = [], [], [], []
+
+for coef_lasso, coef_enet, coef_ridge in zip(coefs_lasso.T, coefs_enet.T, coefs_ridge):
+    lasso_mse.append(mean_squared_error(coef_lasso, w))
+    enet_mse.append(mean_squared_error(coef_enet, w))
+    ridge_mse.append(mean_squared_error(coef_ridge, w))
+
+for coef_lars in coefs_lars.T:
+    lars_mse.append(mean_squared_error(coef_lars, w))
+
+lasso_mse = np.asarray(lasso_mse)
+lars_mse = np.asarray(lars_mse)
+enet_mse = np.asarray(enet_mse)
+ridge_mse = np.asarray(ridge_mse)
+
+# %%
+#
+# The idea is that a smaller MSE between the true and trained coefficients implies
+# greater similarity between them: the trained model recovers the pattern that
+# generated the training data well. With noisy data, however, coefficients that
+# follow the training data too closely also capture the noise, and the resulting
+# model may not perform well on unseen data. This is essentially the
+# overfitting problem.
+#
+# The following visualization shows how the MSE changes for each trained
+# model as the regularization parameter :math:`\alpha` increases.
+
+plt.figure()
+l1 = plt.semilogx(alphas_lasso, lasso_mse.T, c="#0072B2")
+l2 = plt.semilogx(alphas_lars, lars_mse.T, c="#D55E00")
+l3 = plt.semilogx(alphas_enet, enet_mse.T, c="#009E73")
+l4 = plt.semilogx(alphas_enet, ridge_mse, c="#F0E442")
+
+plt.xlabel(r"$\alpha$")
+plt.ylabel("Mean Squared Error")
+plt.title("Coefficient Error Across Regularization Strengths")
+plt.legend(
+    (l1[-1], l2[-1], l3[-1], l4[-1]),
+    ("Lasso", "Lasso-LARS", "Elastic Net", "Ridge"),
+    loc="upper left",
+)
+
+plt.axis("tight")
+_ = plt.show()
+
+# %%
+#
+# In the visualization, for small values of :math:`\alpha`, since our synthetic data is
+# noise-free, the trained coefficients of Lasso(-LARS), Ridge, and Elastic Net are
+# similar to the true coefficients `w` (with MSE close to 0). This indicates that the
+# models capture the intricate details of the training data well.
+#
+# As :math:`\alpha` increases, the MSE also increases. Stronger regularization can
+# improve the models' ability to generalize to unseen data (e.g., if the data were
+# noisy), but it also risks degrading model performance if the regularization
+# becomes too strong.
+#
+# Regularization in Logistic Regression
+# -------------------------------------
+#
+# Regularization can also be applied to Logistic Regression when working on
+# classification tasks. scikit-learn's :class:`~sklearn.linear_model.LogisticRegression`
+# enables users to apply regularization using the `penalty` parameter:
+#
+# * `l1`: :math:`\ell^1`-regularization, similar to the Lasso model
+# * `l2`: :math:`\ell^2`-regularization, similar to the Ridge model
+# * `elasticnet`: Combined with the `l1_ratio` parameter for a mix of :math:`\ell^1`
+#   and :math:`\ell^2`
+#
+# Additionally, the `C` parameter is the inverse of the regularization strength:
+# smaller values of `C` apply stronger regularization.
+#
+# We demonstrate the effect of regularization by creating a synthetic classification
+# dataset.
+#
+
+X, y = make_classification(
+    n_samples=400,
+    n_features=64,
+    n_informative=64,
+    n_redundant=0,
+    n_classes=2,
+    n_clusters_per_class=1,
+    random_state=42,
+)
+
+# %%
+#
+# In this synthetic binary classification dataset, there are 400 samples,
+# each with 64 features. This toy dataset is noise-free to maintain consistency with
+# our earlier regression example.
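As a supplementary illustration (not part of the original example; the two `C`
values below are arbitrary), fitting an :math:`\ell^1`-penalized
:class:`~sklearn.linear_model.LogisticRegression` on this dataset at a weak and at
a strong regularization strength shows how a small `C` drives coefficients to
exactly zero, which the discussion that follows makes precise::

    for C in (100.0, 0.01):
        clf = LogisticRegression(C=C, penalty="l1", solver="saga", tol=0.01)
        clf.fit(X, y)
        sparsity = np.mean(clf.coef_ == 0) * 100
        print(f"C={C:>6}: {sparsity:.1f}% of coefficients are exactly zero")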
+# +# As noted in the regression example, :math:`\ell^1`-regularization may set some +# coefficients exactly to zero. For extreme values of `C`, the trained coefficients +# may even become the zero vector. To address this, scikit-learn provides the +# :func:`~sklearn.svm.l1_min_c` function, which computes the minimum value of the +# regularization strength `C` at which the model begins to learn meaningful patterns +# (i.e., some coefficients become non-zero). +# + +cs = l1_min_c(X, y, loss="log") * np.logspace(0, 10, 16) + +# %% +# +# We now plot heatmaps to represent the sparsity for each `penalty` and each value +# of `C`. +# + +l1_ratio = 0.5 # l1 weight in the Elastic-Net regularization + +fig, axes = plt.subplots(3, 3) + +# Set regularization parameter +for i, (C, axes_row) in enumerate(zip((1, 0.1, 0.01), axes)): + # Increase tolerance for short training time + clf_l1_LR = LogisticRegression(C=C, penalty="l1", tol=0.01, solver="saga") + clf_l2_LR = LogisticRegression(C=C, penalty="l2", tol=0.01, solver="saga") + clf_en_LR = LogisticRegression( + C=C, penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, tol=0.01 + ) + clf_l1_LR.fit(X, y) + clf_l2_LR.fit(X, y) + clf_en_LR.fit(X, y) + + coef_l1_LR = clf_l1_LR.coef_.ravel() + coef_l2_LR = clf_l2_LR.coef_.ravel() + coef_en_LR = clf_en_LR.coef_.ravel() + + sparsity_l1_LR = np.mean(coef_l1_LR == 0) * 100 + sparsity_l2_LR = np.mean(coef_l2_LR == 0) * 100 + sparsity_en_LR = np.mean(coef_en_LR == 0) * 100 + + if i == 0: + axes_row[0].set_title(r"$\ell^1$ penalty") + axes_row[1].set_title(f"Elastic-Net\n {l1_ratio = }") + axes_row[2].set_title(r"$\ell^2$ penalty") + + for ax, coefs in zip(axes_row, [coef_l1_LR, coef_en_LR, coef_l2_LR]): + ax.imshow( + np.abs(coefs.reshape(8, 8)), + interpolation="nearest", + cmap="binary", + vmax=1, + vmin=0, + ) + ax.set_xticks(()) + ax.set_yticks(()) + + axes_row[0].set_ylabel(f"C = {C:.2f}") + +_ = plt.show() +# %% +# +# Each heatmap organizes the 64 coefficients (the number of features in our synthetic +# classification dataset) into an 8×8 grid. It is constructed by taking the absolute +# values of the coefficients and displaying them in a black-and-white scale, where +# lower values appear white and higher values appear black. +# +# We can see that larger values of `C` (i.e., weaker regularization) give the model +# more freedom, while smaller values of `C` impose stronger constraints, leading to +# increased sparsity. As expected, the Elastic-Net penalty results in a level of +# sparsity between that of :math:`\ell^1` and :math:`\ell^2`. +# +# .. rubric:: References +# +# .. [1] Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical +# Learning: Data Mining, Inference, and Prediction. New York, +# NY: Springer New York. diff --git a/examples/linear_model/plot_ridge_coeffs.py b/examples/linear_model/plot_ridge_coeffs.py deleted file mode 100644 index 1ad7962f8bfa3..0000000000000 --- a/examples/linear_model/plot_ridge_coeffs.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -========================================================= -Ridge coefficients as a function of the L2 Regularization -========================================================= - -A model that overfits learns the training data too well, capturing both the -underlying patterns and the noise in the data. However, when applied to unseen -data, the learned associations may not hold. 
We normally detect this when we -apply our trained predictions to the test data and see the statistical -performance drop significantly compared to the training data. - -One way to overcome overfitting is through regularization, which can be done by -penalizing large weights (coefficients) in linear models, forcing the model to -shrink all coefficients. Regularization reduces a model's reliance on specific -information obtained from the training samples. - -This example illustrates how L2 regularization in a -:class:`~sklearn.linear_model.Ridge` regression affects a model's performance by -adding a penalty term to the loss that increases with the coefficients -:math:`\\beta`. - -The regularized loss function is given by: :math:`\\mathcal{L}(X, y, \\beta) = -\\| y - X \\beta \\|^{2}_{2} + \\alpha \\| \\beta \\|^{2}_{2}` - -where :math:`X` is the input data, :math:`y` is the target variable, -:math:`\\beta` is the vector of coefficients associated with the features, and -:math:`\\alpha` is the regularization strength. - -The regularized loss function aims to balance the trade-off between accurately -predicting the training set and to prevent overfitting. - -In this regularized loss, the left-hand side (e.g. :math:`\\|y - -X\\beta\\|^{2}_{2}`) measures the squared difference between the actual target -variable, :math:`y`, and the predicted values. Minimizing this term alone could -lead to overfitting, as the model may become too complex and sensitive to noise -in the training data. - -To address overfitting, Ridge regularization adds a constraint, called a penalty -term, (:math:`\\alpha \\| \\beta\\|^{2}_{2}`) to the loss function. This penalty -term is the sum of the squares of the model's coefficients, multiplied by the -regularization strength :math:`\\alpha`. By introducing this constraint, Ridge -regularization discourages any single coefficient :math:`\\beta_{i}` from taking -an excessively large value and encourages smaller and more evenly distributed -coefficients. Higher values of :math:`\\alpha` force the coefficients towards -zero. However, an excessively high :math:`\\alpha` can result in an underfit -model that fails to capture important patterns in the data. - -Therefore, the regularized loss function combines the prediction accuracy term -and the penalty term. By adjusting the regularization strength, practitioners -can fine-tune the degree of constraint imposed on the weights, training a model -capable of generalizing well to unseen data while avoiding overfitting. -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -# %% -# Purpose of this example -# ----------------------- -# For the purpose of showing how Ridge regularization works, we will create a -# non-noisy data set. Then we will train a regularized model on a range of -# regularization strengths (:math:`\alpha`) and plot how the trained -# coefficients and the mean squared error between those and the original values -# behave as functions of the regularization strength. -# -# Creating a non-noisy data set -# ***************************** -# We make a toy data set with 100 samples and 10 features, that's suitable to -# detect regression. Out of the 10 features, 8 are informative and contribute to -# the regression, while the remaining 2 features do not have any effect on the -# target variable (their true coefficients are 0). Please note that in this -# example the data is non-noisy, hence we can expect our regression model to -# recover exactly the true coefficients w. 
-from sklearn.datasets import make_regression - -X, y, w = make_regression( - n_samples=100, n_features=10, n_informative=8, coef=True, random_state=1 -) - -# Obtain the true coefficients -print(f"The true coefficient of this regression problem are:\n{w}") - -# %% -# Training the Ridge Regressor -# **************************** -# We use :class:`~sklearn.linear_model.Ridge`, a linear model with L2 -# regularization. We train several models, each with a different value for the -# model parameter `alpha`, which is a positive constant that multiplies the -# penalty term, controlling the regularization strength. For each trained model -# we then compute the error between the true coefficients `w` and the -# coefficients found by the model `clf`. We store the identified coefficients -# and the calculated errors for the corresponding coefficients in lists, which -# makes it convenient for us to plot them. -import numpy as np - -from sklearn.linear_model import Ridge -from sklearn.metrics import mean_squared_error - -clf = Ridge() - -# Generate values for `alpha` that are evenly distributed on a logarithmic scale -alphas = np.logspace(-3, 4, 200) -coefs = [] -errors_coefs = [] - -# Train the model with different regularisation strengths -for a in alphas: - clf.set_params(alpha=a).fit(X, y) - coefs.append(clf.coef_) - errors_coefs.append(mean_squared_error(clf.coef_, w)) - -# %% -# Plotting trained Coefficients and Mean Squared Errors -# ***************************************************** -# We now plot the 10 different regularized coefficients as a function of the -# regularization parameter `alpha` where each color represents a different -# coefficient. -# -# On the right-hand-side, we plot how the errors of the coefficients from the -# estimator change as a function of regularization. -import matplotlib.pyplot as plt -import pandas as pd - -alphas = pd.Index(alphas, name="alpha") -coefs = pd.DataFrame(coefs, index=alphas, columns=[f"Feature {i}" for i in range(10)]) -errors = pd.Series(errors_coefs, index=alphas, name="Mean squared error") - -fig, axs = plt.subplots(1, 2, figsize=(20, 6)) - -coefs.plot( - ax=axs[0], - logx=True, - title="Ridge coefficients as a function of the regularization strength", -) -axs[0].set_ylabel("Ridge coefficient values") -errors.plot( - ax=axs[1], - logx=True, - title="Coefficient error as a function of the regularization strength", -) -_ = axs[1].set_ylabel("Mean squared error") -# %% -# Interpreting the plots -# ********************** -# The plot on the left-hand side shows how the regularization strength (`alpha`) -# affects the Ridge regression coefficients. Smaller values of `alpha` (weak -# regularization), allow the coefficients to closely resemble the true -# coefficients (`w`) used to generate the data set. This is because no -# additional noise was added to our artificial data set. As `alpha` increases, -# the coefficients shrink towards zero, gradually reducing the impact of the -# features that were formerly more significant. -# -# The right-hand side plot shows the mean squared error (MSE) between the -# coefficients found by the model and the true coefficients (`w`). It provides a -# measure that relates to how exact our ridge model is in comparison to the true -# generative model. A low error means that it found coefficients closer to the -# ones of the true generative model. 
In this case, since our toy data set was -# non-noisy, we can see that the least regularized model retrieves coefficients -# closest to the true coefficients (`w`) (error is close to 0). -# -# When `alpha` is small, the model captures the intricate details of the -# training data, whether those were caused by noise or by actual information. As -# `alpha` increases, the highest coefficients shrink more rapidly, rendering -# their corresponding features less influential in the training process. This -# can enhance a model's ability to generalize to unseen data (if there was a lot -# of noise to capture), but it also poses the risk of losing performance if the -# regularization becomes too strong compared to the amount of noise the data -# contained (as in this example). -# -# In real-world scenarios where data typically includes noise, selecting an -# appropriate `alpha` value becomes crucial in striking a balance between an -# overfitting and an underfitting model. -# -# Here, we saw that :class:`~sklearn.linear_model.Ridge` adds a penalty to the -# coefficients to fight overfitting. Another problem that occurs is linked to -# the presence of outliers in the training dataset. An outlier is a data point -# that differs significantly from other observations. Concretely, these outliers -# impact the left-hand side term of the loss function that we showed earlier. -# Some other linear models are formulated to be robust to outliers such as the -# :class:`~sklearn.linear_model.HuberRegressor`. You can learn more about it in -# the :ref:`sphx_glr_auto_examples_linear_model_plot_huber_vs_ridge.py` example. diff --git a/examples/linear_model/plot_ridge_path.py b/examples/linear_model/plot_ridge_path.py deleted file mode 100644 index d3c19acd9e18c..0000000000000 --- a/examples/linear_model/plot_ridge_path.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -=========================================================== -Plot Ridge coefficients as a function of the regularization -=========================================================== - -Shows the effect of collinearity in the coefficients of an estimator. - -.. currentmodule:: sklearn.linear_model - -:class:`Ridge` Regression is the estimator used in this example. -Each color represents a different feature of the -coefficient vector, and this is displayed as a function of the -regularization parameter. - -This example also shows the usefulness of applying Ridge regression -to highly ill-conditioned matrices. For such matrices, a slight -change in the target variable can cause huge variances in the -calculated weights. In such cases, it is useful to set a certain -regularization (alpha) to reduce this variation (noise). - -When alpha is very large, the regularization effect dominates the -squared loss function and the coefficients tend to zero. -At the end of the path, as alpha tends toward zero -and the solution tends towards the ordinary least squares, coefficients -exhibit big oscillations. In practise it is necessary to tune alpha -in such a way that a balance is maintained between both. 
- -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -from sklearn import linear_model - -# X is the 10x10 Hilbert matrix -X = 1.0 / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis]) -y = np.ones(10) - -# %% -# Compute paths -# ------------- - -n_alphas = 200 -alphas = np.logspace(-10, -2, n_alphas) - -coefs = [] -for a in alphas: - ridge = linear_model.Ridge(alpha=a, fit_intercept=False) - ridge.fit(X, y) - coefs.append(ridge.coef_) - -# %% -# Display results -# --------------- - -ax = plt.gca() - -ax.plot(alphas, coefs) -ax.set_xscale("log") -ax.set_xlim(ax.get_xlim()[::-1]) # reverse axis -plt.xlabel("alpha") -plt.ylabel("weights") -plt.title("Ridge coefficients as a function of the regularization") -plt.axis("tight") -plt.show() diff --git a/examples/linear_model/plot_sgd_penalties.py b/examples/linear_model/plot_sgd_penalties.py deleted file mode 100644 index 6f8830b52fe7a..0000000000000 --- a/examples/linear_model/plot_sgd_penalties.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -============== -SGD: Penalties -============== - -Contours of where the penalty is equal to 1 -for the three penalties L1, L2 and elastic-net. - -All of the above are supported by :class:`~sklearn.linear_model.SGDClassifier` -and :class:`~sklearn.linear_model.SGDRegressor`. - -""" - -# Authors: The scikit-learn developers -# SPDX-License-Identifier: BSD-3-Clause - -import matplotlib.pyplot as plt -import numpy as np - -l1_color = "navy" -l2_color = "c" -elastic_net_color = "darkorange" - -line = np.linspace(-1.5, 1.5, 1001) -xx, yy = np.meshgrid(line, line) - -l2 = xx**2 + yy**2 -l1 = np.abs(xx) + np.abs(yy) -rho = 0.5 -elastic_net = rho * l1 + (1 - rho) * l2 - -plt.figure(figsize=(10, 10), dpi=100) -ax = plt.gca() - -elastic_net_contour = plt.contour( - xx, yy, elastic_net, levels=[1], colors=elastic_net_color -) -l2_contour = plt.contour(xx, yy, l2, levels=[1], colors=l2_color) -l1_contour = plt.contour(xx, yy, l1, levels=[1], colors=l1_color) -ax.set_aspect("equal") -ax.spines["left"].set_position("center") -ax.spines["right"].set_color("none") -ax.spines["bottom"].set_position("center") -ax.spines["top"].set_color("none") - -plt.clabel( - elastic_net_contour, - inline=1, - fontsize=18, - fmt={1.0: "elastic-net"}, - manual=[(-1, -1)], -) -plt.clabel(l2_contour, inline=1, fontsize=18, fmt={1.0: "L2"}, manual=[(-1, -1)]) -plt.clabel(l1_contour, inline=1, fontsize=18, fmt={1.0: "L1"}, manual=[(-1, -1)]) - -plt.tight_layout() -plt.show() diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index b98cf08925910..5c43c8b04ec20 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -319,8 +319,8 @@ def lasso_path( Notes ----- For an example, see - :ref:`examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py - `. + :ref:`examples/linear_model/plot_regularization.py + `. To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a Fortran-contiguous numpy array. @@ -524,8 +524,8 @@ def enet_path( Notes ----- For an example, see - :ref:`examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py - `. + :ref:`examples/linear_model/plot_regularization.py + `. Examples --------