Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

ENH support sparse data input for QuantileRegressor #21086

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Dec 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
8b038db
sparse support
venkyyuvy Sep 19, 2021
6fab90c
compare sparse and dense input models
venkyyuvy Sep 20, 2021
39f31d3
change log
venkyyuvy Sep 20, 2021
27c8623
remove print
venkyyuvy Sep 21, 2021
77e7793
changing solver
venkyyuvy Sep 22, 2021
cdc1ce1
updating changelog
venkyyuvy Sep 22, 2021
b328296
handling coo format indexing
venkyyuvy Sep 22, 2021
34511aa
fixing change log
venkyyuvy Sep 26, 2021
a359572
merge with main
venkyyuvy Sep 26, 2021
2a25ffb
Merge remote-tracking branch 'upstream/main' into sparse_qr
venkyyuvy Sep 26, 2021
4a52a1b
more parametrize
venkyyuvy Sep 26, 2021
4d4e026
more parametrization
venkyyuvy Sep 26, 2021
cd8c53a
adjusting penalty
venkyyuvy Sep 26, 2021
3bed0bc
skipping test for low sp_version
venkyyuvy Sep 27, 2021
e5a3d37
correcting version check
venkyyuvy Sep 27, 2021
f04e985
Merge remote-tracking branch 'origin/main' into pr/venkyyuvy/21086
glemaitre Dec 2, 2021
a5aa914
DOC update whats new
glemaitre Dec 2, 2021
5a450c9
Merge remote-tracking branch 'upstream/main' into sparse_qr
venkyyuvy Dec 12, 2021
12a4826
allowing only csc format
venkyyuvy Dec 12, 2021
33bf1e2
fixing nonzero sample weight indexing
venkyyuvy Dec 12, 2021
c07a77b
updating the validate_data
venkyyuvy Dec 13, 2021
e871bf7
allow all sparse formats
venkyyuvy Dec 14, 2021
5ce57e4
checking compatible solvers
venkyyuvy Dec 14, 2021
fad71b2
cleaning test cases
venkyyuvy Dec 15, 2021
db98459
solver docstring update
venkyyuvy Dec 15, 2021
aff659d
doc update
venkyyuvy Dec 15, 2021
49e663e
test_cleaning
venkyyuvy Dec 15, 2021
edb9747
renaming test case
venkyyuvy Dec 15, 2021
4b24346
removing interior-pt solver for sparse data
venkyyuvy Dec 15, 2021
ecb3e4e
doc fixs
venkyyuvy Dec 15, 2021
39a97d5
scipy V check for tests
venkyyuvy Dec 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion 5 doc/whats_new/v1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,10 @@ Changelog
a parameter in order to provide an estimate of the noise variance.
This is particularly relevant when `n_features > n_samples` and the
estimator of the noise variance cannot be computed.
:pr:`21481` by :user:`Guillaume Lemaitre <glemaitre>`
:pr:`21481` by :user:`Guillaume Lemaitre <glemaitre>`.

- |Enhancement| :class:`linear_model.QuantileRegressor` supports sparse inputs.
:pr:`21086` by :user:`Venkatachalam Natchiappan <venkyyuvy>`.

- |Fix| :class:`linear_model.LassoLarsIC` now correctly computes AIC
and BIC. An error is now raised when `n_features > n_samples` and
Expand Down
62 changes: 40 additions & 22 deletions 62 sklearn/linear_model/_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
import warnings

import numpy as np
from scipy import sparse
from scipy.optimize import linprog

from ..base import BaseEstimator, RegressorMixin
from ._base import LinearModel
from ..exceptions import ConvergenceWarning
from ..utils import _safe_indexing
from ..utils.validation import _check_sample_weight
from ..utils.fixes import sp_version, parse_version

Expand Down Expand Up @@ -44,6 +46,8 @@ class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator):
Method used by :func:`scipy.optimize.linprog` to solve the linear
programming formulation. Note that the highs methods are recommended
for usage with `scipy>=1.6.0` because they are the fastest ones.
Solvers "highs-ds", "highs-ipm" and "highs" support
sparse input data.

solver_options : dict, default=None
Additional parameters passed to :func:`scipy.optimize.linprog` as
Expand Down Expand Up @@ -112,7 +116,7 @@ def fit(self, X, y, sample_weight=None):

Parameters
----------
X : array-like of shape (n_samples, n_features)
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training data.

y : array-like of shape (n_samples,)
Expand All @@ -127,7 +131,11 @@ def fit(self, X, y, sample_weight=None):
Returns self.
"""
X, y = self._validate_data(
X, y, accept_sparse=False, y_numeric=True, multi_output=False
X,
venkyyuvy marked this conversation as resolved.
Show resolved Hide resolved
y,
accept_sparse=["csc", "csr", "coo"],
y_numeric=True,
multi_output=False,
)
sample_weight = _check_sample_weight(sample_weight, X)

Expand Down Expand Up @@ -218,37 +226,47 @@ def fit(self, X, y, sample_weight=None):
#
# Filtering out zero samples weights from the beginning makes life
# easier for the linprog solver.
mask = sample_weight != 0
n_mask = int(np.sum(mask)) # use n_mask instead of n_samples
indices = np.nonzero(sample_weight)[0]
n_indices = len(indices) # use n_indices instead of n_samples
lorentzenchr marked this conversation as resolved.
Show resolved Hide resolved
if n_indices < len(sample_weight):
sample_weight = sample_weight[indices]
X = _safe_indexing(X, indices)
y = _safe_indexing(y, indices)
c = np.concatenate(
[
np.full(2 * n_params, fill_value=alpha),
sample_weight[mask] * self.quantile,
sample_weight[mask] * (1 - self.quantile),
sample_weight * self.quantile,
sample_weight * (1 - self.quantile),
]
)
if self.fit_intercept:
# do not penalize the intercept
c[0] = 0
c[n_params] = 0

A_eq = np.concatenate(
[
np.ones((n_mask, 1)),
X[mask],
-np.ones((n_mask, 1)),
-X[mask],
np.eye(n_mask),
-np.eye(n_mask),
],
axis=1,
)
if sparse.issparse(X):
if self.solver not in ["highs-ds", "highs-ipm", "highs"]:
raise ValueError(
f"Solver {self.solver} does not support sparse X. "
"Use solver 'highs' for example."
)
# Note that highs methods do convert to csc.
# Therefore, we work with csc matrices as much as possible.
eye = sparse.eye(n_indices, dtype=X.dtype, format="csc")
venkyyuvy marked this conversation as resolved.
Show resolved Hide resolved
if self.fit_intercept:
ones = sparse.csc_matrix(np.ones(shape=(n_indices, 1), dtype=X.dtype))
A_eq = sparse.hstack([ones, X, -ones, -X, eye, -eye], format="csc")
else:
A_eq = sparse.hstack([X, -X, eye, -eye], format="csc")
else:
A_eq = np.concatenate(
[X[mask], -X[mask], np.eye(n_mask), -np.eye(n_mask)], axis=1
)

b_eq = y[mask]
eye = np.eye(n_indices)
if self.fit_intercept:
ones = np.ones((n_indices, 1))
A_eq = np.concatenate([ones, X, -ones, -X, eye, -eye], axis=1)
else:
A_eq = np.concatenate([X, -X, eye, -eye], axis=1)

b_eq = y

result = linprog(
c=c,
Expand Down
41 changes: 41 additions & 0 deletions 41 sklearn/linear_model/tests/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pytest
from pytest import approx
from scipy.optimize import minimize
from scipy import sparse

from sklearn.datasets import make_regression
from sklearn.exceptions import ConvergenceWarning
Expand Down Expand Up @@ -45,6 +46,21 @@ def test_init_parameters_validation(X_y_data, params, err_msg):
QuantileRegressor(**params).fit(X, y)


@pytest.mark.skipif(
sp_version < parse_version("1.3.0"),
reason="Solver 'revised simplex' is only available as of scipy>=1.3.0",
)
@pytest.mark.parametrize("solver", ["interior-point", "revised simplex"])
venkyyuvy marked this conversation as resolved.
Show resolved Hide resolved
def test_incompatible_solver_for_sparse_input(X_y_data, solver):
X, y = X_y_data
X_sparse = sparse.csc_matrix(X)
err_msg = (
f"Solver {solver} does not support sparse X. Use solver 'highs' for example."
)
with pytest.raises(ValueError, match=err_msg):
QuantileRegressor(solver=solver).fit(X_sparse, y)


@pytest.mark.parametrize("solver", ("highs-ds", "highs-ipm", "highs"))
@pytest.mark.skipif(
sp_version >= parse_version("1.6.0"),
Expand Down Expand Up @@ -250,3 +266,28 @@ def test_linprog_failure():
msg = "Linear programming for QuantileRegressor did not succeed."
with pytest.warns(ConvergenceWarning, match=msg):
reg.fit(X, y)


@pytest.mark.skipif(
sp_version <= parse_version("1.6.0"),
reason="Solvers are available as of scipy 1.6.0",
)
@pytest.mark.parametrize(
"sparse_format", [sparse.csc_matrix, sparse.csr_matrix, sparse.coo_matrix]
)
@pytest.mark.parametrize("solver", ["highs", "highs-ds", "highs-ipm"])
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_sparse_input(sparse_format, solver, fit_intercept):
"""Test that sparse and dense X give same results."""
X, y = make_regression(n_samples=100, n_features=20, random_state=1, noise=1.0)
X_sparse = sparse_format(X)
alpha = 1e-4
quant_dense = QuantileRegressor(alpha=alpha, fit_intercept=fit_intercept).fit(X, y)
quant_sparse = QuantileRegressor(
alpha=alpha, fit_intercept=fit_intercept, solver=solver
).fit(X_sparse, y)
assert_allclose(quant_sparse.coef_, quant_dense.coef_, rtol=1e-2)
if fit_intercept:
assert quant_sparse.intercept_ == approx(quant_dense.intercept_)
# check that we still predict fraction
assert 0.45 <= np.mean(y < quant_sparse.predict(X_sparse)) <= 0.55
Morty Proxy This is a proxified and sanitized view of the page, visit original site.