Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

ENH support sparse data input for QuantileRegressor #21086

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Dec 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
8b038db
sparse support
venkyyuvy Sep 19, 2021
6fab90c
compare sparse and dense input models
venkyyuvy Sep 20, 2021
39f31d3
change log
venkyyuvy Sep 20, 2021
27c8623
remove print
venkyyuvy Sep 21, 2021
77e7793
changing solver
venkyyuvy Sep 22, 2021
cdc1ce1
updating changelog
venkyyuvy Sep 22, 2021
b328296
handling coo format indexing
venkyyuvy Sep 22, 2021
34511aa
fixing change log
venkyyuvy Sep 26, 2021
a359572
merge with main
venkyyuvy Sep 26, 2021
2a25ffb
Merge remote-tracking branch 'upstream/main' into sparse_qr
venkyyuvy Sep 26, 2021
4a52a1b
more parametrize
venkyyuvy Sep 26, 2021
4d4e026
more parametrization
venkyyuvy Sep 26, 2021
cd8c53a
adjusting penalty
venkyyuvy Sep 26, 2021
3bed0bc
skipping test for low sp_version
venkyyuvy Sep 27, 2021
e5a3d37
correcting version check
venkyyuvy Sep 27, 2021
f04e985
Merge remote-tracking branch 'origin/main' into pr/venkyyuvy/21086
glemaitre Dec 2, 2021
a5aa914
DOC update whats new
glemaitre Dec 2, 2021
5a450c9
Merge remote-tracking branch 'upstream/main' into sparse_qr
venkyyuvy Dec 12, 2021
12a4826
allowing only csc format
venkyyuvy Dec 12, 2021
33bf1e2
fixing nonzero sample weight indexing
venkyyuvy Dec 12, 2021
c07a77b
updating the validate_data
venkyyuvy Dec 13, 2021
e871bf7
allow all sparse formats
venkyyuvy Dec 14, 2021
5ce57e4
checking compatible solvers
venkyyuvy Dec 14, 2021
fad71b2
cleaning test cases
venkyyuvy Dec 15, 2021
db98459
solver docstring update
venkyyuvy Dec 15, 2021
aff659d
doc update
venkyyuvy Dec 15, 2021
49e663e
test_cleaning
venkyyuvy Dec 15, 2021
edb9747
renaming test case
venkyyuvy Dec 15, 2021
4b24346
removing interior-pt solver for sparse data
venkyyuvy Dec 15, 2021
ecb3e4e
doc fixs
venkyyuvy Dec 15, 2021
39a97d5
scipy V check for tests
venkyyuvy Dec 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion 5 doc/whats_new/v1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,10 @@ Changelog
a parameter in order to provide an estimate of the noise variance.
This is particularly relevant when `n_features > n_samples` and the
estimator of the noise variance cannot be computed.
:pr:`21481` by :user:`Guillaume Lemaitre <glemaitre>`
:pr:`21481` by :user:`Guillaume Lemaitre <glemaitre>`.

- |Enhancement| :class:`linear_model.QuantileRegressor` supports sparse inputs.
:pr:`21086` by :user:`Venkatachalam Natchiappan <venkyyuvy>`.

- |Fix| :class:`linear_model.LassoLarsIC` now correctly computes AIC
and BIC. An error is now raised when `n_features > n_samples` and
Expand Down
62 changes: 40 additions & 22 deletions 62 sklearn/linear_model/_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
import warnings

import numpy as np
from scipy import sparse
from scipy.optimize import linprog

from ..base import BaseEstimator, RegressorMixin
from ._base import LinearModel
from ..exceptions import ConvergenceWarning
from ..utils import _safe_indexing
from ..utils.validation import _check_sample_weight
from ..utils.fixes import sp_version, parse_version

Expand Down Expand Up @@ -44,6 +46,8 @@ class QuantileRegressor(LinearModel, RegressorMixin, BaseEstimator):
Method used by :func:`scipy.optimize.linprog` to solve the linear
programming formulation. Note that the highs methods are recommended
for usage with `scipy>=1.6.0` because they are the fastest ones.
Solvers "highs-ds", "highs-ipm" and "highs" support
sparse input data.

solver_options : dict, default=None
Additional parameters passed to :func:`scipy.optimize.linprog` as
Expand Down Expand Up @@ -112,7 +116,7 @@ def fit(self, X, y, sample_weight=None):

Parameters
----------
X : array-like of shape (n_samples, n_features)
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training data.

y : array-like of shape (n_samples,)
Expand All @@ -127,7 +131,11 @@ def fit(self, X, y, sample_weight=None):
Returns self.
"""
X, y = self._validate_data(
X, y, accept_sparse=False, y_numeric=True, multi_output=False
X,
venkyyuvy marked this conversation as resolved.
Show resolved Hide resolved
y,
accept_sparse=["csc", "csr", "coo"],
y_numeric=True,
multi_output=False,
)
sample_weight = _check_sample_weight(sample_weight, X)

Expand Down Expand Up @@ -218,37 +226,47 @@ def fit(self, X, y, sample_weight=None):
#
# Filtering out zero samples weights from the beginning makes life
# easier for the linprog solver.
mask = sample_weight != 0
n_mask = int(np.sum(mask)) # use n_mask instead of n_samples
indices = np.nonzero(sample_weight)[0]
n_indices = len(indices) # use n_indices instead of n_samples
lorentzenchr marked this conversation as resolved.
Show resolved Hide resolved
if n_indices < len(sample_weight):
sample_weight = sample_weight[indices]
X = _safe_indexing(X, indices)
y = _safe_indexing(y, indices)
c = np.concatenate(
[
np.full(2 * n_params, fill_value=alpha),
sample_weight[mask] * self.quantile,
sample_weight[mask] * (1 - self.quantile),
sample_weight * self.quantile,
sample_weight * (1 - self.quantile),
]
)
if self.fit_intercept:
# do not penalize the intercept
c[0] = 0
c[n_params] = 0

A_eq = np.concatenate(
[
np.ones((n_mask, 1)),
X[mask],
-np.ones((n_mask, 1)),
-X[mask],
np.eye(n_mask),
-np.eye(n_mask),
],
axis=1,
)
if sparse.issparse(X):
if self.solver not in ["highs-ds", "highs-ipm", "highs"]:
raise ValueError(
f"Solver {self.solver} does not support sparse X. "
"Use solver 'highs' for example."
)
# Note that highs methods do convert to csc.
# Therefore, we work with csc matrices as much as possible.
eye = sparse.eye(n_indices, dtype=X.dtype, format="csc")
venkyyuvy marked this conversation as resolved.
Show resolved Hide resolved
if self.fit_intercept:
ones = sparse.csc_matrix(np.ones(shape=(n_indices, 1), dtype=X.dtype))
A_eq = sparse.hstack([ones, X, -ones, -X, eye, -eye], format="csc")
else:
A_eq = sparse.hstack([X, -X, eye, -eye], format="csc")
else:
A_eq = np.concatenate(
[X[mask], -X[mask], np.eye(n_mask), -np.eye(n_mask)], axis=1
)

b_eq = y[mask]
eye = np.eye(n_indices)
if self.fit_intercept:
ones = np.ones((n_indices, 1))
A_eq = np.concatenate([ones, X, -ones, -X, eye, -eye], axis=1)
else:
A_eq = np.concatenate([X, -X, eye, -eye], axis=1)

b_eq = y

result = linprog(
c=c,
Expand Down
41 changes: 41 additions & 0 deletions 41 sklearn/linear_model/tests/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pytest
from pytest import approx
from scipy.optimize import minimize
from scipy import sparse

from sklearn.datasets import make_regression
from sklearn.exceptions import ConvergenceWarning
Expand Down Expand Up @@ -45,6 +46,21 @@ def test_init_parameters_validation(X_y_data, params, err_msg):
QuantileRegressor(**params).fit(X, y)


@pytest.mark.skipif(
sp_version < parse_version("1.3.0"),
reason="Solver 'revised simplex' is only available as of scipy>=1.3.0",
)
@pytest.mark.parametrize("solver", ["interior-point", "revised simplex"])
venkyyuvy marked this conversation as resolved.
Show resolved Hide resolved
def test_incompatible_solver_for_sparse_input(X_y_data, solver):
X, y = X_y_data
X_sparse = sparse.csc_matrix(X)
err_msg = (
f"Solver {solver} does not support sparse X. Use solver 'highs' for example."
)
with pytest.raises(ValueError, match=err_msg):
QuantileRegressor(solver=solver).fit(X_sparse, y)


@pytest.mark.parametrize("solver", ("highs-ds", "highs-ipm", "highs"))
@pytest.mark.skipif(
sp_version >= parse_version("1.6.0"),
Expand Down Expand Up @@ -250,3 +266,28 @@ def test_linprog_failure():
msg = "Linear programming for QuantileRegressor did not succeed."
with pytest.warns(ConvergenceWarning, match=msg):
reg.fit(X, y)


@pytest.mark.skipif(
sp_version <= parse_version("1.6.0"),
reason="Solvers are available as of scipy 1.6.0",
)
@pytest.mark.parametrize(
"sparse_format", [sparse.csc_matrix, sparse.csr_matrix, sparse.coo_matrix]
)
@pytest.mark.parametrize("solver", ["highs", "highs-ds", "highs-ipm"])
@pytest.mark.parametrize("fit_intercept", [True, False])
def test_sparse_input(sparse_format, solver, fit_intercept):
"""Test that sparse and dense X give same results."""
X, y = make_regression(n_samples=100, n_features=20, random_state=1, noise=1.0)
X_sparse = sparse_format(X)
alpha = 1e-4
quant_dense = QuantileRegressor(alpha=alpha, fit_intercept=fit_intercept).fit(X, y)
quant_sparse = QuantileRegressor(
alpha=alpha, fit_intercept=fit_intercept, solver=solver
).fit(X_sparse, y)
assert_allclose(quant_sparse.coef_, quant_dense.coef_, rtol=1e-2)
if fit_intercept:
assert quant_sparse.intercept_ == approx(quant_dense.intercept_)
# check that we still predict fraction
assert 0.45 <= np.mean(y < quant_sparse.predict(X_sparse)) <= 0.55
Morty Proxy This is a proxified and sanitized view of the page, visit original site.