Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

FIX LabelPropagation handling of sparse matrices #17085 #17384

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions 5 sklearn/semi_supervised/_label_propagation.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ class labels.
"""
check_is_fitted(self)

X_2d = check_array(X, accept_sparse=['csc', 'csr', 'coo', 'dok',
X_2d = check_array(X, accept_sparse=['csc', 'csr', 'coo',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this use `_validate_data`? Ping @NicolasHug.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm also really not sure why this fixes the issue

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this use _validate_data

We only use `_validate_data` in `fit` for now, so `check_array` is fine here.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the feedback @adrinjalali: PR #17085 already contains tests extended to the sparse case. Could you please advise a newbie regarding regression tests?

'bsr', 'lil', 'dia'])
weight_matrices = self._get_kernel(self.X_, X_2d)
if self.kernel == 'knn':
Expand Down Expand Up @@ -225,7 +225,8 @@ def fit(self, X, y):
-------
self : object
"""
X, y = self._validate_data(X, y)
X, y = self._validate_data(X, y, accept_sparse=['csc', 'csr', 'coo',
'bsr', 'lil', 'dia'])
self.X_ = X
check_classification_targets(y)

Expand Down
97 changes: 55 additions & 42 deletions 97 sklearn/semi_supervised/tests/test_label_propagation.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
""" test the label propagation module """

import numpy as np
import pytest
import numpy as np
# Some tests fail for dok_matrix.
from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix,
lil_matrix, issparse)

from scipy.sparse import issparse
from sklearn.utils._testing import assert_warns
from sklearn.utils._testing import assert_no_warnings
from sklearn.semi_supervised import _label_propagation as label_propagation
Expand All @@ -15,6 +17,9 @@
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_array_equal

SPARSE_TYPES = (bsr_matrix, coo_matrix, csc_matrix, csr_matrix, lil_matrix)
SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,)

ESTIMATORS = [
(label_propagation.LabelPropagation, {'kernel': 'rbf'}),
(label_propagation.LabelPropagation, {'kernel': 'knn', 'n_neighbors': 2}),
Expand Down Expand Up @@ -73,18 +78,19 @@ def test_label_spreading_closed_form():
X, y = make_classification(n_classes=n_classes, n_samples=200,
random_state=0)
y[::3] = -1
clf = label_propagation.LabelSpreading().fit(X, y)
# adopting notation from Zhou et al (2004):
S = clf._build_graph()
Y = np.zeros((len(y), n_classes + 1))
Y[np.arange(len(y)), y] = 1
Y = Y[:, :-1]
for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
expected = np.dot(np.linalg.inv(np.eye(len(S)) - alpha * S), Y)
expected /= expected.sum(axis=1)[:, np.newaxis]
clf = label_propagation.LabelSpreading(max_iter=10000, alpha=alpha)
clf.fit(X, y)
assert_array_almost_equal(expected, clf.label_distributions_, 4)
for sparse_or_dense in SPARSE_OR_DENSE:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

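Could you please parametrize this test over both the sparse/dense container and alpha (e.g. with `pytest.mark.parametrize`) instead of looping inside the test? The same applies to the other tests.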

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@adrinjalali: I didn't write the original tests, so I'm probably not the right one to ask for improving these with regard to parametrization.

clf = label_propagation.LabelSpreading().fit(sparse_or_dense(X), y)
# adopting notation from Zhou et al (2004):
S = clf._build_graph()
Y = np.zeros((len(y), n_classes + 1))
Y[np.arange(len(y)), y] = 1
Y = Y[:, :-1]
for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
expected = np.dot(np.linalg.inv(np.eye(len(S)) - alpha * S), Y)
expected /= expected.sum(axis=1)[:, np.newaxis]
clf = label_propagation.LabelSpreading(max_iter=10000, alpha=alpha)
clf.fit(sparse_or_dense(X), y)
assert_array_almost_equal(expected, clf.label_distributions_, 4)


def test_label_propagation_closed_form():
Expand All @@ -97,6 +103,7 @@ def test_label_propagation_closed_form():
unlabelled_idx = Y[:, (-1,)].nonzero()[0]
labelled_idx = (Y[:, (-1,)] == 0).nonzero()[0]

# This test fails for sparse matrices!
clf = label_propagation.LabelPropagation(max_iter=10000,
gamma=0.1)
clf.fit(X, y)
Expand All @@ -121,40 +128,44 @@ def test_valid_alpha():
n_classes = 2
X, y = make_classification(n_classes=n_classes, n_samples=200,
random_state=0)
for alpha in [-0.1, 0, 1, 1.1, None]:
with pytest.raises(ValueError):
label_propagation.LabelSpreading(alpha=alpha).fit(X, y)
for sparse_or_dense in SPARSE_OR_DENSE:
for alpha in [-0.1, 0, 1, 1.1, None]:
with pytest.raises(ValueError):
label_propagation.LabelSpreading(alpha=alpha).fit(
sparse_or_dense(X), y)


def test_convergence_speed():
# This is a non-regression test for #5774
X = np.array([[1., 0.], [0., 1.], [1., 2.5]])
y = np.array([0, 1, -1])
mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=5000)
mdl.fit(X, y)
for sparse_or_dense in SPARSE_OR_DENSE:
mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=5000)
mdl.fit(sparse_or_dense(X), y)

# this should converge quickly:
assert mdl.n_iter_ < 10
assert_array_equal(mdl.predict(X), [0, 1, 1])
# this should converge quickly:
assert mdl.n_iter_ < 10
assert_array_equal(mdl.predict(X), [0, 1, 1])


def test_convergence_warning():
# This is a non-regression test for #5774
X = np.array([[1., 0.], [0., 1.], [1., 2.5]])
y = np.array([0, 1, -1])
mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=1)
assert_warns(ConvergenceWarning, mdl.fit, X, y)
assert mdl.n_iter_ == mdl.max_iter
for sparse_or_dense in SPARSE_OR_DENSE:
mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=1)
assert_warns(ConvergenceWarning, mdl.fit, sparse_or_dense(X), y)
assert mdl.n_iter_ == mdl.max_iter

mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=1)
assert_warns(ConvergenceWarning, mdl.fit, X, y)
assert mdl.n_iter_ == mdl.max_iter
mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=1)
assert_warns(ConvergenceWarning, mdl.fit, sparse_or_dense(X), y)
assert mdl.n_iter_ == mdl.max_iter

mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=500)
assert_no_warnings(mdl.fit, X, y)
mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=500)
assert_no_warnings(mdl.fit, sparse_or_dense(X), y)

mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=500)
assert_no_warnings(mdl.fit, X, y)
mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=500)
assert_no_warnings(mdl.fit, sparse_or_dense(X), y)


def test_label_propagation_non_zero_normalizer():
Expand All @@ -163,10 +174,11 @@ def test_label_propagation_non_zero_normalizer():
# https://github.com/scikit-learn/scikit-learn/pull/15946
X = np.array([[100., 100.], [100., 100.], [0., 0.], [0., 0.]])
y = np.array([0, 1, -1, -1])
mdl = label_propagation.LabelSpreading(kernel='knn',
max_iter=100,
n_neighbors=1)
assert_no_warnings(mdl.fit, X, y)
for sparse_or_dense in SPARSE_OR_DENSE:
mdl = label_propagation.LabelSpreading(kernel='knn',
max_iter=100,
n_neighbors=1)
assert_no_warnings(mdl.fit, sparse_or_dense(X), y)


def test_predict_sparse_callable_kernel():
Expand Down Expand Up @@ -196,10 +208,11 @@ def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5):
test_size=n_test,
random_state=0)

model = label_propagation.LabelSpreading(kernel=topk_rbf)
model.fit(X_train, y_train)
assert model.score(X_test, y_test) >= 0.9
for sparse_or_dense in SPARSE_OR_DENSE:
model = label_propagation.LabelSpreading(kernel=topk_rbf)
model.fit(sparse_or_dense(X_train), y_train)
assert model.score(X_test, y_test) >= 0.9

model = label_propagation.LabelPropagation(kernel=topk_rbf)
model.fit(X_train, y_train)
assert model.score(X_test, y_test) >= 0.9
model = label_propagation.LabelPropagation(kernel=topk_rbf)
model.fit(sparse_or_dense(X_train), y_train)
assert model.score(X_test, y_test) >= 0.9
Morty Proxy This is a proxified and sanitized view of the page, visit original site.