diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index bc1c4ac32ce1d..bae7583a22b5a 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -257,8 +257,11 @@ def fit(self, X, y=None): elif isinstance(self.n_features_to_select, Real): self.n_features_to_select_ = int(n_features * self.n_features_to_select) - if self.tol is not None and self.tol < 0 and self.direction == "forward": - raise ValueError("tol must be positive when doing forward selection") + if self.tol is not None: + if self.tol < 0 and self.direction == "forward": + raise ValueError("tol must be positive when doing forward selection") + if self.tol > 0 and self.direction == "backward": + raise ValueError("tol must be negative when doing backward selection") cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator)) @@ -274,13 +277,28 @@ def fit(self, X, y=None): else n_features - self.n_features_to_select_ ) - old_score = -np.inf + if self.direction == "forward": + old_score = -np.inf + else: + old_score = cross_val_score( + cloned_estimator, + X, + y, + cv=cv, + scoring=self.scoring, + n_jobs=self.n_jobs, + ).mean() + is_auto_select = self.tol is not None and self.n_features_to_select == "auto" for _ in range(n_iterations): new_feature_idx, new_score = self._get_best_new_feature_score( cloned_estimator, X, y, cv, current_mask ) + if is_auto_select and ((new_score - old_score) < self.tol): + # The score has not improved enough by adding the latest feature, + # so we stop. Or, the score has decreased too much by removing the + # latest feature, so we stop break old_score = new_score diff --git a/sklearn/feature_selection/tests/test_sequential.py b/sklearn/feature_selection/tests/test_sequential.py index 98134addc39e7..e3205f0aac977 100644 --- a/sklearn/feature_selection/tests/test_sequential.py +++ b/sklearn/feature_selection/tests/test_sequential.py @@ -46,14 +46,18 @@ def test_n_features_to_select(direction, n_features_to_select): assert sfs.transform(X).shape[1] == n_features_to_select -@pytest.mark.parametrize("direction", ("forward", "backward")) -def test_n_features_to_select_auto(direction): - """Check the behaviour of `n_features_to_select="auto"` with different - values for the parameter `tol`. +@pytest.mark.parametrize( + "direction,max_features_to_select", (("forward", 9), ("backward", 10)) +) +def test_n_features_to_select_auto(direction, max_features_to_select): + """Check the behaviour of `n_features_to_select="auto"` when selecting + features in a forward and backward direction. """ - n_features = 10 tol = 1e-3 + if direction == "backward": + tol *= -1 + X, y = make_regression(n_features=n_features, random_state=0) sfs = SequentialFeatureSelector( LinearRegression(), @@ -64,7 +68,7 @@ def test_n_features_to_select_auto(direction): ) sfs.fit(X, y) - max_features_to_select = n_features - 1 + # max_features_to_select = n_features - 1 assert sfs.get_support(indices=True).shape[0] <= max_features_to_select assert sfs.n_features_to_select_ <= max_features_to_select @@ -95,6 +99,8 @@ def test_n_features_to_select_stopping_criterion(direction): X, y = make_regression(n_features=50, n_informative=10, random_state=0) tol = 1e-3 + if direction == "backward": + tol *= -1 sfs = SequentialFeatureSelector( LinearRegression(), @@ -130,7 +136,15 @@ def test_n_features_to_select_stopping_criterion(direction): assert (sfs_cv_score - added_cv_score) <= tol assert (sfs_cv_score - removed_cv_score) >= tol else: - assert (added_cv_score - sfs_cv_score) <= tol + assert sfs_cv_score <= added_cv_score + assert sfs_cv_score >= removed_cv_score + # The "added" score should be equal or higher than the SFS score + # so the difference between them should be >= tol, which is a + # negative number. + assert (sfs_cv_score - added_cv_score) >= tol + # Because tol is negative the delta between scores should be + # less than or equal to the tolerance, in absolute terms + # the delta is bigger than the tolerance assert (removed_cv_score - sfs_cv_score) <= tol @@ -281,8 +295,8 @@ def test_no_y_validation_model_fit(y): sfs.fit(X, y) -def test_forward_neg_tol_error(): - """Check that we raise an error when tol<0 and direction='forward'""" +def test_tol_sign_depends_on_direction(): + """Check that we raise an error if the sign of tol and direction do not match""" X, y = make_regression(n_features=10, random_state=0) sfs = SequentialFeatureSelector( LinearRegression(), @@ -290,10 +304,18 @@ def test_forward_neg_tol_error(): direction="forward", tol=-1e-3, ) - with pytest.raises(ValueError, match="tol must be positive"): sfs.fit(X, y) + sfs = SequentialFeatureSelector( + LinearRegression(), + n_features_to_select="auto", + direction="backward", + tol=+1e-3, + ) + with pytest.raises(ValueError, match="tol must be negative"): + sfs.fit(X, y) + def test_backward_neg_tol(): """Check that SequentialFeatureSelector works negative tol @@ -334,3 +356,28 @@ def test_cv_generator_support(): sfs = SequentialFeatureSelector(knc, n_features_to_select=5, cv=splits) sfs.fit(X, y) + + +def test_backwards_doesnt_remove_feature(): + """All features should be kept. + + Non regression test for #26369 + """ + expected_selected_features = [ + 0, + 1, + ] + rng = np.random.RandomState(0) + n_samples = 500 + X = rng.randn(n_samples, 2) + y = 3 * X[:, 0] - 10 * X[:, 1] + + sfs = SequentialFeatureSelector( + LinearRegression(), + direction="backward", + cv=2, + n_features_to_select="auto", + tol=-0.01, + ) + sfs.fit(X, y) + assert_array_equal(sfs.get_support(indices=True), expected_selected_features)