Commit c37cf78

Merge branch 'scikit-learn:main' into update-scikit-learn
2 parents 5e76ebd + de968ed, commit c37cf78

36 files changed: +597 -364 lines

‎doc/modules/classes.rst

+1 -1 lines changed
@@ -1122,7 +1122,7 @@ See the :ref:`visualizations` section of the user guide for further details.
 
 .. autosummary::
    :toctree: generated/
-   :template: display.rst
+   :template: display_all_class_methods.rst
 
    metrics.ConfusionMatrixDisplay
    metrics.DetCurveDisplay

‎doc/whats_new/v1.3.rst

+8 -2 lines changed
@@ -240,6 +240,12 @@ Changelog
   dataframe.
   :pr:`25931` by :user:`Yao Xiao <Charlie-XIAO>`.
 
+- |Fix| :class:`ensemble.HistGradientBoostingRegressor` and
+  :class:`ensemble.HistGradientBoostingClassifier` treat negative values for
+  categorical features consistently as missing values, following LightGBM's and
+  pandas' conventions.
+  :pr:`25629` by `Thomas Fan`_.
+
 :mod:`sklearn.exception`
 ........................
 - |Feature| Added :class:`exception.InconsistentVersionWarning` which is raised
@@ -284,8 +290,8 @@ Changelog
   estimators consistent with the rest of estimators.
   :pr:`25697` by :user:`John Pangas <jpangas>`.
 
-- |Enhancement| The `n_iter_` attribute has been included in
-  :class:`linear_model.ARDRegression` to expose the actual number of iterations
+- |Enhancement| The `n_iter_` attribute has been included in
+  :class:`linear_model.ARDRegression` to expose the actual number of iterations
   required to reach the stopping criterion.
   :pr:`25697` by :user:`John Pangas <jpangas>`.
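The new |Fix| entry is straightforward to exercise; a minimal sketch (toy data; assumes scikit-learn >= 1.3, where this change landed):

import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier

rng = np.random.RandomState(0)
X = rng.randint(0, 4, size=(200, 1)).astype(float)
y = (X[:, 0] >= 2).astype(int)

clf = HistGradientBoostingClassifier(categorical_features=[0]).fit(X, y)

# A negative category code and np.nan now land in the same
# missing-values bin, so both rows should get the same prediction.
print(clf.predict(np.array([[-1.0], [np.nan]])))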

‎sklearn/calibration.py

+23 -36 lines changed
@@ -30,16 +30,16 @@
 from .utils import (
     column_or_1d,
     indexable,
-    check_matplotlib_support,
     _safe_indexing,
 )
-from .utils._response import _get_response_values_binary
 
-from .utils.multiclass import check_classification_targets, type_of_target
+from .utils.multiclass import check_classification_targets
 from .utils.parallel import delayed, Parallel
 from .utils._param_validation import StrOptions, HasMethods, Hidden
+from .utils._plotting import _BinaryClassifierCurveDisplayMixin
 from .utils.validation import (
     _check_fit_params,
+    _check_pos_label_consistency,
     _check_sample_weight,
     _num_samples,
     check_consistent_length,
@@ -48,7 +48,6 @@
 from .isotonic import IsotonicRegression
 from .svm import LinearSVC
 from .model_selection import check_cv, cross_val_predict
-from .metrics._base import _check_pos_label_consistency
 
 
 class CalibratedClassifierCV(ClassifierMixin, MetaEstimatorMixin, BaseEstimator):
@@ -1013,7 +1012,7 @@ def calibration_curve(
     return prob_true, prob_pred
 
 
-class CalibrationDisplay:
+class CalibrationDisplay(_BinaryClassifierCurveDisplayMixin):
     """Calibration curve (also known as reliability diagram) visualization.
 
     It is recommended to use
@@ -1124,13 +1123,8 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs):
         display : :class:`~sklearn.calibration.CalibrationDisplay`
             Object that stores computed values.
         """
-        check_matplotlib_support("CalibrationDisplay.plot")
-        import matplotlib.pyplot as plt
+        self.ax_, self.figure_, name = self._validate_plot_params(ax=ax, name=name)
 
-        if ax is None:
-            fig, ax = plt.subplots()
-
-        name = self.estimator_name if name is None else name
         info_pos_label = (
             f"(Positive class: {self.pos_label})" if self.pos_label is not None else ""
         )
@@ -1141,20 +1135,20 @@ def plot(self, *, ax=None, name=None, ref_line=True, **kwargs):
         line_kwargs.update(**kwargs)
 
         ref_line_label = "Perfectly calibrated"
-        existing_ref_line = ref_line_label in ax.get_legend_handles_labels()[1]
+        existing_ref_line = ref_line_label in self.ax_.get_legend_handles_labels()[1]
         if ref_line and not existing_ref_line:
-            ax.plot([0, 1], [0, 1], "k:", label=ref_line_label)
-        self.line_ = ax.plot(self.prob_pred, self.prob_true, "s-", **line_kwargs)[0]
+            self.ax_.plot([0, 1], [0, 1], "k:", label=ref_line_label)
+        self.line_ = self.ax_.plot(self.prob_pred, self.prob_true, "s-", **line_kwargs)[
+            0
+        ]
 
         # We always have to show the legend for at least the reference line
-        ax.legend(loc="lower right")
+        self.ax_.legend(loc="lower right")
 
         xlabel = f"Mean predicted probability {info_pos_label}"
         ylabel = f"Fraction of positives {info_pos_label}"
-        ax.set(xlabel=xlabel, ylabel=ylabel)
+        self.ax_.set(xlabel=xlabel, ylabel=ylabel)
 
-        self.ax_ = ax
-        self.figure_ = ax.figure
         return self
 
     @classmethod
@@ -1260,15 +1254,15 @@ def from_estimator(
         >>> disp = CalibrationDisplay.from_estimator(clf, X_test, y_test)
         >>> plt.show()
         """
-        method_name = f"{cls.__name__}.from_estimator"
-        check_matplotlib_support(method_name)
-
-        check_is_fitted(estimator)
-        y_prob, pos_label = _get_response_values_binary(
-            estimator, X, response_method="predict_proba", pos_label=pos_label
+        y_prob, pos_label, name = cls._validate_and_get_response_values(
+            estimator,
+            X,
+            y,
+            response_method="predict_proba",
+            pos_label=pos_label,
+            name=name,
         )
 
-        name = name if name is not None else estimator.__class__.__name__
         return cls.from_predictions(
             y,
             y_prob,
@@ -1378,26 +1372,19 @@ def from_predictions(
         >>> disp = CalibrationDisplay.from_predictions(y_test, y_prob)
         >>> plt.show()
         """
-        method_name = f"{cls.__name__}.from_predictions"
-        check_matplotlib_support(method_name)
-
-        target_type = type_of_target(y_true)
-        if target_type != "binary":
-            raise ValueError(
-                f"The target y is not binary. Got {target_type} type of target."
-            )
+        pos_label_validated, name = cls._validate_from_predictions_params(
+            y_true, y_prob, sample_weight=None, pos_label=pos_label, name=name
+        )
 
         prob_true, prob_pred = calibration_curve(
             y_true, y_prob, n_bins=n_bins, strategy=strategy, pos_label=pos_label
         )
-        name = "Classifier" if name is None else name
-        pos_label = _check_pos_label_consistency(pos_label, y_true)
 
         disp = cls(
            prob_true=prob_true,
            prob_pred=prob_pred,
            y_prob=y_prob,
            estimator_name=name,
-           pos_label=pos_label,
+           pos_label=pos_label_validated,
        )
        return disp.plot(ax=ax, ref_line=ref_line, **kwargs)
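The refactor above changes only internals; the public entry points behave as before. A short usage sketch consistent with the docstring example (toy dataset and estimator chosen for illustration):

import matplotlib.pyplot as plt
from sklearn.calibration import CalibrationDisplay
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression().fit(X_train, y_train)

# Validation and naming now go through the shared mixin helpers, but the
# display still exposes ax_, figure_ and line_ as documented.
disp = CalibrationDisplay.from_estimator(clf, X_test, y_test)
plt.show()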

‎sklearn/compose/_column_transformer.py

+2 -0 lines changed
@@ -936,6 +936,8 @@ def _get_transformer_list(estimators):
     return transformer_list
 
 
+# This function is not validated using validate_params because
+# it's just a factory for ColumnTransformer.
 def make_column_transformer(
     *transformers,
     remainder="drop",
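For context, a small usage sketch of the factory the new comment refers to; parameter validation happens in the ColumnTransformer it constructs, not in the factory itself:

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

ct = make_column_transformer(
    (StandardScaler(), [0, 1]),  # numeric columns
    (OneHotEncoder(), [2]),      # categorical column
    remainder="drop",
)
# Transformer names such as 'standardscaler' are auto-generated.
print(ct.transformers)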

‎sklearn/discriminant_analysis.py

+1 -1 lines changed
@@ -640,7 +640,7 @@ def fit(self, X, y):
             intercept_ = xp.asarray(
                 self.intercept_[1] - self.intercept_[0], dtype=X.dtype
             )
-            self.intercept_ = xp.reshape(intercept_, 1)
+            self.intercept_ = xp.reshape(intercept_, (1,))
         self._n_features_out = self._max_components
         return self
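The tuple form matters for portability: the array API standard's reshape takes a shape tuple, whereas NumPy also tolerates a bare int. A quick NumPy-only illustration of the two spellings:

import numpy as np

x = np.array([3.0])

# NumPy accepts both forms ...
assert np.reshape(x, 1).shape == (1,)
assert np.reshape(x, (1,)).shape == (1,)
# ... but strict array API namespaces (e.g. array_api_strict) only accept
# the tuple, so (1,) keeps fit() portable across non-NumPy backends.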

‎sklearn/ensemble/_hist_gradient_boosting/_binning.pyx

+11 -1 lines changed
@@ -8,6 +8,7 @@ from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C
 
 def _map_to_bins(const X_DTYPE_C [:, :] data,
                  list binning_thresholds,
+                 const unsigned char[::1] is_categorical,
                  const unsigned char missing_values_bin_idx,
                  int n_threads,
                  X_BINNED_DTYPE_C [::1, :] binned):
@@ -23,6 +24,8 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
     binning_thresholds : list of arrays
         For each feature, stores the increasing numeric values that are
         used to separate the bins.
+    is_categorical : ndarray of unsigned char of shape (n_features,)
+        Indicates categorical features.
     n_threads : int
         Number of OpenMP threads to use.
     binned : ndarray, shape (n_samples, n_features)
@@ -34,13 +37,15 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
     for feature_idx in range(data.shape[1]):
         _map_col_to_bins(data[:, feature_idx],
                          binning_thresholds[feature_idx],
+                         is_categorical[feature_idx],
                          missing_values_bin_idx,
                          n_threads,
                          binned[:, feature_idx])
 
 
 cdef void _map_col_to_bins(const X_DTYPE_C [:] data,
                            const X_DTYPE_C [:] binning_thresholds,
+                           const unsigned char is_categorical,
                            const unsigned char missing_values_bin_idx,
                            int n_threads,
                            X_BINNED_DTYPE_C [:] binned):
@@ -53,7 +58,12 @@ cdef void _map_col_to_bins(const X_DTYPE_C [:] data,
 
     for i in prange(data.shape[0], schedule='static', nogil=True,
                     num_threads=n_threads):
-        if isnan(data[i]):
+        if (
+            isnan(data[i]) or
+            # To follow LightGBM's conventions, negative values for
+            # categorical features are considered as missing values.
+            (is_categorical and data[i] < 0)
+        ):
             binned[i] = missing_values_bin_idx
         else:
             # for known values, use binary search
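A pure-Python sketch of the rule the kernel implements (simplified on purpose: one feature, no OpenMP, plain NumPy instead of typed memoryviews):

import numpy as np

def map_col_to_bins(data, thresholds, is_categorical, missing_bin_idx):
    binned = np.empty(data.shape[0], dtype=np.uint8)
    for i, value in enumerate(data):
        # NaN, and negative values of categorical features, are sent to
        # the dedicated missing-values bin (LightGBM's convention).
        if np.isnan(value) or (is_categorical and value < 0):
            binned[i] = missing_bin_idx
        else:
            # Known values fall back to binary search over the thresholds.
            binned[i] = np.searchsorted(thresholds, value, side="left")
    return binned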

‎sklearn/ensemble/_hist_gradient_boosting/binning.py

+6 -1 lines changed
@@ -275,7 +275,12 @@ def transform(self, X):
         n_threads = _openmp_effective_n_threads(self.n_threads)
         binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F")
         _map_to_bins(
-            X, self.bin_thresholds_, self.missing_values_bin_idx_, n_threads, binned
+            X,
+            self.bin_thresholds_,
+            self.is_categorical_,
+            self.missing_values_bin_idx_,
+            n_threads,
+            binned,
         )
         return binned
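A hedged sketch of the updated call path through _BinMapper (a private class, so this is illustrative only and may change between releases):

import numpy as np
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper

X_fit = np.array([[4.0] * 5 + [1.0] * 3 + [5.0] * 2], dtype=np.float64).T
bin_mapper = _BinMapper(
    n_bins=4,  # 3 category bins + 1 missing-values bin
    is_categorical=np.array([True]),
    known_categories=[np.array([1.0, 4.0, 5.0])],
).fit(X_fit)

# transform() now forwards is_categorical_ to _map_to_bins, so -1 and
# np.nan should both map to bin_mapper.missing_values_bin_idx_ (3 here).
print(bin_mapper.transform(np.array([[-1.0, 1.0, np.nan]]).T).ravel())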

‎sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py

+9 -6 lines changed
@@ -269,6 +269,11 @@ def _check_categories(self, X):
             if missing.any():
                 categories = categories[~missing]
 
+            # Treat negative values for categorical features as missing values.
+            negative_categories = categories < 0
+            if negative_categories.any():
+                categories = categories[~negative_categories]
+
             if hasattr(self, "feature_names_in_"):
                 feature_name = f"'{self.feature_names_in_[f_idx]}'"
             else:
@@ -1265,9 +1270,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
         data has feature names).
 
         For each categorical feature, there must be at most `max_bins` unique
-        categories, and each categorical value must be in [0, max_bins -1].
-        During prediction, categories encoded as a negative value are treated as
-        missing values.
+        categories, and each categorical value must be less than `max_bins - 1`.
+        Negative values for categorical features are treated as missing values.
 
         Read more in the :ref:`User Guide <categorical_support_gbdt>`.
@@ -1623,9 +1627,8 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
         data has feature names).
 
         For each categorical feature, there must be at most `max_bins` unique
-        categories, and each categorical value must be in [0, max_bins -1].
-        During prediction, categories encoded as a negative value are treated as
-        missing values.
+        categories, and each categorical value must be less than `max_bins - 1`.
+        Negative values for categorical features are treated as missing values.
 
         Read more in the :ref:`User Guide <categorical_support_gbdt>`.
‎sklearn/ensemble/_hist_gradient_boosting/tests/test_binning.py

+31 -5 lines changed
@@ -95,8 +95,9 @@ def test_map_to_bins(max_bins):
         _find_binning_thresholds(DATA[:, i], max_bins=max_bins) for i in range(2)
     ]
     binned = np.zeros_like(DATA, dtype=X_BINNED_DTYPE, order="F")
+    is_categorical = np.zeros(2, dtype=np.uint8)
     last_bin_idx = max_bins
-    _map_to_bins(DATA, bin_thresholds, last_bin_idx, n_threads, binned)
+    _map_to_bins(DATA, bin_thresholds, is_categorical, last_bin_idx, n_threads, binned)
     assert binned.shape == DATA.shape
     assert binned.dtype == np.uint8
     assert binned.flags.f_contiguous
@@ -357,10 +358,35 @@ def test_categorical_feature(n_bins):
     expected_trans = np.array([[0, 1, 2, n_bins - 1, 3, 4, 5]]).T
     assert_array_equal(bin_mapper.transform(X), expected_trans)
 
-    # For unknown categories, the mapping is incorrect / undefined. This never
-    # happens in practice. This check is only for illustration purpose.
-    X = np.array([[-1, 100]], dtype=X_DTYPE).T
-    expected_trans = np.array([[0, 6]]).T
+    # Negative categories are mapped to the missing values' bin
+    # (i.e. the bin of index `missing_values_bin_idx_ == n_bins - 1`).
+    # Unknown positive categories do not happen in practice and are only
+    # tested for illustration purposes.
+    X = np.array([[-4, -1, 100]], dtype=X_DTYPE).T
+    expected_trans = np.array([[n_bins - 1, n_bins - 1, 6]]).T
+    assert_array_equal(bin_mapper.transform(X), expected_trans)
+
+
+def test_categorical_feature_negative_missing():
+    """Make sure bin mapper treats negative categories as missing values."""
+    X = np.array(
+        [[4] * 500 + [1] * 3 + [5] * 10 + [-1] * 3 + [np.nan] * 4], dtype=X_DTYPE
+    ).T
+    bin_mapper = _BinMapper(
+        n_bins=4,
+        is_categorical=np.array([True]),
+        known_categories=[np.array([1, 4, 5], dtype=X_DTYPE)],
+    ).fit(X)
+
+    assert bin_mapper.n_bins_non_missing_ == [3]
+
+    X = np.array([[-1, 1, 3, 5, np.nan]], dtype=X_DTYPE).T
+
+    # Negative values for categorical features are considered missing values.
+    # They are mapped to the bin of index `bin_mapper.missing_values_bin_idx_`,
+    # which is 3 here.
+    assert bin_mapper.missing_values_bin_idx_ == 3
+    expected_trans = np.array([[3, 0, 1, 2, 3]]).T
     assert_array_equal(bin_mapper.transform(X), expected_trans)

‎sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py

+6 -3 lines changed
@@ -950,7 +950,10 @@ def test_staged_predict(HistGradientBoosting, X, y):
     "Est", (HistGradientBoostingRegressor, HistGradientBoostingClassifier)
 )
 @pytest.mark.parametrize("bool_categorical_parameter", [True, False])
-def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter):
+@pytest.mark.parametrize("missing_value", [np.nan, -1])
+def test_unknown_categories_nan(
+    insert_missing, Est, bool_categorical_parameter, missing_value
+):
     # Make sure no error is raised at predict if a category wasn't seen during
     # fit. We also make sure they're treated as nans.
 
@@ -970,7 +973,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter)
     if insert_missing:
         mask = rng.binomial(1, 0.01, size=X.shape).astype(bool)
         assert mask.sum() > 0
-        X[mask] = np.nan
+        X[mask] = missing_value
 
     est = Est(max_iter=20, categorical_features=categorical_features).fit(X, y)
     assert_array_equal(est.is_categorical_, [False, True])
@@ -979,7 +982,7 @@ def test_unknown_categories_nan(insert_missing, Est, bool_categorical_parameter)
     # unknown categories will be treated as nans
     X_test = np.zeros((10, X.shape[1]), dtype=float)
     X_test[:5, 1] = 30
-    X_test[5:, 1] = np.nan
+    X_test[5:, 1] = missing_value
     assert len(np.unique(est.predict(X_test))) == 1

‎sklearn/impute/_base.py

+3 -3 lines changed
@@ -11,7 +11,7 @@
 from scipy import sparse as sp
 
 from ..base import BaseEstimator, TransformerMixin
-from ..utils._param_validation import StrOptions, Hidden
+from ..utils._param_validation import StrOptions, Hidden, MissingValues
 from ..utils.fixes import _mode
 from ..utils.sparsefuncs import _get_median
 from ..utils.validation import check_is_fitted
@@ -78,7 +78,7 @@ class _BaseImputer(TransformerMixin, BaseEstimator):
     """
 
     _parameter_constraints: dict = {
-        "missing_values": ["missing_values"],
+        "missing_values": [MissingValues()],
         "add_indicator": ["boolean"],
         "keep_empty_features": ["boolean"],
     }
@@ -800,7 +800,7 @@ class MissingIndicator(TransformerMixin, BaseEstimator):
     """
 
     _parameter_constraints: dict = {
-        "missing_values": [numbers.Real, numbers.Integral, str, None],
+        "missing_values": [MissingValues()],
         "features": [StrOptions({"missing-only", "all"})],
         "sparse": ["boolean", StrOptions({"auto"})],
         "error_on_new": ["boolean"],

‎sklearn/linear_model/_base.py

+1 -1 lines changed
@@ -399,7 +399,7 @@ def decision_function(self, X):
 
         X = self._validate_data(X, accept_sparse="csr", reset=False)
         scores = safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_
-        return xp.reshape(scores, -1) if scores.shape[1] == 1 else scores
+        return xp.reshape(scores, (-1,)) if scores.shape[1] == 1 else scores
 
     def predict(self, X):
         """
