Commit 8c56929

glemaitre committed
DOC use diagram and simplify pipeline in stacking example (scikit-learn#18830)
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
Co-authored-by: Roman Yurchak <rth.yurchak@gmail.com>

1 parent c0e81e1 commit 8c56929

File tree

1 file changed

examples/ensemble/plot_stack_predictors.py
69 additions & 71 deletions (+69 −71)
@@ -15,12 +15,15 @@
 stacking strategy. Stacking slightly improves the overall performance.
 
 """
-print(__doc__)
 
 # Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
 #          Maria Telenczuk  <https://github.com/maikia>
 # License: BSD 3 clause
 
+print(__doc__)
+
+from sklearn import set_config
+set_config(display='diagram')
 
 # %%
 # Download the dataset
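For context, set_config(display='diagram') switches the notebook representation of scikit-learn estimators from plain text to an HTML diagram, which is what the reworked example relies on to show each pipeline. A minimal sketch of what this enables, assuming scikit-learn >= 0.23, where the public helper estimator_html_repr was introduced:

# Minimal sketch (assumes scikit-learn >= 0.23): with display='diagram',
# the last expression of a notebook cell renders an estimator as an HTML
# diagram; estimator_html_repr builds the same HTML programmatically.
from sklearn import set_config
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import estimator_html_repr

set_config(display='diagram')
pipe = make_pipeline(StandardScaler(), RidgeCV())
print(estimator_html_repr(pipe)[:80])  # start of the HTML Jupyter would show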
@@ -73,68 +76,56 @@ def load_ames_housing():
 ##############################################################################
 #
 # Before we can use Ames dataset we still need to do some preprocessing.
-# First, the dataset has many missing values. To impute them, we will exchange
-# categorical missing values with the new category 'missing' while the
-# numerical missing values with the 'mean' of the column. We will also encode
-# the categories with either :class:`~sklearn.preprocessing.OneHotEncoder
-# <sklearn.preprocessing.OneHotEncoder>` or
-# :class:`~sklearn.preprocessing.OrdinalEncoder
-# <sklearn.preprocessing.OrdinalEncoder>` depending for which type of model we
-# will use them (linear or non-linear model). To facilitate this preprocessing
-# we will make two pipelines.
-# You can skip this section if your data is ready to use and does
-# not need preprocessing
+# First, we will select the categorical and numerical columns of the dataset
+# to construct the first step of the pipeline.
+
+from sklearn.compose import make_column_selector
+
+cat_selector = make_column_selector(dtype_include=object)
+num_selector = make_column_selector(dtype_include=np.number)
+cat_selector(X)
 
+# %%
+num_selector(X)
+
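As an aside on the selectors introduced above: make_column_selector returns a callable that, applied to a dataframe, yields the list of matching column names. A small self-contained sketch on a hypothetical two-column frame (not the Ames data):

# Toy illustration of make_column_selector; each selector is a callable
# that returns the names of the columns matching the requested dtype.
import numpy as np
import pandas as pd
from sklearn.compose import make_column_selector

df = pd.DataFrame({"town": ["A", "B"], "area": [120.5, 80.0]})
cat_selector = make_column_selector(dtype_include=object)
num_selector = make_column_selector(dtype_include=np.number)
print(cat_selector(df))  # ['town']
print(num_selector(df))  # ['area']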
+# %%
+# Then, we will need to design preprocessing pipelines that depend on the
+# ending regressor. If the ending regressor is a linear model, one needs to
+# one-hot encode the categories. If the ending regressor is a tree-based
+# model, an ordinal encoder will be sufficient. Besides, numerical values
+# need to be standardized for a linear model while the raw numerical data
+# can be treated as is by a tree-based model. However, both models need an
+# imputer to handle missing values.
+#
+# We will first design the pipeline required for the tree-based models.
 
 from sklearn.compose import make_column_transformer
 from sklearn.impute import SimpleImputer
 from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import OrdinalEncoder
-from sklearn.preprocessing import StandardScaler
-
-
-cat_cols = X.columns[X.dtypes == 'O']
-num_cols = X.columns[X.dtypes == 'float64']
 
-categories = [
-    X[column].unique() for column in X[cat_cols]]
+cat_tree_processor = OrdinalEncoder(
+    handle_unknown="use_encoded_value", unknown_value=-1)
+num_tree_processor = SimpleImputer(strategy="mean", add_indicator=True)
 
-for cat in categories:
-    cat[cat == None] = 'missing'  # noqa
+tree_preprocessor = make_column_transformer(
+    (num_tree_processor, num_selector), (cat_tree_processor, cat_selector))
+tree_preprocessor
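To make the behaviour of the new tree preprocessor concrete, here is a hedged sketch on hypothetical toy data (assumes scikit-learn >= 0.24, where OrdinalEncoder gained handle_unknown="use_encoded_value"); it is an illustration, not part of the example itself:

# The ordinal encoder maps categories to integers; the imputer fills the
# NaN with the column mean and, because add_indicator=True, appends a
# binary missing-indicator column.
import numpy as np
import pandas as pd
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

df = pd.DataFrame({"kind": ["a", "b", "a"], "size": [1.0, np.nan, 3.0]})
num_selector = make_column_selector(dtype_include=np.number)
cat_selector = make_column_selector(dtype_include=object)

tree_preprocessor = make_column_transformer(
    (SimpleImputer(strategy="mean", add_indicator=True), num_selector),
    (OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
     cat_selector))
print(tree_preprocessor.fit_transform(df))
# columns: imputed 'size', missing indicator, encoded 'kind'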
 
-cat_proc_nlin = make_pipeline(
-    SimpleImputer(missing_values=None, strategy='constant',
-                  fill_value='missing'),
-    OrdinalEncoder(categories=categories)
-)
-
-num_proc_nlin = make_pipeline(SimpleImputer(strategy='mean'))
-
-cat_proc_lin = make_pipeline(
-    SimpleImputer(missing_values=None,
-                  strategy='constant',
-                  fill_value='missing'),
-    OneHotEncoder(categories=categories)
-)
-
-num_proc_lin = make_pipeline(
-    SimpleImputer(strategy='mean'),
-    StandardScaler()
-)
+# %%
+# We will now define the preprocessor used when the ending regressor
+# is a linear model.
 
-# transformation to use for non-linear estimators
-processor_nlin = make_column_transformer(
-    (cat_proc_nlin, cat_cols),
-    (num_proc_nlin, num_cols),
-    remainder='passthrough')
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import StandardScaler
 
-# transformation to use for linear estimators
-processor_lin = make_column_transformer(
-    (cat_proc_lin, cat_cols),
-    (num_proc_lin, num_cols),
-    remainder='passthrough')
+cat_linear_processor = OneHotEncoder(handle_unknown="ignore")
+num_linear_processor = make_pipeline(
+    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True))
 
+linear_preprocessor = make_column_transformer(
+    (num_linear_processor, num_selector), (cat_linear_processor, cat_selector))
+linear_preprocessor
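One design point worth noting in the linear preprocessor: StandardScaler is applied before SimpleImputer, which works because StandardScaler ignores NaNs when computing its statistics (scikit-learn >= 0.20) and leaves them in place for the imputer. A hedged sketch on a hypothetical one-column frame:

# StandardScaler passes NaNs through; SimpleImputer then fills them with
# the (scaled) column mean and appends a missing-indicator column.
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"size": [1.0, np.nan, 3.0]})
num_linear_processor = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True))
print(num_linear_processor.fit_transform(df))
# scaled values with the NaN imputed, plus an indicator column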
 
 # %%
 # Stack of predictors on a single data set
@@ -149,37 +140,44 @@ def load_ames_housing():
 # Here, we combine 3 learners (linear and non-linear) and use a ridge regressor
 # to combine their outputs together.
 #
-# Note: although we will make new pipelines with the processors which we wrote
-# in the previous section for the 3 learners, the final estimator RidgeCV()
-# does not need preprocessing of the data as it will be fed with the already
-# preprocessed output from the 3 learners.
+# .. note::
+#    Although we will make new pipelines with the processors which we wrote in
+#    the previous section for the 3 learners, the final estimator
+#    :class:`~sklearn.linear_model.RidgeCV` does not need preprocessing of
+#    the data as it will be fed with the already preprocessed output from the
+#    3 learners.
 
+from sklearn.linear_model import LassoCV
 
-from sklearn.experimental import enable_hist_gradient_boosting  # noqa
-from sklearn.ensemble import HistGradientBoostingRegressor
+lasso_pipeline = make_pipeline(linear_preprocessor, LassoCV())
+lasso_pipeline
+
+# %%
 from sklearn.ensemble import RandomForestRegressor
-from sklearn.ensemble import StackingRegressor
-from sklearn.linear_model import LassoCV
-from sklearn.linear_model import RidgeCV
 
+rf_pipeline = make_pipeline(
+    tree_preprocessor, RandomForestRegressor(random_state=42))
+rf_pipeline
 
-lasso_pipeline = make_pipeline(processor_lin,
-                               LassoCV())
+# %%
+from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn.ensemble import HistGradientBoostingRegressor
 
-rf_pipeline = make_pipeline(processor_nlin,
-                            RandomForestRegressor(random_state=42))
+gbdt_pipeline = make_pipeline(
+    tree_preprocessor, HistGradientBoostingRegressor(random_state=0))
+gbdt_pipeline
 
-gradient_pipeline = make_pipeline(
-    processor_nlin,
-    HistGradientBoostingRegressor(random_state=0))
+# %%
+from sklearn.ensemble import StackingRegressor
+from sklearn.linear_model import RidgeCV
 
 estimators = [('Random Forest', rf_pipeline),
               ('Lasso', lasso_pipeline),
-              ('Gradient Boosting', gradient_pipeline)]
-
-stacking_regressor = StackingRegressor(estimators=estimators,
-                                       final_estimator=RidgeCV())
+              ('Gradient Boosting', gbdt_pipeline)]
 
+stacking_regressor = StackingRegressor(
+    estimators=estimators, final_estimator=RidgeCV())
+stacking_regressor
 
 # %%
 # Measure and plot the results
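The "Measure and plot the results" section that this hunk ends on cross-validates each pipeline and the stacked ensemble. As a hedged sketch of that evaluation step (assuming X, y, and stacking_regressor are defined as in the example above; the actual example also plots predicted-vs-true values):

# Cross-validate the stacked regressor and report the mean R^2 score,
# mirroring the measurement section that follows in the full example.
from sklearn.model_selection import cross_validate

cv_results = cross_validate(
    stacking_regressor, X, y, scoring="r2", cv=5, n_jobs=-1)
scores = cv_results["test_score"]
print(f"R2: {scores.mean():.2f} +/- {scores.std():.2f}")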
