stacking strategy. Stacking slightly improves the overall performance.
"""
- print(__doc__)

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
#          Maria Telenczuk <https://github.com/maikia>
# License: BSD 3 clause

+ print(__doc__)
+
+ from sklearn import set_config
+ set_config(display='diagram')
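
As an aside (a minimal sketch, not part of this diff): `set_config(display='diagram')` switches scikit-learn's default estimator representation to an HTML diagram, which is why bare expressions such as `tree_preprocessor` appear at the end of cells further down in this file; in a notebook they render as interactive diagrams.

    # Minimal sketch: enable the HTML diagram representation of estimators.
    from sklearn import set_config
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import Ridge

    set_config(display='diagram')
    model = make_pipeline(StandardScaler(), Ridge())
    model  # as the last expression of a notebook cell, this renders as a diagram
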

# %%
# Download the dataset
@@ -73,68 +76,56 @@ def load_ames_housing():
##############################################################################
#
# Before we can use the Ames dataset, we still need to do some preprocessing.
- # First, the dataset has many missing values. To impute them, we will exchange
- # categorical missing values with the new category 'missing' while the
- # numerical missing values with the 'mean' of the column. We will also encode
- # the categories with either :class:`~sklearn.preprocessing.OneHotEncoder
- # <sklearn.preprocessing.OneHotEncoder>` or
- # :class:`~sklearn.preprocessing.OrdinalEncoder
- # <sklearn.preprocessing.OrdinalEncoder>` depending for which type of model we
- # will use them (linear or non-linear model). To facilitate this preprocessing
- # we will make two pipelines.
- # You can skip this section if your data is ready to use and does
- # not need preprocessing
+ # First, we will select the categorical and numerical columns of the dataset to
+ # construct the first step of the pipeline.
+
+ from sklearn.compose import make_column_selector
+
+ cat_selector = make_column_selector(dtype_include=object)
+ num_selector = make_column_selector(dtype_include=np.number)
+ cat_selector(X)

+ # %%
+ num_selector(X)
+
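
A minimal sketch of what `make_column_selector` returns, using a hypothetical two-column frame (the column names below are illustrative, not the actual Ames columns):

    import numpy as np
    import pandas as pd
    from sklearn.compose import make_column_selector

    toy = pd.DataFrame({"Neighborhood": ["A", "B"], "LotArea": [8450.0, 9600.0]})
    make_column_selector(dtype_include=object)(toy)     # ['Neighborhood']
    make_column_selector(dtype_include=np.number)(toy)  # ['LotArea']
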
+ # %%
+ # Then, we will need to design preprocessing pipelines which depend on the
+ # ending regressor. If the ending regressor is a linear model, one needs to
+ # one-hot encode the categories. If the ending regressor is a tree-based model,
+ # an ordinal encoder will be sufficient. Besides, numerical values need to be
+ # standardized for a linear model while the raw numerical data can be treated
+ # as is by a tree-based model. However, both models need an imputer to
+ # handle missing values.
+ #
+ # We will first design the pipeline required for the tree-based models.

from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
- from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
- from sklearn.preprocessing import StandardScaler
-
-
- cat_cols = X.columns[X.dtypes == 'O']
- num_cols = X.columns[X.dtypes == 'float64']

- categories = [
-     X[column].unique() for column in X[cat_cols]]
+ cat_tree_processor = OrdinalEncoder(
+     handle_unknown="use_encoded_value", unknown_value=-1)
+ num_tree_processor = SimpleImputer(strategy="mean", add_indicator=True)

- for cat in categories:
-     cat[cat == None] = 'missing'  # noqa
+ tree_preprocessor = make_column_transformer(
+     (num_tree_processor, num_selector), (cat_tree_processor, cat_selector))
+ tree_preprocessor
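
The `handle_unknown="use_encoded_value"` / `unknown_value=-1` pair matters during cross-validation, where a category may appear only in the test fold. A minimal sketch with made-up data:

    import numpy as np
    from sklearn.preprocessing import OrdinalEncoder

    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    enc.fit(np.array([["a"], ["b"]], dtype=object))
    enc.transform(np.array([["c"]], dtype=object))  # array([[-1.]]) instead of an error
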

- cat_proc_nlin = make_pipeline(
-     SimpleImputer(missing_values=None, strategy='constant',
-                   fill_value='missing'),
-     OrdinalEncoder(categories=categories)
- )
-
- num_proc_nlin = make_pipeline(SimpleImputer(strategy='mean'))
-
- cat_proc_lin = make_pipeline(
-     SimpleImputer(missing_values=None,
-                   strategy='constant',
-                   fill_value='missing'),
-     OneHotEncoder(categories=categories)
- )
-
- num_proc_lin = make_pipeline(
-     SimpleImputer(strategy='mean'),
-     StandardScaler()
- )
+ # %%
+ # Then, we will define the preprocessor used when the ending regressor
+ # is a linear model.

- # transformation to use for non-linear estimators
- processor_nlin = make_column_transformer(
-     (cat_proc_nlin, cat_cols),
-     (num_proc_nlin, num_cols),
-     remainder='passthrough')
+ from sklearn.preprocessing import OneHotEncoder
+ from sklearn.preprocessing import StandardScaler

- # transformation to use for linear estimators
- processor_lin = make_column_transformer(
-     (cat_proc_lin, cat_cols),
-     (num_proc_lin, num_cols),
-     remainder='passthrough')
+ cat_linear_processor = OneHotEncoder(handle_unknown="ignore")
+ num_linear_processor = make_pipeline(
+     StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True))

+ linear_preprocessor = make_column_transformer(
+     (num_linear_processor, num_selector), (cat_linear_processor, cat_selector))
+ linear_preprocessor
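
Similarly, `handle_unknown="ignore"` on the one-hot encoder keeps categories unseen during fit from breaking the linear pipeline at predict time. A minimal sketch with made-up data:

    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    ohe = OneHotEncoder(handle_unknown="ignore")
    ohe.fit(np.array([["a"], ["b"]], dtype=object))
    ohe.transform(np.array([["c"]], dtype=object)).toarray()  # array([[0., 0.]])
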

# %%
# Stack of predictors on a single data set
@@ -149,37 +140,44 @@ def load_ames_housing():
# Here, we combine 3 learners (linear and non-linear) and use a ridge regressor
# to combine their outputs together.
#
- # Note: although we will make new pipelines with the processors which we wrote
- # in the previous section for the 3 learners, the final estimator RidgeCV()
- # does not need preprocessing of the data as it will be fed with the already
- # preprocessed output from the 3 learners.
+ # .. note::
+ #    Although we will make new pipelines with the processors which we wrote in
+ #    the previous section for the 3 learners, the final estimator
+ #    :class:`~sklearn.linear_model.RidgeCV()` does not need preprocessing of
+ #    the data as it will be fed with the already preprocessed output from the 3
+ #    learners.

+ from sklearn.linear_model import LassoCV

- from sklearn.experimental import enable_hist_gradient_boosting  # noqa
- from sklearn.ensemble import HistGradientBoostingRegressor
+ lasso_pipeline = make_pipeline(linear_preprocessor, LassoCV())
+ lasso_pipeline
+
+ # %%
from sklearn.ensemble import RandomForestRegressor
- from sklearn.ensemble import StackingRegressor
- from sklearn.linear_model import LassoCV
- from sklearn.linear_model import RidgeCV

+ rf_pipeline = make_pipeline(
+     tree_preprocessor, RandomForestRegressor(random_state=42))
+ rf_pipeline

- lasso_pipeline = make_pipeline(processor_lin,
-                                LassoCV())
+ # %%
+ from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+ from sklearn.ensemble import HistGradientBoostingRegressor

- rf_pipeline = make_pipeline(processor_nlin,
-                             RandomForestRegressor(random_state=42))
+ gbdt_pipeline = make_pipeline(
+     tree_preprocessor, HistGradientBoostingRegressor(random_state=0))
+ gbdt_pipeline

- gradient_pipeline = make_pipeline(
-     processor_nlin,
-     HistGradientBoostingRegressor(random_state=0))
+ # %%
+ from sklearn.ensemble import StackingRegressor
+ from sklearn.linear_model import RidgeCV

estimators = [('Random Forest', rf_pipeline),
              ('Lasso', lasso_pipeline),
-               ('Gradient Boosting', gradient_pipeline)]
-
- stacking_regressor = StackingRegressor(estimators=estimators,
-                                        final_estimator=RidgeCV())
+               ('Gradient Boosting', gbdt_pipeline)]

+ stacking_regressor = StackingRegressor(
+     estimators=estimators, final_estimator=RidgeCV())
+ stacking_regressor
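
A rough sketch of how the stacked pipeline could be scored before the plotting section below; the scorer, `cv` value, and the assumption that `X` and `y` come from the earlier loading step are illustrative choices, not taken from the example:

    from sklearn.model_selection import cross_validate

    cv_results = cross_validate(
        stacking_regressor, X, y, scoring="neg_mean_absolute_error", cv=3, n_jobs=-1)
    cv_results["test_score"]
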

# %%
# Measure and plot the results