DOC use diagram and simplify pipeline in stacking example #18830


Merged · 11 commits · Feb 1, 2021

examples/ensemble/plot_stack_predictors.py: 140 changes (69 additions, 71 deletions)
@@ -15,12 +15,15 @@
stacking strategy. Stacking slightly improves the overall performance.

"""
-print(__doc__)

# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# Maria Telenczuk <https://github.com/maikia>
# License: BSD 3 clause

+print(__doc__)

+from sklearn import set_config
+set_config(display='diagram')

# %%
# Download the dataset
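
The headline change above is `set_config(display='diagram')`: once set, any estimator
echoed at the end of a notebook cell renders as an interactive HTML block diagram
instead of a wall of text, which is why the rewritten example can simply end cells
with `tree_preprocessor`, `lasso_pipeline`, and so on. A minimal sketch of the effect
(the toy pipeline here is illustrative, not part of the example):

    from sklearn import set_config
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    set_config(display='diagram')

    pipe = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())
    pipe  # in Jupyter, this displays a clickable diagram of the two steps

    # outside a notebook, the same HTML can be produced explicitly
    from sklearn.utils import estimator_html_repr
    html = estimator_html_repr(pipe)
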
@@ -73,68 +76,56 @@ def load_ames_housing():
-##############################################################################
#
# Before we can use Ames dataset we still need to do some preprocessing.
-# First, the dataset has many missing values. To impute them, we will exchange
-# categorical missing values with the new category 'missing' while the
-# numerical missing values with the 'mean' of the column. We will also encode
-# the categories with either :class:`~sklearn.preprocessing.OneHotEncoder
-# <sklearn.preprocessing.OneHotEncoder>` or
-# :class:`~sklearn.preprocessing.OrdinalEncoder
-# <sklearn.preprocessing.OrdinalEncoder>` depending for which type of model we
-# will use them (linear or non-linear model). To facilitate this preprocessing
-# we will make two pipelines.
-# You can skip this section if your data is ready to use and does
-# not need preprocessing
+# First, we will select the categorical and numerical columns of the dataset to
+# construct the first step of the pipeline.

+from sklearn.compose import make_column_selector

+cat_selector = make_column_selector(dtype_include=object)
+num_selector = make_column_selector(dtype_include=np.number)
+cat_selector(X)

+# %%
+num_selector(X)
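
As a side note, `make_column_selector` returns a callable, so `cat_selector(X)` in
the new code simply evaluates to the list of matching column names. A tiny sketch on
made-up data (`df` here is hypothetical, not the Ames dataset):

    import numpy as np
    import pandas as pd
    from sklearn.compose import make_column_selector

    df = pd.DataFrame({"neighborhood": ["A", "B"], "area": [1200.0, 850.0]})
    make_column_selector(dtype_include=object)(df)     # ['neighborhood']
    make_column_selector(dtype_include=np.number)(df)  # ['area']
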

+# %%
+# Then, we will need to design preprocessing pipelines which depend on the
+# ending regressor. If the ending regressor is a linear model, one needs to
+# one-hot encode the categories. If the ending regressor is a tree-based model
+# an ordinal encoder will be sufficient. Besides, numerical values need to be
+# standardized for a linear model while the raw numerical data can be treated
+# as is by a tree-based model. However, both models need an imputer to
+# handle missing values.
+#
+# We will first design the pipeline required for the tree-based models.

from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
-from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
-from sklearn.preprocessing import StandardScaler


-cat_cols = X.columns[X.dtypes == 'O']
-num_cols = X.columns[X.dtypes == 'float64']

-categories = [
-    X[column].unique() for column in X[cat_cols]]
+cat_tree_processor = OrdinalEncoder(
+    handle_unknown="use_encoded_value", unknown_value=-1)
+num_tree_processor = SimpleImputer(strategy="mean", add_indicator=True)

-for cat in categories:
-    cat[cat == None] = 'missing'  # noqa
+tree_preprocessor = make_column_transformer(
+    (num_tree_processor, num_selector), (cat_tree_processor, cat_selector))
+tree_preprocessor
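
Compared with the old 'missing'-category bookkeeping being deleted here, the two new
processors handle the awkward cases directly: the ordinal encoder maps categories
unseen at fit time to -1 instead of erroring, and the imputer both fills NaNs and
appends a missing-indicator column. A hedged sketch on made-up data (`toy` and `new`
are hypothetical, not the Ames dataset):

    import numpy as np
    import pandas as pd
    from sklearn.compose import make_column_selector, make_column_transformer
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OrdinalEncoder

    toy = pd.DataFrame({"quality": ["good", "bad", "good"],
                        "area": [1200.0, np.nan, 850.0]})
    pre = make_column_transformer(
        (SimpleImputer(strategy="mean", add_indicator=True),
         make_column_selector(dtype_include=np.number)),
        (OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
         make_column_selector(dtype_include=object)))
    pre.fit(toy)

    new = pd.DataFrame({"quality": ["excellent"], "area": [np.nan]})
    pre.transform(new)  # mean-imputed area, indicator=1.0, quality -> -1.0
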

-cat_proc_nlin = make_pipeline(
-    SimpleImputer(missing_values=None, strategy='constant',
-                  fill_value='missing'),
-    OrdinalEncoder(categories=categories)
-)

-num_proc_nlin = make_pipeline(SimpleImputer(strategy='mean'))

-cat_proc_lin = make_pipeline(
-    SimpleImputer(missing_values=None,
-                  strategy='constant',
-                  fill_value='missing'),
-    OneHotEncoder(categories=categories)
-)

-num_proc_lin = make_pipeline(
-    SimpleImputer(strategy='mean'),
-    StandardScaler()
-)
+# %%
+# Then, we will define the preprocessor used when the ending regressor
+# is a linear model.

-# transformation to use for non-linear estimators
-processor_nlin = make_column_transformer(
-    (cat_proc_nlin, cat_cols),
-    (num_proc_nlin, num_cols),
-    remainder='passthrough')
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import StandardScaler

-# transformation to use for linear estimators
-processor_lin = make_column_transformer(
-    (cat_proc_lin, cat_cols),
-    (num_proc_lin, num_cols),
-    remainder='passthrough')
+cat_linear_processor = OneHotEncoder(handle_unknown="ignore")
+num_linear_processor = make_pipeline(
+    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True))

+linear_preprocessor = make_column_transformer(
+    (num_linear_processor, num_selector), (cat_linear_processor, cat_selector))
+linear_preprocessor
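
The linear branch leans on `OneHotEncoder(handle_unknown="ignore")`, which encodes a
category unseen during fit as an all-zeros row instead of raising, so there is no
need to precompute the category lists as the old code did. Note also that placing
`StandardScaler` before `SimpleImputer` is safe: the scaler disregards NaNs during
fit and passes them through for the imputer to fill. A small hedged sketch on toy
data (not Ames):

    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder

    enc = OneHotEncoder(handle_unknown="ignore")
    enc.fit(pd.DataFrame({"quality": ["good", "bad"]}))
    enc.transform(pd.DataFrame({"quality": ["excellent"]})).toarray()
    # array([[0., 0.]]) -- the unseen category maps to all zeros
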

# %%
# Stack of predictors on a single data set
@@ -149,37 +140,44 @@ def load_ames_housing():
# Here, we combine 3 learners (linear and non-linear) and use a ridge regressor
# to combine their outputs together.
#
-# Note: although we will make new pipelines with the processors which we wrote
-# in the previous section for the 3 learners, the final estimator RidgeCV()
-# does not need preprocessing of the data as it will be fed with the already
-# preprocessed output from the 3 learners.
+# .. note::
+#    Although we will make new pipelines with the processors which we wrote in
+#    the previous section for the 3 learners, the final estimator
+#    :class:`~sklearn.linear_model.RidgeCV()` does not need preprocessing of
+#    the data as it will be fed with the already preprocessed output from the 3
+#    learners.

+from sklearn.linear_model import LassoCV

-from sklearn.experimental import enable_hist_gradient_boosting  # noqa
-from sklearn.ensemble import HistGradientBoostingRegressor
+lasso_pipeline = make_pipeline(linear_preprocessor, LassoCV())
+lasso_pipeline

+# %%
from sklearn.ensemble import RandomForestRegressor
-from sklearn.ensemble import StackingRegressor
-from sklearn.linear_model import LassoCV
-from sklearn.linear_model import RidgeCV

+rf_pipeline = make_pipeline(
+    tree_preprocessor, RandomForestRegressor(random_state=42))
+rf_pipeline

-lasso_pipeline = make_pipeline(processor_lin,
-                               LassoCV())
+# %%
+from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn.ensemble import HistGradientBoostingRegressor

-rf_pipeline = make_pipeline(processor_nlin,
-                            RandomForestRegressor(random_state=42))
+gbdt_pipeline = make_pipeline(
+    tree_preprocessor, HistGradientBoostingRegressor(random_state=0))
+gbdt_pipeline

-gradient_pipeline = make_pipeline(
-    processor_nlin,
-    HistGradientBoostingRegressor(random_state=0))
+# %%
+from sklearn.ensemble import StackingRegressor
+from sklearn.linear_model import RidgeCV

estimators = [('Random Forest', rf_pipeline),
              ('Lasso', lasso_pipeline),
-              ('Gradient Boosting', gradient_pipeline)]
+              ('Gradient Boosting', gbdt_pipeline)]

-stacking_regressor = StackingRegressor(estimators=estimators,
-                                       final_estimator=RidgeCV())
+stacking_regressor = StackingRegressor(
+    estimators=estimators, final_estimator=RidgeCV())
+stacking_regressor
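
For context on what `fit` will do here: `StackingRegressor` trains the final
`RidgeCV` on out-of-fold predictions of the three pipelines, obtained through
internal cross-validation, which is why the note above says the final estimator
needs no preprocessing of its own. A hedged sketch of how such a stacker is
typically scored, assuming `X` and `y` from the `load_ames_housing()` step (the
example's actual evaluation code follows in the next section and may differ):

    from sklearn.model_selection import cross_validate

    cv_results = cross_validate(
        stacking_regressor, X, y,
        scoring=["r2", "neg_mean_absolute_error"], cv=3, n_jobs=-1)
    print(cv_results["test_r2"].mean())
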

# %%
# Measure and plot the results