from functools import partial
from pprint import pprint
import numpy as np
import pandas as pd
from hyperopt import fmin, hp, space_eval, tpe, STATUS_OK, Trials
from hyperopt.pyll import scope, stochastic
from plotly import express as px
from plotly import graph_objects as go
from plotly import offline as pyo
from sklearn.datasets import load_boston
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.utils import check_random_state
pyo.init_notebook_mode()
MEDIAN_HOME_VALUE = "median_home_value"
# Load the boston dataset using sklearn's helper function. Note that `load_boston` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so this snippet requires an older scikit-learn version.
boston_dataset = load_boston()
# Convert the data into a Pandas dataframe
data = np.concatenate(
[boston_dataset["data"], boston_dataset["target"][..., np.newaxis]],
axis=1,
)
features, target = boston_dataset["feature_names"], MEDIAN_HOME_VALUE
columns = np.concatenate([features, [target]])
boston_dataset_df = pd.DataFrame(data, columns=columns)
boston_dataset_df
# Define constant strings that we will use as keys in the “search space” dictionary below. The
# convention I use throughout is to name each variable after the string it holds, with the variable
# name capitalized. This convention makes it easy to interpret what these variables mean when we
# encounter them in the code: for example, we know that the variable `MODEL` contains the string
# “model”. Representing strings with variables this way also helps me avoid typos when repeatedly
# using the same string throughout the code, since typos in variable names will be caught as errors
# by my linter.
GRADIENT_BOOSTING_REGRESSOR = "gradient_boosting_regressor"
KWARGS = "kwargs"
LEARNING_RATE = "learning_rate"
LINEAR_REGRESSION = "linear_regression"
MAX_DEPTH = "max_depth"
MODEL = "model"
MODEL_CHOICE = "model_choice"
NORMALIZE = "normalize"
N_ESTIMATORS = "n_estimators"
RANDOM_FOREST_REGRESSOR = "random_forest_regressor"
RANDOM_STATE = "random_state"
# Declare the search space for the random forest regressor model.
random_forest_regressor = {
MODEL: RANDOM_FOREST_REGRESSOR,
    # I define the model parameters as a separate dictionary so that we can feed the parameters into
    # the `__init__` of the model with dictionary unpacking. The `sample_to_model` function defined
    # alongside the objective function below shows this in action.
KWARGS: {
N_ESTIMATORS: scope.int(
hp.quniform(f"{RANDOM_FOREST_REGRESSOR}__{N_ESTIMATORS}", 50, 150, 1)
),
MAX_DEPTH: scope.int(
hp.quniform(f"{RANDOM_FOREST_REGRESSOR}__{MAX_DEPTH}", 2, 12, 1)
),
RANDOM_STATE: 0,
},
}
# Declare the search space for the gradient boosting regressor model, following the same structure
# as the random forest regressor search space.
gradient_boosting_regressor = {
MODEL: GRADIENT_BOOSTING_REGRESSOR,
KWARGS: {
LEARNING_RATE: scope.float(
hp.uniform(
f"{GRADIENT_BOOSTING_REGRESSOR}__{LEARNING_RATE}",
0.01,
0.15,
)
        ),  # restrict the search to relatively low learning rates
N_ESTIMATORS: scope.int(
hp.quniform(f"{GRADIENT_BOOSTING_REGRESSOR}__{N_ESTIMATORS}", 50, 150, 1)
),
MAX_DEPTH: scope.int(
hp.quniform(f"{GRADIENT_BOOSTING_REGRESSOR}__{MAX_DEPTH}", 2, 12, 1)
),
RANDOM_STATE: 0,
},
}
# Combine both model search spaces with a top level "choice" between the two models to get the final
# search space.
space = {
MODEL_CHOICE: hp.choice(
MODEL_CHOICE,
[
random_forest_regressor,
gradient_boosting_regressor,
],
)
}
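# As a quick sanity check, we can draw a sample from the search space with `stochastic.sample`
# (imported above from `hyperopt.pyll`). The printed dictionary has the same nested structure that
# hyperopt will pass to our objective function on every trial.
pprint(stochastic.sample(space))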
# Define a few additional string constants. Note that this code expects that we still have access to
# all of the variables defined in the "search space" snippet above.
LOSS = "loss"
STATUS = "status"
# Mapping from string name to model class that we'll use to create an initialized version of a model
# from a sample generated from the search space by hyperopt.
MODELS = {
GRADIENT_BOOSTING_REGRESSOR: GradientBoostingRegressor,
RANDOM_FOREST_REGRESSOR: RandomForestRegressor,
}
# Create a scoring function that we'll use in our objective. We leave the sign as-is (rather than
# using sklearn's negated `neg_mean_squared_error` convention) because hyperopt minimizes the
# returned loss directly.
mse_scorer = make_scorer(mean_squared_error)
# Helper function that converts a sample generated by hyperopt into an initialized model. Because we
# split the model type and model keyword-arguments into separate key-value pairs in the search space
# declaration, we can use dictionary unpacking to create an initialized version of the model.
def sample_to_model(sample):
kwargs = sample[MODEL_CHOICE][KWARGS]
return MODELS[sample[MODEL_CHOICE][MODEL]](**kwargs)
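# As a quick illustration of the helper, drawing a sample from the search space and feeding it
# through `sample_to_model` yields an unfitted sklearn estimator with the sampled hyperparameters
# already set:
print(sample_to_model(stochastic.sample(space)))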
# Define the objective function for hyperopt. We'll fix the `dataset_df`, `features`, and `target`
# arguments with `functools.partial` to create the single-argument version of this function that we
# will supply to `fmin`.
def objective(sample, dataset_df, features, target):
model = sample_to_model(sample)
rng = check_random_state(0)
# Handle randomization by shuffling when creating folds. In reality, we probably want a better
# strategy for managing randomization than the fixed `RandomState` instance generated above.
cv = KFold(n_splits=10, random_state=rng, shuffle=True)
    # Calculate the mean squared error for each fold. Since `n_splits` is 10, `mse` will be an
    # array of size 10, with each element holding the mean squared error on one fold's held-out
    # data.
mse = cross_val_score(
model,
dataset_df.loc[:, features],
dataset_df.loc[:, target],
scoring=mse_scorer,
cv=cv,
)
# Return average of mean squared error across all folds.
return {LOSS: np.mean(mse), STATUS: STATUS_OK}
# Since we defined our objective function to be generic in terms of the dataset, we use `partial`
# from the `functools` module to "fix" the `dataset_df`, `features`, and `target` arguments to the
# values for this example, leaving an objective function that takes a single argument, as the
# `hyperopt` interface assumes.
boston_objective = partial(
objective, dataset_df=boston_dataset_df, features=features, target=MEDIAN_HOME_VALUE
)
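# Before launching the full search, it can be worth a one-off smoke test of the objective on a
# random sample; a dictionary with `loss` and `status` keys confirms everything is wired together.
pprint(boston_objective(stochastic.sample(space)))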
# `hyperopt` tracks the results of each iteration in this `Trials` object. We’ll be collecting the
# data that we will use for visualization from this object.
trials = Trials()
# Reproducibility! Note that recent hyperopt versions expect an `np.random.Generator` (e.g. from
# `np.random.default_rng`) for `rstate` rather than a `RandomState` instance.
rng = check_random_state(0)
# `fmin` searches for hyperparameters that “minimize” our objective (here, mean squared error) and
# returns the “best” set of hyperparameters.
best = fmin(boston_objective, space, algo=tpe.suggest, max_evals=1000, trials=trials, rstate=rng)
pprint(list(trials)[:5])
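# `fmin` returns raw values for the "best" hyperparameters, e.g. the index selected by `hp.choice`
# rather than the corresponding model name. `space_eval` (imported above) resolves those raw values
# against the search space so that we can read the winning configuration directly.
pprint(space_eval(space, best))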
# In each trial, `misc.vals` maps every hyperparameter name to a list that is empty when that
# hyperparameter was inactive in the trial and contains a single value otherwise. This simple helper
# lets us fill in `np.nan` when a particular hyperparameter is not relevant to a particular trial.
def unpack(x):
if x:
return x[0]
return np.nan
# We'll first turn each trial into a series and then stack those series together as a dataframe.
trials_df = pd.DataFrame([pd.Series(t["misc"]["vals"]).apply(unpack) for t in trials])
# Then we'll add other relevant bits of information to the corresponding rows and perform a couple
# of mappings for convenience.
trials_df["loss"] = [t["result"]["loss"] for t in trials]
trials_df["trial_number"] = trials_df.index
# `hp.choice` records the index of the selected option, so we map each index back to a model name
# using the order of the list that we passed to `hp.choice`.
trials_df[MODEL_CHOICE] = trials_df[MODEL_CHOICE].apply(
    lambda x: RANDOM_FOREST_REGRESSOR if x == 0 else GRADIENT_BOOSTING_REGRESSOR
)
trials_df
# `px` is the alias for plotly's "express" module created by the `from plotly import express as px`
# import at the top of this snippet.
fig = px.scatter(trials_df, x="trial_number", y="loss")
fig.show()
fig = px.scatter(trials_df, x="trial_number", y="loss", color=MODEL_CHOICE)
fig.show()
def add_hover_data(fig, df, model_choice):
    # Filter to only the columns that are relevant to the current model choice. Note that this
    # relies on the convention of including the model name in the hyperparameter name when we
    # declare the search space.
    cols = [col for col in df.columns if model_choice in col]
    fig.update_traces(
        # This specifies the data that we want to show on hover for the current model choice.
        customdata=df.loc[df[MODEL_CHOICE] == model_choice, cols + [MODEL_CHOICE]],
hovertemplate="<br>".join(
[
f"{col.split('__')[1]}: %{{customdata[{i}]}}"
for i, col in enumerate(cols)
]
)
+ "<extra></extra>",
# We only apply the hover data for the current model choice.
selector={"name": model_choice},
)
return fig
fig = px.scatter(
trials_df,
x="trial_number",
y="loss",
color=MODEL_CHOICE,
)
# We call the `add_hover_data` function once for each model type so that we can add different sets
# of hyperparameters as hover data for each model type.
fig = add_hover_data(fig, trials_df, RANDOM_FOREST_REGRESSOR)
fig = add_hover_data(fig, trials_df, GRADIENT_BOOSTING_REGRESSOR)
fig.show()
# Since max_depth == 3 outperforms other settings, we'll filter to only that slice. This creates a
# boolean array that we will use to filter down to the relevant rows in the `trials_df` dataframe.
max_depth_filter = (trials_df[MODEL_CHOICE] == GRADIENT_BOOSTING_REGRESSOR) & (
trials_df["gradient_boosting_regressor__max_depth"] == 3
)
# plotly express does not support contour plots, so we will use `graph_objects` instead.
# `go.Contour` automatically interpolates "z" values for our loss.
fig = go.Figure(
data=go.Contour(
z=trials_df.loc[max_depth_filter, "loss"],
x=trials_df.loc[max_depth_filter, "gradient_boosting_regressor__learning_rate"],
y=trials_df.loc[max_depth_filter, "gradient_boosting_regressor__n_estimators"],
contours=dict(
showlabels=True, # show labels on contours
labelfont=dict(
size=12,
color="white",
), # label font properties
),
colorbar=dict(
title="loss",
titleside="right",
),
hovertemplate="loss: %{z}<br>learning_rate: %{x}<br>n_estimators: %{y}<extra></extra>",
)
)
fig.update_layout(
xaxis_title="learning_rate",
yaxis_title="n_estimators",
title={
"text": "learning_rate vs. n_estimators | max_depth == 3",
"xanchor": "center",
"yanchor": "top",
"x": 0.5,
},
)
fig.show()
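# As a final step, one natural way to use the search results is to rebuild and fit the best model by
# combining `space_eval` with the `sample_to_model` helper from earlier. This is just a sketch of
# that idea:
best_model = sample_to_model(space_eval(space, best))
best_model.fit(boston_dataset_df.loc[:, features], boston_dataset_df.loc[:, target])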