Link Search Menu Expand Document

P4D3: From scikit-learn to DALEX/SHAP

Remaining Schedule

Date Duration Details
11/29 1 class Final day of ML in Python and open programming
12/1 1 class Day 4 of ML (Now in R)
12/6 1 class Day 5 of ML in R
12/8 1 class Day 6 of ML in R and take home coding challenge started
12/13 1 class Cover letters due
12/17 0 class Grades submitted

Installation

# Install the required packages from inside a notebook/IPython session
# (the ! line is IPython shell magic, not plain Python).
import sys
!{sys.executable} -m pip install scikit-learn dalex shap

https://ema.drwhy.ai/figure/mdp_general.png

Building our model

After building our machine learning data (dat_ml.pkl), we can start our model.py script.

# %%
import pandas as pd
import numpy as np
import joblib # to save ml models
from plotnine import *

from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

Build training and testing data

# %%
# Read the prepared machine-learning table and carve out a 66/34
# train/test split with a fixed seed for reproducibility.
dat_ml = pd.read_pickle('dat_ml.pkl')
# Features: everything except the target and the raw year it derives from.
target_cols = ['yrbuilt', 'before1980']
X_pred = dat_ml.drop(columns=target_cols)
y_pred = dat_ml['before1980']
X_train, X_test, y_train, y_test = train_test_split(
    X_pred,
    y_pred,
    test_size=0.34,
    random_state=76,
)

Building our models

# Fit a naive Bayes baseline alongside a gradient-boosting model,
# then generate test-set predictions from each.
clfNB = GaussianNB()
clfGB = GradientBoostingClassifier()
for model in (clfNB, clfGB):
    model.fit(X_train, y_train)
ypred_clfNB = clfNB.predict(X_test)
ypred_clfGB = clfGB.predict(X_test)

Diagnosing our model with sklearn

# %%
# ROC curves for both classifiers.
# NOTE: metrics.plot_roc_curve was deprecated in scikit-learn 1.0 and
# removed in 1.2; RocCurveDisplay.from_estimator is its replacement.
metrics.RocCurveDisplay.from_estimator(clfGB, X_test, y_test)
metrics.RocCurveDisplay.from_estimator(clfNB, X_test, y_test)
# %%
# Confusion matrices: rows are true classes, columns are predicted classes.
metrics.confusion_matrix(y_test, ypred_clfNB)
metrics.confusion_matrix(y_test, ypred_clfGB)

Now we can build our own feature importance chart.

# %%
# Rank features by boosted-model importance and keep the twelve strongest.
importance = pd.DataFrame({
    'f_names': X_train.columns,
    'f_values': clfGB.feature_importances_,
})
df_features = importance.sort_values('f_values', ascending=False).head(12)

# Reversing with [::-1] (Python slice: a[start:end:step], step -1 walks
# backwards) lists the least important of the twelve first, so after
# coord_flip() the most important feature sits at the top of the chart.
f_names_cat = pd.Categorical(
    df_features.f_names,
    categories=df_features.f_names[::-1])

df_features = df_features.assign(f_cat=f_names_cat)

(ggplot(df_features, aes(x='f_cat', y='f_values'))
    + geom_col()
    + coord_flip()
    + theme_bw())

Finalizing our model

We have ignored model tuning beyond variable reduction. You would want to tune your model parameters and evaluate your classification threshold as well.

Variable reduction

# Refit the boosted model using only the twelve strongest features.
compVars = df_features.f_names[::-1].tolist()

X_pred_reduced = dat_ml.filter(compVars, axis=1)
y_pred = dat_ml.before1980

# Same test_size and random_state as before, so the reduced split keeps
# the same rows in train/test as the full split.
X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(
    X_pred_reduced,
    y_pred,
    test_size=0.34,
    random_state=76,
)

clfGB_reduced = GradientBoostingClassifier()
clfGB_reduced.fit(X_train_reduced, y_train)
ypred_clfGB_red = clfGB_reduced.predict(X_test_reduced)

# %%
# classification_report's signature is (y_true, y_pred) — the original had
# the arguments swapped, which silently transposes precision and recall.
print(metrics.classification_report(y_test, ypred_clfGB_red))
metrics.confusion_matrix(y_test, ypred_clfGB_red)

Saving our models

# %%
# Persist the fitted models so explain.py can reload them without retraining.
from pathlib import Path
# joblib.dump raises FileNotFoundError if the target folder is missing.
Path('models').mkdir(parents=True, exist_ok=True)
joblib.dump(clfNB, 'models/clfNB.pkl')
joblib.dump(clfGB, 'models/clfGB.pkl')
joblib.dump(clfGB_reduced, 'models/clfGB_final.pkl')
# Keep the reduced feature list (least- to most-important) with the models.
df_features.f_names[::-1].to_pickle('models/compVars.pkl')

Explaining our model

Explanatory Model Analysis: Explore, Explain, and Examine Predictive Models — with examples in R and Python.

Let’s create a new explain.py script for our model explanation.

# %%
import pandas as pd
import numpy as np
import dalex as dx
import matplotlib.pyplot as plt

import shap
import joblib

from dalex._explainer.yhat import yhat_proba_default
from sklearn.model_selection import train_test_split


# %%
# Reload the persisted models and feature list, then rebuild the exact
# train/test partitions used in model.py (same test_size and random_state).
clfNB = joblib.load('models/clfNB.pkl')
clfGB = joblib.load('models/clfGB.pkl')
clfGB_reduced = joblib.load('models/clfGB_final.pkl')
compVars = pd.read_pickle('models/compVars.pkl').tolist()

dat_ml = pd.read_pickle('dat_ml.pkl')
y_pred = dat_ml.before1980
X_pred = dat_ml.drop(columns=['yrbuilt', 'before1980'])
X_pred_reduced = dat_ml.filter(compVars, axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X_pred, y_pred, test_size=0.34, random_state=76)
# Splitting a second time is redundant work, but the shared seed keeps
# the reduced split row-aligned with the full one.
X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(
    X_pred_reduced, y_pred, test_size=0.34, random_state=76)

Using DALEX

# %%
# Wrap the reduced model in a DALEX explainer and chart permutation-based
# variable importance.
expReduced = dx.Explainer(clfGB_reduced, X_test_reduced, y_test)
explanationReduced = expReduced.model_parts()
explanationReduced.plot(max_vars=15)

# %%
# Overall classification performance table plus the ROC curve.
mpReduced = expReduced.model_performance(model_type='classification')
print(mpReduced.result)
mpReduced.plot(geom="roc")

# %%
# Variable-level profiles: partial-dependence vs. accumulated-local effects.
pdp_num_red = expReduced.model_profile(type='partial', label="pdp", variables=compVars)
ale_num_red = expReduced.model_profile(type='accumulated', label="ale", variables=compVars)
pdp_num_red.plot(ale_num_red)

# %%
# Observation-level explanation: Shapley values for the first test-set row.
sh = expReduced.predict_parts(X_test_reduced.iloc[0, :], type='shap', label="first observation")
sh.plot(max_vars=12)

Using SHAP

The following two links provide background on Shapley values for explaining models.

The shap package has a few valuable model exploration charts that we can leverage.

# %%
# Build a SHAP explainer for the reduced model and score the test set.
explainerShap = shap.Explainer(clfGB_reduced)
shap_values = explainerShap(X_test_reduced)

# %%
# Mean |SHAP| per feature — a variable-importance bar chart.
shap.plots.bar(shap_values)

# %%
# https://medium.com/dataman-in-ai/the-shap-with-more-elegant-charts-bc3e73fa1c0c
shap.plots.beeswarm(shap_values)

# %%
# Taking absolute values collapses the beeswarm into a view comparable
# to the bar plot above.
shap.plots.beeswarm(shap_values.abs, color="shap_red")

# %%
# A heatmap over the first 1000 observations combines the charts above.
shap.plots.heatmap(shap_values[0:1000], max_display=13)

We can also use partial dependence plots.

# %%
# SHAP-style partial-dependence plot for the number of bathrooms.
shap.plots.partial_dependence(
    "numbaths",
    clfGB_reduced.predict,
    X_test_reduced,
    ice=False,
    model_expected_value=True,
    feature_expected_value=True)

# %%
# Same plot for living area; show=False defers rendering so we can
# adjust the axes before displaying.
shap.plots.partial_dependence(
    "livearea",
    clfGB_reduced.predict,
    X_test_reduced,
    ice=False,
    model_expected_value=True,
    feature_expected_value=True,
    show=False)

# NOTE: plt.xlim's xmin/xmax keywords were deprecated in Matplotlib 3.0
# and removed in 3.5; left/right are the supported names.
plt.xlim(left=0, right=15000)
plt.show()