#remotes::install_github("NorskRegnesentral/shapr") # Remove initial # to run
#remotes::install_github("NorskRegnesentral/shapr",ref = "ctree") # Remove initial # to run
#install.packages("shapr") # Remove initial # to run
Response: Binary
Examples:
Model: XGBoost model (tree-based gradient boosting machine) + an additional logistic regression model in example 2 for illustration
#### Package usage ####
library(shapr)
library(xgboost)
library(data.table)
library(ggplot2)
# Load the FICO HELOC data set
data <- data.table::fread(file = "FICO_HELOC_dataset_v1.csv")

# Define the binary response: 1 if RiskPerformance is "Bad", 0 otherwise
data[, y := (RiskPerformance == "Bad") * 1]
data[, RiskPerformance := NULL]

# NAs are coded as negative numbers in this data set -- recode them as NA
# (use <- for assignment, not =)
data[data < 0] <- NA

# Re-code two categorical variables as factors
data[, MaxDelq2PublicRecLast12M := as.factor(MaxDelq2PublicRecLast12M)]
data[, MaxDelqEver := as.factor(MaxDelqEver)]
# Define the response and the feature sets (use <- for assignment, not =)
y_var <- "y"
x_var_numeric <- c(
  "ExternalRiskEstimate",
  "PercentTradesNeverDelq",
  "NetFractionRevolvingBurden",
  "NumTrades60Ever2DerogPubRec",
  "MSinceMostRecentInqexcl7days"
)
x_var_factor <- c("MaxDelq2PublicRecLast12M", "MaxDelqEver")
x_var_mixed <- c(x_var_numeric, x_var_factor)
xy_var <- c(x_var_mixed, y_var)
xy_data <- data[, ..xy_var]
# Remove observations with one or more NA in our features
xy_data <- xy_data[complete.cases(xy_data)]
# Split the data into a training set and a test set of 5 observations
set.seed(123)
# seq_len() is safe even for zero-row data, unlike 1:nrow()
test_ind <- sort(sample(seq_len(nrow(xy_data)), 5))
train_ind <- seq_len(nrow(xy_data))[-test_ind]
x_train <- xy_data[train_ind, ..x_var_numeric]
y_train <- unlist(xy_data[train_ind, ..y_var])
x_test <- xy_data[test_ind, ..x_var_numeric]
# Fit a basic XGBoost model to the training data
params <- list(
  eta = 0.3,
  max_depth = 4,
  objective = "binary:logistic",
  eval_metric = "auc",
  tree_method = "hist"
)
model <- xgboost::xgboost(
  data = as.matrix(x_train),
  label = y_train,
  nrounds = 20,   # documented argument name is 'nrounds'; 'nround' relied on partial matching
  params = params,
  verbose = TRUE  # TRUE, not T (T is reassignable)
)
# Set up the explainer object: precomputes what shapr needs from the
# training data and the fitted model
explainer <- shapr(
  x = x_train,
  model = model
)

# Reference prediction phi_0: the mean of the training response
p <- mean(y_train)

# Explain the test predictions using the (conditional) empirical approach
explanation_empirical <- explain(
  x = x_test,
  approach = "empirical",
  explainer = explainer,
  prediction_zero = p
)

# Print the resulting Shapley values
explanation_empirical$dt

# Plot including phi_0
# plot(explanation_empirical)

# Plot excluding phi_0
# plot(explanation_empirical, plot_phi0 = FALSE)
| With phi0 | Without phi0 |
| --- | --- |
## Increasing the bandwidth parameter
# Same empirical approach, but with a fixed bandwidth parameter of 0.2
explanation_empirical_02 <- explain(
  x = x_test,
  approach = "empirical",
  type = "fixed_sigma",
  fixed_sigma_vec = 0.2,
  explainer = explainer,
  prediction_zero = p
)

# Print the resulting Shapley values
explanation_empirical_02$dt
# Empirical approach under the assumption of independent features
explanation_empirical_indep <- explain(
  x = x_test,
  approach = "empirical",
  type = "independence",
  explainer = explainer,
  prediction_zero = p
)

# Print the resulting Shapley values
explanation_empirical_indep$dt
# Model the feature dependence with a multivariate Gaussian distribution
explanation_gaussian <- explain(
  x = x_test,
  approach = "gaussian",
  explainer = explainer,
  prediction_zero = p
)

# Print the resulting Shapley values
# print(explanation_gaussian$dt)
explanation_gaussian$dt
# Model the feature dependence with conditional inference trees (ctree)
explanation_ctree <- explain(
  x = x_test,
  approach = "ctree",
  explainer = explainer,
  prediction_zero = p
)

# Print the resulting Shapley values
explanation_ctree$dt
# Feature sets containing both numerical and categorical variables
x_train_mixed <- xy_data[train_ind, ..x_var_mixed]
x_test_mixed <- xy_data[test_ind, ..x_var_mixed]

# Fit a logistic regression model on the mixed feature set
# (use <- for assignment, not =)
model_glm_mixed <- glm(
  y ~ .,
  data = cbind(x_train_mixed, y = as.factor(y_train)),
  family = "binomial"
)

# Prepare the mixed data/model for explanation
explainer_glm_cat <- shapr(
  x = x_train_mixed,
  model = model_glm_mixed
)

# The ctree approach handles categorical features directly
explanation_glm_cat_ctree <- explain(
  x = x_test_mixed,
  approach = "ctree",
  explainer = explainer_glm_cat,
  prediction_zero = p
)

explanation_glm_cat_ctree$dt
# Make the dummy-encoding transformation function. Fitting it on the
# combined train + test data ensures consistent factor levels.
dummylist <- make_dummies(data = rbind(x_train_mixed, x_test_mixed))

# Apply the transformation function to the training data
x_train_mixed_dummy <- apply_dummies(obj = dummylist, newdata = x_train_mixed)

# Fit an XGBoost model on the dummy-encoded features
model_mixed <- xgboost(
  data = as.matrix(x_train_mixed_dummy),
  label = y_train,
  nrounds = 20,    # documented argument name is 'nrounds'
  params = params,
  verbose = FALSE  # FALSE, not F (F is reassignable)
)

# Attach the transformation function to the model object so shapr can
# map the original (factor) features to the encoded ones
model_mixed$dummylist <- dummylist

# Prepare the mixed data/model for explanation
explainer_mixed <- shapr(x_train_mixed, model_mixed)
# Explain the mixed-feature XGBoost model with the ctree approach
# (arguments named for consistency with the other explain() calls)
explanation_ctree_mixed <- explain(
  x = x_test_mixed,
  approach = "ctree",
  explainer = explainer_mixed,
  prediction_zero = p
)

explanation_ctree_mixed$dt