<- "data"
path_data <- read_csv(here::here(path_data, "student_perf_cln.csv"),
data_all col_types = cols())
<- c("none", "primary", "middle", "secondary", "higher")
focal_levels <- data_all |>
data_all mutate(across(ends_with("_educ"), ~ factor(.x, levels = 0:4, labels = focal_levels)),
across(where(is.character), as_factor))
<- data_all |>
data_all mutate(failures = if_else(failures == 0, "no", "yes"),
travel_time = if_else(travel_time == 1, "less_than_1_hour", "more_than_1_hour"))
Unit 11 Lab Agenda
Prepare Data
Feature Ablation
Feature ablation is a technique used to assess the importance of individual features (or sets of features) in a model by systematically removing them and observing the impact on model performance. This process helps identify which features contribute most to the model’s predictive power.
We need to create a recipe for the full model and a recipe for the compact model. The compact recipe removes the features whose contribution we want to test (here, mother's and father's education). We can use the step_rm() function to remove features from the recipe.
Importantly, we evaluate the best model configuration (or statistical algorithm) for each feature set.
Next, we will use 10-fold cross-validation with 3 repeats to select the best model configuration and evaluate the models.
set.seed(123456)
splits <- vfold_cv(data_all, v = 10, repeats = 3)
rec_full <- recipe(grade ~ ., data = data_all) |>
  step_scale(all_numeric_predictors()) |>
  step_dummy(all_nominal_predictors()) |>
  step_nzv(all_predictors())

rec_compact <- rec_full |>
  step_rm(starts_with("mother_educ"), starts_with("father_educ"))
tune_grid <- expand_grid(penalty = exp(seq(-8, .4, length.out = 500)),
                         mixture = seq(0, 1, length.out = 6))

fits_full <- linear_reg(penalty = tune(),
                        mixture = tune()) |>
  set_engine("glmnet") |>
  set_mode("regression") |>
  tune_grid(preprocessor = rec_full,
            resamples = splits,
            grid = tune_grid,
            metrics = metric_set(rsq))

fits_compact <- linear_reg(penalty = tune(),
                           mixture = tune()) |>
  set_engine("glmnet") |>
  set_mode("regression") |>
  tune_grid(preprocessor = rec_compact,
            resamples = splits,
            grid = tune_grid,
            metrics = metric_set(rsq))
With the fitted models, we will next obtain the 30 held-out performance metrics for each model's best configuration. We will use the collect_metrics() function to extract the performance metrics from the fitted models.
rsq_full <- collect_metrics(fits_full, summarize = FALSE) |>
  filter(.config == select_best(fits_full)$.config) |>
  select(id, id2, full = .estimate)
Warning: There was 1 warning in `filter()`.
ℹ In argument: `.config == select_best(fits_full)$.config`.
Caused by warning in `select_best()`:
! No value of `metric` was given; "rsq" will be used.
rsq_compact <- collect_metrics(fits_compact, summarize = FALSE) |>
  filter(.config == select_best(fits_compact)$.config) |>
  select(id, id2, compact = .estimate)
Warning: There was 1 warning in `filter()`.
ℹ In argument: `.config == select_best(fits_compact)$.config`.
Caused by warning in `select_best()`:
! No value of `metric` was given; "rsq" will be used.
rsq_all <- rsq_full |>
  full_join(rsq_compact, by = c("id", "id2")) |>
  print()
# A tibble: 30 × 4
id id2 full compact
<chr> <chr> <dbl> <dbl>
1 Repeat1 Fold01 0.0921 0.0604
2 Repeat1 Fold02 0.0654 0.0905
3 Repeat1 Fold03 0.0531 0.0432
4 Repeat1 Fold04 0.233 0.256
5 Repeat1 Fold05 0.00167 0.0159
6 Repeat1 Fold06 0.283 0.254
7 Repeat1 Fold07 0.261 0.236
8 Repeat1 Fold08 0.374 0.354
9 Repeat1 Fold09 0.188 0.216
10 Repeat1 Fold10 0.144 0.201
11 Repeat2 Fold01 0.235 0.212
12 Repeat2 Fold02 0.220 0.184
13 Repeat2 Fold03 0.0386 0.0335
14 Repeat2 Fold04 0.134 0.167
15 Repeat2 Fold05 0.229 0.246
16 Repeat2 Fold06 0.202 0.172
17 Repeat2 Fold07 0.182 0.178
18 Repeat2 Fold08 0.0969 0.144
19 Repeat2 Fold09 0.0000000330 0.000586
20 Repeat2 Fold10 0.380 0.320
21 Repeat3 Fold01 0.162 0.190
22 Repeat3 Fold02 0.0957 0.111
23 Repeat3 Fold03 0.126 0.154
24 Repeat3 Fold04 0.0672 0.0776
25 Repeat3 Fold05 0.0548 0.0705
26 Repeat3 Fold06 0.144 0.151
27 Repeat3 Fold07 0.0778 0.0651
28 Repeat3 Fold08 0.171 0.147
29 Repeat3 Fold09 0.274 0.252
30 Repeat3 Fold10 0.185 0.163
Model Comparison via Frequentist Methods
We now have the same 30 held-out samples for the full and compact models, so a paired t-test is a natural choice for comparing them. However, the paired t-test assumes that the differences are independent, and the 30 differences between the full and compact models are not independent because the held-out folds come from overlapping training sets. We can use a correction to account for this.
Consult the lecture slides for more information on the nb-correlated t-test! The correction is based on the number of folds in the cross-validation.
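Concretely, the correction inflates the variance of the mean difference to account for the overlap between training sets across resamples. With $n$ held-out differences $d_i$ (here $n = 30$) with mean $\bar{d}$ and sample variance $s_d^2$, and $k$ folds, the statistic computed by the function below is

$$
t = \frac{|\bar{d}|}{\sqrt{\left(\dfrac{1}{n} + \dfrac{1/k}{1 - 1/k}\right) s_d^2}},
$$

which is compared to a t distribution with $n - 1$ degrees of freedom.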
# included in fun_ml.R
nb_correlated_t_test <- function(cv_full, cv_compact, k = 10){

  diffs <- cv_full - cv_compact
  n <- length(diffs)
  mean_diff <- mean(diffs)
  var_diffs <- var(diffs)
  proportion_test <- 1 / k
  proportion_train <- 1 - proportion_test
  correction <- (1 / n) + (proportion_test / proportion_train)
  se <- sqrt(correction * var_diffs)

  t <- abs(mean_diff / se)
  p_value <- 2 * pt(t, n - 1, lower.tail = FALSE)

  tibble(mean_diff = mean_diff, se = se, t = t, df = n - 1, p_value = p_value)
}
nb_correlated_t_test(rsq_all$full, rsq_all$compact, k = 10)
# A tibble: 1 × 5
mean_diff se t df p_value
<dbl> <dbl> <dbl> <dbl> <dbl>
1 0.000179 0.0106 0.0169 29 0.987
Model Comparison via Bayesian Methods
We can also compare the two models using Bayesian methods. Bayesian methods give us the probability of the models given the data. Using this approach, we can estimate the posterior distribution of 1) the performance of the full model; 2) the performance of the compact model; and 3) the difference in performance between the two models.
We will use the perf_mod() function to create a performance model. perf_mod() takes the held-out performance metrics from the fitted models and fits a Bayesian mixed-effects model to them. We can then use the contrast_models() function to compare the two models.

For perf_mod(), we pass in a dataframe with the id columns (repeats and folds in this case – id == repeat, and id2 == fold within repeats) and the performance metrics for the two models. The formula argument specifies the model structure; it is written as statistic ~ model plus your random-effect terms, here (1 | id2/id).
We will use the hetero_var argument to specify whether we relax the assumption that the variance of the performance metric is the same for the two models. If hetero_var = TRUE, the variances are allowed to differ; if hetero_var = FALSE, they are assumed to be equal. The default is FALSE, which lets the model run faster and converge more easily. We would typically set it to TRUE for classification models because the variance often differs across models: for accuracy, the variance is largest near 0.5 and smallest near 0 or 1.
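As a quick justification for that claim: if accuracy is estimated on $m$ held-out observations and the true accuracy is $p$, its sampling variance is approximately binomial,

$$
\mathrm{Var}(\widehat{\mathrm{acc}}) \approx \frac{p(1 - p)}{m},
$$

which is largest at $p = 0.5$ and shrinks toward zero as $p$ approaches 0 or 1.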
set.seed(102030)
pp <- tidyposterior::perf_mod(rsq_all,
                              formula = statistic ~ model + (1 | id2/id),
                              iter = 3000, chains = 4,
                              hetero_var = TRUE,
                              adapt_delta = 0.999)
SAMPLING FOR MODEL 'continuous' NOW (CHAIN 1).
Chain 1:
Chain 1: Gradient evaluation took 5.7e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.57 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1:
Chain 1:
Chain 1: Iteration: 1 / 3000 [ 0%] (Warmup)
Chain 1: Iteration: 300 / 3000 [ 10%] (Warmup)
Chain 1: Iteration: 600 / 3000 [ 20%] (Warmup)
Chain 1: Iteration: 900 / 3000 [ 30%] (Warmup)
Chain 1: Iteration: 1200 / 3000 [ 40%] (Warmup)
Chain 1: Iteration: 1500 / 3000 [ 50%] (Warmup)
Chain 1: Iteration: 1501 / 3000 [ 50%] (Sampling)
Chain 1: Iteration: 1800 / 3000 [ 60%] (Sampling)
Chain 1: Iteration: 2100 / 3000 [ 70%] (Sampling)
Chain 1: Iteration: 2400 / 3000 [ 80%] (Sampling)
Chain 1: Iteration: 2700 / 3000 [ 90%] (Sampling)
Chain 1: Iteration: 3000 / 3000 [100%] (Sampling)
Chain 1:
Chain 1: Elapsed Time: 3.511 seconds (Warm-up)
Chain 1: 2.263 seconds (Sampling)
Chain 1: 5.774 seconds (Total)
Chain 1:
SAMPLING FOR MODEL 'continuous' NOW (CHAIN 2).
Chain 2:
Chain 2: Gradient evaluation took 1.3e-05 seconds
Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 0.13 seconds.
Chain 2: Adjust your expectations accordingly!
Chain 2:
Chain 2:
Chain 2: Iteration: 1 / 3000 [ 0%] (Warmup)
Chain 2: Iteration: 300 / 3000 [ 10%] (Warmup)
Chain 2: Iteration: 600 / 3000 [ 20%] (Warmup)
Chain 2: Iteration: 900 / 3000 [ 30%] (Warmup)
Chain 2: Iteration: 1200 / 3000 [ 40%] (Warmup)
Chain 2: Iteration: 1500 / 3000 [ 50%] (Warmup)
Chain 2: Iteration: 1501 / 3000 [ 50%] (Sampling)
Chain 2: Iteration: 1800 / 3000 [ 60%] (Sampling)
Chain 2: Iteration: 2100 / 3000 [ 70%] (Sampling)
Chain 2: Iteration: 2400 / 3000 [ 80%] (Sampling)
Chain 2: Iteration: 2700 / 3000 [ 90%] (Sampling)
Chain 2: Iteration: 3000 / 3000 [100%] (Sampling)
Chain 2:
Chain 2: Elapsed Time: 4.294 seconds (Warm-up)
Chain 2: 2.447 seconds (Sampling)
Chain 2: 6.741 seconds (Total)
Chain 2:
SAMPLING FOR MODEL 'continuous' NOW (CHAIN 3).
Chain 3:
Chain 3: Gradient evaluation took 1.5e-05 seconds
Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 0.15 seconds.
Chain 3: Adjust your expectations accordingly!
Chain 3:
Chain 3:
Chain 3: Iteration: 1 / 3000 [ 0%] (Warmup)
Chain 3: Iteration: 300 / 3000 [ 10%] (Warmup)
Chain 3: Iteration: 600 / 3000 [ 20%] (Warmup)
Chain 3: Iteration: 900 / 3000 [ 30%] (Warmup)
Chain 3: Iteration: 1200 / 3000 [ 40%] (Warmup)
Chain 3: Iteration: 1500 / 3000 [ 50%] (Warmup)
Chain 3: Iteration: 1501 / 3000 [ 50%] (Sampling)
Chain 3: Iteration: 1800 / 3000 [ 60%] (Sampling)
Chain 3: Iteration: 2100 / 3000 [ 70%] (Sampling)
Chain 3: Iteration: 2400 / 3000 [ 80%] (Sampling)
Chain 3: Iteration: 2700 / 3000 [ 90%] (Sampling)
Chain 3: Iteration: 3000 / 3000 [100%] (Sampling)
Chain 3:
Chain 3: Elapsed Time: 3.686 seconds (Warm-up)
Chain 3: 4.598 seconds (Sampling)
Chain 3: 8.284 seconds (Total)
Chain 3:
SAMPLING FOR MODEL 'continuous' NOW (CHAIN 4).
Chain 4:
Chain 4: Gradient evaluation took 1.5e-05 seconds
Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 0.15 seconds.
Chain 4: Adjust your expectations accordingly!
Chain 4:
Chain 4:
Chain 4: Iteration: 1 / 3000 [ 0%] (Warmup)
Chain 4: Iteration: 300 / 3000 [ 10%] (Warmup)
Chain 4: Iteration: 600 / 3000 [ 20%] (Warmup)
Chain 4: Iteration: 900 / 3000 [ 30%] (Warmup)
Chain 4: Iteration: 1200 / 3000 [ 40%] (Warmup)
Chain 4: Iteration: 1500 / 3000 [ 50%] (Warmup)
Chain 4: Iteration: 1501 / 3000 [ 50%] (Sampling)
Chain 4: Iteration: 1800 / 3000 [ 60%] (Sampling)
Chain 4: Iteration: 2100 / 3000 [ 70%] (Sampling)
Chain 4: Iteration: 2400 / 3000 [ 80%] (Sampling)
Chain 4: Iteration: 2700 / 3000 [ 90%] (Sampling)
Chain 4: Iteration: 3000 / 3000 [100%] (Sampling)
Chain 4:
Chain 4: Elapsed Time: 3.183 seconds (Warm-up)
Chain 4: 2.211 seconds (Sampling)
Chain 4: 5.394 seconds (Total)
Chain 4:
# Remember to always check your warnings!
warnings()
Sometimes you will encounter convergence issues in your models. This can happen for a variety of reasons, including the model structure, the data, or the priors. You can try increasing the number of iterations, changing the priors, or using a different model structure. Running additional chains can also help you detect convergence problems.
To check convergence, you can use the bayesplot::mcmc_trace() function to plot the trace of the chains. You can also use the summary() function to check the R-hat statistic. The R-hat statistic should be close to 1 for all parameters; values substantially greater than 1 indicate that the chains have not converged.
bayesplot::mcmc_trace(pp$stan)

pp |> summary()
Model Info:
function: stan_glmer
family: gaussian [identity]
formula: statistic ~ model + (1 | id2/id)
algorithm: sampling
sample: 6000 (posterior sample size)
priors: see help('prior_summary')
observations: 60
groups: id:id2 (30), id2 (10)
Estimates:
mean sd 10% 50% 90%
(Intercept) 0.2 0.0 0.1 0.2 0.2
modelfull 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id:id2:Repeat1:Fold01] -0.1 0.0 -0.1 -0.1 0.0
b[(Intercept) id:id2:Repeat1:Fold02] -0.1 0.0 -0.1 -0.1 0.0
b[(Intercept) id:id2:Repeat1:Fold03] -0.1 0.0 -0.1 -0.1 -0.1
b[(Intercept) id:id2:Repeat1:Fold04] 0.1 0.0 0.1 0.1 0.1
b[(Intercept) id:id2:Repeat1:Fold05] -0.1 0.0 -0.2 -0.1 -0.1
b[(Intercept) id:id2:Repeat1:Fold06] 0.1 0.0 0.1 0.1 0.1
b[(Intercept) id:id2:Repeat1:Fold07] 0.1 0.0 0.1 0.1 0.1
b[(Intercept) id:id2:Repeat1:Fold08] 0.2 0.0 0.2 0.2 0.2
b[(Intercept) id:id2:Repeat1:Fold09] 0.0 0.0 0.0 0.0 0.1
b[(Intercept) id:id2:Repeat1:Fold10] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id:id2:Repeat2:Fold01] 0.1 0.0 0.0 0.1 0.1
b[(Intercept) id:id2:Repeat2:Fold02] 0.0 0.0 0.0 0.0 0.1
b[(Intercept) id:id2:Repeat2:Fold03] -0.1 0.0 -0.1 -0.1 -0.1
b[(Intercept) id:id2:Repeat2:Fold04] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id:id2:Repeat2:Fold05] 0.1 0.0 0.0 0.1 0.1
b[(Intercept) id:id2:Repeat2:Fold06] 0.0 0.0 0.0 0.0 0.1
b[(Intercept) id:id2:Repeat2:Fold07] 0.0 0.0 0.0 0.0 0.1
b[(Intercept) id:id2:Repeat2:Fold08] 0.0 0.0 -0.1 0.0 0.0
b[(Intercept) id:id2:Repeat2:Fold09] -0.2 0.0 -0.2 -0.2 -0.1
b[(Intercept) id:id2:Repeat2:Fold10] 0.2 0.0 0.1 0.2 0.2
b[(Intercept) id:id2:Repeat3:Fold01] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id:id2:Repeat3:Fold02] -0.1 0.0 -0.1 -0.1 0.0
b[(Intercept) id:id2:Repeat3:Fold03] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id:id2:Repeat3:Fold04] -0.1 0.0 -0.1 -0.1 -0.1
b[(Intercept) id:id2:Repeat3:Fold05] -0.1 0.0 -0.1 -0.1 -0.1
b[(Intercept) id:id2:Repeat3:Fold06] 0.0 0.0 -0.1 0.0 0.0
b[(Intercept) id:id2:Repeat3:Fold07] -0.1 0.0 -0.1 -0.1 -0.1
b[(Intercept) id:id2:Repeat3:Fold08] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id:id2:Repeat3:Fold09] 0.1 0.0 0.1 0.1 0.1
b[(Intercept) id:id2:Repeat3:Fold10] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id2:Fold01] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id2:Fold02] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id2:Fold03] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id2:Fold04] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id2:Fold05] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id2:Fold06] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id2:Fold07] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id2:Fold08] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id2:Fold09] 0.0 0.0 0.0 0.0 0.0
b[(Intercept) id2:Fold10] 0.0 0.0 0.0 0.0 0.0
sigma 0.0 0.0 0.0 0.0 0.0
Sigma[id:id2:(Intercept),(Intercept)] 0.0 0.0 0.0 0.0 0.0
Sigma[id2:(Intercept),(Intercept)] 0.0 0.0 0.0 0.0 0.0
Fit Diagnostics:
mean sd 10% 50% 90%
mean_PPD 0.2 0.0 0.2 0.2 0.2
The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.0 1.0 1076
modelfull 0.0 1.0 5335
b[(Intercept) id:id2:Repeat1:Fold01] 0.0 1.0 1923
b[(Intercept) id:id2:Repeat1:Fold02] 0.0 1.0 2026
b[(Intercept) id:id2:Repeat1:Fold03] 0.0 1.0 1448
b[(Intercept) id:id2:Repeat1:Fold04] 0.0 1.0 1886
b[(Intercept) id:id2:Repeat1:Fold05] 0.0 1.0 1718
b[(Intercept) id:id2:Repeat1:Fold06] 0.0 1.0 1610
b[(Intercept) id:id2:Repeat1:Fold07] 0.0 1.0 2092
b[(Intercept) id:id2:Repeat1:Fold08] 0.0 1.0 1879
b[(Intercept) id:id2:Repeat1:Fold09] 0.0 1.0 2070
b[(Intercept) id:id2:Repeat1:Fold10] 0.0 1.0 1631
b[(Intercept) id:id2:Repeat2:Fold01] 0.0 1.0 1860
b[(Intercept) id:id2:Repeat2:Fold02] 0.0 1.0 1975
b[(Intercept) id:id2:Repeat2:Fold03] 0.0 1.0 1470
b[(Intercept) id:id2:Repeat2:Fold04] 0.0 1.0 1851
b[(Intercept) id:id2:Repeat2:Fold05] 0.0 1.0 1824
b[(Intercept) id:id2:Repeat2:Fold06] 0.0 1.0 1755
b[(Intercept) id:id2:Repeat2:Fold07] 0.0 1.0 1949
b[(Intercept) id:id2:Repeat2:Fold08] 0.0 1.0 1842
b[(Intercept) id:id2:Repeat2:Fold09] 0.0 1.0 2012
b[(Intercept) id:id2:Repeat2:Fold10] 0.0 1.0 1694
b[(Intercept) id:id2:Repeat3:Fold01] 0.0 1.0 1805
b[(Intercept) id:id2:Repeat3:Fold02] 0.0 1.0 1992
b[(Intercept) id:id2:Repeat3:Fold03] 0.0 1.0 1472
b[(Intercept) id:id2:Repeat3:Fold04] 0.0 1.0 1864
b[(Intercept) id:id2:Repeat3:Fold05] 0.0 1.0 1711
b[(Intercept) id:id2:Repeat3:Fold06] 0.0 1.0 1713
b[(Intercept) id:id2:Repeat3:Fold07] 0.0 1.0 2034
b[(Intercept) id:id2:Repeat3:Fold08] 0.0 1.0 1900
b[(Intercept) id:id2:Repeat3:Fold09] 0.0 1.0 2094
b[(Intercept) id:id2:Repeat3:Fold10] 0.0 1.0 1653
b[(Intercept) id2:Fold01] 0.0 1.0 4322
b[(Intercept) id2:Fold02] 0.0 1.0 3431
b[(Intercept) id2:Fold03] 0.0 1.0 1479
b[(Intercept) id2:Fold04] 0.0 1.0 3812
b[(Intercept) id2:Fold05] 0.0 1.0 2706
b[(Intercept) id2:Fold06] 0.0 1.0 2443
b[(Intercept) id2:Fold07] 0.0 1.0 4066
b[(Intercept) id2:Fold08] 0.0 1.0 2569
b[(Intercept) id2:Fold09] 0.0 1.0 4127
b[(Intercept) id2:Fold10] 0.0 1.0 1656
sigma 0.0 1.0 1815
Sigma[id:id2:(Intercept),(Intercept)] 0.0 1.0 1626
Sigma[id2:(Intercept),(Intercept)] 0.0 1.0 942
mean_PPD 0.0 1.0 6198
log-posterior 0.2 1.0 940
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
We can now examine the posterior distributions for the two models, which give us the probability of each model's performance given the data. We can use the tidy() function to extract the posterior samples from the fitted performance model.
pp_tidy <- pp |>
  tidy(seed = 123)

pp_tidy |>
  group_by(model) |>
  summarize(mean = mean(posterior),
            lower = quantile(posterior, probs = .025),
            upper = quantile(posterior, probs = .975)) |>
  mutate(model = factor(model, levels = c("full", "compact"))) |>
  arrange(model)
# A tibble: 2 × 4
model mean lower upper
<fct> <dbl> <dbl> <dbl>
1 full 0.159 0.124 0.194
2 compact 0.159 0.124 0.193
We can also plot the posterior distributions of the models.
pp_tidy |>
  ggplot() +
  geom_density(aes(x = posterior, color = model)) +
  xlab("R Squared")

pp_tidy |>
  ggplot() +
  geom_histogram(aes(x = posterior, fill = model), color = "white", alpha = 0.4,
                 bins = 50, position = "identity") +
  xlab("R Squared")
To visualize a little better, we can use faceted plots and add the mean and 95% credible intervals to each model, separately. The mean is the average of the posterior distribution, and the 95% credible interval is the range of values that contains 95% of the posterior distribution.
ci <- pp_tidy |>
  summary() |>
  mutate(y = 450)

pp_tidy |>
  ggplot(aes(x = posterior)) +
  geom_histogram(aes(x = posterior, fill = model), color = "white", bins = 50) +
  geom_segment(mapping = aes(y = y + 50, yend = y - 50, x = mean, xend = mean,
                             color = model),
               data = ci) +
  geom_segment(mapping = aes(y = y, yend = y, x = lower, xend = upper, color = model),
               data = ci) +
  facet_wrap(~ model, ncol = 1) +
  theme(legend.position = "none") +
  ylab("Count") +
  xlab("R Squared")
Next, we determine whether the models are practically equivalent. We can use the contrast_models() function to compare the two models; it takes the performance model and computes the posterior distribution of the difference in performance between them.
# contrast_models() sets a random seed internally via its seed argument,
# so there is no need to set a seed outside the call
pp |>
  contrast_models(seed = 12) |>
  summary(size = .01) |>
  glimpse()
Rows: 1
Columns: 9
$ contrast <chr> "full vs compact"
$ probability <dbl> 0.5206667
$ mean <dbl> 0.0002721943
$ lower <dbl> -0.008999285
$ upper <dbl> 0.01001865
$ size <dbl> 0.01
$ pract_neg <dbl> 0.03483333
$ pract_equiv <dbl> 0.9146667
$ pract_pos <dbl> 0.0505
A ROPE (region of practical equivalence) is the range of values that we consider practically equivalent to no difference. If the posterior distribution of the difference falls within the ROPE, we consider the models to be equivalent. We define the range of the ROPE entirely based on the context of the problem (i.e., domain knowledge); here we use ±0.01 R-squared.
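As a quick check, we can compute the posterior probability that the difference falls inside the ROPE directly from the draws returned by contrast_models() (the same difference column used in the plot below); this should closely match the pract_equiv value reported above.

# Sketch: proportion of posterior draws of the difference that fall inside the
# +/- .01 ROPE. This treats the contrast_models() result as a data frame with a
# difference column, as the plot below does.
pp |>
  contrast_models(seed = 12) |>
  summarize(prob_equiv = mean(abs(difference) <= .01))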
pp |>
  contrast_models(seed = 12) |>
  ggplot(aes(x = difference)) +
  geom_histogram(bins = 50, color = "white", fill = "light grey") +
  geom_vline(aes(xintercept = -.01), linetype = "dashed") +
  geom_vline(aes(xintercept = .01), linetype = "dashed")
Feature Importance
Prepare model:
feat_all <- rec_full |>
  prep(data_all) |>
  bake(NULL)

fit_all_data <- linear_reg(penalty = select_best(fits_full, metric = "rsq")$penalty,
                           mixture = select_best(fits_full, metric = "rsq")$mixture) |>
  set_engine("glmnet") |>
  set_mode("regression") |>
  fit(grade ~ ., data = feat_all)
We now need a prediction wrapper and an explainer object. The prediction wrapper is a function that takes a model and a dataset and returns a vector of predictions. The explainer object bundles the model, the data, and the prediction function so that DALEX can compute feature importance and other explanations. We define the prediction wrapper ourselves (predict_wrapper() below) and create the explainer with explain_tidymodels(). We need a prediction wrapper because the DALEX package is designed to be generic to accommodate different model types; in particular, the wrapper specifies which prediction column to use for classification models.
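For instance, a hypothetical wrapper for a classification model might pull a class-probability column instead (the name .pred_yes is illustrative; the actual column depends on the outcome's levels).

# Hypothetical classification wrapper (not used in this lab): return the predicted
# probability of the positive class rather than a numeric prediction.
predict_wrapper_class <- function(model, newdata) {
  predict(model, newdata, type = "prob") |>
    pull(.pred_yes)
}

Our model here is a regression, so the wrapper below simply returns the .pred column.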
x <- feat_all |> select(-grade)
y <- feat_all |> pull(grade)

predict_wrapper <- function(model, newdata) {
  predict(model, newdata) |>
    pull(.pred)
}

explain_full <- explain_tidymodels(fit_all_data,
                                   data = x,
                                   y = y,
                                   predict_function = predict_wrapper)
Preparation of a new explainer is initiated
-> model label : model_fit ( default )
-> data : 395 rows 44 cols
-> data : tibble converted into a data.frame
-> target variable : 395 values
-> predict function : predict_function
-> predicted values : No value for predict function target column. ( default )
-> model_info : package parsnip , ver. 1.2.1 , task regression ( default )
-> predicted values : numerical, min = 4.661688 , mean = 10.41519 , max = 15.13141
-> residual function : difference between y and yhat ( default )
-> residuals : numerical, min = -12.60379 , mean = -6.630983e-15 , max = 8.618674
A new explainer has been created!
Permutation Feature Importance
Permutation feature importance is a technique used to assess the importance of individual features in a model by permuting the values of each feature and observing the impact on model performance. This process helps identify which features contribute most to the model’s predictive power.
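Before using DALEX, here is a minimal sketch of the underlying idea, using the x, y, fit_all_data, and predict_wrapper objects from above (the choice of absences is just for illustration): shuffle one feature, re-predict, and see how much the error increases.

# Sketch of permutation importance for a single feature: permute absences,
# re-predict, and compare RMSE before and after. A larger increase in RMSE
# means the feature mattered more to the model.
rmse_manual <- function(truth, estimate) sqrt(mean((truth - estimate)^2))

rmse_original <- rmse_manual(y, predict_wrapper(fit_all_data, x))

x_permuted <- x |> mutate(absences = sample(absences))
rmse_permuted <- rmse_manual(y, predict_wrapper(fit_all_data, x_permuted))

rmse_permuted - rmse_original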
We can use the model_parts() function to calculate permutation feature importance. model_parts() takes the explainer object and calculates the feature importance. The loss_function argument specifies the loss function to use, the type argument specifies how the importance is expressed (here the raw dropout loss), the B argument specifies the number of permutations, and the variable_groups argument (not used here) lets you compute importance for groups of variables together.
set.seed(123456)

permute <- model_parts(explain_full,
                       loss_function = loss_root_mean_square,
                       type = "raw",
                       B = 100)

permute
variable mean_dropout_loss label
1 _full_model_ 3.916733 model_fit
2 father_job_at_home 3.916715 model_fit
3 father_educ_primary 3.916795 model_fit
4 school_reason_home 3.916818 model_fit
5 nursery_no 3.917342 model_fit
6 father_educ_secondary 3.917356 model_fit
7 guardian_other 3.917679 model_fit
8 weekend_alcohol 3.918067 model_fit
9 father_educ_middle 3.918106 model_fit
10 father_educ_higher 3.918279 model_fit
11 guardian_father 3.918521 model_fit
12 mother_educ_secondary 3.918767 model_fit
13 parent_cohabit_together 3.918942 model_fit
14 activities_yes 3.919963 model_fit
15 mother_job_other 3.920056 model_fit
16 weekday_alcohol 3.920432 model_fit
17 internet_yes 3.921272 model_fit
18 paid_yes 3.921522 model_fit
19 school_ms 3.921812 model_fit
20 family_rel_quality 3.923993 model_fit
21 father_job_services 3.924818 model_fit
22 address_rural 3.925079 model_fit
23 free_time 3.925519 model_fit
24 health 3.925835 model_fit
25 travel_time_more_than_1_hour 3.929091 model_fit
26 school_reason_other 3.929913 model_fit
27 mother_job_health 3.931932 model_fit
28 father_job_other 3.932174 model_fit
29 mother_educ_middle 3.932963 model_fit
30 mother_educ_primary 3.936710 model_fit
31 school_reason_reputation 3.937171 model_fit
32 family_size_less_or_equal_to_3 3.938001 model_fit
33 mother_job_teacher 3.938093 model_fit
34 age 3.943200 model_fit
35 mother_educ_higher 3.943653 model_fit
36 mother_job_services 3.945399 model_fit
37 school_support_no 3.948316 model_fit
38 study_time 3.948692 model_fit
39 higher_no 3.953399 model_fit
40 family_support_yes 3.954710 model_fit
41 absences 3.961208 model_fit
42 romantic_yes 3.966871 model_fit
43 go_out 3.986041 model_fit
44 sex_m 3.988668 model_fit
45 failures_yes 4.230520 model_fit
46 _baseline_ 4.951863 model_fit
plot(permute)
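The permute object is itself a data frame of dropout losses, so we can also summarize it ourselves rather than relying on the default printout. A hedged sketch (the column names variable and dropout_loss are assumed from DALEX's feature-importance objects):

# Sketch: average the dropout loss over the permutations for each feature and
# sort from most to least important. Column names (variable, dropout_loss) are
# assumed; check names(permute) if they differ.
permute |>
  as_tibble() |>
  filter(!variable %in% c("_full_model_", "_baseline_")) |>
  group_by(variable) |>
  summarize(mean_dropout_loss = mean(dropout_loss)) |>
  arrange(desc(mean_dropout_loss))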
Shapley Feature Importance
Shapley feature importance is a technique used to assess the importance of individual features in a model by calculating Shapley values. Shapley values are a concept from cooperative game theory; they assign a value to each feature based on its contribution to the model’s prediction.
We can calculate Shapley values for a single participant. Here’s an example where we calculate Shapley values for the last subject. The new_observation argument specifies the observation to calculate the Shapley values for, and the B argument specifies the number of permutations to use.
x1 <- x |> slice(nrow(feat_all))

sv <- predict_parts(explain_full,
                    new_observation = x1,
                    type = "shap",
                    B = 25)
plot(sv)
We can calculate global Shapley feature importance by averaging the absolute values of the local Shapley values (the Shapley values for individual observations) across observations.
get_shaps <- function(df1){
  predict_parts(explain_full,
                new_observation = df1,
                type = "shap",
                B = 25) |>
    filter(B == 0) |>
    select(variable_name, variable_value, contribution) |>
    as_tibble()
}
# To save time, I will only calculate Shapley values for 10 observations.
set.seed(123456)

local_shaps <- x |>
  slice_sample(n = 10) |>
  mutate(id = row_number()) |>
  nest(.by = id, .key = "dfs") |>
  mutate(shapleys = map(dfs, \(df1) get_shaps(df1))) |>
  select(-dfs) |>
  unnest(shapleys)
local_shaps |>
  mutate(contribution = abs(contribution)) |>
  group_by(variable_name) |>
  summarize(mean_shap = mean(contribution)) |>
  arrange(desc(mean_shap)) |>
  mutate(variable_name = factor(variable_name),
         variable_name = fct_reorder(variable_name, mean_shap)) |>
  ggplot(aes(x = variable_name, y = mean_shap)) +
  geom_point() +
  coord_flip()
Explanatory Plots
Partial Dependence Plots
A partial dependence plot (PDP) is a technique used to visualize the relationship between a feature and the predicted outcome of a model. It shows the average predicted outcome across different values of the feature, while holding all other features constant.
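To make the idea concrete, here is a minimal sketch of a PDP for study_time computed by hand (assuming the x, fit_all_data, and predict_wrapper objects from the feature importance section; model_profile() below does this for us, with more options):

# Sketch: for each observed value of study_time, set every row of x to that value,
# predict, and average the predictions. Assumes x, fit_all_data, and predict_wrapper
# from above.
pdp_manual <- tibble(study_time = sort(unique(x$study_time))) |>
  mutate(mean_pred = map_dbl(study_time,
                             \(v) mean(predict_wrapper(fit_all_data,
                                                       x |> mutate(study_time = v)))))
pdp_manual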
We can use the model_profile() function to calculate the PDP. model_profile() takes the explainer object and computes the profile. The variable argument specifies the variable to plot, the type argument specifies the type of profile to create ("partial" for a PDP), and the N argument specifies the number of observations to use (N = NULL uses all observations).
pdp <- model_profile(explain_full,
                     variable = "study_time",
                     type = "partial",
                     N = NULL)

pdp |> plot()
Accumulated Local Effects (ALE) Plots
Accumulated local effects (ALE) plots are another technique for visualizing the relationship between a feature and the predicted outcome of a model. Rather than averaging predictions with all other features held constant, ALE plots accumulate local changes in the prediction, which makes them more reliable than PDPs when features are correlated or interact.

We can use the model_profile() function to calculate the ALE profile as well. The variable argument specifies the variable to plot, the type argument specifies the type of profile to create ("accumulated" for ALE), and the N argument specifies the number of observations to use.
ale <- model_profile(explain_full,
                     type = "accumulated",
                     variable = "age",
                     N = NULL)

ale |> plot()