Проблема исходит от
augment(model, newdata = assessment(splits))
, потому что на предыдущем шаге
model <- glm(egalit_scale ~ cut(income06, cc),
data = analysis(splits))
модель обучается на analysis(splits),
а не на assessment(splits),
и в результате количество строк не совпадает, например:
# Cross every fold id with the cc values 2 through 15, then attach each
# fold's split object. The join key is spelled out so the join does not
# depend on whichever column names happen to be shared.
out <- tidyr::expand(training_df, id, cc = 2:15) %>%
  left_join(training_df, by = "id")

# Pull the first split and show its analysis portion.
tmp <- out$splits[[1]]
analysis(tmp)
# A tibble: 1,332 x 2
# egalit_scale income06
# <dbl> <dbl>
# 1 27.3 9.69
# 2 7.71 8.48
# 3 34.3 21.3
# 4 7.85 15.8
# 5 13.3 24.6
# 6 26.2 8.67
# 7 34.3 4.78
# 8 17.9 16.8
# 9 1.45 21.2
#10 9.84 15.7
# … with 1,322 more rows
assessment(tmp)
# A tibble: 149 x 2
# egalit_scale income06
# <dbl> <dbl>
# 1 28.6 14.8
# 2 17.8 2.47
# 3 5.03 24.3
# 4 31.5 5.79
# 5 18.4 18.0
# 6 4.05 8.06
# 7 2.28 8.16
# 8 28.6 16.8
# 9 21.1 7.03
#10 3.67 14.2
# … with 139 more rows
Итак, если в вызове glm заменить analysis на assessment:
# Mean squared error of a binned-income GLM on one resampling split.
#
# splits: a split object understood by assessment().
# cc:     handed to cut() as its second argument, controlling how
#         income06 is binned.
#
# Returns the `.estimate` column of the mse() result.
mse_df <- function(splits, cc) {
  # Fit on the assessment portion of the split.
  fit <- glm(
    egalit_scale ~ cut(income06, cc),
    data = assessment(splits)
  )

  # Predict on that same assessment portion.
  predicted <- augment(fit, newdata = assessment(splits))

  # Score the rounded fitted values against the observed response.
  scored <- mse(predicted, truth = egalit_scale, estimate = round(.fitted))
  scored$.estimate
}
library(yardstick)

# Build the fold-by-cc grid, attach each fold's split object (join key given
# explicitly), and compute one MSE per row with mse_df().
out1 <- tidyr::expand(training_df, id, cc = 2:15) %>%
  left_join(training_df, by = "id") %>%
  mutate(mse = map2_dbl(splits, cc, mse_df))

out1
# A tibble: 140 x 4
# id cc splits mse
# <chr> <int> <named list> <dbl>
# 1 Fold01 2 <split [1.3K/149]> 94.9
# 2 Fold01 3 <split [1.3K/149]> 94.6
# 3 Fold01 4 <split [1.3K/149]> 93.8
# 4 Fold01 5 <split [1.3K/149]> 94.5
# 5 Fold01 6 <split [1.3K/149]> 94.0
# 6 Fold01 7 <split [1.3K/149]> 92.0
# 7 Fold01 8 <split [1.3K/149]> 88.9
# 8 Fold01 9 <split [1.3K/149]> 91.2
# 9 Fold01 10 <split [1.3K/149]> 92.8
#10 Fold01 11 <split [1.3K/149]> 86.0
# … with 130 more rows