Похоже, что ресэмплы создаются до применения рецепта.
Так что вы могли бы применить к рецепту `prep()` и `juice()`, а затем использовать интерфейс формулы:
library(recipes)
library(caret)
library(rsample)
data(airquality)

# Reproducible 70/30 train/test split
set.seed(33)
air_split <- initial_split(airquality, prop = 0.7)
air_train <- training(air_split)
air_test <- testing(air_split)

# Feature engineering: the final recipe, with Ozone as the outcome
air_recipe <-
  recipe(Ozone ~ ., data = air_train) %>%
  step_zv(all_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_knnimpute(all_numeric(), neighbors = 6) %>%
  step_log(Ozone, Wind) %>%
  step_other(Day, threshold = 0.01, other = "other") %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_naomit(all_outcomes(), all_predictors())

# Estimate the recipe once on the training set, then pull out the processed
# training data. From here on `air_train` holds the preprocessed rows, so
# every resample below is drawn from data that has already been transformed.
air_prep <- air_recipe %>% prep(retain = TRUE)
air_train <- air_prep %>% juice()

# Resampling scheme: 5-fold cross-validation, repeated 5 times
cv5 <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 5,
  allowParallel = TRUE
)

# Fit a linear model through caret's formula interface, scored by RMSE
set.seed(12)
lm_fit <- train(
  Ozone ~ .,
  data = air_train,
  method = "lm",
  trControl = cv5,
  metric = "RMSE"
)

lm_fit
#> Linear Regression
#>
#> 108 samples
#> 5 predictor
#>
#> No pre-processing
#> Resampling: Cross-Validated (5 fold, repeated 5 times)
#> Summary of sample sizes: 86, 88, 86, 86, 86, 86, ...
#> Resampling results:
#>
#> RMSE Rsquared MAE
#> 0.5091496 0.6568485 0.3793589
#>
#> Tuning parameter 'intercept' was held constant at a value of TRUE
Кроме того, вы можете использовать {parsnip} и {tune}, чтобы оставаться в рамках идиом tidymodels:
library(recipes)
library(rsample)
library(parsnip)
library(tune)
library(yardstick)
data(airquality)

# Reproducible 70/30 train/test split
set.seed(33)
air_split <- initial_split(airquality, prop = 0.7)
air_train <- training(air_split)
air_test <- testing(air_split)

# Feature engineering recipe with Ozone as the outcome. It is left unprepped
# here and handed to fit_resamples() as the preprocessor.
air_recipe <- recipe(Ozone ~ ., data = air_train) %>%
  step_zv(all_predictors()) %>%
  step_nzv(all_predictors()) %>%
  step_knnimpute(all_numeric(), neighbors = 6) %>%
  step_log(Ozone, Wind) %>%
  step_other(Day, threshold = 0.01, other = "other") %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_naomit(all_outcomes(), all_predictors())

# Resampling scheme: 5-fold cross-validation, repeated 5 times
air_cv <- vfold_cv(air_train, v = 5, repeats = 5)

# Linear regression specification using the "lm" engine
lm_mod <- linear_reg() %>%
  set_engine("lm")

# Since tune 0.1.0 the model specification (or a workflow) is the first
# argument and the preprocessor the second; passing the recipe first is
# deprecated.
lm_fits <- fit_resamples(
  lm_mod,
  preprocessor = air_recipe,
  resamples = air_cv
)

# `maximize` is deprecated: the direction of optimisation is taken from the
# metric itself (RMSE is minimised), so it is no longer passed.
show_best(lm_fits, metric = "rmse")
#> # A tibble: 1 x 5
#> .metric .estimator mean n std_err
#> <chr> <chr> <dbl> <int> <dbl>
#> 1 rmse standard 0.526 25 0.0256
Создано 2020-04-05 с помощью пакета reprex (v0.3.0)