Я пытаюсь создать функцию каретки «все в одном» для обучения моделей каретки с разными входами. Я хочу, чтобы эта функция была независимой от всех других вызовов.
Функция, которую я разработал до сих пор, кажется воспроизводимой для некоторых моделей, а не для других.
Например, ниже я тренирую gbm на наборе данных радужной оболочки = не удается воспроизвести. Затем обучите модель rpart = воспроизведение (кроме разницы во времени). Моя функция звучит? Можно ли указывать параллельную обработку внутри функции?
Этот вопрос имеет небольшие ссылки на другие: Полностью воспроизводимые параллельные модели с использованием каретки , Лучшая функция для сравнения модели каретки объекты
library(caret)
library(recipes)
library(doParallel)
# recipe to be supplied
Recipe.Obj <- recipe(Sepal.Length ~ ., data = iris) %>%
step_zv(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_normalize(all_numeric(), -Sepal.Length)
# train control object
TC.Obj <- trainControl("cv", savePredictions = "all", summaryFunction = defaultSummary, returnResamp = "all")
# tune.grid values to be used
gbm.TG = expand.grid(n.trees = c(seq(50,200, by = 100)),
interaction.depth = c(1:2),
shrinkage = c(0.05,0.1),
n.minobsinnode = c(10,20))
# function for seed generation
generateTrainSeeds <- function(repeats, resampling_folds, nModels) {
#nmodels = nparams * ntunelength
vectorLength <- repeats * resampling_folds + 1
set.seed(0)
seeds <- vector(mode = "list", length = vectorLength)
# for resampling
for(i in 1:(vectorLength-1)) seeds[[i]]<- sample.int(n=1000, nModels)
# for final model
seeds[[vectorLength]] <- sample.int(1000, 1)
return (seeds)
}
# parallel processing model training function
SO.Parallel.Model.Train.Func <- function(Model,Recipe, TC, Training.Data.Set, Metric, Tune.Length = NULL, Tune.Grid = NULL) {
# determine number of tuning parameters
nTuneParameters <- nrow(modelLookup(Model))
resampling_folds <- TC$number
repeats <- ifelse(is.na(TC$repeats), 1, TC$repeats)
# specify number of models according to which argument (Tune.Length or Tune.Grid) is supplied
if(!missing("Tune.Length")){
nModels <- Tune.Length * nTuneParameters
} else if (!missing("Tune.Grid")){
nModels <- nrow(Tune.Grid)
}
# use function to generate seeds for reproducibility
seeds <- generateTrainSeeds(repeats, resampling_folds, nModels)
# allocate seeds to the trainControl() object
TC$seeds <- seeds
# establish parallel environment
cl <- makePSOCKcluster(detectCores()-3, autoStop = TRUE)
registerDoParallel(cl)
# run model if Tune.Length is supplied, otherwise do with Tune.Grid
if(!missing("Tune.Length")){
set.seed(0)
Model.Obj <- train(Recipe,
method = Model,
data = Training.Data.Set,
trControl = TC,
metric = Metric,
tuneLength = Tune.Length
)
} else if (!missing("Tune.Grid")){
set.seed(0)
Model.Obj <- train(Recipe,
method = Model,
data = Training.Data.Set,
trControl = TC,
metric = Metric,
tuneGrid = Tune.Grid
)
}
# stop parallel environment & return model created
stopCluster(cl)
registerDoSEQ()
return(Model.Obj)
}
# example 1
gbm.Model.1 <- SO.Parallel.Model.Train.Func(Model = "gbm",
Recipe = Recipe.Obj,
TC = TC.Obj,
Training.Data.Set = iris,
Metric = "RMSE",
Tune.Grid = gbm.TG)
# example 2
gbm.Model.2 <- SO.Parallel.Model.Train.Func(Model = "gbm",
Recipe = Recipe.Obj,
TC = TC.Obj,
Training.Data.Set = iris,
Metric = "RMSE",
Tune.Grid = gbm.TG)
# returns many differences with resampling performance
all.equal(gbm.Model.1, gbm.Model.2)
[1] "Component “results”: Component “RMSE”: Mean relative difference: 0.001501716"
[2] "Component “results”: Component “Rsquared”: Mean relative difference: 0.001225451"
[3] "Component “results”: Component “MAE”: Mean relative difference: 0.001895233"
[4] "Component “results”: Component “RMSESD”: Mean relative difference: 0.01187857"
[5] "Component “results”: Component “RsquaredSD”: Mean relative difference: 0.007913823"
[6] "Component “results”: Component “MAESD”: Mean relative difference: 0.01049994"
[7] "Component “pred”: Component “pred”: Mean relative difference: 0.003305424"
[8] "Component “resample”: Component “RMSE”: Mean relative difference: 0.00653162"
[9] "Component “resample”: Component “Rsquared”: Mean relative difference: 0.004632896"
[10] "Component “resample”: Component “MAE”: Mean relative difference: 0.00941898"
[11] "Component “times”: Component “everything”: Mean relative difference: 0.03904282"
[12] "Component “times”: Component “final”: Mean relative difference: 0.1666667"
# example 3
rpart.Model.3 <- SO.Parallel.Model.Train.Func(Model = "rpart",
Recipe = Recipe.Obj,
TC = TC.Obj,
Training.Data.Set = iris,
Metric = "RMSE",
Tune.Length = 3)
# example 4
rpart.Model.4 <- SO.Parallel.Model.Train.Func(Model = "rpart",
Recipe = Recipe.Obj,
TC = TC.Obj,
Training.Data.Set = iris,
Metric = "RMSE",
Tune.Length = 3)
# gives the same result aside from the times
all.equal(rpart.Model.3, rpart.Model.4)