Параллельная обработка внутри функции с каретной моделью - PullRequest
0 голосов
/ 30 апреля 2020

Я пытаюсь создать функцию каретки «все в одном» для обучения моделей каретки с разными входами. Я хочу, чтобы эта функция была независимой от всех других вызовов.

Функция, которую я разработал до сих пор, кажется воспроизводимой для некоторых моделей, а не для других.

Например, ниже я тренирую gbm на наборе данных радужной оболочки = не удается воспроизвести. Затем обучите модель rpart = воспроизведение (кроме разницы во времени). Моя функция звучит? Можно ли указывать параллельную обработку внутри функции?

Этот вопрос имеет небольшие ссылки на другие: Полностью воспроизводимые параллельные модели с использованием каретки , Лучшая функция для сравнения модели каретки объекты

library(caret)
library(recipes)
library(doParallel)

# recipe to be supplied
Recipe.Obj <- recipe(Sepal.Length ~ ., data = iris) %>% 
  step_zv(all_predictors()) %>%
  step_nzv(all_predictors()) %>% 
  step_normalize(all_numeric(), -Sepal.Length)

# train control object
TC.Obj <- trainControl("cv", savePredictions = "all", summaryFunction = defaultSummary, returnResamp = "all")

# tune.grid values to be used
gbm.TG = expand.grid(n.trees = c(seq(50,200, by = 100)),
                     interaction.depth = c(1:2),
                     shrinkage = c(0.05,0.1),
                     n.minobsinnode = c(10,20))

# function for seed generation
generateTrainSeeds <- function(repeats, resampling_folds, nModels) {
  #nmodels = nparams * ntunelength

  vectorLength <- repeats * resampling_folds + 1

  set.seed(0)
  seeds <- vector(mode = "list", length = vectorLength)

  # for resampling
  for(i in 1:(vectorLength-1)) seeds[[i]]<- sample.int(n=1000, nModels)

  # for final model
  seeds[[vectorLength]] <- sample.int(1000, 1)

  return (seeds)
}

# parallel processing model training function
SO.Parallel.Model.Train.Func <- function(Model,Recipe, TC, Training.Data.Set, Metric, Tune.Length = NULL, Tune.Grid = NULL) {

  # determine number of tuning parameters
  nTuneParameters <- nrow(modelLookup(Model))

  resampling_folds <- TC$number
  repeats <- ifelse(is.na(TC$repeats), 1, TC$repeats)

  # specify number of models according to which argument (Tune.Length or Tune.Grid) is supplied
  if(!missing("Tune.Length")){
    nModels <- Tune.Length * nTuneParameters
  } else if (!missing("Tune.Grid")){
    nModels <- nrow(Tune.Grid)
  }

  # use function to generate seeds for reproducibility
  seeds <- generateTrainSeeds(repeats, resampling_folds, nModels)

  # allocate seeds to the trainControl() object
  TC$seeds <- seeds

  # establish parallel environment
  cl <- makePSOCKcluster(detectCores()-3, autoStop = TRUE)
  registerDoParallel(cl)

  # run model if Tune.Length is supplied, otherwise do with Tune.Grid

  if(!missing("Tune.Length")){

    set.seed(0)
    Model.Obj <- train(Recipe,
                       method = Model,
                       data = Training.Data.Set,
                       trControl = TC,
                       metric = Metric,
                       tuneLength = Tune.Length
    )


  } else if (!missing("Tune.Grid")){

    set.seed(0)
    Model.Obj <- train(Recipe,
                       method = Model,
                       data = Training.Data.Set,
                       trControl = TC,
                       metric = Metric,
                       tuneGrid = Tune.Grid
    )   

  }

  # stop parallel environment & return model created
  stopCluster(cl)
  registerDoSEQ()

  return(Model.Obj)
}



# example 1
gbm.Model.1 <- SO.Parallel.Model.Train.Func(Model = "gbm",
                                        Recipe = Recipe.Obj,
                                        TC = TC.Obj,
                                        Training.Data.Set = iris,
                                        Metric = "RMSE",
                                        Tune.Grid = gbm.TG)

# example 2
gbm.Model.2 <- SO.Parallel.Model.Train.Func(Model = "gbm",
                                        Recipe = Recipe.Obj,
                                        TC = TC.Obj,
                                        Training.Data.Set = iris,
                                        Metric = "RMSE",
                                        Tune.Grid = gbm.TG)

# returns many differences with resampling performance
all.equal(gbm.Model.1, gbm.Model.2)

[1] "Component “results”: Component “RMSE”: Mean relative difference: 0.001501716"      
 [2] "Component “results”: Component “Rsquared”: Mean relative difference: 0.001225451"  
 [3] "Component “results”: Component “MAE”: Mean relative difference: 0.001895233"       
 [4] "Component “results”: Component “RMSESD”: Mean relative difference: 0.01187857"     
 [5] "Component “results”: Component “RsquaredSD”: Mean relative difference: 0.007913823"
 [6] "Component “results”: Component “MAESD”: Mean relative difference: 0.01049994"      
 [7] "Component “pred”: Component “pred”: Mean relative difference: 0.003305424"         
 [8] "Component “resample”: Component “RMSE”: Mean relative difference: 0.00653162"      
 [9] "Component “resample”: Component “Rsquared”: Mean relative difference: 0.004632896" 
[10] "Component “resample”: Component “MAE”: Mean relative difference: 0.00941898"       
[11] "Component “times”: Component “everything”: Mean relative difference: 0.03904282"   
[12] "Component “times”: Component “final”: Mean relative difference: 0.1666667" 

# example 3
rpart.Model.3 <- SO.Parallel.Model.Train.Func(Model = "rpart",
                                            Recipe = Recipe.Obj,
                                            TC = TC.Obj,
                                            Training.Data.Set = iris,
                                            Metric = "RMSE",
                                            Tune.Length = 3)

# example 4
rpart.Model.4 <- SO.Parallel.Model.Train.Func(Model = "rpart",
                                            Recipe = Recipe.Obj,
                                            TC = TC.Obj,
                                            Training.Data.Set = iris,
                                            Metric = "RMSE",
                                            Tune.Length = 3)

# gives the same result aside from the times
all.equal(rpart.Model.3, rpart.Model.4)
...