Getting a ROC curve from benchmark results

I used mlr and batchtools to reproduce a benchmark comparison of logistic regression and random forest (two learners) on OpenML datasets. This is the work from (https://github.com/RaphaelCouronne/Benchmark_RF-LR_OpenML/tree/v0.9). Running the following two files usually takes two to three days, since it benchmarks against 1119 datasets. You can therefore skip the benchmarking by importing the data from (https://github.com/RaphaelCouronne/Benchmark_RF-LR_OpenML/tree/v0.9/Data) and just running the main.R below; a minimal loading sketch follows.
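If you only need the precomputed results, the loading step reduces to a sketch like this (assuming the repository's Data/ folder has been downloaded into the working directory):

library(batchtools)

# restore the finished batchtools registry and the dataset pool
regis = loadRegistry("Data/Results/Batchtools/batchtool_experiment//")
load("Data/OpenML/clas_time.RData")  # provides clas_time and its small/medium/big/toobig subsets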

# Filename: main.R (https://github.com/RaphaelCouronne/Benchmark_RF-LR_OpenML/blob/v0.9/main.R)

rm(list = ls())

# If possible increase memory used
options( java.parameters = "-Xmx8g" )

# Check that all packages can be loaded
library(mlr)
library(OpenML)
library(ggplot2)
library(snowfall)
library(cowplot)
library(RWeka) 
library(doParallel)
library(batchtools)

# Enter here nCores and myapikey
nCores = 3 # number of cores you want to use
myapikey = "7a4391537f767ea70db6af99497653e5" #OpenML API key
saveOMLConfig(apikey = myapikey, arff.reader = "RWeka", overwrite=TRUE)

## 1 Benchmark Study ======================================================================================

## 1.1 Data Mining ----
# Get the tasks from OpenML
# Generates Data/OpenML/df.infos.RData which gives information about the processing of the datasets
# Generates Data/Results/clas_time.RData which contains information about our dataset pool
# 
# Options
# force = TRUE to force (re)computing of ALL dataset informations
# computeTime = TRUE to compute an estimate of training time for LR and RF. It may take up to several days
source(file = "Benchmark/benchmark_getDataOpenML.R")
get_data_OpenML(target_path = "Data/OpenML/clas_time.RData", force = FALSE, computeTime = FALSE)


## 1.2 Benchmark computation ---
# Batchtools implementation
#source(file = "Benchmark/benchmark-batchtools.R")
#load("Data/OpenML/clas_time.RData")
#clas_used = rbind(clas_time_small, clas_time_medium, clas_time_big)

# Set up the benchmark (delete current results)
#setBatchtoolsExperiment(seed = 1, ncpus = 3, clas_used = clas_used)
#regis = loadRegistry("Data/Results/Batchtools/batchtool_experiment//")

# Launch benchmark
#submitJobs(ids = 1:193, reg = regis) #small datasets
#submitJobs(ids = 194:231, reg = regis) #medium datasets
#submitJobs(ids = 232:278, reg = regis) #big datasets

# Check benchmark
#getStatus()

regis = loadRegistry("Data/Results/Batchtools/batchtool_experiment//")
load("Data/OpenML/clas_time.RData")

# File name: benchmark_getDataOpenML.R (https://github.com/RaphaelCouronne/Benchmark_RF-LR_OpenML/blob/v0.9/Benchmark/benchmark_getDataOpenML.R)

# Remove non-useful datasets using the task attributes
# Load all the datasets, test them, compute their dimension and computation time
get_data_OpenML <- function(target_path = "Data/Results/clas_time.RData", force = FALSE, computeTime = FALSE, seed = 1) {

  library(RWeka)
  library(OpenML)

  print("Begin Datamining OpenML", quote = FALSE)

  # =============================
  # Part 1 : Get the basic information
  # ============================= ----

  print("1. Load the tasks", quote = FALSE)

  # Load the classification task information if it does not exist yet

  tasks = listOMLTasks(limit = NULL)
  classifTasks.infos = subset(tasks, task.type == "Supervised Classification" &    # classification
  number.of.classes == 2 &                             # binary classification
  number.of.instances.with.missing.values == 0)           # no missing values
  save(classifTasks.infos, file = "Data/OpenML/classifTasks.infos.RData" )

  # We work with the given data from OpenML
  load("Data/OpenML/classifTasks.infos.RData")
  datasets.index = sort(unique(classifTasks.infos$data.id))
  clas = classifTasks.infos
  print(paste("  Number of datasets for supervised binary classification without missing values :", dim(clas)[1]), quote = FALSE)


  # =============================
  # Part 2 : Select datasets using OpenML task features
  # ============================= 

  print("2. Remove datasets using the tasks features", quote = FALSE)

  # remove the redundancies : 470 datasets
  clas = clas[order(clas$data.id),]
  logic = diff(clas$data.id)>0
  clas = clas[logic,]
  print(paste("  Number of datasets after removing the redundacies of datasets's IDs :", dim(clas)[1]), quote = FALSE)

  # Friedman, volcanoes and trX datasets : 393 datasets
  clas = clas[substr(clas$name,1,9) != "volcanoes" & substr(clas$name,1,4) != "fri_" & substr(clas$name,1,3) != "tr1" & substr(clas$name,1,3) != "tr2" & substr(clas$name,1,3) != "tr3" & substr(clas$name,1,3) != "tr4", ]
  print(paste("  Number of datasets after removing the obviously simulated datasets :", dim(clas)[1]), quote = FALSE)
  print("  Datasets removed : Friedman, volcanoes, TrX-Datasets", quote = FALSE)

  # remove datasets with the same name; they often correspond to datasets with only very slight changes : 380 datasets
  doublon = names(sort(table(clas$name)[table(clas$name) > 1]))
  doublon = clas[clas$name %in% doublon,]
  doublon = doublon[order(doublon$name), ]

  # marks, within the name-sorted duplicate table, the first occurrence of each name (1) versus its repeats (0)
  diff.categorical <- function(x) {
    x = as.factor(x)
    n = length(x)
    res = rep(NA,n)
    res[1] = TRUE

    for (i in c(2:n)) {
      res[i] = !identical(x[i-1],x[i])
    }
    res = res*1
    return(res)
  }

  indexDuplicate.useful = which(diff.categorical(doublon$name)==1)
  indexDuplicate.notuseful = which(diff.categorical(doublon$name)==0)
  task.id.notuseful = doublon$task.id[indexDuplicate.notuseful]
  indexclas.notuseful = which(clas$task.id %in% task.id.notuseful)
  clas = clas[-indexclas.notuseful,]
  print(paste("  Number of datasets after removing the redundancies in dataset's names:", dim(clas)[1]), quote = FALSE)

  # Removing high-dimensional datasets : 326 datasets
  index.highdimension = which(clas$number.of.features>clas$number.of.instances)
  clas= clas[-index.highdimension,]
  print(paste("  Number of datasets after removing the high dimentional datasets p>n:", dim(clas)[1]), quote = FALSE)

  # Ordering according to size (n*p)
  clas = clas[order(clas$number.of.features * clas$number.of.instances), ]


  # =============================
  # Part 3 : Loading the datasets
  # ============================= ----
  print("3. Load the datasets for analysis", quote = FALSE)
  print("  Testing the datasets ", quote = FALSE)

  # categorical target and test loading the datas ----
  n.row = nrow(clas)

  if (file.exists("Data/OpenML/df.infos.RData") && !force) {
    print("  File \"Data/OpenML/df.infos.RData\" has been found and loaded, it will not be recomputed", quote = FALSE)

    # load the file
    load(file = "Data/OpenML/df.infos.RData")

    # check integrity of datasets
    if (!identical(clas$data.id,df.infos$data.id)) {
      print("  Difference between df.infos and clas", quote = FALSE)

      # reorganise the data.id
      notcomputed = subset(clas, select = c("data.id", "task.id", 
                                            "number.of.instances","number.of.features"))[which(!clas$data.id %in% df.infos$data.id),]
      df.infos.new = data.frame(matrix(data = NA, nrow = length(df.infos$data.id) + length(notcomputed$data.id), ncol = 15))
      names(df.infos.new) = c("index", "data.id", "task.id","n","p", "began", "done", 
                              "loaded","converted", "target_type", "dimension", 
                              "rf_time", "lr_time", "rf_NA", "lr_NA")

      df.infos.new[c(1:length(df.infos$data.id)),] = df.infos
      df.infos.new[c((length(df.infos$data.id)+1):length(df.infos.new$data.id)),c(2,3,4,5)] = notcomputed
      df.infos.new = df.infos.new[order(df.infos.new$data.id),]
      df.infos.new = df.infos.new[order(df.infos.new$n*df.infos.new$p),]
      df.infos.new$index = c(1:length(df.infos.new$index))
      df.infos = df.infos.new
    }


  } else {

    print("  Creating a new df.infos, total computation should last 1-2 days", quote = FALSE)

    # Create new df.infos
    df.infos = data.frame(matrix(data = NA, nrow = n.row, ncol = 15))
    names(df.infos) = c("index", "data.id", "task.id","n","p", "began", "done", 
                        "loaded","converted", "target_type", "dimension", 
                        "rf_time", "lr_time", "rf_NA", "lr_NA")
    df.infos$index = c(1:n.row)
    df.infos$data.id = clas$data.id
    df.infos$task.id=clas$task.id
    df.infos$n = clas$number.of.instances
    df.infos$p = clas$number.of.features

    if (file.exists("Data/OpenML/df.infos.Rout")) {file.remove("Data/OpenML/df.infos.Rout")}
  }


  index.not.done = which(is.na(df.infos$done))


  # If there are new datasets, try loading, conversion, computation of dimension, and time
  if (!(identical(index.not.done, integer(0))))  {
    df.infos.file <- file("Data/OpenML/df.infos.Rout", open = "w")

    pb <- txtProgressBar(min = 1, max = length(index.not.done), style = 3)

    for(index in c(1:length(index.not.done))){

      # Index
      j = index.not.done[index]

      # begin
      df.infos$began[j] = TRUE

      # Try
      tryCatch({

        # Loading the dataset
        omldataset = getOMLDataSet(data.id = clas$data.id[j], verbosity = 0)
        if (identical(omldataset$target.features, character(0))) {
          omldataset$target.features="Class"
          omldataset$desc$default.target.attribute="Class"
        }
        df.infos$loaded[j] = "TRUE" 

        # check the target
        df.infos$target_type[j] = class(omldataset$data[, omldataset$target.features])

        # Transform to mlr task
        configureMlr(on.learner.error = "warn", show.learner.output = TRUE, show.info = FALSE)
        mlrtask = convertOMLDataSetToMlr(omldataset, verbosity = 0)
        df.infos$converted[j] = TRUE

        # Get the dimension
        df.infos$dimension[j] = getTaskDimension(mlrtask)

        if (computeTime) {
          # Compute the training time for lr and rf
          learners = list(makeLearner("classif.randomForest"),
                          makeLearner("classif.logreg"))
          rdesc = makeResampleDesc("Holdout", split = 0.8, stratify = TRUE)
          configureMlr(on.learner.error = "warn", show.learner.output = TRUE, show.info = TRUE)

          sink(df.infos.file)
          sink(df.infos.file, type = "message")
          print(paste("Iteration",j,"dataset",clas$data.id[j]), quote = FALSE)
          set.seed(seed)
          bmr = benchmark(learners, mlrtask, rdesc, list(acc,timetrain), 
                          keep.pred = TRUE, models = FALSE, show.info = FALSE)
          sink() 
          sink(type="message")

          perfs=NA
          perfs = getBMRPerformances(bmr, as.df = TRUE)
          time.train = sum(perfs$timetrain)

          df.infos$rf_time[j]=perfs$timetrain[which(perfs$learner.id=="classif.randomForest")]
          df.infos$lr_time[j]=perfs$timetrain[which(perfs$learner.id=="classif.logreg")]

          df.infos$rf_NA[j] = is.na(perfs$acc[which(perfs$learner.id=="classif.randomForest")])
          df.infos$lr_NA[j] = is.na(perfs$acc[which(perfs$learner.id=="classif.logreg")])
        }

      }, error = function(e) return(paste0("The variable '", j, "'", 
                                           " caused the error: '", e, "'")))

      setTxtProgressBar(pb, index)
      df.infos$done[j] = TRUE
      save(df.infos, file = "Data/OpenML/df.infos.RData")
    }
  }



  # =============================
  # Part 4 : Select the datasets
  # ============================= ----
  print("4. Remove datasets that failed", quote = FALSE)
  clas_select = data.frame(clas, df.infos)
  clas_select = clas_select[, -which(names(clas_select) %in% c("task.id.1", "data.id.1"))]
  clas_select$time = clas_select$rf_time+clas_select$lr_time

  # remove the ones whose loading was unsuccessful
  data.id_loading_failed = clas_select$data.id[which(is.na(clas_select$loaded))]
  if (!identical(which(clas_select$data.id %in% data.id_loading_failed), integer(0))) {
    clas_select = clas_select[-which(clas_select$data.id %in% data.id_loading_failed),]
  }

  print(paste("  Removing the datasets which loading failed :",nrow(clas_select), "Datasets"), quote = FALSE)

  # remove the ones whose conversion was unsuccessful
  data.id_convert_failed = clas_select$data.id[which(is.na(clas_select$converted))]
  if (!identical(which(clas_select$data.id %in% data.id_convert_failed), integer(0))) {
    clas_select = clas_select[-which(clas_select$data.id %in% data.id_convert_failed),]
  }
  print(paste("  Removing the datasets which conversion failed :",nrow(clas_select), "Datasets"), quote = FALSE)


  # =============================
  # Part 5 : Save it
  # ============================= 

  clas_time = clas_select

  # clas_time : small, medium, big and too big
  # A "too big" dataset corresponds to a lr+rf training time exceeding 100s on a holdout with 80% train and 20% test
  # On our dataset pool this corresponds to datasets with n*p > 3000000
  clas_time_small = clas_time[which(clas_time$n*clas_time$p < 20000),]
  clas_time_medium = clas_time[which(clas_time$n*clas_time$p  > 20000 &  clas_time$n*clas_time$p  <  100000) ,]
  clas_time_big = clas_time[which(clas_time$n*clas_time$p  > 100000 &  clas_time$n*clas_time$p <  3000000) ,]
  clas_time_toobig = clas_time[which(clas_time$n*clas_time$p > 3000000),] 

  # save it
  save(clas_time, clas_time_small, clas_time_medium, clas_time_big, clas_time_toobig,  file = target_path )
}



# gets the dimensionality of the mlr dataset associated with the given task
getTaskDimension = function(task) {

  nCol = ncol(task$env$data)

  nbNumeric = task$task.desc$n.feat[1]
  nbFactorDimension = sum(sapply(c(1:nCol), function(x) getColumnDimension(task$env$data[,x]) ))
  nbFactorDimension = nbFactorDimension - 1 # because of the added 2nd level of the target

  dimension = as.integer(nbNumeric+nbFactorDimension) # because of the column target
  return(dimension)
}

# Get dimension column
getColumnDimension = function(col) {
  res = 0
  if (is.factor(col)) {
    res = length(levels(col))-1
  }
  return(res)
}
# File name: benchmark-batchtools.R (https://github.com/RaphaelCouronne/Benchmark_RF-LR_OpenML/blob/v0.9/Benchmark/benchmark-batchtools.R)
rm(list = ls())
detectCores(all.tests = FALSE, logical = TRUE)
library(mlr)
library(plyr)
library(batchtools)
library(OpenML)
saveOMLConfig(apikey = "7a4391537f767ea70db6af99497653e5", arff.reader = "RWeka", overwrite=TRUE)


setBatchtoolsExperiment = function(seed = 1, ncpus = 2, 
                                   clas_used,
                                   nameExperiment =  paste("Data/Batchtools/batchtool_experiment")) {

  # which subset of dataset
  omldatasets = clas_used$data.id


  unlink(nameExperiment, recursive = TRUE)
  regis = makeExperimentRegistry(nameExperiment, seed = seed,
                                 packages = c("mlr", "OpenML", "methods"), 
                                 #source = paste0(dir, "/benchmark_defs.R"),
                                 work.dir = paste0("Data/Batchtools"),
                                 #conf.file = paste0("Data/Batchtools/.batchtools.conf.R")
  )

  regis$cluster.functions = makeClusterFunctionsMulticore(ncpus = ncpus) 



  # add selected OML datasets as problems
  for (did in omldatasets) {
    data = list(did = did)
    addProblem(name = as.character(did), data = data)
  }


  # add one generic 'algo' that compares RF and LR
  addAlgorithm("eval", fun = function(job, data, instance,  ...) {
    par.vals = list(...)

   # tryCatch({

      # get the dataset
      omldataset = getOMLDataSet(data$did)
      if (identical(omldataset$target.features, character(0))) {
        omldataset$target.features="Class"
        omldataset$desc$default.target.attribute="Class"
      }
      task = convertOMLDataSetToMlr(omldataset)

      # learners
      lrn.classif.lr = makeLearner("classif.logreg", predict.type = "prob", fix.factors.prediction = TRUE)
      lrn.classif.rf = makeLearner("classif.randomForest", predict.type = "prob", fix.factors.prediction = TRUE)
      lrn.list = list(lrn.classif.lr,lrn.classif.rf)

      # measures
      measures = list(acc, brier, auc, timetrain)
      rdesc = makeResampleDesc("RepCV", folds = 5, reps = 10, stratify = TRUE)
      configureMlr(on.learner.error = "warn", show.learner.output = TRUE)
      bmr = benchmark(lrn.list, task, rdesc, measures, keep.pred = FALSE, models = FALSE, show.info = TRUE)
      bmr

    #}, error = function(e) return(paste0("The variable '", data$did, "'", 
    #                                     " caused the error: '", e, "'")))

  })


  # finalize experiment
  # set.seed(1)
  ades = data.frame(c(1))
  addExperiments(algo.designs = list(eval = ades))
  summarizeExperiments()
  getStatus()
}
# 
# # test jobs
# testJob(1, reg = regis)
# 
# # Now submit all
# options(batchtools.progress = TRUE)
# notDone = findNotDone()
# ids = chunkIds(notDone$job.id, chunk.size = 10)
# submitJobs(ids = ids, reg = regis)
# waitForJobs(ids = c(1:50), sleep = 10, timeout = 604800, stop.on.error = FALSE, reg = getDefaultRegistry())
# resetJobs(reg = regis, ids = c(1:50))
# getStatus()
# getErrorMessages()
# 
# res_classif_load = reduceResultsList(ids = 1:50, fun = function(r) as.list(r), reg = regis)
# 
# getErrorMessages(ids = 1:50, missing.as.error = FALSE,
#                  reg = regis)
# 
# getErrorMessages(ids = 1:50, reg = regis)

This produced regis. I saw that, in order to generate a ROC curve for my predictions, I have to call generateThreshVsPerfData() and plotROCCurves() on the benchmark result (a BenchmarkResult object). generateThreshVsPerfData() must be passed either the benchmark result (i.e. bmr) or a prediction. Since I cannot retrieve bmr directly from the regis object, I used reduceResultsList() and passed its output to generateThreshVsPerfData():

#retrieve benchmark result
result = reduceResultsList(ids = c(c(1:284), c(286:318)), reg = regis, missing.val = NA)
Reducing [===================================================================================================>] 100% eta:  0s> 
> df = generateThreshVsPerfData(result, measures = list(fpr, tpr, mmce))
Error in generateThreshVsPerfData.list(result, measures = list(fpr, tpr,  : 
  Assertion on 'obj' failed: May only contain the following types: Prediction,ResampleResult.
#method 2
> # Extract predictions
> preds = getBMRPredictions(result, drop = TRUE)
Error in getBMRPredictions(result, drop = TRUE) : 
  Assertion on 'bmr' failed: Must inherit from class 'BenchmarkResult', but has class 'list'.
> 
> # Change the class attribute
> preds2 = lapply(preds, function(x) {class(x) = "Prediction"; return(x)})
Error in lapply(preds, function(x) { : object 'preds' not found
> 
> # Draw ROC curves
> df = generateThreshVsPerfData(preds2, measures = list(fpr, tpr, mmce))
Error in generateThreshVsPerfData(preds2, measures = list(fpr, tpr, mmce)) : 
  object 'preds2' not found
> plotROCCurves(df)
Error in plotROCCurves(df) : 
  Assertion on 'obj' failed: Must inherit from class 'ThreshVsPerfData', but has class 'function'.
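From the errors above I gather two things: reduceResultsList() returns a plain list of BenchmarkResult objects (one per dataset), not a single BenchmarkResult, and my 'eval' algorithm called benchmark() with keep.pred = FALSE, so the stored results contain no predictions at all. Here is an untested sketch of what I think should work, assuming keep.pred = TRUE is set in the benchmark() call inside 'eval' and the jobs are re-run:

# assumes the jobs were re-run with keep.pred = TRUE in benchmark()
result = reduceResultsList(ids = findDone(reg = regis), reg = regis)

# each element is a BenchmarkResult for one dataset; pull out its Prediction objects
preds = unlist(lapply(result, function(bmr) getBMRPredictions(bmr, drop = TRUE)),
               recursive = FALSE)
names(preds) = make.unique(names(preds))  # learner ids repeat across datasets

df = generateThreshVsPerfData(preds, measures = list(fpr, tpr, mmce))
plotROCCurves(df)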

Can someone please recommend how I can generate a ROC curve for the benchmark result? I have already run the benchmark and obtained results, but in the form of a regis object, which I cannot use with generateThreshVsPerfData(). Hence I would have to work with prediction results instead. Since generating predictions requires trained models, could I simply train the same models on the same datasets again after benchmarking and call predict() on them, as in the sketch below? Is that advisable?
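A minimal per-dataset sketch of that alternative, using mlr's resample() (which keeps predictions by default) instead of refitting and predicting manually; the data.id here is a hypothetical stand-in for any id from clas_used:

library(mlr)
library(OpenML)

# hypothetical example: any data.id from clas_used would do
omldataset = getOMLDataSet(data.id = 31)
task = convertOMLDataSetToMlr(omldataset)

lrn = makeLearner("classif.logreg", predict.type = "prob", fix.factors.prediction = TRUE)
rdesc = makeResampleDesc("RepCV", folds = 5, reps = 10, stratify = TRUE)
res = resample(lrn, task, rdesc, measures = list(acc, auc))  # ResampleResult keeps its predictions

df = generateThreshVsPerfData(res, measures = list(fpr, tpr, mmce))
plotROCCurves(df)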

...