Передача параметров в функцию предсказания в mlr для xgboost - PullRequest
0 голосов
/ 29 июня 2018

Последняя версия xgboost (0.7) позволяет интерпретировать прогнозы, если установить параметр predcontrib в значение TRUE. Я попытался изменить стандартный learner xgboost из mlr так, чтобы вместе с прогнозами возвращались и эти вклады признаков. Вот код (мои единственные добавления находятся между START и END):

#' Construct the `classif.xgboost.c` learner.
#'
#' Declares the hyperparameter set, default parameter values and learner
#' properties for an xgboost classifier whose settings are passed directly
#' to the xgboost call instead of through its `params` argument.
#'
#' `predcontrib` and `approxcontrib` are declared with `when = "predict"`
#' because they are arguments of the prediction call, not of training;
#' parameters left at the default `when` are treated as training-only and
#' are never forwarded to the learner's predict method.
#'
#' @return The learner object created by `makeRLearnerClassif()`.
#' @export
makeRLearner.classif.xgboost.c = function() {
  makeRLearnerClassif(
    cl = "classif.xgboost.c",
    package = "xgboost",
    par.set = makeParamSet(
      # we pass all of what goes in 'params' directly to ... of xgboost
      # makeUntypedLearnerParam(id = "params", default = list()),
      makeDiscreteLearnerParam(id = "booster", default = "gbtree", values = c("gbtree", "gblinear", "dart")),
      makeUntypedLearnerParam(id = "watchlist", default = NULL, tunable = FALSE),
      makeNumericLearnerParam(id = "eta", default = 0.3, lower = 0, upper = 1),
      makeNumericLearnerParam(id = "gamma", default = 0, lower = 0),
      makeIntegerLearnerParam(id = "max_depth", default = 6L, lower = 1L),
      makeNumericLearnerParam(id = "min_child_weight", default = 1, lower = 0),
      makeNumericLearnerParam(id = "subsample", default = 1, lower = 0, upper = 1),
      makeNumericLearnerParam(id = "colsample_bytree", default = 1, lower = 0, upper = 1),
      makeNumericLearnerParam(id = "colsample_bylevel", default = 1, lower = 0, upper = 1),
      makeIntegerLearnerParam(id = "num_parallel_tree", default = 1L, lower = 1L),
      makeNumericLearnerParam(id = "lambda", default = 1, lower = 0),
      makeNumericLearnerParam(id = "lambda_bias", default = 0, lower = 0),
      makeNumericLearnerParam(id = "alpha", default = 0, lower = 0),
      makeUntypedLearnerParam(id = "objective", default = "binary:logistic", tunable = FALSE),
      makeUntypedLearnerParam(id = "eval_metric", default = "error", tunable = FALSE),
      makeNumericLearnerParam(id = "base_score", default = 0.5, tunable = FALSE),
      makeNumericLearnerParam(id = "max_delta_step", lower = 0, default = 0),
      makeNumericLearnerParam(id = "missing", default = NULL, tunable = FALSE, when = "both",
                              special.vals = list(NA, NA_real_, NULL)),
      makeIntegerVectorLearnerParam(id = "monotone_constraints", default = 0, lower = -1, upper = 1),
      makeNumericLearnerParam(id = "tweedie_variance_power", lower = 1, upper = 2, default = 1.5, requires = quote(objective == "reg:tweedie")),
      makeIntegerLearnerParam(id = "nthread", lower = 1L, tunable = FALSE),
      makeIntegerLearnerParam(id = "nrounds", default = 1L, lower = 1L),
      # FIXME nrounds seems to have no default in xgboost(), if it has 1, par.vals is redundant
      makeUntypedLearnerParam(id = "feval", default = NULL, tunable = FALSE),
      makeIntegerLearnerParam(id = "verbose", default = 1L, lower = 0L, upper = 2L, tunable = FALSE),
      makeIntegerLearnerParam(id = "print_every_n", default = 1L, lower = 1L, tunable = FALSE,
                              requires = quote(verbose == 1L)),
      makeIntegerLearnerParam(id = "early_stopping_rounds", default = NULL, lower = 1L, special.vals = list(NULL), tunable = FALSE),
      makeLogicalLearnerParam(id = "maximize", default = NULL, special.vals = list(NULL), tunable = FALSE),
      makeDiscreteLearnerParam(id = "sample_type", default = "uniform", values = c("uniform", "weighted"), requires = quote(booster == "dart")),
      makeDiscreteLearnerParam(id = "normalize_type", default = "tree", values = c("tree", "forest"), requires = quote(booster == "dart")),
      makeNumericLearnerParam(id = "rate_drop", default = 0, lower = 0, upper = 1, requires = quote(booster == "dart")),
      makeNumericLearnerParam(id = "skip_drop", default = 0, lower = 0, upper = 1, requires = quote(booster == "dart")),

      ########## START ##########
      # Prediction-time arguments: 'when = "predict"' routes them to the
      # predict method instead of the training call.
      makeLogicalLearnerParam(id = "predcontrib", default = FALSE, tunable = FALSE, when = "predict"),
      makeLogicalLearnerParam(id = "approxcontrib", default = FALSE, tunable = FALSE, when = "predict"),
      ########### END ##########
      makeUntypedLearnerParam(id = "callbacks", default = list(), tunable = FALSE)
    ),
    par.vals = list(nrounds = 1L, verbose = 0L),
    properties = c("twoclass", "multiclass", "numerics", "prob", "weights", "missings", "featimp"),
    name = "eXtreme Gradient Boosting",
    short.name = "xgboost.c",
    note = "All settings are passed directly, rather than through `xgboost`'s `params` argument. `nrounds` has been set to `1` and `verbose` to `0` by default. `num_class` is set internally, so do not set this manually.",
    callees = "xgboost"
  )
}

#' Train method for the `classif.xgboost.c` learner.
#'
#' Builds the argument list for `xgboost::xgb.train()` from the
#' hyperparameters in `...`, fills in a default objective based on the number
#' of classes, encodes the target as zero-based integers and fits the model.
#'
#' @param .learner The learner object; its `predict.type` is checked against
#'   the objective.
#' @param .task The classification task to train on.
#' @param .subset Subset specification passed on to `getTaskData()`.
#' @param .weights Optional observation weights; attached to the training
#'   matrix when not `NULL`.
#' @param ... Hyperparameters forwarded to `xgboost::xgb.train()`.
#' @return The result of `xgboost::xgb.train()`.
#' @export
trainLearner.classif.xgboost.c = function(.learner, .task, .subset, .weights = NULL,  ...) {
  desc = getTaskDesc(.task)
  xgb.args = list(...)
  n.classes = length(desc$class.levels)

  # Pick an objective matching the number of classes when none was supplied.
  if (is.null(xgb.args$objective)) {
    xgb.args$objective = if (n.classes == 2L) "binary:logistic" else "multi:softprob"
  }

  if (.learner$predict.type == "prob" && xgb.args$objective == "multi:softmax") {
    stop("objective = 'multi:softmax' does not work with predict.type = 'prob'")
  }

  # The multiclass objectives additionally need the number of classes.
  if (xgb.args$objective %in% c("multi:softprob", "multi:softmax")) {
    xgb.args$num_class = n.classes
  }

  split = getTaskData(.task, .subset, target.extra = TRUE)
  # Encode each target value as the zero-based position of its class level.
  zero.based.label = match(as.character(split$target), desc$class.levels) - 1
  xgb.args$data = xgboost::xgb.DMatrix(
    data = data.matrix(split$data),
    label = zero.based.label
  )

  if (!is.null(.weights)) {
    xgboost::setinfo(xgb.args$data, "weight", .weights)
  }

  # Fall back to monitoring the training data when no watchlist was given.
  if (is.null(xgb.args$watchlist)) {
    xgb.args$watchlist = list(train = xgb.args$data)
  }

  do.call(xgboost::xgb.train, xgb.args)
}

#' Predict method for the `classif.xgboost.c` learner.
#'
#' Calls `predict()` on the fitted model and converts the raw output into
#' either a probability matrix whose columns are named after the class
#' levels (`predict.type == "prob"`) or a factor of predicted classes.
#'
#' @param .learner The learner object; `par.vals$objective` and `predict.type`
#'   are read from it.
#' @param .model The wrapped model holding `task.desc` and `learner.model`.
#' @param .newdata Data to predict on; converted with `data.matrix()`.
#' @param ... Further arguments forwarded to `predict()`.
#' @return A matrix with one column per class level, or a factor with the
#'   class levels as levels.
#' @export
predictLearner.classif.xgboost.c = function(.learner, .model, .newdata, ...) {
  class.levels = .model$task.desc$class.levels
  booster = .model$learner.model
  n.classes = length(class.levels)

  # Same default objective as used at training time when none was set.
  objective = .learner$par.vals$objective
  if (is.null(objective)) {
    objective = if (n.classes == 2L) "binary:logistic" else "multi:softprob"
  }

  raw = predict(booster, newdata = data.matrix(.newdata), ...)
  is.binary = n.classes == 2L

  # Multiclass 'multi:softmax' predicts class indices directly, so there is
  # no probability matrix to build.
  if (!is.binary && objective == "multi:softmax") {
    response = as.factor(raw)
    levels(response) = class.levels
    return(response)
  }

  if (is.binary && objective != "multi:softprob") {
    # A single score per row: treat it as the score of the second class.
    probs = matrix(0, ncol = 2, nrow = nrow(.newdata))
    colnames(probs) = class.levels
    probs[, 1L] = 1 - raw
    probs[, 2L] = raw
  } else {
    # Flat vector with n.classes consecutive values per row.
    probs = matrix(raw, nrow = length(raw) / n.classes, ncol = n.classes, byrow = TRUE)
    colnames(probs) = class.levels
  }

  if (.learner$predict.type == "prob") {
    return(probs)
  }

  # Response prediction: the class whose column holds the row maximum.
  level.names = colnames(probs)
  factor(level.names[max.col(probs)], levels = level.names)
}

#' Feature importance for the `classif.xgboost.c` learner.
#'
#' Extracts the fitted model, computes its importance table with
#' `xgboost::xgb.importance()` and returns the `Gain` column named by the
#' `Feature` column.
#'
#' @param .learner The learner object (not used in the body).
#' @param .model The wrapped model; `.model$features` supplies feature names.
#' @param ... Further arguments forwarded to `xgboost::xgb.importance()`.
#' @return The `Gain` values, named by feature.
#' @export
getFeatureImportanceLearner.classif.xgboost.c = function(.learner, .model, ...) {
  booster = getLearnerModel(.model, more.unwrap = TRUE)
  importance = xgboost::xgb.importance(
    model = booster,
    feature_names = .model$features,
    ...
  )

  gain = importance$Gain
  names(gain) = importance$Feature
  gain
}

Вот воспроизводимый пример, показывающий, как я вызываю этот код (после регистрации методов нового learner'а):

library(mlr)
library(dplyr)
library(titanic)

# Fix the RNG seed so the 70/30 split is identical on every run; without it
# the example is not actually reproducible.
set.seed(1)

# Keep the modelling columns and encode Sex numerically (male = 0, else 1).
prepare_titanic <- function(df) {
  df %>%
    select(Pclass, Sex, Age, SibSp, Fare, Survived) %>%
    mutate(Sex = ifelse(Sex == 'male', 0, 1))
}

# Row indices of the training sample. Named 'train.idx' rather than 'sample'
# so the variable does not mask base::sample().
train.idx <- sample.int(n = nrow(titanic_train),
                        size = floor(.7 * nrow(titanic_train)),
                        replace = FALSE)

train <- prepare_titanic(titanic_train[train.idx, ])

# 'tree_method' is passed without a param-set entry, so silence mlr's
# check for hyperparameters without a description.
mlr::configureMlr(on.par.without.desc = "quiet")
lrn <- mlr::makeLearner(cl = 'classif.xgboost.c',
                        predict.type = "prob",
                        fix.factors.prediction = TRUE,
                        tree_method = 'exact',
                        early_stopping_rounds = 5)

lrn <- mlr::makeImputeWrapper(lrn,
                              classes = list(integer = mlr::imputeMedian(),
                                             numeric = mlr::imputeHist(),
                                             factor = mlr::imputeMode()),
                              dummy.classes = "factor")

classif.task <- mlr::makeClassifTask(data = train,
                                     target = "Survived",
                                     positive = "1")
mod = train(lrn, classif.task)

test <- prepare_titanic(titanic_train[-train.idx, ])

# NOTE(review): 'predcontrib' given here does not reach predictLearner (its
# list(...) is empty) -- presumably mlr's predict() ignores extra arguments;
# confirm against the mlr documentation.
pred = predict(mod, newdata = test, predcontrib = TRUE)
calculateConfusionMatrix(pred)

У меня есть следующие два вопроса:

  1. Когда я вызываю функцию predict с параметром predcontrib, этот параметр не передаётся в функцию предсказания learner'а (predictLearner). Если внутри неё проверить list(...), получается список длины 0. Есть ли способ передать параметры в функцию предсказания?
  2. Если внутри learner'а вызвать функцию предсказания дважды — один раз с жёстко заданным параметром predcontrib = TRUE и один раз без него (быстрый и грязный трюк, чтобы пока не менять слишком много кода)

    contrib = predict(m, newdata = data.matrix(.newdata), predcontrib = TRUE, ...)
    p = predict(m, newdata = data.matrix(.newdata), ...)
    

    и вернуть оба

    return(cbind(y, p))
    

    тогда я получаю следующую ошибку:

        Error in checkPredictLearnerOutput(.learner, .model, p) : 
      predictLearner for classif.xgboost.c.imputed has returned not the class 
    levels as column names: 0,1,p
    

Есть ли способ изменить формат результата, который mlr ожидает от функции предсказания learner'а?

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...