Я написал h2o.glm_custom
как «замену» для h2o.glm
, в которой сохраняются модели перекрестной проверки, так что впоследствии можно применить пользовательский критерий выбора, как предлагает @Diegolog. Мой подход использует h2o.grid
. Я попытался включить все параметры для h2o.glm
, но упростил некоторые настройки по умолчанию, чтобы избежать дополнительной работы.
#' Drop-in replacement for h2o.glm that keeps the cross-validation models,
#' so a custom lambda-selection criterion can be applied afterwards.
#'
#' Strategy: fit one lambda-search model with nlambdas = 1 to recover
#' lambda_max, then run h2o.grid over a log-spaced lambda sequence from
#' lambda_max down to lambda_max * lambda_min_ratio.
#'
#' @param x,y,training_frame,... Same arguments and defaults as h2o.glm.
#' @return The H2OGrid object produced by h2o.grid (one GLM per lambda,
#'   each with its cross-validation models retained).
h2o.glm_custom <- function(x,
                           y,
                           training_frame,
                           model_id = NULL,
                           validation_frame = NULL,
                           nfolds = 0,
                           seed = -1,
                           keep_cross_validation_models = TRUE,
                           keep_cross_validation_predictions = FALSE,
                           keep_cross_validation_fold_assignment = FALSE,
                           fold_assignment = "AUTO",
                           fold_column = NULL,
                           random_columns = NULL,
                           ignore_const_cols = TRUE,
                           score_each_iteration = FALSE,
                           offset_column = NULL,
                           weights_column = NULL,
                           family = "binomial",
                           rand_family = c("[gaussian]"),
                           tweedie_variance_power = 0,
                           tweedie_link_power = 1,
                           theta = 1e-10,
                           solver = "AUTO",
                           alpha = 0,
                           early_stopping = TRUE,
                           nlambdas = 100,
                           standardize = TRUE,
                           missing_values_handling = "MeanImputation",
                           plug_values = NULL,
                           compute_p_values = FALSE,
                           remove_collinear_columns = FALSE,
                           intercept = TRUE,
                           non_negative = FALSE,
                           max_iterations = -1,
                           objective_epsilon = -1,
                           beta_epsilon = 1e-04,
                           gradient_epsilon = -1,
                           link = "family_default",
                           rand_link = "[identity]",
                           startval = NULL,
                           calc_like = FALSE,
                           HGLM = FALSE,
                           prior = -1,
                           lambda_min_ratio = 0.01,
                           beta_constraints = NULL,
                           max_active_predictors = -1,
                           obj_reg = -1,
                           export_checkpoints_dir = NULL,
                           balance_classes = FALSE,
                           class_sampling_factors = NULL,
                           max_after_balance_size = 5,
                           max_hit_ratio_k = 0,
                           max_runtime_secs = 0,
                           custom_metric_func = NULL) {
  # Find lambda_max via a one-step lambda search. All arguments are passed
  # by name: the original positional call was fragile against any change in
  # h2o.glm's signature order, and it also partially matched
  # 'class_sampling_factor' (missing trailing 's') — fixed here.
  model <- h2o.glm(x = x,
                   y = y,
                   training_frame = training_frame,
                   model_id = model_id,
                   validation_frame = validation_frame,
                   nfolds = nfolds,
                   seed = seed,
                   keep_cross_validation_models = keep_cross_validation_models,
                   keep_cross_validation_predictions = keep_cross_validation_predictions,
                   keep_cross_validation_fold_assignment = keep_cross_validation_fold_assignment,
                   fold_assignment = fold_assignment,
                   fold_column = fold_column,
                   random_columns = random_columns,
                   ignore_const_cols = ignore_const_cols,
                   score_each_iteration = score_each_iteration,
                   offset_column = offset_column,
                   weights_column = weights_column,
                   family = family,
                   rand_family = rand_family,
                   tweedie_variance_power = tweedie_variance_power,
                   tweedie_link_power = tweedie_link_power,
                   theta = theta,
                   solver = solver,
                   alpha = alpha,
                   lambda = NULL,          # let lambda_search pick the path
                   lambda_search = TRUE,   # required so lambda_best is populated
                   early_stopping = early_stopping,
                   nlambdas = 1,           # a single step is enough to get lambda_max
                   standardize = standardize,
                   missing_values_handling = missing_values_handling,
                   plug_values = plug_values,
                   compute_p_values = compute_p_values,
                   remove_collinear_columns = remove_collinear_columns,
                   intercept = intercept,
                   non_negative = non_negative,
                   max_iterations = max_iterations,
                   objective_epsilon = objective_epsilon,
                   beta_epsilon = beta_epsilon,
                   gradient_epsilon = gradient_epsilon,
                   link = link,
                   rand_link = rand_link,
                   startval = startval,
                   calc_like = calc_like,
                   HGLM = HGLM,
                   prior = prior,
                   lambda_min_ratio = lambda_min_ratio,
                   beta_constraints = beta_constraints,
                   max_active_predictors = max_active_predictors,
                   obj_reg = obj_reg,
                   export_checkpoints_dir = export_checkpoints_dir,
                   balance_classes = balance_classes,
                   class_sampling_factors = class_sampling_factors,
                   max_after_balance_size = max_after_balance_size,
                   max_hit_ratio_k = max_hit_ratio_k,
                   max_runtime_secs = max_runtime_secs,
                   custom_metric_func = custom_metric_func)
  lambda_max <- model@model$lambda_best
  # Build a logarithmically spaced lambda grid down to
  # lambda_max * lambda_min_ratio.
  lambda_min <- lambda_max * lambda_min_ratio
  grid <- exp(seq(log(lambda_max), log(lambda_min), length.out = nlambdas))
  # Each grid entry is a doubly wrapped scalar, reproducing the structure the
  # original built with lapply(sapply(grid, list), list): one length-1 lambda
  # per grid model. (sapply replaced with explicit lapply — same result,
  # type-stable.)
  grid_list <- lapply(grid, function(lam) list(list(lam)))
  hyper_parameters <- list(lambda = grid_list)
  # NOTE(review): only a subset of the GLM arguments is forwarded to the grid
  # search, as in the original; confirm whether the remaining settings
  # (e.g. standardize, early_stopping) should also be forwarded.
  # Returned visibly (the original ended in an assignment, which returns
  # invisibly).
  h2o.grid('glm',
           x = x,
           y = y,
           training_frame = training_frame,
           nfolds = nfolds,
           family = family,
           alpha = alpha,
           ignore_const_cols = ignore_const_cols,
           hyper_params = hyper_parameters,
           seed = seed)
}
Затем можно использовать следующую функцию для выбора значения лямбда на основе ошибки неправильной классификации:
# Summarise a grid's cross-validated misclassification error per lambda.
# For each model in the grid, pulls its lambda and the mean CV 'err' metric,
# and returns a data.table keyed (hence sorted) by lambda.
get_cv_means <- function(grid_results) {
  per_model <- lapply(grid_results@model_ids, function(model_id) {
    fit <- h2o.getModel(model_id)
    cv_err <- fit@model$cross_validation_metrics_summary['err', 'mean']
    data.frame(lambda = fit@parameters$lambda,
               error = as.numeric(cv_err))
  })
  summary_dt <- data.table::rbindlist(per_model)
  data.table::setkey(summary_dt, lambda)
  summary_dt
}
Вот полный пример использования этих функций для выбора значения лямбда с помощью перекрестной проверки на основе ошибки неправильной классификации:
# Worked example: pick lambda by CV misclassification error on the
# prostate dataset shipped with the h2o package.
h2o.init()
path <- system.file("extdata", "prostate.csv", package= "h2o")
h2o_df <- h2o.importFile(path)
# Binomial family requires the response to be a factor.
h2o_df$CAPSULE <- as.factor(h2o_df$CAPSULE)
# Much smaller ratio than the default 0.01, so the grid reaches very
# small lambda values.
lambda_min_ratio <- 0.000001
nlambdas <- 100
nfolds <- 20
# alpha = 1 means pure LASSO penalty.
result <- h2o.glm_custom(x = c("AGE", "RACE", "PSA", "GLEASON"),
y = "CAPSULE",
training_frame = h2o_df,
family = "binomial",
alpha = 1,
nfolds = nfolds,
lambda_min_ratio = lambda_min_ratio,
nlambdas = nlambdas,
early_stopping = TRUE)
# One row per lambda: mean CV misclassification error, sorted by lambda.
tbl <- get_cv_means(result)
Дает:
> head(tbl)
lambda error
1: 2.222376e-07 0.2264758
2: 2.555193e-07 0.2394541
3: 2.937851e-07 0.2380508
4: 3.377814e-07 0.2595451
5: 3.883666e-07 0.2478443
6: 4.465272e-07 0.2595603
Результат можно отобразить на графике и т. д.:
# Error curve for the small-lambda region only.
ggplot(data = tbl[lambda < 0.00001], aes(x = lambda, y = error)) + geom_line()