I am trying to find the best max_depth value using the following code:
library(h2o)
h2o.init()
# import the titanic dataset
df <- h2o.importFile(path = "http://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv")
dim(df)
head(df)
tail(df)
summary(df,exact_quantiles=TRUE)
# pick a response for the supervised problem
response <- "survived"
# the response variable is an integer.
# we will turn it into a categorical/factor for binary classification
df[[response]] <- as.factor(df[[response]])
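# (Not part of the original code: a quick sanity check that the conversion really
# produced a factor. If the response stays numeric, H2O treats the problem as
# regression, h2o.auc() returns NULL and AUC is rejected as a stopping metric.)
h2o.isfactor(df[[response]])   # should print TRUE
h2o.levels(df[[response]])     # expected class labels: "0" and "1"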
# use all other columns (except for the name) as predictors
predictors <- setdiff(names(df), c(response, "name"))
# split the data for machine learning
splits <- h2o.splitFrame(data = df,
ratios = c(0.6,0.2),
destination_frames = c("train.hex", "valid.hex", "test.hex"),
seed = 1234)
train <- splits[[1]]
valid <- splits[[2]]
test <- splits[[3]]
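# (Not part of the original code: a quick check that the 0.6/0.2 ratios produced
# the expected roughly 60/20/20% train/valid/test partition of the rows.)
nrow(train); nrow(valid); nrow(test)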
# Establish a baseline performance using a default GBM model trained on the 60% training split
# We only provide the required parameters, everything else is default
gbm <- h2o.gbm(x = predictors, y = response, training_frame = train)
# Get the AUC on the validation set
h2o.auc(h2o.performance(gbm, newdata = valid))
# The AUC is over 94%, so this model is highly predictive!
# [1] 0.9480135
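# (Not part of the original code: a quick way to see why h2o.auc() can return NULL.
# AUC only exists for binomial models; with a numeric response the performance
# object is of class H2ORegressionMetrics instead of H2OBinomialMetrics.)
class(h2o.performance(gbm, newdata = valid))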
# Determine the best max_depth value to use during a hyper-parameter search.
# Depth 10 is usually plenty of depth for most datasets, but you never know
hyper_params = list( max_depth = seq(1,29,2) )
# or hyper_params = list( max_depth = c(4,6,8,12,16,20) ), which is faster for larger datasets
grid <- h2o.grid(
hyper_params = hyper_params,
# full Cartesian hyper-parameter search
search_criteria = list(strategy = "Cartesian"),
# which algorithm to run
algorithm="gbm",
# identifier for the grid, to later retrieve it
grid_id="depth_grid",
# standard model parameters
x = predictors,
y = response,
training_frame = train,
validation_frame = valid,
# more trees is better if the learning rate is small enough
# here, use "more than enough" trees - we have early stopping
ntrees = 10000,
# smaller learning rate is better, but because we have learning_rate_annealing,
# we can afford to start with a bigger learning rate
learn_rate = 0.05,
# learning rate annealing: learning_rate shrinks by 1% after every tree
# (use 1.00 to disable, but then lower the learning_rate)
learn_rate_annealing = 0.99,
# sample 80% of rows per tree
sample_rate = 0.8,
# sample 80% of columns per split
col_sample_rate = 0.8,
# fix a random number generator seed for reproducibility
seed = 1234,
# early stopping once the validation AUC doesn't improve by at least
# 0.01% for 5 consecutive scoring events
stopping_rounds = 5,
stopping_tolerance = 1e-4,
stopping_metric = "AUC",
# score every 10 trees to make early stopping reproducible
# (it depends on the scoring interval)
score_tree_interval = 10)
# by default, display the grid search results sorted by increasing logloss
# (because this is a classification task)
grid
# sort the grid models by decreasing AUC
sortedGrid <- h2o.getGrid("depth_grid", sort_by="auc", decreasing = TRUE)
sortedGrid
# find the range of max_depth for the top 5 models
topDepths = sortedGrid@summary_table$max_depth[1:5]
minDepth = min(as.numeric(topDepths))
maxDepth = max(as.numeric(topDepths))
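# (Not part of the original code: a sketch of how these results could be used next.)
minDepth; maxDepth   # the max_depth range spanned by the 5 best models
# a follow-up grid could restrict max_depth to seq(minDepth, maxDepth, 1);
# the single best model found so far can also be inspected directly:
bestModel <- h2o.getModel(sortedGrid@model_ids[[1]])
h2o.auc(h2o.performance(bestModel, newdata = valid))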
I get the following errors:
- 'NULL' for the line 'h2o.auc(h2o.performance(gbm, newdata = valid))'
- 'ERRR on field: _stopping_metric: Stopping metric cannot be AUC for regression.' when calling 'h2o.grid'
How can I fix these problems?
The problem is solved by the example code above. The issue was mainly that I had been using data that was already encoded. After switching to the unencoded data, and importing it with "h2o.importFile" instead of "read.csv", both problems were resolved!
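For reference, a minimal sketch of the failure mode and the fix when the data has to come in through a local R data.frame (the file name and variable names here are only illustrative, not from the original code):

# Sketch only: assumes a local copy of the dataset; the file name is hypothetical.
# If the frame reaches H2O with a numeric 0/1 response, GBM is trained as a
# regression model, so h2o.auc() returns NULL and AUC cannot be a stopping metric.
local_df <- read.csv("titanic.csv")                 # plain R data.frame
hf <- as.h2o(local_df)                              # upload it to the H2O cluster
hf[["survived"]] <- as.factor(hf[["survived"]])     # restore the binary classification setup
h2o.isfactor(hf[["survived"]])                      # TRUE -> AUC-based stopping works again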