Поэтому я использую h2o.ai для создания модели биномиальной классификации, однако, когда я использую as.h2o для преобразования моих наборов данных. Он берет заголовок столбца моей целевой переменной, который называется «ПОКУПАТЬ», и добавляет его к уровням, так что вместо двух уровней 1 и 2 он становится тремя уровнями - ПОКУПАТЬ, 1 и 2. Это делает его полиномиальным и не нужным. почини это?
when I run perfH2o this is the output:
H2OMultinomialMetrics: gbm
Test Set Metrics:
=====================
MSE: (Extract with `h2o.mse`) 0.3260208
RMSE: (Extract with `h2o.rmse`) 0.5709823
Logloss: (Extract with `h2o.logloss`) 1.016186
Mean Per-Class Error: 0.2755556
R^2: (Extract with `h2o.r2`) -0.1913934
Confusion Matrix: Extract with `h2o.confusionMatrix(<model>, <data>)`)
=========================================================================
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
BUY NO YES Error Rate
BUY 1 0 0 0.0000 = 0 / 1 #see here it is taking the header and thinking it is a level
NO 0 16 9 0.3600 = 9 / 25
YES 0 7 8 0.4667 = 7 / 15
Totals 1 23 17 0.3902 = 16 / 41
Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>, <data>)`
=======================================================================
Top-3 Hit Ratios:
k hit_ratio
1 1 0.609756
2 2 0.975610
3 3 1.000000
Вот мой код
#Getting packages
#install.packages("dplyr")
library(dplyr)
library(tidyverse)
library(tidyr)
#install.packages("tidyquant") #Used to quickly load the "tidyverse" (dplyr, tidyr, ggplot, etc)
along with custom,
#business-report-friendly ggplot themes. Also great for time series analysis (not featured)
library(tidyquant)
#install.packages("unbalanced")
library(unbalanced)#contains various methods for working with unbalanced data. I will be using
ubSMOTE() function
#installing H20 latest stable release H20 is a professional machine learning package
# The following two commands remove any previously installed H2O packages for R.
#if ("package:h2o" %in% search()) { detach("package:h2o", unload=TRUE) }
#if ("h2o" %in% rownames(installed.packages())) { remove.packages("h2o") }
# Next, we download packages that H2O depends on.
#pkgs <- c("RCurl","jsonlite")
#for (pkg in pkgs) {
# if (! (pkg %in% rownames(installed.packages()))) { install.packages(pkg) }
#}
# Now we download, install and initialize the H2O package for R.
#install.packages("h2o", type="source", repos="http://h2o-release.s3.amazonaws.com/h2o/rel-yule/2/R")
# Finally, let's load H2O and start up an H2O cluster
library(h2o)
h2o.init()
#Now getting the data
ngData <- read.csv(file.choose())
#Now I am going to create my Train, validation, and test set
splitPercentage1 <- .70
splitPercentage2 <- .5
numRows1 <- nrow(ngData)
sampleSize1 <- floor(splitPercentage1*numRows1)
set.seed(1)
idxTrain1 <- sample(1:numRows1, size = sampleSize1)
validationRaw <- ngData[-idxTrain1,]
trainRaw <- ngData[idxTrain1,]
#validation set created now time to make test set out of validation set
numRows2 <- nrow(validationRaw)
sampleSize2 <- floor(splitPercentage2*numRows2)
idxTrain2 <- sample(1:numRows2, size = sampleSize2)
testRaw <- validationRaw[-idxTrain2,]
validationRaw <- validationRaw[idxTrain2,]
#Now I have a randomly set train set, validation set, and test set
View(trainRaw)
View(testRaw)
View(validationRaw)
#all look good however we need our target variable "BUY" to be a factor not numeric
#also Buy = 1 Sell = 0 in the BUY column
trainRaw[,11] <- as.factor(trainRaw[,11])
testRaw[,11] <- as.factor(testRaw[,11])
validationRaw[,11] <- as.factor(validationRaw[,11])
View(trainRaw)
View(testRaw)
View(validationRaw)
#now to balance the data which i don't know if that is very necessary so I
#will check how balanced it is
Buytable <- table(trainRaw$BUY)
Buydistr <- prop.table(Buytable)
Buydistr
#very balanced with 52% sell and 47% buy so no need to balance
h2o.no_progress()
#converting into h2o data frames
trainH20 <- as.h2o(trainRaw)
validH20 <- as.h2o(validationRaw)
testH20 <- as.h2o(testRaw)
#now to find a classification model
y <- "BUY"
x <- setdiff(names(trainH20), y)
automl_models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = trainH20,
validation_frame = validH20,
leaderboard_frame = testH20,
max_runtime_secs = 60
)
#time to extract the leading model
NGLeader <- automl_models_h2o@leader
#making predicitons using h2o.predict()
predH2o <- h2o.predict(NGLeader, newdata = testH20)
as_tibble(predH2o)
#now to check the performance
perfH2o <- h2o.performance(NGLeader, newdata = testH20)
perfH2o
h2o.r2(perfH2o)
#very bad r^2
#turns out my model believes that BUY is one of the possible outcomes of Y so it is multinomial I
must fix that
#######################################################################
Вот проблеск () моих данных:
Строк: 185
Столбцы: 11
$. Месяц апрель, июль, август, август, июль, февраль, сентябрь, январь, март, февраль, июнь, ...
$ Восток.Регион -12, 24 , 26, 21, 19, -43, 25, -43, -15, -9, 27, -28, 26, -27, 22, 23, 32, -54, 21, 12, ...
$ Средний Запад. Регион -20, 20, 36, 29, 16, -47, 35, -38, -7, -4, 35, -31, 45, -27, 22, 29, 27, -56 , 30, 14, -...
$ Mountain.Rgion -4, 6, 4, 3, 2, -6, 3, -10, 2, 0, 9, -2, 5, - 9, 5, 3, 6, -6, 4, 2, -4, 5, 5, 3, -1, -7, ...
$ Pacifi c .Регион 5, 5, 2, 0, -1, -10, 5, -13, 9, -1, 11, -3, 0, -14, 7, 0, 9, -11, 0, -3, -8, 5, 5 , 6, 0 ...
$ South.Central.Rgion 12, 3, 2, -2, -2, -41, 37, -15, 35, 21, 18, 1, 20, - 10, 5, -6, 32, -38, 12, -14, -6, 17 ...
$ Соль 8, -5, -2, -5, -6, -19, 14 , 13, 19, 5, -1, -1, 3, 15, -5, -3, 12, -8, 1, -13, -3, 3, -2 ...
$ NonSalt 3, 7, 4, 4, 3, -22, 22, -28, 18, 16, 18, 3, 17, -25, 10, -4, 19, -29, 11, -2, -3, 15, 1 ...
$ Итого. Нижнее 48 -19, 58, 69, 51, 34, -149, 105, -119, 23, 7, 98, -63, 96, -87, 61, 49, 106, -163 , 67, 1 ...
$ Flow.Change -0,34, -0,06, 0,41, 3,64, -0,47, -0,10, 0,42, -0,51, -1,64, -1,08, -0,15, -0,27, 0,43, ...
$ КУПИТЬ 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1 , 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, ...