Я пытаюсь найти решение для Kaggle TFI Dataset.
Но при построении модели в R с использованием Random Forest я обнаружил следующую ошибку:
Ошибка в [.data.frame
(data, all.vars (Terms), drop = FALSE):
выбраны неопределенные столбцы
Код:
library(Boruta)
library(caret)
train<-read.csv("train.csv")
names(train)
View(train)
str(train)
test<-read.csv("test.csv")
n.train<-nrow(train)
test$revenue<-1
myData<-rbind(train,test)
rm(train,test)
myData$Open.Date<-as.POSIXlt("01/01/2015", format="%m/%d/%Y") -
as.POSIXlt(myData$Open.Date, format="%m/%d/%Y")
myData$Open.Date<-as.numeric(myData$Open.Date/1000)
myData$City<-as.character(myData$City)
myData$City[myData$City.Group=="Other"]<-"Other"
myData$City[myData$City==unique(myData$City)[4]]<-unique(myData$City)[2]
myData$City<-as.factor(myData$City)
myData$City.Group<-NULL
myData$Type<-as.character(myData$Type)
myData$Type[myData$Type=="DT"]<-"IL"
myData$Type[myData$Type=="MB"]<-"FC"
myData$Type<-as.factor(myData$Type)
hist(train$revenue)
hist(log(train$revenue))
myData[,paste("P",1:37,sep="")]<-log(1+myData[,paste("P",1:37,sep="")])
myData$revenue<-log(myData$revenue)
head(myData,2)
important<-Boruta(revenue~.,data=myData[1:n.train,])
important$finalDecision
library(randomForest)
model<-train(myData$revenue[1:n.train]~.,method="rf",
data=myData[1:n.train,c(important$finalDecision!="Rejected",TRUE)])