Я пытаюсь понять и решить проблему с обучением модели XGBoost на R с использованием RStudio. У меня есть набор данных с 1,5M наблюдений (относительно небольшой).
# Select the relevant variables.
# NOTE: the original call listed log_assets and log_employment twice;
# dplyr::select() silently ignores duplicates, but each column is now
# listed exactly once.
df <- firm %>%
  dplyr::select(
    log_assets,
    log_employment,
    log_profits,
    lggdppc,
    lrevenue,
    seq,
    fie,
    soe,
    year,
    b07,
    lrate
  )

# Clean data: drop rows with any missing values, then convert the
# categorical columns to factors in a single pass instead of five
# repetitive assignments.
df <- na.omit(df)
df <- df %>%
  dplyr::mutate(dplyr::across(c(fie, soe, seq, year, b07), factor))
# Split predictors (X) and the target (y = lrate).
set.seed(100)
# Keep labels as a one-column data.frame, as downstream code reads labels$lrate.
labels <- df["lrate"]
# Drop the target by EXACT name. The original used
# df[-grep('lrate', colnames(df))], which removes every column whose
# name merely *contains* "lrate" — fragile if such a column is added.
df <- df[, setdiff(colnames(df), "lrate"), drop = FALSE]

# 75% of the sample size goes to training.
smp_size <- floor(0.75 * nrow(df))

# Set the seed to make the partition reproducible, then draw train indices.
set.seed(123)
train_ind <- sample(seq_len(nrow(df)), size = smp_size)

# Free the large raw data set — it is no longer needed.
rm(firm)
# One-hot encode the factor columns into a sparse design matrix
# (no intercept term --> even sparser).
library(Matrix)
sparse_matrix <- sparse.model.matrix(labels$lrate ~ . - 1, data = df)

# FIX for the RStudio crash: the original code built ONE xgb.DMatrix
# for all 1.5M rows and then row-sliced it (df_dmat[train_ind, ]).
# Slicing a DMatrix materializes a second full copy of the data, which
# roughly doubles peak memory and crashes the session. Instead, subset
# the *sparse matrix* and the label vector first, and build the two
# DMatrix objects directly — the full DMatrix is never created.
y <- labels$lrate
train_dmat <- xgb.DMatrix(sparse_matrix[train_ind, ], label = y[train_ind])
test_dmat  <- xgb.DMatrix(sparse_matrix[-train_ind, ], label = y[-train_ind])
Это потому, что данные слишком велики? Вот скриншот моего окружения.