Какие у вас данные и какого времени выполнения вы ожидаете? В следующем примере со 100 млн строк я получаю такие результаты по времени:
library(data.table)
# Benchmark 1: low-cardinality grouping key (26 levels), 10^8 rows.
# Goal: attach to every row the share of its group's sum(Variable) in the
# grand total of Variable.
#
# Variable is stored as double on purpose: the grand total of 10^8 integers
# drawn from 1..1000 exceeds .Machine$integer.max, and an integer sum() would
# return NA (with a warning), turning every computed share into NA.
df <- data.table(
  Factor = as.factor(sample(LETTERS, size = 10^8, replace = TRUE)),
  Variable = as.numeric(sample(10^3, size = 10^8, replace = TRUE))
)

# NOTE: the timings quoted below were reported for the original version of
# this script; re-measure on your own machine.

# data.table solution 1: aggregate per group, then update-join back onto df.
system.time({
  # The grand total is group-invariant, so compute it once (inside the timed
  # region, so its cost is still counted) instead of once per group.
  total_variable <- sum(df$Variable)
  aux <- df[, .(sumVar = sum(Variable) / total_variable), by = .(Factor)]
  df[aux, sumVar := sumVar, on = "Factor"]
}) # ~10.5 seconds reported

# data.table solution 2: grouped assignment by reference in a single step.
system.time({
  total_variable <- sum(df$Variable)
  df[, sumVar := sum(Variable) / total_variable, by = Factor]
}) # ~8.3 seconds reported

# dplyr solution 1: grouped mutate. Written as plain nested calls because no
# package providing `%>%` is attached in this script.
system.time({
  total_variable <- sum(df$Variable)
  dplyr::mutate(
    dplyr::group_by(df, Factor),
    A = sum(Variable) / total_variable
  )
}) # ~10.0 seconds reported
Обратите внимание: преимущество data.table в скорости становится заметнее по мере роста кардинальности фактора (числа его уровней):
# Benchmark 2: high-cardinality grouping key (~10^6 levels), 10^8 rows.
# Goal: attach to every row the share of its group's sum(Variable) in the
# grand total of Variable.
#
# The key is drawn from 1..10^6 so the factor really has ~10^6 levels.
# Sampling from as.character(10^6) would draw from the single string "1e+06"
# and produce a one-level factor, i.e. no grouping at all.
# Variable is kept as double so the grand total cannot overflow integer range.
df <- data.table(
  Factor = as.factor(sample(10^6, size = 10^8, replace = TRUE)),
  Variable = as.numeric(sample(10^3, size = 10^8, replace = TRUE))
)

# NOTE: the timings quoted below were reported for the original version of
# this script, which effectively had a single group; re-measure before
# drawing conclusions about high-cardinality performance.

# data.table solution 1: aggregate per group, then update-join back onto df.
system.time({
  # The grand total is group-invariant. With ~10^6 groups, recomputing it per
  # group would scan all 10^8 rows a million times, so compute it once
  # (inside the timed region, so its cost is still counted).
  total_variable <- sum(df$Variable)
  aux <- df[, .(sumVar = sum(Variable) / total_variable), by = .(Factor)]
  df[aux, sumVar := sumVar, on = "Factor"]
}) # ~5.0 seconds reported

# data.table solution 2: grouped assignment by reference in a single step.
system.time({
  total_variable <- sum(df$Variable)
  df[, sumVar := sum(Variable) / total_variable, by = Factor]
}) # ~3.1 seconds reported

# dplyr solution 1: grouped mutate. Written as plain nested calls because no
# package providing `%>%` is attached in this script.
system.time({
  total_variable <- sum(df$Variable)
  dplyr::mutate(
    dplyr::group_by(df, Factor),
    A = sum(Variable) / total_variable
  )
}) # ~6.9 seconds reported