Вот еще один вариант:
# Count the distinct ID values of every (A, B, C) group in one ordered pass,
# without a `by` clause. `cols` must name the grouping columns used in
# order(), i.e. c("A", "B", "C").
#
# ID is part of the sort key on purpose: the count below is a number of runs,
# so equal IDs have to be adjacent inside each group to be counted once.
dt[order(A, B, C, ID), {
  # Run id over (grouping columns, ID): grows by one whenever any of them
  # changes from the previous row.
  uniqn <- rleidv(c(.SD, .(ID)))
  # rowidv() restarts at 1 for each group, so a non-positive difference marks
  # the last row of a group; .N closes the final group.
  lastidx <- c(which(diff(rowidv(.SD)) < 1L), .N)
  # Runs inside a group = run id at its last row minus the run id at the
  # last row of the previous group (the first group starts from zero).
  c(.SD[lastidx], .(count = c(uniqn[lastidx[1L]], diff(uniqn[lastidx]))))
}, .SDcols = cols]
код для замера времени:
# Grouping columns shared by every approach below.
cols <- c("A", "B", "C")

# Run each approach once (times = 1L). Every expression also assigns its
# result (a00, a01, a1, a2) so the outputs can be compared afterwards.
microbenchmark(
  times = 1L,
  DATA_TABLE = a00 <- dt[, .(count = uniqueN(ID)), cols],
  DATA_TABLE1 = a01 <- dt[, .(count = length(unique(ID))), cols],
  DPLYR = a1 <- dt %>%
    group_by(A, B, C) %>%
    summarise(count = n_distinct(ID)),
  # Run-length based count. ID is included in the sort key so that equal IDs
  # are adjacent inside a group; otherwise repeated, non-adjacent IDs would
  # be counted more than once.
  mtd2 = a2 <- dt[order(A, B, C, ID), {
    uniqn <- rleidv(c(.SD, .(ID)))
    lastidx <- c(which(diff(rowidv(.SD)) < 1L), .N)
    c(.SD[lastidx], .(count = c(uniqn[lastidx[1L]], diff(uniqn[lastidx]))))
  }, .SDcols = cols]
)

# All approaches must return the same set of rows.
fsetequal(a00, a01)
#[1] TRUE
# a1 is a dplyr result; setDT() converts it to a data.table for comparison.
fsetequal(a01, setDT(a1))
#[1] TRUE
fsetequal(setDT(a1), a2)
#[1] TRUE
время выполнения:
Unit: milliseconds
expr min lq mean median uq max neval
DATA_TABLE 459346.4602 459346.4602 459346.4602 459346.4602 459346.4602 459346.4602 1
DATA_TABLE1 6404.5130 6404.5130 6404.5130 6404.5130 6404.5130 6404.5130 1
DPLYR 3988.2554 3988.2554 3988.2554 3988.2554 3988.2554 3988.2554 1
mtd2 156.3265 156.3265 156.3265 156.3265 156.3265 156.3265 1
данные с 1 миллионом строк:
library(data.table)
library(dplyr)
library(microbenchmark)
# Simulated data: `nr` rows with integer columns A, B, C and D, plus a
# character ID column derived from A, D and C.
set.seed(0L)  # fixed seed so the sample is reproducible
nr <- 1e6
dt <- data.table(
  A = sample(1:424700, nr, replace = TRUE),
  B = sample(1:11900, nr, replace = TRUE),
  C = sample(1:600, nr, replace = TRUE),
  D = sample(1:3000, nr, replace = TRUE)
)
# paste() joins with its default separator " ", so the pieces end up
# separated by " ::: ". The column is added by reference with `:=`.
dt[, ID := paste(A, ":::", D, ":::", C)]