Несколько вариантов с использованием соединения (join); на данных фактического размера любой из них выполняется примерно за 13 с:
# Sort a private copy of `dt` by (y, x) so that, within each x, the first row
# carries the smallest y; `dt` itself is left untouched.
DT <- setorder(copy(dt), y, x)

# Inner lookup: one row per distinct x holding that x's first (smallest) y.
# Outer join: every row of DT whose (x, y) pair matches one of those rows.
DT[
  DT[.(unique(x)), on = .(x), .(y = first(y)), by = .EACHI],
  on = .(x, y)
]
Или, если важен исходный порядок строк:
# Tag each row of a private copy with its original position in `dt`, then
# sort the copy by (y, x) so the first row of every x has the smallest y.
DT2 <- copy(dt)
DT2[, rn := .I]
setorder(DT2, y, x)

# For every distinct x collect the original positions of the rows whose y
# equals that x's first (smallest) y, and pull them from `dt` in ascending
# position order, i.e. in the original row order.
dt[sort(DT2[.(unique(x)), on = .(x), rn[y == first(y)], by = .EACHI][["V1"]])]
А также вариант с использованием min,
упомянутого в исходном вопросе (OP):
# Record each row's position; DT0 is not reordered, so `rn` indexes `dt` too.
DT0[, rn := .I]

# For every distinct x take the positions of the rows with the minimal y and
# select them from `dt` in ascending position order. `na.last = TRUE` keeps
# any NA positions at the end, matching what `order()` does by default.
dt[sort(
  DT0[.(unique(x)), on = .(x), rn[y == min(y)], by = .EACHI]$V1,
  na.last = TRUE
)]
Код для замера времени:
# Baseline loop on a data.frame.
#
# Reads the globals `df`, `y_unique` and `y_list`; all modifications happen on
# function-local copies. For each value in `y_unique`, in order, it keeps the
# rows of the remaining data having that y and then discards every remaining
# row whose x was just kept.
#
# Returns the kept pieces row-bound into a single data.frame.
base <- function() {
  remaining <- df
  pieces <- y_list
  for (y_check in y_unique) {
    key <- as.character(y_check)
    hit <- remaining[remaining$y == y_check, ]
    pieces[[key]] <- hit
    # `%in%` binds tighter than `!`: drop rows whose x is among the kept ones.
    remaining <- remaining[!remaining$x %in% hit$x, ]
  }
  do.call(rbind, pieces)
} # base
# The same loop as the data.frame baseline, expressed with data.table
# subsetting.
#
# Reads the globals `dt`, `y_unique` and `y_list`; the names are only rebound
# locally. For each value in `y_unique`, in order, it keeps the remaining rows
# with that y and then removes every remaining row whose x was just kept.
#
# Returns (invisibly) the kept pieces stacked with rbindlist().
mtd0 <- function() {
  remaining <- dt
  pieces <- y_list
  for (y_check in y_unique) {
    key <- as.character(y_check)
    hit <- remaining[y == y_check]
    pieces[[key]] <- hit
    remaining <- remaining[!x %in% hit$x]
  }
  invisible(rbindlist(pieces))
} # mtd0
# Join-based variant.
#
# Sorts the global `DT` by (y, x) by reference, so within each x the first row
# has the smallest y. It then builds one (x, y) row per distinct x from that
# first y and returns the rows of the global `dt` matching those pairs.
join_mtd <- function() {
  setorder(DT, y, x)
  smallest_y <- DT[.(unique(x)), on = .(x), .(y = first(y)), by = .EACHI]
  dt[smallest_y, on = .(x, y)]
} # join_mtd
# Join-based variant that returns rows in the original order of `dt`.
#
# Uses the global `DT2` (modified by reference) and the global `dt`:
#   1. tags each row of `DT2` with its position `rn`,
#   2. sorts `DT2` by (y, x) so the first row of every x has the smallest y,
#   3. for every distinct x collects the `rn` of rows whose y equals that
#      first y, and selects those positions from `dt` in ascending order.
#
# `rn` is created only when it does not exist yet. This function leaves `DT2`
# sorted, so unconditionally re-running `rn := .I` on a later call (e.g. a
# second benchmark iteration) would replace the original positions with
# positions in the sorted table and `dt[...]` would then return wrong rows.
# Assumes that an existing `rn` column holds positions in `dt`'s row order.
join_mtd2 <- function() {
  if (!"rn" %in% names(DT2)) {
    DT2[, rn := .I]
  }
  setorder(DT2, y, x)
  keep <- DT2[.(unique(x)), on = .(x), rn[y == first(y)], by = .EACHI]$V1
  dt[sort(keep)]
} # join_mtd2
# Join-based variant using min() instead of pre-sorting.
#
# Stores each row's position in column `rn` of the global `DT0` (by
# reference; `DT0` is not reordered here). For every distinct x it collects
# the positions of the rows with the minimal y and returns those rows of the
# global `dt` in ascending position order.
join_mtd3 <- function() {
  DT0[, rn := .I]
  hits <- DT0[.(unique(x)), on = .(x), rn[y == min(y)], by = .EACHI]
  row_ids <- hits[order(V1), V1]
  dt[row_ids]
} # join_mtd3
# Benchmark all five variants. `check = FALSE` turns off bench's comparison of
# the results across expressions; equality is verified separately after
# sorting, since the variants do not all return rows in the same order.
# Each label is paired with its own function: `jm3` runs join_mtd3() (it
# previously re-ran join_mtd2(), so join_mtd3() was never timed).
bench::mark(
  base(),
  data.table_0 = mtd0(),
  jm = join_mtd(),
  jm2 = join_mtd2(),
  jm3 = join_mtd3(),
  check = FALSE
)
проверки:
# Reference answer from the data.frame loop, converted to a data.table, plus
# a copy of it sorted by (y, x) for order-insensitive comparison.
baseans <- setDT(base())
ordbase <- setorder(copy(baseans), y, x)

# Results of the data.table variants.
data.table_0 <- mtd0()
jm <- join_mtd()
jm2 <- join_mtd2()
jm3 <- join_mtd3()

# The loop variants are compared as returned, without sorting.
identical(baseans, data.table_0)
#[1] TRUE

# The join variants are sorted by (y, x) (by reference) before comparing.
identical(ordbase, setorder(jm, y, x))
#[1] TRUE
identical(ordbase, setorder(jm2, y, x))
#[1] TRUE
identical(ordbase, setorder(jm3, y, x))
#[1] TRUE
Результаты замеров времени:
# A tibble: 5 x 14
expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc
<chr> <bch:tm> <bch:tm> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <list> <list>
1 base() 38.59s 38.59s 38.59s 38.59s 0.0259 27.3GB 308 1 38.59s <data.frame [632,329 x ~ <Rprofmem [43,206 x ~ <bch:t~ <tibble [1 x ~
2 data.table_0 24.65s 24.65s 24.65s 24.65s 0.0406 14GB 159 1 24.65s <data.table [632,329 x ~ <Rprofmem [72,459 x ~ <bch:t~ <tibble [1 x ~
3 jm 1.28s 1.28s 1.28s 1.28s 0.779 75MB 7 1 1.28s <data.table [632,329 x ~ <Rprofmem [2,418 x 3~ <bch:t~ <tibble [1 x ~
4 jm2 1.44s 1.44s 1.44s 1.44s 0.696 62.5MB 9 1 1.44s <data.table [632,329 x ~ <Rprofmem [1,783 x 3~ <bch:t~ <tibble [1 x ~
5 jm3 1.57s 1.57s 1.57s 1.57s 0.636 62.5MB 9 1 1.57s <data.table [632,329 x ~ <Rprofmem [178 x 3]> <bch:t~ <tibble [1 x ~
данные:
library(data.table)
library(bench)
# Reproducible synthetic data: `nr` rows, with x drawn from 1..nr and y drawn
# from 1..ni, both with replacement (x is sampled before y).
set.seed(1L)
nr <- 10e6 / 10
ni <- 10e3 / 10
df <- data.frame(
  x = sample(nr, size = nr, replace = TRUE),
  y = sample(ni, size = nr, replace = TRUE)
)

# data.table version of the data, plus one independent copy per join variant,
# because those variants modify their table by reference.
dt <- data.table(df)
DT <- copy(dt)
DT0 <- copy(dt)
DT2 <- copy(dt)

# Distinct y values in ascending order, and a list of empty placeholders
# named after them for the loop-based variants to fill in.
y_unique <- sort(unique(df$y))
y_list <- rep(list(list()), length(y_unique))
names(y_list) <- y_unique