Мы можем расширить фрейм данных и посчитать число таким образом, хотя это может быть не самым эффективным решением.
library(tidyverse)
dat2 <- dat %>%
mutate(level = map2(level, level_max, `:`)) %>%
unnest() %>%
count(item, level)
dat2
# # A tibble: 4 x 3
# item level n
# <fct> <int> <int>
# 1 a 1 2
# 2 a 2 2
# 3 a 3 1
# 4 a 4 1
Для бенчмаркинга мы можем подделать большие данные:
set.seed(1) ; x <- data.frame(
user = rep(seq_len(1e3), each = 26),
item = rep(letters, times = 1e3),
level = sample(1:5, 1e3*26, replace = TRUE),
level_max = sample(5:9, 1e3*26, replace = TRUE)
)
microbenchmark::microbenchmark(
a = do.call(rbind.data.frame, apply(x, 1L, function(x) {
range <- x["level"]:x["level_max"]
data.frame(user = rep(x["user"], length(range)),
item = rep(x["item"], length(range)),
level = range)
})) %>%
group_by(item, level) %>%
summarize(count = n_distinct(user)),
b = mutate(x, level = map2(level, level_max, seq)) %>%
unnest() %>%
count(item, level),
times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# a 33489.1391 36795.5105 36460.3686 37205.2517 37284.5728 37527.3690 5
# b 407.6839 454.4582 461.4137 464.6317 480.3397 499.9549 5
И против подхода Фрэнка:
microbenchmark::microbenchmark(
b = mutate(x, level = map2(level, level_max, seq)) %>%
unnest() %>%
count(item, level)
,
f = x %>% group_by(item) %>%
summarise(level = map2(min(level), max(level_max), seq)) %>%
unnest(level) %>% mutate(n = count_matches(., x, item == item, level <= level, level_max >= level))
,
times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# b 500.72074 534.07906 550.55589 542.21577 570.87849 604.8854 5 b
# f 36.15499 41.06282 46.11696 41.16352 49.21757 62.9859 5 a
# speeds differ because this was run on Frank's comp unlike the first benchmark