Другой вариант, аналогичный подходу Сотоса, но 1) с использованием data.table, 2) без использования factor и 3) с заменой table на Rfast::rowTabulate:
# Task levels and every ordered "<current> <next>" pair of them.
# expand.grid() varies its first argument fastest, which fixes the order of
# the count columns created below.
v <- c("Hard", "Match", "Easy")
vv <- do.call(paste, expand.grid(v, v))

# Add one count column per pair to DT (by reference).
DT[, (vv) := {
  # Paste each task column with the column to its right, giving one
  # "<current> <next>" string per consecutive pair of tasks in every row.
  mat <- mapply(paste, .SD[, -ncol(.SD), with = FALSE], .SD[, -1L])
  # Code every string as its position in vv; strings that are not a valid
  # pair (e.g. one side is NA) get code 0. matrix(nrow = .N) restores the
  # row layout, including the single-row case where mapply() returns a vector.
  codes <- matrix(match(mat, vv, 0L), nrow = .N)
  # Fix max_number so that exactly length(vv) count columns come back even
  # when the highest-numbered pairs never occur in the data; the default,
  # max(x), would then yield fewer columns than there are names in vv.
  as.data.table(Rfast::rowTabulate(codes, max_number = length(vv)))
}, .SDcols = Task_Alpha:Task_Delta]
Результат:
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Hard Match Hard Easy Hard Hard Match Match Match Easy Match Hard Easy Match Easy Easy Easy
1: 3108 -8.00 Easy Easy Easy Easy 0 0 0 0 0 0 0 0 3
2: 3207 3.00 Hard Easy Match Match 0 0 0 0 1 1 1 0 0
3: 3350 5.78 Hard Easy Hard Hard 1 0 1 0 0 0 1 0 0
4: 3961 10.00 Easy <NA> Hard Hard 1 0 0 0 0 0 0 0 0
5: 4021 10.00 Easy Easy <NA> Hard 0 0 0 0 0 0 0 0 1
Данные:
library(data.table)
library(Rfast)
# Example data: one row per user with a numeric score and four
# task-difficulty factors. Task_Beta and Task_Charlie each contain one NA.
DT <- data.frame(
  userID = c(3108L, 3207L, 3350L, 3961L, 4021L),
  Score = c(-8, 3, 5.78, 10, 10),
  Task_Alpha = factor(
    c("Easy", "Hard", "Hard", "Easy", "Easy"),
    levels = c("Easy", "Hard")
  ),
  Task_Beta = factor(
    c("Easy", "Easy", "Easy", NA, "Easy"),
    levels = "Easy"
  ),
  Task_Charlie = factor(
    c("Easy", "Match", "Hard", "Hard", NA),
    levels = c("Easy", "Hard", "Match")
  ),
  Task_Delta = factor(
    c("Easy", "Match", "Hard", "Hard", "Hard"),
    levels = c("Easy", "Hard", "Match")
  )
)
# Turn the data.frame into a data.table by reference.
setDT(DT)
Было бы интересно узнать, насколько быстро этот подход работает на реальном наборе данных и насколько этот набор велик.
Правка: добавлены некоторые замеры времени
library(data.table)
# Benchmark input: nr users, each with four task difficulties sampled with
# replacement from the three levels plus NA.
nr <- 1e6
vec <- c("Hard", "Match", "Easy", NA)
DT <- data.table(
  userID = seq_len(nr),
  Task_Alpha = sample(vec, nr, replace = TRUE),
  Task_Beta = sample(vec, nr, replace = TRUE),
  Task_Charlie = sample(vec, nr, replace = TRUE),
  Task_Delta = sample(vec, nr, replace = TRUE)
)

# Separate inputs for the benchmarked functions: a plain data.frame and
# independent data.table copies, so by-reference updates do not leak between
# methods.
df <- as.data.frame(DT)
DT0 <- copy(DT)
DT1 <- copy(DT)
DT2 <- copy(DT)
# Baseline method: for every row of the global `df`, tabulate the
# "<current> <next>" strings formed by consecutive task columns.
# Returns an integer matrix with one row per row of `df` and one column per
# ordered pair of the levels Easy / Match / Hard.
mtd0 <- function() {
  count_pairs <- function(tasks) {
    lvls <- c("Easy", "Match", "Hard")
    pair_levels <- do.call(paste, expand.grid(lvls, lvls))
    # Pair each task with its right-hand neighbour.
    transitions <- paste(tasks[-length(tasks)], tasks[-1L])
    # Strings outside pair_levels (e.g. those involving NA) become NA in the
    # factor and are therefore not counted by table().
    transitions <- factor(transitions, levels = pair_levels)
    table(transitions)
  }
  # Skip the first column of df, apply row-wise, then transpose so that rows
  # correspond to rows of df.
  t(apply(df[-1L], 1, count_pairs))
}
# Benchmark candidate: reshape-based counting of consecutive task pairs.
# Operates on the global DT0 and modifies it by reference (any factor columns
# are converted to character). Returns the dcast() result: one row per userID
# that survives the NA filter, one count column per two-letter pair code.
mtd1 <- function() {
# Convert any factor columns of DT0 to character, in place.
f_cols <- names(DT0)[ sapply( DT0, is.factor ) ]
DT0[, (f_cols) := lapply(.SD, as.character), .SDcols = f_cols ]
# Melt the columns matching "^Task_" to long format, keyed by userID.
DT.melt <- melt( DT0, id.vars = "userID", measure.vars = patterns( task = "^Task_"))
# Record the position of each task in the sequence Alpha, Beta, Charlie, Delta (1..4).
DT.melt[ grepl( "Alpha", variable ), order := 1 ]
DT.melt[ grepl( "Beta", variable ), order := 2 ]
DT.melt[ grepl( "Charlie", variable ), order := 3 ]
DT.melt[ grepl( "Delta", variable ), order := 4 ]
# Sort DT.melt by user and then by task position (by reference).
setorder( DT.melt, userID, order )
# code1 = first character of this row's value; code2 = first character of the
# same user's next row (shift(..., type = "lead")), i.e. the following task.
DT.melt[, `:=`( code1 = gsub( "(^.).*", "\\1", value ),
code2 = gsub( "(^.).*", "\\1", shift( value, type = "lead" ) ) ),
by = userID ]
# Keep only rows with no NA in any column.
DT.melt <- DT.melt[ complete.cases( DT.melt ) ]
# Cast to wide output: per userID, count the rows for each paste0(code2, code1) label.
dcast( DT.melt, userID ~ paste0( code2, code1 ), fun.aggregate = length )
}
# Benchmark candidate: data.table + Rfast::rowTabulate.
# Adds one count column per ordered "<current> <next>" pair of task levels to
# the global DT2 (by reference) and returns the result of the `:=` call.
mtd2 <- function() {
  v <- c("Hard", "Match", "Easy")
  # All ordered pairs; expand.grid() varies its first argument fastest, which
  # fixes the order of the count columns.
  vv <- do.call(paste, expand.grid(v, v))
  DT2[, (vv) := {
    # Paste each task column with the column to its right, giving one
    # "<current> <next>" string per consecutive pair of tasks in every row.
    mat <- mapply(paste, .SD[, -ncol(.SD), with = FALSE], .SD[, -1L])
    # Code every string as its position in vv; strings that are not a valid
    # pair (e.g. one side is NA) get code 0. matrix(nrow = .N) restores the
    # row layout, including the single-row case.
    codes <- matrix(match(mat, vv, 0L), nrow = .N)
    # Fix max_number so that exactly length(vv) count columns come back even
    # when the highest-numbered pairs never occur in the data; the default,
    # max(x), would then yield fewer columns than there are names in vv.
    as.data.table(Rfast::rowTabulate(codes, max_number = length(vv)))
  }, .SDcols = Task_Alpha:Task_Delta]
}
# Time the three approaches. check=FALSE because the methods return results of
# different types and shapes (see the `result` column of the output), so they
# cannot be compared for equality.
bench::mark(mtd0(), mtd1(), mtd2(), check=FALSE)
Замеры времени:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd0() 2.19m 2.19m 0.00760 252MB 2.26 1 297 2.19m <int[,9] [1,000,000 x 9]> <df[,3] [171,481 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd1() 33.16s 33.16s 0.0302 856MB 0.754 1 25 33.16s <df[,10] [843,688 x 10]> <df[,3] [8,454 x 3]> <bch:tm> <tibble [1 x 3]>
3 mtd2() 844.95ms 844.95ms 1.18 298MB 1.18 1 1 844.95ms <df[,14] [1,000,000 x 14]> <df[,3] [8,912 x 3]> <bch:tm> <tibble [1 x 3]>