используя ваш ответ и комментарии @HenrikB Мне удалось написать более быстрый подход:
## simulate data
nr <- 2291; nc <- 265
set.seed(420)
input_m <- matrix(rnorm(nr * nc), nrow = nr, ncol = nc)
input_m[1:5, 1:5]
# [,1] [,2] [,3] [,4] [,5]
# [1,] -0.76774389 1.2623614 2.44166184 -1.86900934 1.61130129
# [2,] -1.44513238 -0.5469383 -0.31919480 -0.03155421 0.09293325
# [3,] -0.71767075 -0.2753542 2.28792301 0.41545393 -0.47370802
# [4,] 0.06410398 1.4956864 0.06859527 2.19689076 -0.96428109
# [5,] -1.85365878 0.1609678 -0.52191522 -0.79557319 -0.33021108
jaccardLuke <- function(input_m) {
res_m = outer(1:ncol(input_m), 1:ncol(input_m) ,
FUN = Vectorize(function(r,c) {
require(matrixStats)
sum(rowMins(input_m[,c(r,c)]))/sum(rowMaxs(input_m[,c(r,c)]))
})
)
rownames(res_m) = colnames(input_m)
colnames(res_m) = colnames(input_m)
res_m
}
jaccardHenrikB <- function(input_m) {
require(matrixStats)
res_m = outer(1:ncol(input_m), 1:ncol(input_m) ,
FUN = Vectorize(function(r, r2) {
x <- rowRanges(input_m, cols = c(r, r2))
s <- colSums(x)
s[1] / s[2]
})
)
rownames(res_m) = colnames(input_m)
colnames(res_m) = colnames(input_m)
res_m
}
Моя функция:
jaccardMinem <- function(input_m) {
require(data.table)
require(matrixStats)
samples <- 1:ncol(input_m)
comb <- CJ(samples, samples)
comb[, i := .I]
comb <- melt(comb, 'i')
setorder(comb, value)
v2 <- paste0("V", 1:2)
comb[, variable2 := v2 , keyby = i]
comb2 <- dcast(comb, i ~ variable2, value.var = 'value')
combUnique <- unique(comb2, by = c('V1', 'V2'))
XX <- apply(combUnique[, -'i'], 1, function(x) {
x2 <- rowRanges(input_m, cols = x)
s <- colSums2(x2)
s[1] / s[2]
})
set(combUnique, j = 'xx', value = XX)
rez2 <- merge(comb2, combUnique[, -'i'], by = c('V1', 'V2'), all.x = T)
setorder(rez2, i)
rez2 <- array(rez2$xx, dim = rep(ncol(input_m), 2))
rownames(rez2) <- colnames(input_m)
colnames(rez2) <- colnames(input_m)
rez2
}
Проверьте, все ли равны:
all.equal(jaccardLuke(input_m), jaccardHenrikB(input_m))
# [1] TRUE
all.equal(jaccardLuke(input_m), jaccardMinem(input_m))
# [1] TRUE
бенчмаркинг:
system.time(jaccardLuke(input_m)) # 6.05 sek
system.time(jaccardHenrikB(input_m)) # 2.75 sek
system.time(jaccardMinem(input_m)) # 1.74 sek
## for larger data:
nr <- 5000; nc <- 500
set.seed(420)
input_m <- matrix(rnorm(nr * nc), nrow = nr, ncol = nc)
system.time(jaccardLuke(input_m)) # 41.55 sek
system.time(jaccardHenrikB(input_m)) # 19.87 sek
system.time(jaccardMinem(input_m)) # 11.17 sek
главное отличие в том, что я сначала вычисляю уникальные комбинации индексов, для которых нам нужно вычислить значения