Ещё один вариант — неэквивалентное соединение (non-equi join) в data.table:
# Parse the timestamp strings into POSIXct so arithmetic works on them.
DT[, posix_dt := as.POSIXct(posix_dt, format = "%Y-%m-%d %T")]

# Every row gets its own window [start, end]: `end` is the row's timestamp,
# `start` lies 2 * 60 seconds before it.
DT[, `:=`(start = posix_dt - 2 * 60, end = posix_dt)]

# Non-equi self-join: for each row (taken as `i`), collect all rows whose
# posix_dt lies inside that row's [start, end] window and average them.
DT[, c("rm_sentiment", "rm_score") := {
  window_means <- .SD[.SD,
    lapply(.SD, mean),
    on = .(posix_dt >= start, posix_dt <= end),
    by = .EACHI,
    .SDcols = c("sentiment", "score")
  ]
  # The join result leads with the two join columns; only the means are kept.
  window_means[, c("sentiment", "score")]
}]
Вывод:
posix_dt sentiment score start end rm_sentiment rm_score
1: 2019-11-02 08:45:06 0.0000 2 2019-11-02 08:43:06 2019-11-02 08:45:06 0.00000000 2.000000
2: 2019-11-02 08:45:07 0.0000 5 2019-11-02 08:43:07 2019-11-02 08:45:07 0.00000000 3.500000
3: 2019-11-02 08:45:08 0.0201 4 2019-11-02 08:43:08 2019-11-02 08:45:08 0.00670000 3.666667
4: 2019-11-02 08:45:14 0.2732 7 2019-11-02 08:43:14 2019-11-02 08:45:14 0.07332500 4.500000
5: 2019-11-02 08:45:25 0.0000 3 2019-11-02 08:43:25 2019-11-02 08:45:25 0.05866000 4.200000
6: 2019-11-02 08:45:35 0.3182 16 2019-11-02 08:43:35 2019-11-02 08:45:35 0.10191667 6.166667
7: 2019-11-02 08:45:48 0.0000 3 2019-11-02 08:43:48 2019-11-02 08:45:48 0.08735714 5.714286
8: 2019-11-02 08:45:53 -0.3582 6 2019-11-02 08:43:53 2019-11-02 08:45:53 0.03166250 5.750000
9: 2019-11-02 08:46:00 0.4003 6 2019-11-02 08:44:00 2019-11-02 08:46:00 0.06536000 5.900000
10: 2019-11-02 08:46:00 0.0000 7 2019-11-02 08:44:00 2019-11-02 08:46:00 0.06536000 5.900000
11: 2019-11-02 08:46:04 0.0000 4 2019-11-02 08:44:04 2019-11-02 08:46:04 0.05941818 5.727273
12: 2019-11-02 08:46:07 0.0000 2 2019-11-02 08:44:07 2019-11-02 08:46:07 0.05446667 5.416667
13: 2019-11-02 08:46:16 0.4939 0 2019-11-02 08:44:16 2019-11-02 08:46:16 0.08826923 5.000000
14: 2019-11-02 08:46:19 0.0000 2 2019-11-02 08:44:19 2019-11-02 08:46:19 0.08196429 4.785714
15: 2019-11-02 08:46:32 -0.5267 2 2019-11-02 08:44:32 2019-11-02 08:46:32 0.04138667 4.600000
16: 2019-11-02 08:46:49 0.2960 0 2019-11-02 08:44:49 2019-11-02 08:46:49 0.05730000 4.312500
17: 2019-11-02 08:47:05 0.9753 7 2019-11-02 08:45:05 2019-11-02 08:47:05 0.10511667 4.722222
18: 2019-11-02 08:47:05 0.0000 9 2019-11-02 08:45:05 2019-11-02 08:47:05 0.10511667 4.722222
19: 2019-11-02 08:47:07 0.0000 3 2019-11-02 08:45:07 2019-11-02 08:47:07 0.10511667 4.777778
20: 2019-11-02 08:47:10 -0.2960 9 2019-11-02 08:45:10 2019-11-02 08:47:10 0.09270588 5.058824
Данные:
library(data.table)
# Sample data used by the examples: 20 observations supplied as inline CSV
# text to fread(). Columns:
#   posix_dt  - timestamp written as "YYYY-MM-DD HH:MM:SS" (some values repeat)
#   sentiment - decimal value, may be negative
#   score     - whole-number value
# The fields in the literal are padded with spaces around the commas.
DT <- fread("posix_dt,sentiment,score
2019-11-02 08:45:06, 0.0000 , 2
2019-11-02 08:45:07, 0.0000 , 5
2019-11-02 08:45:08, 0.0201 , 4
2019-11-02 08:45:14, 0.2732 , 7
2019-11-02 08:45:25, 0.0000 , 3
2019-11-02 08:45:35, 0.3182 , 16
2019-11-02 08:45:48, 0.0000 , 3
2019-11-02 08:45:53, -0.3582 , 6
2019-11-02 08:46:00, 0.4003 , 6
2019-11-02 08:46:00, 0.0000 , 7
2019-11-02 08:46:04, 0.0000 , 4
2019-11-02 08:46:07, 0.0000 , 2
2019-11-02 08:46:16, 0.4939 , 0
2019-11-02 08:46:19, 0.0000 , 2
2019-11-02 08:46:32, -0.5267 , 2
2019-11-02 08:46:49, 0.2960 , 0
2019-11-02 08:47:05, 0.9753 , 7
2019-11-02 08:47:05, 0.0000 , 9
2019-11-02 08:47:07, 0.0000 , 3
2019-11-02 08:47:10, -0.2960 ,9")
Добавлю ещё один, экспериментальный, подход с использованием скользящего соединения (rolling join):
# Data prep: running totals, row numbers, parsed timestamps and window starts.
cols <- c("sentiment", "score")

# cs_<col> holds the cumulative sum of <col> in the current row order.
DT[, (paste0("cs_", cols)) := lapply(.SD, cumsum), .SDcols = cols]

# rn is the row position; posix_dt is converted from text to POSIXct.
DT[, `:=`(
  rn = .I,
  posix_dt = as.POSIXct(posix_dt, format = "%Y-%m-%d %T")
)]

# start marks the beginning of each row's window: 2 * 60 seconds earlier.
DT[, start := posix_dt - 2 * 60]
# Rolling join: for every row, locate where its window begins.
#
# Each row's `start` is looked up against `posix_dt` of the same table with
# roll = Inf (last observation carried forward), so the matched row is the
# last one at or before `start`. rollends = c(TRUE, TRUE) lets a `start` that
# lies before the first timestamp match the first row instead of yielding NA.
# mult = "first" keeps exactly one match per row when `start` hits a
# duplicated timestamp exactly; otherwise the join would return extra rows and
# the `:=` assignment would fail on a length mismatch.
#
# Columns written:
#   firstrow        - row number (rn) of the first row inside the window
#   first_cs_<col>  - cumulative sum of <col> just before that row
DT[, c("firstrow", paste0("first_cs_", cols)) :=
  .SD[.SD, on = .(posix_dt = start), roll = Inf, rollends = c(TRUE, TRUE),
    mult = "first",
    {
      # Exact hit: the matched row itself is the first row of the window.
      k <- x.posix_dt == i.start
      # The matched row is later than `start`, which only happens when the
      # match was rolled back onto the first row (the window opens before the
      # data begins). Checking "matched row has the first timestamp" is not
      # sufficient: a `start` strictly between the first and second
      # timestamps also matches the first row, yet that row lies outside the
      # window and must not be counted.
      rolled <- x.posix_dt > i.start
      xcs <- mget(paste0("x.cs_", cols))
      xv <- mget(paste0("x.", cols))
      # When the matched row belongs to the window, take its own value back
      # out of the cumulative sum; otherwise the cumulative sum at the matched
      # row already is the total preceding the window.
      firstcs <- Map(function(cs, v) fifelse(k | rolled, cs - v, cs), xcs, xv)
      # The window starts at the matched row on an exact hit, at row 1 when
      # rolled back, and at the row after the matched one otherwise.
      ix <- fifelse(k, x.rn, x.rn + 1L)
      idx <- fifelse(rolled, 1L, ix)
      c(.(idx), firstcs)
    }]]
DT
# Rolling mean per column: (sum over the window) / (number of rows in it).
# The window sum is the difference between the cumulative sum at the current
# row and the cumulative sum just before the window's first row.
DT[, (paste0("rm_", cols)) := {
  n_in_window <- rn - firstrow + 1
  lapply(cols, function(col) {
    window_sum <- get(paste0("cs_", col)) - get(paste0("first_cs_", col))
    window_sum / n_in_window
  })
}]