Попытка 2
Похоже, `fuzzyjoin` даёт много повторяющихся совпадений, поэтому вместо него попробуем перенести нужные данные из `df1` в `df2` с помощью `map` и вложенных тибблов (nested tibbles).
## load library and set data options
library(tidyverse)
# Show fractional seconds to millisecond precision when printing timestamps.
options(digits.secs = 3)

## nest a filtered df1 in df2$df
# For every event row in df2, store (in list-column `df`) the df1 readings that
# come from the same host and fall within +/- 2 seconds of the event timestamp.
# Matching on hostname as well as time prevents readings from one machine
# being attached to an event that happened on another.
# The result stays grouped by the four key columns.
dat <-
  df2 %>%
  group_by(timestamp, hostname, eventName, eventType) %>%
  mutate(
    df = map2(
      timestamp,
      hostname,
      ~ df1 %>%
        filter(
          hostname == .y,
          between(timestamp, .x - 2, .x + 2) # 4 second interval
        )
    )
  )
## walk through list-column df, pulling out the reading(s) nearest in time to
## each event and averaging them when several are equally near
dat %>%
  mutate(
    vals = map2(
      df,
      timestamp,
      function(readings, event_time) {
        if (nrow(readings) == 0) {
          # No reading inside the window: typed NAs keep both columns numeric.
          return(tibble(powerDrawWatt = NA_real_, gpuTempC = NA_real_))
        }
        readings %>%
          # Use the absolute gap. A signed difference would make min() pick the
          # earliest reading in the window instead of the closest one, e.g. an
          # event at 07:41:32.461 must match the 07:41:33.301 reading (0.840 s
          # away), not the 07:41:31.285 reading (1.176 s away).
          mutate(
            gap = abs(as.numeric(difftime(timestamp, event_time, units = "secs")))
          ) %>%
          filter(gap == min(gap)) %>%
          summarise(
            powerDrawWatt = mean(powerDrawWatt),
            gpuTempC = mean(gpuTempC)
          )
      }
    ),
    # Extract one value per row; this does not depend on how the data is
    # grouped, unlike indexing into unlist(vals) by name.
    powerDrawWatt = map_dbl(vals, "powerDrawWatt"),
    gpuTempC = map_dbl(vals, "gpuTempC")
  ) %>%
  ungroup() %>%
  select(-df, -vals)
Вывод
# A tibble: 5 x 8
# Groups: timestamp, eventName, eventType [5]
timestamp hostname eventName eventType jobId taskId powerDrawWatt gpuTempC
<dttm> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
1 2018-11-08 07:41:45.459 04dc4e9647154250beeee51b866b0715000000 TotalRender START 1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67 29.9 34
2 2018-11-08 07:41:45.459 04dc4e9647154250beeee51b866b0715000000 Saving Config START 1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67 29.9 34
3 2018-11-08 07:41:32.460 04dc4e9647154250beeee51b866b0715000000 Render START 1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67 25.8 32
4 2018-11-08 07:41:32.460 04dc4e9647154250beeee51b866b0715000000 Saving Config STOP 1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67 25.8 32
5 2018-11-08 07:42:09.344 04dc4e9647154250beeee51b866b0715000000 Render STOP 1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705 00390eee-c26c-41da-a02d-556bb7fcac67 NA NA
Данные
## create tibbles
# df1: sample GPU readings. Every row shares the same host, GPU serial and GPU
# UUID, so those columns are given once and recycled to the seven timestamps.
df1 <-
  tibble::tibble(
    timestamp = as.POSIXct(
      c(
        "2018-11-08T07:41:27.242Z",
        "2018-11-08T07:41:29.259Z",
        "2018-11-08T07:41:31.285Z",
        "2018-11-08T07:41:33.301Z",
        "2018-11-08T07:41:44.600Z",
        "2018-11-08T07:42:11.500Z",
        "2018-11-08T07:41:35.322Z"
      ),
      format = '%Y-%m-%dT%H:%M:%OSZ',
      tz = 'UTC'
    ),
    hostname = "04dc4e9647154250beeee51b866b0715000000",
    gpuSerial = 323217056165,
    gpuUUID = "GPU-a1119ee9-9cd1-919f-a479-b902142c717d",
    powerDrawWatt = c(25.94, 25.84, 25.84, 25.94, 29.94, 21.94, 25.84),
    gpuTempC = c(32L, 32L, 32L, 32L, 34L, 36L, 32L),
    gpuUtilPerc = 0L,
    gpuMemUtilPerc = 0L
  )
# df2: sample application events. All five rows share one host, job and task,
# so those columns are given once and recycled to the five timestamps.
df2 <-
  tibble::tibble(
    timestamp = as.POSIXct(
      c(
        "2018-11-08T07:41:45.459Z",
        "2018-11-08T07:41:45.459Z",
        "2018-11-08T07:41:32.461Z",
        "2018-11-08T07:41:32.461Z",
        "2018-11-08T07:42:09.344Z"
      ),
      format = '%Y-%m-%dT%H:%M:%OSZ',
      tz = 'UTC'
    ),
    hostname = "04dc4e9647154250beeee51b866b0715000000",
    eventName = c("TotalRender", "Saving Config", "Render", "Saving Config", "Render"),
    eventType = c("START", "START", "START", "STOP", "STOP"),
    jobId = "1024-lvl12-7e026be3-5fd0-48ee-b7d1-abd61f747705",
    taskId = "00390eee-c26c-41da-a02d-556bb7fcac67"
  )
Чтобы округлить временные метки (здесь — до 10 секунд) и усреднить данные внутри каждого интервала:
library(lubridate)
# Snap every reading to the nearest 10-second mark, then average all remaining
# columns within each (timestamp, hostname, gpuSerial, gpuUUID) bucket.
df1 %>%
  mutate(timestamp = lubridate::round_date(timestamp, unit = '10 seconds')) %>%
  group_by(timestamp, hostname, gpuSerial, gpuUUID) %>%
  summarise_all(.funs = mean)
Пример того, насколько сильно агрегирование (округление временных меток) может сократить объём данных:
# Timestamps every 10 ms from 00:00:00.01 to 12:00:01 UTC on 2018-11-08.
sec_interval <- seq(
  as.POSIXct('2018-11-08 00:00:0.01', tz = 'UTC'),
  as.POSIXct('2018-11-08 12:00:01', tz = 'UTC'),
  by = 0.01
)
# Round that same sequence to 10-second marks and keep the distinct values.
# The input must be `sec_interval`; no object called `test` is defined here.
sec_interval_rounded <- unique(lubridate::round_date(sec_interval, '10 seconds'))
> length(sec_interval)
[1] 4320100
> length(sec_interval_rounded)
[1] 4321