Вот подход dplyr:
library(tidyverse)
# Per (doctor, patient) pair: number of visits and the mean gap between
# consecutive visits, in days. The mean gap is the total span from first to
# last visit divided by the number of gaps (visits - 1).
df %>%
  mutate(datetime = lubridate::mdy_hm(datetime)) %>%
  group_by(doctor, patient_no) %>%
  summarize(
    count = n(),
    # A single visit gives a zero span over zero gaps, i.e. 0 / 0 = NaN.
    avg_days_between =
      as.numeric(difftime(max(datetime), min(datetime), units = "days")) /
        (count - 1)
  ) %>%
  ungroup()
## A tibble: 4 x 4
# doctor patient_no count avg_days_between
# <chr> <dbl> <int> <dbl>
#1 dr.amoor 1596 1 NaN
#2 dr.gould 995 1 NaN
#3 dr.gould 1397 1 NaN
#4 dr.kahn 1561 2 266.
Либо можно вычислить интервал между каждой парой соседних визитов и охарактеризовать типичный интервал другой статистикой, например медианой.
# Per (doctor, patient) pair: number of visits and the median gap, in days,
# between consecutive visits.
df %>%
  mutate(datetime = lubridate::mdy_hm(datetime)) %>%
  # lag() looks at the previous *row*, so visits must be in chronological
  # order before gaps are computed.
  arrange(datetime) %>%
  group_by(doctor, patient_no) %>%
  mutate(
    # Gap to the previous visit of the same patient with the same doctor.
    # The first visit in each group has no predecessor, so its gap is NA.
    days_since_last =
      as.numeric(difftime(datetime, lag(datetime), units = "days"))
  ) %>%
  summarize(
    count = n(),
    # Drop the NA of the first visit instead of counting it as a 0-day gap,
    # which would pull the median towards zero. Patients with a single visit
    # have no gaps at all and get NA.
    median_time_between = median(days_since_last, na.rm = TRUE)
  ) %>%
  ungroup()
## A tibble: 4 x 4
# doctor patient_no count median_time_between
# <chr> <dbl> <int> <dbl>
#1 dr.amoor 1596 1 NA
#2 dr.gould 995 1 NA
#3 dr.gould 1397 1 NA
#4 dr.kahn 1561 2 266.
Пример данных:
# Sample data: one row per visit. `datetime` is kept as a month/day/year
# hour:minute string and is parsed later with lubridate::mdy_hm().
df <- tibble::tibble(
  doctor = c("dr.kahn", "dr.gould", "dr.amoor", "dr.gould", "dr.kahn"),
  patient_no = c(1561, 1397, 1596, 995, 1561),
  datetime = c(
    "1/21/19 10:30",
    "2/6/19 12:30",
    "2/11/19 9:00",
    "10/7/19 12:30",
    "10/14/19 9:30"
  )
)