Это не красиво, но вот гибридное решение tidyverse / data.table, которое работает. Он разбивает вещи на компоненты:
- Полное объединение всех возможных комбинаций данных между dt1 и dt2 (по идентификатору).
- Определение типа перекрытия, видимого в каждой строке (есть7 перестановок) и установите перекрывающиеся даты ( _o суффикс).
- Для типов перекрытия требуется различное количество строк, чтобы отобразить объединенные даты начала и окончания. Разверните фрейм данных, чтобы указать количество строк, необходимое для каждого типа перекрытия.
- Создайте объединенные даты ( _c суффикс) на основе типа перекрытия.
- Определитек какому набору данных применяется комбинированный диапазон дат ( enroll_type = dt1, dt2 или оба), а затем отбрасывает строки из одного источника (dt1 / dt2), которые полностью охватываются enroll_type типа 'both'.
- Из-за сортировки по ID + дате ранее вы можете использовать опережение / отставание для усечения комбинированных дат, чтобы ни одна дата не покрывалась более чем одной startdate_c - enddate_c span.
Возможно, вы найдете способы сделать это более элегантным и эффективным.
library(data.table)
library(tidyr)
#create test data ----
dt1<- data.table(id = rep(1111, 4),
from_date = as.Date(c("2016-01-01", "2016-03-31","2016-09-02", "2016-09-03")),
to_date = as.Date(c("2016-03-15", "2016-09-01", "2016-09-02", "2016-12-15")),
progs = c("a1", "b1", "c1", "d1"))
setkey(dt1, id, from_date, to_date)
dt2<- data.table(id = rep(1111, 4),
from_date = as.Date(c("2016-02-01", "2016-04-01","2016-11-01", "2016-12-01")),
to_date = as.Date(c("2016-02-28", "2016-09-30", "2016-11-30", "2016-12-31")),
progs = c("a2", "b2", "c2", "d2"))
setkey(dt2, id, from_date, to_date)
# create all possible matches between time segments ----
dt <- setDT(mutate(dt1) %>% full_join(., dt2, by = c("id")) )
#dt[, c("progs.y", "progs.x") := NULL]
#setnames(dt, names(dt), c("id", "startdate_dt1", "enddate_dt1", "startdate_dt2", "enddate_dt2"))
setnames(dt, names(dt), c("id", "startdate_dt1", "enddate_dt1", "progs1", "startdate_dt2", "enddate_dt2", "progs2"))
# set up intervals ----
temp <- dt %>%
mutate(overlap_type = case_when(
# First ID the non-matches
is.na(startdate_dt1) | is.na(startdate_dt2) ~ 0,
# Then figure out which overlapping date comes first
# Exactly the same dates
startdate_dt1 == startdate_dt2 & enddate_dt1 == enddate_dt2 ~ 1,
# dt1 before dt2 (or exactly the same dates)
startdate_dt1 <= startdate_dt2 & startdate_dt2 <= enddate_dt1 &
enddate_dt1 <= enddate_dt2 ~ 2,
# dt2 before dt1
startdate_dt2 <= startdate_dt1 & startdate_dt1 <= enddate_dt2 &
enddate_dt2 <= enddate_dt1 ~ 3,
# dt2 dates competely within dt1 dates or vice versa
startdate_dt2 >= startdate_dt1 & enddate_dt2 <= enddate_dt1 ~ 4,
startdate_dt1 >= startdate_dt2 & enddate_dt1 <= enddate_dt2 ~ 5,
# dt1 coverage only before dt2 (or dt2 only after dt1)
startdate_dt1 < startdate_dt2 & enddate_dt1 < startdate_dt2 ~ 6,
# dt1 coverage only after dt2 (or dt2 only before dt1)
startdate_dt1 > enddate_dt2 & enddate_dt1 > enddate_dt2 ~ 7,
# Any rows that are left
TRUE ~ 8),
# Calculate overlapping dates
startdate_o = as.Date(case_when(
overlap_type %in% c(1, 2, 4) ~ startdate_dt2,
overlap_type %in% c(3, 5) ~ startdate_dt1), origin = "1970-01-01"),
enddate_o = as.Date(ifelse(overlap_type %in% c(1:5),
pmin(enddate_dt2, enddate_dt1),
NA), origin = "1970-01-01"),
# Need to duplicate rows to separate out non-overlapping dt1 and dt2 periods
repnum = case_when(
overlap_type %in% c(2:5) ~ 3,
overlap_type %in% c(6:7) ~ 2,
TRUE ~ 1)
) %>%
select(id, startdate_dt1, enddate_dt1, startdate_dt2, enddate_dt2,
startdate_o, enddate_o, overlap_type, repnum) %>%
arrange(id, startdate_dt1, startdate_dt2, startdate_o,
enddate_dt1, enddate_dt2, enddate_o)
### Expand out rows to separate out overlaps ----
temp_ext <- temp[rep(seq(nrow(temp)), temp$repnum), 1:ncol(temp)]
## process expanded ----
temp_ext <- temp_ext %>%
group_by(id, startdate_dt1, enddate_dt1, startdate_dt2, enddate_dt2) %>%
mutate(rownum_temp = row_number()) %>%
ungroup() %>%
arrange(id, startdate_dt1, enddate_dt1, startdate_dt2, enddate_dt2, startdate_o,
enddate_o, overlap_type, rownum_temp) %>%
mutate(
# Remove non-overlapping dates
startdate_dt1 = as.Date(ifelse((overlap_type == 6 & rownum_temp == 2) |
(overlap_type == 7 & rownum_temp == 1),
NA, startdate_dt1), origin = "1970-01-01"),
enddate_dt1 = as.Date(ifelse((overlap_type == 6 & rownum_temp == 2) |
(overlap_type == 7 & rownum_temp == 1),
NA, enddate_dt1), origin = "1970-01-01"),
startdate_dt2 = as.Date(ifelse((overlap_type == 6 & rownum_temp == 1) |
(overlap_type == 7 & rownum_temp == 2),
NA, startdate_dt2), origin = "1970-01-01"),
enddate_dt2 = as.Date(ifelse((overlap_type == 6 & rownum_temp == 1) |
(overlap_type == 7 & rownum_temp == 2),
NA, enddate_dt2), origin = "1970-01-01")) %>%
distinct(id, startdate_dt1, enddate_dt1, startdate_dt2, enddate_dt2, startdate_o,
enddate_o, overlap_type, rownum_temp, .keep_all = TRUE) %>%
# Remove first row if start dates are the same or dt1 is only one day
filter(!(overlap_type %in% c(2:5) & rownum_temp == 1 &
(startdate_dt1 == startdate_dt2 | startdate_dt1 == enddate_dt1))) %>%
# Remove third row if enddates are the same
filter(!(overlap_type %in% c(2:5) & rownum_temp == 3 & enddate_dt1 == enddate_dt2))
## Calculate the finalized date columms----
### Calculate finalized date columns
temp_ext <- temp_ext %>%
# Set up combined dates
mutate(
# Start with rows with only dt1 or dt2, or when both sets of dates are identical
startdate_c = as.Date(
case_when(
(!is.na(startdate_dt1) & is.na(startdate_dt2)) | overlap_type == 1 ~ startdate_dt1,
!is.na(startdate_dt2) & is.na(startdate_dt1) ~ startdate_dt2), origin = "1970-01-01"),
enddate_c = as.Date(
case_when(
(!is.na(enddate_dt1) & is.na(enddate_dt2)) | overlap_type == 1 ~ enddate_dt1,
!is.na(enddate_dt2) & is.na(enddate_dt1) ~ enddate_dt2), origin = "1970-01-01"),
# Now look at overlapping rows and rows completely contained within the other data's dates
startdate_c = as.Date(
case_when(
overlap_type %in% c(2, 4) & rownum_temp == 1 ~ startdate_dt1,
overlap_type %in% c(3, 5) & rownum_temp == 1 ~ startdate_dt2,
overlap_type %in% c(2:5) & rownum_temp == 2 ~ startdate_o,
overlap_type %in% c(2:5) & rownum_temp == 3 ~ enddate_o + 1,
TRUE ~ startdate_c), origin = "1970-01-01"),
enddate_c = as.Date(
case_when(
overlap_type %in% c(2:5) & rownum_temp == 1 ~ lead(startdate_o, 1) - 1,
overlap_type %in% c(2:5) & rownum_temp == 2 ~ enddate_o,
overlap_type %in% c(2, 5) & rownum_temp == 3 ~ enddate_dt2,
overlap_type %in% c(3, 4) & rownum_temp == 3 ~ enddate_dt1,
TRUE ~ enddate_c), origin = "1970-01-01"),
# Deal with the last line for each person if it's part of an overlap
startdate_c = as.Date(ifelse((id != lead(id, 1) | is.na(lead(id, 1))) &
overlap_type %in% c(2:5) &
enddate_dt1 != enddate_dt2,
lag(enddate_o, 1) + 1,
startdate_c), origin = "1970-01-01"),
enddate_c = as.Date(ifelse((id != lead(id, 1) | is.na(lead(id, 1))) &
overlap_type %in% c(2:5),
pmax(enddate_dt1, enddate_dt2, na.rm = TRUE),
enddate_c), origin = "1970-01-01")
) %>%
arrange(id, startdate_c, enddate_c, startdate_dt1, startdate_dt2,
enddate_dt1, enddate_dt2, overlap_type) %>%
mutate(
# Identify which type of enrollment this row represents
enroll_type =
case_when(
(overlap_type == 2 & rownum_temp == 1) |
(overlap_type == 3 & rownum_temp == 3) |
(overlap_type == 6 & rownum_temp == 1) |
(overlap_type == 7 & rownum_temp == 2) |
(overlap_type == 4 & rownum_temp %in% c(1, 3)) |
(overlap_type == 0 & is.na(startdate_dt2)) ~ "dt1",
(overlap_type == 3 & rownum_temp == 1) |
(overlap_type == 2 & rownum_temp == 3) |
(overlap_type == 6 & rownum_temp == 2) |
(overlap_type == 7 & rownum_temp == 1) |
(overlap_type == 5 & rownum_temp %in% c(1, 3)) |
(overlap_type == 0 & is.na(startdate_dt1)) ~ "dt2",
overlap_type == 1 | (overlap_type %in% c(2:5) & rownum_temp == 2) ~ "both",
TRUE ~ "x"
),
# Drop rows from enroll_type == h/m when they are fully covered by an enroll_type == b
drop =
case_when(
id == lag(id, 1) & !is.na(lag(id, 1)) &
startdate_c == lag(startdate_c, 1) & !is.na(lag(startdate_c, 1)) &
enddate_c >= lag(enddate_c, 1) & !is.na(lag(enddate_c, 1)) &
# Fix up quirk from dt1 data where two rows present for the same day
!(lag(enroll_type, 1) != "dt2" & lag(enddate_dt1, 1) == lag(startdate_dt1, 1)) &
enroll_type != "both" ~ 1,
id == lead(id, 1) & !is.na(lead(id, 1)) &
startdate_c == lead(startdate_c, 1) & !is.na(lead(startdate_c, 1)) &
enddate_c <= lead(enddate_c, 1) & !is.na(lead(enddate_c, 1)) &
# Fix up quirk from dt1 data where two rows present for the same day
!(lead(enroll_type, 1) != "dt2" & lead(enddate_dt1, 1) == lead(startdate_dt1, 1)) &
enroll_type != "both" & lead(enroll_type, 1) == "both" ~ 1,
# Fix up other oddities when the date range is only one day
id == lag(id, 1) & !is.na(lag(id, 1)) &
startdate_c == lag(startdate_c, 1) & !is.na(lag(startdate_c, 1)) &
startdate_c == enddate_c & !is.na(startdate_c) &
((enroll_type == "dt2" & lag(enroll_type, 1) %in% c("both", "dt1")) |
(enroll_type == "dt1" & lag(enroll_type, 1) %in% c("both", "dt2"))) ~ 1,
id == lag(id, 1) & !is.na(lag(id, 1)) &
startdate_c == lag(startdate_c, 1) & !is.na(lag(startdate_c, 1)) &
startdate_c == enddate_c & !is.na(startdate_c) &
startdate_dt1 == lag(startdate_dt1, 1) & enddate_dt1 == lag(enddate_dt1, 1) &
!is.na(startdate_dt1) & !is.na(lag(startdate_dt1, 1)) &
enroll_type != "both" ~ 1,
id == lead(id, 1) & !is.na(lead(id, 1)) &
startdate_c == lead(startdate_c, 1) & !is.na(lead(startdate_c, 1)) &
startdate_c == enddate_c & !is.na(startdate_c) &
((enroll_type == "dt2" & lead(enroll_type, 1) %in% c("both", "dt1")) |
(enroll_type == "dt1" & lead(enroll_type, 1) %in% c("both", "dt2"))) ~ 1,
# Drop rows where the enddate_c < startdate_c due to
# both data sources' dates ending at the same time
enddate_c < startdate_c ~ 1,
TRUE ~ 0
)
) %>%
filter(drop == 0 | is.na(drop)) %>%
# Truncate remaining overlapping end dates
mutate(enddate_c = as.Date(
ifelse(id == lead(id, 1) & !is.na(lead(startdate_c, 1)) &
startdate_c < lead(startdate_c, 1) &
enddate_c >= lead(enddate_c, 1),
lead(startdate_c, 1) - 1,
enddate_c),
origin = "1970-01-01")
) %>%
select(-drop, -repnum, -rownum_temp) %>%
# With rows truncated, now additional rows with enroll_type == h/m that
# are fully covered by an enroll_type == b
# Also catches single day rows that now have enddate < startdate
mutate(
drop = case_when(
id == lag(id, 1) & startdate_c == lag(startdate_c, 1) &
enddate_c == lag(enddate_c, 1) & lag(enroll_type, 1) == "both" &
enroll_type != "both" ~ 1,
id == lead(id, 1) & startdate_c == lead(startdate_c, 1) &
enddate_c <= lead(enddate_c, 1) & lead(enroll_type, 1) == "both" ~ 1,
id == lag(id, 1) & startdate_c >= lag(startdate_c, 1) &
enddate_c <= lag(enddate_c, 1) & enroll_type != "both" &
lag(enroll_type, 1) == "both" ~ 1,
id == lead(id, 1) & startdate_c >= lead(startdate_c, 1) &
enddate_c <= lead(enddate_c, 1) & enroll_type != "both" &
lead(enroll_type, 1) == "both" ~ 1,
TRUE ~ 0)
) %>%
filter(drop == 0 | is.na(drop)) %>%
select(id, startdate_c, enddate_c, enroll_type)