Недавно я столкнулся с похожей задачей и обнаружил, что проще развернуть таблицу так, чтобы на каждую релевантную дату приходилась одна строка, а затем свернуть результат обратно в диапазоны, чем пытаться вывести правильную логику из одних только границ диапазонов.
Вот как работает этот подход. В качестве альтернативы, возможно, подойдёт что-то вроде this или this, но те подходы не рассчитаны на задачу поиска дат, «не входящих в диапазон», с которой вы имеете дело.
library(dplyr)
library(fuzzyjoin)
library(lubridate)
# Sample input: each row is one date range (endpoints kept as strings) that
# belongs to an id.
df <- data.frame(
  id = rep(c("a", "b"), times = c(3, 2)),
  start = c("2017-01-01", "2017-01-10", "2017-02-10", "2017-03-01", "2017-04-20"),
  end = c("2017-01-15", "2017-01-20", "2017-02-20", "2017-03-28", "2017-04-29")
)
# Calendar table with one row per day of 2017.
all.2017.dates <- data.frame(
  date = seq(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day")
)
# Explode every range into its individual days, so the result holds one row
# per id per date covered by any of that id's ranges.
df.expanded <- df %>%
  # Parse the string endpoints into Date values so they can be compared with
  # the calendar dates.
  mutate(start.date = as.Date(start), end.date = as.Date(end)) %>%
  # Attach each 2017 day that satisfies start.date <= date and end.date >= date.
  fuzzy_left_join(
    all.2017.dates,
    by = c("start.date" = "date", "end.date" = "date"),
    match_fun = list(`<=`, `>=`)
  ) %>%
  # Overlapping ranges yield repeated id/date pairs; keep each pair once and
  # drop every other column.
  distinct(id, date)
# Now, do an anti-join that gets dates NOT in an id's ranges, and collapse back
# down to ranges. The result has one row per uncovered range, with its first
# day in start.date and its last day in end.date.
df.final <- expand.grid(id = unique(df$id),
                        date = all.2017.dates$date) %>%
  # Anti-join on id and date: keep only the id/date pairs that no range covers.
  anti_join(df.expanded,
            by = c("id", "date")) %>%
  # Sort by id, then date, so that the lead/lag functions behave as expected.
  arrange(id, date) %>%
  # Check whether this record is an endpoint (i.e., is it adjacent to the
  # previous/next record of the SAME id?). The "next" check must compare
  # against lead(id); comparing against lag(id) mislabels the first row of
  # every id and can link the last row of one id to the next id.
  # coalesce() turns the NA produced at the first/last row into FALSE.
  mutate(prev.day.included = coalesce(date == lag(date) + 1 &
                                        id == lag(id), FALSE),
         next.day.included = coalesce(date == lead(date) - 1 &
                                        id == lead(id), FALSE)) %>%
  # Filter to just endpoint records.
  filter(!prev.day.included | !next.day.included) %>%
  # Fill in both start and end dates on "start" records. The start date is the
  # date in the record. If the gap continues past this day, its end is the
  # next remaining record (the gap's closing endpoint); if the gap is a single
  # day, this record is its own end, so do not borrow the next gap's date.
  # if_else() keeps the Date class, so no numeric round trip is needed.
  mutate(start.date = if_else(!prev.day.included, date, as.Date(NA)),
         end.date = if_else(!prev.day.included,
                            if_else(next.day.included, lead(date), date),
                            as.Date(NA))) %>%
  # Drop the pure "end" records; their date now lives on the start record.
  filter(!is.na(start.date))