Один из способов исправить это поведение — добавить `complete` в конце:
library(dplyr)
library(tidyr)
# Join the two tables on the shared `subject` key (merge keeps only matching
# rows by default), then expand the result to the seven expected subject
# labels; missing `flag` values are filled with 0.
final_df <- merge(df_1, df_2, by = "subject")
complete(
  final_df,
  subject = paste("Subject", 1:7),
  fill = list(flag = 0)
)
# A tibble: 7 x 3
# subject first_icu flag
# <chr> <int> <dbl>
#1 Subject 1 NA 0
#2 Subject 2 NA 0
#3 Subject 3 154 1
#4 Subject 4 NA 0
#5 Subject 5 571 1
#6 Subject 6 NA 0
#7 Subject 7 298 1
Или использовать `all = TRUE` в `merge`:
# Full outer join on `subject`: all = TRUE keeps the rows of both inputs,
# including those that have no match in the other table.
merge(x = df_1, y = df_2, by = "subject", all = TRUE)
# subject first_icu flag
#1 Subject 1 NA 0
#2 Subject 2 NA 0
#3 Subject 3 154 1
#4 Subject 4 NA 0
#5 Subject 5 571 1
#6 Subject 6 NA 0
#7 Subject 7 298 1
Или, если нужно сделать это более компактным способом:
library(stringr)
# Per-subject summary computed straight from `dat`:
#   first_icu - smallest transfer_time among rows whose level matches
#               "Intensive" (NA when the subject has no such row)
#   flag      - 1 if any row matches both "ICU" in environment and
#               "Intensive" in level, otherwise 0
dat %>%
  # Helper column so the "Intensive" match is computed once; summarise()
  # drops it from the result.
  mutate(intensive = str_detect(level, "Intensive")) %>%
  group_by(subject) %>%
  summarise(
    # min() over an empty selection yields Inf; it is mapped to NA below.
    first_icu = min(transfer_time[intensive], na.rm = TRUE),
    flag = as.integer(sum(str_detect(environment, "ICU") & intensive) > 0)
  ) %>%
  mutate(first_icu = na_if(first_icu, Inf))
В base R мы можем сделать так:
# Base R version: split `dat` by subject, build a one-row data frame per
# subject, and stack the pieces with rbind.
out <- do.call(rbind, lapply(split(dat, dat$subject), function(x) {
  # Rows of this subject whose level matches "Intensive".
  intensive <- grepl("Intensive", x$level)
  data.frame(
    subject = x$subject[1],
    # min() over an empty selection yields Inf; replaced by NA afterwards.
    first_icu = min(x$transfer_time[intensive], na.rm = TRUE),
    # Count of rows matching both patterns; collapsed to 0/1 afterwards.
    flag = sum(grepl("ICU", x$environment) & intensive)
  )
}))
# Subjects without any "Intensive" row have no first ICU time.
out$first_icu[is.infinite(out$first_icu)] <- NA
# Turn the match count into a 0/1 indicator.
out$flag <- as.integer(out$flag > 0)