У меня есть набор данных, df, который выглядит следующим образом:
Мой код работает хорошо, но я хотел бы обозначать группы числами или буквенно-цифровыми метками вместо букв, потому что у меня большой набор данных. Я также хотел бы сохранить строки с пустым полем Subject. Как изменить код для этого? Я знаю, что нужно изменить следующие строки, но не уверен, что именно здесь следует сделать:
mutate(group = LETTERS[cumsum(new_group)])
filter(Subject != '') %>%
subject recipient length folder message date edit
80 out 1/2/2020 1:00:01 AM T
80 out 1/2/2020 1:00:05 AM T
hey sarah@mail.com,g@mail.com 80 out 1/2/2020 1:00:10 AM T
hey sarah@mail.com,g@mail.com 80 out 1/2/2020 1:00:15 AM T
hey sarah@mail.com,g@mail.com 80 out 1/2/2020 1:00:30 AM T
some k 900 in jjjjj 1/2/2020 1:00:35 AM F
some k 900 in jjjjj 1/2/2020 1:00:36 AM F
some k 900 in jjjjj 1/2/2020 1:00:37 AM F
hey sarah@mail.com,g@mail.com 80 draft 1/2/2020 1:02:00 AM T
hey sarah@mail.com,g@mail.com 80 draft 1/2/2020 1:02:05 AM T
no a 900 in iii 1/2/2020 1:02:10 AM F
no a 900 in iii 1/2/2020 1:02:15 AM F
no a 900 in iii 1/2/2020 1:02:20 AM F
no a 900 in iii 1/2/2020 1:02:25 AM F
hey sarah@mail.com,g@mail.com 80 draft 1/2/2020 1:03:00 AM T
hey sarah@mail.com,g@mail.com 80 draft 1/2/2020 1:03:20 AM T
Желаемый вывод:
Start End Duration Group Subject Length
1/2/2020 1:00:01 AM 1/2/2020 1:00:30 AM 29 A hey 80
1/2/2020 1:02:00 AM 1/2/2020 1:02:05 AM 5 A hey 80
1/2/2020 1:03:00 AM 1/2/2020 1:03:20 AM 20 A hey 80
Вот вывод dput() для этих данных:
# dput() output for the example data frame: 18 rows (row.names = c(NA, -18L))
# with the columns Subject, Recipient, Length, Folder, Message, Date and Edit.
# Subject, Recipient, Folder, Message and Date are stored as factors; Length
# is integer and Edit is logical.
# Rows 1-2 and 17-18 have a blank Subject; rows 17-18 additionally have a
# blank Folder and Date and NA in Length and Edit.
# NOTE: the value is not assigned to a name here, while the pipeline further
# down reads it as `df` -- presumably it should be run as `df <- structure(...)`.
structure(list(Subject = structure(c(1L, 1L, 2L, 2L, 2L, 4L,
4L, 4L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 2L, 1L, 1L), .Label = c("",
"hey", "no", "some"), class = "factor"), Recipient = structure(c(1L,
1L, 5L, 5L, 5L, 4L, 4L, 4L, 5L, 5L, 3L, 3L, 3L, 3L, 5L, 5L, 1L,
2L), .Label = c("", " ", "a", "k", "sarah@mail.com,gee@mail.com"
), class = "factor"), Length = c(80L, 80L, 80L, 80L, 80L, 900L,
900L, 900L, 80L, 80L, 900L, 900L, 900L, 900L, 80L, 80L, NA, NA
), Folder = structure(c(4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 2L, 2L,
3L, 3L, 3L, 3L, 2L, 2L, 1L, 1L), .Label = c("", "draft", "in",
"out"), class = "factor"), Message = structure(c(1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L), .Label = c("",
"jjjjjjj", "llll"), class = "factor"), Date = structure(c(2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 1L, 1L), .Label = c("", "1/2/2020 1:00:01 AM", "1/2/2020 1:00:05 AM",
"1/2/2020 1:00:10 AM", "1/2/2020 1:00:15 AM", "1/2/2020 1:00:30 AM",
"1/2/2020 1:00:35 AM", "1/2/2020 1:00:36 AM", "1/2/2020 1:00:37 AM",
"1/2/2020 1:02:00 AM", "1/2/2020 1:02:05 AM", "1/2/2020 1:02:10 AM",
"1/2/2020 1:02:15 AM", "1/2/2020 1:02:20 AM", "1/2/2020 1:02:25 AM",
"1/2/2020 1:03:00 AM", "1/2/2020 1:03:20 AM"), class = "factor"),
Edit = c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE,
TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, NA, NA
)), class = "data.frame", row.names = c(NA, -18L))
Это код, который я использую:
# Summarise contiguous "editing" runs in `df` into one row per run.
#
# A row counts as editing when Edit is TRUE, Folder is "out" or "draft" and
# Message is empty. Every non-editing row bumps the `segment` counter, so
# consecutive editing rows share a segment. Each segment is reduced to its
# Start, End and Duration (in seconds), and each distinct
# Subject/Recipient/Length combination gets an integer `group` id, so the
# number of groups is no longer capped at the 26 entries of LETTERS.
library(dplyr)

df1 <- df %>%
  mutate(across(where(is.factor), as.character)) %>%
  mutate(across(
    c(Subject, Recipient),
    ~ if_else(is.na(.x), "", stringr::str_trim(.x))
  )) %>%
  # This is the step that discards rows with a blank Subject. The group
  # numbering below no longer uses "" as a sentinel, so this line can be
  # removed to retain those rows (they then form their own group).
  filter(Subject != "") %>%
  # %I + %p honour the AM/PM marker; with %H the trailing "AM"/"PM" text was
  # silently ignored, so afternoon timestamps were read as morning ones.
  mutate(Date = as.POSIXct(Date, format = "%m/%d/%Y %I:%M:%OS %p")) %>%
  # coalesce() turns an NA condition into FALSE; otherwise a single NA would
  # make cumsum() return NA for every following row.
  mutate(
    cond = coalesce(
      Edit & Folder %in% c("out", "draft") & Message == "",
      FALSE
    )
  ) %>%
  mutate(segment = cumsum(!cond)) %>%
  filter(cond) %>%
  group_by(Subject, Recipient, Length, segment) %>%
  summarise(
    Start = min(Date),
    End = max(Date),
    # Fixed units keep Duration comparable across rows.
    Duration = difftime(End, Start, units = "secs"),
    .groups = "drop_last"
  ) %>%
  # Still grouped by Subject/Recipient/Length here: flag the first row of
  # each combination. This replaces the lag(..., default = "") comparisons,
  # which mixed a character default into the integer Length column and never
  # flagged groups whose Recipient was blank.
  mutate(new_group = as.integer(row_number() == 1L)) %>%
  ungroup() %>%
  # Running count of group starts: 1, 2, 3, ... with no upper limit.
  mutate(group = cumsum(new_group))