Вот ещё одно решение на основе tidyverse, которое обходится без left_join.
Я понятия не имею, какой подход будет быстрее на большом наборе данных, но всегда хорошо иметь больше вариантов:
# Sample order data: one "PO" row per order followed by one or more
# "Receipt" rows, each stamped with the date of the transaction.
df <- data.frame(
  Ord_ID = c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L),
  Supplier = factor(
    c("A", "A", "B", "B", "C", "C", "C", "C", "B", "B", "B", "D", "D"),
    levels = c("A", "B", "C", "D")
  ),
  Trans_Type = c(
    "PO", "Receipt",
    "PO", "Receipt",
    "PO", "Receipt", "Receipt", "Receipt",
    "PO", "Receipt", "Receipt",
    "PO", "Receipt"
  ),
  Date = as.Date(c(
    "2018-02-03", "2018-02-15",
    "2018-02-04", "2018-03-13",
    "2018-02-07", "2018-03-01", "2018-03-05", "2018-03-29",
    "2018-02-09", "2018-02-20", "2018-02-27",
    "2018-02-18", "2018-04-02"
  )),
  # Keep Trans_Type as character regardless of the R version's default
  stringsAsFactors = FALSE
)
# The pipeline needs dplyr (verbs, %>%) and tidyr (pivot_wider)
library(dplyr)
library(tidyr)

# Reshape the long transaction log into one row per order with three date
# columns: PO, Receipt (earliest receipt) and Receipt_2 (latest receipt).
df %>%
  group_by(Ord_ID, Supplier, Trans_Type) %>%
  # Sort by date inside each group so that "first" and "last" below mean
  # earliest and latest regardless of the row order of the input
  arrange(Date, .by_group = TRUE) %>%
  # Keep every non-receipt row, but only the earliest and the latest receipt;
  # picking by position (not by Date == min/max) leaves at most two receipt
  # rows per group even when several receipts share the same date
  filter(Trans_Type != "Receipt" | row_number() %in% c(1L, n())) %>%
  # Rename the 2nd receipt to Receipt_2 so there are no duplicated keys
  mutate(
    Trans_Type2 = if_else(
      Trans_Type == "Receipt" & row_number() == 2L,
      "Receipt_2",
      Trans_Type
    )
  ) %>%
  # Drop all grouping (Trans_Type cannot be removed while it is a grouping
  # variable, and the final result should be a plain ungrouped tibble)
  ungroup() %>%
  select(-Trans_Type) %>%
  # Spread the now unduplicated Trans_Type2 values into columns
  pivot_wider(names_from = Trans_Type2, values_from = Date) %>%
  # Orders with a single receipt have no Receipt_2: fall back to Receipt
  mutate(Receipt_2 = coalesce(Receipt_2, Receipt))
# A tibble: 5 x 5
Ord_ID Supplier PO Receipt Receipt_2
<int> <fct> <date> <date> <date>
1 1 A 2018-02-03 2018-02-15 2018-02-15
2 2 B 2018-02-04 2018-03-13 2018-03-13
3 3 C 2018-02-07 2018-03-01 2018-03-29
4 4 B 2018-02-09 2018-02-20 2018-02-27
5 5 D 2018-02-18 2018-04-02 2018-04-02