Разумеется, то же самое можно сделать и с помощью data.table.
пример данных
# Sample data: one row per (Name, EndDate) progress record.
# Progress and EndDate are deliberately kept as character strings here;
# EndDate uses the "month/day/year hour:minute" layout.
df <- data.frame(
  Name = c(
    "Jim", "Jane", "Jose", "Matt", "Mickey",
    "Tom", "Peter", "Jane", "Jim", "Jose"
  ),
  Progress = c(
    "65", "20", "80", "20", "65",
    "45", "20", "70", "25", "80"
  ),
  EndDate = c(
    "11/25/2018 16:45", "11/25/2018 18:05", "11/25/2018 14:20",
    "12/1/2018 22:52", "11/29/2018 18:15", "12/2/2018 15:27",
    "11/26/2018 12:07", "11/30/2018 11:18", "11/29/2018 18:04",
    "11/29/2018 21:12"
  ),
  stringsAsFactors = FALSE
)
код
# Build a data.table from the sample data. as.data.table() returns a new
# object, so `df` is left untouched (setDT(df) would convert it in place).
dt <- as.data.table(df)
# Parse the "month/day/year hour:minute" strings into POSIXct so that ordering
# is chronological.
dt[, EndDate := as.POSIXct(EndDate, format = "%m/%d/%Y %H:%M")]
# Progress arrives as text. Convert it so the threshold test below is a numeric
# comparison; on character data R compares strings, so "100" > 70 is FALSE.
dt[, Progress := as.numeric(Progress)]
# Order by EndDate (by reference): within each Name the first row is now the
# earliest record and row .N the latest.
setorder(dt, EndDate)
# Summarise by Name: keep the earliest Progress if it is above 70, otherwise
# keep the latest Progress. The condition is a single value, so plain if/else
# is used rather than ifelse(); isTRUE() treats a missing first value as
# "not above 70" instead of failing on an NA condition.
dt[
  ,
  .(Progress = if (isTRUE(Progress[1L] > 70)) Progress[1L] else Progress[.N]),
  by = .(Name)
][]
сравнение производительности (бенчмарк)
# Benchmark the data.table solution against two tidyverse variants.
# Every expression starts from the raw `df`, so each one pays for date parsing,
# numeric conversion of Progress and ordering on every iteration.
# NOTE(review): the tidyverse expressions rely on `%>%`, mutate(), arrange(),
# group_by(), slice(), first(), n() and mdy_hm() being attached (presumably
# dplyr and lubridate) — confirm they are loaded elsewhere.
microbenchmark::microbenchmark(
  data.table = {
    # Work on a fresh table: `:=` and setorder() modify by reference, so
    # reusing one shared table would make every iteration after the first
    # time data that is already parsed and sorted.
    bench_dt <- as.data.table(df)
    bench_dt[, EndDate := as.POSIXct(EndDate, format = "%m/%d/%Y %H:%M")]
    # Numeric conversion so `> 70` is not a string comparison.
    bench_dt[, Progress := as.numeric(Progress)]
    setorder(bench_dt, EndDate)
    bench_dt[
      ,
      .(Progress = if (isTRUE(Progress[1L] > 70)) Progress[1L] else Progress[.N]),
      by = .(Name)
    ][]
  },
  tidyverse1 = {
    df %>%
      mutate(
        EndDate = mdy_hm(EndDate),
        Progress = as.numeric(Progress)
      ) %>%
      arrange(Name, EndDate) %>%
      group_by(Name) %>%
      slice(if (first(Progress) > 70) 1 else n())
  },
  tidyverse2 = {
    # NOTE(review): this variant keeps the earliest row when *any* Progress in
    # the group exceeds 70, which is not the same rule as "first Progress > 70"
    # used by the other two expressions — confirm this is intended.
    df %>%
      mutate(
        EndDate = mdy_hm(EndDate),
        Progress = as.numeric(Progress)
      ) %>%
      group_by(Name) %>%
      slice(ifelse(n() > 1,
        ifelse(any(Progress > 70), which.min(EndDate), which.max(EndDate)), 1))
  }
)
# NOTE(review): the timings below were presumably recorded with the earlier
# form of this benchmark, where the data.table expression re-used a `dt` that
# had already been parsed and sorted and Progress was compared as text.
# Re-run the benchmark to obtain numbers for the expressions above.
# Unit: milliseconds
# expr min lq mean median uq max neval
# data.table 1.654241 2.030820 2.709023 2.556978 2.782023 30.36590 100
# tidyverse1 6.847731 7.218286 8.742247 7.516838 8.034861 72.00902 100
# tidyverse2 6.173201 6.506398 7.286639 6.764582 7.088591 52.10180 100