Для полноты: используйте rle, чтобы найти серии повторяющихся значений:
# Number each run of consecutive identical Text values, then keep the last
# row of every run.
df <- data.table(df)
a <- rle(df$Text)
# seq_along() rather than seq(1, length(...)): with an empty Text column
# seq(1, 0) yields c(1, 0) and rep() then fails on the mismatched `times`.
# Each run id is repeated once per row of its run, giving one id per row.
df[, groups := rep(seq_along(a$lengths), a$lengths)]
df
# .N is the number of rows in the current group, so .SD[.N, ] is its last row.
df[, .SD[.N, ], by = groups]
groups Timestamp Text
1: 1 2018-01-08 13:45:12 A
2: 2 2018-01-08 14:26:22 B
3: 3 2018-01-08 14:31:32 C
4: 4 2018-01-08 15:13:16 A
5: 5 2018-01-08 15:25:19 B
ДОПОЛНЕНИЕ: сравнение производительности (на большем наборе данных)
# Build the benchmark input: 10,000 rows where every column is resampled
# with replacement. Each column is drawn separately, so values from one
# original row are not kept together.
df <- data.table(df)
df <- df[
  ,
  lapply(.SD, function(column) sample(column, size = 10000, replace = TRUE)),
  .SDcols = names(df)
]
# Benchmark candidate: base rle() run numbering + data.table last-row pick.
#
# Reads `df` from the enclosing environment. Note that `:=` adds the
# `groups` column to that `df` by reference, so the table stays modified
# after the call.
#
# Returns a data.table with one row per run of consecutive identical Text
# values (the last row of each run), keyed by the `groups` run number.
aa <- function() {
  runs <- rle(df$Text)
  # seq_along() rather than seq(1, length(...)): with an empty Text column
  # seq(1, 0) yields c(1, 0) and rep() then fails on the mismatched `times`.
  df[, groups := rep(seq_along(runs$lengths), runs$lengths)]
  # .N is the row count of the current group, so .SD[.N, ] is its last row.
  df[, .SD[.N, ], by = groups]
}
# Benchmark candidate: pure data.table variant using rleid() for run ids.
#
# Reads `df` from the enclosing environment; `:=` adds the `group` column
# to it by reference. Returns the last row of every run of consecutive
# identical Text values.
aa2 <- function() {
  # First [] tags each row with its run id, second [] keeps row .N (the
  # last one) of every run.
  df[, group := rleid(Text)][, .SD[.N], by = group]
}
# Benchmark candidate: dplyr grouping on rleid() run ids.
#
# Reads `df` from the enclosing environment. For every run of consecutive
# identical Text values it keeps the row with the greatest Timestamp
# (the first such row on ties, per which.max()) and drops the helper
# `group` column before returning.
bb <- function() {
  by_run <- group_by(df, group = rleid(Text))
  latest <- slice(by_run, which.max(Timestamp))
  select(ungroup(latest), -group)
}
# Benchmark candidate: dplyr with a hand-rolled run counter.
#
# Reads `df` from the enclosing environment. A new run starts wherever Text
# differs from the previous row; cumsum() over those change flags gives the
# run id. The last row of each run is kept and the helper `Group` column is
# dropped before returning.
cc <- function() {
  with_runs <- mutate(
    df,
    # FALSE for the first row, then TRUE at every row whose Text differs
    # from the row before it.
    Group = cumsum(c(FALSE, df$Text[-1] != df$Text[-n()]))
  )
  by_run <- group_by(with_runs, Group)
  last_rows <- filter(by_run, row_number() == n())
  select(ungroup(last_rows), -Group)
}
> microbenchmark(aa(), aa2(), bb(), cc(), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
aa() 1212.6609 1252.2010 1267.8729 1279.0700 1282.9894 1312.4432 5 c
aa2() 1213.9839 1271.1910 1275.3573 1283.8008 1299.9422 1307.8685 5 c
bb() 112.8352 116.5473 152.9838 142.4634 160.9753 232.0976 5 a
cc() 306.1699 306.4497 316.5756 315.7423 326.8091 327.7069 5 b