У меня есть дата, как в этом примере, я хотел бы найти похожие строки (не дубликаты) и вычислить сходство для каждого. Я нахожу это решение , но я хотел бы сохранить все свои столбцы и добавить процент сходства в качестве новой переменной. Моя цель - найти записи с самым высоким процентом сходства. Как я могу это сделать?
пример набора данных
df <- tibble::tribble(
~date, ~user_id, ~Station_id, ~location_id, ~ind_id, ~start_hour, ~start_minute, ~start_second, ~end_hour, ~end_minute, ~end_second, ~duration_min,
20191015, 19900234, 242, 2, "ac", 7, 25, 0, 7, 30, 59, 6,
20191015, 19900234, 242, 2, "ac", 7, 31, 0, 7, 32, 59, 2,
20191015, 19900234, 242, 2, "ac", 7, 33, 0, 7, 38, 59, 6,
20191015, 19900234, 242, 2, "ac", 7, 39, 0, 7, 40, 59, 2,
20191015, 19900234, 242, 2, "ac", 7, 41, 0, 7, 43, 59, 3,
20191015, 19900234, 242, 2, "ac", 7, 44, 0, 7, 45, 59, 2,
20191015, 19900234, 242, 2, "ac", 7, 47, 0, 7, 59, 59, 13,
20191015, 19900234, 242, 2, "ad", 7, 47, 0, 7, 59, 59, 13,
20191015, 19900234, 242, 2, "ac", 8, 5, 0, 8, 6, 59, 2,
20191015, 19900234, 242, 2, "ad", 8, 5, 0, 8, 6, 59, 2,
20191015, 19900234, 242, 2, "ac", 8, 7, 0, 8, 8, 59, 2,
20191015, 19900234, 242, 2, "ad", 8, 7, 0, 8, 8, 59, 2,
20191015, 19900234, 242, 2, "ac", 16, 26, 0, 16, 55, 59, 30,
20191015, 19900234, 242, 2, "ad", 16, 26, 0, 16, 55, 59, 30,
20191015, 19900234, 242, 2, "ad", 17, 5, 0, 17, 6, 59, 2,
20191015, 19900234, 242, 2, "ac", 17, 5, 0, 17, 23, 59, 19,
20191015, 19900234, 242, 2, "ad", 17, 7, 0, 17, 15, 59, 9,
20191015, 19900234, 242, 2, "ad", 17, 16, 0, 17, 22, 59, 7,
20191015, 19900234, 264, 2, "ac", 17, 24, 0, 17, 35, 59, 12,
20191015, 19900234, 264, 2, "ad", 17, 25, 0, 17, 35, 59, 11,
20191016, 19900234, 242, 1, "ac", 7, 12, 0, 7, 14, 59, 3,
20191016, 19900234, 242, 1, "ad", 7, 13, 0, 7, 13, 59, 1,
20191016, 19900234, 242, 1, "ac", 17, 45, 0, 17, 49, 59, 5,
20191016, 19900234, 242, 1, "ad", 17, 46, 0, 17, 48, 59, 3,
20191016, 19900234, 242, 2, "ad", 7, 14, 0, 8, 0, 59, 47,
20191016, 19900234, 242, 2, "ac", 7, 15, 0, 8, 0, 59, 47
)
Функция для сравнения строк
row_cf <- function(x, y, df){
sum(df[x,] == df[y,])/ncol(df)
}
Функция вывода
# 1) Create all possible row combinations
# 2) Rename
# 3) Run through each row
# 4) Calculate similarity
expand.grid(1:nrow(df), 1:nrow(df)) %>%
rename(row_1 = Var1, row_2 = Var2) %>%
rowwise() %>%
mutate(similarity = row_cf(row_1, row_2, df))
# A tibble: 676 x 3
row_1 row_2 similarity
<int> <int> <dbl>
1 1 1 1
2 2 1 0.75
3 3 1 0.833
4 4 1 0.75
5 5 1 0.75
6 6 1 0.75
7 7 1 0.75
8 8 1 0.667
9 9 1 0.583
10 10 1 0.5
Редактировать:Я хотел бы найти похожие строки в данных, как здесь