Подход
Загрузка набора данных
library(tidyverse)
library(lubridate)
# Example data set: one row per record, with the record ID, the date as an
# ISO "yyyy-mm-dd" string, the city, a code and a user id (uid).
# All five vectors have 12 elements.
mydata <- tibble(
  ID = c(
    "I-1", "I-1", "I-2", "I-3", "I-3", "I-4",
    "I-5", "I-6", "I-7", "I-8", "I-9", "I-19"
  ),
  Date = c(
    "2020-01-01", "2020-01-01", "2020-01-02", "2020-02-01",
    "2020-02-01", "2020-02-02", "2020-02-04", "2019-11-01",
    "2019-11-01", "2018-12-13", "2019-05-17", "2020-03-11"
  ),
  city = c(
    "New York", "Utha", "Washington", "Tokyo", "Osaka", "London",
    "Dubai", "Dubai", "London", "Tokyo", "Dubai", "Dubai"
  ),
  # The fifth value was missing its closing quote ("193), which made the
  # whole script fail to parse.
  code = c(
    "123", "103", "122", "123", "193", "144",
    "101", "101", "144", "143", "101", "150"
  ),
  uid = c(
    "K-1", "K-1", "K-1", "K-2", "K-2", "K-3",
    "K-4", "K-4", "K-3", "K-5", "K-4", "K-7"
  )
)
# Normalise column types: drop anything after the first space in Date
# (e.g. a time part) and parse the rest as a date; force code to character.
mydata <- mutate(
  mydata,
  Date = ymd(str_remove(Date, " .*")),
  code = as.character(code)
)
Пункт № 1 (условие «где»)
Я использую count из dplyr, чтобы подсчитать коды по городам, а затем case_when, чтобы пометить каждую комбинацию значением «Yes» или «No», как требовалось.
# Count rows per city/code combination inside the date window and add the
# "Match" column: "Yes" when the combination occurs more than once,
# otherwise "No". Only the first rows are shown.
startdate <- "2017-01-01"
enddate <- "2020-03-29"
mydata %>%
  filter(
    Date >= startdate,
    Date <= enddate
  ) %>%
  count(city, code, name = "count_samecode") %>%
  mutate(Match = case_when(
    count_samecode > 1 ~ "Yes",
    # TRUE instead of T: T is an ordinary variable and can be reassigned.
    TRUE ~ "No"
  )) %>%
  head()
# # A tibble: 6 x 4
# city code count_samecode Match
# <chr> <chr> <int> <chr>
# 1 Dubai 101 3 Yes
# 2 Dubai 150 1 No
# 3 London 144 2 Yes
# 4 New York 123 1 No
# 5 Osaka 193 1 No
# 6 Tokyo 123 1 No
Пункт № 2 (условие «где»)
То же самое я делаю для UID.
# Number of rows per city/uid combination inside the date window
# (first rows only).
head(
  count(
    filter(mydata, Date >= startdate, Date <= enddate),
    city, uid,
    name = "UIDs_#_filtered"
  )
)
# # A tibble: 6 x 3
# city uid `UIDs_#_filtered`
# <chr> <chr> <int>
# 1 Dubai K-4 3
# 2 Dubai K-7 1
# 3 London K-3 2
# 4 New York K-1 1
# 5 Osaka K-2 1
# 6 Tokyo K-2 1
Пункт № 3 (условие «где»)
Я могу повторно применить count к результату пункта 2, чтобы узнать, сколько разных UID встречается в каждом городе; значение > 1 означает, что в городе есть разные UID.
# Number of different uids per city inside the date window: reduce to the
# unique city/uid pairs first, then count the pairs per city.
mydata %>%
  filter(Date >= startdate & Date <= enddate) %>%
  distinct(city, uid) %>%
  count(city, name = "UIDs_#_different") %>%
  head()
# # A tibble: 6 x 2
# city `UIDs_#_different`
# <chr> <int>
# 1 Dubai 2
# 2 London 1
# 3 New York 1
# 4 Osaka 1
# 5 Tokyo 2
# 6 Utha 1
Пункт № 4 (условие «где»)
Взяв тот же код из пункта 2, я убираю фильтр по дате, чтобы выполнить подсчёт по всему набору данных.
# Number of rows per city/uid combination over the whole data set,
# without any date filter (first rows only).
head(count(mydata, city, uid, name = "UIDs_#_all"))
Собираем всё вместе
С помощью нескольких left_join мы можем приблизиться к желаемому результату. ПРАВКА: теперь берётся первый экземпляр ID для первой комбинации город/код.
# Summarise code duplicates and uid counts for the cities that have at least
# one row inside a date window.
#
# Arguments:
#   df        - tibble with at least the columns Date, city, code and uid.
#               Date may be a Date or a string starting with "yyyy-mm-dd";
#               anything after the first space is dropped before parsing.
#   startdate - first day of the window, a string "yyyy-mm-dd" (inclusive).
#   enddate   - last day of the window, a string "yyyy-mm-dd" (inclusive).
#
# Returns a tibble with one row per city/code combination found inside the
# window (the first matching row of df is kept), plus these columns:
#   count_samecode    - rows with that city/code over all dates
#   Match             - "Yes" when count_samecode > 1, otherwise "No"
#   UIDs_#_filtered   - rows with that city/uid inside the window
#   UIDs_#_all        - rows with that city/uid over all dates
#   UIDs_#_distinct   - number of different uids of that city inside the window
check_duplicates_filterview.f <- function(df, startdate, enddate) {
  # Convert the bounds once; comparing a Date column with a string would
  # perform the same as.Date() conversion implicitly on every comparison.
  startdate <- as.Date(startdate)
  enddate <- as.Date(enddate)

  # Normalise the column types BEFORE any date comparison. Filtering a
  # character Date column first would compare strings, so a value such as
  # "2020-01-01 00:00:00" would sort after "2020-01-01" and fall out of a
  # window ending on that day.
  df <- df %>%
    mutate(
      Date = ymd(str_remove(Date, " .*")),
      code = as.character(code)
    )

  # Rows inside the window; computed once and reused below.
  in_window <- df %>%
    filter(Date >= startdate, Date <= enddate)

  # Keep the full history (all dates) of every city seen inside the window.
  df <- df %>%
    filter(city %in% in_window$city)

  # Count of each city/code combination over all dates.
  code_counts <- count(df, city, code, name = "count_samecode")

  # First in-window row of each city/code combination, flagged as a match
  # when the combination occurs more than once over all dates.
  where_1 <- in_window %>%
    distinct(city, code, .keep_all = TRUE) %>%
    left_join(code_counts, by = c("city", "code")) %>%
    mutate(Match = case_when(
      count_samecode > 1 ~ "Yes",
      TRUE ~ "No"
    ))

  # Rows per city/uid inside the window.
  where_2 <- count(in_window, city, uid, name = "UIDs_#_filtered")

  # Different uids per city inside the window.
  where_3 <- in_window %>%
    distinct(city, uid) %>%
    count(city, name = "UIDs_#_distinct")

  # Rows per city/uid over all dates.
  where_4 <- count(df, city, uid, name = "UIDs_#_all")

  # Explicit join keys: same keys the natural joins picked, but without the
  # "Joining, by = ..." messages and without depending on column overlap.
  first_half <- left_join(where_1, where_2, by = c("city", "uid"))
  second_half <- left_join(where_4, where_3, by = "city")

  left_join(first_half, second_half, by = c("city", "uid"))
}
# > check_duplicates_filterview.f(mydata, "2018-01-01", "2020-01-01")
# Joining, by = "city"
# Joining, by = "city"
# Joining, by = c("city", "uid")
# # A tibble: 5 x 8
# city code count_samecode Match uid `UIDs_#_filtered` `UIDs_#_all` `UIDs_#_distinct`
# <chr> <chr> <int> <chr> <chr> <int> <int> <int>
# 1 Dubai 101 2 Yes K-4 2 3 1
# 2 London 144 1 No K-3 1 2 1
# 3 New York 123 1 No K-1 1 1 1
# 4 Tokyo 143 1 No K-5 1 1 1
# 5 Utha 103 1 No K-1 1 1 1