Нечёткое сопоставление можно выполнить с помощью `agrepl()`. Здесь я применил её к каждому заголовку через `lapply()`,
чтобы для каждого заголовка получить логический вектор совпадений по всем текстам,
затем собрал эти векторы в `data.frame` и построчно применил к нему `apply()`,
чтобы для каждого текста получить строку с совпавшими заголовками (через запятую).
При необходимости можно подобрать другое значение `max.distance`,
но на вашем примере текущее значение отлично работает.
# Lookup table of known titles and their genres; kept as plain character
# columns (no factors) so the titles can be used directly as match patterns.
dt1 <- data.frame(
  stringsAsFactors = FALSE,
  title = c(
    "Secret in Their Eyes",
    "V for Vendetta",
    "Bottersnikes & Gumbles"
  ),
  genre = c(
    "Dramas",
    "Action & Adventure",
    "Kids' TV"
  )
)
# Free-text samples to be searched for (possibly misspelled) titles.
# Each row has a sequential integer id and the raw text as a character column.
dt2 <- data.frame(
  stringsAsFactors = FALSE,
  id = seq_len(5L),
  Text = c(
    "I really liked V for Vendetta",
    "Bottersnikes & Gumbles was a great film .... ",
    "In any case, in my opinion bottersnikes &gumbles was a great film ...",
    "@thewitcher was an interesting series",
    "Secret in Their Eye is a terrible film! but I Like V per Vendetta"
  )
)
#' Fuzzy-match a set of titles against free-text strings
#'
#' Each title is searched for in every element of `target` with
#' `agrepl()` (approximate matching, case-insensitive, pattern treated as a
#' literal string rather than a regular expression).
#'
#' @param target Character vector of texts to search in.
#' @param titles Character vector of titles used as search patterns.
#' @param max_distance Value forwarded to the `max.distance` argument of
#'   `agrepl()`. Defaults to `0.3`.
#' @return An unnamed character vector the same length as `target`. Each
#'   element holds the matching titles joined by `","` (in the order they
#'   appear in `titles`), or `""` when nothing matched.
match_titles <- function(target, titles, max_distance = 0.3) {
  n_targets <- length(target)

  # One logical column per title, one row per text.
  hits <- vapply(
    titles,
    function(title) {
      agrepl(
        title, target,
        max.distance = max_distance,
        ignore.case = TRUE, fixed = TRUE
      )
    },
    logical(n_targets),
    USE.NAMES = FALSE
  )

  # vapply() returns a plain vector when there is a single text (and an empty
  # result when there are no titles); force a texts-by-titles matrix so the
  # row indexing below works for every input size.
  hits <- matrix(hits, nrow = n_targets, ncol = length(titles))

  vapply(
    seq_len(n_targets),
    function(i) paste(titles[hits[i, ]], collapse = ","),
    character(1)
  )
}
# Attach the comma-separated matched titles to each text row, then show the
# resulting table.
dt2[["titles"]] <- match_titles(
  target = dt2[["Text"]],
  titles = dt1[["title"]]
)
dt2
## id Text
## 1 1 I really liked V for Vendetta
## 2 2 Bottersnikes & Gumbles was a great film ....
## 3 3 In any case, in my opinion bottersnikes &gumbles was a great film ...
## 4 4 @thewitcher was an interesting series
## 5 5 Secret in Their Eye is a terrible film! but I Like V per Vendetta
## titles
## 1 V for Vendetta
## 2 Bottersnikes & Gumbles
## 3 Bottersnikes & Gumbles
## 4
## 5 Secret in Their Eyes,V for Vendetta