Предположим, у меня есть список заголовков статей в Википедии, и я хотел бы измерить сходство между статьями.
title <- c("virus",
"coronavirus",
"Coronaviridae",
"pandemic",
"2019–20_coronavirus_pandemic",
"Coronavirus_disease_2019",
"Severe_acute_respiratory_syndrome_coronavirus_2",
"Severe_acute_respiratory_syndrome_coronavirus",
"Severe_acute_respiratory_syndrome-related_coronavirus",
"syndrome",
"disease",
"infection"
)
Вот некоторые решения, но я предполагаю, что это не самый быстрый способ сделать эта задача для длинных списков названий. Кроме того, результаты не совсем совпадают.
Во-первых, функция для получения статьи из Википедии:
GetArticle <- function(pageName){
library(xml2)
library(httr)
query <- paste0("https://en.wikipedia.org/w/api.php?",
"action=query", "&format=xml", "&redirects", "&prop=extracts",
"&explaintext","&titles=", pageName)
answer <- httr::GET(query)
page.xml <- xml2::read_xml(answer)
page <- xml2::xml_find_all(page.xml, "//extract")
text <- as.character(base::trimws(xml_text(page)))
}
article <- unlist(lapply(title, GetArticle))
Я проверил эти возможные решения. Первая реализация "basi c".
function1 <- function(article){
start_time <- Sys.time()
cleanArticle <- gsub("[[:punct:]]", "", article)
cleanArticle <- gsub("[0-9]", "", cleanArticle)
titles.df <- as.data.frame(expand.grid(title, title))
names(titles.df) <- c("title1", "title2")
couples <- as.data.frame(expand.grid(cleanArticle,cleanArticle))
similarity <- function(text1, text2){
tokens1 <- tolower(unlist(strsplit(text1, " ")))
tokens2 <- tolower(unlist(strsplit(text2, " ")))
intersection.v <- sort(intersect(tokens1, tokens2))
union.v <- sort(union(tokens1, tokens2))
cardinalityOfIntersection <- length(intersection.v)
cardinalityOfUnion <- length(union.v)
score <- cardinalityOfIntersection / cardinalityOfUnion
return(score)
}
score <- c()
for (i in 1:length(article)) {
jaccardIndex <- similarity(as.character(couples$Var1[i]),
as.character(couples$Var2[i]))
score <- c(score, jaccardIndex)
}
results.df <- cbind(titles.df, score)
end_time <- Sys.time()
time <- end_time - start_time
print(time)
return(results.df)
}
Второе решение, построенное из пакета quanteda:
function2 <- function(article){
# https://quanteda.io/reference/textstat_simil.html
start_time <- Sys.time()
library(quanteda)
dfma <- dfm(article)
scores <- textstat_simil(dfma, method = "jaccard", margin = "documents")
mat <- as.matrix(scores)
rownames(mat) <- title
colnames(mat) <- title
library(reshape2)
candidates <- as.data.frame(subset(melt(mat), value!=0))
names(candidates) <- c("title1", "title2", "score")
end_time <- Sys.time()
time <- end_time - start_time
print(time)
return(candidates)
}
Третье решение, построенное из пакета повторного использования:
function3 <- function(article){
# https://cran.r-project.org/web/packages/textreuse/vignettes/textreuse-pairwise.html
start_time <- Sys.time()
library(textreuse)
corpus <- TextReuseCorpus(text=article, tokenizer = tokenize_words, progress = FALSE)
names(corpus) <- title
comparisons <- pairwise_compare(corpus, jaccard_similarity)
comparisons.df <- as.data.frame(comparisons)
candidates <- pairwise_candidates(comparisons)
names(candidates) <- c("title1", "title2", "score")
end_time <- Sys.time()
time <- end_time - start_time
print(time)
return(candidates)
}
Решения можно сравнить:
results1.df <- function1(article)
results2.df <- function2(article)
results3.df <- function3(article)
system.time(function1(article))
system.time(function2(article))
system.time(function3(article))
View(results1.df)
View(results2.df)
View(results3.df)
Некоторые показания:
> results1.df <- function1(article)
Time difference of 1.176277 secs
> results2.df <- function2(article)
Time difference of 0.3198049 secs
> results3.df <- function3(article)
Making 66 comparisons.
|============================================================================================================================| 100%
Time difference of 0.2578409 secs
> system.time(function1(article))
Time difference of 1.100325 secs
utilisateur système écoulé
1.08 0.00 1.09
> system.time(function2(article))
Time difference of 0.402745 secs
utilisateur système écoulé
0.45 0.00 0.41
> system.time(function3(article))
Making 66 comparisons.
|============================================================================================================================| 100%
Time difference of 0.1998792 secs
utilisateur système écoulé
0.22 0.00 0.22
Большое спасибо за вашу помощь!