Я пытаюсь использовать тот же код, что и в https://thiloshon.wordpress.com/2018/03/11/build-your-own-word-sentence-prediction-application-part-02/ для прогнозирования на уровне слов. Входные текстовые данные также находятся в указанной ссылке, и я использую файл en_US.news.txt
в качестве моего единственного входного файла.
library(quanteda)
library(data.table)
#read the .txt file
df=readLines('en_US.news.txt')
#take a sample of the df
sampleHolderNews <- sample(length(df), length(df) * 0.1)
US_News_Sample <- df[sampleHolderNews]
#build the corpus of the data
corp <- corpus(US_News_Sample)
#Preprocessing
master_Tokens <- tokens(x = tolower(corp),remove_punct =
TRUE,remove_numbers = TRUE,remove_hyphens = TRUE,remove_symbols = TRUE)
stemed_words <- tokens_wordstem(master_Tokens, language = "english")
#tokenization#
bi_gram <- tokens_ngrams(stemed_words, n = 2)
tri_gram <- tokens_ngrams(stemed_words, n = 3)
uni_DFM <- dfm(stemed_words)
bi_DFM <- dfm(bi_gram)
tri_DFM <- dfm(tri_gram)
uni_DFM <- dfm_trim(uni_DFM, 3)
bi_DFM <- dfm_trim(bi_DFM, 3)
tri_DFM <- dfm_trim(tri_DFM, 3)
sums_U <- colSums(uni_DFM)
sums_B <- colSums(bi_DFM)
sums_T <- colSums(tri_DFM)
# Create data tables with individual words as columns
uni_words <- data.table(word_1 = names(sums_U), count = sums_U)
bi_words <- data.table(
word_1 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(names(sums_B), "_", fixed = TRUE), '[[', 2),
count = sums_B)
tri_words <- data.table(
word_1 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 2),
word_3 = sapply(strsplit(names(sums_T), "_", fixed = TRUE), '[[', 3),
count = sums_T)
#indexing#
setkey(uni_words, word_1)
setkey(bi_words, word_1, word_2)
setkey(tri_words, word_1, word_2, word_3)
######## Finding Bi-Gram Probability #################
discount_value <- 0.75
# Finding number of bi-gram words
numOfBiGrams <- nrow(bi_words[.(word_1, word_2)])
# Dividing number of times word 2 occurs as second part of bigram, by total number of bigrams.
# Finding probability for a word given the number of times it was second word of a bigram
ckn <- bi_words[, .(Prob = ((.N) / numOfBiGrams)), by = word_2]
setkey(ckn, word_2)
# Assigning the probabilities as second word of bigram, to unigrams
uni_words[, Prob := ckn[word_1, Prob]]
uni_words <- uni_words[!is.na(uni_words$Prob)]
# Finding number of times word 1 occurred as word 1 of bi-grams
n1wi <- bi_words[, .(N = .N), by = word_1]
setkey(n1wi, word_1)
# Assigning total times word 1 occured to bigram cn1
bi_words[, Cn1 := uni_words[word_1, count]]
# Kneser Kney Algorithm
bi_words[, Prob := ((count - discount_value) / Cn1 + discount_value / Cn1 *
n1wi[word_1, N] * uni_words[word_2, Prob])]
######## End of Finding Bi-Gram Probability #################
######## Finding Tri-Gram Probability #################
# Finding count of word1-word2 combination in bigram
tri_words[, Cn2 := bi_words[.(word_1, word_2), .N]]
n1w12 <- tri_words[, .N, by = .(word_1, word_2)]
setkey(n1w12, word_1, word_2)
# Kneser Kney Algorithm
tri_words[, Prob := ((count - discount_value) / Cn2 + discount_value / Cn2 *
n1w12[.(word_1, word_2), .N] * bi_words[.(word_1, word_2), Prob])]
Здесь я получаю следующую ошибку для алгоритма Кнезера для триграмм:
Error in `[.data.table`(tri_words, , `:=`(Prob, ((count - discount_value)/Cn2 + :
Supplied 13867 items to be assigned to 3932 items of column 'Prob'. If you wish to 'recycle'
the RHS please use rep() to make this intent clear to readers of your code.
In addition: Warning messages:
1: In discount_value/Cn2 * n1w12[list(word_1, word_2), .N] * bi_words[list(word_1, :
longer object length is not a multiple of shorter object length
2: In (count - discount_value)/Cn2 + discount_value/Cn2 * n1w12[list(word_1, :
longer object length is not a multiple of shorter object length
Я мог бы найти некоторые похожие вопросы, связанные с ошибкой таблицы данных, но не могу понять, как мне решить эту ошибку в коде.