Я пытаюсь воспроизвести алгоритм предсказания слов, как в https://rpubs.com/akselix/word_prediction
library(quanteda)
library(dplyr)
library(tidyr)
library(tokenizers)
library(splitstackshape)
#read input data
df1=readLines('en_US.news.txt')
df2=readLines('en_US.blogs.txt')
combinedRaw = c(df1, df2)
set.seed(1220)
take a sample of data
n = 1/1000
combined = sample(combinedRaw, length(combinedRaw) * n)
# Split into train and validation sets
split = sample.split(combined, 0.8)
train = subset(combined, split == T)
valid = subset(combined, split == F)
#building unigram, bigram and trigram from the corpus of training data
train = corpus(train)
train1 = unlist(tokenize_ngrams(train,lowercase = TRUE,n=1,stopwords = character(),ngram_delim = ' ',simplify = TRUE))
train2 = unlist(tokenize_ngrams(train,lowercase = TRUE,n=2,stopwords = character(),ngram_delim = ' ',simplify = TRUE))
train3 = unlist(tokenize_ngrams(train,lowercase = TRUE,n=3,stopwords = character(),ngram_delim = ' ',simplify = TRUE))
#build frequency tables and remove NAs
fun.frequency = function(x, minCount = 1) {
x = x %>%
group_by(NextWord) %>%
summarize(count = n()) %>%
filter(count >= minCount)
x = x %>%
mutate(freq = count / sum(x$count)) %>%
select(-count) %>%
arrange(desc(freq))
}
dfTrain1 = data_frame(NextWord = train1)
dfTrain1 = fun.frequency(dfTrain1)
dfTrain1=dfTrain1[complete.cases(dfTrain1), ]
dfTrain2 = data_frame(NextWord = train2)
dfTrain2 = fun.frequency(dfTrain2)
dfTrain2=cSplit(dfTrain2,'NextWord',' ')
colnames(dfTrain2)=c('freq','word1','NextWord')
dfTrain2=dfTrain2[complete.cases(dfTrain2), ]
dfTrain3 = data_frame(NextWord = train3)
dfTrain3 = fun.frequency(dfTrain3)
dfTrain3= cSplit(dfTrain3,'NextWord', ' ')
colnames(dfTrain3)=c('freq','word1','word2','NextWord')
dfTrain3=dfTrain3[complete.cases(dfTrain3), ]
# Parse tokens from input text
fun.input = function(x) {
# If empty input, put both words empty
if(x == "") {
input1 = data_frame(word = "")
input2 = data_frame(word = "")
}
# Tokenize with same functions as training data
if(length(x) ==1) {
y = data_frame(word = unlist(tokenize_ngrams(corpus(x),lowercase = TRUE,n=1,stopwords =
character(),ngram_delim = ' ',simplify = TRUE)))
}
# If only one word, put first word empty
if (nrow(y) == 1) {
input1 = data_frame(word = "")
input2 = y
# Get last 2 words
} else if (nrow(y) >= 1) {
input1 = tail(y, 2)[1, ]
input2 = tail(y, 1)
}
# Return data frame of inputs
inputs = data_frame(words = unlist(rbind(input1,input2)))
return(inputs)
}
# Predict using stupid backoff algorithm ####
fun.predict = function(x, y, n = 100) {
# Predict giving just the top 1-gram words, if no input given
if(x == "" & y == "") {
prediction = dfTrain1 %>%
select(NextWord, freq)
# Predict using 3-gram model
} else if(x %in% dfTrain3$word1 & y %in% dfTrain3$word2) {
prediction = dfTrain3 %>%
filter(word1 %in% x & word2 %in% y) %>%
select(NextWord, freq)
# Predict using 2-gram model
} else if(y %in% dfTrain2$word1) {
prediction = dfTrain2 %>%
filter(word1 %in% y) %>%
select(NextWord, freq)
# If no prediction found before, predict giving just the top 1-gram words
} else{
prediction = dfTrain1 %>%
select(NextWord, freq)
}
# Return predicted word in a data frame
return(prediction[1:n, ])
}
, сейчас тестирую fun.predict
, он не работает должным образом, когда x
и y
не найдены в dfTrain3
, и он возвращает NA. например:
inputText ='girl is'
input1 = fun.input(inputText)[1, ]
input2 = fun.input(inputText)[2, ]
prediction = fun.predict(input1, input2, n = 5)
prediction
NextWord freq
NA <NA> NA
NA.1 <NA> NA
NA.2 <NA> NA
NA.3 <NA> NA
NA.4 <NA> NA
, тогда как на основе fun.predict
должны отображаться наиболее часто встречающиеся слова в униграмме:
NextWord freq
1 the 0.0014742920
2 to 0.0007985749
3 and 0.0002457153
4 of 0.0002457153
5 a 0.0002150009