Incorrect dplyr output in a next-word prediction function

I am trying to reproduce the word prediction algorithm from https://rpubs.com/akselix/word_prediction. Here is my code:

library(quanteda)
library(dplyr)
library(tidyr)
library(tokenizers)
library(splitstackshape)  # for cSplit
library(caTools)          # for sample.split

# Read input data
df1 = readLines('en_US.news.txt')
df2 = readLines('en_US.blogs.txt')
combinedRaw = c(df1, df2)

# Take a sample of the data
set.seed(1220)
n = 1/1000
combined = sample(combinedRaw, length(combinedRaw) * n)

# Split into train and validation sets
split = sample.split(combined, 0.8)
train = subset(combined, split == TRUE)
valid = subset(combined, split == FALSE)

# Build unigrams, bigrams and trigrams from the training corpus
train  = corpus(train)
train1 = unlist(tokenize_ngrams(train, lowercase = TRUE, n = 1, stopwords = character(), ngram_delim = ' ', simplify = TRUE))
train2 = unlist(tokenize_ngrams(train, lowercase = TRUE, n = 2, stopwords = character(), ngram_delim = ' ', simplify = TRUE))
train3 = unlist(tokenize_ngrams(train, lowercase = TRUE, n = 3, stopwords = character(), ngram_delim = ' ', simplify = TRUE))

# Build frequency tables (rows with NAs are removed afterwards)
fun.frequency = function(x, minCount = 1) {
  x = x %>%
    group_by(NextWord) %>%
    summarize(count = n()) %>%
    filter(count >= minCount)
  x = x %>%
    mutate(freq = count / sum(x$count)) %>%
    select(-count) %>%
    arrange(desc(freq))
  return(x)
}
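To make sure fun.frequency does what I think, I checked it on a tiny made-up vector (the words are arbitrary, only the shape of the result matters):

    toy = data_frame(NextWord = c('the', 'cat', 'the'))
    fun.frequency(toy)
    # expected: 'the' with freq ~0.667 and 'cat' with freq ~0.333,
    # sorted by freq in descending order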


dfTrain1 = data_frame(NextWord = train1)
dfTrain1 = fun.frequency(dfTrain1)
dfTrain1 = dfTrain1[complete.cases(dfTrain1), ]

dfTrain2 = data_frame(NextWord = train2)
dfTrain2 = fun.frequency(dfTrain2)
dfTrain2 = cSplit(dfTrain2, 'NextWord', ' ')
colnames(dfTrain2) = c('freq', 'word1', 'NextWord')
dfTrain2 = dfTrain2[complete.cases(dfTrain2), ]

dfTrain3 = data_frame(NextWord = train3)
dfTrain3 = fun.frequency(dfTrain3)
dfTrain3 = cSplit(dfTrain3, 'NextWord', ' ')
colnames(dfTrain3) = c('freq', 'word1', 'word2', 'NextWord')
dfTrain3 = dfTrain3[complete.cases(dfTrain3), ]
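The actual rows depend on the random sample, but this is how I check that the tables have the columns I expect:

    head(dfTrain2, 3)   # should have columns: freq, word1, NextWord
    head(dfTrain3, 3)   # should have columns: freq, word1, word2, NextWord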


# Parse tokens from input text
fun.input = function(x) {

  # If the input is empty, leave both words empty
  if (x == "") {
    input1 = data_frame(word = "")
    input2 = data_frame(word = "")
  }

  # Tokenize with the same function as the training data
  if (length(x) == 1) {
    y = data_frame(word = unlist(tokenize_ngrams(corpus(x), lowercase = TRUE, n = 1, stopwords = character(), ngram_delim = ' ', simplify = TRUE)))
  }

  # If there is only one word, leave the first word empty
  if (nrow(y) == 1) {
    input1 = data_frame(word = "")
    input2 = y

  # Otherwise keep the last two words
  } else if (nrow(y) >= 1) {
    input1 = tail(y, 2)[1, ]
    input2 = tail(y, 1)
  }

  # Return a data frame with the two input words
  inputs = data_frame(words = unlist(rbind(input1, input2)))
  return(inputs)
}
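Called on a two-word phrase, fun.input should give back a two-row data frame with a words column holding the last two tokens, for example:

    fun.input('girl is')
    # expected: a 2-row data frame, column 'words',
    # row 1 = 'girl', row 2 = 'is'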

# Predict using the stupid backoff algorithm ####
fun.predict = function(x, y, n = 100) {

  # If no input is given, just return the top 1-gram words
  if (x == "" & y == "") {
    prediction = dfTrain1 %>%
      select(NextWord, freq)

  # Predict using the 3-gram model
  } else if (x %in% dfTrain3$word1 & y %in% dfTrain3$word2) {
    prediction = dfTrain3 %>%
      filter(word1 %in% x & word2 %in% y) %>%
      select(NextWord, freq)

  # Predict using the 2-gram model
  } else if (y %in% dfTrain2$word1) {
    prediction = dfTrain2 %>%
      filter(word1 %in% y) %>%
      select(NextWord, freq)

  # If no prediction was found above, fall back to the top 1-gram words
  } else {
    prediction = dfTrain1 %>%
      select(NextWord, freq)
  }

  # Return the predicted words in a data frame
  return(prediction[1:n, ])
}

Now I am testing fun.predict, and it does not work as expected when x and y are not found in dfTrain3: it returns NA. For example:

     inputText ='girl is'
     input1 =  fun.input(inputText)[1, ]
     input2 =  fun.input(inputText)[2, ]
     prediction = fun.predict(input1, input2, n = 5)
     prediction
     NextWord freq
     NA       <NA>   NA
     NA.1     <NA>   NA
     NA.2     <NA>   NA
     NA.3     <NA>   NA
     NA.4     <NA>   NA

whereas, based on fun.predict, it should fall back to the most frequent words in the unigram table:

      NextWord         freq
    1      the 0.0014742920
    2       to 0.0007985749
    3      and 0.0002457153
    4       of 0.0002457153
    5        a 0.0002150009
    ...
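As far as I can tell, that expected table is simply the head of dfTrain1, i.e. what the final else branch of fun.predict should return:

    head(dfTrain1, 5)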