Я пытаюсь использовать Keras с R (R версия 3.5.0) для генерации вложений слов для набора данных Amazon Fine Foods Reviews (используя это руководство: https://blogs.rstudio.com/tensorflow/posts/2017-12-22-word-embeddings-with-keras/).
Но у меня естьследующая ошибка при обучении модели:
Error in py_call_impl(callable, dots$args, dots$keywords) :
ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 1 arrays: [array([[[ 3],
[ 71],
[ 7],
...,
[ 58],
[ 182],
[ 3]],
[[ 60],
[1040],
[2197],
...,
[ 45],
[ ...
Detailed traceback:
File "/Users/me/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/tensorflow/python/keras/engine/training.py", line 1426, in fit_generator
initial_epoch=initial_epoch)
File "/Users/me/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/tensorflow/python/keras/engine/training_generator.py", line 191, in model_iteration
batch_outs = batch_function(*batch_data)
File "/Users/me/.virtualenvs/r-tensorflow/lib/python2.7/site-packages/tensorflow/python/keras/engine/training.py", line 1175, in train_on_batch
Вот мой сценарий: (я ничего не изменил из учебника)
#getting data
download.file("https://snap.stanford.edu/data/finefoods.txt.gz", "finefoods.txt.gz")
library(readr)
library(stringr)
reviews <- read_lines("finefoods.txt.gz")
reviews <- reviews[str_sub(reviews, 1, 12) == "review/text:"]
reviews <- str_sub(reviews, start = 14)
reviews <- iconv(reviews, to = "UTF-8")
head(reviews, 2)
#Preprocessing
library(keras)
tokenizer <- text_tokenizer(num_words = 20000)
tokenizer %>% fit_text_tokenizer(reviews)
#Skip-Gram Model
library(reticulate)
library(purrr)
skipgrams_generator <- function(text, tokenizer, window_size, negative_samples) {
gen <- texts_to_sequences_generator(tokenizer, sample(text))
function() {
skip <- generator_next(gen) %>%
skipgrams(
vocabulary_size = tokenizer$num_words,
window_size = window_size,
negative_samples = 1
)
x <- transpose(skip$couples) %>% map(. %>% unlist %>% as.matrix(ncol = 1))
y <- skip$labels %>% as.matrix(ncol = 1)
list(x, y)
}
}
embedding_size <- 128 # Dimension of the embedding vector.
skip_window <- 5 # How many words to consider left and right.
num_sampled <- 1 # Number of negative examples to sample for each word.
input_target <- layer_input(shape = 1)
input_context <- layer_input(shape = 1)
embedding <- layer_embedding(
input_dim = tokenizer$num_words + 1,
output_dim = embedding_size,
input_length = 1,
name = "embedding"
)
target_vector <- input_target %>%
embedding() %>%
layer_flatten()
context_vector <- input_context %>%
embedding() %>%
layer_flatten()
dot_product <- layer_dot(list(target_vector, context_vector), axes = 1)
output <- layer_dense(dot_product, units = 1, activation = "sigmoid")
model <- keras_model(list(input_target, input_context), output)
model %>% compile(loss = "binary_crossentropy", optimizer = "adam")
summary(model)
model %>%
fit_generator(
skipgrams_generator(reviews, tokenizer, skip_window, negative_samples),
steps_per_epoch = 100000, epochs = 5
)