I am trying to code my own seq2seq NMT based on the tutorial found at https://github.com/tensorflow/nmt/. This is my implementation of the decoder without an attention mechanism:
decoder_cells = tf.contrib.rnn.MultiRNNCell(
    [tf.contrib.rnn.LSTMCell(num_units) for _ in range(num_layers)])

with tf.variable_scope("training_decoder"):
    # Helper
    projection_layer = tf.layers.Dense(vi_vocab_size)
    helper = tf.contrib.seq2seq.TrainingHelper(
        decoder_embed_input, sequence_length=decoder_input_lengths)
    # Decoder
    decoder = tf.contrib.seq2seq.BasicDecoder(
        decoder_cells, helper, encoder_state,
        output_layer=projection_layer)
    # Dynamic decoding
    training_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder,
        impute_finished=True,
        maximum_iterations=None)
    # BasicDecoderOutput: the logits live in .rnn_output
    training_logits = training_outputs.rnn_output
with tf.variable_scope("inference_decoder", reuse=True):
# Helper
start_tokens = tf.map_fn(lambda x: vi_sos_id, decoder_input_lengths)
helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding_decoder,
start_tokens,
vi_eos_id)
# Decoder
decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cells,
helper,
encoder_state,
output_layer=projection_layer)
#Dynamic decoding
prediction_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder,
impute_finished=True,
maximum_iterations=maximum_iterations)
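For context, the loss side of the training graph is the usual masked cross-entropy on training_logits; a minimal sketch (decoder_targets and target_weights are placeholders for the padded target ids and their 0/1 mask, which are not shown above):

loss = tf.contrib.seq2seq.sequence_loss(
    logits=training_logits,    # [batch_size, max_target_len, vi_vocab_size]
    targets=decoder_targets,   # [batch_size, max_target_len] int ids
    weights=target_weights)    # [batch_size, max_target_len] 0/1 float mask
train_op = tf.train.AdamOptimizer().minimize(loss)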
When I train the model, I get a BLEU score of about 0.005 on the dev sets after 2000 batches (batch_size is 50). However, when I implement the attention mechanism as follows:
decoder_cells = tf.contrib.rnn.MultiRNNCell(
    [tf.contrib.rnn.LSTMCell(num_units) for _ in range(num_layers)])

with tf.variable_scope("training_decoder"):
    # Helper
    projection_layer = tf.layers.Dense(vi_vocab_size)
    helper = tf.contrib.seq2seq.TrainingHelper(
        decoder_embed_input, sequence_length=decoder_input_lengths)
    # Create an attention mechanism
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        num_units, encoder_outputs)
    # Attention wrapper
    attention_decoder_cells = tf.contrib.seq2seq.AttentionWrapper(
        cell=decoder_cells,
        attention_mechanism=attention_mechanism,
        attention_layer_size=num_units)
    # Attention initial state: zero attention state, but carry over
    # the encoder's final state as the cell state
    attention_initial_state = attention_decoder_cells.zero_state(
        dtype=tf.float32, batch_size=num_sentences).clone(
            cell_state=encoder_state)
    # Attention decoder
    decoder = tf.contrib.seq2seq.BasicDecoder(
        attention_decoder_cells, helper,
        initial_state=attention_initial_state,
        output_layer=projection_layer)
    # Dynamic decoding
    training_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder,
        impute_finished=True,
        maximum_iterations=None)
    # BasicDecoderOutput: the logits live in .rnn_output
    training_logits = training_outputs.rnn_output
with tf.variable_scope("inference_decoder", reuse=tf.AUTO_REUSE):
# Helper
start_tokens = tf.map_fn(lambda x: vi_sos_id, decoder_input_lengths)
helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding_decoder,
start_tokens,
vi_eos_id)
# Create an attention mechanism
attention_mechanism = tf.contrib.seq2seq.LuongAttention(
num_units, encoder_outputs)
# Attention wrapper
attention_decoder_cells = tf.contrib.seq2seq.AttentionWrapper(
cell=decoder_cells,
attention_mechanism=attention_mechanism,
attention_layer_size=num_units)
# Attention initial state
attention_initial_state = attention_decoder_cells.zero_state(dtype=tf.float32, batch_size=num_sentences).clone(
cell_state=encoder_state)
# Decoder
decoder = tf.contrib.seq2seq.BasicDecoder(attention_decoder_cells,
helper,
attention_initial_state,
output_layer=projection_layer)
#Dynamic decoding
prediction_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder,
impute_finished=True,
maximum_iterations=maximum_iterations)
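At evaluation time I read the greedy ids out of prediction_outputs.sample_id and cut every hypothesis at the first vi_eos_id before computing BLEU; a minimal sketch of that post-processing (reverse_vocab is a placeholder for my id-to-token lookup):

# Greedy token ids chosen by the helper, shape [batch_size, decoded_len]
predicted_ids = prediction_outputs.sample_id

def ids_to_tokens(ids_row, reverse_vocab, eos_id):
    # Truncate one hypothesis at the first EOS and map ids back to tokens.
    tokens = []
    for token_id in ids_row:
        if token_id == eos_id:
            break
        tokens.append(reverse_vocab[token_id])
    return tokens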
My BLEU score on the dev sets is consistently 0 even after 6000 batches. Can anyone tell me what the problem is?
EDIT: After I set output_attention=False for the AttentionWrapper, I get a BLEU score of about 0.001 on the dev sets after 6000 batches. But this doesn't look like the right fix, since the TensorFlow documentation states output_attention=True for Luong-style attention mechanisms, and I am using LuongAttention in my code.
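Concretely, the only change behind this EDIT is one extra argument to the wrapper; a sketch of the modified call (everything else stays as in the code above):

attention_decoder_cells = tf.contrib.seq2seq.AttentionWrapper(
    cell=decoder_cells,
    attention_mechanism=attention_mechanism,
    attention_layer_size=num_units,
    output_attention=False)  # default is True, which the docs describe
                             # as the Luong-style setting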