My goal is to generate a question from the answer given as input. I am using a sequence-to-sequence model built with LSTM layers and a softmax activation. I randomly pick a question from the database, then have the network generate a question from the answer supplied as input; the generated question is compared with the randomly picked one using SequenceMatcher to check their similarity. The problem is that the generated question does not match the given answer correctly, and the randomly picked question also comes out only partial. Here is my code, written in the Spyder IDE:
import numpy as np
import tensorflow as tf
import pickle
import json
from tensorflow.keras import layers , activations , models , preprocessing
from difflib import SequenceMatcher
import random
#print( tf.VERSION )
from tensorflow.keras import preprocessing , utils
#====================== Preparing Data ========================================
with open('data_1.json', encoding='utf-8') as f:
    d = json.load(f)
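# Expected JSON layout, as the loops below consume it:
# {"intents": [{"proficiency": [{"level": "P1", "questions": [{"question": ..., "responses": ...}]}]}]}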
questions_p1 = list()
questions_p2 = list()
questions_p3 = list()
questions_p4 = list()
questions = list()
answers = list()
for rep in d["intents"]:
for rep1 in rep["proficiency"]:
if(rep1["level"] == "P1"):
for rep2 in rep1["questions"]:
questions_p1.append(rep2["question"])
if(rep1["level"] == "P2"):
for rep2 in rep1["questions"]:
questions_p2.append(rep2["question"])
if(rep1["level"] == "P3"):
for rep2 in rep1["questions"]:
questions_p3.append(rep2["question"])
if(rep1["level"] == "P4"):
for rep2 in rep1["questions"]:
questions_p4.append(rep2["question"])
for rep in d["intents"]:
for rep1 in rep["proficiency"]:
for rep2 in rep1["questions"]:
questions.append(rep2["question"])
for rep2 in rep1["questions"]:
answers.append(rep2["responses"])
#=========================================================================
# answers_with_tags = list()
# for i in range( len( answers ) ):
#     answers_with_tags.append( answers[i] )
# answers = list()
# for i in answers_with_tags:
#     for j in i:
#         answers.append( '<START> ' + j + ' <END>' )
#print(answers)
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( questions + answers )
VOCAB_SIZE = len( tokenizer.word_index )+1
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))
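# The tokenizer is fit on questions and answers together, so encoder and decoder share one word_index / vocabulary.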
#======================= Neural Network =======================================
from gensim.models import Word2Vec
vocab = []
for word in tokenizer.word_index:
    vocab.append( word )
print(vocab)
# def tokenize( sentences ):
#     tokens_list = []
#     vocabulary = []
#     for sentence in sentences:
#         sentence = sentence.lower()
#         sentence = re.sub( '[^a-zA-Z]', ' ', sentence )
#         tokens = sentence.split()
#         vocabulary += tokens
#         tokens_list.append( tokens )
#     return tokens_list , vocabulary
a = list()
for i in tokenizer.word_index.keys():
    a.append(i)
# p = tokenize( questions + answers )
model = Word2Vec( [a] , min_count = 1)
embedding_matrix = np.zeros( ( VOCAB_SIZE , 100 ) )
for i in range( len( tokenizer.word_index ) ):
    embedding_matrix[ i ] = model[ vocab[i] ]
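# embedding_matrix collects the 100-dimensional Word2Vec vectors; it is used below to initialise both Embedding layers.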
#========================== LSTM Prep =========================================
#encoder_input__1
tokenized_answers = tokenizer.texts_to_sequences( answers )
maxlen_answers = max( [ len(x) for x in tokenized_answers ] )
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers, padding='post' )
encoder_input_data = np.array( padded_answers )
print( encoder_input_data.shape , maxlen_answers )
#decoder_input_data
tokenized_questions = tokenizer.texts_to_sequences( questions )
maxlen_questions = max( [ len(x) for x in tokenized_questions ] )
padded_questions = preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' )
decoder_input_data = np.array( padded_questions )
print( decoder_input_data.shape , maxlen_questions )
#decoder_output_data
tokenized_questions = tokenizer.texts_to_sequences( questions )
# for i in range(len(tokenized_questions)) :
# tokenized_questions[i] = tokenized_questions[i][1:]
padded_questions = preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' )
onehot_questions = utils.to_categorical( padded_questions , VOCAB_SIZE )
decoder_output_data = np.array( onehot_questions )
print( decoder_output_data.shape )
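# Training data layout: the encoder reads the padded answer, the decoder reads the padded question
# (teacher forcing), and the one-hot encoded question is the training target.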
#============================ Implementing LSTM ===============================
encoder_inputs = tf.keras.layers.Input(shape=( None , ))
encoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 100 , mask_zero=True, weights=[embedding_matrix], trainable = "false" ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 100 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]
decoder_inputs = tf.keras.layers.Input(shape=( None , ))
decoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 100 , mask_zero=True, weights=[embedding_matrix], trainable = "false") (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 100 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax )
output = decoder_dense ( decoder_outputs )
model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy', metrics=['mae', 'acc'])
#model = tf.keras.models.load_model('model.h5')
model.summary()
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=8 , epochs = 58)
model.save( 'model2.h5' )
#==============================================================================
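# Inference models: a standalone encoder that returns the LSTM states, and a decoder that reuses the
# trained layers so it can be stepped one token at a time from those states.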
def make_inference_models():
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    decoder_state_input_h = tf.keras.layers.Input(shape=( 100 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 100 ,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    return encoder_model , decoder_model
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append( tokenizer.word_index[ word ] )
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_questions , padding='post')
enc_model , dec_model = make_inference_models()
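# For each round: pick a random question, read an answer from stdin, greedily decode a question from it,
# then compare the generated question with the randomly picked one using SequenceMatcher.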
for _ in range(10):
    question = random.sample(questions,1)
    question = question[0][8:]
    question = question[:-6]
    print(question)
    states_values = enc_model.predict( str_to_tokens( input( 'Enter answer : ' ) ) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
        empty_target_seq = np.zeros( ( 1 , 1 ) )
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ]
    answer = decoded_translation
    ratio = SequenceMatcher(None, question, answer).ratio()
    print(decoded_translation)
    print(ratio)
    if(ratio > 0.5):
        print("Correct")
The output I get:
[output screenshot]