Reverse chatbot: generating the question from an answer given as input - PullRequest
0 votes
/ March 13, 2020

My goal is to generate a question from an answer given as input. I use a sequence2sequence model built from LSTM layers with a softmax output. I randomly draw a question from the database, then generate a question from the answer typed in as input; the question produced by the neural network is compared with the randomly drawn question using SequenceMatcher to check their similarity. The problem is that the generated question does not actually correspond to the answer I type in, and the randomly drawn question also comes out truncated. Here is my code, written in the Spyder IDE:

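For reference, the parsing loops below assume data_1.json is shaped roughly like this (a minimal sketch reconstructed only from the key names used in my loops; the field values are placeholders, and "responses" may be a single string or a list of strings):

{
    "intents": [
        {
            "proficiency": [
                {
                    "level": "P1",
                    "questions": [
                        {
                            "question": "<START> what is a variable <END>",
                            "responses": "a variable stores data"
                        }
                    ]
                }
            ]
        }
    ]
}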
import numpy as np
import tensorflow as tf
import json
import random
from difflib import SequenceMatcher
from tensorflow.keras import preprocessing , utils

# print(tf.__version__)

#====================== Preparing Data ========================================

with open('data_1.json', encoding='utf-8') as f:
    d = json.load(f)

questions_p1 = list()
questions_p2 = list()
questions_p3 = list()
questions_p4 = list()

questions = list()
answers = list()


for rep in d["intents"]:
    for rep1 in rep["proficiency"]:
        if(rep1["level"] == "P1"):
            for rep2 in rep1["questions"]:
                questions_p1.append(rep2["question"])

        if(rep1["level"] == "P2"):
            for rep2 in rep1["questions"]:
                questions_p2.append(rep2["question"])

        if(rep1["level"] == "P3"):
            for rep2 in rep1["questions"]:
                questions_p3.append(rep2["question"])

        if(rep1["level"] == "P4"):
            for rep2 in rep1["questions"]:
                questions_p4.append(rep2["question"])


for rep in d["intents"]:
    for rep1 in rep["proficiency"]:
        for rep2 in rep1["questions"]:
            questions.append(rep2["question"])
        for rep2 in rep1["questions"]:
            answers.append(rep2["responses"])
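# Added sanity check: encoder and decoder samples are paired purely by
# position, so the two lists must stay the same length.
assert len( questions ) == len( answers )
print( 'PAIRS : {}'.format( len( questions ) ) )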

#=========================================================================
# answers_with_tags = list()
# for i in range( len( answers ) ):
#     answers_with_tags.append( answers[i] )


# answers = list()
# for i in answers_with_tags:
#     for j in i:
#         answers.append( '<START> ' + j + ' <END>' )

#print(answers)
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( questions + answers )

VOCAB_SIZE = len( tokenizer.word_index )+1
print( 'VOCAB SIZE : {}'.format( VOCAB_SIZE ))
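# Note: tokenizer.word_index is 1-based; index 0 is reserved for padding,
# hence the +1 in VOCAB_SIZE. Row 0 of any embedding matrix built from it
# must stay all zeros for mask_zero=True to behave correctly.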

#======================= Neural Network =======================================

from gensim.models import Word2Vec


vocab = []
for word in tokenizer.word_index:
    vocab.append( word )

print(vocab)
# def tokenize( sentences ):
#     tokens_list = []
#     vocabulary = []
#     for sentence in sentences:
#         sentence = sentence.lower()
#         sentence = re.sub( '[^a-zA-Z]', ' ', sentence )
#         tokens = sentence.split()
#         vocabulary += tokens
#         tokens_list.append( tokens )
#     return tokens_list , vocabulary

# The word list for Word2Vec is just the tokenizer vocabulary built above,
# so the separate list is redundant. Note: training on a single "sentence"
# of vocabulary words gives the embeddings very little co-occurrence signal;
# training on the tokenized questions/answers would be more meaningful.
# p = tokenize( questions + answers )
w2v_model = Word2Vec( [vocab] , min_count = 1 )  # vector size defaults to 100

# Keras' word_index is 1-based (index 0 is the padding slot), so each word's
# vector must go at its tokenizer index; filling row i with the vector of
# vocab[i] would shift every embedding by one position.
embedding_matrix = np.zeros( ( VOCAB_SIZE , 100 ) )
for word , index in tokenizer.word_index.items():
    embedding_matrix[ index ] = w2v_model.wv[ word ]
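# Added alignment check: the row stored for a word should be exactly the
# vector Word2Vec learned for it.
probe = vocab[0]  # any in-vocabulary word works here
assert np.allclose( embedding_matrix[ tokenizer.word_index[ probe ] ] , w2v_model.wv[ probe ] )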

#========================== LSTM Prep =========================================

#encoder_input_data
tokenized_answers = tokenizer.texts_to_sequences( answers )
maxlen_answers = max( [ len(x) for x in tokenized_answers ] )
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers, padding='post' )
encoder_input_data = np.array( padded_answers )
print( encoder_input_data.shape , maxlen_answers )

#decoder_input_data
tokenized_questions = tokenizer.texts_to_sequences( questions )
maxlen_questions = max( [ len(x) for x in tokenized_questions ] )
padded_questions = preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' )
decoder_input_data = np.array( padded_questions )
print( decoder_input_data.shape , maxlen_questions )

#decoder_output_data : the targets are the questions shifted one step to the
# left, so at each timestep the decoder learns to predict the *next* word
# (standard teacher forcing); without this shift it just learns to copy its input.
tokenized_questions = tokenizer.texts_to_sequences( questions )
for i in range(len(tokenized_questions)) :
    tokenized_questions[i] = tokenized_questions[i][1:]
padded_questions = preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' )
onehot_questions = utils.to_categorical( padded_questions , VOCAB_SIZE )
decoder_output_data = np.array( onehot_questions )
print( decoder_output_data.shape )
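# The three arrays now form the usual seq2seq training triple:
#   encoder_input_data  : (pairs, maxlen_answers)                answer token ids
#   decoder_input_data  : (pairs, maxlen_questions)              question ids, 'start' first
#   decoder_output_data : (pairs, maxlen_questions, VOCAB_SIZE)  one-hot targets, shifted left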

#============================ Implementing LSTM ===============================

encoder_inputs = tf.keras.layers.Input(shape=( None , ))
# trainable must be the boolean False; the string "false" is truthy in Python.
encoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 100 , mask_zero=True, weights=[embedding_matrix], trainable=False ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 100 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( None ,  ))
decoder_embedding = tf.keras.layers.Embedding( VOCAB_SIZE, 100 , mask_zero=True, weights=[embedding_matrix], trainable=False) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 100 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( VOCAB_SIZE , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy', metrics=['mae', 'acc'])
#model = tf.keras.models.load_model('model.h5')
model.summary()

model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=8 , epochs = 58) 
model.save( 'model2.h5' )
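# Note: the inference models below are wired from the *live* layer objects
# (encoder_inputs, decoder_lstm, ...), so they must be rebuilt in the same
# session; reloading model2.h5 later is not enough by itself.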

#==============================================================================

def make_inference_models():

    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)

    decoder_state_input_h = tf.keras.layers.Input(shape=( 100 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 100 ,))

    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)

    return encoder_model , decoder_model

def str_to_tokens( sentence : str ):
    # Skip words the tokenizer has never seen; a plain dict lookup would
    # raise a KeyError on any out-of-vocabulary word in the typed answer.
    words = sentence.lower().split()
    tokens_list = [ tokenizer.word_index[ word ] for word in words if word in tokenizer.word_index ]
    # The encoder was trained on answers, so pad to the answer length.
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_answers , padding='post')
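# Example (hypothetical input): str_to_tokens('a variable stores data')
# returns a (1, maxlen_answers) array of token ids, zero-padded on the right.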

enc_model , dec_model = make_inference_models()

for _ in range(10):
    # Questions are stored as '<START> ... <END>', so strip the 8-character
    # prefix and the 6-character suffix before comparing with SequenceMatcher.
    question = random.sample(questions,1)
    question = question[0][8:]
    question = question[:-6]
    print(question)
    states_values = enc_model.predict( str_to_tokens( input( 'Enter answer : ' ) ) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word

        # The decoder emits a question, so cap generation at the longest
        # question length rather than the longest answer.
        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_questions:
            stop_condition = True

        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    answer = decoded_translation
    ratio = SequenceMatcher(None, question, answer).ratio()
    print(decoded_translation)
    print(ratio)
    if(ratio > 0.5):
        print("Correct")

The output I get:

[screenshot of the output]
