Неправильный вывод при выполнении программы проверки орфографии - PullRequest
0 голосов
/ 06 декабря 2018

Я использую этот код для исправления орфографии. Версия Python — 3.6.5. Код выполняется в блокноте Jupyter.

код:

import errno
import itertools
import json
import logging
import os
import re
from collections import Counter
from hashlib import sha256

import numpy as np
import pandas as pd
import requests
from keras.callbacks import Callback
from keras.layers import (
    Activation,
    Dense,
    Dropout,
    RepeatVector,
    TimeDistributed,
    recurrent,
)
from keras.models import Sequential, load_model
from numpy import zeros as np_zeros
from numpy.random import (
    choice as random_choice,
    rand,
    randint as random_randint,
    seed as random_seed,
    shuffle as random_shuffle,
)

# Module-wide logger: emit everything from DEBUG upwards through a stream
# handler (logging.StreamHandler writes to stderr by default).
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.DEBUG)
LOGGER.addHandler(logging.StreamHandler())

# Seed NumPy's global random generator with a fixed value.
random_seed(123)

class Configuration(object):
    """Plain attribute container for the script's hyper-parameters."""


# The shared instance must be created *after* the class statement has
# finished; instantiating it inside the class body raises NameError because
# the name ``Configuration`` is not bound yet.
CONFIG = Configuration()

# Number of stacked LSTM layers before / after the RepeatVector in
# generate_model().
CONFIG.input_layers = 2
CONFIG.output_layers = 2
# Rate passed to every Dropout layer.
CONFIG.amount_of_dropout = 0.2
# Units per LSTM layer.
CONFIG.hidden_size = 500
# Kernel initializer name handed to the LSTM and Dense layers.
CONFIG.initialization = "he_normal"
# NOTE(review): not read anywhere in the visible code.
CONFIG.number_of_chars = 26
# Length of the time axis of the one-hot arrays; also used as the output
# sequence length of the model.
CONFIG.max_input_len = 20
# NOTE(review): not read anywhere in the visible code.
CONFIG.inverted = True
# Arguments forwarded to model.fit() by iterate_training().
CONFIG.batch_size = 4
CONFIG.epochs = 50
# NOTE(review): the two step counts are not read anywhere in the visible code.
CONFIG.steps_per_epoch = 10
CONFIG.validation_steps = 10
# Number of model.fit() rounds requested from iterate_training().
CONFIG.number_of_iterations = 10

# Load the word pairs; the CSV is expected to provide 'input' and 'output'
# columns.
dataset = pd.read_csv("inpspell_wordpair2.csv")

input_data = dataset['input'].tolist()
input_data1 = str(input_data)

output_data = dataset['output'].tolist()
output_data1 = str(output_data)

# Distinct characters of the stringified lists. A single set() call gives the
# same result as unioning one identical set per character, without the
# quadratic cost. Note that these sets also contain the list punctuation
# produced by str() (brackets, quotes, commas).
chars_input = set(input_data1)
chars_output = set(output_data1)

# The alphabet used for training is fixed to a space plus lowercase a-z
# rather than being derived from the data.
chars = list(" abcdefghijklmnopqrstuvwxyz")

MIN_INPUT_LEN = 1

AMOUNT_OF_NOISE = 0.2 / CONFIG.max_input_len

# Fallback alphabet used by vectorize() / generate_model() when no explicit
# character list is passed.
CHARS = list(" abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")

class CharacterTable(object):
    """Bidirectional mapping between characters and one-hot indices.

    The alphabet is the sorted set of the characters passed in, so duplicate
    characters are collapsed and indices follow code-point order.
    """

    def __init__(self, chars):
        """Build the lookup tables from an iterable of characters."""
        self.chars = sorted(set(chars))
        self.char_indices = {c: i for i, c in enumerate(self.chars)}
        self.indices_char = dict(enumerate(self.chars))

    @property
    def size(self):
        """Number of distinct characters in the table."""
        return len(self.chars)

    def encode(self, C, maxlen):
        """One-hot encode the string ``C`` into a ``(maxlen, size)`` bool array.

        Rows past ``len(C)`` stay all-False. Raises KeyError for a character
        that is not in the table and IndexError if ``C`` is longer than
        ``maxlen``.
        """
        # Builtin ``bool`` is what the removed ``np.bool`` alias pointed to.
        X = np.zeros((maxlen, len(self.chars)), dtype=bool)
        for i, c in enumerate(C):
            X[i, self.char_indices[c]] = 1
        return X

    def decode(self, X, calc_argmax=True):
        """Turn one-hot rows (or class indices) back into a string.

        With ``calc_argmax=True`` ``X`` is reduced with ``argmax`` over its
        last axis first; with ``calc_argmax=False`` ``X`` must already be an
        iterable of character indices.
        """
        if calc_argmax:
            X = X.argmax(axis=-1)
        return ''.join(self.indices_char[x] for x in X)

def _one_hot_sentences(sentences, n_rows, ctable, trace_chars=False, pad_char=' '):
    """One-hot encode ``sentences`` into an ``(n_rows, max_input_len, ctable.size)`` int array.

    Rows are filled from the last sentence to the first (row 0 holds
    ``sentences[-1]``) and ``sentences`` itself is left untouched. Characters
    missing from ``ctable`` are skipped. Every timestep after the end of a
    sentence is marked with ``pad_char`` when the table contains it.
    """
    encoded = np_zeros((n_rows, CONFIG.max_input_len, ctable.size), dtype=int)
    pad_index = ctable.char_indices.get(pad_char)
    for i, sentence in enumerate(reversed(sentences)):
        print(i)
        print(sentence)
        for j, c in enumerate(sentence):
            if trace_chars:
                print(j, c)
            char_index = ctable.char_indices.get(c)
            if char_index is not None:
                encoded[i, j, char_index] = 1
        # Explicit padding: an all-zero target row contributes nothing to the
        # categorical cross-entropy, so the network is never taught what to
        # emit after the end of a word and just keeps repeating the last
        # character. A one-hot pad symbol gives those timesteps a real target.
        if pad_index is not None:
            encoded[i, len(sentence):, pad_index] = 1
    return encoded


def _vectorize(questions, answers, ctable):
    """Encode question/answer lists as aligned one-hot arrays.

    Args:
        questions: list of input strings.
        answers: list of target strings; expected to be parallel to
            ``questions``.
        ctable: CharacterTable providing ``char_indices`` and ``size``.

    Returns:
        Tuple ``(X, y)``; both arrays have shape
        ``(len(questions), CONFIG.max_input_len, ctable.size)`` and are padded
        with the space character. Row ``i`` of both arrays corresponds to the
        ``i``-th pair counted from the end of the lists.

    Raises:
        IndexError: if a string is longer than ``CONFIG.max_input_len`` or
            ``answers`` has more entries than ``questions``.
    """
    len_of_questions = len(questions)

    print("inputchars")
    X = _one_hot_sentences(questions, len_of_questions, ctable, trace_chars=True)

    print("outputchars")
    y = _one_hot_sentences(answers, len_of_questions, ctable)
    return X, y

def vectorize(questions, answers, chars=None):
    """Build a CharacterTable and one-hot encode the question/answer pairs.

    Args:
        questions: list of input strings.
        answers: list of target strings.
        chars: alphabet to use; falls back to the module-level ``CHARS`` when
            omitted or empty.

    Returns:
        Tuple ``(X, y, CONFIG.max_input_len, ctable)``.
    """
    print('Vectorization...')
    alphabet = chars if chars else CHARS
    table = CharacterTable(alphabet)

    print("inputdata before _vec")
    print(questions)
    encoded_inputs, encoded_targets = _vectorize(questions, answers, table)

    # Report the shapes of both arrays, inputs first.
    for encoded in (encoded_inputs, encoded_targets):
        print(encoded.shape)

    return encoded_inputs, encoded_targets, CONFIG.max_input_len, table

def generate_model(output_len, chars=None):
    """Assemble and compile the LSTM sequence-to-sequence network.

    Args:
        output_len: number of timesteps the RepeatVector layer produces.
        chars: alphabet whose length sets the input/output width; falls back
            to the module-level ``CHARS`` when omitted or empty.

    Returns:
        The compiled ``Sequential`` model.
    """
    print('Build model...')
    alphabet = chars or CHARS
    vocab_size = len(alphabet)
    network = Sequential()

    # Input-side stack: every LSTM except the last one returns full sequences.
    last_input_layer = CONFIG.input_layers - 1
    for depth in range(CONFIG.input_layers):
        network.add(
            recurrent.LSTM(
                CONFIG.hidden_size,
                input_shape=(None, vocab_size),
                kernel_initializer=CONFIG.initialization,
                return_sequences=depth < last_input_layer,
            )
        )
        network.add(Dropout(CONFIG.amount_of_dropout))

    # Repeat the last LSTM output once per output timestep.
    network.add(RepeatVector(output_len))

    # Output-side stack: all LSTMs return sequences.
    for _ in range(CONFIG.output_layers):
        network.add(
            recurrent.LSTM(
                CONFIG.hidden_size,
                return_sequences=True,
                kernel_initializer=CONFIG.initialization,
            )
        )
        network.add(Dropout(CONFIG.amount_of_dropout))

    # Per-timestep dense projection onto the alphabet, followed by softmax.
    projection = Dense(vocab_size, kernel_initializer=CONFIG.initialization)
    network.add(TimeDistributed(projection))
    network.add(Activation('softmax'))

    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
    """Fit ``model`` repeatedly on the training arrays.

    Runs ``CONFIG.number_of_iterations`` rounds of ``model.fit``; each round
    trains for ``CONFIG.epochs`` epochs with ``CONFIG.batch_size`` and
    validates on ``(X_val, y_val)``.

    Args:
        model: compiled model exposing ``fit``.
        X_train, y_train: training inputs and targets.
        X_val, y_val: validation inputs and targets.
        ctable: not used here; kept so existing callers keep working.
    """
    # range(N) rather than range(1, N): the latter runs one round fewer than
    # the configured number of iterations.
    for _ in range(CONFIG.number_of_iterations):
        model.fit(
            X_train,
            y_train,
            batch_size=CONFIG.batch_size,
            epochs=CONFIG.epochs,
            validation_data=(X_val, y_val),
        )

# Encode the word pairs, then build and train the model. The training data
# doubles as validation data here.
X_train, y_train, y_maxlen, ctable = vectorize(input_data, output_data, chars)

print("y_maxlen, chars", y_maxlen, "".join(chars))

model = generate_model(y_maxlen, chars)

iterate_training(model, X_train, y_train, X_train, y_train, ctable)

# Echo the encoded inputs back as text.
for inp in X_train:
    inputarray = ctable.decode(inp)
    print(inputarray)

# Sequential.predict_classes() is gone from current Keras releases; taking
# the argmax over the class axis of predict() yields the same class indices
# and works on old and new versions alike.
prediction = model.predict(X_train, verbose=0).argmax(axis=-1)

# The predictions are already class indices, so decode() must not argmax.
for p in prediction:
    guess = ctable.decode(p, calc_argmax=False)
    print(guess)

# Print the expected outputs for comparison.
for op in y_train:
    correct = ctable.decode(op)
    print(correct)

Для приведённого выше кода я взял 50 слов в качестве входных данных. Однако после выполнения я получаю правильное слово, дополненное повторами его последнего символа до максимальной длины ввода. Вывод выглядит так:

blangggggggggggggggg

accumulatorrrrrrrrrr

plateeeeeeeeeeeeeeee

pipeeeeeeeeeeeeeeeee

universallllllllllll

universallllllllllll * 101e * ellralllllll * 10161018 *

wheellllllllllllllll

kittsiveeellllllllll

solidddddddddddddddd

вводимые слова:

blan
accumulator
plate
pipe
universal
abrasive
wheel
kit
solid

Как мне решить эту проблему?

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...