Я использую этот код для исправления орфографии.версия Python 3.6.5.Я выполняю этот код в блокноте Jupyter.
код:
import os
import errno
from collections import Counter
from hashlib import sha256
import re
import json
import itertools
import logging
import requests
import numpy as np
import pandas as pd
from numpy.random import choice as random_choice, randint as random_randint,
shuffle as random_shuffle, seed as random_seed, rand
from numpy import zeros as np_zeros
from keras.models import Sequential, load_model
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector,
Dropout, recurrent
from keras.callbacks import Callback
LOGGER = logging.getLogger(__name__)
LOGGER.addHandler(logging.StreamHandler())
LOGGER.setLevel(logging.DEBUG)
random_seed(123)
class Configuration(object):
CONFIG = Configuration()
CONFIG.input_layers = 2
CONFIG.output_layers = 2
CONFIG.amount_of_dropout = 0.2
CONFIG.hidden_size = 500
CONFIG.initialization = "he_normal"
CONFIG.number_of_chars = 26
CONFIG.max_input_len = 20
CONFIG.inverted = True
CONFIG.batch_size = 4
CONFIG.epochs = 50
CONFIG.steps_per_epoch = 10
CONFIG.validation_steps = 10
CONFIG.number_of_iterations = 10
dataset=pd.read_csv("inpspell_wordpair2.csv")
input_data=dataset['input'].tolist()
input_data1=str(input_data)
output_data=dataset['output'].tolist()
output_data1=str(output_data)
chars_input = set.union(*(set(input_data1) for inp in input_data1))
chars_output = set.union(*(set(output_data1) for op in output_data1))
chars = list(set.union(chars_input, chars_output))
chars=list(" abcdefghijklmnopqrstuvwxyz")
MIN_INPUT_LEN = 1
AMOUNT_OF_NOISE = 0.2 / CONFIG.max_input_len
CHARS = list(" abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")
class CharacterTable(object):
def __init__(self, chars):
self.chars = sorted(set(chars))
self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
self.indices_char = dict((i, c) for i, c in enumerate(self.chars))
@property
def size(self):
return len(self.chars)
def encode(self, C, maxlen):
X = np.zeros((maxlen, len(self.chars)), dtype=np.bool) #
for i, c in enumerate(C):
X[i, self.char_indices[c]] = 1
return X
def decode(self, X, calc_argmax=True):
if calc_argmax:
X = X.argmax(axis=-1)
return ''.join(self.indices_char[x] for x in X)
def _vectorize(questions, answers, ctable):
len_of_questions = len(questions)
X = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size),dtype=int)
print("inputchars")
for i in range(len(questions)):
print(i)
sentence = questions.pop()
print(sentence)
for j, c in enumerate(sentence):
print(j,c)
try:
X[i, j, ctable.char_indices[c]] = 1
except KeyError:
pass
y = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size),
dtype=int)
print("outputchars")
for i in range(len(answers)):
print(i)
sentence = answers.pop()
print(sentence)
for j, c in enumerate(sentence):
try:
y[i, j, ctable.char_indices[c]] = 1
except KeyError:
pass
return X, y
def vectorize(questions, answers, chars=None):
print('Vectorization...')
chars = chars or CHARS
ctable = CharacterTable(chars)
print("inputdata before _vec")
print(questions)
X, y = _vectorize(questions, answers, ctable)
print(X.shape)
print(y.shape)
return X, y, CONFIG.max_input_len, ctable
def generate_model(output_len, chars=None):
print('Build model...')
chars = chars or CHARS
model = Sequential()
for layer_number in range(CONFIG.input_layers):
model.add(recurrent.LSTM(CONFIG.hidden_size, input_shape=(None, len(chars)), kernel_initializer=CONFIG.initialization, return_sequences=layer_number + 1 < CONFIG.input_layers))
model.add(Dropout(CONFIG.amount_of_dropout))
model.add(RepeatVector(output_len))
for _ in range(CONFIG.output_layers):
model.add(recurrent.LSTM(CONFIG.hidden_size, return_sequences=True,
kernel_initializer=CONFIG.initialization))
model.add(Dropout(CONFIG.amount_of_dropout))
model.add(TimeDistributed(Dense(len(chars), kernel_initializer=CONFIG.initialization)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
for iteration in range(1, CONFIG.number_of_iterations):
model.fit(X_train, y_train, batch_size=CONFIG.batch_size,
epochs=CONFIG.epochs, validation_data=(X_val, y_val))
X_train,y_train, y_maxlen, ctable = vectorize(input_data, output_data,
chars)
print ("y_maxlen, chars", y_maxlen, "".join(chars))
model = generate_model(y_maxlen, chars)
iterate_training(model, X_train, y_train, X_train, y_train, ctable)
for inp in X_train:
inputarray = ctable.decode(inp)
print(inputarray)
prediction=model.predict_classes(X_train, verbose=0)
for p in prediction:
guess = ctable.decode(p, calc_argmax=False)
print(guess)
for op in y_train:
correct = ctable.decode(op)
print(correct)
Для приведенного выше кода я взял 50 образцов слов в качестве входных данных.Но после выполнения я получаю правильное слово с дополнением последнего символа слова до максимальной длины ввода.Вывод будет таким, как показано ниже:
blangggggggggggggggg
аккумулятор * rrrrrrrrr
plateeeeeeeeeeeeeeee
pipeeeeeeeeeeeeeeeee
universallllllllllllllllllllllllllllllllllllllllllll (*) *
universallllllllllll * 101e * ellralllllll * 10161018 *
wheellllllllllllllll
kittsiveeellllllllll
solidddddddddddddddd
вводимые слова:
блан
аккумулятор
пластина
труба
универсальный
абразивный
круг
комплект
твердый
Как мне справиться с этой проблемой.