Предварительная обработка текста в распознавании речи - PullRequest
0 голосов
/ 06 марта 2019

Я пытаюсь построить свою собственную сеть распознавания речи. Я понял, как предварительно обработать аудио. Но я не могу понять предварительную обработку текста.

У меня есть алфавит:

# Map each lowercase Latin letter to its 1-based position ('a' -> 1, ..., 'z' -> 26).
alphabet = {letter: position for position, letter in enumerate('abcdefghijklmnopqrstuvwxyz', start=1)}

И я кодирую каждую букву предложения в число (27 - это пробел):

array([list([27, 23, 8, 5, 14, 27, 8, 5, 27, 19, 16, 5, 1, 11, 19, 27, 9, 14, 27, 15, 21, 18, 27, 12, 1, 14, 7, 21, 1, 7, 5, 27, 9, 27, 3, 1, 14, 27, 9, 14, 20, 5, 18, 16, 18, 5, 20, 27, 23, 8, 1, 20, 27, 8, 5, 27, 8, 1, 19, 27, 19, 1, 9, 4, 27]),
   list([27, 19, 15, 27, 14, 15, 23, 27, 9, 27, 6, 5, 1, 18, 27, 14, 15, 20, 8, 9, 14, 7, 27, 2, 5, 3, 1, 21, 19, 5, 27, 9, 20, 27, 23, 1, 19, 27, 20, 8, 15, 19, 5, 27, 15, 13, 5, 14, 19, 27, 20, 8, 1, 20, 27, 2, 18, 15, 21, 7, 8, 20, 27, 25, 15, 21, 27, 20, 15, 27, 13, 5, 27]),
   list([27, 14, 9, 7, 8, 20, 27, 6, 5, 12, 12, 27, 1, 14, 4, 27, 1, 14, 27, 1, 19, 19, 15, 18, 20, 13, 5, 14, 20, 27, 15, 6, 27, 6, 9, 7, 8, 20, 9, 14, 7, 27, 13, 5, 14, 27, 1, 14, 4, 27, 13, 5, 18, 3, 8, 1, 14, 20, 19, 27, 5, 14, 20, 5, 18, 5, 4, 27, 1, 14, 4, 27, 5, 24, 9, 20, 5, 4, 27, 20, 8, 5, 27, 20, 5, 14, 20, 27]),
   list([27, 9, 27, 8, 5, 1, 18, 4, 27, 1, 27, 6, 1, 9, 14, 20, 27, 13, 15, 22, 5, 13, 5, 14, 20, 27, 21, 14, 4, 5, 18, 27, 13, 25, 27, 6, 5, 5, 20, 27]),
   list([27, 25, 15, 21, 27, 3, 1, 13, 5, 27, 19, 15, 27, 20, 8, 1, 20, 27, 25, 15, 21, 27, 3, 15, 21, 12, 4, 27, 12, 5, 1, 18, 14, 27, 1, 2, 15, 21, 20, 27, 25, 15, 21, 18, 27, 4, 18, 5, 1, 13, 19, 27, 19, 1, 9, 4, 27, 20, 8, 5, 27, 15, 12, 4, 27, 23, 15, 13, 1, 14, 27])],
  dtype=object)

Вот 5 предложений. Я просто создаю сеть из одного слоя и пытаюсь подать на неё эти данные, чтобы получить число, соответствующее букве.

# Single-layer network: one Dense layer with 27 softmax outputs that takes
# input vectors of length 20.
model = Sequential()
model.add(Dense(27, input_shape=(20,), activation='softmax'))
# NOTE(review): with a 27-unit output layer each target presumably has to have
# shape (27,) (e.g. one-hot encoded); integer labels of shape (1,) would not
# match — confirm against how y_train is built.
# NOTE(review): mean squared error combined with a softmax output is unusual
# for classification; categorical cross-entropy is the conventional choice —
# confirm intent.
model.compile(loss='mean_squared_error',optimizer='Adam', metrics=['accuracy'])

# Train batch by batch; batch() is defined elsewhere and is called here with a
# batch size of 5.
for X, y in batch(X_train, y_train, 5):
    model.train_on_batch(X, y)

batch() просто разбивает X_train и y_train на пакеты; 5 — размер пакета.

Но когда я пытаюсь запустить сеть, я получаю сообщение об ошибке:

Error when checking target: expected dense_25 to have shape (27,) but got array with shape (1,)

UPD: в качестве X я использую MFCC:

    # Load one training file as mono audio. sr=None presumably keeps the file's
    # native sampling rate — confirm against the librosa docs.
    # The path is joined with a literal backslash, i.e. a Windows-style separator.
    audio, sr = librosa.load(pathTrain+"\\"+str(file), mono=True, sr=None)
    # Compute the MFCC feature matrix for the loaded signal.
    # NOTE(review): sr is not forwarded to mfcc(), so it presumably falls back
    # to its default sampling rate rather than the file's — confirm.
    fileMFCC = librosa.feature.mfcc(audio)
    # Standardize the features: mean and standard deviation are computed along
    # axis 0 and broadcast back over that axis.
    mean_scale = np.mean(fileMFCC, axis=0)
    std_scale = np.std(fileMFCC, axis=0)
    # NOTE(review): a zero entry in std_scale would cause a division by zero
    # here — confirm this cannot happen for the input data.
    fileMFCC = (fileMFCC - mean_scale[np.newaxis, :]) / std_scale[np.newaxis, :]

X выглядит так:

[array([[-4.35889894, -4.35889894, -4.35455134, ..., -3.95851777,
     -3.99308173, -4.05261022],
    [ 0.22941573,  0.22941573,  0.31913073, ...,  1.87189324,
      1.7987301 ,  1.66804349],
    [ 0.22941573,  0.22941573,  0.31165866, ..., -0.27962786,
     -0.19009062, -0.13788484],
    ...,
    [ 0.22941573,  0.22941573,  0.18657944, ...,  0.14699792,
      0.12751924,  0.16724807],
    [ 0.22941573,  0.22941573,  0.18478513, ...,  0.00674492,
     -0.04570105,  0.01231168],
    [ 0.22941573,  0.22941573,  0.18232521, ...,  0.2571599 ,
      0.22477036,  0.09153304]]) 

и т.д.

...