ValueError: Input arrays should have the same number of samples as target arrays. Found 1 input samples and 8 target samples
0 votes
/ 06 March 2020

My code classifies the emotion of YouTube comments into four classes: happy, sad, angry, and neutral. When I evaluate the model on the test set, I get the error:

ValueError: Input arrays should have the same number of samples as target arrays. Found 1 input samples and 8 target samples
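For context, Keras raises this ValueError whenever the first dimension (the sample count) of the input and target arrays passed to fit or evaluate disagree. A minimal standalone sketch that reproduces it with standalone Keras 2.x (the shapes here are made up purely for illustration):

import numpy as np
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(1, activation='sigmoid', input_shape=(4,)))
model.compile(loss='binary_crossentropy', optimizer='adam')

X = np.zeros((1, 4))  # 1 input sample
y = np.zeros((8,))    # 8 target samples -> mismatch

# raises: ValueError: Input arrays should have the same number of samples
# as target arrays. Found 1 input samples and 8 target samples
model.evaluate(X, y)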


My code:

from string import punctuation
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc, vocab):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # filter out tokens not in vocab
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens
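# illustrative example (hypothetical input), assuming the vocab shown below:
# clean_doc('I love this video!!!', {'I', 'love', 'this', 'video'})
# -> 'I love this video'  (punctuation stripped, out-of-vocab tokens dropped)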

# load all docs in a directory
def process_docs(directory, vocab, is_train):
    documents = list()
    # filenames reserved for the test set (two files per emotion class)
    test_prefixes = ('a9', 'a10', 'h9', 'h10', 's9', 's10', 'n9', 'n10')
    # walk through all files in the folder
    for filename in listdir(directory):
        is_test_file = filename.startswith(test_prefixes)
        # skip test files when building the training set, and training files
        # when building the test set; the original unparenthesised and/or
        # chains grouped wrongly and filtered out almost every test file
        if is_train and is_test_file:
            continue
        if not is_train and not is_test_file:
            continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # load the doc
        doc = load_doc(path)
        # clean doc
        tokens = clean_doc(doc, vocab)
        # add to list
        documents.append(tokens)
    return documents

# load the vocabulary
vocab_filename = 'vocab1.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
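# storing the vocabulary as a set makes the membership test in clean_doc O(1)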

# load all training reviews
happy_docs = process_docs('/content/drive/My Drive/emotion_dataset/happy', vocab, True)
sad_docs = process_docs('/content/drive/My Drive/emotion_dataset/sad', vocab, True)
angry_docs = process_docs('/content/drive/My Drive/emotion_dataset/angry', vocab, True)
neutral_docs = process_docs('/content/drive/My Drive/emotion_dataset/neutral', vocab, True)
train_docs = happy_docs + sad_docs + angry_docs + neutral_docs

# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# pad sequences
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
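# Xtrain now has shape (len(train_docs), max_length)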
# define training labels
ytrain = array([0 for _ in range(8)] + [1 for _ in range(8)] + [2 for _ in range(8)] + [3 for _ in range(8)])
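# note: these labels assume process_docs returns exactly 8 training docs per
# class, in the happy, sad, angry, neutral order used to build train_docs;
# if the counts differ, fit() raises the same sample-count ValueError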

# load all test reviews
happy_docs = process_docs('/content/drive/My Drive/emotion_dataset/happy', vocab, False)
sad_docs = process_docs('/content/drive/My Drive/emotion_dataset/sad', vocab, False)
angry_docs = process_docs('/content/drive/My Drive/emotion_dataset/angry', vocab, False)
neutral_docs = process_docs('/content/drive/My Drive/emotion_dataset/neutral', vocab, False)
test_docs = happy_docs + sad_docs + angry_docs + neutral_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
ytest = array([0 for _ in range(2)] + [1 for _ in range(2)] + [2 for _ in range(2)] + [3 for _ in range(2)])
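# sanity check: if the train/test filter in process_docs is wrong, Xtest and
# ytest end up with different sample counts and model.evaluate below raises
# the ValueError from the question; printing the shapes exposes the mismatch
print('Xtest:', Xtest.shape, 'ytest:', ytest.shape)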

# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1
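# +1 because tokenizer.word_index is 1-based and index 0 is reserved for padding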

# define model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
# four emotion classes with integer labels 0-3, so the output layer needs
# 4 softmax units rather than the single sigmoid unit used originally
model.add(Dense(4, activation='softmax'))
model.summary()

# compile network
# sparse_categorical_crossentropy matches integer class labels 0-3
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, ytrain, epochs=5, verbose=2)


# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)  # the line that raised the ValueError
print('Test Accuracy: %f' % (acc*100))
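For reference, the sample-count mismatch in the original code came from operator precedence: "and" binds tighter than "or", so the test-set condition grouped as (not is_trian and not filename.startswith('a9')) or filename.startswith('a10') or ..., which skipped every test file except those starting with 'a9', evidently leaving a single input sample against the eight labels in ytest. A standalone sketch of the precedence rule:

# `and` binds tighter than `or`, so an unparenthesised chain groups left-first
print(False and True or True)    # True:  (False and True) or True
print(False and (True or True))  # False: explicit grouping changes the result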