Как предсказать настроение немаркированных данных, используя модель Gensim word2ve c? - PullRequest
0 голосов
/ 10 апреля 2020

Я обучил и протестировал набор данных IMDb mov ie reviews, используя модель Gensim word2ve c, и хочу предсказать настроения моих собственных немаркированных данных. Я пытался, но получил ошибку. Я повторно использую код с открытым исходным кодом. Ниже приведен полный код:

import pandas as pd
import numpy as np
import text_normalizer as tn
import model_evaluation_utils as meu
np.set_printoptions(precision=2, linewidth=80)
import gensim
import keras
from keras.models import Sequential
from keras.layers import Dropout, Activation, Dense
from sklearn.preprocessing import LabelEncoder

dataset = pd.read_csv(r'imdb_reviews.csv')
new_data = pd.read_csv('abcd.csv', header=0)
# take a peek at the data
print(dataset.head())
reviews = np.array(dataset['reviews'])
sentiments = np.array(dataset['Sentiments'])

# build train and test datasets
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

# normalize datasets
norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)
le = LabelEncoder()
num_classes=2 
# tokenize train reviews & encode train labels
tokenized_train = [tn.tokenizer.tokenize(text)
                   for text in norm_train_reviews]
y_tr = le.fit_transform(train_sentiments)
y_train = keras.utils.to_categorical(y_tr, num_classes)
# tokenize test reviews & encode test labels
tokenized_test = [tn.tokenizer.tokenize(text)
                   for text in norm_test_reviews]
y_ts = le.fit_transform(test_sentiments)
y_test = keras.utils.to_categorical(y_ts, num_classes)
# print class label encoding map and encoded labels
print('Sentiment class label map:', dict(zip(le.classes_, le.transform(le.classes_))))
print('Sample test label transformation:\n'+'-'*35,
      '\nActual Labels:', test_sentiments[:3], '\nEncoded Labels:', y_ts[:3], 
      '\nOne hot encoded Labels:\n', y_test[:3])
# build word2vec model
w2v_num_features = 500
w2v_model = gensim.models.Word2Vec(tokenized_train, size=w2v_num_features, window=150,
                                   min_count=10, sample=1e-3)
def averaged_word2vec_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)

    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.

        for word in words:
            if word in vocabulary: 
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)
# generate averaged word vector features from word2vec model
avg_wv_train_features = averaged_word2vec_vectorizer(corpus=tokenized_train, model=w2v_model,
                                                     num_features=500)
avg_wv_test_features = averaged_word2vec_vectorizer(corpus=tokenized_test, model=w2v_model,
                                                    num_features=500)
print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape, ' Test features shape:', avg_wv_test_features
def construct_deepnn_architecture(num_input_features):
    dnn_model = Sequential()
    dnn_model.add(Dense(512, activation='relu', input_shape=(num_input_features,)))
    dnn_model.add(Dropout(0.2))
    dnn_model.add(Dense(512, activation='relu'))
    dnn_model.add(Dropout(0.2))
    dnn_model.add(Dense(512, activation='relu'))
    dnn_model.add(Dropout(0.2))
    dnn_model.add(Dense(2))
    dnn_model.add(Activation('softmax'))

    dnn_model.compile(loss='categorical_crossentropy', optimizer='adam',                 
                      metrics=['accuracy'])
    return dnn_model
w2v_dnn = construct_deepnn_architecture(num_input_features=500)
batch_size = 100
w2v_dnn.fit(avg_wv_train_features, y_train, epochs=15, batch_size=batch_size, 
            shuffle=True, validation_split=0.1, verbose=1)
y_pred = w2v_dnn.predict_classes(avg_wv_test_features)
predictions = le.inverse_transform(y_pred)
meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predictions, 
                                      classes=['positive', 'negative'])
# This I added to predict and save the results of my own data
pred_y2 = w2v_dnn.predict_classes(new_data['Articles'])
print(pred_y2)
pd.DataFrame(pred_y2, columns=['Sentiments']).to_csv('abcd_sentiments.csv')

Когда я запускаю этот код, я получаю следующую ошибку:

ValueError Traceback (последний вызов был последним) в ----> 1 pred_y2 = w2v_dnn.predict_classes (new_data ['Articles']) 2 print (pred_y2) 3 pd.DataFrame (pred_y2, columns = ['Sentiments']). to_csv ('abcd_sentiments.csv')

~ / PycharmProjects / News / venv / lib / python3 .7 / site-packages / keras / engine / sequential.py в предикатных классах (self, x, batch_size, verbose) 266 A numpy массив предсказаний классов. 267 "" "-> 268 proba = self.predict (x, batch_size = batch_size, verbose = verbose) 269, если proba.shape [-1]> 1: 270 возвращают proba.argmax (axis = -1)

~ / PycharmProjects / News / venv / lib / python3 .7 / site-packages / keras / engine / training.py в предиктах (self, x, batch_size, verbose, steps, callbacks, max_queue_size, worker, use_multiprocessing ) 1439 1440
# Случай 2: символьные c тензоры или Numpy в виде массива. -> 1441 x, _, _ = self._standardize_user_data (x) 1442, если self.stateful: 1443, если x [0] .shape [0]> batch_size и x [0] .shape [0]% batch_size! = 0:

~ / PycharmProjects / News / venv / lib / python3 .7 / site-packages / keras /engine/training.py в _standardize_user_data (self, x, y, sample_weight, class_weight, check_array_lengths, batch_size) 577 feed_input_shapes, 578 check_batch_axis = False, # Не применять размер пакета. -> 579 исключение_prefix = 'input') 580 581, если y не None:

~ / PycharmProjects / News / venv / lib / python3 .7 / site-packages / keras / engine / training_utils.py в standardize_ input_data (данные, имена, формы, check_batch_axis, exception_prefix) 143 ': ожидается' + имена [i] + 'будут иметь форму' + 144 str (shape) + ', но получили массив с формой' + -> 145 str (data_shape )) 146 возвращают данные 147

ValueError: Ошибка при проверке ввода: ожидалось, что dens_1_input имеет форму (500,), но получил массив с формой (1,)

Может кто-нибудь подсказать мне Как решить эту ошибку и предсказать настроение моих немаркированных данных? Я использую python 3.7 и блокнот jupyter от Pycharm IDE.

Заранее спасибо.

...