How to use a confusion matrix after k-fold cross-validation - PullRequest
0 votes
27 January 2020

Please bear in mind that I am not a data scientist or developer.

My current predicament can be described as follows:

I am trying to carry out a multi-class text classification project. Right now I am just using a toy dataset with 10 samples in each class. Each sample can have one of 4 possible labels (A1, A2, B1, B2), encoded as 0, 1, 2, 3. As a validation strategy I am implementing k-fold cross-validation (k=4), but when I try to create a confusion matrix I run into a conflict, because the folds leave me with different numbers of actual and predicted samples.
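As far as I understand, scikit-learn's confusion_matrix just compares two equally long arrays of true and predicted labels, roughly like this (toy arrays for illustration, not my real data):

from sklearn.metrics import confusion_matrix

y_true = [0, 1, 2, 3, 0, 1]  # actual class labels (toy example)
y_pred = [0, 1, 2, 2, 0, 3]  # predicted class labels, same length as y_true
print(confusion_matrix(y_true, y_pred))  # 4x4 matrix of counts

So I suspect my problem is which two arrays I am passing to it after the folds.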

This is the error I get:

check_consistent_length(y_true, y_pred)
  File "D:\ProgramData\Miniconda3\envs\Env_DLexp1\lib\site-packages\sklearn\utils\validation.py", line 212, in check_consistent_length
    " samples: %r" % [int(l) for l in lengths])
ValueError: Found input variables with inconsistent numbers of samples: [30, 10]

This is my code:

import os
import string
import keras
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import re
import numpy as np
import tensorflow as tf
from numpy import array
from sklearn.model_selection import KFold



from numpy.random import seed
seed(1)

tf.random.set_seed(1)

#root="D:/bananaCorpus"
root="D:/CEFR_corpus"
train_dir=os.path.join(root,"train")

texts=[]
labels=[]

for label in ["A1","A2","B1","B2"]:
     directory=os.path.join(train_dir,label)
     for fname in os.listdir(directory):
         if fname[-4:]==".txt":
             f = open(os.path.join(directory, fname),encoding="utf-8")
             texts.append(f.read())
             f.close()
             if label == "A1":
                 labels.append(0)
             elif label=="A2":
                       labels.append(1)
             elif label=="B1":
                  labels.append(2)
             else:
                labels.append(3)
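# At this point texts and labels should each hold 40 entries
# (10 samples for each of the 4 classes described above).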

print(texts)
print(labels)
print("Corpus Length", len( root), "\n")
print("The total number of reviews in the train dataset is", len(texts),"\n")
stops = set(stopwords.words("english"))
print("The number of stopwords used in the beginning: ", len(stops),"\n")
print("The words removed from the corpus will be",stops,"\n")


## This adds new words or terms from the words_to_add list to the stopwords
words_to_add = []
stops.update(words_to_add)  # stops is a set, so update/add is used rather than append

## This removes the words or terms in the words_to_remove list,
## so that they are no longer treated as stopwords
words_to_remove = ["i", "having"]
for w in words_to_remove:
    stops.discard(w)

# Lowercase, tokenize and filter out stopwords, punctuation, short and non-alphabetic tokens
texts = [
    [w.lower() for w in word_tokenize(str(review))
     if w.lower() not in stops and w not in string.punctuation
     and len(w) > 2 and w.isalpha()]
    for review in texts
]

print("costumized stopwords: ", stops,"\n")
print("count of costumized stopwords",len(stops),"\n")
print("**********",texts,"\n")

#vectorization
#tokenizing the raw data
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

maxlen = 50                  # pad/truncate every sequence to 50 tokens
training_samples = 200       # not used below
validation_samples = 10000   # not used below
max_words = 10000            # vocabulary size kept by the Tokenizer

#delete?
tokens=keras.preprocessing.text.text_to_word_sequence(str(texts))
print("Sequence of tokens: ",tokens,"\n")

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

print("Tokens:", sequences,"\n")
word_index = tokenizer.word_index
print("Unique tokens:", word_index, "\n")
print('%s unique tokens in total.' % len(word_index), "\n")
print("Dictionary of words and their counts:", tokenizer.word_counts, "\n")
print("Number of docs/seqs used to fit the Tokenizer:", tokenizer.document_count, "\n")
print("Dictionary of words and how many documents each appeared in:", tokenizer.word_docs, "\n")

data = pad_sequences(sequences, maxlen=maxlen, padding="post")
print("padded data","\n")
print(data)

#checking the encoding with a new document
text2="I like to study english in the morning and play games in the afternoon"
text2=[w.lower() for w  in word_tokenize("".join(str(text2))) if  w not in stops and w not in string.punctuation
          and len(w)>2 and w.isalpha()]
sequences = tokenizer.texts_to_sequences([text2])
text2 = pad_sequences(sequences, maxlen=maxlen, padding="post")
print("padded text2","\n")
print(text2)


#K fold cross-validation
labels = np.asarray(labels)

print('Shape of data tensor:', data.shape,"\n")
print('Shape of label tensor:', labels.shape,"\n")
print("labels",labels,"\n")



kf = KFold(n_splits=4, random_state=None, shuffle=True)
kf.get_n_splits(data)

print(kf)

for train_index, test_index in kf.split(data):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
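# NOTE: once this loop finishes, X_train / X_test / y_train / y_test only hold the
# split of the LAST fold (30 training and 10 test samples here), so everything
# below trains and evaluates on that single fold rather than on all four folds.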



#Pretrained embedding
glove_dir = r'D:\glove'

print("Loading GloVe...")
embeddings_index = {}
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print("Found %s word vectors from GloVe." % len(embeddings_index))

#Preparing the Glove word-embeddings matrix to pass to the embedding layer(max_words, embedding_dim)
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
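# Words that have no GloVe vector keep an all-zero row in embedding_matrix;
# words with an index >= max_words are not represented in it at all.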




# define model
from keras.models import Sequential
from keras.layers import Embedding,Flatten,Dense
from keras import layers
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.normalization import BatchNormalization  # delete if not used
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))  # vocabulary size, GloVe embedding dimension, max length of input documents
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(BatchNormalization(weights=None, epsilon=1e-06, momentum=0.9))  # try before and after activation
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(4, activation='softmax'))
print(model.summary())

#Loading pretrained word embeddings and Freezing the Embedding layer
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False



# Instantiate the optimizer explicitly; otherwise the default hyperparameter values are used.
adam = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
#callbacks
callbacks_list = [
    keras.callbacks.EarlyStopping(
        monitor="val_sparse_categorical_accuracy",
        patience=1,
        ),

    keras.callbacks.ModelCheckpoint(
        filepath='banan_CNN_Text_classifier.best.hdf5',
        monitor="val_loss",
        save_best_only=True,
        ),

     keras.callbacks.ReduceLROnPlateau(
         monitor='val_loss', factor=0.1, patience=5),

    ]


# compile network
model.compile(optimizer=adam,
              loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])
history = model.fit(X_train, y_train,
                    epochs=15,
                    batch_size=32,#delete/adjust if necessary
                    verbose=1, 
                    callbacks=callbacks_list,
                    validation_data=(X_test, y_test))

# evaluate
loss, acc = model.evaluate(X_test, y_test, verbose=0)#try verbose 1
print('Test Accuracy: %f' % (acc*100))

#Plotting the result
import matplotlib.pyplot as plt

acc = history.history['sparse_categorical_accuracy']
val_acc = history.history['val_sparse_categorical_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
print(history.history.keys())


epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

#confusion matrix
from sklearn.metrics import confusion_matrix
disp = confusion_matrix(train_index, test_index)
title="CM"
disp.ax_.set_title(title)

plt.title("Confusion matrix")
print(disp.confusion_matrix)

plt.show()
...
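For reference, my guess is that the confusion matrix should compare the true labels of the test fold with the model's predicted classes, so that both arrays have the same number of samples; a rough sketch of what I imagine (not verified, using the last fold's X_test/y_test and the trained model from above):

from sklearn.metrics import confusion_matrix

y_pred = np.argmax(model.predict(X_test), axis=1)  # class index with the highest softmax score
cm = confusion_matrix(y_test, y_pred)              # y_test and y_pred now have the same length
print(cm)

If the matrix is supposed to cover all four folds, I suppose the per-fold y_test and y_pred arrays would have to be collected inside the k-fold loop and concatenated first.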