I am starting out with Keras, building a model that classifies text labels from several text inputs with a single output. I have one function to create the model and another to test it against a separate dataset.
I am still trying to fine-tune the model's predictions, but first I would like to understand why my test function gets different results every time the model is recreated. Is that normal? I would also appreciate any tips on improving the model's accuracy.
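My current suspicion is the random weight initialization: as far as I know, Keras draws fresh initial weights every time the model is built, so some run-to-run variance is expected. This is a minimal sketch of how I pin the seeds down before calling create_model (assuming the TensorFlow backend; the seed value 42 is arbitrary):

import os
import random

import numpy as np
import tensorflow as tf

def fix_seeds(seed=42):
    # PYTHONHASHSEED only takes full effect if set before the interpreter
    # starts, but it is commonly set here as well for completeness
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)         # Python's built-in RNG
    np.random.seed(seed)      # NumPy, which Keras initializers draw from
    tf.set_random_seed(seed)  # TensorFlow 1.x; use tf.random.set_seed on 2.x

fix_seeds()

Even with all seeds fixed, GPU kernels (e.g. cuDNN) are not fully deterministic, so I would only expect this to narrow the variance, not remove it completely.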
import pickle

from keras.layers import Dense, Input, concatenate
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer

def create_model(model_name, data, test_data):
    # take 80% of the data for training and the remaining 20% for test
    train_size = int(len(data) * .8)
    train_headlines = data['Subject'][:train_size]
    train_category = data['Category'][:train_size]
    train_activities = data['Activity'][:train_size]
    # slice from train_size onward so the test rows do not overlap training
    test_headlines = data['Subject'][train_size:]
    test_category = data['Category'][train_size:]
    test_activities = data['Activity'][train_size:]
    # define the tokenizers with their vocabulary sizes
    vocab_size1 = 10000
    vocab_size2 = 5000
    batch_size = 100
    tokenizer = Tokenizer(num_words=vocab_size1)
    tokenizer2 = Tokenizer(num_words=vocab_size2)
    train_headlines = train_headlines.astype(str)
    test_headlines = test_headlines.astype(str)
    train_category = train_category.astype(str)
    test_category = test_category.astype(str)
    # fit the tokenizers on the training texts only, so the vocabulary
    # does not depend on (or leak from) the test split
    tokenizer.fit_on_texts(train_headlines)
    tokenizer2.fit_on_texts(train_category)
    x_train = tokenizer.texts_to_matrix(train_headlines, mode='tfidf')
    x_test = tokenizer.texts_to_matrix(test_headlines, mode='tfidf')
    y_train = tokenizer2.texts_to_matrix(train_category, mode='tfidf')
    y_test = tokenizer2.texts_to_matrix(test_category, mode='tfidf')
    # encode the target classes and save them for later decoding
    encoder = LabelBinarizer()
    encoder.fit(train_activities)
    text_labels = encoder.classes_
    with open('outputs/classes.txt', 'w') as f:
        for item in text_labels:
            f.write("%s\n" % item)
    z_train = encoder.transform(train_activities)
    z_test = encoder.transform(test_activities)
    num_classes = len(text_labels)
    print("num_classes:", num_classes)
    # headline branch
    input1 = Input(shape=(vocab_size1,), name='main_input')
    x1 = Dense(512, activation='relu')(input1)
    x1 = Dense(64, activation='relu')(x1)
    x1 = Dense(64, activation='relu')(x1)
    # category branch: connect the second input to the graph so the
    # 'cat_input' data actually influences the output
    input2 = Input(shape=(vocab_size2,), name='cat_input')
    x2 = Dense(64, activation='relu')(input2)
    merged = concatenate([x1, x2])
    main_output = Dense(num_classes, activation='softmax', name='main_output')(merged)
    model = Model(inputs=[input1, input2], outputs=[main_output])
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    history = model.fit([x_train, y_train], z_train,
                        batch_size=batch_size,
                        epochs=30,
                        verbose=1,
                        validation_split=0.1)
    score = model.evaluate([x_test, y_test], z_test,
                           batch_size=batch_size, verbose=1)
    print('Test accuracy:', score[1])
    # serialize the architecture to JSON
    model_json = model.to_json()
    with open("./outputs/my_model_" + model_name + ".json", "w") as json_file:
        json_file.write(model_json)
    # save the full model (weights included) to an HDF5 file
    model.save('./outputs/my_model_' + model_name + '.h5')
    # save both tokenizers (i.e. the vocabularies) so that validate_model
    # can rebuild both input matrices later
    with open('./outputs/tokenizer' + model_name + '.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('./outputs/tokenizer2' + model_name + '.pickle', 'wb') as handle:
        pickle.dump(tokenizer2, handle, protocol=pickle.HIGHEST_PROTOCOL)
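For context, this is roughly how I call it; tickets.csv is a placeholder name, and the dataframe must contain the 'Subject', 'Category' and 'Activity' columns used above:

import pandas as pd

data = pd.read_csv('tickets.csv')    # hypothetical input file
create_model('tickets', data, None)  # the test_data argument is unused inside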
def validate_model(model_name, test_data, labels):
    from keras.models import model_from_json
    # note: training used 'Subject' on its own, so appending 'Description'
    # here means the test inputs are built differently from the training ones
    test_data['Subject'] = test_data['Subject'] + " " + test_data['Description']
    headlines = test_data['Subject'].astype(str)
    categories = test_data['Category'].astype(str)
    # load the JSON architecture and recreate the model
    json_file = open("./outputs/my_model_" + model_name + ".json", 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
    # load the trained weights into the new model
    model.load_weights('./outputs/my_model_' + model_name + '.h5')
    print("Loaded model from disk")
    # load both tokenizers saved by create_model
    with open('./outputs/tokenizer' + model_name + '.pickle', 'rb') as handle:
        tokenizer = pickle.load(handle)
    with open('./outputs/tokenizer2' + model_name + '.pickle', 'rb') as handle:
        tokenizer2 = pickle.load(handle)
    # subjects -> main_input (vocab_size1 columns)
    x_pred = tokenizer.texts_to_matrix(headlines, mode='tfidf')
    # categories must go through tokenizer2: tokenizer would produce
    # vocab_size1 columns, which cannot feed the 5000-wide 'cat_input'
    y_pred = tokenizer2.texts_to_matrix(categories, mode='tfidf')
    predictions_vetor = model.predict({'main_input': x_pred, 'cat_input': y_pred})
    return predictions_vetor
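To turn the softmax rows back into labels, I read the classes file written by create_model and take the argmax per row; decode_predictions is just a helper name I made up for this sketch:

import numpy as np

def decode_predictions(predictions_vetor, classes_path='outputs/classes.txt'):
    # read back the class names written by create_model, one per line
    with open(classes_path) as f:
        text_labels = [line.strip() for line in f]
    # the argmax of each softmax row is the most likely class index
    return [text_labels[np.argmax(row)] for row in predictions_vetor]

predicted_labels = decode_predictions(validate_model('tickets', test_data, None))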