Я обучаю три разных классификатора (KNN с k = 1, 3, 5, 7, 9, 11, 13; наивный байесовский классификатор; логистическую регрессию) на одном наборе данных, используя один и тот же экстрактор признаков, и сравниваю точность, чтобы определить, какой классификатор лучше работает на этом наборе данных.
Однако каждый раз, когда я обучаю эти модели (повторно выполняю весь код на одном и том же наборе данных), я получаю разные значения точности для каждой модели.
Это нормально? Есть ли способ получить только одно значение точности для каждой модели?
Это весь мой код (извините, но некоторые комментарии на итальянском):
from skimage import io as sio
from skimage.feature import daisy
from dataset import Dataset
from time import time
from sklearn.cluster import MiniBatchKMeans as KMeans
from sklearn.preprocessing import Normalizer
import numpy as np
from matplotlib import pyplot as plt
from skimage.color import rgb2gray
# Names of the classes used in this experiment.
classes = ["edifici", "quadri", "sculture"]

# Open the image collection and report how many items it holds.
dataset = Dataset('dataset')
print(dataset.getLength())

# Split into training set and test set: 70% training, 30% test.
train_fraction = 0.7
training_set, test_set = dataset.splitTrainingTest(train_fraction)
#------------Extraction of features and building of the vocabulary
# Extract the local features from the training set.
training_local_features = extract_features(training_set)
# Build the vocabulary with mini-batch k-means using 500 centroids.
# random_state is fixed so that centroid initialisation and mini-batch
# sampling are repeatable; without it the vocabulary (and therefore every
# accuracy computed from it) differs from one run to the next.
# NOTE(review): the train/test split may also be randomised — check whether
# Dataset.splitTrainingTest can be seeded, otherwise results will still vary.
kmeans = KMeans(n_clusters=500, random_state=0)
kmeans.fit(training_local_features)
# The centroids found by k-means are stored in kmeans.cluster_centers_.
# (Bare expression: it only displays a value in an interactive session.)
kmeans.cluster_centers_.shape
# Encode the classes as numeric indices rather than as strings.
classes_idx = range(len(classes))
# Describe the training and test sets using the fitted k-means model.
X_training, y_training, paths_training = describe_dataset(training_set, kmeans)
X_test, y_test, paths_test = describe_dataset(test_set, kmeans)
#Normalization TF-IDF
# Per-column count of training rows that have a non-zero entry.
presence = (X_training > 0).astype(int)
df = presence.sum(axis=0)
n = len(X_training)
# Inverse document frequency; the +1 keeps the denominator non-zero.
idf = np.log(float(n) / (1 + df))
X_training_tfidf = X_training * idf
# Weight the TEST features with the IDF learned on the training set.
# (Previously this line multiplied X_training, so the "test" TF-IDF matrix
# was just a copy of the training one.)
X_test_tfidf = X_test * idf
# L2-normalise each row of both matrices.
norm = Normalizer(norm='l2')
X_training_tfidf_12 = norm.transform(X_training_tfidf)
X_test_tfidf_12 = norm.transform(X_test_tfidf)
# NOTE(review): the classifiers later in this script are fitted on the raw
# X_training / X_test, not on these TF-IDF features — confirm that is intended.
#--------------------------------------------------------------------------KNN
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier as KNN
#----------------------------------------------------------------------3NN
# Fit a 3-nearest-neighbour classifier on the training features and
# predict the labels of the test features.
nn5 = KNN(3).fit(X_training, y_training)
predicted_labels = nn5.predict(X_test)
# Accuracy and confusion matrix of the predictions against the true labels.
a, M = (
    accuracy_score(y_test, predicted_labels),
    confusion_matrix(y_test, predicted_labels),
)
print("3-NN, accuracy: %0.2f, Confusion Matrix:\n" % a)
print(M)
#----------------------------------------------------------------NAIVE BAYES
from sklearn.naive_bayes import MultinomialNB as NB
nb = NB()
# Train the model.
nb.fit(X_training, y_training)
# Evaluate the performance on the test set.
predicted_labels = nb.predict(X_test)
# The format string ends with "\n" (it used to be "/n", which printed the
# two characters literally instead of a line break).
print("NAIVE BAYES: Accuracy: %0.2f, Confusion Matrix:\n" % accuracy_score(y_test, predicted_labels))
print(confusion_matrix(y_test, predicted_labels))
#---------------------------------------------------------------------LOGISTIC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA
# Fit PCA on the training features, then project both sets with it.
pca = PCA().fit(X_training)
X_training_pca, X_test_pca = (
    pca.transform(features) for features in (X_training, X_test)
)
# Logistic regression with default settings.
# NOTE(review): the original comment said one-vs-rest is used by default;
# that depends on the scikit-learn version — confirm.
lr = LogisticRegression().fit(X_training_pca, y_training)
p = lr.predict(X_test_pca)
print("LOGISTI REGRESSION: Accuracy: %0.2f, Confusion matrix:\n" % accuracy_score(y_test, p))
print(confusion_matrix(y_test, p))
Результаты первого запуска кода:
3-NN, accuracy: 0.67, Confusion Matrix:
[[ 9 3 4]
[ 4 11 2]
[ 3 0 12]]
NAIVE BAYES: Accuracy: 0.83, Confusion Matrix:/n
[[15 0 1]
[ 2 13 2]
[ 2 1 12]]
LOGISTI REGRESSION: Accuracy: 0.71, Confusion matrix:
[[13 2 1]
[ 5 11 1]
[ 4 1 10]]
Второй раз:
3-NN, accuracy: 0.65, Confusion Matrix:
[[11 2 3]
[ 3 13 1]
[ 7 1 7]]
NAIVE BAYES: Accuracy: 0.85, Confusion Matrix:/n
[[15 1 0]
[ 1 14 2]
[ 1 2 12]]
LOGISTI REGRESSION: Accuracy: 0.79, Confusion matrix:
[[13 0 3]
[ 0 14 3]
[ 1 3 11]]
Я ожидал, что результаты будут одинаковыми...