Я выполняю поиск по сетке (grid search) с помощью кода ниже. Чтобы получить усреднённую матрицу ошибок (confusion matrix), мне приходится вручную получать разбиения (фолды) кросс-валидации и заново обучать модель на каждом из них. Однако средняя точность (accuracy) у этих двух способов получается разной. Может кто-нибудь заметить, в чём моя ошибка?
def logistic(random_state=None):
    """Grid-search a resample -> poly -> scale -> PCA -> logistic-regression pipeline.

    The pipeline is built from SMOTE over-sampling, random under-sampling,
    ``polynomial_transform``, standardisation, PCA and a ``saga``-solved
    logistic regression, then tuned with ``GridSearchCV`` on the module-level
    ``X_train`` / ``y_train`` using the module-level splitter ``cv2``.
    The best CV score and the best parameters are printed.

    Args:
        random_state: Seed forwarded to every stochastic step (SMOTE,
            RandomUnderSampler, PCA, LogisticRegression). The default
            ``None`` keeps the original unseeded behaviour, in which case
            repeated fits of the same pipeline on the same fold can give
            slightly different scores.

    Returns:
        The fitted ``GridSearchCV`` object (refit on the best
        ``accuracy_score`` candidate).
    """
    bal = 6000  # target number of samples per class after resampling

    # Classes 0-3 are over-sampled up to `bal`.
    # NOTE(review): class 4 is not listed here, presumably because it already
    # has at least `bal` samples -- confirm against the class counts.
    smt = SMOTE(
        sampling_strategy={label: bal for label in range(4)},
        random_state=random_state,
    )
    # Every class (0-4) is then capped at `bal`.
    rus = RandomUnderSampler(
        sampling_strategy={label: bal for label in range(5)},
        random_state=random_state,
    )
    poly = polynomial_transform()
    stand = StandardScaler()
    pca = PCA(random_state=random_state)
    clf = LogisticRegression(
        max_iter=100, tol=0.01, solver='saga', random_state=random_state
    )

    pipe = Pipeline(steps=[
        ('smt', smt),
        ('rus', rus),
        ('poly', poly),
        ('standardise', stand),
        ('pca', pca),
        ('logistic', clf),
    ])

    # Parameters of pipeline steps are addressed as '<step>__<parameter>'.
    param_grid = {
        'poly__degree': [1],
        'pca__n_components': [10],
        'logistic__C': [0.0001],
        'logistic__multi_class': ['multinomial', 'ovr'],
    }
    scorers = {
        'accuracy_score': make_scorer(accuracy_score),
    }

    # Run the cross-validated grid search and refit on the best accuracy.
    search = GridSearchCV(
        pipe,
        param_grid,
        n_jobs=-1,
        verbose=2,
        scoring=scorers,
        refit='accuracy_score',
        cv=cv2,
    )
    search.fit(X_train, y_train.ravel())

    # Report the outcome of the cross-validated search.
    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)
    return search
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 10 out of 10 | elapsed: 3.1min finished
Best parameter (CV score=0.641):
{'logistic__C': 0.0001, 'logistic__multi_class': 'ovr', 'pca__n_components': 10, 'poly__degree': 1}
Теперь вычисляю среднюю оценку кросс-валидации (CV score) вручную:
from sklearn.base import clone

# Re-run cross-validation by hand for the best pipeline found by the grid
# search, so that a per-fold confusion matrix can be collected and averaged.
best_est = logistic_CV_results.best_estimator_

conf_matrix_list_of_arrays = []
accuracy_arrays = []

# Pass y to split() as well: GridSearchCV calls the splitter with the y it was
# fitted on, and stratified splitters require it (plain KFold ignores it).
# NOTE: if cv2 shuffles without a fixed random_state, or the pipeline's
# resampling / solver steps are unseeded, the folds or fits here will not be
# identical to the ones inside the grid search, so the mean accuracy can
# differ slightly from search.best_score_.
for train_index, test_index in cv2.split(X_train, y_train.ravel()):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

    # clone() gives an unfitted copy with the same hyper-parameters.
    est = clone(best_est)
    est.fit(X_train_fold, y_train_fold.ravel())
    y_pred = est.predict(X_test_fold)

    # Fix the label order so every fold yields a matrix of the same shape.
    conf_matrix_list_of_arrays.append(
        confusion_matrix(y_test_fold, y_pred, labels=[0, 1, 2, 3, 4])
    )
    accuracy_arrays.append(accuracy_score(y_test_fold, y_pred))

# Element-wise mean over folds for the matrices, plain mean for the accuracy.
mean_of_conf_matrix_arrays = np.mean(conf_matrix_list_of_arrays, axis=0)
mean_accuracy = np.mean(accuracy_arrays, axis=0)
print(mean_accuracy)
0.640433930911382