Train/test scores from GridSearchCV do not agree with cross_validate and KFold scores in scikit-learn
0 votes
/ 11 May 2019

I am doing regression analysis in scikit-learn. The workflow has three parts: 1) feature scaling; 2) feature selection; and 3) regression. I am puzzled to see that the train/test scores computed by GridSearchCV do not match the scores from two other methods: 1) using cross_validate; 2) using a manual KFold loop. The cross_validate and KFold results do agree with each other.

  1. Using GridSearchCV
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Scale the first 42 (numeric) columns, pass the rest through unchanged
scaler = ColumnTransformer([('num', StandardScaler(), slice(0, 42))], remainder='passthrough')
pipe = Pipeline([('scale_numeric_feature', scaler),
                 ('select_feature', None),
                 ('regression', Ridge(alpha=10.))], memory='/var/tmp')
N_FEATURE_OPTIONS = [10, 50, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150, 200, 229]
param_grid = {
    'select_feature': [SelectKBest(mutual_info_regression)],
    'select_feature__k': N_FEATURE_OPTIONS}

# iid=False: plain unweighted average of fold scores (this parameter was later removed from scikit-learn)
grid = GridSearchCV(pipe, param_grid=param_grid, cv=10, n_jobs=-1, iid=False,
                    return_train_score=True, scoring='neg_mean_squared_error')
grid.fit(train_x, train_y)

# Scores are negative MSE, so negate and take the square root to get RMSE
print(np.sqrt(-grid.cv_results_['mean_train_score']))
print(np.sqrt(-grid.cv_results_['mean_test_score']))
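
One difference worth ruling out before comparing numbers: with cv=10 and a regression target, GridSearchCV splits with an unshuffled KFold(10), whereas the manual loop in part 3 below uses KFold(n_splits=10, shuffle=True, random_state=0), so the three methods do not necessarily score the same folds. A minimal sketch, reusing pipe and param_grid from above, that pins everything to one splitter (the /var/tmp cache is also disabled here, simply to rule out stale cached transformer fits):

from sklearn.model_selection import KFold, GridSearchCV

# One splitter object shared by every method -> identical folds everywhere
cv = KFold(n_splits=10, shuffle=True, random_state=0)

pipe.set_params(memory=None)   # turn off transformer caching while debugging
grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv, n_jobs=-1,
                    return_train_score=True, scoring='neg_mean_squared_error')
grid.fit(train_x, train_y)
# ...and pass the same `cv` object to cross_validate() in part 2.
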
  2. Using cross_validate
from sklearn.model_selection import cross_validate

def model2(X, y, number_features):
    train_error = np.zeros((10, len(number_features)))
    test_error = np.zeros((10, len(number_features)))

    for i in range(len(number_features)):
        scaler = ColumnTransformer([('num', StandardScaler(), slice(0, 42))], remainder='passthrough')
        selector = SelectKBest(mutual_info_regression, k=number_features[i])
        pipe = Pipeline([('scale_numeric_feature', scaler),
                         ('select_feature', selector),
                         ('regression', Ridge(alpha=10.))])
        results = cross_validate(pipe, X, y, scoring='neg_mean_squared_error', cv=10, return_train_score=True)
        # Convert each fold's negative MSE into RMSE before storing
        train_error[:, i] = np.sqrt(-results['train_score'])
        test_error[:, i] = np.sqrt(-results['test_score'])

    return train_error, test_error

number_features = [10, 50, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150, 200, 229]
train_error, test_error = model2(train_x, train_y, number_features)

# Average the per-fold RMSE values over the 10 folds
train_avg_error = train_error.mean(axis=0)
test_avg_error = test_error.mean(axis=0)
print(train_avg_error)
print(test_avg_error)
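
A side note on aggregation: the GridSearchCV printout above takes the square root of the fold-averaged MSE, while model2 here and model3 below first take the RMSE of each fold and then average. Since the square root is concave, the mean of per-fold RMSE is never larger than the RMSE of the mean MSE, so the two conventions differ slightly even on identical folds. A tiny demo with made-up fold MSE values:

import numpy as np

fold_mse = np.array([0.010, 0.014, 0.022])   # hypothetical per-fold MSE values

print(np.sqrt(fold_mse.mean()))    # RMSE of the mean MSE  -> ~0.1238 (GridSearchCV printout)
print(np.sqrt(fold_mse).mean())    # mean of per-fold RMSE -> ~0.1222 (model2 / model3)

This effect alone is far too small to explain the gap in the results below, but it is worth removing before comparing the tables digit by digit.
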
  3. Using KFold
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def model3(X, y, number_features):
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    train_error = np.zeros((10, len(number_features)))
    test_error = np.zeros((10, len(number_features)))
    j = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]

        # Fit the scaler on the training fold only, then transform both folds
        ct = ColumnTransformer([('scale', StandardScaler(), slice(0, 42))], remainder='passthrough')
        ct.fit(X_train)
        X_train_scaled = ct.transform(X_train)
        X_test_scaled = ct.transform(X_test)

        # Score every feature once, then rank the columns by mutual information
        selector = SelectKBest(mutual_info_regression, k='all')
        selector.fit(X_train_scaled, y_train)
        df = pd.DataFrame({'rank': selector.scores_})
        df['col'] = np.arange(df.shape[0])
        # Descending: SelectKBest keeps the k highest-scoring features
        df = df.sort_values(by=['rank'], ascending=False)

        for i in range(len(number_features)):
            # Keep the top number_features[i] columns of the ranking
            X_train_scaled_selected = X_train_scaled[:, df['col'][0:number_features[i]]]
            X_test_scaled_selected = X_test_scaled[:, df['col'][0:number_features[i]]]
            regressor = Ridge(alpha=10.)
            regressor.fit(X_train_scaled_selected, y_train)
            train_error[j, i] = np.sqrt(mean_squared_error(y_train, regressor.predict(X_train_scaled_selected)))
            test_error[j, i] = np.sqrt(mean_squared_error(y_test, regressor.predict(X_test_scaled_selected)))

        j = j + 1
    return train_error, test_error

number_features = [10, 50, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150, 200, 229]
train_error, test_error = model3(train_x, train_y, number_features)

# Average the per-fold RMSE values over the 10 folds
train_avg_error = train_error.mean(axis=0)
test_avg_error = test_error.mean(axis=0)
print(train_avg_error)
print(test_avg_error)
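
A further source of drift worth pinning down: mutual_info_regression is itself randomized (it adds a small amount of noise and uses a nearest-neighbor estimator), so unseeded calls can rank borderline features differently from run to run. A sketch that seeds it once and reuses the seeded scorer everywhere; the name mi_seeded is just an illustrative choice:

from functools import partial
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# Seeded scorer with the same (X, y) -> scores signature SelectKBest expects
mi_seeded = partial(mutual_info_regression, random_state=0)

# Drop-in replacement in all three methods, e.g. in the grid search of part 1:
param_grid = {
    'select_feature': [SelectKBest(mi_seeded)],
    'select_feature__k': N_FEATURE_OPTIONS}
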
  4. GridSearchCV results
[0.15137006 0.11795896 0.10845177 0.10748715 0.10706335 0.10646141
 0.10573004 0.10456232 0.10402522 0.1036519  0.10348363 0.10291938
 0.10261879 0.10090228 0.09921763]
[0.15251642 0.12285965 0.11711583 0.11644118 0.11652196 0.11642403
 0.11550627 0.11472508 0.11469995 0.11429741 0.11464745 0.11481041
 0.11482462 0.11482693 0.11479239]
  5. cross_validate results
[0.39413958 0.3828688  0.36101547 0.35630124 0.34966608 0.3429784
 0.33628348 0.3271997  0.31600279 0.30846108 0.29825291 0.2875739
 0.27624874 0.16764142 0.09916005]
[0.3948599  0.38327512 0.37222368 0.36873418 0.36044495 0.35463341
 0.34790507 0.33927794 0.32832151 0.31623257 0.31177676 0.30277563
 0.29323578 0.18754368 0.11479655]
  6. KFold results
[0.39335244 0.38348877 0.36330136 0.35836005 0.35313161 0.3438586
 0.33763568 0.33000708 0.31850341 0.3086596  0.29673882 0.28499221
 0.27604167 0.16784554 0.09916005]
[0.39289968 0.38498342 0.37130633 0.36647269 0.36120497 0.3553052
 0.34963564 0.33994978 0.32737482 0.3194237  0.30952731 0.29963605
 0.29068475 0.18918951 0.11479655]
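
At the largest setting, k=229, the selector appears to keep every feature, and there the three methods essentially coincide (≈0.0992 train / ≈0.1148 test RMSE); the gap opens up only once feature selection actually discards columns. To put the GridSearchCV numbers on the same footing as model2/model3, its per-fold scores can be pulled from cv_results_ and averaged the same way:

import numpy as np

# cv_results_ stores per-fold scores under split<i>_train_score / split<i>_test_score
fold_test = np.vstack([grid.cv_results_['split%d_test_score' % i] for i in range(10)])
print(np.sqrt(-fold_test).mean(axis=0))   # mean of per-fold test RMSE, one value per k
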
...