Я занимаюсь регрессионным анализом в scikit-learn. Он состоит из трех частей: 1) функция масштабирования; 2) выбор функции; и 3) регрессия. Я озадачен, наблюдая, что оценки поезда / теста, рассчитанные GridSearchCV, не совпадают с оценками двух других методов: 1) с использованием cross_validate; 2) используя KFold. Результаты cross_validate и KFold согласуются между собой.
- Использование GridSearchCV
scaler = ColumnTransformer([('num', StandardScaler(), slice(0,42))], remainder='passthrough')
pipe = Pipeline([('scale_numeric_feature', scaler),
('select_feature', None),
('regression', Ridge(alpha=10.))], memory='/var/tmp')
N_FEATURE_OPTIONS = [10,50,100,105,110,115,120,125,130,135,140,145,150,200,229]
param_grid ={
'select_feature': [SelectKBest(mutual_info_regression)],
'select_feature__k': N_FEATURE_OPTIONS}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=10, n_jobs=-1, iid=False,
return_train_score=True, scoring='neg_mean_squared_error')
grid.fit(train_x, train_y)
print(np.sqrt(-grid.cv_results_['mean_train_score']))
print(np.sqrt(-grid.cv_results_['mean_test_score']))
- Использование cross_validate
def model2(X, y, number_features):
train_error = np.zeros((10, len(number_features)))
test_error = np.zeros((10, len(number_features)))
for i in range(len(number_features)):
scaler = ColumnTransformer([('num', StandardScaler(), slice(0,42))], remainder='passthrough')
selector = SelectKBest(mutual_info_regression, k=number_features[i])
pipe = Pipeline([('scale_numeric_feature', scaler),
('select_feature', selector),
('regression', Ridge(alpha=10.))])
results = cross_validate(pipe, X, y, scoring='neg_mean_squared_error', cv=10, return_train_score=True)
train_error[:,i] = results['train_score']
test_error[:,i] = results['test_score']
return train_error, test_error
number_features = [10,50,100,105,110,115,120,125,130,135,140,145,150,200,229]
train_error, test_error = model2(train_x, train_y, number_features)
train_avg_error = np.sum(train_error, axis=0)/10.
test_avg_error = np.sum(test_error, axis=0)/10.
print(train_avg_error)
print(test_avg_error)
- Использование KFold
def model3(X, y, number_features):
kf = KFold(n_splits=10, shuffle=True, random_state=0)
kf.get_n_splits(X)
train_error = np.zeros((10, len(number_features)))
test_error = np.zeros((10, len(number_features)))
j = 0
for train_index, test_index in kf.split(X):
X_train, X_test = X.values[train_index], X.values[test_index]
y_train, y_test = y.values[train_index], y.values[test_index]
ct = ColumnTransformer([('scale', StandardScaler(), slice(0,42))], remainder='passthrough')
ct.fit(X_train)
X_train_scaled = ct.transform(X_train)
X_test_scaled = ct.transform(X_test)
selector = SelectKBest(mutual_info_regression, k='all')
selector.fit(X_train_scaled, y_train)
df = pd.DataFrame({'rank': selector.scores_})
df['col'] = np.arange(df.shape[0])
df = df.sort_values(by=['rank'])
for i in range(len(number_features)):
X_train_scaled_selected = X_train_scaled[:,df['col'][0:number_features[i]]]
X_test_scaled_selected = X_test_scaled[:,df['col'][0:number_features[i]]]
regressor = Ridge(alpha=10.)
regressor.fit(X_train_scaled_selected, y_train)
train_error[j,i] = (np.sqrt(mean_squared_error(y_train, regressor.predict(X_train_scaled_selected))))
test_error[j,i] = (np.sqrt(mean_squared_error(y_test, regressor.predict(X_test_scaled_selected))))
j = j + 1
return train_error, test_error
number_features = [10,50,100,105,110,115,120,125,130,135,140,145,150,200,229]
train_error, test_error = model3(train_x, train_y, number_features)
train_avg_error = np.sum(train_error, axis=0)/10.
test_avg_error = np.sum(test_error, axis=0)/10.
print(train_avg_error)
print(test_avg_error)
- Результаты GridSearchCV
[0.15137006 0.11795896 0.10845177 0.10748715 0.10706335 0.10646141
0.10573004 0.10456232 0.10402522 0.1036519 0.10348363 0.10291938
0.10261879 0.10090228 0.09921763]
[0.15251642 0.12285965 0.11711583 0.11644118 0.11652196 0.11642403
0.11550627 0.11472508 0.11469995 0.11429741 0.11464745 0.11481041
0.11482462 0.11482693 0.11479239]
- cross_validate Результаты
[0.39413958 0.3828688 0.36101547 0.35630124 0.34966608 0.3429784
0.33628348 0.3271997 0.31600279 0.30846108 0.29825291 0.2875739
0.27624874 0.16764142 0.09916005]
[0.3948599 0.38327512 0.37222368 0.36873418 0.36044495 0.35463341
0.34790507 0.33927794 0.32832151 0.31623257 0.31177676 0.30277563
0.29323578 0.18754368 0.11479655]
- Результаты KFold
[0.39335244 0.38348877 0.36330136 0.35836005 0.35313161 0.3438586
0.33763568 0.33000708 0.31850341 0.3086596 0.29673882 0.28499221
0.27604167 0.16784554 0.09916005]
[0.39289968 0.38498342 0.37130633 0.36647269 0.36120497 0.3553052
0.34963564 0.33994978 0.32737482 0.3194237 0.30952731 0.29963605
0.29068475 0.18918951 0.11479655]