I have 34 samples with 4 inputs and one output in an Excel file. I am making a prediction using a gradient boosting regressor (GBR), and I want to find the optimum parameters for the GBR with scikit-learn's grid search, using cross-validation to split the data. I implemented the code below to tune the GBR parameters, but I got the error shown. The code was originally written for a classification problem using XGBoost, and I modified it to fit my regression problem. Please can you help me fix this error? Is what I did correct or not?
The error I got:
ValueError Traceback (most recent call last)
<ipython-input-5-4ee3b80c1f07> in <module>()
23 kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
24 grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold,verbose=1)
---> 25 grid_result = grid_search.fit(X, label_encoded_y)
26 # summarize results
27 print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
D:\Anconda\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
637 error_score=self.error_score)
638 for parameters, (train, test) in product(candidate_params,
--> 639 cv.split(X, y, groups)))
640
641 # if one choose to see train score, "out" will contain train score info
D:\Anconda\lib\site-packages\sklearn\model_selection\_split.py in split(self, X, y, groups)
330 n_samples))
331
--> 332 for train, test in super(_BaseKFold, self).split(X, y, groups):
333 yield train, test
334
D:\Anconda\lib\site-packages\sklearn\model_selection\_split.py in split(self, X, y, groups)
93 X, y, groups = indexable(X, y, groups)
94 indices = np.arange(_num_samples(X))
---> 95 for test_index in self._iter_test_masks(X, y, groups):
96 train_index = indices[np.logical_not(test_index)]
97 test_index = indices[test_index]
D:\Anconda\lib\site-packages\sklearn\model_selection\_split.py in _iter_test_masks(self, X, y, groups)
632
633 def _iter_test_masks(self, X, y=None, groups=None):
--> 634 test_folds = self._make_test_folds(X, y)
635 for i in range(self.n_splits):
636 yield test_folds == i
D:\Anconda\lib\site-packages\sklearn\model_selection\_split.py in _make_test_folds(self, X, y)
597 raise ValueError("n_splits=%d cannot be greater than the"
598 " number of members in each class."
--> 599 % (self.n_splits))
600 if self.n_splits > min_groups:
601 warnings.warn(("The least populated class in y has only %d"
ValueError: n_splits=2 cannot be greater than the number of members in each class.
Below is my attempt:
# GBR: tune n_estimators and max_depth with grid search
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# read data: 4 input columns followed by 1 output column (assumed layout)
Data_ini = pd.read_excel('Data - 1 output -Ra-in - Crossvalidation.xlsx')
X = Data_ini.iloc[:, :4].values  # the 4 inputs
y = Data_ini.iloc[:, 4].values   # the single output
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = GradientBoostingRegressor()
n_estimators = [50, 100, 150, 200]
max_depth = [2, 4, 6, 8]
print(max_depth)
param_grid = dict(max_depth=max_depth, n_estimators=n_estimators)
kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold,verbose=1)
grid_result = grid_search.fit(X, label_encoded_y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot results
scores = np.array(means).reshape(len(max_depth), len(n_estimators))
for i, value in enumerate(max_depth):
    pyplot.plot(n_estimators, scores[i], label='depth: ' + str(value))
pyplot.legend()
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.savefig('n_estimators_vs_max_depth.png')
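For reference, here is a minimal sketch of what I think the regression version should look like, assuming plain KFold in place of StratifiedKFold and a regression metric such as "neg_mean_squared_error" in place of "neg_log_loss". My understanding is that StratifiedKFold needs discrete class labels, and after LabelEncoder every continuous target value becomes its own single-member class, which would explain why n_splits=2 exceeds the class size. The column layout and the metric choice are my assumptions:

# Minimal sketch of a regression-appropriate setup (my assumptions):
# plain KFold instead of StratifiedKFold, a regression metric instead
# of "neg_log_loss", and the raw continuous y with no LabelEncoder.
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd

Data_ini = pd.read_excel('Data - 1 output -Ra-in - Crossvalidation.xlsx')
X = Data_ini.iloc[:, :4].values  # 4 input columns (assumed layout)
y = Data_ini.iloc[:, 4].values   # single continuous output

param_grid = {'max_depth': [2, 4, 6, 8],
              'n_estimators': [50, 100, 150, 200]}
# KFold never looks at y, so the "members in each class" check goes away
kfold = KFold(n_splits=2, shuffle=True, random_state=0)
grid_search = GridSearchCV(GradientBoostingRegressor(), param_grid,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1, cv=kfold, verbose=1)
grid_result = grid_search.fit(X, y)  # fit on the raw continuous target
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Is this the right direction for a regression problem, and with only 34 samples would a larger n_splits give a less noisy estimate?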