Итак, то, что автор по вашей ссылке делает с помощью кода
# Use every feature importance of the already-fitted `model` as a candidate
# cut-off. `sort` is presumably numpy.sort (ascending order) - confirm import.
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # select features using threshold (prefit=True: `model` is not refitted)
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train model on the reduced feature set
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # eval model on the test set reduced to the same features
    select_X_test = selection.transform(X_test)
    predictions = selection_model.predict(select_X_test)
    accuracy = accuracy_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))
, — это создание отсортированного массива пороговых значений и последующее обучение XGBoost для каждого элемента массива thresholds.
Судя по вашему вопросу, я думаю, вы хотите выбрать только 6-й случай — с наименьшим количеством признаков и наивысшей точностью. В этом случае вам нужно сделать что-то вроде этого:
# Re-run one candidate only: the 6th entry (index 5) of the `thresholds`
# array built earlier from model.feature_importances_.
chosen_thresh = thresholds[5]
# Keep the features selected by this threshold; `model` is already fitted.
selection = SelectFromModel(model, threshold=chosen_thresh, prefit=True)
select_X_train = selection.transform(X_train)
# Train a fresh classifier on the reduced training set.
selection_model = XGBClassifier()
selection_model.fit(select_X_train, y_train)
# Evaluate on the test set reduced to the same features.
select_X_test = selection.transform(X_test)
predictions = selection_model.predict(select_X_test)
accuracy = accuracy_score(y_test, predictions)
print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (chosen_thresh, select_X_train.shape[1], accuracy*100.0))
Если вы хотите автоматизировать всё это, нужно внутри этого цикла for вычислить минимальное n, при котором точность максимальна; это будет выглядеть примерно так:
# Search all candidate thresholds for the one that gives the highest accuracy,
# preferring the smallest number of features when accuracies tie.
thresholds = sort(model.feature_importances_)
# Upper bound on how many features the final model may use. It defaults to
# the number of importance values (i.e. no limit); lower it to set a budget.
max_features = len(thresholds)
n_min = max_features
acc_max = 0
obj_thresh = thresholds[0]
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    n_selected = select_X_train.shape[1]
    if n_selected > max_features:
        # Over the feature budget: skip training this candidate.
        continue
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    select_X_test = selection.transform(X_test)
    predictions = selection_model.predict(select_X_test)
    accuracy = accuracy_score(y_test, predictions)
    # A strictly better accuracy always wins; on an exact tie the candidate
    # with fewer features wins, so we end up with the minimal n at max accuracy.
    if accuracy > acc_max or (accuracy == acc_max and n_selected < n_min):
        n_min = n_selected
        acc_max = accuracy
        obj_thresh = thresh

# Refit with the winning threshold and report its result.
# NOTE(review): the threshold is picked using X_test/y_test, so the accuracy
# printed below is measured on the same data used for selection.
selection = SelectFromModel(model, threshold=obj_thresh, prefit=True)
select_X_train = selection.transform(X_train)
selection_model = XGBClassifier()
selection_model.fit(select_X_train, y_train)
select_X_test = selection.transform(X_test)
predictions = selection_model.predict(select_X_test)
accuracy = accuracy_score(y_test, predictions)
print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (obj_thresh, select_X_train.shape[1], accuracy*100.0))