Xgboost, похоже, дает те же результаты, даже когда переменная random_state изменена. Этот вопрос похож на «XGBRegressor: изменение random_state не влияет», но я на самом деле использую подвыборку (subsample), и я все равно не получаю другие результаты. Ниже приведен пример кода. Если я запускаю все это, я получаю среднеквадратичные ошибки обучения и валидации. Если я затем изменю значение random_state на другое число и перезапущу код начиная с «xgb_dict = {» (главное — не регенерируя данные, а только переобучив модель), я получу точно такие же среднеквадратичные ошибки обучения и валидации. Я использую xgboost версии 0.90. Есть идеи, почему он так себя ведет?
import xgboost as xgb
import numpy as np
import pandas as pd
from scipy.stats import beta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
def create_y(features):
    """Build a synthetic target from the 'x1', 'x2', 'x3' columns.

    Rows with x2 > 1.4 follow a steeper x2 slope (coefficient 2 instead
    of 1); uniform noise is added and the result is floored at zero.
    """
    high_mask = features['x2'] > 1.4
    # Base relation for all rows.
    y = 3 * features['x1'] + features['x2'] + 1e3 * features['x3']
    # Regime change: double the x2 coefficient where x2 exceeds 1.4.
    high_rows = features.loc[high_mask, :]
    y[high_mask] = (3 * high_rows['x1'] + 2 * high_rows['x2']
                    + 1e3 * high_rows['x3'])
    # Additive uniform noise in [-0.08, 0.12).
    y += 0.2 * (np.random.rand(features.shape[0]) - 0.4)
    return np.maximum(0, y)
def get_data(n_samples):
    """Fetch the data as a Pandas dataframe.

    The dataframe is structured such that every column is a feature (the last
    column is the target variable, i.e., the y) and every row is associated
    with a certain date/observation time.
    """
    features_df = pd.DataFrame({
        'x1':
        beta.rvs(2, 5, size=n_samples),
        'x2': (19 + 11.2 * (np.random.rand(n_samples)**
                            (3 / 2) - 0.4)) / np.sqrt(252),
        'x3':
        np.maximum(1e-4, beta.rvs(1, 2000, size=n_samples)),
    })
    # Make x3 weakly dependent on the other two features.
    features_df['x3'] += (features_df['x1'] / 1000 +
                          features_df['x2'] / 20000)
    data_df = features_df  # alias: the 'y' column is added in place below
    data_df['y'] = create_y(features_df)
    # BUG FIX: the original `list(set(columns) - {'y'})` produced an
    # arbitrary feature order that could differ between interpreter runs,
    # silently reordering the returned columns.  Preserve insertion order.
    feature_names = [col for col in data_df.columns if col != 'y']
    return data_df[feature_names + ['y']]
def split_features_target(data):
    """Separate the dataframe into its feature columns and the 'y' target.

    Returns a (features, target) pair; feature column order is preserved.
    """
    non_target_cols = [name for name in data.columns if name != 'y']
    return data[non_target_cols], data['y']
def get_datasets(n_samples):
    """Get the data and split it into the training, validation, and test sets.

    The total number of samples is given by n_samples.  Splits are
    chronological (shuffle=False): 64% train, 16% validation, 20% test.
    """
    full_data = get_data(n_samples)
    # Carve off the last 20% as the test set, then the last 20% of the
    # remainder as the validation set.
    remainder, test = train_test_split(full_data, test_size=0.2, shuffle=False)
    train, valid = train_test_split(remainder, test_size=0.2, shuffle=False)
    return train, valid, test
#get the training sets
# Generate 75k synthetic samples and split them chronologically into
# train / validation / test (the test set is unused below).
train_data, valid_data, test_data = get_datasets(75_000)
train_features, train_target = split_features_target(train_data)
valid_features, valid_target = split_features_target(valid_data)
# Wrap the training set in xgboost's native data structure for xgb.train.
dtrain = xgb.DMatrix(train_features, label=train_target)
# Booster parameters for the native xgb.train() API.
xgb_dict = {
    #'booster': 'dart',
    'booster': 'gbtree',
    #'booster': 'gblinear',
    'max_depth': 2,
    # BUG FIX: 'random_state' is a scikit-learn wrapper parameter; the
    # native xgb.train() API (xgboost 0.90) silently ignores it and always
    # subsamples with the default seed — which is why changing the value
    # never changed the train/validation errors.  The native parameter
    # name is 'seed'.
    'seed': 100,
    'random_state': 100,  # kept for reference; a no-op under xgb.train
    'learning_rate': 0.10,
    'objective': 'reg:squarederror',
    'verbosity': 1,
    'sample_type': 'uniform',   # dart-only; ignored by gbtree
    'subsample': 0.6,           # row subsampling — the source of randomness
    'normalize_type': 'tree',   # dart-only; ignored by gbtree
    'rate_drop': 0.0,           # dart-only; ignored by gbtree
    'skip_drop': 0.0,           # dart-only; ignored by gbtree
    'min_child_weight': 1,
}
#starting hyperparameter of training rounds
training_rounds = 200
#train the model
bst = xgb.train(
    xgb_dict,
    dtrain,
    training_rounds,
    evals=[(xgb.DMatrix(valid_features, label=valid_target), 'RMSE')],
    early_stopping_rounds=10)  #use the validation set to find the early stop
# BUG FIX: the original predicted with ntree_limit=training_rounds, which
# throws away the early-stopping result.  Use the best iteration found on
# the validation set (fall back to the full budget if early stopping
# never fired and the attribute is absent).
best_limit = getattr(bst, 'best_ntree_limit', training_rounds)
#get training predictions and MSE
xgb_train_pred = bst.predict(xgb.DMatrix(train_features), ntree_limit=best_limit)
# sklearn convention is mean_squared_error(y_true, y_pred); MSE is
# symmetric, so this only fixes the argument order, not the value.
xgb_train_mse = mean_squared_error(train_target, xgb_train_pred)
#get valid predictions and MSE
xgb_valid_pred = bst.predict(xgb.DMatrix(valid_features), ntree_limit=best_limit)
xgb_valid_mse = mean_squared_error(valid_target, xgb_valid_pred)