ML начинающий здесь.
У меня есть набор данных, содержащий многомерные метки, как непрерывные, так и двоичные. Когда я запускаю XGBRegressor или GradientBoostingRegressor, я получаю ужасную оценку дисперсии / r2, равную 0,05 иш.
Целевая метка непрерывна, измеряется в int как процент * 100 в запросе sql (например, 17, 33, 50,66 и т. Д. До 100)
Выбор функции (RFE) показывает, что двоичные метки не влияют. Я что-то не так делаю в отношении двоичных меток или, скорее, что я делаю неправильно?
Набор данныхпример:
Столбцы:
ID_NO, GENDER, AGE, ACTIVEYEARS, LASTACTIVE, LASTCANCEL, CHILDREN,
POLINFORCE, POLCANCELLED, EDICANCELLED, REMCANCELLED, QUOTES, CLAIMS,
OWNCLAIMS, REMCNT, PREMIUM, CLAIMEXPENSES, HHMEMBERS, HHPOLINFORCE,
HHPOLCAN, HHQUOTES, HHCLAIMS, HHOWNCLAIMS, HHCLAIMEXPENSES,
HHREMCNT, CAROWNER, HOMEOWNER, EDICAN
GENDER, CAROWNER, HOMEOWNER
являются двоичными, остальные - целыми.
Метки:
22731254,0,34,4,3,0,2,0,5,3,0,0,0,0,0,124,0,3,0,1,0,0,0,0,0,0,0,60
22731335,1,31,1,3,0,0,0,1,0,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,0,0
22763768,0,64,5,6,0,0,0,3,0,0,0,1,0,0,49,0,0,0,0,0,0,0,0,0,0,0,0
22770802,0,68,6,1,0,0,3,3,0,0,0,1,3,2,170,40,0,0,0,0,0,0,0,0,1,0,0
22770993,1,71,7,0,0,0,3,0,0,0,0,3,2,0,65,18,0,0,0,0,0,0,0,0,1,0,0
22797611,0,39,5,2,0,1,1,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0
22796119,1,56,0,6,0,0,0,4,0,0,0,0,0,0,13,0,3,0,0,0,0,0,0,0,0,0,0
22797999,0,59,3,3,0,0,0,3,0,0,0,1,0,0,86,0,0,0,0,0,0,0,0,0,0,0,0
22798006,1,58,5,3,0,0,0,0,4,0,0,0,0,0,45,6,0,0,3,0,1,0,0,0,0,0,100
22802852,1,60,6,1,0,0,0,6,1,0,0,2,0,0,319,12,4,1,2,0,1,0,66,0,0,0,17
22619616,0,53,7,1,0,0,4,3,0,0,1,2,0,0,93,1,1,0,0,0,0,0,0,0,1,0,0
Я использовал GridCVsearchчтобы получить оптимальный гиперпарам, без каких-либо различий в результате.
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score, mean_squared_log_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, train_test_split, cross_val_score, cross_val_predict, cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from xgboost import XGBRegressor
def clean_dataset(df):
assert isinstance(df, pd.DataFrame), "df needs to be a pandas.DataFrame"
df.dropna(inplace=True)
indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
return df[indices_to_keep].astype(np.float64)
colnames = ["ID_NO", "GENDER", "AGE", "ACTIVEYEARS", "LASTACTIVE", "LASTCANCEL", "CHILDREN", "POLINFORCE",
"POLCANCELLED", "EDICANCELLED", "REMCANCELLED", "QUOTES", "CLAIMS", "OWNCLAIMS", "REMCNT", "PREMIUM",
"CLAIMEXPENSES", "HHMEMBERS", "HHPOLINFORCE", "HHPOLCAN", "HHQUOTES", "HHCLAIMS", "HHOWNCLAIMS",
"HHCLAIMEXPENSES", "HHREMCNT", "CAROWNER", "HOMEOWNER", "EDICAN"]
df = pd.read_csv("C:\\Downloads\\edican_so.csv", index_col="ID_NO", names=colnames, delimiter=",", header=0)
df = clean_dataset(df)
print(df.head())
Xtrainarray = df.values
ytrainarray = df.columns
X = shuffle(Xtrainarray[:, 0:df.shape[1] - 1])
y = shuffle(Xtrainarray[:, df.shape[1] - 1:df.shape[1]])
scaler = MinMaxScaler(copy=True, feature_range=(0, 1)).fit(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
def feature_ranking_rfe(estimator, Xtrain, ytrain, Xtest, ytest):
rfe = RFE(estimator, 6)
rfe_fit = rfe.fit(Xtrain, ytrain)
print("XGBregressor Feature Ranking: %s" % rfe_fit.ranking_)
xgb_regressor = XGBRegressor(base_score=0.5,
objective='reg:squarederror',
max_depth=1,
min_child_weight=9,
subsample=0.43,
colsample_bytree=0.55,
colsample_bylevel=0.95,
reg_alpha=0.001,
reg_lambda=0.001,
gamma=0.001,
learning_rate=0.000001,
scale_pos_weight=0.01,
n_estimators=6000)
def xgbr(estimator, Xtrain, ytrain, Xtest, ytest, nsplits=10, randomstate=42):
kfold = KFold(n_splits=nsplits, random_state=randomstate)
neg_mean_squared_error_scorer = make_scorer(mean_squared_error, greater_is_better=False)
xgb_fit = estimator.fit(Xtrain, ytrain,
eval_set=[(Xtest, ytest)],
eval_metric=['rmse'],
early_stopping_rounds=30,
verbose=False)
score = xgb_fit.score(Xtest, ytest)
print(score.mean())
try:
xgb_mse_score = cross_val_score(xgb_fit, Xtrain, ytrain, cv=kfold, scoring=neg_mean_squared_error_scorer)
print("XGBRegressor Train mean_squared_error: ", xgb_mse_score.mean(), xgb_mse_score.std())
except Exception as e:
print("error in xgb_mse_score")
print(e)
try:
y_pred = xgb_fit.predict(Xtest)
expl_var_scores = explained_variance_score(ytest, y_pred)
print("XGBRegressor predict explained_variance_score:", expl_var_scores)
r2_scores = r2_score(ytest, y_pred)
print("XGBRegressor predict r2_score:", r2_scores)
msle_score = mean_squared_log_error(ytest, y_pred)
print("XGBRegressor predict mean_squared_log_error: %.2f" % msle_score)
mse_score = mean_squared_error(ytest, y_pred)
print("XGBRegressor predict mean_squared_error: %.2f" % mse_score)
except Exception as e:
print(e)
if __name__ == "__main__":
feature_ranking_rfe(xgb_regressor, X_train, y_train.ravel(), X_test, y_test.ravel())
xgbr(xgb_regressor, X_train, y_train.ravel(), X_test, y_test.ravel())