Continuous and binary variables in a dataset for regression
Asked 03 November 2019

ML beginner here.

I have a dataset with many variables, both continuous and binary. When I run XGBRegressor or GradientBoostingRegressor, I get a terrible explained-variance / r2 score of around 0.05.

The target label is continuous and stored as an integer: a percentage computed as fraction * 100 in the SQL query (e.g. 17, 33, 50, 66 and so on, up to 100).
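For illustration, the label is produced roughly like this (a hypothetical Python sketch; the real computation is done in SQL and the names here are made up):

# Hypothetical sketch of the label computation (the real one is in SQL):
# a fraction is scaled to an integer percentage.
def edican_label(events: int, total: int) -> int:
    return round(100 * events / total)  # e.g. round(100 * 1 / 6) == 17

So the target takes a limited set of integer values between 0 and 100.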

Feature selection (RFE) shows that the binary variables have no influence. Am I handling the binary variables incorrectly, or rather, what am I doing wrong?
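To illustrate what I mean by "no influence", here is a self-contained toy sketch of how I read RFE's output (the data and column names here are invented, not my real dataset):

import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor

# Toy data: five continuous columns plus three binary ones.
rng = np.random.default_rng(42)
X_demo = np.hstack([rng.normal(size=(200, 5)),
                    rng.integers(0, 2, size=(200, 3))])
y_demo = 50 * X_demo[:, 0] + rng.normal(scale=5, size=200)

rfe = RFE(XGBRegressor(objective="reg:squarederror"), n_features_to_select=4)
rfe.fit(X_demo, y_demo)

cols = ["cont%d" % i for i in range(5)] + ["bin%d" % i for i in range(3)]
# ranking_ == 1 means the feature was kept; higher ranks were
# eliminated earlier in the recursion.
print(pd.Series(rfe.ranking_, index=cols).sort_values())

In my real run the binary columns (GENDER, CAROWNER, HOMEOWNER) are all eliminated early.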

Dataset example:

Columns:

ID_NO, GENDER, AGE, ACTIVEYEARS, LASTACTIVE, LASTCANCEL, CHILDREN, 
POLINFORCE, POLCANCELLED, EDICANCELLED, REMCANCELLED, QUOTES, CLAIMS,
OWNCLAIMS, REMCNT, PREMIUM, CLAIMEXPENSES, HHMEMBERS, HHPOLINFORCE, 
HHPOLCAN, HHQUOTES, HHCLAIMS, HHOWNCLAIMS, HHCLAIMEXPENSES, 
HHREMCNT, CAROWNER, HOMEOWNER, EDICAN

GENDER, CAROWNER and HOMEOWNER are binary; the rest are integers.

Rows:

22731254,0,34,4,3,0,2,0,5,3,0,0,0,0,0,124,0,3,0,1,0,0,0,0,0,0,0,60
22731335,1,31,1,3,0,0,0,1,0,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,0,0
22763768,0,64,5,6,0,0,0,3,0,0,0,1,0,0,49,0,0,0,0,0,0,0,0,0,0,0,0
22770802,0,68,6,1,0,0,3,3,0,0,0,1,3,2,170,40,0,0,0,0,0,0,0,0,1,0,0
22770993,1,71,7,0,0,0,3,0,0,0,0,3,2,0,65,18,0,0,0,0,0,0,0,0,1,0,0
22797611,0,39,5,2,0,1,1,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0
22796119,1,56,0,6,0,0,0,4,0,0,0,0,0,0,13,0,3,0,0,0,0,0,0,0,0,0,0
22797999,0,59,3,3,0,0,0,3,0,0,0,1,0,0,86,0,0,0,0,0,0,0,0,0,0,0,0
22798006,1,58,5,3,0,0,0,0,4,0,0,0,0,0,45,6,0,0,3,0,1,0,0,0,0,0,100
22802852,1,60,6,1,0,0,0,6,1,0,0,2,0,0,319,12,4,1,2,0,1,0,66,0,0,0,17
22619616,0,53,7,1,0,0,4,3,0,0,1,2,0,0,93,1,1,0,0,0,0,0,0,0,1,0,0

I used GridSearchCV to find optimal hyperparameters, with no difference in the result.
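A minimal sketch of the kind of search I ran (the grid values here are illustrative, not my exact grid):

from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Illustrative grid only; the real search covered more parameters.
param_grid = {
    "max_depth": [1, 3, 5],
    "learning_rate": [1e-6, 1e-3, 0.1],
    "n_estimators": [500, 2000, 6000],
}
search = GridSearchCV(XGBRegressor(objective="reg:squarederror"),
                      param_grid, scoring="r2", cv=5)
# search.fit(X_train, y_train.ravel()); print(search.best_params_)

The best parameters it returned are the ones hard-coded into the XGBRegressor below.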

import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score, mean_squared_log_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, train_test_split, cross_val_score, cross_val_predict, cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from xgboost import XGBRegressor


def clean_dataset(df):
    """Drop rows containing NaN or +/-inf values and cast everything to float64."""
    assert isinstance(df, pd.DataFrame), "df needs to be a pandas.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)


colnames = ["ID_NO", "GENDER", "AGE", "ACTIVEYEARS", "LASTACTIVE", "LASTCANCEL", "CHILDREN", "POLINFORCE",
            "POLCANCELLED", "EDICANCELLED", "REMCANCELLED", "QUOTES", "CLAIMS", "OWNCLAIMS", "REMCNT", "PREMIUM",
            "CLAIMEXPENSES", "HHMEMBERS", "HHPOLINFORCE", "HHPOLCAN", "HHQUOTES", "HHCLAIMS", "HHOWNCLAIMS",
            "HHCLAIMEXPENSES", "HHREMCNT", "CAROWNER", "HOMEOWNER", "EDICAN"]

# Read the CSV export, using ID_NO as the index and the explicit column names above.
df = pd.read_csv("C:\\Downloads\\edican_so.csv", index_col="ID_NO", names=colnames, delimiter=",", header=0)
df = clean_dataset(df)

print(df.head())

# All columns except the last (EDICAN) are features; EDICAN is the target.
values = df.values
X = values[:, :-1]
y = values[:, -1:]

# Shuffle X and y together so rows stay aligned; shuffling them
# independently would destroy the feature/label correspondence.
X, y = shuffle(X, y, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)

# Fit the scaler on the training split only, to avoid leaking test data.
scaler = MinMaxScaler(copy=True, feature_range=(0, 1)).fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

def feature_ranking_rfe(estimator, Xtrain, ytrain, Xtest, ytest):
    # Recursively eliminate features until six remain and report the ranking.
    rfe = RFE(estimator, n_features_to_select=6)
    rfe_fit = rfe.fit(Xtrain, ytrain)
    print("XGBRegressor Feature Ranking: %s" % rfe_fit.ranking_)

# Hyperparameters taken from the GridSearchCV run mentioned above.
xgb_regressor = XGBRegressor(base_score=0.5,
                             objective='reg:squarederror',
                             max_depth=1,
                             min_child_weight=9,
                             subsample=0.43,
                             colsample_bytree=0.55,
                             colsample_bylevel=0.95,
                             reg_alpha=0.001,
                             reg_lambda=0.001,
                             gamma=0.001,
                             learning_rate=0.000001,
                             scale_pos_weight=0.01,
                             n_estimators=6000)

def xgbr(estimator, Xtrain, ytrain, Xtest, ytest, nsplits=10, randomstate=42):
    # random_state only takes effect on KFold when shuffle=True.
    kfold = KFold(n_splits=nsplits, shuffle=True, random_state=randomstate)

    neg_mean_squared_error_scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # Fit with early stopping monitored on the held-out test set.
    xgb_fit = estimator.fit(Xtrain, ytrain,
                            eval_set=[(Xtest, ytest)],
                            eval_metric=['rmse'],
                            early_stopping_rounds=30,
                            verbose=False)

    # score() returns a single R^2 float, not an array, so print it directly.
    print("XGBRegressor test R^2:", xgb_fit.score(Xtest, ytest))

    try:
        xgb_mse_score = cross_val_score(xgb_fit, Xtrain, ytrain, cv=kfold, scoring=neg_mean_squared_error_scorer)
        print("XGBRegressor Train mean_squared_error: ", xgb_mse_score.mean(), xgb_mse_score.std())
    except Exception as e:
        print("error in xgb_mse_score")
        print(e)

    try:
        y_pred = xgb_fit.predict(Xtest)

        expl_var_scores = explained_variance_score(ytest, y_pred)
        print("XGBRegressor predict explained_variance_score:", expl_var_scores)

        r2_scores = r2_score(ytest, y_pred)
        print("XGBRegressor predict r2_score:", r2_scores)

        msle_score = mean_squared_log_error(ytest, y_pred)
        print("XGBRegressor predict mean_squared_log_error: %.2f" % msle_score)

        mse_score = mean_squared_error(ytest, y_pred)
        print("XGBRegressor predict mean_squared_error: %.2f" % mse_score)
    except Exception as e:
        print(e)

if __name__ == "__main__":
    feature_ranking_rfe(xgb_regressor, X_train, y_train.ravel(), X_test, y_test.ravel())
    xgbr(xgb_regressor, X_train, y_train.ravel(), X_test, y_test.ravel())