My dataset looks like this (first 20 records). The script I am testing is shown below.
Credit_Score Net_Advance APR Mosaic Time_at_Address Time_in_Employment Time_with_Bank Value_of_Property Total_Outstanding_Balances Age
918 3000 14.4 46 132 288 168 178,000.00 64406 46
903 21000 7.9 16 288 37 300 180,000.00 31614 59
1060 7200 7.9 17 276 154 369 199,000.00 26045 56
839 8000 16.9 47 48 82 216 120,000.00 181217 33
1057 7650 7.4 55 156 342 510 180,000.00 63811 49
913 33000 9.4 59 18 170 240 205,000.00 219003 45
840 8000 15.9 12 293 77 317 179,000.00 90797 51
961 5300 11.9 43 163 351 243 92,000.00 84624 49
901 12000 11.9 11 108 24 180 180,000.00 158678 55
915 6000 12.9 49 36 72 384 120,000.00 2785 48
840 10150 12.4 24 37 58 261 110,000.00 109231 27
968 18000 8.4 24 2 168 420 120,000.00 85502 49
904 10000 8.7 46 24 8 174 150,000.00 157718 37
924 8000 9.9 47 418 439 379 120,000.00 2827 72
896 5000 9.4 15 4 240 300 246,000.00 257560 48
804 5000 17.1 44 12 36 240 165,000.00 160650 37
840 21200 11.5 44 339 133 231 117,000.00 31316 50
862 2000 31.9 18 44 63 186 291,000.00 279819 35
785 1100 40.9 23 94 54 150 120,000.00 789 39
847 20000 9.4 16 237 309 326 272,000.00 170348 59
Here is my actual code.
# Using both Regression and Classification to measure the Credit Score of a customer
import numpy as np
import pandas as pd
from sklearn import datasets
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection  # random forest model creation
from sklearn.model_selection import train_test_split  # implementing train-test split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
# load the CSV into a data frame; thousands=',' strips the comma separators in columns like Value_of_Property
df = pd.read_csv("C:\\my_path\\credit.csv", encoding="ISO-8859-1", sep=',', thousands=',')
# view a small sample of the data for peace of mind
df.head()
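# (sanity check I would add here, my own addition: confirm that thousands=','
# really parsed Value_of_Property as a number rather than a string, since a
# leftover object-typed column would break or silently hurt the models below)
print(df.dtypes)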
features = np.array(['Net_Advance', 'APR', 'Mosaic', 'Mosaic_Class', 'Time_at_Address', 'Number_of_Dependants', 'Time_in_Employment', 'Income_Range', 'Time_with_Bank', 'Value_of_Property', 'Outstanding_Mortgage_Bal', 'Total_Outstanding_Balances', 'Age'])
clf = RandomForestClassifier()
clf.fit(df[features], df['Credit_Score'])
# from the calculated importances, order them from most to least important
# and make a barplot so we can visualize what is/isn't important
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)
padding = np.arange(len(features)) + 0.5
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, features[sorted_idx])
plt.xlabel("Relative Importance")
plt.title("Variable Importance")
plt.show()
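# (another check worth doing here, my own addition: Credit_Score looks nearly
# continuous, so a classifier treats every distinct score as its own class and
# exact-match accuracy can be near zero even for a reasonable model)
print(df['Credit_Score'].nunique(), "distinct target values in", len(df), "rows")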
# try PCA & LDA methodologies
# first PCA ...
X = df[['Net_Advance', 'APR', 'Mosaic', 'Mosaic_Class', 'Time_at_Address', 'Number_of_Dependants', 'Time_in_Employment', 'Income_Range', 'Time_with_Bank', 'Value_of_Property', 'Outstanding_Mortgage_Bal', 'Total_Outstanding_Balances', 'Age']]
y = df['Credit_Score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=66)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.decomposition import PCA
# fit a full PCA first to see how much variance each component explains
pca = PCA()
pca.fit(X_train)
explained_variance = pca.explained_variance_ratio_
# then keep a single component for the classifier below
pca = PCA(n_components=1)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
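# (diagnostic I would add here, my own addition: with n_components=1 only the
# first principal component survives; printing the ratios shows how much of
# the total variance that single component actually retains)
print(explained_variance)
print("variance kept by 1 component:", explained_variance[0])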
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Performance Evaluation
from sklearn.metrics import accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy ' + str(accuracy_score(y_test, y_pred)))
# Result:
Accuracy 0.009062326613648974
So my question is: how can the accuracy be so, so low? That is essentially zero learning, based on the code and results shown above. On top of that, when I test several other concepts/experiments, as described below, I see accuracy of around 60% at best. I would have expected results of around 90% accuracy... Here is the code I am testing.
# Bagging Classifier
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
model = BaggingClassifier(tree.DecisionTreeClassifier(random_state=1))
model.fit(X_train, y_train)
model.score(X_test, y_test)
# around 5% accurate. horrible!
# Bagging Regressor
from sklearn.ensemble import BaggingRegressor
model = BaggingRegressor(tree.DecisionTreeRegressor(random_state=1))
model.fit(X_train, y_train)
model.score(X_test, y_test)
# almost 0.65 -- note that .score() on a regressor returns R^2, not accuracy; better, but not great!
# AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(random_state=1)
model.fit(X_train, y_train)
model.score(X_test, y_test)
# just 1% accurate! no way!!
# AdaBoostRegressor
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)
# around 0.60 (R^2). just ok.
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(learning_rate=0.1, random_state=1)
model.fit(X_train, y_train)
model.score(X_test, y_test)
# around 60%. just ok.
# GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)
# around 0.60 (R^2). just ok.
# XGBClassifier
import xgboost as xgb
model = xgb.XGBClassifier(random_state=1, learning_rate=0.1)
model.fit(X_train, y_train)
model.score(X_test, y_test)
# around 60%. just ok.
# XGBRegressor
model = xgb.XGBRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)
# around 0.60 (R^2). just ok.
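For reference, here is a small self-contained sketch of how I would sanity-check the setup (my own addition, using only the column names visible in the sample above): treat Credit_Score as a regression target and report the mean absolute error, which is easier to interpret than exact-match accuracy on a target with hundreds of distinct values.
# hypothetical sanity check, separate from the pipeline above
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

df = pd.read_csv("C:\\my_path\\credit.csv", encoding="ISO-8859-1", thousands=',')
features = ['Net_Advance', 'APR', 'Mosaic', 'Time_at_Address', 'Time_in_Employment',
            'Time_with_Bank', 'Value_of_Property', 'Total_Outstanding_Balances', 'Age']
X_tr, X_te, y_tr, y_te = train_test_split(
    df[features], df['Credit_Score'], test_size=0.33, random_state=66)
reg = RandomForestRegressor(random_state=0)
reg.fit(X_tr, y_tr)
print("MAE:", mean_absolute_error(y_te, reg.predict(X_te)))  # average error in score points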
Any idea why this is wrong in the first place???