Я написал функцию для вычисления значений KS для различных итераций регрессионной модели логистики c. Когда я запускаю код, значение KS выводится на экран, но оно не сохраняется в таблице.
Пользовательская функция KS
def ks_value(bad_flag=None, predicted_prob=None):
## Data Prep
ksdf = pd.DataFrame([])
ksdf['bad_flag'] = bad_flag
ksdf['probability'] = predicted_prob
ksdf = ksdf.reset_index()
ksdf.drop(columns=['index'],inplace=True)
ksdf['decile'] = pd.qcut(ksdf['probability'],10,labels=['1','2','3','4','5','6','7','8','9','10'])
ksdf['good_flag'] = 1-ksdf['bad_flag']
ksdf.head()
## Pivot
ksdf1 = pd.pivot_table(data=ksdf,index=['decile'],values=['bad_flag','good_flag','probability'],
aggfunc={'bad_flag':[np.sum],
'good_flag':[np.sum],
'probability' : [np.min,np.max]})
## Add Columns
ksdf1['total_counts'] = ksdf1['bad_flag']+ksdf1['good_flag']
ksdf1 = ksdf1.reset_index()
ksdf1.columns = ['Decile','Defaulter_Count','Non-Defaulter_Count','max_score','min_score','Total_Count']
ksdf1 = ksdf1.sort_values(by='min_score',ascending=False)
ksdf1['Default_Rate'] = (ksdf1['Defaulter_Count'] / ksdf1['Total_Count']).apply('{0:.2%}'.format)
default_sum = ksdf1['Defaulter_Count'].sum()
non_default_sum = ksdf1['Non-Defaulter_Count'].sum()
ksdf1['Default %'] = (ksdf1['Defaulter_Count']/default_sum).apply('{0:.2%}'.format)
ksdf1['Non_Default %'] = (ksdf1['Non-Defaulter_Count']/non_default_sum).apply('{0:.2%}'.format)
## Compute KS
ksdf1['ks_stats'] = np.round(((ksdf1['Defaulter_Count'] / ksdf1['Defaulter_Count'].sum()).cumsum() -(ksdf1['Non-Defaulter_Count'] / ksdf1['Non-Defaulter_Count'].sum()).cumsum()), 4) * 100
return(ksdf1['ks_stats'].max())
Код для итерации в Logisti c Регрессия
# Iterating to find the Optimal value of C for model overfitting - Checks on Test Data
C_param_range = [0.001,0.01,0.1,1,10,100]
table1 = pd.DataFrame(columns = ['C_parameter','Test Accuracy','Train Accuracy','Test KS','Train KS'])
table1['C_parameter'] = C_param_range
j = 0
for i in C_param_range:
# Apply logistic regression model to training data
lr = LogisticRegression(penalty = 'l2', C = i,random_state = 0,max_iter = 1000)
lr.fit(X_train,y_train)
# Predict class (0,1) using model
y_pred = lr.predict(X_test)
y_pred2 = lr.predict(X_train)
y_prob = lr.predict_proba(X_test)[:,1]
y_prob2 = lr.predict_proba(X_train)[:,1]
# KS Value
table1.iloc[j,3] = ks_value(bad_flag=y_test, predicted_prob=y_prob)
table1.iloc[j,4] = ks_value(bad_flag=y_train, predicted_prob=y_prob2)
# Saving accuracy score in table
table1.iloc[j,1] = accuracy_score(y_test,y_pred)
table1.iloc[j,2] = accuracy_score(y_train,y_pred2)
j += 1
Вывод выглядит примерно так: KS - 35,49, KS - 34,25
C_parameter TestAccuracy TrainAccuracy TestKS TrainKS
0.001 0.919911 0.919056 NaN NaN
.