У меня есть набор данных бинарной классификации, который помогает мне выполнять бинарную классификацию для наборов данных AF (фибрилляция предсердий) и VT (желудочковая тахикардия). Но мои результаты конфигурации графика ROC таковы, что для каждой итерации он дает NAN, а для 4-гоодин показывает 0,85.Помогите мне в этом.
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn import svm
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
#Preprocessing of NSR
data11 = pd.read_csv("NOM_ECG_ELEC_POTL_IIWaveExport.csv")
data11 = data11.drop('Unnamed: 2',axis=1)
data11.columns = ['date','values']
data11=data11.drop('date',axis=1)
data11=np.array_split(data11,6636)
#data =list(data11.values())
index_list = pd.DataFrame()
for i in range(0,len(data11)):
indices = pd.DataFrame(data11[i])
#indices = pd.DataFrame(indices)
indices.columns =['value']
indices.index = indices.value
#indices = pd.concat([indices,data11],join ="inner",axis=1)
#indices = indices.drop(['value','date'],axis=1)
indices = indices.values.reshape(1,-1)
indices =pd.DataFrame(indices)
index_list = index_list.append(indices)
index_list = index_list.reset_index()
index_list=index_list.drop('index',axis=1)
index_list['output']=0
#Preprocessing of AF
data22 = pd.read_csv("VT.csv")
data22 = data22.drop('Unnamed: 2',axis=1)
data22.columns = ['date','values']
data22=data22.drop('date',axis=1)
data22=np.array_split(data22,4732)
#new_df_1 = pd.DataFrame()
#data2 = data22.groupby(['date']).groups
index_list_1 = pd.DataFrame()
for j in range(0,len(data22)):
indices = pd.DataFrame(data22[j])
#indices = pd.DataFrame(indices)
indices.columns =['value']
indices.index = indices.value
#indices = pd.concat([indices,data22],join ="inner",axis=1)
#indices = indices.drop(['value','date'],axis=1)
indices = indices.values.reshape(1,-1)
indices =pd.DataFrame(indices)
index_list_1 = index_list_1.append(indices)
index_list_1=index_list_1.fillna('9000')
index_list_1 = index_list_1.reset_index()
index_list_1=index_list_1.drop('index',axis=1)
index_list_1['output']=1
#Adding noise #Creating a random dataframe
df = pd.DataFrame(np.random.randint(8000,9000,size=(6636,1001)))
df['1001']=np.NAN
df.rename({'1001':'output'}, axis=1)
df=df.fillna('0')
#index_list_1.iloc[:,0:1001] = np.NAN
#cols1=index_list_1.iloc[:,1001:1002]
#y=cols1
#y.iloc[:,1001:1002]=np.NAN
#cols1=cols1.fillna('1')
#cols1 = index_list_1.iloc[:,0:1001]
#cols= index_list.fillna('9000')
merged = pd.concat([index_list_1,index_list])
merged=merged.drop(merged.columns[-2],axis=1)
#total=pd.concat([index_list_1,df])
#total=pd.concat([merged,df])
#merged_1=shuffle(merged)
#degree=merged['output']
#degree=degree.fillna('1')
#merged=pd.concat([cols1,degree])
#merged=merged.fillna('9000')
#merged_1 = merged_1.reset_index()
#merged_1 = merged_1.drop(['index'],axis=1)
merged.to_csv("AF_Prediction.csv")
cols2=merged['output']
#cols1=merged['output']
#cols1=pd.Dataframe(cols1)
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
#Create training and testing variables
cols2=pd.DataFrame(cols2)
X_train, X_test, y_train, y_test = train_test_split(merged,cols2, test_size=0.2)
y_train=pd.DataFrame(y_train)
#Instantiate Model
model = LogisticRegression()
#Hypertuning the Model
model= LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=1.0)
#X_train.head()
#y_train.head()
#Fit Model
y_train=y_train.astype('bool')
model.fit(X_train, y_train)
# make class predictions for the testing set
y_pred_class = model.predict(X_test)
y_pred_class=pd.DataFrame(y_pred_class)
y_test=y_test.astype('bool')
y_pred_class=y_pred_class.astype('bool')
# calculate accuracy
from sklearn import metrics
from sklearn.metrics import accuracy_score
print(metrics.accuracy_score(y_test,y_pred_class,sample_weight=None))
#Confusion Matrix for the model
from sklearn.metrics import *
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import ParameterGrid
#param_grid = [{'kernel': ['linear']},{'kernel': ['rbf'], 'gamma': [1, 10]}]
#from sklearn.model_selection.ParameterGrid(param_grid)
cf=confusion_matrix(y_test,y_pred_class)
print(classification_report(y_test,y_pred_class))
#Support Vector Machine -Grid Search
def svc_param_selection(X, y, nfolds):
#Cs = [0.001, 0.01, 0.1, 1, 10]
#gammas = [0.001, 0.01, 0.1, 1]
#param_grid = {'C': Cs, 'gamma' : gammas}
param_grid = [{'kernel': ['linear']},{'kernel': ['rbf'], 'gamma': [1, 10]}]
grid_search = GridSearchCV(svm.SVC(param_grid))
grid_search.fit(X_train, y_train)
grid_search.best_params_
return grid_search.best_params_
best_params_=svc_param_selection(merged,cols2,10)
#Support Vector Machine classifier
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(cols, y, test_size = 0.50)
from sklearn.svm import SVC
svclassifier = SVC(kernel='rbf')
svclassifier.fit(X_train, y_train)
#Make prediction on SVM
y_pred = svclassifier.predict(X_test)
y_pred=y_pred.reshape(-1,1)
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
#K-Cross validation
from sklearn.model_selection import cross_val_score
clf = SVC(kernel='linear', C=1)
scores = cross_val_score(clf,X_train ,y_train , cv=10)
from scipy import interp
from itertools import cycle
from sklearn.metrics import roc_curve,auc
from sklearn.model_selection import KFold
tprs =[]
aucs= []
mean_fpr = np.linspace(0,1,100)
cv=KFold(n_splits=10)
i = 0
for train, test in cv.split(merged, cols2):
probas_ = model.fit(merged.iloc[train], cols2.iloc[train]).predict_proba(merged.iloc[test])
# Compute ROC curve and area the curve
fpr, tpr, thresholds = roc_curve(cols2.iloc[test], probas_[:, 1])
tprs.append(interp(mean_fpr, fpr, tpr))
tprs[-1][0] = 0.0
roc_auc = auc(fpr, tpr)
aucs.append(roc_auc)
plt.plot(fpr, tpr, lw=1, alpha=0.3,
label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
i += 1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
label='Chance', alpha=.8)
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
lw=2, alpha=.8)
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
label=r'$\pm$ 1 std. dev.')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.savefig('roc 10fold for vitals only xgb.png')
plt.show()
#auc = metrics.roc_auc_score(y_test, y_pred)
#plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
#plt.legend(loc=4)
#plt.show()
#fpr, tpr, thresholds = metrics.roc_curve(y_test,
model.predict_proba(x_test)[:,1])
# Calculate Area under
#for m in models:
# model = m['model'] # select the model
# model.fit(X_train, Y_train) # train the model
# y_pred=model.predict(X_test) # predict the test data
# Compute False postive rate, and True positive rate
#fpr, tpr, thresholds = metrics.roc_curve(y_test,
model.predict_proba(X_test)[:,1])
# Calculate Area under the curve to display on the plot
#auc = metrics.roc_auc_score(Y_test,model.predict(X_test))
# Now, plot the computed values
#plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], auc))
#plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], auc)
)