На тепловой карте корреляционной матрицы появляются пустые ячейки - PullRequest
1 голос
/ 14 января 2020

Я использую этот код, чтобы исследовать корреляцию между моим целевым («золотым») столбцом и остальными столбцами данных, и не понимаю, почему некоторые ячейки на тепловой карте корреляционной матрицы остаются пустыми, как показано ниже (изображение):

    # Run this program on your local python 
# interpreter, provided you have installed 
# the required libraries. 

# Importing the required packages 
import numpy as np 
import pandas as pd 
from sklearn.metrics import confusion_matrix 
from sklearn.cross_validation import train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Module-level column count, initialised to 0. In the code shown here every
# function binds its own local or parameter named column_count, so this
# global is never read or updated by them.
column_count =0 
# Function importing Dataset 
def importdata(path='data1.txt'):
    """Load a header-less, comma-separated dataset and print a short summary.

    Args:
        path: CSV file to read. Defaults to 'data1.txt', the file name that
            used to be hard-coded, so zero-argument calls behave as before.

    Returns:
        A ``(balance_data, column_count)`` tuple: the loaded DataFrame and
        the number of columns it has.
    """
    balance_data = pd.read_csv(path, sep=',', header=None)
    # Local variable on purpose: the module-level column_count is not touched.
    column_count = balance_data.shape[1]

    # Print the dataset shape
    print("Dataset Length: ", len(balance_data))
    print("Dataset Shape: ", balance_data.shape)
    print("Number of columns ", column_count)

    # Print the first dataset observations
    print("Dataset: ", balance_data.head())
    return balance_data, column_count
def columns(balance_data):
    """Return the number of columns of *balance_data*.

    The ``shape`` attribute is unpacked as exactly two values
    (rows, columns); only the second one is returned.
    """
    _n_rows, n_cols = balance_data.shape
    return n_cols

# Univariate selection
def selection(column_count, k=5, path="data1extended.txt"):
    """Score the features against the target with a chi-squared test.

    Reads the CSV at *path*, takes column 0 as the target and columns
    ``1 .. column_count - 1`` as the features, fits ``SelectKBest(chi2)``
    and prints the *k* highest-scoring features.

    Args:
        column_count: exclusive upper bound of the feature column slice.
        k: integer number of top features to select and print. Defaults
            to 5, the value previously hard-coded in two places.
        path: CSV file to read. Defaults to "data1extended.txt", the file
            name that used to be hard-coded.

    Returns:
        ``(X, y, data)``: the feature columns, the target column and the
        full DataFrame that was read.
    """
    data = pd.read_csv(path)
    X = data.iloc[:, 1:column_count]  # independent columns
    y = data.iloc[:, 0]  # target column
    # Fit SelectKBest so that every feature gets a chi-squared score.
    fit = SelectKBest(score_func=chi2, k=k).fit(X, y)
    # One row per feature: its column name next to its score.
    featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})
    print(featureScores.nlargest(k, 'Score'))  # print the k best features
    return X, y, data
# Feature importance
def feature(X, y):
    """Fit an ExtraTreesClassifier on (X, y) and display its importances.

    Prints the model's ``feature_importances_`` array, then shows a
    horizontal bar chart of the five largest importances, labelled with
    the column names of *X*.
    """
    forest = ExtraTreesClassifier()
    forest.fit(X, y)
    # feature_importances_ is provided by the tree-based classifier itself.
    print(forest.feature_importances_)
    # Attach the column names so the bars in the chart are labelled.
    ranked = pd.Series(forest.feature_importances_, index=X.columns)
    top_five = ranked.nlargest(5)
    top_five.plot(kind='barh')
    plt.show()
# Correlation matrix
def correlation(data, column_count):
    """Show an annotated heat map of the pairwise correlations in *data*.

    Args:
        data: DataFrame whose columns are correlated with ``data.corr()``.
        column_count: used for both the width and the height of the
            figure, in inches.

    Entries of the correlation matrix that are NaN are presumably left
    blank by seaborn (its heatmap masks missing values) - confirm against
    the seaborn documentation.
    """
    # Computed once; selecting the same columns and correlating again, as the
    # previous version did, yields this same matrix.
    corrmat = data.corr()
    plt.figure(figsize=(column_count, column_count))
    # plot heat map
    sns.heatmap(corrmat, annot=True, cmap="RdYlGn")
    # Show the figure explicitly, consistent with feature(); without this
    # nothing is displayed when the file is run as a plain script.
    plt.show()
# Driver code
def main():
    """Run the analysis: load data, score features, plot importances and correlations."""
    # Building phase: only the column count from importdata() is used below.
    _raw, n_columns = importdata()

    features, target, extended = selection(n_columns)
    feature(features, target)
    correlation(extended, n_columns)


# Run main() only when the file is executed as a script
if __name__ == "__main__":
    main()

Ссылки на мои входные файлы с данными:

data1.txt: https://drive.google.com/open?id=1qHAkFBboCKoVH4pKIPHoVmeyb3uVCUAk

data1Extended.txt : https://drive.google.com/open?id=1C8GZ2wiqFUW3WuYxyc0G3axgkM1Uwsb6

1 Ответ

1 голос
/ 14 января 2020

Я проверил ваш код: пустые ячейки соответствуют пропущенным значениям (NaN) в результате вызова data[top_corr_features].corr().

Как показано ниже

callersAtLeast1T    CalleesAtLeast1T    callersAllT calleesAllT CallersAtLeast1N    CalleesAtLeast1N    CallersAllN CalleesAllN InterfacesAtLeast1T ImplememntationsAtleast1T   ... ParametersatLeast1T FieldMethodsAtLeast1T   ReturnTypeAtLeast1T ParametersAtLeast1N FieldMethodsAtLeast1N   ReturnTypeN ParametersAllT  FieldMethodsAllT    ParametersAllN  FieldMethodsAllN
callersAtLeast1T    1.000000    0.345678    0.652674    0.296284    -0.543014   -0.247331   -0.462966   -0.186227   NaN NaN ... 0.063052    0.128554    0.128554    -0.066818   -0.132167   -0.132167   0.052226    0.128035    -0.056377   -0.130406
CalleesAtLeast1T    0.345678    1.000000    0.288627    0.545091    -0.312117   0.015580    -0.222012   -0.200932   NaN NaN ... 0.041103    0.100681    0.100681    -0.055963   -0.109964   -0.109964   0.013238    0.090864    -0.060733   -0.111723
callersAllT 0.652674    0.288627    1.000000    0.266658    -0.565592   -0.173942   -0.302166   -0.141829   NaN NaN ... 0.065166    0.113121    0.113121    -0.053850   -0.109912   -0.109912   0.081990    0.110453    -0.048177   -0.108930
calleesAllT 0.296284    0.545091    0.266658    1.000000    -0.229375   -0.177132   -0.165415   -0.109526   NaN NaN ... 0.078756    0.105691    0.105691    -0.027136   -0.075527   -0.075527   0.044648    0.107448    -0.033105   -0.074852
CallersAtLeast1N    -0.543014   -0.312117   -0.565592   -0.229375   1.000000    0.094479    0.534247    0.099672    NaN NaN ... -0.008624   -0.099353   -0.099353   0.095209    0.073561    0.073561    -0.008887   -0.099207   0.085180    0.070968
CalleesAtLeast1N    -0.247331   0.015580    -0.173942   -0.177132   0.094479    1.000000    0.133004    0.618332    NaN NaN ... -0.034831   -0.043234   -0.043234   0.014988    0.010699    0.010699    -0.050421   -0.048834   -0.023962   0.003937
CallersAllN -0.462966   -0.222012   -0.302166   -0.165415   0.534247    0.133004    1.000000    0.157877    NaN NaN ... 0.026512    -0.059774   -0.059774   0.091514    0.106280    0.106280    0.048346    -0.061887   0.127316    0.101202
CalleesAllN -0.186227   -0.200932   -0.141829   -0.109526   0.099672    0.618332    0.157877    1.000000    NaN NaN ... -0.004726   -0.022656   -0.022656   0.080276    0.080003    0.080003    -0.015513   -0.020702   0.044586    0.083339
InterfacesAtLeast1T NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ImplememntationsAtleast1T   NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
childrenAtLeast1T   0.037713    0.045836    0.057783    0.084090    -0.032681   -0.014895   -0.017460   -0.009210   NaN NaN ... -0.002740   -0.004719   -0.004719   -0.003112   -0.006351   -0.006351   -0.002465   -0.004677   -0.002784   -0.006294
parentsAtLeast1T    0.037713    0.045836    0.057783    0.084090    -0.032681   -0.014895   -0.017460   -0.009210   NaN NaN ... -0.002740   -0.004719   -0.004719   -0.003112   -0.006351   -0.006351   -0.002465   -0.004677   -0.002784   -0.006294
InterfacesAtLeast1N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ImplementationsAtLeast1N    NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
childrenAtLeast1N   -0.023797   -0.019580   -0.015532   -0.010673   0.027461    0.060254    0.051402    0.097445    NaN NaN ... -0.007258   -0.012497   -0.012497   -0.008241   -0.016820   -0.016820   -0.006528   -0.012387   -0.007373   -0.016670
parentsAtLeast1N    -0.023797   -0.019580   -0.015532   -0.010673   0.027461    0.060254    0.051402    0.097445    NaN NaN ... -0.007258   -0.012497   -0.012497   -0.008241   -0.016820   -0.016820   -0.006528   -0.012387   -0.007373   -0.016670
InterfacesAllT  NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ImplementationsAllT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
childrenAllT    0.037713    0.045836    0.057783    0.084090    -0.032681   -0.014895   -0.017460   -0.009210   NaN NaN ... -0.002740   -0.004719   -0.004719   -0.003112   -0.006351   -0.006351   -0.002465   -0.004677   -0.002784   -0.006294
parentsAllT 0.037713    0.045836    0.057783    0.084090    -0.032681   -0.014895   -0.017460   -0.009210   NaN NaN ... -0.002740   -0.004719   -0.004719   -0.003112   -0.006351   -0.006351   -0.002465   -0.004677   -0.002784   -0.006294
InterfacesAllN  NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ImplementationsAllN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
childrenAllN    -0.023797   -0.019580   -0.015532   -0.010673   0.027461    0.060254    0.051402    0.097445    NaN NaN ... -0.007258   -0.012497   -0.012497   -0.008241   -0.016820   -0.016820   -0.006528   -0.012387   -0.007373   -0.016670
ParentsAllN -0.023797   -0.019580   -0.015532   -0.010673   0.027461    0.060254    0.051402    0.097445    NaN NaN ... -0.007258   -0.012497   -0.012497   -0.008241   -0.016820   -0.016820   -0.006528   -0.012387   -0.007373   -0.016670
ParametersatLeast1T 0.063052    0.041103    0.065166    0.078756    -0.008624   -0.034831   0.026512    -0.004726   NaN NaN ... 1.000000    0.397018    0.397018    0.003145    -0.013637   -0.013637   0.899521    0.400748    -0.022512   -0.012910
FieldMethodsAtLeast1T   0.128554    0.100681    0.113121    0.105691    -0.099353   -0.043234   -0.059774   -0.022656   NaN NaN ... 0.397018    1.000000    1.000000    -0.026187   -0.088435   -0.088435   0.404635    0.991187    -0.038763   -0.087645
ReturnTypeAtLeast1T 0.128554    0.100681    0.113121    0.105691    -0.099353   -0.043234   -0.059774   -0.022656   NaN NaN ... 0.397018    1.000000    1.000000    -0.026187   -0.088435   -0.088435   0.404635    0.991187    -0.038763   -0.087645
ParametersAtLeast1N -0.066818   -0.055963   -0.053850   -0.027136   0.095209    0.014988    0.091514    0.080276    NaN NaN ... 0.003145    -0.026187   -0.026187   1.000000    0.302727    0.302727    -0.022634   -0.025671   0.894660    0.305813
FieldMethodsAtLeast1N   -0.132167   -0.109964   -0.109912   -0.075527   0.073561    0.010699    0.106280    0.080003    NaN NaN ... -0.013637   -0.088435   -0.088435   0.302727    1.000000    1.000000    -0.021141   -0.087655   0.319416    0.991067
ReturnTypeN -0.132167   -0.109964   -0.109912   -0.075527   0.073561    0.010699    0.106280    0.080003    NaN NaN ... -0.013637   -0.088435   -0.088435   0.302727    1.000000    1.000000    -0.021141   -0.087655   0.319416    0.991067
ParametersAllT  0.052226    0.013238    0.081990    0.044648    -0.008887   -0.050421   0.048346    -0.015513   NaN NaN ... 0.899521    0.404635    0.404635    -0.022634   -0.021141   -0.021141   1.000000    0.408361    -0.020250   -0.020551
FieldMethodsAllT    0.128035    0.090864    0.110453    0.107448    -0.099207   -0.048834   -0.061887   -0.020702   NaN NaN ... 0.400748    0.991187    0.991187    -0.025671   -0.087655   -0.087655   0.408361    1.000000    -0.038422   -0.086872
ParametersAllN  -0.056377   -0.060733   -0.048177   -0.033105   0.085180    -0.023962   0.127316    0.044586    NaN NaN ... -0.022512   -0.038763   -0.038763   0.894660    0.319416    0.319416    -0.020250   -0.038422   1.000000    0.322522
FieldMethodsAllN    -0.130406   -0.111723   -0.108930   -0.074852   0.070968    0.003937    0.101202    0.083339    NaN NaN ... -0.012910   -0.087645   -0.087645   0.305813    0.991067    0.991067    -0.020551   -0.086872   0.322522    1.000000
34 rows × 34 columns

...