Я использую этот код, чтобы исследовать корреляцию между моим целевым («золотым») столбцом и остальными столбцами в моих данных, и не понимаю, почему в моей корреляционной матрице есть пустые ячейки, как показано ниже:
# Run this program on your local python
# interpreter, provided you have installed
# the required libraries.
# Importing the required packages
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Function importing Dataset
# Module-level default. The functions in this file that use a name
# `column_count` either take it as a parameter or assign it locally,
# so none of them reads this global.
column_count =0
def importdata(path='data1.txt'):
    """Load the header-less, comma-separated dataset and print a summary.

    Args:
        path: File path (or any file-like object accepted by
            ``pandas.read_csv``) to load. Defaults to ``'data1.txt'``,
            the file this script has always read.

    Returns:
        A ``(balance_data, column_count)`` tuple: the loaded DataFrame
        and its number of columns.
    """
    balance_data = pd.read_csv(path, sep=',', header=None)
    # Only the column count is needed here; the row count is reported
    # through len() below, so the shape is not unpacked into two names.
    column_count = balance_data.shape[1]
    # Printing the dataset shape
    print("Dataset Length: ", len(balance_data))
    print("Dataset Shape: ", balance_data.shape)
    print("Number of columns ", column_count)
    # Printing the first dataset observations
    print("Dataset: ", balance_data.head())
    return balance_data, column_count
def columns(balance_data):
    """Return the number of columns of ``balance_data``.

    Args:
        balance_data: A two-dimensional table exposing a ``shape``
            attribute (e.g. a pandas DataFrame).

    Returns:
        The second entry of ``balance_data.shape``.
    """
    # Index the shape directly instead of unpacking a row count that is
    # never used.
    return balance_data.shape[1]
#Univariate selection
def selection(column_count, k=5, path="data1extended.txt"):
    """Score features with a univariate chi-squared test and print the top ``k``.

    Args:
        column_count: Exclusive upper bound of the positional column slice
            used as features: columns ``1`` up to ``column_count - 1`` of
            the loaded file become ``X``.
            NOTE(review): ``main`` passes the column count of a different
            file (``data1.txt``); confirm both files have the same width.
        k: Number of features requested from ``SelectKBest`` and number of
            top-scoring rows printed. Defaults to 5, the value previously
            hard-coded in both places.
        path: CSV file (with a header row) to load. Defaults to
            ``"data1extended.txt"``.

    Returns:
        A ``(X, y, data)`` tuple: the feature columns, the target column
        (column 0) and the full loaded DataFrame.
    """
    data = pd.read_csv(path)
    X = data.iloc[:, 1:column_count]  # independent columns
    y = data.iloc[:, 0]  # target column
    # Apply SelectKBest to score every feature and keep the k best.
    # NOTE(review): chi2 presumably requires non-negative feature values;
    # verify against the data.
    bestfeatures = SelectKBest(score_func=chi2, k=k)
    fit = bestfeatures.fit(X, y)
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(X.columns)
    # Put feature names and scores side by side for readable output.
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
    print(featureScores.nlargest(k, 'Score'))  # print the k best features
    return X, y, data
#Feature importance
def feature(X,y):
    """Fit an ExtraTreesClassifier on ``X``/``y`` and chart feature importances.

    Prints the full importance vector, then draws a horizontal bar chart
    of the five largest importances labelled by the column names of ``X``.
    """
    forest = ExtraTreesClassifier()
    forest.fit(X, y)
    # Tree-based classifiers expose their importances after fitting.
    print(forest.feature_importances_)
    # Label each importance with its column name so the chart is readable.
    ranked = pd.Series(forest.feature_importances_, index=X.columns)
    top_five = ranked.nlargest(5)
    top_five.plot(kind='barh')
    plt.show()
#Correlation Matrix
def correlation(data, column_count):
    """Draw an annotated heat map of the pairwise correlations in ``data``.

    Args:
        data: DataFrame whose columns are correlated with each other via
            ``data.corr()``.
        column_count: Used for both the width and the height of the
            figure passed to ``plt.figure(figsize=...)``.

    Returns:
        None. The heat map is displayed with ``plt.show()``.
    """
    # Compute the correlation matrix once and reuse it for the plot.
    corrmat = data.corr()
    plt.figure(figsize=(column_count, column_count))
    # NOTE(review): seaborn leaves cells with missing values blank, so an
    # empty cell means the correlation for that pair is NaN — presumably
    # because one of the columns has no variance (constant values);
    # verify against the data.
    sns.heatmap(corrmat, annot=True, cmap="RdYlGn")
    # Show the figure explicitly, matching feature(); otherwise nothing is
    # displayed when the file is run as a plain script.
    plt.show()
# Driver code
def main():
    """Run the pipeline: load data, score features, plot importances and correlations."""
    # Building Phase
    loaded = importdata()
    column_count = loaded[1]
    features, target, extended = selection(column_count)
    feature(features, target)
    correlation(extended, column_count)
# Calling main function
# Run the pipeline only when the file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
Ссылки на мои файлы с входными данными:
data1.txt: https://drive.google.com/open?id=1qHAkFBboCKoVH4pKIPHoVmeyb3uVCUAk
data1Extended.txt : https://drive.google.com/open?id=1C8GZ2wiqFUW3WuYxyc0G3axgkM1Uwsb6