Я хочу вычислить коэффициент корреляции между номинальной и числовой характеристиками, используя следующий код:
def corr_ratio(values, category_labels):
# Computes correlation ratio for a given array of values and category_labels
# Eqn: eta^2 = (sum_x [N_x * (mean(y_x) - mean(y))^2]) / (sum_x [sum_i [(y_xi - mean(y))^2]])
# values: 1D array
# category_labels: 1D array
category_labels = np.array(category_labels)
values = np.array(values)
categories = np.unique(category_labels)
data_dict = {}
for catname in categories:
indices = np.where(category_labels == catname)
data_dict[catname] = values[indices]
cat_means = {catname: np.mean(data_dict[catname]) for catname in categories}
overall_mean = np.mean(values)
nominator = np.sum([len(data_dict[catname]) * (cat_means[catname] - overall_mean) ** 2 for catname in categories])
denominator = np.sum([(val - overall_mean) ** 2 for catname in categories for val in data_dict[catname]])
corr_ratio = nominator / denominator
return corr_ratio
data = {
'Month': ['August', 'August', 'August', 'August', 'August', 'August', 'August', 'August', 'August', 'August',
'August', 'August',
'February', 'February', 'February', 'February', 'February', 'February', 'February', 'February',
'February', 'February', 'February', 'February'],
'Day': ['Sunday', 'Monday', 'Tuesday', 'Sunday', 'Monday', 'Tuesday', 'Sunday', 'Monday', 'Tuesday', 'Sunday',
'Monday', 'Tuesday',
'Sunday', 'Monday', 'Tuesday', 'Sunday', 'Monday', 'Tuesday', 'Sunday', 'Monday', 'Tuesday', 'Sunday',
'Monday', 'Tuesday', ],
'Temperature': [34, 32, 33, 36, 37, 35, 29, 32, 33, 32, 36, 30,
19, 22, 21, 17, 15, 14, 19, 20, 22, 20, 19, 18],
'WorkingHours': [0, 9.5, 8.5, 0, 9, 8.5, 0, 10, 9.5, 0, 8, 8.5,
0, 8.5, 9, 0, 9, 9, 0, 10, 8, 0, 8.5, 9.5]}
df = pd.DataFrame(data)
print(corr_ratio(df['Temperature'], df['Day']))
Однако возникает следующая ошибка. Буду признателен, если вы дадите мне знать, как решить эту проблему.
print(corr_ratio(df['Temperature'], df['Day']))
File "D:/mifs-master_2/MU/learning-from-imbalanced-classes-master/learning-from-imbalanced-classes-master/continuous/Final Logit/logit-final.py", line 383, in corr_ratio
cat_means = {catname: np.mean(data_dict[catname]) for catname in categories}
File "D:/mifs-master_2/MU/learning-from-imbalanced-classes-master/learning-from-imbalanced-classes-master/continuous/Final Logit/logit-final.py", line 383, in <dictcomp>
cat_means = {catname: np.mean(data_dict[catname]) for catname in categories}
KeyError: 'Monday'
Заранее спасибо.