Как замаскировать некоторые ячейки графика тепловой карты? - PullRequest
0 голосов
/ 02 сентября 2018

Я построил следующую тепловую карту: [изображение]

используя этот код:

# Sample data set: 24 observations, 12 per month, cycling through three weekdays.
# Each row of the numeric lists below is one Sunday/Monday/Tuesday triple.
data = {
    'Month': ['August'] * 12 + ['February'] * 12,
    'Day': ['Sunday', 'Monday', 'Tuesday'] * 8,
    'Temperature': [
        34, 32, 33,
        36, 37, 35,
        29, 32, 33,
        32, 36, 30,
        19, 22, 21,
        17, 15, 14,
        19, 20, 22,
        20, 19, 18,
    ],
    'WorkingHours': [
        0, 9.5, 8.5,
        0, 9, 8.5,
        0, 10, 9.5,
        0, 8, 8.5,
        0, 8.5, 9,
        0, 9, 9,
        0, 10, 8,
        0, 8.5, 9.5,
    ],
}

# One DataFrame column per key of `data`.
df = pd.DataFrame(data)

def associations(dataset, nominal_columns=None, mark_columns=False, theil_u=False, plot=True,
                 return_results=False, **kwargs):
    """
    Calculate the correlation/strength-of-association of features in a data-set with both
    categorical (nominal) and continuous features using:
     - Pearson's R for continuous-continuous cases
     - Correlation Ratio for categorical-continuous cases
     - Cramer's V or Theil's U for categorical-categorical cases

    :param dataset: NumPy ndarray / Pandas DataFrame
        The data-set for which the features' correlation is computed. It is passed through
        ``convert(dataset, 'dataframe')`` before use.
    :param nominal_columns: string / list / NumPy ndarray
        Names of columns of the data-set which hold categorical values. Can also be the string
        'all' to state that all columns are categorical, or None (default) to state none are
        categorical
    :param mark_columns: Boolean (default: False)
        If True, output's columns' names will have a suffix of '(nom)' or '(con)' based on their
        type (nominal or continuous), as provided by nominal_columns
    :param theil_u: Boolean (default: False)
        In the case of categorical-categorical features, use Theil's U instead of Cramer's V.
        Theil's U is evaluated once per argument order, and each result is stored in its own
        mirrored cell.
    :param plot: Boolean (default: True)
        If True, plot a heat-map of the correlation matrix
    :param return_results: Boolean (default: False)
        If True, the function will return a Pandas DataFrame of the computed associations
    :param kwargs:
        Optional plotting settings read by this function: 'figsize' (figure size, default None),
        'annot' (heat-map annotations, default True) and 'fmt' (annotation format, default '.2f')
    :return: Pandas DataFrame or None
        A DataFrame of the correlation/strength-of-association between all features when
        return_results is True; otherwise nothing is returned
    """
    dataset = convert(dataset, 'dataframe')
    columns = dataset.columns
    if nominal_columns is None:
        nominal_columns = list()
    elif isinstance(nominal_columns, str) and nominal_columns == 'all':
        # Only compare against 'all' for real strings: comparing a NumPy array to a string
        # is element-wise and cannot be used as a plain truth value.
        nominal_columns = columns
    corr = pd.DataFrame(index=columns, columns=columns)
    for i in range(len(columns)):
        col_i = columns[i]
        for j in range(i, len(columns)):
            col_j = columns[j]
            # Cells are written with a single .loc[row, column] call. Chained indexing
            # (corr[column][row] = value) may assign into a temporary copy and is a no-op
            # under pandas Copy-on-Write.
            if i == j:
                corr.loc[col_j, col_i] = 1.0
                continue
            i_is_nominal = col_i in nominal_columns
            j_is_nominal = col_j in nominal_columns
            if i_is_nominal and j_is_nominal:
                if theil_u:
                    # Each argument order gets its own cell, so the matrix need not be symmetric.
                    corr.loc[col_i, col_j] = theils_u(dataset[col_i], dataset[col_j])
                    corr.loc[col_j, col_i] = theils_u(dataset[col_j], dataset[col_i])
                    continue
                cell = cramers_v(dataset[col_i], dataset[col_j])
            elif i_is_nominal:
                # The nominal column is always passed first to correlation_ratio.
                cell = correlation_ratio(dataset[col_i], dataset[col_j])
            elif j_is_nominal:
                cell = correlation_ratio(dataset[col_j], dataset[col_i])
            else:
                cell, _ = ss.pearsonr(dataset[col_i], dataset[col_j])
            # Symmetric measures fill both mirrored cells with the same value.
            corr.loc[col_i, col_j] = cell
            corr.loc[col_j, col_i] = cell
    corr.fillna(value=np.nan, inplace=True)
    if mark_columns:
        marked_columns = ['{} (nom)'.format(col) if col in nominal_columns else '{} (con)'.format(col)
                          for col in columns]
        corr.columns = marked_columns
        corr.index = marked_columns
    if plot:
        plt.figure(figsize=kwargs.get('figsize', None))
        sns.heatmap(corr, annot=kwargs.get('annot', True), fmt=kwargs.get('fmt', '.2f'))
        plt.show()
    if return_results:
        return corr

# Run the association analysis on df, declaring Month and Day as the nominal (categorical) columns.
nominal.associations(df, nominal_columns=['Month','Day'])

но мне нужно, чтобы она выглядела так: [изображение]

Фактически месяц и день являются номинальными характеристиками, а рабочее время и температура - числовыми. Корреляция между числовыми и номинальными характеристиками вычисляется с использованием Eta, поэтому я хочу построить ее отдельно.

Заранее спасибо.

1 Ответ

0 голосов
/ 02 сентября 2018

Мне кажется, нужно отфильтровать DataFrame по спискам имён строк и столбцов:

Итак, измените:

# Current call: the heat-map is drawn from the whole `corr` frame.
sns.heatmap(corr, annot=kwargs.get('annot',True), fmt=kwargs.get('fmt','.2f'))

на:

# Row labels to keep (the numeric features) and column labels to keep (the nominal features).
c1 = ['WorkingHours','Temperature']
c2 = ['Day','Month']
# Plot only the c1-by-c2 sub-matrix of corr instead of the full frame.
sns.heatmap(corr.loc[c1, c2], annot=kwargs.get('annot',True), fmt=kwargs.get('fmt','.2f'))
...