Как получить распределение распределения всех столбцов одновременно? например, распределительный фитинг col_1, col_2, col_3
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import scipy
from sklearn.preprocessing import StandardScaler
import scipy.stats
import matplotlib.pyplot as plt
# Load data and select first column
from sklearn import datasets
data_set = datasets.load_breast_cancer()
# Multiple coloumns of csv
col_1=data_set.data[:,0]
col_2=data_set.data[:,1]
col_3=data_set.data[:,2]
# Create an index array (x) for data
x = np.arange(len(col_1))
size = len(col_1)
plt.hist(col_1)
plt.show()
sc=StandardScaler()
yy = col_1.reshape (-1,1)
sc.fit(yy)
y_std =sc.transform(yy)
y_std = y_std.flatten()
y_std
del yy
dist_names = ['beta',
'expon',
'gamma',
'lognorm',
'norm',
'pearson3',
'triang',
'uniform',
'weibull_min',
'weibull_max']
# Set up empty lists to stroe results
chi_square = []
p_values = []
# Set up 50 bins for chi-square test
# Observed data will be approximately evenly distrubuted aross all bins
percentile_bins = np.linspace(0,100,51)
percentile_cutoffs = np.percentile(y_std, percentile_bins)
observed_frequency, bins = (np.histogram(y_std, bins=percentile_cutoffs))
cum_observed_frequency = np.cumsum(observed_frequency)
# Loop through candidate distributions
for distribution in dist_names:
# Set up distribution and get fitted distribution parameters
dist = getattr(scipy.stats, distribution)
param = dist.fit(y_std)
# Obtain the KS test P statistic, round it to 5 decimal places
p = scipy.stats.kstest(y_std, distribution, args=param)[1]
p = np.around(p, 5)
p_values.append(p)
# Get expected counts in percentile bins
# This is based on a 'cumulative distrubution function' (cdf)
cdf_fitted = dist.cdf(percentile_cutoffs, *param[:-2], loc=param[-2],
scale=param[-1])
expected_frequency = []
for bin in range(len(percentile_bins)-1):
expected_cdf_area = cdf_fitted[bin+1] - cdf_fitted[bin]
expected_frequency.append(expected_cdf_area)
# calculate chi-squared
expected_frequency = np.array(expected_frequency) * size
cum_expected_frequency = np.cumsum(expected_frequency)
ss = sum (((cum_expected_frequency - cum_observed_frequency) ** 2) / cum_observed_frequency)
chi_square.append(ss)
# Collate results and sort by goodness of fit (best at top)
results = pd.DataFrame()
results['Distribution'] = dist_names
results['chi_square'] = chi_square
results['p_value'] = p_values
results.sort_values(['chi_square'], inplace=True)
# Report results
print ('\nDistributions sorted by goodness of fit:')
print ('----------------------------------------')
print (results)