Подгонка распределения для нескольких столбцов (multiple columns) - PullRequest
0 голосов
/ 09 апреля 2020

Я пытаюсь подобрать распределение своих данных, используя scipy.stats. Данные содержат несколько столбцов col_1, col_2, col_3 в одном файле CSV.

  • Проблема заключается в том, что для подгонки распределения используется только один столбец, чтобы определить наилучшую подгонку распределения, как я показал в приведенном ниже коде.
  • Как выполнить подгонку распределения для всех столбцов одновременно? Например, подгонка распределений для col_1, col_2, col_3.

    """Fit candidate scipy.stats distributions to one or more data columns.

    The fitting pipeline is extracted into a reusable function so it can be
    applied to any number of columns (this answers the original question:
    loop over the columns and call the function on each).
    """
    import warnings
    warnings.filterwarnings("ignore")

    import numpy as np
    import pandas as pd
    import scipy.stats


    # Candidate distributions to try (all names resolvable in scipy.stats).
    DIST_NAMES = ['beta',
                  'expon',
                  'gamma',
                  'lognorm',
                  'norm',
                  'pearson3',
                  'triang',
                  'uniform',
                  'weibull_min',
                  'weibull_max']


    def fit_distribution(y, dist_names=DIST_NAMES, n_bins=50):
        """Fit each candidate distribution to ``y`` and rank by goodness of fit.

        Parameters
        ----------
        y : array-like of float
            One column of raw data.  It is standardized internally (zero
            mean, unit population variance) — the same result sklearn's
            ``StandardScaler`` produced in the original script.
        dist_names : sequence of str
            Names of ``scipy.stats`` distributions to try.
        n_bins : int
            Number of equal-probability bins for the chi-square statistic
            (the original hard-coded 50).

        Returns
        -------
        pandas.DataFrame
            Columns ``Distribution`` / ``chi_square`` / ``p_value``, sorted
            so the best fit (smallest chi-square) is first.
        """
        y = np.asarray(y, dtype=float).ravel()
        size = len(y)

        # Standardize; np.std defaults to population std (ddof=0), matching
        # sklearn's StandardScaler.
        y_std = (y - y.mean()) / y.std()

        # Equal-probability bins: observed data will be approximately evenly
        # distributed across all bins.
        percentile_bins = np.linspace(0, 100, n_bins + 1)
        percentile_cutoffs = np.percentile(y_std, percentile_bins)
        observed_frequency, _ = np.histogram(y_std, bins=percentile_cutoffs)
        cum_observed_frequency = np.cumsum(observed_frequency)

        # Set up empty lists to store results.
        chi_square = []
        p_values = []

        # Loop through candidate distributions.
        for distribution in dist_names:
            # Set up distribution and get fitted distribution parameters.
            dist = getattr(scipy.stats, distribution)
            param = dist.fit(y_std)

            # Kolmogorov-Smirnov test p-value, rounded to 5 decimal places.
            p = scipy.stats.kstest(y_std, distribution, args=param)[1]
            p_values.append(np.around(p, 5))

            # Expected counts per bin from the fitted CDF; np.diff replaces
            # the original per-bin append loop.
            cdf_fitted = dist.cdf(percentile_cutoffs, *param[:-2],
                                  loc=param[-2], scale=param[-1])
            expected_frequency = np.diff(cdf_fitted) * size

            # Chi-square-like statistic on the *cumulative* counts (same
            # formula as the original script).
            cum_expected_frequency = np.cumsum(expected_frequency)
            ss = sum(((cum_expected_frequency - cum_observed_frequency) ** 2)
                     / cum_observed_frequency)
            chi_square.append(ss)

        # Collate results and sort by goodness of fit (best at top).
        results = pd.DataFrame({'Distribution': list(dist_names),
                                'chi_square': chi_square,
                                'p_value': p_values})
        results.sort_values(['chi_square'], inplace=True)
        return results


    if __name__ == '__main__':
        # Heavy / side-effect imports stay out of library use of the
        # function above.
        import matplotlib.pyplot as plt
        from sklearn import datasets

        data_set = datasets.load_breast_cancer()

        # Fit every requested column in one loop — this is how to get the
        # distribution fitting for col_1, col_2, col_3 at once.
        for idx in (0, 1, 2):
            col = data_set.data[:, idx]

            plt.hist(col)
            plt.show()

            results = fit_distribution(col)

            # Report results.
            print('\nDistributions sorted by goodness of fit:')
            print('----------------------------------------')
            print(results)
    
...