Question

Я попытался построить Probability Density Function (PDF) plot своих данных после нахождения лучших параметров, но график показывает плоскую линию вместо кривой.

Это вопрос масштабирования?
Или это связано с тем, непрерывные это данные или дискретные (continuous vs. discrete data)? Файл данных доступен здесь.
Цель — подобрать распределения, которые наилучшим образом описывают данные, и затем построить график их функции плотности вероятности (PDF).
Значения в моих данных малы, например: 0.21, 1.117 и т. д. Статистика данных и графики PDF показаны ниже:

Мой скрипт приведён ниже:

# Timing helpers: `time` is used for per-distribution timings further down,
# and `start_time` captures the wall-clock time at this point of the script.
from time import time
from datetime import datetime
start_time = datetime.now()
import pandas as pd
# Make pandas print floating-point values with four decimal places.
pd.options.display.float_format = '{:.4f}'.format
import numpy as np
import pickle
import scipy
import scipy.stats
import matplotlib.pyplot as plt

# Load only the 'R' column and collapse the resulting one-column frame into a
# Series. The ``squeeze=True`` keyword of ``read_csv`` was deprecated in
# pandas 1.4 and removed in pandas 2.0 (it now raises TypeError); calling
# ``.squeeze("columns")`` on the result is the documented replacement and
# behaves the same on older pandas versions.
data = pd.read_csv("line_RXC_data.csv", usecols=['R'],
                   parse_dates=True).squeeze("columns")
df = data

# Alias used by the goodness-of-fit section below.
y_std = df
# del yy

import warnings
# Suppress every warning for the rest of the run.
# NOTE(review): this also hides numerical warnings raised while fitting or
# dividing arrays; consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")


# Work on the data column under the name ``y``. ``size`` is the number of
# observations and ``x`` is simply the row position (0 .. size-1) of each one.
y = df
size = len(y)
x = np.arange(size)

# Quick visual check of the raw data
plt.hist(y)
plt.title("Histogram of resistance ")
plt.xlabel("Resistance data visualization ")
plt.ylabel("Frequency")
plt.show()

# Summary statistics of the column, printed as a one-column table
y_df = pd.DataFrame(y)
tt = y_df.describe()
print(tt)

# Names of the candidate distributions; each one is looked up as an
# attribute of scipy.stats when the fits are run.
dist_names = [
    'foldcauchy',
    'beta',
    'expon',
    'exponnorm',
    'norm',
    'lognorm',
    'dweibull',
    'pareto',
    'gamma',
]


# Working aliases for the single data column. ``x`` (row positions) and
# ``size`` are kept because other sections of the script reference them.
x = np.arange(len(df))
size = len(df)
y_std = df
y = df
chi_square = []
p_values = []

# Set up (at most) 50 percentile bins for the chi-square style statistic.
# Observed data will be approximately evenly distributed across the bins.
# np.unique drops repeated cut-offs: when many observations share the same
# value, several percentiles coincide and would create zero-width bins with
# an observed count of 0, which turns the statistic below into 0/0 = NaN.
# After de-duplication the first bin always contains the minimum value, so
# every cumulative observed count is at least 1.
percentile_bins = np.linspace(0, 100, 51)
percentile_cutoffs = np.unique(np.percentile(y_std, percentile_bins))
observed_frequency, bins = np.histogram(y_std, bins=percentile_cutoffs)
cum_observed_frequency = np.cumsum(observed_frequency)

# Loop through candidate distributions

for distribution in dist_names:
    s1 = time()
    # Set up distribution and get fitted distribution parameters.
    # scipy returns them as (*shape_params, loc, scale).
    dist = getattr(scipy.stats, distribution)
    param = dist.fit(y_std)

    # Obtain the KS test p-value, rounded to 5 decimal places
    p = scipy.stats.kstest(y_std, distribution, args=param)[1]
    p = np.around(p, 5)
    p_values.append(p)

    # Expected counts per bin: the difference of the fitted cumulative
    # distribution function (cdf) between consecutive cut-offs, times the
    # sample size. np.diff replaces the manual index loop (which also
    # shadowed the builtin ``bin``).
    cdf_fitted = dist.cdf(percentile_cutoffs, *param[:-2], loc=param[-2],
                          scale=param[-1])
    expected_frequency = np.diff(cdf_fitted) * size

    # Chi-square style statistic on the cumulative counts.
    # NOTE: the denominator is the observed cumulative count (as in the
    # original script), not the expected one used by Pearson's statistic;
    # it is kept unchanged so the ranking criterion stays the same.
    cum_expected_frequency = np.cumsum(expected_frequency)
    ss = np.sum(((cum_expected_frequency - cum_observed_frequency) ** 2)
                / cum_observed_frequency)
    chi_square.append(ss)
    print(f"chi_square {distribution} time: {time() - s1}")

#    print("std of predicted probability : ", np.std(cum_observed_frequency))   

# Collate the per-distribution scores into one table and rank it so that the
# smallest chi_square (best fit) comes first.
results = pd.DataFrame(
    {
        'Distribution': dist_names,
        'chi_square': chi_square,
        'p_value': p_values,
    }
).sort_values(by='chi_square')

# Report results
print('\nDistributions sorted by goodness of fit:')
print('----------------------------------------')
print(results)


#%%

# Divide the observed data into bins for plotting (this can be changed).
# np.linspace returns `number_of_bins` edges, i.e. number_of_bins - 1 bars,
# spanning the minimum up to the 99th percentile of the data.
number_of_bins = 100
bin_cutoffs = np.linspace(np.percentile(y, 0), np.percentile(y, 99),
                          number_of_bins)

# Create the plot
plt.figure(figsize=(7, 4))
h = plt.hist(y, bins=bin_cutoffs, color='0.70')

# Area under the histogram (count * bin width, summed over the bars).
# A PDF integrates to 1, so multiplying it by this area puts the fitted
# curves on the same "counts" scale as the grey bars.
scale_pdf = np.sum(h[0] * np.diff(h[1]))

# Grid of *data values* on which the fitted PDFs are evaluated.
# The PDF must be evaluated over the range of the observations, not over the
# row positions 0..N-1: with observations of order 1, evaluating at integer
# indices puts almost every point far outside the data range, where the
# density is ~0, which is what produced a flat line.
x_plot = np.linspace(bin_cutoffs[0], bin_cutoffs[-1], 500)

# Get the best-ranked distributions from the previous phase
number_distributions_to_plot = 5
dist_names = results['Distribution'].iloc[0:number_distributions_to_plot]


#%%
# Create an empty list to store fitted distribution parameters
parameters = []

# Loop through the distributions to get the fitted line and parameters

for dist_name in dist_names:
    # Set up distribution and store distribution parameters
    # (scipy returns them as (*shape_params, loc, scale)).
    dist = getattr(scipy.stats, dist_name)
    param = dist.fit(y)
    parameters.append(param)

    # Evaluate the fitted PDF on the data-value grid and scale it to the
    # histogram's count scale
    pdf_fitted = dist.pdf(x_plot, *param[:-2], loc=param[-2], scale=param[-1])
    pdf_fitted *= scale_pdf

    # Add the line to the plot, against the data values (not array position)
    plt.plot(x_plot, pdf_fitted, label=dist_name)

# Set the plot x axis to contain 99% of the data
# This can be removed, but sometimes outlier data makes the plot less clear
plt.xlim(0, np.percentile(y, 99))

# NOTE(review): if the column only takes a handful of distinct values, a
# continuous PDF will describe it poorly however it is drawn - confirm that
# treating the data as continuous is appropriate.

# Add legend and display plot

plt.legend()
plt.title(u'Data distribution charateristics) \n' )
plt.xlabel(u'Resistance')
plt.ylabel('Frequency )')
plt.show()

# Store distribution parameters in a dataframe (this could also be saved).
# The 'Distribution' column keeps the index labels of the ranked `results`
# rows; `parameters` is a plain list and is assigned in the same order.
dist_parameters = pd.DataFrame()
dist_parameters['Distribution'] = (
        results['Distribution'].iloc[0:number_distributions_to_plot])
dist_parameters['Distribution parameters'] = parameters

# Print parameter results
print ('\nDistribution parameters:')
print ('------------------------')

for index, row in dist_parameters.iterrows():
    # Access the fields by column label: integer keys such as row[0] on a
    # label-indexed Series are treated as positions only through a fallback
    # that pandas has deprecated (FutureWarning since 2.1).
    print ('\nDistribution:', row['Distribution'])
    print ('Parameters:', row['Distribution parameters'] )

pjs · Answer 1 · 15 февраля 2020

Если вы посмотрите на следующий категориальный частотный анализ, вы увидите, что у вас есть только 15 различных значений, разбросанных по всему диапазону с большими промежутками между ними, а не континуум значений. Половина наблюдений имеет значение 0,211, а еще ~ 36% происходит при значении 1,117, ~ 8% при 0,194 и ~ 4% при 0,001. Я считаю ошибкой считать это непрерывными данными.

Ruslan Rozbeiko · Answer 2 · 14 февраля 2020

Это похоже на проблему масштабирования. Попробуйте избавиться от этих серых полос и посмотреть, работает ли это. После этого вы можете попробовать добавить вторую ось Y, как здесь: https://matplotlib.org/gallery/api/two_scales.html.

График распределения показывает плоскую PDF

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

График распределения показывает плоскую PDF

Пожалуйста, войдите или зарегистрируйтесь чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь что бы добавить комментарий.

Похожие темы