I want to forecast some data with ARIMA. Everything runs when I try to forecast the test data, but the ARIMA fit is not even close: it only produces straight lines or something like quadratic curves. I have tried many different values for p, d and q, but nothing helped. Now I am simply wondering why the code does not work. Does anyone have an idea where it might be going wrong?
The CSV file is here: https://www.dropbox.com/s/3s0touu0bo3hw2o/ivwb2051.csv?dl=0
A picture of the forecast is here: https://www.dropbox.com/s/0v6wdtselrspqd8/Capture.PNG?dl=0
import pandas as pd
from datetime import datetime  # pandas.datetime is deprecated; use the stdlib datetime instead
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
from datetime import timedelta as delta
import numpy as np
import statsmodels.api as sm
import warnings
from statsmodels.tsa.arima_model import ARIMA
import itertools
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.metrics import mean_squared_error
# https://www.kaggle.com/poiupoiu/how-to-use-sarimax
# using scipy version 1.2.0, because of incompatibilities with statsmodels 0.9.0
register_matplotlib_converters()
def parser(x):
    return datetime.strptime(x, '%Y%m%d %H:%M:%S')
'''------------------------------------------------------------------------------------------------------------ '''
'''-------------------------------------------ALL COLUMNS------------------------------------------------------ '''
'''------------------------------------------------------------------------------------------------------------ '''
my_dataframe = ['Unnamed: 0', 'server', 'MemoryUsedPercent', 'TotalVisibleMemorySize', 'FreePhysicalMemory',
'\\logicaldisk(c:)\\% free space', '\\logicaldisk(c:)\\free megabytes',
'\\logicaldisk(d:)\\% free space', '\\logicaldisk(d:)\\free megabytes',
'\\logicaldisk(e:)\\% free space', '\\logicaldisk(e:)\\free megabytes',
'\\logicaldisk(f:)\\% free space', '\\logicaldisk(f:)\\free megabytes',
'\\processor(_total)\\% processor time']
'''------------------------------------------------------------------------------------------------------------ '''
'''---------------------------------GET DATA FROM CSV AND PARSE------------------------------------------------ '''
'''------------------------------------------------------------------------------------------------------------ '''
data = pd.read_csv('ivwb2051.csv', index_col=1, parse_dates=[1], date_parser=parser)
data.index = data.index.map(lambda x: x.replace(second=0))
arimaColumn = 2
data = data.drop(my_dataframe[0:arimaColumn], axis=1)
data = data.drop(my_dataframe[(arimaColumn + 1):], axis=1)
column = data[my_dataframe[arimaColumn]]
column_str = my_dataframe[arimaColumn]
'''------------------------------------------------------------------------------------------------------------ '''
'''---------------------------------FULFILL NON FREQUENT ELEMENTS---------------------------------------------- '''
'''------------------------------------------------------------------------------------------------------------ '''
update_series = delta(minutes=10)
for n in range(0, len(data)):
    if np.isnan(data[my_dataframe[arimaColumn]][n]) or data[my_dataframe[arimaColumn]][n] == 0:
        data[my_dataframe[arimaColumn]][n] = data[my_dataframe[arimaColumn]][n - 1]
def insert_row(row_number, df, date, row_value):
    df1 = df[0:row_number]
    df2 = df[row_number:]
    new = pd.DataFrame(
        {'date': [date], column_str: [row_value]})
    new.set_index('date', inplace=True)
    out = df1.append(new).append(df2)
    return out
def delete_row(row_number, df):
    df1 = df[0:row_number]
    df2 = df[row_number + 1:]
    out = df1.append(df2)
    return out
def change_date(row_number, df, newdate):
    row_value = data[my_dataframe[arimaColumn]][row_number]
    out = delete_row(row_number, df)
    out = insert_row(row_number, out, newdate, row_value)
    return out
# Fill in timestamps that were never sent, so that the series has a regular frequency
for n in range(1, len(data)):
    # If data are missing at a spacing of x * 10 min, insert new row(s) with the previous value
    if (((data.index[n] - data.index[n - 1]) != update_series)
            and ((data.index[n] - data.index[n - 1]) % update_series == delta(minutes=0))):
        while (data.index[n] - data.index[n - 1]) >= 2 * update_series:
            data = insert_row(n, data, data.index[n] - update_series, data[my_dataframe[arimaColumn]][n - 1])
    # If data are missing at some other spacing, round the timestamp down and delete the row if it
    # then equals n-1; otherwise fill the rest up to n-1 at the correct interval.
    elif (data.index[n] - data.index[n - 1]) % update_series != delta(minutes=0):
        if data.index[n] - ((data.index[n] - data.index[n - 1]) % update_series) != data.index[n - 1]:
            data = change_date(n, data, data.index[n] - ((data.index[n] - data.index[n - 1]) % update_series))
        while (data.index[n] - data.index[n - 1]) >= 2 * update_series:
            data = insert_row(n, data, data.index[n] - update_series, data[my_dataframe[arimaColumn]][n - 1])
print('Your series now has a static frequency!\n')
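# --- Side note (not used above): a minimal alternative sketch -----------------
# Assuming the index is a proper DatetimeIndex with a 10-minute cadence, the
# gap-filling loop above could probably be replaced by pandas' own resampling,
# forward-filling the previous value into missing 10-minute slots:
#   data = data.resample('10T').ffill()
# This is only an untested assumption, kept here for comparison.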
'''-------------------------------------------------------------------------------------------------------------'''
'''------------------------------------SEASONAL DECOMPOSE ------------------------------------------------------'''
'''-------------------------------------------------------------------------------------------------------------'''
# frequency = 4320  # monthly
frequency = 6 * 24  # daily: 6 samples per hour * 24 hours of 10-minute data
res = sm.tsa.seasonal_decompose(data.dropna(), freq=frequency)
fig = res.plot()
fig.set_figheight(8)
fig.set_figwidth(15)
plt.show()
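# The DecomposeResult returned by seasonal_decompose also exposes the single
# components, which can be inspected directly (a small illustrative sketch):
trend_component = res.trend
seasonal_component = res.seasonal
residual_component = res.resid
print(trend_component.dropna().head())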
'''-------------------------------------------------------------------------------------------------------------'''
'''-----------------------------------------------TRAIN/ TEST---------------------------------------------------'''
'''-------------------------------------------------------------------------------------------------------------'''
# From 3645 / 31.03
Y_length = data.shape[0] - 3502
testingPercentage = 0.8  # note: this fraction is actually used for the training part of the split
boundForTest = int(Y_length * testingPercentage) + 3502
tr_start, tr_end = data.index[3502], data.index[boundForTest]
te_start, te_end = data.index[boundForTest+1], data.index[data.shape[0]-1]
train = data[column_str][tr_start:tr_end].dropna()
test = data[column_str][te_start:te_end].dropna()
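# For reference, a roughly equivalent positional (iloc-based) split, assuming the
# timestamps in the index are unique; boundary handling may differ by one sample
# (a sketch only, not used further below):
#   series = data[column_str].iloc[3502:].dropna()
#   split = int(len(series) * testingPercentage)
#   train_alt, test_alt = series.iloc[:split], series.iloc[split:]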
'''------------------------------------------------------------------------------------------------------------ '''
'''-----------------------------------------TESTING STATIONARITY----------------------------------------------- '''
'''------------------------------------------------------------------------------------------------------------ '''
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    # Determining rolling statistics
    rolmean = timeseries.rolling(frequency + 1).mean()
    rolstd = timeseries.rolling(frequency + 1).std()
    # Plot rolling statistics:
    fig = plt.figure(figsize=(12, 8))
    orig = plt.plot(timeseries, color='blue', label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()
    # Perform Dickey-Fuller test:
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
'''-------------------------------------------------------------------------------------------------------------'''
'''---------------------------------------------ADF-TESTING-----------------------------------------------------'''
'''-------------------------------------------------------------------------------------------------------------'''
test_stationarity(data[my_dataframe[arimaColumn]])
'''
data['first_difference'] = data[my_dataframe[arimaColumn]] - data[my_dataframe[arimaColumn]].shift(1)
test_stationarity(data.first_difference.dropna(inplace=False))
data['seasonal_difference'] = data[my_dataframe[arimaColumn]] - data[my_dataframe[arimaColumn]].shift(frequency)
test_stationarity(data.seasonal_difference.dropna(inplace=False))
data['seasonal_first_difference'] = data.first_difference - data.first_difference.shift(frequency)
test_stationarity(data.seasonal_first_difference.dropna(inplace=False))
'''
fig, ax = plt.subplots(2, 1, figsize=(20, 10))
fig = sm.graphics.tsa.plot_acf(train.diff().dropna(), lags=50, ax=ax[0])
fig = sm.graphics.tsa.plot_pacf(train.diff().dropna(), lags=50, ax=ax[1])
plt.show()
'''-------------------------------------------------------------------------------------------------------------'''
'''---------------------------------------------ARIMA-----------------------------------------------------------'''
'''-------------------------------------------------------------------------------------------------------------'''
warnings.filterwarnings('ignore')
resDiff = sm.tsa.arma_order_select_ic(train, max_ar=2, max_ma=2, ic='aic', trend='c')
print('ARMA(p,q) =', resDiff['aic_min_order'], 'is the best.')
arima = sm.tsa.statespace.SARIMAX(train, order=(4, 2, 3), freq=pd.DateOffset(minutes=10), seasonal_order=(0, 0, 0, 0),
enforce_stationarity=False, enforce_invertibility=False, ).fit()
print(arima.summary())
from sklearn.metrics import mean_squared_error
pred = arima.predict(tr_end, te_end)[1:]
print('ARIMA model MSE:{}'.format(mean_squared_error(test, pred)))
print(pred)
pd.DataFrame({'test': test, 'pred': pred}).plot()
plt.show()
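# For comparison, the fitted SARIMAXResults object also offers get_forecast(),
# which returns a point forecast together with confidence intervals; a minimal
# sketch, assuming the same fitted `arima` object and test set as above:
fc = arima.get_forecast(steps=len(test))
pred_mean = fc.predicted_mean   # point forecast
conf_int = fc.conf_int()        # lower/upper confidence bounds
print(pred_mean.head())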
The output plot / forecast is not even close / completely wrong.