У меня есть сплайновая регрессия, с помощью которой я прогнозирую температуру на следующие 5 дней. Модель работает нормально, но я хочу сериализовать обученную модель (сохранить её на диск), чтобы прогноз выполнялся быстрее. Я попробовал использовать pickle, но код выдаёт ошибку `ValueError: shapes (5,5) and (14,) not aligned: 5 (dim 1) != 14 (dim 0)`, хотя раньше он работал отлично.
Как сделать так, чтобы код с сохранением и загрузкой модели через pickle работал так же, как при обычном обучении сплайновой регрессии без pickle?
Код с pickle ниже:
import datetime as dt
import json
import pickle
import numpy as np
import pandas as pd
import plotly
import statsmodels.api as sm
from patsy import dmatrix
from plotly import graph_objs as go
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from flask_babel import _
from visualization.timeseries import create_columns
import time
import _pickle as cPickle
# register_matplotlib_converters()
# Tell NumPy to ignore floating-point division-by-zero instead of warning.
np.seterr(divide='ignore')
# Wall-clock timestamp taken when this module is first executed.
start = time.time()
def calculate_spline_regression(data, sensor_name, days_predicted):
    """Fit B-spline regressions of degree 1..10 and forecast future days.

    The readings in ``data`` are averaged per day, the daily series is
    extended by ``days_predicted`` placeholder days (filled with the overall
    mean), and an OLS model on a B-spline basis with three interior knots is
    fitted for every degree from 1 to 10.  The degree with the lowest MSE over
    the whole (observed + placeholder) series is selected, its fitted model is
    pickled to ``static/prediction_models/model_<sensor_name>.pkl`` and its
    forecast is plotted.

    Args:
        data: DataFrame of raw readings.  It is passed to ``create_columns``
            first; presumably that helper adds the ``'day'`` and
            ``'readable time'`` columns used below -- TODO confirm.
        sensor_name: Name of the column holding the sensor values.
        days_predicted: Number of future days to forecast; also used as the
            size of the test split.

    Returns:
        ``(figure_json, minimum_mse, best_degree)`` on success, or ``False``
        if ``sm.OLS(...).fit()`` raises ``ValueError`` for any degree.
    """
    # Time this call only, rather than the time elapsed since module import.
    started_at = time.time()

    create_columns(data)  # from timeseries
    data['day'] = pd.to_datetime(data['day'], dayfirst=True)
    data = data.sort_values(by=['readable time'])

    # Daily mean of the sensor column only; averaging the whole group would
    # fail on non-numeric columns with recent pandas versions.
    group_by_df = data.groupby('day')[sensor_name].mean().reset_index()
    group_by_df.columns = ['day', sensor_name]
    group_by_df['day'] = pd.to_datetime(group_by_df['day'])

    # Length of the observed series, before the future days are appended.
    initial_len_df = len(group_by_df)
    rng = pd.date_range(group_by_df['day'].min(),
                        periods=initial_len_df + days_predicted, freq='D')
    df = pd.DataFrame({'day': rng})
    df[sensor_name] = group_by_df[sensor_name]
    # Placeholder target for the future days.  ``.loc`` is used instead of
    # chained indexing so the assignment is guaranteed to reach ``df``.
    df.loc[initial_len_df:, sensor_name] = group_by_df[sensor_name].mean()
    group_by_df = df
    print(group_by_df)

    # The regressor is the proleptic Gregorian ordinal of each day.
    group_by_df['day'] = group_by_df['day'].map(dt.datetime.toordinal)
    X = group_by_df[['day']].values
    y = group_by_df[[sensor_name]].values
    X_train, X_test, y_train, _y_test = train_test_split(
        X, y, test_size=days_predicted, shuffle=False)

    def analyse_forecast(dataframe_name, predicted_list, regression_type, grade):
        """Print MSE and R2 of ``predicted_list`` and return the MSE."""
        actual = dataframe_name[sensor_name]
        mse = mean_squared_error(actual, predicted_list)
        print("\n Grade: ", grade)
        print("MSE " + regression_type + " regression(mean squared error)", mse)
        print("r2 score ", r2_score(actual, predicted_list))
        return mse

    def mse_minumum(regression_type, mse_list_regression, max_grade_regression):
        """Return ``(max_grade, lowest MSE, grade achieving it)``.

        ``mse_list_regression[i]`` is taken to be the MSE of grade ``i + 1``.
        """
        grade_column = regression_type + '_grade'
        mse_df = pd.DataFrame({'mse_values': mse_list_regression})
        mse_df[grade_column] = list(range(1, max_grade_regression + 1))
        # Repeated MSE values become NaN so only their first grade can win.
        mse_df['mse_values'] = mse_df['mse_values'].drop_duplicates()
        best_rows = mse_df[mse_df['mse_values'] == mse_df['mse_values'].min()]
        best_rows = best_rows.reset_index(drop=True)
        print("mse vaaaal", best_rows['mse_values'][0])
        print("spline grade", best_rows[grade_column][0])
        return (max_grade_regression,
                best_rows['mse_values'][0],
                best_rows[grade_column][0])

    # Knot positions.  NOTE: these names (and X_train / X_test / degree) are
    # referenced from inside the patsy formula strings below, which patsy
    # evaluates in this function's scope -- do not rename them.
    train_days = group_by_df['day'][:len(X_train)]
    test_days = group_by_df['day'][len(X_train):]
    percentile_25_train, percentile_50_train, percentile_75_train = np.percentile(
        train_days, [25, 50, 75])
    percentile_25_test, percentile_50_test, percentile_75_test = np.percentile(
        test_days, [25, 50, 75])

    mse_list_spline = []
    predicted_df = pd.DataFrame()
    fitted_models = {}  # degree -> fitted OLS results

    for degree in range(1, 11):
        # Spline basis with 3 interior knots for the training days.
        transformed_x1 = dmatrix(
            "bs(X_train, knots=(percentile_25_train,percentile_50_train,percentile_75_train), degree=degree,"
            " include_intercept=False)",
            {"X_train": X_train}, return_type='dataframe')
        try:
            fit_spline = sm.OLS(y_train, transformed_x1).fit()
        except ValueError:
            print("value error")
            return False
        fitted_models[degree] = fit_spline

        # NOTE(review): the test basis is built with knots taken from the
        # test days, not the training knots; kept as in the original.
        transformed_x_test = dmatrix(
            "bs(X_test, knots=(percentile_25_test,percentile_50_test,percentile_75_test),degree=degree, "
            "include_intercept=False)",
            {"X_test": X_test}, return_type='dataframe')

        pred_spline_train = fit_spline.predict(transformed_x1).tolist()
        pred_spline_test = fit_spline.predict(transformed_x_test).tolist()

        # All predicted values: training part followed by the forecast part.
        predicted_val = pred_spline_train + pred_spline_test
        predicted_df[degree] = predicted_val
        mse_list_spline.append(
            analyse_forecast(group_by_df, predicted_val, "spline", degree))

    _max_degree, minimum_mse_val, spline_grade_min_mse = mse_minumum(
        "spline", mse_list_spline, len(mse_list_spline))

    # Persist only the model of the selected degree.  Dumping inside the loop
    # left the last (degree 10) fit on disk, whose parameter vector does not
    # match a design matrix built for any other degree -- the source of
    # "shapes (5,5) and (14,) not aligned" after loading.  A loader must build
    # its design matrix with the same degree; with 3 knots plus the intercept
    # that should be ``len(model.params) - 4``.
    filename = 'static/prediction_models/model_' + str(sensor_name) + '.pkl'
    with open(filename, 'wb') as model_file:
        pickle.dump(fitted_models[int(spline_grade_min_mse)], model_file)

    spline_regression_fig = go.Figure()
    spline_regression_fig.add_trace(go.Scatter(
        x=group_by_df['day'].map(dt.datetime.fromordinal)[initial_len_df:],
        y=predicted_df[spline_grade_min_mse][initial_len_df:],
        name=_("Predicted values grade ") + str(spline_grade_min_mse),
        text=_("Predicted values grade ") + str(spline_grade_min_mse),
        hoverinfo='text+x+y',
        mode='lines+markers',
        marker=dict(
            color=np.where(group_by_df['day'].index < len(y_train), 'red', 'green'))))
    spline_regression_fig.update_layout(
        height=700,
        font=dict(color="grey"),
        paper_bgcolor='rgba(0,0,0,0)',
        title=_("Regression Spline for ") + _(sensor_name),
        yaxis_title=_(sensor_name),
        xaxis_title=_('Day'),
        showlegend=True)

    print("MINIMUM mse", minimum_mse_val)
    print("BEST GRADE IS", spline_grade_min_mse)
    print("BEST PREDICTIONS ARE", predicted_df[spline_grade_min_mse])
    spline_regression_fig.show()
    spline_regression_json = json.dumps(spline_regression_fig, cls=plotly.utils.PlotlyJSONEncoder)
    print('It took', time.time() - started_at, 'seconds.')
    return spline_regression_json, minimum_mse_val, spline_grade_min_mse
# Example run: forecast 5 days of temperature from the published sample data.
data = pd.read_csv("https://raw.githubusercontent.com/iulianastroia/csv_data/master/final_dataframe.csv")
sensor_name = "temperature"
regression_result = calculate_spline_regression(data, sensor_name, 5)
if regression_result is False:
    # The function signals a failed fit by returning False; unpacking that
    # directly would raise an opaque "cannot unpack non-iterable bool" error.
    raise RuntimeError("Spline regression could not be fitted for sensor '" + sensor_name + "'")
spline_fig, minimum_mse_val, spline_grade_min_mse = regression_result