Почему модель, загруженная из pickle, ведет себя иначе, чем исходная модель? - PullRequest
0 голосов
/ 19 июня 2020

У меня есть сплайновая регрессия, с помощью которой я предсказываю температуру на следующие 5 дней. Модель работает нормально, но я хочу сохранять обученную модель на диск, чтобы прогноз выполнялся быстрее. Я попробовал использовать pickle, но при загрузке модели код выдает ValueError: shapes (5,5) and (14,) not aligned: 5 (dim 1) != 14 (dim 0), хотя без pickle он работал отлично. Как добиться, чтобы код с сохранением и загрузкой модели через pickle работал так же, как при обычном обучении сплайновой регрессии без pickle?

Код с pickle ниже:

import datetime as dt
import json
import pickle

import numpy as np
import pandas as pd
import plotly
import statsmodels.api as sm
from patsy import dmatrix
from plotly import graph_objs as go
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from flask_babel import _
from visualization.timeseries import create_columns
import time
import _pickle as cPickle

# register_matplotlib_converters()
# Tell NumPy to ignore floating-point divide-by-zero conditions for the whole process.
np.seterr(divide='ignore')
# Wall-clock timestamp taken at module load; calculate_spline_regression prints
# the time elapsed since this moment when it finishes.
start = time.time()


def calculate_spline_regression(data, sensor_name, days_predicted):
    """Fit B-spline OLS regressions of degree 1..10 on daily sensor means and plot a forecast.

    The readings in ``data`` are averaged per day, the series is extended by
    ``days_predicted`` future days (pre-filled with the overall mean so the
    frame has no gaps), and one spline regression per degree is fitted on the
    observed days. The degree with the lowest mean squared error over the
    whole (observed + future) series is selected, its forecast is plotted, and
    the fitted model for that degree is pickled to
    ``static/prediction_models/model_<sensor_name>.pkl``.

    Only the selected model is written to disk. A loader must therefore
    rebuild its design matrix with the returned degree: the number of model
    parameters depends on the degree, and predicting with a design matrix of a
    different degree fails with a "shapes ... not aligned" ValueError.

    Args:
        data: DataFrame of raw readings. It is passed to ``create_columns``
            and must afterwards contain ``'day'``, ``'readable time'`` and
            ``sensor_name`` columns.
        sensor_name: Name of the column to forecast.
        days_predicted: Number of future days to forecast; also used as the
            size of the hold-out split.

    Returns:
        A tuple ``(figure_json, minimum_mse, best_degree)``, or ``False`` when
        fitting an OLS model raises ``ValueError``.
    """
    create_columns(data)  # from timeseries
    data['day'] = pd.to_datetime(data['day'], dayfirst=True)
    data = data.sort_values(by=['readable time'])

    # Average only the sensor column per day; averaging the whole frame fails
    # on non-numeric columns in recent pandas versions.
    group_by_df = data.groupby('day')[sensor_name].mean().reset_index()
    group_by_df.columns = ['day', sensor_name]
    group_by_df['day'] = pd.to_datetime(group_by_df['day'])

    # Length of the dataframe before the future days are appended.
    initial_len_df = len(group_by_df)
    rng = pd.date_range(group_by_df['day'].min(), periods=initial_len_df + days_predicted, freq='D')

    df = pd.DataFrame({'day': rng})
    df[sensor_name] = group_by_df[sensor_name]
    # Fill the future rows with the mean of the observed values. A single
    # .loc assignment is used because chained indexing (df[col][n:] = ...) may
    # write to a temporary copy.
    df.loc[initial_len_df:, sensor_name] = group_by_df[sensor_name].mean()

    group_by_df = df
    print(group_by_df)

    # The regression works on integer day ordinals rather than timestamps.
    group_by_df['day'] = group_by_df['day'].map(dt.datetime.toordinal)

    X = group_by_df[['day']].values
    y = group_by_df[[sensor_name]].values
    # Unshuffled split: the last `days_predicted` rows (the future days) form the test set.
    X_train, X_test, y_train, _y_test = train_test_split(X, y, test_size=days_predicted, shuffle=False)

    def analyse_forecast(dataframe_name, predicted_list, regression_type, degree):
        """Print MSE and R2 of ``predicted_list`` against the sensor column; return the MSE."""
        mse = mean_squared_error(dataframe_name[sensor_name], predicted_list)
        print("\n Grade: ", degree)
        print("MSE " + regression_type + " regression(mean squared error)", mse)
        print("r2 score ", r2_score(dataframe_name[sensor_name], predicted_list))
        return mse

    group_by_df.reset_index(inplace=True)

    def mse_minumum(regression_type, mse_list_regression, max_grade_regression):
        """Return ``(max_grade_regression, lowest MSE, degree that produced it)``."""
        grade_column = regression_type + '_grade'
        mse_df = pd.DataFrame(mse_list_regression)
        mse_df.columns = ['mse_values']
        mse_df[grade_column] = list(range(1, max_grade_regression + 1))
        # Repeated MSE values become NaN, so only the first degree reaching a
        # given MSE can be selected.
        mse_df['mse_values'] = mse_df['mse_values'].drop_duplicates()
        best_row = mse_df['mse_values'].idxmin()
        best_mse = mse_df.at[best_row, 'mse_values']
        best_grade = mse_df.at[best_row, grade_column]

        print("mse vaaaal", best_mse)
        print("spline grade", best_grade)
        return max_grade_regression, best_mse, best_grade

    # Knot positions: quartiles of the day ordinals, separately for the train
    # and test ranges. These local names (and `degree` in the loop below) are
    # referenced by name inside the patsy formula strings, which patsy
    # evaluates in this function's scope, so they must not be renamed.
    train_days = group_by_df['day'][:len(X_train)]
    test_days = group_by_df['day'][len(X_train):]
    percentile_25_train, percentile_50_train, percentile_75_train = np.percentile(train_days, [25, 50, 75])
    percentile_25_test, percentile_50_test, percentile_75_test = np.percentile(test_days, [25, 50, 75])

    train_formula = (
        "bs(X_train, knots=(percentile_25_train,percentile_50_train,percentile_75_train), degree=degree,"
        " include_intercept=False)")
    # NOTE(review): the test basis is built with its own knots rather than the
    # training knots, so it is not the same spline basis the model was fitted
    # on. Kept as in the original - confirm whether this is intended.
    test_formula = (
        "bs(X_test, knots=(percentile_25_test,percentile_50_test,percentile_75_test),degree=degree, "
        "include_intercept=False)")

    spline_regression_fig = go.Figure()
    mse_list_spline = []
    predicted_df = pd.DataFrame()
    fitted_models = {}  # degree -> fitted OLS results

    for degree in range(1, 11):
        # Spline basis with 3 knots for the training days.
        transformed_x1 = dmatrix(train_formula, {"X_train": X_train}, return_type='dataframe')

        try:
            fit_spline = sm.OLS(y_train, transformed_x1).fit()
        except ValueError:
            print("value error")
            return False
        fitted_models[degree] = fit_spline

        # Predict the training days (reusing the design matrix built above)
        # and the future days.
        pred_spline_train = fit_spline.predict(transformed_x1)
        pred_spline_test = fit_spline.predict(
            dmatrix(test_formula, {"X_test": X_test}, return_type='dataframe'))

        # Holds all predicted values (train followed by test).
        predicted_val = pred_spline_train.tolist() + pred_spline_test.tolist()
        predicted_df[degree] = predicted_val

        mse_list_spline.append(analyse_forecast(group_by_df, predicted_val, "spline", degree))

    _, minimum_mse_val, spline_grade_min_mse = mse_minumum("spline", mse_list_spline, len(mse_list_spline))

    # Persist the model of the selected degree once. Writing the file inside
    # the loop left the last degree tried on disk, whatever degree was
    # selected, and leaked the file handle.
    filename = 'static/prediction_models/model_' + str(sensor_name) + '.pkl'
    with open(filename, 'wb') as model_file:
        pickle.dump(fitted_models[int(spline_grade_min_mse)], model_file)

    # Only the forecast part (rows after the observed days) is plotted.
    # NOTE(review): the colour array below covers the whole frame while x and
    # y are sliced to the forecast rows - confirm the intended colouring.
    spline_regression_fig.add_trace(go.Scatter(
        x=group_by_df['day'].map(dt.datetime.fromordinal)[initial_len_df:],
        y=predicted_df[spline_grade_min_mse][initial_len_df:],
        name=_("Predicted values grade ") + str(spline_grade_min_mse),
        text=_("Predicted values grade ") + str(spline_grade_min_mse),
        hoverinfo='text+x+y',
        mode='lines+markers',
        marker=dict(
            color=np.where(group_by_df['day'].index < len(y_train), 'red', 'green'))))

    spline_regression_fig.update_layout(
        height=700,
        font=dict(color="grey"),
        paper_bgcolor='rgba(0,0,0,0)',
        title=_("Regression Spline for ") + _(sensor_name),
        yaxis_title=_(sensor_name),
        xaxis_title=_('Day'),
        showlegend=True)

    print("MINIMUM mse", minimum_mse_val)
    print("BEST GRADE IS", spline_grade_min_mse)
    print("BEST PREDICTIONS ARE", predicted_df[spline_grade_min_mse])
    spline_regression_fig.show()

    spline_regression_json = json.dumps(spline_regression_fig, cls=plotly.utils.PlotlyJSONEncoder)
    print('It took', time.time() - start, 'seconds.')

    return spline_regression_json, minimum_mse_val, spline_grade_min_mse


# Load the sample sensor readings from a CSV hosted on GitHub (runs at module load).
data = pd.read_csv("https://raw.githubusercontent.com/iulianastroia/csv_data/master/final_dataframe.csv")
# Column of `data` to forecast.
sensor_name = "temperature"

# Forecast 5 days ahead. The three-way unpacking assumes the tuple return;
# calculate_spline_regression returns False when the OLS fit raises ValueError.
spline_fig, minimum_mse_val, spline_grade_min_mse = calculate_spline_regression(data, sensor_name, 5)

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...