RandomForestRegressor: обучающие и прогнозируемые данные расходятся - PullRequest
0 голосов
/ 22 июня 2019

У меня проблема с прогнозом RandomForestRegressor на моих данных. Как вы можете видеть здесь, предсказание не выглядит так, как должно. Однако, насколько я понимаю, необходимо получить более достоверный прогноз. Есть ли у вас какие-либо советы по этому поводу?

"""
Here we change resample from daily to every 3H.
Random Forest Regressor is not able to make "good" predictions on future dates where no data exists.
"""

from sklearn.ensemble import RandomForestRegressor

def load_data(path='sample-data.csv', freq='3h'):
    """Load the sales CSV and sum ``total_gross`` into fixed time bins.

    Only the ``created`` and ``total_gross`` columns are read. ``created`` is
    parsed as datetimes while reading and becomes the index that the
    resampling operates on.

    Args:
        path: File path or file-like object accepted by ``pandas.read_csv``.
            Defaults to ``'sample-data.csv'``, the file the script
            originally hard-coded.
        freq: Resampling rule handed to ``DataFrame.resample``. Defaults to
            three hours, spelled with the lowercase ``'h'`` alias because
            the uppercase ``'H'`` spelling is deprecated in recent pandas
            releases.

    Returns:
        A DataFrame indexed by the resampled ``created`` timestamps with a
        single ``total_gross`` column holding the sum for each bin.
    """
    df = pd.read_csv(path, usecols=['created', 'total_gross'], parse_dates=['created'])
    # fillna(0) guarantees that bins without any rows never surface as NaN.
    return df.set_index('created').resample(freq).sum().fillna(0)

# Aggregated frame produced by load_data().
df = load_data()

# First look: the raw per-bin totals over time.
plt.xticks(rotation=90)
plt.plot(df)
plt.show()

# Add the running total of total_gross as an extra column.
df = df.assign(total_gross_accumulated=df.total_gross.cumsum())

# Second look: the accumulated curve that the model is asked to predict.
plt.xticks(rotation=90)
plt.plot(df.index, df.total_gross_accumulated)
plt.show()

# Single-column feature matrix: the index cast to int64 and floor-divided
# by 10**9 (presumably nanoseconds -> seconds since the epoch; confirm the
# index resolution).
X = (df.index.astype('int64').values // 1_000_000_000).reshape(-1, 1)
# Regression target: the accumulated totals.
y = df.total_gross_accumulated

def eval_on_features(features, target, regressor, n_train=453):
    """Fit ``regressor`` on the leading samples and plot its predictions.

    The first ``n_train`` rows of ``features``/``target`` are used for
    training and the remaining rows for testing. The test-set R^2 score is
    printed, and four curves (train, test, prediction on train, prediction
    on test) are drawn on the current matplotlib axes against the sample
    position. The function does not call ``plt.show()``; displaying the
    figure is left to the caller.

    Args:
        features: Sliceable collection of feature rows, one per sample.
        target: Sliceable collection of target values aligned with
            ``features``.
        regressor: Estimator exposing ``fit``, ``score`` and ``predict``.
        n_train: Number of leading samples used for training. Defaults to
            453, the split the script originally hard-coded.

    Raises:
        ValueError: If ``n_train`` does not leave at least one sample on
            each side of the split.
    """
    n_samples = len(features)
    # Fail early with a readable message instead of an obscure error from
    # the estimator or from matplotlib when one side of the split is empty.
    if not 0 < n_train < n_samples:
        raise ValueError(
            "n_train must satisfy 0 < n_train < {} (number of samples), "
            "got {}".format(n_samples, n_train)
        )

    # Split chronologically: no shuffling, the tail is the test set.
    X_train, X_test = features[:n_train], features[n_train:]
    y_train, y_test = target[:n_train], target[n_train:]

    # Fit
    regressor.fit(X_train, y_train)

    # Show R^2 score
    print("Test-set R^2: {:.2f}".format(regressor.score(X_test, y_test)))

    # Predictions
    y_pred = regressor.predict(X_test)
    y_pred_train = regressor.predict(X_train)

    # Plot against the sample position so train and test sit side by side.
    train_axis = range(n_train)
    test_axis = range(n_train, n_train + len(y_test))
    plt.plot(train_axis, y_train, label="train")
    plt.plot(test_axis, y_test, '-', label="test")
    plt.plot(train_axis, y_pred_train, '--', label="prediction train")
    plt.plot(test_axis, y_pred, '--', label="prediction test")

    # Place the legend just outside the right edge of the axes.
    plt.legend(loc=(1.01, 0))


from sklearn.ensemble import RandomForestRegressor
"""
We have individual days and RandomForestRegressor can't deal with these.
There is no more information available after Aug 18. Therefore, Aug 18 is taken as last point of information.
"""
# First experiment: the single-column feature matrix X is the only input.
regressor = RandomForestRegressor(random_state=0, n_estimators=100)
eval_on_features(X, y, regressor)

# Second experiment: two calendar features per sample, day of week and
# hour of day, stacked side by side as columns.
X_hour_week = np.column_stack(
    (df.index.dayofweek.values, df.index.hour.values)
)

# The same estimator instance is fitted again on the new features.
eval_on_features(X_hour_week, y, regressor)
...