У меня есть данные, которые выглядят следующим образом.
Year Quarter Quantity Price
2000 1 23 142
2000 2 23 144
2000 3 23 147
2000 4 23 151
2001 1 22 160
2001 2 22 183
2001 3 22 186
2001 4 22 186
2002 1 21 212
2002 2 19 232
2002 3 19 223
2002 4 19 224
2003 1 19 231
2003 2 19 228
2003 3 19 238
2003 4 19 238
2004 1 19 234
2004 2 19 231
Я делю их на обучающие и тестовые данные, как вы можете видеть ниже. Кажется, все работает нормально вплоть до вызова print(my_submission.head()).
В конце я пытаюсь добавить результаты тестирования к обучающим данным, чтобы построить все на одном графике, что-то вроде этого.

Как я могу создать похожий график? Вот весь мой код.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import sklearn
from statsmodels.formula.api import ols
from seaborn import heatmap
# Load the quarterly Quantity/Price data set.
df = pd.read_csv('C:\\my_path_here\\insurance.csv')
df.head(10)
df.tail(10)
# Count missing values per column.
print(df.isnull().sum())
# Count exact-zero values per column.
print((df == 0).astype(int).sum(axis=0))
# Count fully duplicated rows. The previous `df.drop_duplicates()` call threw
# its result away, so it neither reported nor removed anything.
print(df.duplicated().sum())
# Fit an ordinary-least-squares regression of Quantity on Price and print
# the statsmodels summary table.
data_model = ols("Quantity ~ Price", data=df).fit()
print(data_model.summary())

# Render each regression diagnostic on its own 12x8 inch figure.
fig = sm.graphics.plot_partregress_grid(data_model, fig=plt.figure(figsize=(12, 8)))
fig = sm.graphics.plot_ccpr_grid(data_model, fig=plt.figure(figsize=(12, 8)))
fig = sm.graphics.plot_regress_exog(data_model, 'Price', fig=plt.figure(figsize=(12, 8)))
# Yearly totals of Quantity and Price.
df1 = df.groupby(['Year']).agg({'Quantity': 'sum', 'Price': 'sum'}).reset_index()

# Correlation matrix of the numeric columns, drawn on its own figure so it
# does not land on the axes left current by the previous plot.
X = df[['Year', 'Quantity', 'Price']]
corr = X.corr()
plt.figure()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True)

# Year x Quarter grid of prices. pandas 2.x only accepts keyword arguments
# for DataFrame.pivot, so the axes are named explicitly.
df2 = df.pivot(index='Year', columns='Quarter', values='Price')
f, ax = plt.subplots(figsize=(5, 15))
sns.heatmap(df2, linewidths=.5, ax=ax, annot=True, fmt='.1f')

# Mean price per Quantity level and quarter. pivot_table is used because
# Quantity values repeat, which a plain pivot cannot reshape.
# NOTE(review): the original call passed no `columns` argument; 'Quarter' is
# assumed here -- confirm the intended layout.
rev_df_revenue = df.pivot_table(index='Quantity', columns='Quarter', values='Price', aggfunc='mean')
plt.figure()  # fresh figure so the Year x Quarter heatmap is not overwritten
heatmap(rev_df_revenue, annot=True, fmt=".2f")
plt.title('Heatmap of Price')
# Everything looks OK up to this point...
# Import the helpers explicitly: a bare `import sklearn` does not guarantee
# that the `sklearn.model_selection` submodule has been loaded.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Split into training set and test set.
train, test = train_test_split(df, train_size=0.8)
train.shape
test.shape
print(train.dtypes)
print(test.dtypes)

# Start the feature-engineering process:
# X holds the independent variables and y the dependent variable (Price).
y = train['Price']
# Build X as a new frame instead of dropping in place, so `train` keeps its
# Price column for the plotting code that follows.
X = train.drop(columns=['Price'])
X.shape, y.shape

# Hold out 20% of the training rows as an internal test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Carve a validation split out of the remaining training rows.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, random_state=42)
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

# max_features=1.0 (use every feature) is what 'auto' meant for regressors;
# the 'auto' alias was removed in scikit-learn 1.3.
m1 = RandomForestRegressor(n_estimators=19, min_samples_split=2, min_samples_leaf=4, max_features=1.0, max_depth=80, bootstrap=True)
m1.fit(X_train, y_train)
m1.score(X_valid, y_valid)

# Predict Price for the held-out `test` rows using the training feature columns.
test_columns = X_train.columns
predictions = m1.predict(test[test_columns])
# Passing test.Quantity as a Series keeps the rows indexed like `test`.
my_submission = pd.DataFrame({'Quantity': test.Quantity, 'Price': predictions})
print(my_submission.head())
# Plot training prices, actual test prices and predicted test prices together.
# `y` holds the training prices taken from `train` before the features were
# built; `my_submission` is indexed like `test`, so all three series share the
# original row index. Sorting by index puts the rows back in file order.
_, price_ax = plt.subplots(figsize=(15, 8))  # fresh axes, not the last heatmap
y.sort_index().plot(ax=price_ax, title='Price', fontsize=14, marker='o', label='Train')
test['Price'].sort_index().plot(ax=price_ax, marker='o', label='Test (actual)')
my_submission['Price'].sort_index().plot(ax=price_ax, marker='x', linestyle='--', label='Test (predicted)')
price_ax.legend()
plt.show()