Я получаю отрицательный R ^ 2 независимо от того, какую модель регрессии я использую - PullRequest
1 голос
/ 26 января 2020

Я пытаюсь создать модель, которая описывала бы поведение моих данных. Я попробовал простую линейную регрессию, простую полиномиальную регрессию и полиномиальную регрессию с регуляризацией и перекрестной проверкой. Я следовал этому объяснению для выполнения регрессий.

Проблема в том, что все модели дают отрицательные значения R ^ 2 на тестовых данных. Я пробовал полиномиальные модели 1-й, 2-й и 3-й степени, но тогда результат становится ещё хуже.

Может ли кто-нибудь помочь мне выяснить, что не так? Или подсказать, какую модель можно использовать, чтобы избавиться от отрицательного R ^ 2 и получить нормальное значение?

Сводка для простой линейной регрессии

MAE, MSE, RMSE и R ^ 2 для простой линейной регрессии

MAE, MSE, RMSE и R ^ 2 для простой полиномиальной регрессии

MAE, MSE, RMSE и R ^ 2 для полиномиальной регрессии с регуляризацией и перекрестной проверкой

Код:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

from pandas import DataFrame
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Import function to automatically create polynomial features 
from sklearn.preprocessing import PolynomialFeatures

# Import Linear Regression and a regularized regression function
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV


#Training measurements, one entry per cycle 1..28.
#Predictors: Cycle, Internal_Resistance, CV_Capacity; target: Full_Capacity.
SoH = {
    'Cycle': list(range(1, 29)),
    'Internal_Resistance': [
        0.039684729, 0.033377614, 0.031960606, 0.03546798,
        0.036786229, 0.03479803, 0.026613861, 0.028650246,
        0.028183795, 0.035455215, 0.029205355, 0.033891692,
        0.026988849, 0.025647298, 0.033970376, 0.03172454,
        0.032437203, 0.033771218, 0.030939938, 0.036919977,
        0.027832869, 0.028602469, 0.023065191, 0.028890529,
        0.026640394, 0.031488253, 0.02865842, 0.027648949,
    ],
    'CV_Capacity': [
        389.9270401, 307.7366414, 357.6412139, 192.134787,
        212.415946, 204.737916, 166.506029, 157.826878,
        196.432589, 181.937188, 192.070363, 209.890964,
        198.978988, 206.126864, 185.631644, 193.776497,
        200.61431, 174.359373, 177.503285, 174.07905,
        170.654873, 184.528031, 208.065379, 210.134795,
        208.199237, 184.693507, 193.00402, 191.913131,
    ],
    'Full_Capacity': [
        1703.8575, 1740.7017, 1760.66, 1775.248302,
        1771.664053, 1781.958089, 1783.2295, 1784.500912,
        1779.280477, 1780.175547, 1800.761265, 1789.047162,
        1791.763677, 1787.014667, 1796.520256, 1798.349587,
        1791.776304, 1788.892761, 1791.990303, 1790.307248,
        1796.580484, 1803.89133, 1793.305294, 1784.638742,
        1780.056339, 1783.081746, 1772.001436, 1794.182046,
    ],
}

#Hold-out measurements (cycles 29..35) used to test the fitted models; same keys as SoH.
Test = {
    'Cycle': list(range(29, 36)),
    'Internal_Resistance': [
        0.026217822, 0.032549629, 0.025744309, 0.027945824,
        0.027332509, 0.027960729, 0.028969193,
    ],
    'CV_Capacity': [
        196.610972, 194.915587, 183.209067, 182.41669,
        204.018257, 179.929472, 189.576431,
    ],
    'Full_Capacity': [
        1777.880947, 1792.21646, 1785.653845, 1788.401923,
        1782.983718, 1793.939504, 1788.67233,
    ],
}

#Column layout shared by the training and the hold-out tables
all_columns = ['Cycle', 'Internal_Resistance', 'CV_Capacity', 'Full_Capacity']
predictor_columns = all_columns[:-1]

#Training data: df keeps the target column, df1 holds the predictors only,
#X is the predictor table as a plain NumPy matrix
df = DataFrame(data=SoH, columns=all_columns)
df1 = DataFrame(data=SoH, columns=predictor_columns)
X = df1.to_numpy()
print(df.head())
print()

#Hold-out data in the same three forms: dft (with target), dft1 and Xt (predictors only)
dft = DataFrame(data=Test, columns=all_columns)
dft1 = DataFrame(data=Test, columns=predictor_columns)
Xt = dft1.to_numpy()

#Plot the Full Capacity vs each predictor (Cycle, Internal Resistance and CV Capacity).
#The target column is excluded from the loop: plotting Full_Capacity against
#itself only produces a trivial diagonal and is not one of the predictors.
for i in df.columns.drop('Full_Capacity'):
    df.plot.scatter(i,'Full_Capacity', edgecolors=(0,0,0),s=50,c='g',grid=True)



#STATSMODELS



# Ordinary least squares with statsmodels on the training data
Y1 = df['Full_Capacity']
# Predictors as a bare array with a constant (intercept) column added
X1 = sm.add_constant(df[['Cycle','Internal_Resistance','CV_Capacity']].to_numpy())
model = sm.OLS(endog=Y1, exog=X1).fit()
# In-sample fitted values
predictions = model.predict(X1)
# Full regression report (coefficients, R-squared, ...) printed as text
print_model = model.summary()
print(print_model)
print()



#SCIKIT LEARN (Simple polynomial regression and polynomial regression with regularization and cross-validation)



# Fitting data - simple polynomial regression (1st degree), i.e. ordinary least squares.
# `normalize=True` is deliberately not passed: the argument was deprecated in
# scikit-learn 1.0 and removed in 1.2, where it raises a TypeError. For plain
# least squares the coefficients are reported in the original feature units
# either way, so coefficients and predictions are unaffected by dropping it.
linear_model = LinearRegression()

# Training predictors / target
X_linear=df.drop('Full_Capacity',axis=1)
y_linear=df['Full_Capacity']

# Hold-out predictors / target
X_linear_test=dft.drop('Full_Capacity',axis=1)
y_linear_test=dft['Full_Capacity']

linear_model.fit(X_linear,y_linear)

# Predictions on the training data and on the hold-out data
y_pred_linear = linear_model.predict(X_linear)
y_pred_linear_test = linear_model.predict(X_linear_test)

#Coefficients for the model, one row per predictor column
coeff_linear = pd.DataFrame(
    data=linear_model.coef_,
    index=X_linear.columns,
    columns=['Linear model coefficients'],
)
print(coeff_linear)
print()

#Metrics of the model on the training data
MAE_linear = mean_absolute_error(y_true=y_linear, y_pred=y_pred_linear)
MSE_linear = mean_squared_error(y_true=y_linear, y_pred=y_pred_linear)
RMSE_linear = np.sqrt(MSE_linear)
print("Mean absolute error of linear model:",MAE_linear)
print("Mean-squared error of linear model:",MSE_linear)
print("Root-mean-squared error of linear model:",RMSE_linear)
print()

#Same metrics on the hold-out data
MAE_linear_test = mean_absolute_error(y_true=y_linear_test, y_pred=y_pred_linear_test)
MSE_linear_test = mean_squared_error(y_true=y_linear_test, y_pred=y_pred_linear_test)
RMSE_linear_test = np.sqrt(MSE_linear_test)
print("Mean absolute error of linear model (validation):",MAE_linear_test)
print("Mean-squared error of linear model (validation):",MSE_linear_test)
print("Root-mean-squared error of linear model (validation):",RMSE_linear_test)
print()

#Coefficient of determination on both data sets
R2_linear = linear_model.score(X_linear,y_linear)
R2_linear_test = linear_model.score(X_linear_test,y_linear_test)
print ("R2 value of linear model:",R2_linear)
print ("R2 value of linear model (validation):",R2_linear_test)
print()

#Plot predicted values vs actual values: one figure for the training data and
#one for the hold-out data, each with a dashed identity line for reference
for x_caption, y_caption, predicted, actual in (
    ("Predicted value with linear fit", "Actual y-values",
     y_pred_linear, y_linear),
    ("Predicted (validation) value with linear fit", "Actual (validation) y-values",
     y_pred_linear_test, y_linear_test),
):
    plt.figure(figsize=(12,8))
    plt.xlabel(x_caption,fontsize=20)
    plt.ylabel(y_caption,fontsize=20)
    plt.grid(True)
    plt.scatter(predicted,actual,edgecolors=(0,0,0),lw=2,s=80)
    plt.plot(predicted,predicted, 'k--', lw=2)




#Fitting data - simple polynomial regression (3rd degree)
def _poly_feature_names(transformer, input_names):
    """Return the expanded feature names of a fitted PolynomialFeatures as a list.

    ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of ``get_feature_names_out``. Use whichever method the
    installed version provides so the script runs on both.
    """
    if hasattr(transformer, 'get_feature_names_out'):
        # The newer method returns an ndarray; convert it so that printing
        # and use as DataFrame column labels look the same as before.
        return list(transformer.get_feature_names_out(input_names))
    return transformer.get_feature_names(input_names)


# Expand the three training predictors into all terms up to degree 3 (no bias column)
poly = PolynomialFeatures(3,include_bias=False)
X_poly = poly.fit_transform(X)
X_poly_feature_name = _poly_feature_names(poly, ['Feature'+str(l) for l in range(1,4)])

print()
print()
print("3rd degree polynomial regression")
print()
print()

print(X_poly_feature_name)
print(len(X_poly_feature_name))
print()

df_poly = pd.DataFrame(X_poly, columns=X_poly_feature_name)
print(df_poly.head())
print()

# Attach the target so features and target live in one table
df_poly['y']=df['Full_Capacity']
print(df_poly.head())
print()

X_train=df_poly.drop('y',axis=1)
y_train=df_poly['y']

#Testing the model: the same expansion applied to the hold-out predictors
test = PolynomialFeatures(3,include_bias=False)
X_test=test.fit_transform(Xt)
X_test_feature_name = _poly_feature_names(test, ['Feature'+str(l) for l in range(1,4)])
print(X_test_feature_name)
print(len(X_test_feature_name))
print()

df_test = pd.DataFrame(X_test, columns=X_test_feature_name)
print(df_test.head())
print()

df_test['y']=dft['Full_Capacity']

#Data to test the polynomial models
X_testo=df_test.drop('y',axis=1)
y_testo=df_test['y']

# Least-squares fit on the expanded (3rd degree) training features.
# Note that `poly` is rebound here from the feature transformer to the regressor;
# `model_poly` refers to the same fitted estimator.
# NOTE(review): `normalize=True` was removed in scikit-learn 1.2 and raises a
# TypeError there. It is kept because the cubic features span many orders of
# magnitude, so dropping it may not be numerically neutral - confirm before changing.
poly = LinearRegression(normalize=True)
model_poly = poly.fit(X_train, y_train)

# Predictions on the training and on the hold-out features
y_poly = model_poly.predict(X_train)
y_poly_test = np.array(model_poly.predict(X_testo))

# One coefficient per expanded feature column
coeff_poly = pd.DataFrame(
    data=model_poly.coef_,
    index=X_train.columns,
    columns=['Coefficients polynomial model'],
)
print(coeff_poly)
print()

#Metrics of the polynomial model on the training data
MAE_poly = mean_absolute_error(y_true=y_train, y_pred=y_poly)
MSE_poly = mean_squared_error(y_true=y_train, y_pred=y_poly)
RMSE_poly = np.sqrt(MSE_poly)
print("Mean absolute error of simple polynomial model:",MAE_poly)
print("Mean-squared error of simple polynomial model:",MSE_poly)
print("Root-mean-squared error of simple polynomial model:",RMSE_poly)
print()

#Same metrics on the hold-out data
MAE_poly_test = mean_absolute_error(y_true=y_testo, y_pred=y_poly_test)
MSE_poly_test = mean_squared_error(y_true=y_testo, y_pred=y_poly_test)
RMSE_poly_test = np.sqrt(MSE_poly_test)
print("Mean absolute error of simple polynomial model (validation):",MAE_poly_test)
print("Mean-squared error of simple polynomial model (validation):",MSE_poly_test)
print("Root-mean-squared error of simple polynomial model (validation):",RMSE_poly_test)
print()

#Coefficient of determination on both data sets
R2_poly = model_poly.score(X_train,y_train)
R2_poly_test = model_poly.score(X_testo,y_testo)
print ("R2 value of simple polynomial model:",R2_poly)
print ("R2 value of simple polynomial model (validation):",R2_poly_test)
print()

#Predicted vs actual values for the simple polynomial model: training data first,
#then hold-out data; the dashed line marks perfect agreement
for x_caption, y_caption, predicted, actual in (
    ("Predicted value with simple polynomial model", "Actual y-values",
     y_poly, y_train),
    ("Predicted (validation) value with Simple polynomial model", "Actual (validation) y-values",
     y_poly_test, y_testo),
):
    plt.figure(figsize=(12,8))
    plt.xlabel(x_caption,fontsize=20)
    plt.ylabel(y_caption,fontsize=20)
    plt.grid(True)
    plt.scatter(predicted,actual,edgecolors=(0,0,0),lw=2,s=80)
    plt.plot(predicted,predicted, 'k--', lw=2)



#Fitting data with a polynomial model with regularization and cross-validation
#(Lasso with the penalty strength chosen by 10-fold cross-validation)
# NOTE(review): `normalize=True` was removed in scikit-learn 1.2 and raises a
# TypeError there. Unlike ordinary least squares, the Lasso solution depends on
# feature scaling, so the argument cannot simply be dropped - an explicit
# scaling step would be needed to migrate; confirm the intended scaling first.
model1 = LassoCV(
    cv=10,
    verbose=0,
    normalize=True,
    eps=0.001,
    n_alphas=100,
    fit_intercept = True,
    tol=0.0001,
    max_iter=10000,
)
model1.fit(X_train,y_train)

# Predictions on the training features (y_pred1) and the hold-out features (y_pred2)
y_pred1 = np.array(model1.predict(X_train))
y_pred2 = np.array(model1.predict(X_testo))

print()
print()
print("3rd degree polynomial regression with regularization and cross-validation")
print()
print()

# One coefficient per expanded feature column
coeff1 = pd.DataFrame(
    data=model1.coef_,
    index=X_train.columns,
    columns=['Coefficients Metamodel'],
)
print(coeff1)
print()

# Only the features whose coefficient is non-zero
nonzero_mask = coeff1['Coefficients Metamodel'] != 0
print(coeff1[nonzero_mask])
print("Intercept of the new polynomial model:",model1.intercept_)
print()

#Metrics of the polynomial model with regularization and cross-validation (training data)
MAE_1 = mean_absolute_error(y_true=y_train, y_pred=y_pred1)
MSE_1 = mean_squared_error(y_true=y_train, y_pred=y_pred1)
RMSE_1 = np.sqrt(MSE_1)
print("Mean absolute error of the new polynomial model:",MAE_1)
print("Mean-squared error of the new polynomial model:",MSE_1)
print("Root-mean-squared error of the new polynomial model:",RMSE_1)
print()

#Same metrics on the hold-out data
MAE_1_test = mean_absolute_error(y_true=y_testo, y_pred=y_pred2)
MSE_1_test = mean_squared_error(y_true=y_testo, y_pred=y_pred2)
RMSE_1_test = np.sqrt(MSE_1_test)
print("Mean absolute error of the new polynomial model (validation):",MAE_1_test)
print("Mean-squared error of the new polynomial model (validation):",MSE_1_test)
print("Root-mean-squared error of the new polynomial model (validation):",RMSE_1_test)
print()

#Coefficient of determination on both data sets and the selected penalty strength
R2_1 = model1.score(X_train,y_train)
R2_1_test = model1.score(X_testo,y_testo)
print ("R2 value of the new polynomial model:",R2_1)
print ("R2 value of the new polynomial model (validation):",R2_1_test)
print ("Alpha of the new polynomial model:",model1.alpha_)
print()

#Predicted vs actual values for the regularized polynomial model ("Metamodel"):
#training data first, then hold-out data; the dashed line marks perfect agreement.
#The markdown code fence that trailed the last statement has been removed; it
#was not Python and made the whole script fail with a SyntaxError.
plt.figure(figsize=(12,8))
plt.xlabel("Predicted value with Metamodel",fontsize=20)
plt.ylabel("Actual y-values",fontsize=20)
plt.grid(1)
plt.scatter(y_pred1,y_train,edgecolors=(0,0,0),lw=2,s=80)
plt.plot(y_pred1,y_pred1, 'k--', lw=2)

plt.figure(figsize=(12,8))
plt.xlabel("Predicted (validation) value with Metamodel",fontsize=20)
plt.ylabel("Actual (validation) y-values",fontsize=20)
plt.grid(1)
plt.scatter(y_pred2,y_testo,edgecolors=(0,0,0),lw=2,s=80)
plt.plot(y_pred2,y_pred2, 'k--', lw=2)


  [1]: https://i.stack.imgur.com/AhSwJ.png
...