R2, который я оцениваю, неверен, он отрицателен и слишком высок - PullRequest
0 голосов
/ 08 февраля 2020

Доброе утро, я использую нейронную сеть, чтобы предсказать физический квантовый процесс. После прогнозирования я хочу понять, есть ли корреляция с другой функцией, поэтому я строю график, приложенный ниже, где я вижу, что существует большая корреляция. Затем я хочу оценить r2, но есть ошибка, потому что результат - 10.187, что невозможно. Более того, если я оцениваю r2 с excel, это правильно, потому что r2 = 0,86, как вы можете видеть ниже.

Кто может это исправить?

Это сюжет, первый с python, а второй с Excel (с r2).

enter image description here

enter image description here

Это код:

# =============================================================================
# Carico le librerie
# =============================================================================
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from keras import optimizers
from keras import regularizers
from keras import backend
from tensorflow.keras import regularizers
from keras.regularizers import l2


# =============================================================================
# Scelgo il test size
# =============================================================================

test_size = 0.20


# =============================================================================
# Carico il dataset e scelgo label e features
# =============================================================================
dataset = pd.read_csv('Dataset.csv', decimal=',', delimiter = ";")

label = dataset['Label']
features = dataset.drop(columns = ['Label'])

# =============================================================================
# Per la normalizzazione
# =============================================================================
y_max_pre_normalize = max(label)
y_min_pre_normalize = min(label)

def denormalize(y):
    final_value = y*(y_max_pre_normalize-y_min_pre_normalize)+y_min_pre_normalize
    return final_value

# =============================================================================
# Suddivido il dataset in train e test
# =============================================================================

X_train1, X_test1, y_train1, y_test1 = train_test_split(features, label, test_size = test_size, shuffle = True)

# =============================================================================
# Estraggo le feature che mi serviranno nella seconda rete, perchè così sono 
# già state mischiate da shuffle 
# =============================================================================

feat = ['18','19','20']
features_post_train = X_train1[feat]
features_post_test = X_test1[feat]

X_train1 = X_train1.drop(columns = ['18','19','20'])
X_test1 = X_test1.drop(columns = ['18','19','20'])


y_test2 = y_test1.to_frame()
y_train2 = y_train1.to_frame()

# =============================================================================
# Normalizzo
# =============================================================================
scaler1 = preprocessing.MinMaxScaler()
scaler2 = preprocessing.MinMaxScaler()
X_train = scaler1.fit_transform(X_train1)
X_test = scaler2.fit_transform(X_test1)


scaler3 = preprocessing.MinMaxScaler()
scaler4 = preprocessing.MinMaxScaler()
y_train = scaler3.fit_transform(y_train2)
y_test = scaler4.fit_transform(y_test2)





# =============================================================================
# Mi serve per avere come metrica R2
# =============================================================================
from keras import backend as K
def r2_score(y_true, y_pred):
    SS_res =  K.sum(K.square(y_true - y_pred)) 
    SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

from datetime import datetime
logdir = "logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir) 

# =============================================================================
# Creo la 1° rete neurale
# =============================================================================
optimizer = tf.keras.optimizers.Adam(lr=0.001)
model = Sequential()
c= 1e-10
# ,kernel_regularizer=l2(c), bias_regularizer=l2(c)
model.add(Dense(100, input_shape = (X_train.shape[1],), activation = 'relu',kernel_initializer='glorot_uniform'))
model.add(Dropout(0.2))
model.add(Dense(100, activation = 'relu',kernel_initializer='glorot_uniform'))
model.add(Dropout(0.2))
model.add(Dense(100, activation = 'relu',kernel_initializer='glorot_uniform'))
model.add(Dense(1,activation = 'linear',kernel_initializer='glorot_uniform'))

model.compile(loss = 'mse', optimizer = optimizer, metrics = ['mse', r2_score])

history = model.fit(X_train, y_train, epochs = 200,
                    validation_split = 0.1, shuffle=False, batch_size=250
                    )

history_dict = history.history

loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
r2_score_train = history_dict['r2_score']
r2_score_val = history_dict['val_r2_score']

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

y_train_pred = denormalize(y_train_pred)
y_test_pred = denormalize(y_test_pred)



# =============================================================================
# Plot di y reali vs previste 
# =============================================================================
plt.figure()
plt.plot((y_test1),(y_test_pred),'.', color='darkviolet', alpha=1, marker='o', markersize = 2, markeredgecolor = 'black', markeredgewidth = 0.1)
plt.plot((np.array((-0.1,7))),(np.array((-0.1,7))),'-', color='magenta')
plt.xlabel('True')
plt.ylabel('Predicted')
plt.title('Test')
# plt.set_title('R2: ' + str(r2_score(y_test_pred, y_test1)))

# plt.savefig('figure11.png', dpi=150, bbox_inches="tight")

plt.figure()
plt.plot((y_train1),(y_train_pred),'.', color='darkviolet', alpha=1, marker='o', markersize = 2, markeredgecolor = 'black', markeredgewidth = 0.1)
plt.plot((np.array((-0.1,7))),(np.array((-0.1,7))),'-', color='magenta')
plt.xlabel('True')
plt.ylabel('Predicted')
# plt.set_title('R2: ' + str(r2_score(y_train_pred, y_train1)))
plt.title('Train')
# plt.savefig('figure21.png', dpi=150, bbox_inches="tight")

# =============================================================================
# Plot di R2
# =============================================================================
plt.figure()
plt.plot(r2_score_train,'b',label = 'r2_score_train')
plt.plot(r2_score_val,'r',label = 'r2_score_val')
plt.xlabel('Epochs')
plt.ylabel('r2_score')
plt.legend()

# =============================================================================
# Plot della loss
# =============================================================================

plt.figure()
plt.plot(loss_values,'b',label = 'training loss')
plt.plot(val_loss_values,'r',label = 'val training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss Function')
plt.legend()
# plt.savefig('figure31.png', dpi=150, bbox_inches="tight")

# =============================================================================
# Valuto R2, mse e RMSE
# =============================================================================
from sklearn.metrics import r2_score

print("\n\nThe R2 score on the test set is:\t{:0.3f}".format(r2_score(y_test_pred, y_test1)))

print("The R2 score on the train set is:\t{:0.3f}".format(r2_score(y_train_pred, y_train1)))
from sklearn import metrics


score = metrics.mean_squared_error(y_test_pred,y_test1)
print("\n\nFinal score test (MSE): %0.4f" %(score))
score1 = metrics.mean_squared_error(y_train_pred,y_train1)
print("Final score train (MSE): %0.4f" %(score1))
score2 = np.sqrt(metrics.mean_squared_error(y_test_pred,y_test1))
print(f"Final score test (RMSE): %0.4f" %(score2))
score3 = np.sqrt(metrics.mean_squared_error(y_train_pred,y_train1))
print(f"Final score train (RMSE): %0.4f" %(score3))


# =============================================================================
# Unisco le variabili estratte prima con quella predetta dalla rete (MSS_EO_Pred)
# =============================================================================

label_old = y_train2.append(y_test2)
label_old = label_old.to_numpy()
label_old = pd.DataFrame(label_old)


new_feat = features_post_train.append(features_post_test)

dataset = X_train1.append(X_test1)

dataset = pd.concat([dataset,new_feat],axis = 1)
dataset2 = dataset
dataset = dataset.to_numpy()
dataset = pd.DataFrame(dataset)
y_test_pred = pd.DataFrame(y_test_pred)
y_train_pred = pd.DataFrame(y_train_pred)

label_predicted = y_train_pred.append(y_test_pred)
label_predicted = label_predicted.to_numpy()
label_predicted = pd.DataFrame(label_predicted)
label_predicted.columns = ['MSS_EO_Predicted']
dataset = pd.concat([dataset,label_predicted],axis = 1)
dataset = pd.concat([dataset,label_old],axis = 1)

print("The R2 score on the train set is:\t{:0.3f}".format(r2_score(dataset.iloc[:,-5],dataset.iloc[:,-2])))

plt.figure()
plt.plot(dataset.iloc[:,-2],dataset.iloc[:,-5],'b.')
plt.xlabel('EO Pred')
plt.ylabel('TP Misurato')
plt.legend()
...