Это код, который я создал для простой линейной регрессии. У меня есть несколько вопросов, на которые я ищу ответы. Как обнаружить и удалить выбросы из X и Y — может быть, пример кода поможет? Что вы думаете о качестве части подготовки и оценки модели? Правильно ли выполнена перекрёстная проверка и разбиение на обучающую и тестовую выборки? Как интерпретировать среднеквадратичные значения (RMSE) — большие значения это хороший знак или нет?
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
# Load the source workbook (sheet "Sheet1") into a DataFrame
DATA_PATH = "C:\\Users\\AchourAh\\Desktop\\Simple_Linear_Regression\\SP Level Simple Linear Regression\\PL32_PMM_03_09_2018_SP_Level.xlsx"
data = pd.read_excel(DATA_PATH, 'Sheet1')
# Missing cells are treated as zero quantities
data1 = data.fillna(0)
print(data1)
# Independent variable: column 1 (COPCOR SP, whose impact we measure)
# Dependent variable:   column 2 (PAUS SP)
# reshape(-1, 1) turns each 1-D column into the (n, 1) matrix sklearn expects
X = data1.iloc[:, 1].values.reshape(-1, 1)
Y = data1.iloc[:, 2].values.reshape(-1, 1)
# Hold out 25% of the observations as a test set.
# random_state pins the shuffle so runs are reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42)
# Training the model and evaluating it with 6-fold cross-validation.
from sklearn.linear_model import LinearRegression
from sklearn import model_selection
from sklearn.model_selection import KFold

lm = LinearRegression()       # ordinary least-squares linear model
lm.fit(X_train, Y_train)      # fit on the training split only

# 6-fold CV over the TRAINING data (never the test set, to avoid leakage).
kf = KFold(n_splits=6, random_state=None)
# fixed: the original line was missing "):" (SyntaxError) and the loop body
# was unindented; it also indexed the full X/Y with indices produced from
# X_train, which mixes test rows back into the folds.
for train_index, test_index in kf.split(X_train):
    print("Train:", train_index, "Validation:", test_index)
    X_train1, X_test1 = X_train[train_index], X_train[test_index]
    Y_train1, Y_test1 = Y_train[train_index], Y_train[test_index]

# cross_val_score re-splits its input with kf itself, so it must receive the
# whole training set (the original passed only the last fold, accidentally
# nesting CV inside CV).  The scorer returns *negative* MSE, hence the -1.
results = -1 * model_selection.cross_val_score(
    lm, X_train, Y_train, scoring='neg_mean_squared_error', cv=kf)
print(results)         # per-fold MSE
print(results.mean())  # average cross-validated MSE
# Evaluate the fitted model on the held-out test set.
y_pred = lm.predict(X_test)

from sklearn.metrics import mean_squared_error
import math

mse_test = mean_squared_error(Y_test, y_pred)
print(mse_test)                    # test-set MSE
print(math.sqrt(mse_test))         # test RMSE: same units as Y, lower is better
print(math.sqrt(results.mean()))   # CV RMSE, for comparison with the test RMSE

# fixed: wrapping the arrays in one-element lists ([Y_test], [y_pred]) built a
# single-row DataFrame holding whole arrays; ravel() flattens the (n, 1)
# columns so the frame has one (Actual, Predicted) row per observation.
df = pd.DataFrame({'Actual': Y_test.ravel(), 'Predicted': y_pred.ravel()})
print(df)
# Visualize the fit: training points in red, fitted regression line in blue.
plt.scatter(X_train, Y_train, color='red')
plt.plot(X_train, lm.predict(X_train), color='blue')
ax = plt.gca()
ax.set_title('SP000905974')
ax.set_xlabel('COP COR Quantity')
ax.set_ylabel('PAUS Quantity')
plt.show()
# Statistical analysis of the training set with statsmodels:
# OLS with an explicit intercept column gives the full summary table
# (coefficients, standard errors, t-stats, p-values, R²).
X_with_const = sm.add_constant(X_train)
ols_fit = sm.OLS(Y_train, X_with_const).fit()
print(ols_fit.summary())
# Statistical analysis of the scikit-learn fit, derived by hand: reproduce
# the standard errors, t-statistics and p-values that statsmodels reports.
params = np.append(lm.intercept_, lm.coef_)   # [intercept, slope]
predictions = lm.predict(X_train)

# Design matrix with an explicit constant column (mirrors sm.add_constant).
newX = pd.DataFrame({"Constant": np.ones(len(X_train))}).join(pd.DataFrame(X_train))
dof = len(newX) - len(newX.columns)  # residual degrees of freedom, n - k

# Unbiased estimate of the residual variance; ravel() collapses the (n, 1)
# arrays so MSE is a plain scalar.
MSE = np.sum((Y_train.ravel() - predictions.ravel()) ** 2) / dof
var_b = MSE * np.linalg.inv(np.dot(newX.T, newX)).diagonal()
sd_b = np.sqrt(var_b)
ts_b = params / sd_b
# fixed: the t-distribution must use n - k degrees of freedom, consistent
# with the MSE denominator above (the original used n - 1).
p_values = [2 * (1 - stats.t.cdf(np.abs(t), dof)) for t in ts_b]

myDF1 = pd.DataFrame()
myDF1["Coefficients"] = np.round(params, 4)
myDF1["Standard Errors"] = np.round(sd_b, 3)
myDF1["t values"] = np.round(ts_b, 3)
myDF1["P-values"] = np.round(p_values, 5)
print(myDF1)
Я новичок в этом и также открыт для других замечаний — подскажите, пожалуйста, если что-то ещё не так с кодом.