Статистический анализ для линейной регрессии с помощью Scikit Learn - PullRequest
0 голосов
/ 16 октября 2018

код, с которым у меня возникла проблема, это # ​​Статистический анализ обучающего набора с Scikit-Learn, который представляет собой код для простой линейной регрессии, который воспроизводит выходные данные Statsmodels с использованием Scikit learn, но когда я пытался изменить его, чтобы выполнить егона множественной линейной регрессии у меня не получилось.Есть ли у вас какие-либо идеи, как изменить его, чтобы он работал?

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
data = pd.read_excel ("C:\\Users\\Aymen\\Desktop\\Multiple_Linear_Regression\\Excels\\ABC.xlsx",'Sheet1') #Import Excel file


# Replace null values of the whole dataset with 0
data1 = data.fillna(0)
print(data1)

# Extraction of the independent and dependent variables
X = data1.iloc[0:len(data1),[1,2,3,4,5]] #Extract the column of the COPCOR SP we are going to check its impact
Y = data1.iloc[0:len(data1),6] #Extract the column of the PAUS SP
XY = data1.iloc[0:len(data1),[1,2,3,4,5,6]] #Extract the column of the COPCOR SP we are going to check its impact


# Data Splitting to train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size =0.25,random_state=42)
print(X_train.head(0))
from sklearn.preprocessing import StandardScaler
scaled_features = StandardScaler().fit_transform(X_train)
X_train = pd.DataFrame(scaled_features, index=X_train.index, columns=X_train.columns)
scaled_features = StandardScaler().fit_transform(X_test)
X_test = pd.DataFrame(scaled_features, index=X_test.index, columns=X_test.columns)


# Statistical Analysis of the training set with Statsmodels
X_All = sm.add_constant(X_train) # add a constant to the model
est = sm.OLS(Y_train, X_All).fit()
print(est.summary()) # print the results

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math
lm = LinearRegression()  # create an lm object of LinearRegression Class
lm.fit(X_train,Y_train)  # train our LinearRegression model using the training set of data - dependent and independent variables as parameters. Teaching lm that Y_train values are all corresponding to X_train.
print(lm.intercept_)
print(lm.coef_)
mse_test = mean_squared_error(Y_test, lm.predict(X_test))
print(math.sqrt(mse_test))

# Statistical Analysis of the training set with Scikit-Learn
params1 = np.append(lm.intercept_,lm.coef_)
predictions1 = lm.predict(X_train)
newX1 = pd.DataFrame({"Constant":np.ones(len(X_train))}).join(pd.DataFrame(X_train))
MSE1 = (sum((Y_train-predictions1)**2))/(len(newX1)-len(newX1.columns))
var_b1 = MSE1*(np.linalg.inv(np.dot(newX1.T,newX1)).diagonal())
sd_b1 = np.sqrt(var_b1)
ts_b1 = params1/ sd_b1
p_values1 =[2*(1-stats.t.cdf(np.abs(i),(len(newX1)-1))) for i in ts_b1]
sd_b1 = np.round(sd_b1,3)
ts_b1 = np.round(ts_b1,3)
p_values1 = np.round(p_values1,5)
params1 = np.round(params1,4)
myDF2 = pd.DataFrame()
myDF2["Coefficients"],myDF2["Standard Errors"],myDF2["t values"],myDF2["P-values"] = [params1,sd_b1,ts_b1,p_values1]
print(myDF2)

# Data Splitting to train and test set of the reduced data
X_1 = data1.iloc[0:len(data1),[2,3]] #Extract the column of the COPCOR SP we are going to check its impact
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X_1, Y, test_size =0.25,random_state=42)
scaled_features = StandardScaler().fit_transform(X_train2)
X_train2 = pd.DataFrame(scaled_features, index=X_train2.index, columns=X_train2.columns)
scaled_features = StandardScaler().fit_transform(X_test2)
X_test2 = pd.DataFrame(scaled_features, index=X_test2.index, columns=X_test2.columns)


# Statistical Analysis of the reduced model with Statsmodels
X_reduced = sm.add_constant(X_train2) # add a constant to the model
est_reduced = sm.OLS(Y_train2, X_reduced).fit()
print(est_reduced.summary()) # print the results

# Fitting a Linear Model for the reduced model with Scikit-Learn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import math
lm1 = LinearRegression() #create an lm object of LinearRegression Class
lm1.fit(X_train2, Y_train2)
print(lm1.intercept_)
print(lm1.coef_)
mse_test1 = mean_squared_error(Y_test2, lm1.predict(X_test2))
print(math.sqrt(mse_test1))


#Cross Validation and Training again the model
from sklearn.model_selection import KFold
from sklearn import model_selection
kf = KFold(n_splits=6, random_state=42)
for train_index, test_index in kf.split(X_train):
 print("Train:", train_index, "Validation:",test_index)
 X_train1, X_test1 = X.iloc[train_index], X.iloc[test_index]
 Y_train1, Y_test1 = Y.iloc[train_index], Y.iloc[test_index]
results = -1 * model_selection.cross_val_score(lm1, X_train1, Y_train1,scoring='neg_mean_squared_error', cv=kf)
print(np.sqrt(results))
#RMSE values interpretation
print(math.sqrt(mse_test))
print(math.sqrt(results.mean()))
#Good model built no overfitting or underfitting (Barely Same for test and training5/6 : Goal of Cross validation but low prediction accuracy = Value is big

import seaborn
Corr=XY.corr(method='pearson')
mask=np.zeros_like(Corr)
mask[np.triu_indices_from(mask)]=True
seaborn.heatmap(Corr,cmap='RdYlGn_r',vmax=1.0,vmin=-1.0,mask=mask, linewidths=2.5)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

Я получаю это:

  C:\Inicio\tools\64\Anaconda3-5.2.0.1\lib\site-packages\scipy\stats\_distn_infrastructure.py:1738: RuntimeWarning: invalid value encountered in greater_equal
  cond2 = (x >= self.b) & cond0

Результаты показывают только коэффициенты правильно, другие значения NaN

это пример данных

   Last 26 weeks        Variable Number 1
0          201823                                   0   
1          201824                                   0   
2          201825                                   0   
3          201826                                   0   
4          201827                                   0   
5          201828                              105000   
6          201829                                   0   
7          201830                             -105000   
8          201831                                   0   
9          201832                                   0   
10         201833                                   0   
11         201834                                   0   
12         201835                                   0   
13         201836                                   0   
14         201837                                   0   
15         201838                                   0   
16         201839                                   0   
17         201840                                   0   
18         201841                                   0   
19         201842                                   0   
20         201843                                   0   
21         201844                                   0   
22         201845                                   0   
23         201846                                   0   
24         201847                                   0   
25         201848                                   0   

    Variable Number 2
0                                      0   
1                                      0   
2                                      0   
3                                      0   
4                                 543000   
5                                      0   
6                                      0   
7                                      0   
8                                      0   
9                                      0   
10                                     0   
11                                120000   
12                                     0   
13                                 -3000   
14                                     0   
15                                     0   
16                                -75000   
17                                -36000   
18                                228000   
19                                     0   
20                                     0   
21                                     0   
22                                     0   
23                                630000   
24                                     0   
25                               -132000   

               Variable Number 3 \
0                                            0   
1                                            0   
2                                            0   
3                                            0   
4                                            0   
5                                            0   
6                                            0   
7                                            0   
8                                            0   
9                                            0   
10                                           0   
11                                           0   
12                                           0   
13                                           0   
14                                           0   
15                                           0   
16                                           0   
17                                           0   
18                                           0   
19                                      345000   
20                                           0   
21                                           0   
22                                           0   
23                                           0   
24                                           0   
25                                           0   

         Variable Number 4                          Variable Number 5\
0                                      714000                       0   
1                                           0                   57000   
2                                           0                       0   
3                                           0                       0   
4                                           0                       0   
5                                           0                    6000   
6                                           0                       0   
7                                           0                       0   
8                                           0                       0   
9                                           0                       0   
10                                          0                       0   
11                                          0                       0   
12                                          0                    3000   
13                                          0                       0   
14                                          0                       0   
15                                          0                   24000   
16                                          0                       0   
17                                          0                       0   
18                                          0                       0   
19                                          0                       0   
20                                          0                    3000   
21                                          0                       0   
22                                          0                       0   
23                                          0                       0   
24                                          0                  138000   
25                                          0                   48000   

Variable Number 6 
0      765000  
1       57000  
2           0  
3           0  
4      615000  
5      111000  
6           0  
7           0  
8           0  
9           0  
10          0  
11     237000  
12     165000  
13          0  
14          0  
15      24000  
16          0  
17          0  
18     357000  
19     429000  
20       3000  
21      21000  
22          0  
23     630000  
24     138000  
25      48000  
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...