Я пытаюсь оптимизировать свою логистическую регрессию, удаляя переменные, которые не проходят мой уровень значимости 5%.
Мне интересно, могу ли я использовать для этого просто методы подбора (fit) OLS или Logit?
Когда я использую метод подбора Logit и увеличиваю количество признаков выше определённого числа, я получаю сообщение об ошибке «LinAlgError: Singular matrix». Когда я удаляю некоторые из этих признаков, метод Logit fit работает.
Есть ли ограничение на количество признаков, которые можно добавить в logit.fit? Или это вызвано чем-то другим?
Мой код приведён ниже:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
import pyodbc
# Set up the SQL connection and load the modelling view into a DataFrame.
conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=xxxxxx;'
                      'Database=Datawarehouse;'
                      'Trusted_Connection=yes;')
try:
    # pandas runs the query through the connection itself, so no explicit
    # cursor is needed here.
    dataset = pd.read_sql('SELECT * from [VIEW_ContactsLeadsModel_AU]', con=conn)
finally:
    # Close the connection even when the query raises.
    conn.close()
# Keep an untouched snapshot of the query result before any cleaning
ContactsOutput = dataset.copy()

# Replace nulls with a per-column placeholder label.
# (dataset.isnull().any() can be used to see which columns contain nulls.)
# Note: Email, Title and LeadRating are filled here but dropped just below.
null_placeholders = {
    'Email': 'UEmail',
    'WorkplaceFunction': 'UWorkplace',
    'IndustryLevel1': 'UIND1',
    'IndustryLevel2': 'UIND2',
    'IndustryLevel3': 'UIND3',
    'Title': 'UTitle',
    'LeadRating': 'URating',
}
for column_name, placeholder in null_placeholders.items():
    dataset[column_name] = dataset[column_name].fillna(value=placeholder)

# Drop the columns that are not needed for the model
unused_columns = [
    'ContactIDno',
    'Email',
    'MobilePhone',
    'Phone',
    'ContactName',
    'Title',
    'MailingCountry',
    'LeadRating',
    'LeadBuid',
    'LeadDiv',
    'LeadDPC',
    'LeadSBU',
]
dataset.drop(unused_columns, axis='columns', inplace=True)
# Create dummy (indicator) variables for each categorical column.
# drop_first=True omits the first level of every column.
# dtype is passed explicitly so the dummies are numeric 0/1 columns
# regardless of the pandas default (bool in pandas >= 2.0, uint8 earlier).
ContactDummy = pd.get_dummies(dataset['Contact_status__c'], drop_first=True, dtype=np.uint8)
WFunctionDummy = pd.get_dummies(dataset['WorkplaceFunction'], drop_first=True, dtype=np.uint8)
Industry1Dummy = pd.get_dummies(dataset['IndustryLevel1'], drop_first=True, dtype=np.uint8)
Industry2Dummy = pd.get_dummies(dataset['IndustryLevel2'], drop_first=True, dtype=np.uint8)
Industry3Dummy = pd.get_dummies(dataset['IndustryLevel3'], drop_first=True, dtype=np.uint8)
StateDummy = pd.get_dummies(dataset['MailingState'], drop_first=True, dtype=np.uint8)
LeadSourceDummy = pd.get_dummies(dataset['LeadSource'], drop_first=True, dtype=np.uint8)
# Drop the original columns that have been dummied; MailingCity is dropped
# as well without being encoded.
dataset.drop(
    [
        'Contact_status__c',
        'WorkplaceFunction',
        'IndustryLevel1',
        'IndustryLevel2',
        'IndustryLevel3',
        'MailingState',
        'LeadSource',
        'MailingCity',
    ],
    axis=1,
    inplace=True,
)
# Keep only the target and the MobilePhoneFlag column from the base frame
base_columns = ['Target', 'MobilePhoneFlag']
dataset = dataset[base_columns]
# Append the dummy frames column-wise.
# Author's note: also including StateDummy in this list caused
# "LinAlgError: Singular matrix"; the combination below did not.
model_frames = [
    dataset,
    ContactDummy,
    WFunctionDummy,
    Industry1Dummy,
    LeadSourceDummy,
]
dataset = pd.concat(model_frames, axis=1)
#Create a view of your dataset
# Remove the intermediate dummy DataFrames from the namespace
del (
    ContactDummy,
    WFunctionDummy,
    Industry1Dummy,
    Industry2Dummy,
    Industry3Dummy,
    StateDummy,
    LeadSourceDummy,
)
# Build the feature matrix (everything except Target) and the target vector
X = dataset.drop('Target', axis=1)
y = dataset['Target']
# Split into training and test sets: 25% held out, fixed random_state.
# train_test_split is the one imported from sklearn.model_selection at the
# top of the file; the old sklearn.cross_validation module was removed in
# scikit-learn 0.20, so it is no longer re-imported here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
# Fit the logistic regression with statsmodels (sm is imported at the top
# of the file) and print the summary table.
# NOTE(review): no constant column is added to X_train here, so the model
# is presumably fitted without an intercept - confirm this is intended.
# NOTE(review): the reported "LinAlgError: Singular matrix" is presumably
# caused by constant or linearly dependent columns in X_train rather than
# by a limit on the number of columns - confirm, e.g. by comparing
# np.linalg.matrix_rank(X_train) with X_train.shape[1].
logit_model = sm.Logit(y_train, X_train)
result = logit_model.fit()
print(result.summary2())