Я работаю в сфере страхования и хотел бы сравнить наши стандартные регрессионные модели GLM Пуассона с байесовскими моделями.Однако я не уверен, как включить в модель как категориальные, так и непрерывные предикторы.Я пробовал код ниже:
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
from pymc3 import Model, Normal, HalfNormal, Poisson, find_MAP, NUTS, sample, summary, traceplot
import theano
#create fake data
n = 1000
y = np.random.poisson(lam = 1, size = n)
lower = 5000
upper = 150000
mu, sigma = 60000, 30000
X = stats.truncnorm(
(lower - mu) / sigma, (upper - mu) / sigma, loc=mu, scale=sigma)
value = X.rvs(1000)
married = ['m', 's', 's', 'm', 'm']
marital_status = married * 200
credit = ['GF', '375', '400', '425', '450', '475', '500', '525', '550', '575', '600', '625', '650',
'675', '700', '725', '750', '775', '800', 'T', '575', '600', '625', '650',
'675', '700', '725', '750', '775', '800', '750', '775', '800', '750', '775', '800'
,'775', '800', '775', '800', '700', '725', '750', '775', '800', 'T', '575', '600', '625',
'650']
credit_group = credit * 20
df = pd.DataFrame(columns = ['claims'], data = y)
df['value'] = value
df['marital_status'] = marital_status
df['credit_group'] = credit_group
marital_dummies = pd.get_dummies(df['marital_status'], drop_first=True)
credit_dummies = pd.get_dummies(df['credit_group'], drop_first=True)
df['single_status'] = marital_dummies
df = pd.concat([df, credit_dummies], axis = 1)
df.drop(['marital_status', 'credit_group'], axis = 1, inplace = True)
df.head()
value_shared = theano.shared(df['value'].values)
single_shared = theano.shared(df['single_status'].values)
x3_shared = theano.shared(df['400'].values)
x4_shared = theano.shared(df['425'].values)
x5_shared = theano.shared(df['450'].values)
x6_shared = theano.shared(df['475'].values)
x7_shared = theano.shared(df['500'].values)
x8_shared = theano.shared(df['525'].values)
x9_shared = theano.shared(df['550'].values)
x10_shared = theano.shared(df['575'].values)
x11_shared = theano.shared(df['600'].values)
x12_shared = theano.shared(df['625'].values)
x13_shared = theano.shared(df['650'].values)
x14_shared = theano.shared(df['675'].values)
x15_shared = theano.shared(df['700'].values)
x16_shared = theano.shared(df['725'].values)
x17_shared = theano.shared(df['750'].values)
x18_shared = theano.shared(df['775'].values)
x19_shared = theano.shared(df['800'].values)
x20_shared = theano.shared(df['GF'].values)
x21_shared = theano.shared(df['T'].values)
#
np.random.seed(123)
poisson_model = Model()
with poisson_model:
#priors for unknown model paramters
alpha = Normal('alpha', mu = 0, sd = 1)
beta = Normal('beta', mu = 0, sd = 1, shape = 2)
sigma = HalfNormal('sigma', sd = 1)
x1 = single_shared
x2 = value_shared
x3 = x3_shared
x4 = x4_shared
x5 = x5_shared
x6 = x6_shared
x7 = x7_shared
x8 = x8_shared
x9 = x9_shared
x10 = x10_shared
x11 = x11_shared
x12 = x12_shared
x13 = x13_shared
x14 = x14_shared
x15 = x15_shared
x16 = x16_shared
x17 = x17_shared
x18 = x18_shared
x19 = x19_shared
x20 = x20_shared
x21 = x21_shared
mu = alpha + beta[0]*alpha + beta[1]*x1 + beta[2]*x2 + beta[3]*x3 + beta[4]*x4 + beta[5]*x5 + beta[6]*x6+ beta[7]*x7+ beta[8]*x8+ beta[9]*x9+ beta[10]*x10+ beta[11]*x11+ beta[12]*x12 + beta[13]*x13+ beta[14]*x14+ beta[15]*x15+ beta[16]*x16+ beta[17]*x17+ beta[18]*x18+ beta[19]*x19+ beta[20]*x20+ beta[21]*x21
claims = Poisson('claims', mu = mu, sigma = sigma, observed = df['claims'])
start = find_MAP(fmin=optimize.fmin_powell)
step = NUTS(scaling = start)
trace = sample(1000, step, start = start)
Я не уверен, почему, но я продолжаю получать ошибку «Ошибка индекса: индекс выходит за границы».