I propose the following code, which implements a multivariate regression model:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets


class LinearRegression:
    def __init__(self, learning_rate=0.0001, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        # since we have three independent variables, we initialize 4 weights (bias + one per feature) with zeros
        self.weights = np.array([[0.0], [0.0], [0.0], [0.0]])

    def update_param(self, x_featureset, y_targets, weights):
        """
        x_featureset - (160,3)
        y_targets - (160,1)
        predictions - (160,1)
        weights - (4,1)
        """
        predictions = self.predict(x_featureset, weights)
        # extract the features
        x1 = x_featureset[:, 0]
        x2 = x_featureset[:, 1]
        x3 = x_featureset[:, 2]
        # calculate partial derivatives
        d_w0 = -(y_targets - predictions)
        d_w1 = -x1 * (y_targets - predictions)
        d_w2 = -x2 * (y_targets - predictions)
        d_w3 = -x3 * (y_targets - predictions)
        # multiply each derivative by the learning rate and subtract it from the corresponding weight
        weights[0][0] -= self.lr * np.mean(d_w0)
        weights[1][0] -= self.lr * np.mean(d_w1)
        weights[2][0] -= self.lr * np.mean(d_w2)
        weights[3][0] -= self.lr * np.mean(d_w3)
        return weights

    def cost_function(self, x_featureset, y_targets, weights):
        """
        x_featureset - (160,3)
        y_targets - (160,1)
        predictions - (160,1)
        weights - (4,1)
        """
        total_observation = len(y_targets)
        predictions = self.predict(x_featureset, weights)
        sq_error = (y_targets - predictions) ** 2
        # mean squared error divided by 2
        return 1.0 / (2 * total_observation) * sq_error.sum()

    def normalize(self, x_featureset):
        """
        x_featureset - (160,3)
        x_featureset.T - (3,160)
        """
        for features in x_featureset.T:
            fmean = np.mean(features)
            frange = np.amax(features) - np.amin(features)
            # vector subtraction
            features -= fmean
            # vector division
            features /= frange
        return x_featureset

    def train(self, x, y):
        cost_history = []
        # normalize independent variables
        x = self.normalize(x)
        for i in range(self.n_iters):
            self.weights = self.update_param(x, y, self.weights)
            cost = self.cost_function(x, y, self.weights)
            cost_history.append(cost)
            # log progress
            if i % 10 == 0:
                print("cost: {}".format(cost))

    def predict(self, x_featureset, weights):
        """
        featureset - (160,3)
        weights - (4,1)
        predictions - (160,1)
        """
        # Y = W0 + W1*X1 + W2*X2 + W3*X3
        y_predicted = weights[0, :] + np.dot(x_featureset, weights[1:, :])
        return y_predicted


# generate sample data using sklearn
def generate_data():
    x, y = datasets.make_regression(n_samples=200, n_features=3, noise=20, random_state=4)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1234)
    return (x_train, x_test, y_train, y_test)


# create a model instance
model = LinearRegression()
x_train, x_test, y_train, y_test = generate_data()
# fit the data
model.train(x_train, y_train)
Output:
cost: 980808.7969914433
cost: 980757.9150537294
cost: 980707.1372473323
cost: 980656.4633691043
cost: 980605.8932163038
cost: 980555.4265865949
cost: 980505.0632780452
cost: 980454.8030891262
cost: 980404.6458187121
cost: 980354.5912660785
cost: 980304.6392309029
cost: 980254.7895132622
cost: 980205.0419136335
cost: 980155.3962328921
cost: 980105.8522723109
cost: 980056.4098335612
cost: 980007.0687187086
cost: 979957.8287302161
cost: 979908.6896709399
cost: 979859.6513441313
cost: 979810.7135534338
cost: 979761.8761028836
cost: 979713.138796909
cost: 979664.5014403281
cost: 979615.9638383496
cost: 979567.5257965708
cost: 979519.1871209786
cost: 979470.9476179467
cost: 979422.8070942352
cost: 979374.7653569917
cost: 979326.8222137484
cost: 979278.9774724214
cost: 979231.2309413117
cost: 979183.5824291029
cost: 979136.031744861
cost: 979088.578698033
cost: 979041.2230984472
cost: 978993.9647563117
cost: 978946.8034822136
cost: 978899.7390871193
cost: 978852.7713823715
cost: 978805.9001796913
cost: 978759.1252911751
cost: 978712.4465292948
cost: 978665.8637068978
cost: 978619.3766372039
cost: 978572.9851338081
cost: 978526.689010676
cost: 978480.4880821462
cost: 978434.3821629275
cost: 978388.3710680995
cost: 978342.4546131112
cost: 978296.6326137795
cost: 978250.9048862904
cost: 978205.271247197
cost: 978159.7315134181
cost: 978114.2855022394
cost: 978068.9330313113
cost: 978023.6739186477
cost: 977978.5079826281
cost: 977933.4350419926
cost: 977888.4549158453
cost: 977843.5674236503
cost: 977798.7723852337
cost: 977754.0696207809
cost: 977709.4589508367
cost: 977664.9401963042
cost: 977620.5131784454
cost: 977576.177718878
cost: 977531.9336395771
cost: 977487.7807628732
cost: 977443.7189114518
cost: 977399.7479083528
cost: 977355.8675769694
cost: 977312.0777410483
cost: 977268.3782246873
cost: 977224.7688523371
cost: 977181.2494487979
cost: 977137.8198392204
cost: 977094.4798491052
cost: 977051.2293043006
cost: 977008.0680310033
cost: 976964.9958557582
cost: 976922.0126054548
cost: 976879.1181073303
cost: 976836.3121889662
cost: 976793.5946782889
cost: 976750.9654035685
cost: 976708.4241934177
cost: 976665.9708767924
cost: 976623.6052829901
cost: 976581.3272416494
cost: 976539.1365827485
cost: 976497.0331366067
cost: 976455.0167338816
cost: 976413.0872055686
cost: 976371.2443830017
cost: 976329.488097852
cost: 976287.8181821259
cost: 976246.2344681664
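For reference, the per-weight updates and the cost above can also be written as a single vectorized step. This is a minimal sketch, assuming y has been reshaped to a column vector of shape (n, 1) as the docstrings describe; the function name vectorized_step and the bias-column trick are my additions, not part of the original code.

import numpy as np

def vectorized_step(x, y, weights, lr):
    """One gradient-descent step; x is (n, 3), y is (n, 1), weights are (4, 1)."""
    n = len(y)
    # prepend a column of ones so the bias W0 is handled by the same dot product
    x_bias = np.hstack([np.ones((n, 1)), x])      # (n, 4)
    predictions = np.dot(x_bias, weights)         # (n, 1)
    error = y - predictions                       # (n, 1)
    gradient = -np.dot(x_bias.T, error) / n       # (4, 1), same as the four np.mean(d_w*) values
    weights = weights - lr * gradient
    cost = np.sum(error ** 2) / (2 * n)
    return weights, cost

Folding the bias into the feature matrix keeps all four partial derivatives in one matrix product, which is equivalent to the separate d_w0..d_w3 updates above.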
Update:
Testing with lr=0.001, since convergence with the learning rate above was too slow, and with the number of iterations increased to 100,000, I found that the model converges around the following cost values.
cost: 959301.8925571552
cost: 959298.6367338672
cost: 959296.3380453996
cost: 959294.9824055596
cost: 959294.5560072181
cost: 959295.0453167808
cost: 959296.4370687702
cost: 959298.7182605114
cost: 959301.8761469286
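For completeness, the run described in the update would presumably be started as sketched below, reusing LinearRegression, x_train, x_test and y_test from the listing above. The evaluation on the held-out split is my addition and not part of the original code; it also reuses normalize on the test features for simplicity, although strictly the training-set statistics should be applied there.

# re-run with the hyperparameters mentioned in the update
model = LinearRegression(learning_rate=0.001, n_iters=100000)
model.train(x_train, y_train)

# hypothetical check on the held-out split (not in the original code)
x_test_norm = model.normalize(x_test)
predictions = model.predict(x_test_norm, model.weights)
# reshape y_test to a column vector so the shapes match the (n, 1) predictions
test_mse = np.mean((y_test.reshape(-1, 1) - predictions) ** 2)
print("test MSE: {}".format(test_mse))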