I built a simple gradient descent algorithm (for multiple linear regression) from scratch in Python. It did not work (because of the constant learning rate), so to fix that I added adaptive learning rates with Adagrad. But every time I run it, the step sizes come out the same for all parameters, and it still does not converge. Here is a link to the full script: https://pastebin.com/GkjCUavZ (it is also pasted below). Please help; I have been stuck on this for a while.
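For reference, the per-parameter Adagrad update I am trying to reproduce looks roughly like this (a minimal standalone sketch with made-up names, not code taken from my script):

import numpy as np

def adagrad_update(params, grads, cache, initial_learning_rate=0.01, epsilon=1e-8):
    # accumulate the squared gradient separately for every parameter
    cache = cache + grads**2
    # each parameter gets its own effective step size: lr / sqrt(accumulated squared gradients)
    return params - initial_learning_rate * grads / np.sqrt(cache + epsilon), cache

So each parameter should end up with its own step size, but in my script the step sizes come out the same for every parameter. The full script: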
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from random import randint
import matplotlib.pyplot as plt
class Loss():
    def __init__(self, loss_func, batch_size=1):  # takes the name of the loss function and a batch_size, which defaults to 1
        self.loss_func = loss_func  # stored so the user can check which loss function they used via the loss_func attribute
        self.equation = []  # equation of the line of best fit; it will be populated after the model has trained
        self.data = pd.read_csv('https://raw.githubusercontent.com/SuvroBaner/Python-for-Data-Science-and-Machine-Learning-Bootcamp/master/10.%20Linear-Regression/Ecommerce%20Customers')
        self.data = self.data[['Avg. Session Length','Time on App','Time on Website','Length of Membership','Yearly Amount Spent']]
        self.x = self.data.drop('Yearly Amount Spent', axis=1)  # features of the data
        self.y = self.data['Yearly Amount Spent']  # labels
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x, self.y, test_size=0.1)
        self.gradients_of_intercept = []  # a list of gradients for the intercept
        self.gradients_of_param1 = []  # a list of gradients for the first param
        self.gradients_of_param2 = []  # a list of gradients for the second param
        self.gradients_of_param3 = []  # a list of gradients for the third param
        self.gradients_of_param4 = []  # a list of gradients for the fourth param
    ## this function receives the training set and converts it to the form [feature1, feature2, feature3, feature4, label]
    def get_pairs(self, x_train, y_train):
        pairs = []  # the pairs will go here
        counter = 0  # used to get the label
        for feature in x_train.values:  # for every data point in the training set (just the features)
            list_feature = list(feature)  # turn the vector into a list so the label can be appended to it
            list_feature.append(y_train.values[counter])  # append its corresponding label
            pairs.append(list_feature)  # append all of that to the pairs list
            counter += 1  # add one to the counter
        return pairs  # return a list of lists whose [-1] element is the label and the rest are the features
    def SSR(self, intercept, param1, param2, param3, param4, batch_size):  # takes the parameters, draws a random batch, and computes the derivatives
        data = self.get_pairs(self.x_train, self.y_train)  # gets a dataset in the form ==> [feature1, feature2, feature3, feature4, label]
        random_points = None  # used as a placeholder for the random data points
        training_batch = []
        if batch_size <= self.x_train.values.shape[0]:  # the batch can be at most as large as the training set
            random_points = np.random.randint(self.x_train.values.shape[0], size=batch_size)  # get a vector of size batch_size of random row numbers
        else:
            raise IndexError("batch_size must be less than or equal to the size of the training set.")
        for random_row_number in random_points:  # for each random row number
            training_batch.append(data[random_row_number])  # append the data point in that row to the training_batch
        # running totals of the gradients of the parameters
        d_intercept_total = 0
        d_param1_total = 0
        d_param2_total = 0
        d_param3_total = 0
        d_param4_total = 0
        for data_point in training_batch:  # for every data point in the training_batch
            f1 = data_point[0]  # first feature
            f2 = data_point[1]  # second feature
            f3 = data_point[2]  # third feature
            f4 = data_point[3]  # fourth feature
            label = data_point[4]  # the label
            # derivative of the sum of squared residuals with respect to the intercept, using the chain rule
            d_intercept = -2*(label - (intercept + (f1*param1) + (f2*param2) + (f3*param3) + (f4*param4)))
            # derivative of the loss function (sum of squared residuals) with respect to the first parameter, using the chain rule
            d_param1 = 2*(0 - f1)*(label - (intercept + (f1*param1) + (f2*param2) + (f3*param3) + (f4*param4)))
            # derivative of the loss function with respect to the second parameter, using the chain rule
            d_param2 = 2*(0 - f2)*(label - (intercept + (f1*param1) + (f2*param2) + (f3*param3) + (f4*param4)))
            # derivative of the loss function with respect to the third parameter, using the chain rule
            d_param3 = 2*(0 - f3)*(label - (intercept + (f1*param1) + (f2*param2) + (f3*param3) + (f4*param4)))
            # derivative of the loss function with respect to the fourth parameter, using the chain rule
            d_param4 = 2*(0 - f4)*(label - (intercept + (f1*param1) + (f2*param2) + (f3*param3) + (f4*param4)))
            ### add the gradient from this data point to the running total for each parameter
            d_intercept_total += d_intercept
            d_param1_total += d_param1
            d_param2_total += d_param2
            d_param3_total += d_param3
            d_param4_total += d_param4
        self.gradients_of_intercept.append(d_intercept_total)  # append the intercept gradient to the list of gradients for the intercept
        self.gradients_of_param1.append(d_param1_total)  # append the first param's gradient to the list of gradients for the first param
        self.gradients_of_param2.append(d_param2_total)  # same as above, but for the second param
        self.gradients_of_param3.append(d_param3_total)  # same as above, but for the third param
        self.gradients_of_param4.append(d_param4_total)  # same as above, but for the fourth param
        return d_intercept_total, d_param1_total, d_param2_total, d_param3_total, d_param4_total
    def get_alpha_t(self, t):  # used to calculate a different learning rate for each parameter
        ## alpha_t_intercept is the sum of the squares of all intercept gradients up to step t. One is added to t because of indexing:
        # e.g. on the first step t = 0, and x[0:0] would not return anything
        alpha_t_intercept = np.sum(np.array(self.gradients_of_intercept[:t+1])**2)
        ## alpha_t_param1 is the sum of the squares of all gradients of the first param up to step t
        alpha_t_param1 = np.sum(np.array(self.gradients_of_param1[:t+1])**2)
        ## alpha_t_param2 is the sum of the squares of all gradients of the second param up to step t
        alpha_t_param2 = np.sum(np.array(self.gradients_of_param2[:t+1])**2)
        ## alpha_t_param3 is the sum of the squares of all gradients of the third param up to step t
        alpha_t_param3 = np.sum(np.array(self.gradients_of_param3[:t+1])**2)
        ## alpha_t_param4 is the sum of the squares of all gradients of the fourth param up to step t
        alpha_t_param4 = np.sum(np.array(self.gradients_of_param4[:t+1])**2)
        return alpha_t_intercept, alpha_t_param1, alpha_t_param2, alpha_t_param3, alpha_t_param4  # return the alpha_ts
    def predict(self, x):
        # self.equation is stored as [intercept, param1, param2, param3, param4],
        # so the slopes are equation[1:5] and the intercept is equation[0]
        return np.sum(np.array(self.equation[1:5]) * np.array(x)) + self.equation[0]
    def fit(self, epsilon, initial_learning_rate, batch_size, steps, verbose=0):  # fits the model to the training data
        # randomly initialize the model's parameters
        intercept, param1, param2, param3, param4 = randint(0, 15), randint(0, 15), randint(0, 15), randint(0, 15), randint(0, 15)
        if self.loss_func == 'SSR':  # if the user chose the sum of squared residuals as their loss function
            for i in range(steps):
                intercept_point, param1_slope, param2_slope, param3_slope, param4_slope = self.SSR(intercept, param1, param2, param3, param4, batch_size)  # get the gradients of the params
                alpha_t = self.get_alpha_t(i)  # accumulated squared gradients for every parameter up to step i
                learning_rate_intercept = initial_learning_rate/np.sqrt(alpha_t[0] + epsilon)  # learning rate of the intercept at step i
                learning_rate_param1 = initial_learning_rate/np.sqrt(alpha_t[1] + epsilon)  # learning rate of the first param at step i
                learning_rate_param2 = initial_learning_rate/np.sqrt(alpha_t[2] + epsilon)  # learning rate of the second param at step i
                learning_rate_param3 = initial_learning_rate/np.sqrt(alpha_t[3] + epsilon)  # learning rate of the third param at step i
                learning_rate_param4 = initial_learning_rate/np.sqrt(alpha_t[4] + epsilon)  # learning rate of the fourth param at step i
                intercept = intercept - (intercept_point*learning_rate_intercept)  # get the new intercept
                param1 = param1 - (param1_slope*learning_rate_param1)  # get the new first slope
                param2 = param2 - (param2_slope*learning_rate_param2)  # get the new second slope
                param3 = param3 - (param3_slope*learning_rate_param3)  # get the new third slope
                param4 = param4 - (param4_slope*learning_rate_param4)  # get the new fourth slope
                if verbose != 0:  # if the user wants to see the step sizes for each parameter
                    print(i, [intercept_point*learning_rate_intercept], [param1_slope*learning_rate_param1], [param2_slope*learning_rate_param2], [param3_slope*learning_rate_param3], [param4_slope*learning_rate_param4])
        # store the final equation and write it to a file for easy access (so the parameters can be reused instead of training again)
        self.equation = [intercept, param1, param2, param3, param4]  # the equation of the line/plane of best fit
        with open('equation.txt', 'a+') as g:
            g.write(str(self.equation))  # write the equation in the form [intercept, param1, param2, param3, param4] to the file
a = Loss('SSR')
a.fit(10**-8,0.01,5,100,9)