Adagrad algorithm returns almost the same learning rate for every parameter
0 votes
/ 01 April 2020

I built a simple gradient descent algorithm (with multiple linear regression) from scratch in Python. It didn't work (because of the constant learning rate), so to fix that I implemented adaptive learning rates with Adagrad. But every time I run it, the step sizes come out the same for all parameters, and it doesn't converge either. Here is a link to the whole script: https://pastebin.com/GkjCUavZ. Please help, I've been stuck on this for a while.
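
For clarity, this is the per-parameter Adagrad update rule I am trying to reproduce (just a minimal sketch with made-up names, not the code from my script):

import numpy as np

def adagrad_step(param, grad, accum, initial_lr=0.01, epsilon=1e-8):
    # accum is the running sum of this parameter's squared gradients
    accum = accum + grad**2
    # every parameter gets its own effective learning rate from its own accumulator
    lr = initial_lr / np.sqrt(accum + epsilon)
    return param - lr*grad, accum

Here is my code: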

import numpy as np
import pandas as pd
from sklearn.model_selection import  train_test_split
from random import randint
import  matplotlib.pyplot as plt

class Loss():
    def __init__(self,loss_func,batch_size=1):  # give it the name of the loss_function, and the batch_size which is usually 1.     
        self.loss_func = loss_func  # the user can see the loss function they used if they call the loss_func attribute
        self.equation = []   # equation of the line of best fit. it will be populated after the model has trained
        self.data = pd.read_csv('https://raw.githubusercontent.com/SuvroBaner/Python-for-Data-Science-and-Machine-Learning-Bootcamp/master/10.%20Linear-Regression/Ecommerce%20Customers')
        self.data = self.data[['Avg. Session Length','Time on App','Time on Website','Length of Membership','Yearly Amount Spent']]
        self.x = self.data.drop('Yearly Amount Spent',axis = 1)  # features of the data
        self.y = self.data['Yearly Amount Spent']   # labels
        self.x_train,self.x_test,self.y_train,self.y_test = train_test_split(self.x,self.y,test_size = 0.1)
        self.gradients_of_intercept = []  # a list of gradients for the intercept
        self.gradients_of_param1 = []  # a list of gradients for the first param
        self.gradients_of_param2 = []  # a list of gradients for the second param
        self.gradients_of_param3 = []  # a list of gradients for the third param
        self.gradients_of_param4 = []  # a list of gradients for the fourth param

    ## this function receives the training set and converts it to the form [feature1,feature2,feature3,feature4,label]
    def get_pairs(self,x_train,y_train):
        pairs = []  # the pairs will go here
        counter = 0  ## used to get the label
        for feature in x_train.values:  # for every data point in the training set (just the features)
            list_feature = list(feature)  # turn the vector into a list so stuff can be added to it
            list_feature.append(y_train.values[counter])  # append its corresponding label
            pairs.append(list_feature)   # append all of that to the pairs list
            counter+=1   # add one to the counter
        return pairs    # return a list of lists whose [-1] element is the label and the rest are the features

    def SSR(self,intercept,param1,param2,param3,param4,batch_size):  # gets the parameters and the features to make a prediction. it also computes the derivatives

        data = self.get_pairs(self.x_train,self.y_train)   # gets a dataset in the form ==> [feature1,feature2,feature3,feature4,label]
        random_points = None # used as a place holder for the random data points
        training_batch = []
        if(batch_size <= self.x_train.values.shape[0]): # if the batch_size is at most the size of the whole training set
            random_points = np.random.randint(self.x_train.values.shape[0],size = batch_size)  # get a vector of size batch_size of random row numbers
        else:
            raise IndexError("batch_size must be less than or equal to the size of the training set.")

        for random_row_number in random_points:  # get the random row numbers
            training_batch.append(data[random_row_number]) # append the data point that is in that row to the training_batch

        # total gradients of the parameters
        d_intercept_total = 0   
        d_param1_total = 0
        d_param2_total = 0
        d_param3_total = 0
        d_param4_total = 0

        for data_point in training_batch: # for data point in the training_batch

            f1 = data_point[0]  # first feature
            f2 = data_point[1]  # second feature
            f3 = data_point[2]  # third feature
            f4 = data_point[3] # fourth feature
            label = data_point[4]  # the label


            d_intercept = -2*(label-(intercept + (f1*param1) + (f2*param2) + (f3*param3)+ (f4*param4) ))  # the derivative of the sum_of_the_squared_residuals function
            # with respect to the intercept using the chain rule

            ## get the derivative of the loss function with respect to the first parameter using the chain rule
            d_param1 = 2*(0-f1)*(label-(intercept+ (f1*param1) + (f2*param2) + (f3*param3)+ (f4*param4) )) # the derivative of the loss function (sum of squared residuals) with respect to the first param

            ## get the derivative of the loss function with respect to the second parameter using the chain rule
            d_param2 = 2*(0-f2)*(label-(intercept+ (f1*param1) + (f2*param2) + (f3*param3)+ (f4*param4) )) # the derivative of the function with respect to the second param

            ## get the derivative of the loss function with respect to the third parameter using the chain rule
            d_param3 = 2*(0-f3)*(label-(intercept+ (f1*param1) + (f2*param2) + (f3*param3)+ (f4*param4) )) # the derivative of the function with respect to the third param

            ## get the derivative of the loss function with respect to the fourth parameter using the chain rule
            d_param4 = 2*(0-f4)*(label-(intercept+ (f1*param1) + (f2*param2) + (f3*param3)+ (f4*param4) )) # the derivative of the function with respect to the fourth param

            ### sum of all the gradients for each parameter (add all the slopes from each parameter to the total for that parameter)
            d_intercept_total += d_intercept 
            d_param1_total += d_param1
            d_param2_total += d_param2
            d_param3_total += d_param3
            d_param4_total += d_param4
        self.gradients_of_intercept.append(d_intercept_total) # append the intercept gradient to the list of gradients for the intercept
        self.gradients_of_param1.append(d_param1_total) # append the first param's gradient to the list of gradients for the first param
        self.gradients_of_param2.append(d_param2_total) # same as above, but for the second param
        self.gradients_of_param3.append(d_param3_total) # same as above, but for the third param
        self.gradients_of_param4.append(d_param4_total) # same as above, but for the fourth param

        return d_intercept_total,d_param1_total,d_param2_total,d_param3_total,d_param4_total

    def get_alpha_t(self,t):     # used to calculate a different learning rate for each parameter

        ## alpha_t_intercept is the sum of the squared gradients of the intercept up to step t. one is added to t because of indexing:
        # e.g. if it's the first step then t = 0, and doing x[0:0] would not return anything

        alpha_t_intercept = np.sum(np.array(self.gradients_of_intercept[:t+1])**2)

        ## alpha_t_param1 is the sum of the squared gradients of the first param up to step t
        alpha_t_param1 = np.sum(np.array(self.gradients_of_param1[:t+1])**2)

        ## alpha_t_param2 is the sum of the squared gradients of the second param up to step t
        alpha_t_param2 = np.sum(np.array(self.gradients_of_param2[:t+1])**2)

        ## alpha_t_param3 is the sum of the squared gradients of the third param up to step t
        alpha_t_param3 = np.sum(np.array(self.gradients_of_param3[:t+1])**2)

        ## alpha_t_param4 is the sum of the squared gradients of the fourth param up to step t
        alpha_t_param4 = np.sum(np.array(self.gradients_of_param4[:t+1])**2)

        return alpha_t_intercept,alpha_t_param1,alpha_t_param2,alpha_t_param3,alpha_t_param4 # return the alpha_ts

    def predict(self,x):
        # self.equation is stored as [intercept,param1,param2,param3,param4], so the intercept is element 0 and the slopes are elements 1 to 4
        return np.sum(np.array(self.equation[1:5]) * np.array(x)) + self.equation[0]
    #### (the get_pairs function above is what zips the features and labels together ====> [feature1,feature2,feature3,feature4,label])

    def fit(self,epsilon,initial_learning_rate,batch_size,steps,verbose = 0):  #  fits the model to the training data 

        intercept,param1,param2,param3,param4 = randint(0,15),randint(0,15),randint(0,15),randint(0,15),randint(0,15)  # do the random initialization for the model

        if self.loss_func == 'SSR': # if the person chose sum of the squared residuals as their loss function

            for i in range(steps):
                intercept_point,param1_slope,param2_slope,param3_slope,param4_slope = self.SSR(intercept,param1,param2,param3,param4,batch_size) # get the gradients of the params

                alpha_t = self.get_alpha_t(i)  # the accumulated squared gradients of every parameter at step i
                learning_rate_intercept = initial_learning_rate/np.sqrt(alpha_t[0]+epsilon)  # learning rate of the intercept at step i
                learning_rate_param1 = initial_learning_rate/np.sqrt(alpha_t[1]+epsilon)  # learning rate of the first param at step i
                learning_rate_param2 = initial_learning_rate/np.sqrt(alpha_t[2]+epsilon)  # learning rate of the second param at step i
                learning_rate_param3 = initial_learning_rate/np.sqrt(alpha_t[3]+epsilon)  # learning rate of the third param at step i
                learning_rate_param4 = initial_learning_rate/np.sqrt(alpha_t[4]+epsilon)  # learning rate of the fourth param at step i

                intercept = intercept-(intercept_point*learning_rate_intercept)  # get the new intercept
                param1 = param1-(param1_slope*learning_rate_param1) # get the new first slope
                param2 = param2-(param2_slope*learning_rate_param2) # get the new second slope
                param3 = param3-(param3_slope*learning_rate_param3) # get the new third slope
                param4 = param4-(param4_slope*learning_rate_param4) # get the new fourth slope

                if verbose != 0: # if the user wants to see the information. 
                    # print the step size taken for each parameter at this iteration
                    print(i,[intercept_point*learning_rate_intercept],[param1_slope*learning_rate_param1],[param2_slope*learning_rate_param2],[param3_slope*learning_rate_param3],[param4_slope*learning_rate_param4])

            # write the final equation to a file for easy access (so the parameters can be reused instead of training again)

            self.equation = [intercept,param1,param2,param3,param4]   ## store the equation of the line/plane of best fit
            with open('equation.txt','a+') as g:
                g.write(str(self.equation))  # write the equation in the form [intercept,param1,param2,param3,param4] to a file 

a = Loss('SSR')
a.fit(10**-8,0.01,5,100,9)
...
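
For what it's worth, this is the behaviour I expected the per-parameter accumulators to produce (a tiny standalone check, not part of the script above): parameters whose gradients differ in magnitude should end up with clearly different learning rates.

import numpy as np

accum = np.zeros(2)                       # squared-gradient accumulators for two parameters
for grads in ([0.5, 50.0], [0.4, 40.0]):  # gradients of very different magnitudes
    accum += np.array(grads)**2
    print(0.01 / np.sqrt(accum + 1e-8))   # the two learning rates should clearly diverge

In my script, however, the printed step sizes stay almost identical across all five parameters.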