Your code is very slow, learn to use vectorization, and you are computing the derivatives incorrectly. I haven't double-checked the code, but below is a vectorized version of your code with the correct gradients; see how fast it runs this way.
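Quick note before the code (standard logistic-regression math, just a summary for reference, not something taken from your post): with pᵢ = sigmoid(w·xᵢ), the gradient of LLF = Σᵢ (yᵢ log(pᵢ) + (1 − yᵢ) log(1 − pᵢ)) is ∂LLF/∂wⱼ = Σᵢ (yᵢ − pᵢ) xᵢⱼ, i.e. ∇LLF = Xᵀ(y − p) in matrix form. The (yᵢ − pᵢ) factor already covers both the yᵢ = 0 and yᵢ = 1 terms, so the whole gradient is a single matrix product: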
import scipy.io
import numpy as np
dataset = scipy.io.loadmat('dataset.mat')
data = dataset['hog_features_train'].astype('float64') # Size is [2000, 324]
bias_term = np.ones(shape=(2000,1))
data = np.concatenate(( bias_term , data), axis=1) # add bias term as an extra 1 in data features
labels = dataset['superclass_labels_train'].astype('float16') # Size is [2000, 1]
NUMBER_OF_FEATURES = data.shape[1] # Is 325
# Initialize weights; w[0] is the bias, since the column of 1s was prepended above
w = np.random.normal(0, 0.01, NUMBER_OF_FEATURES)
# linear(x) = w₀·1 + w₁x₁ + ⋯ + wₙxₙ  (the bias w₀ multiplies the 1s column)
def linear(w, observation):
    return np.matmul(observation, w)
# sigmoid(t) = 1 / (1 + exp(−t))
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
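# Aside (my note, not part of the original): np.exp(-x) overflows and warns for
# large negative x; scipy.special.expit is a numerically stable drop-in if needed:
#   from scipy.special import expit   # expit(x) == 1 / (1 + exp(-x))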
# prob(x) = sigmoid(linear(x)) = 1 / (1 + exp(−linear(x)))
def prob(w, observation):
    return sigmoid(linear(w, observation))
# LLF = Σᵢ (yᵢ log(prob(xᵢ)) + (1 − yᵢ) log(1 − prob(xᵢ)))
def log_likelihood(w, data, labels):
    p = prob(w, data)   # shape (2000,)
    y = labels.ravel()  # flatten (2000, 1) -> (2000,) so y * p stays a vector
    return np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))
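# Optional (my addition): a numerically stable variant, since np.log(p) hits -inf
# when p underflows to 0 or rounds to exactly 1; with z = linear(w, data):
#   log(sigmoid(z))     == -np.logaddexp(0, -z)
#   log(1 - sigmoid(z)) == -np.logaddexp(0, z)
# so LLF = np.sum(-y * np.logaddexp(0, -z) - (1 - y) * np.logaddexp(0, z))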
# ∇LLF = Xᵀ(y − p); no need to treat the bias separately, the 1s column handles it
def gradient(w, data, labels):
    residual = labels.ravel() - prob(w, data)  # yᵢ − pᵢ, shape (2000,)
    return np.matmul(data.T, residual)         # Xᵀ(y − p), shape (325,)
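# Optional sanity check (my addition, not part of the original answer): compare the
# analytic gradient against a central finite-difference estimate on a few weights.
def gradient_check(w, data, labels, eps=1e-6, n_checks=5):
    analytic = gradient(w, data, labels)
    for j in range(n_checks):
        w_plus, w_minus = w.copy(), w.copy()
        w_plus[j] += eps
        w_minus[j] -= eps
        numeric = (log_likelihood(w_plus, data, labels)
                   - log_likelihood(w_minus, data, labels)) / (2 * eps)
        print('weight', j, 'analytic:', analytic[j], 'numeric:', numeric)
# gradient_check(w, data, labels)  # uncomment to verify the gradient before training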
LEARNING_RATE = 0.0001
EPOCH = 1000
# Log-likelihood before training
likelihood = log_likelihood(w, data, labels)
print('likelihood at the beginning: ', likelihood)
# Gradient ascent algorithm
for i in range(EPOCH):
    w += gradient(w, data, labels) * LEARNING_RATE
    likelihood = log_likelihood(w, data, labels)
    print('likelihood after epoch', i + 1, ': ', likelihood)
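To see what the vectorization buys you, here is a quick timing comparison against a per-sample Python loop (my sketch of the slow style, not your exact code; absolute numbers depend on the machine):

import time

def gradient_loop(w, data, labels):
    # Same ∇LLF = Xᵀ(y − p), but accumulated one observation at a time
    grad = np.zeros_like(w)
    for i in range(data.shape[0]):
        p_i = sigmoid(np.dot(data[i], w))
        grad += (labels[i, 0] - p_i) * data[i]
    return grad

start = time.perf_counter()
gradient_loop(w, data, labels)
print('loop gradient:      ', time.perf_counter() - start, 's')

start = time.perf_counter()
gradient(w, data, labels)
print('vectorized gradient:', time.perf_counter() - start, 's')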