I implemented a regularized neural network from scratch using NumPy to classify the MNIST dataset. When I compare my model's results with those of a tf.keras Sequential model, the results differ significantly.
All hyperparameters (batch_size, learning_rate, lambda i.e. the regularization rate, and the number of epochs) are identical for the two networks. I initialized all kernels to zeros, used full-batch gradient descent, and set shuffle=False to remove any randomness. Both models are built with the same optimizer (SGD), loss function (MSE), regularizer (L2 regularization), and activation function (sigmoid in every layer).
Interestingly, though, when I set λ = 0 the results (training loss, accuracy, weights, gradients, etc.) match exactly. So I suspect the problem is in how Keras implements its regularizers compared with how I implemented mine.
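For reference, this is the quick check I have in mind to compare the two penalty terms on identical weights. My reading that Keras's l2(lamda) contributes lamda * sum(w**2) per kernel is an assumption I have not verified; the weight matrices and lamda value below are placeholders, not my actual training state:

import numpy as np

# Hypothetical check: evaluate both penalty forms on the same weight matrices.
W = [np.random.randn(30, 784), np.random.randn(10, 30)]  # placeholder weights
lamda = 5.0                                              # placeholder value

keras_penalty = lamda * sum(np.sum(w**2) for w in W)              # assumed Keras l2(lamda) form
my_penalty = 0.5 * lamda * sum(np.linalg.norm(w)**2 for w in W)   # the term in my calcCost

print(keras_penalty, my_penalty)  # differ by a factor of 2 if my reading is right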
Keras model:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2

def kerasModel(lamda, lr):
    model = Sequential()
    model.add(Dense(30, input_dim=784, activation="sigmoid",
                    kernel_regularizer=l2(lamda),
                    kernel_initializer='zeros',  # rn(stddev=1)
                    bias_initializer='zeros'))
    model.add(Dense(10, activation="sigmoid",
                    kernel_regularizer=l2(lamda),
                    kernel_initializer='zeros',  # rn(stddev=1)
                    bias_initializer='zeros'))
    optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
    model.compile(optimizer=optimizer,
                  loss='mse',
                  metrics=['accuracy'])
    return model
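For completeness, this is roughly how I train the Keras model (the hyperparameter values here are placeholders, not my actual ones, and x_train/y_train are the flattened images and one-hot labels):

model = kerasModel(lamda=5.0, lr=3.0)     # placeholder hyperparameters
model.fit(x_train, y_train,               # x_train: (N, 784), y_train: one-hot (N, 10)
          epochs=30,                      # placeholder
          batch_size=len(x_train),        # full-batch gradient descent
          shuffle=False)                  # shuffling disabled to remove randomness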
My model:
class MyNeuralNet:
    def __init__(self, layerArray):
        """
        layerArray: array of dimensions of layers.
        size of layerArray is the number of layers in our network
        """
        self.layers = layerArray
        self.B = []
        self.W = []
        self.input = None
        for layerNum in range(1, len(layerArray)):  # 1st layer is input so we exclude that
            biasVector = np.zeros((layerArray[layerNum], 1))
            self.B.append(biasVector)
            weightsMatrix = np.zeros((layerArray[layerNum], layerArray[layerNum-1]))
            self.W.append(weightsMatrix)
    def calcCost(self, data, lamda):
        """ Calculates the loss """
        j = 0
        for data_ in data:
            yPred = self.forwardPass(data_[0])
            y = data_[1].reshape((10, 1))
            costEx = cost(yPred, y)  # cost of one example
            j += sum(costEx)
        j = j/len(data)
        # L2 penalty: 0.5 * lamda * sum of squared Frobenius norms of the weight matrices
        j += 0.5*lamda*sum(
            [np.linalg.norm(w)**2 for w in self.W])
        return j
    def netSize(self):
        """ Number of layers in the network excluding the input layer """
        return len(self.layers) - 1
    def activateLayer(self, z, func=0):
        """ Applies the specified activation function to the layer z.
        0 for sigmoid
        1 for ReLU """
        if func == 1:
            return relu(z)
        elif func == 0:
            return sigmoid(z)
        else:
            raise Exception("Activation function can either be 0 (sigmoid) or 1 (relu)")
    def derivatieActivateLayer(self, z, func=0):
        """ Applies the derivative of the specified activation function to the layer z.
        0 for sigmoid
        1 for ReLU """
        if func == 1:
            z = np.array(z)
            return (z > 0).astype(float)  # ReLU derivative: 1 where z > 0, else 0
        elif func == 0:
            z = np.array(z)
            sig = self.activateLayer(z, 0)
            return sig*(1 - sig)  # sigmoid derivative: sigmoid(z)*(1 - sigmoid(z))
        else:
            raise Exception("Activation function can either be 0 (sigmoid) or 1 (relu)")
    def forwardPass(self, layer, func=0):
        """ Outputs the output layer by performing a forward pass """
        layer = layer.reshape((784, 1))
        for i in range(self.netSize()):
            layer = np.dot(self.W[i], layer) + self.B[i]
            layer = self.activateLayer(layer, func)
        return layer
    def backPropagate(self, x, y, func=0):
        """
        Backpropagates through the network to compute gradients.
        """
        # initializing gradients
        dW = []
        dB = []
        for i in range(self.netSize()):
            dW.append(np.zeros(self.W[i].shape))
            dB.append(np.zeros(self.B[i].shape))
        outputLayers = []        # Z's
        activeOutputLayers = []  # sigmoid of Z's, i.e. g(Z)
        x = x[0].reshape((784, 1))
        activeOutput = x  # input layer
        activeOutputLayers.append(activeOutput)
        for b, w in zip(self.B, self.W):
            output = np.dot(w, activeOutput) + b
            outputLayers.append(output)
            activeOutput = self.activateLayer(output, func)
            activeOutputLayers.append(activeOutput)
        # kept as lists: the per-layer arrays have different shapes, so wrapping
        # them in np.array would create a ragged (object) array
        n = self.netSize()
        # output layer
        dZ = derivateCost(activeOutput, y) * self.derivatieActivateLayer(output, func)
        dW[n-1] = np.dot(dZ, activeOutputLayers[-2].T)
        dB[n-1] = dZ
        # hidden layers (this indexing assumes a single hidden layer, as in the 784-30-10 network compared here)
        for l in range(n-1):
            dZ = np.dot(self.W[n-1-l].T, dZ) * self.derivatieActivateLayer(outputLayers[n-2-l], func)
            dB[l] = dZ
            dW[l] = np.dot(dZ, activeOutputLayers[max(0, n-3-l)].T)
        return dB, dW
    def train(self, train, validation, epochs, batchSize, lr, lamda=0, func=0):
        """
        Performs gradient descent and updates the network.
        train: training data
        validation: validation data
        epochs: number of iterations
        lamda: regularization rate
        batchSize: the size of the batch for gradient descent.
        func: activation function, func = 0 means sigmoid.
        """
        for i in range(epochs):
            for batch in dataIter(batchSize, train):
                xBatch, yBatch = batch[:, :-1], batch[:, -1]
                dW = []
                dB = []
                for j in range(self.netSize()):
                    dW.append(np.zeros(self.W[j].shape))
                    dB.append(np.zeros(self.B[j].shape))
                for x, y in zip(xBatch, yBatch):
                    gradB, gradW = self.backPropagate(x, y, func)
                    # summing the gradients over all examples in the mini-batch
                    dW = [w + gradw for w, gradw in zip(dW, gradW)]
                    dB = [b + gradb for b, gradb in zip(dB, gradB)]
                for j in range(self.netSize()):
                    # L2 weight decay followed by the averaged gradient step
                    self.W[j] = self.W[j]*(1 - lamda*lr) - (lr/batchSize)*dW[j]
                    self.B[j] = self.B[j] - (lr/batchSize)*dB[j]
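And this is roughly how I call it. The values are placeholders; trainData and valData are assumed to be object arrays of shape (N, 2) whose rows hold the flattened 784-pixel image and the one-hot 10-vector label, which is the layout backPropagate and calcCost expect:

net = MyNeuralNet([784, 30, 10])     # same architecture as the Keras model
net.train(trainData, valData,
          epochs=30,                 # placeholder
          batchSize=len(trainData),  # full-batch gradient descent
          lr=3.0,                    # placeholder
          lamda=5.0)                 # placeholder regularization rate
print(net.calcCost(trainData, lamda=5.0))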
Helper functions:
import random
import numpy as np

def sigmoid(z):
    return 1/(1 + np.exp(-z))

def relu(z):
    return np.maximum(0, z)

def cost(yPred, y):
    # per-example MSE over the 10 output units (factor 0.1 = 1/10)
    return 0.1*(yPred - y.reshape((10, 1)))**2

def derivateCost(yPred, y):
    return 0.2*(yPred - y.reshape((10, 1)))

def dataIter(batchSize, data):
    # random.shuffle(data)  # shuffling disabled to keep runs deterministic
    batches = []
    for i in range(0, data.shape[0], batchSize):
        batches.append(data[i:i+batchSize])
    return batches
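Finally, the check behind my claim that the results "match exactly" at λ = 0 is essentially the sketch below. It assumes the model and net objects trained above, and that Keras stores Dense kernels as (input_dim, units) so my (out, in) matrices need a transpose; both are assumptions on my part:

# Sketch: compare the two models after training with identical settings.
kerasW = model.get_weights()   # [kernel_1, bias_1, kernel_2, bias_2]
for j in range(net.netSize()):
    kernel, bias = kerasW[2*j], kerasW[2*j + 1]
    print("layer", j,
          "weights match:", np.allclose(net.W[j], kernel.T),
          "biases match:", np.allclose(net.B[j].flatten(), bias))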