Although the problem in the title is specific (the outputs converge to the same value only with sigmoid, while with the other activation functions the cost simply does not decrease), my network is broken in general, and after many hours of debugging and testing I cannot figure out why, even after finding and fixing minor mistakes and working through backpropagation on paper for the first training epoch to confirm it matches what the function actually does.
I hate to admit that I don't know what is wrong with my own network. I would appreciate it if someone could look it over and hint at which part(s) of my implementation are incorrect.
What I have already tried
- Different activation functions
- Different weight initialization methods (both Xavier and random)
Example (XOR)
Results after 20,000 epochs:
Inputs = 1.000000 0.000000, Target Outputs = 1.000000, Predicted Outputs = 0.028415.
Inputs = 0.000000 1.000000, Target Outputs = 1.000000, Predicted Outputs = 0.028452.
Inputs = 1.000000 1.000000, Target Outputs = 0.000000, Predicted Outputs = 0.028426.
Inputs = 0.000000 0.000000, Target Outputs = 0.000000, Predicted Outputs = 0.028441.
Note
Since I am not providing data.c (it would make this post very long, as it loads the data from CSV files), please take it on faith that, once initInput and initTargetOutput have been applied to them in the train method of model.c, input and targetOutput contain the input values [1 or 0, 1 or 0] and the target output value [1 or 0]. I have verified this thoroughly myself.
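In case anyone wants to reproduce this without the CSV files, a hardcoded stand-in for data.c along the following lines should behave the same. This is only a sketch: fillXorData is a hypothetical replacement for my fill function, and it assumes nothing beyond the two struct Data fields that model.c actually uses (elements and numberOfEntries).
#include <stdlib.h>
#include "data.h"
// Hypothetical stand-in for fill(): hardcodes the XOR truth table
// (columns: input A, input B, target output) instead of reading a CSV file.
void fillXorData(struct Data* data) {
    static const double xorTable[4][3] = {
        {0, 0, 0},
        {0, 1, 1},
        {1, 0, 1},
        {1, 1, 0},
    };
    data->numberOfEntries = 4;
    data->elements = malloc(sizeof(double*) * 4);
    for (int entryIndex = 0; entryIndex < 4; entryIndex++) {
        data->elements[entryIndex] = malloc(sizeof(double) * 3);
        for (int column = 0; column < 3; column++)
            data->elements[entryIndex][column] = xorTable[entryIndex][column];
    }
}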
GitHub
The repository containing the files below is here. My apologies for not including it in the original post. Note that the repository contains many code sections wrapped in #if / #else macros (one for gradient checking, the others for toggling printing); those are beyond the scope of this post, so I have removed them from the files below.
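For context, the gradient check those macros wrap is just a finite-difference comparison against the analytic gradients. A minimal single-weight version, written against the model.c API shown below, would look roughly like this (checkWeightGradient is illustrative and not in the repository):
// Finite-difference estimate of dC/dw for one weight (sketch).
// Assumes a single-output model and one (input, targetOutput) pair;
// the result should closely match the corresponding weightGradients entry.
double checkWeightGradient(struct Model* model, double input[], const double targetOutput[],
                           int layerIndex, int endNeuronIndex, int startNeuronIndex) {
    const double epsilon = 1e-5;
    double original = model->weights[layerIndex][endNeuronIndex][startNeuronIndex];
    model->weights[layerIndex][endNeuronIndex][startNeuronIndex] = original + epsilon;
    propagateInputForward(model, input);
    double costPlus = model->getCost(model->values[OUTPUT_LAYER][0], targetOutput[0]);
    model->weights[layerIndex][endNeuronIndex][startNeuronIndex] = original - epsilon;
    propagateInputForward(model, input);
    double costMinus = model->getCost(model->values[OUTPUT_LAYER][0], targetOutput[0]);
    model->weights[layerIndex][endNeuronIndex][startNeuronIndex] = original;
    return (costPlus - costMinus) / (2 * epsilon);
}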
model.c
#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include "model.h"
#include "functions.h"
/**
* @param model
* @param input A pointer to an input array of size <code>model.neuronsPerLayer[INPUT_LAYER]</code>
* that holds the inputs of the model.
*/
void setInput(struct Model* model, double input[]) {
// Copy the inputs rather than aliasing the caller's (stack-allocated) array,
// so the input-layer buffer allocated in initValues stays valid.
memcpy(model->values[INPUT_LAYER], input, sizeof(double) * model->neuronsPerLayer[INPUT_LAYER]);
}
void propagateInputForward(struct Model* model, double input[]) {
setInput(model, input);
for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
int startLayerIndex = endLayerIndex - 1;
int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
double weightedSum = 0.0;
double bias = model->biases[endLayerIndex][endNeuronIndex];
for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
double weight = model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex];
double startNeuronValue = model->values[startLayerIndex][startNeuronIndex];
double weightedInfluence = weight * startNeuronValue;
weightedSum += weightedInfluence;
}
weightedSum += bias;
double activatedNeuronValue = model->getActivation(weightedSum);
model->values[endLayerIndex][endNeuronIndex] = activatedNeuronValue;
}
}
}
/**
* Accumulates, for a single training entry, the gradients of the cost function
* with respect to every weight and bias.
*
* @param model The model which the parameter gradients will be based on.
* @param targetOutput The target output values for the entry that was just propagated forward.
* @param weightGradients The weight gradients to fill.
* @param biasGradients The bias gradients to fill.
*/
void updateParameterGradients(struct Model *model, const double* targetOutput, double** weightGradients[],
double* biasGradients[]) {
int outputNeuronCount = model->neuronsPerLayer[OUTPUT_LAYER];
// Entry indexed by [layerIndex][neuronIndex] gives
// Δ C / Δ Z[layerIndex, neuronIndex]
double* errors[NUMBER_OF_LAYERS];
errors[OUTPUT_LAYER] = malloc(sizeof(double) * outputNeuronCount);
// Fill errors of output layers
for (int outputNeuronIndex = 0; outputNeuronIndex < outputNeuronCount; outputNeuronIndex++) {
double outputNeuronValue = model->values[OUTPUT_LAYER][outputNeuronIndex];
double targetOutputNeuronValue = targetOutput[outputNeuronIndex];
// Δ C_outputNeuronIndex / Δ A[OUTPUT_LAYER][outputNeuronIndex]
double firstErrorComponent = model->getCostDerivative(outputNeuronValue, targetOutputNeuronValue);
// Δ A[OUTPUT_LAYER][outputNeuronIndex] / Δ Z[OUTPUT_LAYER][outputNeuronIndex]
double secondErrorComponent = model->getActivationDerivative(outputNeuronValue);
// Δ C_outputNeuronIndex / Δ Z[OUTPUT_LAYER][outputNeuronIndex]
double error = firstErrorComponent * secondErrorComponent;
errors[OUTPUT_LAYER][outputNeuronIndex] = error;
}
// Fill errors of non-output layers
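// Backprop recurrence: error[start] = activationDerivative(value[start]) *
// sum over end neurons of (weight[end][start] * error[end])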
for (int endLayerIndex = OUTPUT_LAYER; endLayerIndex > INPUT_LAYER; endLayerIndex--) {
int startLayerIndex = endLayerIndex - 1;
int startNeuronsCount = model->neuronsPerLayer[startLayerIndex];
int endNeuronsCount = model->neuronsPerLayer[endLayerIndex];
errors[startLayerIndex] = malloc(sizeof(double) * startNeuronsCount);
for (int startNeuronIndex = 0; startNeuronIndex < startNeuronsCount; startNeuronIndex++) {
double error = 0.0;
for (int endNeuronIndex = 0; endNeuronIndex < endNeuronsCount; endNeuronIndex++) {
double nextError = errors[endLayerIndex][endNeuronIndex];
double nextWeight = model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex];
double activationValue = model->values[startLayerIndex][startNeuronIndex];
double activationValueDelta = model->getActivationDerivative(activationValue);
double errorInfluence = nextWeight * nextError * activationValueDelta;
error += errorInfluence;
}
errors[startLayerIndex][startNeuronIndex] = error;
}
}
// Update weights and biases of all layers based on errors
for (int endLayerIndex = OUTPUT_LAYER; endLayerIndex > INPUT_LAYER; endLayerIndex--) {
int startLayerIndex = endLayerIndex - 1;
int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
double endNeuronError = errors[endLayerIndex][endNeuronIndex];
double biasGradientInfluence = endNeuronError;
biasGradients[endLayerIndex][endNeuronIndex] += biasGradientInfluence;
for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
double startNeuronValue = model->values[startLayerIndex][startNeuronIndex];
double weightGradientInfluence = endNeuronError * startNeuronValue;
weightGradients[endLayerIndex][endNeuronIndex][startNeuronIndex] += weightGradientInfluence;
}
}
}
// Release the per-layer error buffers
for (int layerIndex = INPUT_LAYER; layerIndex <= OUTPUT_LAYER; layerIndex++)
free(errors[layerIndex]);
}
/**
* Updates the weight and bias values within {@code model}, given the gradients of the cost function
* with respect to the weights and biases.
*
* @param model
* @param weightGradients
* @param biasGradients
* @param batchSize The number of entries over which the gradients were accumulated.
*/
void updateParameterValues(struct Model* model, double** weightGradients[], double* biasGradients[], int batchSize) {
for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
int startLayerIndex = endLayerIndex - 1;
int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
double biasDelta = biasGradients[endLayerIndex][endNeuronIndex];
biasDelta /= batchSize;
biasDelta *= model->learningRate;
// update bias
model->biases[endLayerIndex][endNeuronIndex] -= biasDelta;
for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
double weightDelta = weightGradients[endLayerIndex][endNeuronIndex][startNeuronIndex];
weightDelta /= batchSize;
weightDelta *= model->learningRate;
// update weight
model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex] -= weightDelta;
}
}
}
}
static int epochIndex = 0;
void initGradients(struct Model* model, double** weightGradients[], double* biasGradients[]) {
for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
int startLayerIndex = endLayerIndex - 1;
int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
biasGradients[endLayerIndex] = malloc(sizeof(double) * endNeuronCount);
weightGradients[endLayerIndex] = malloc(sizeof(double*) * endNeuronCount);
for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
biasGradients[endLayerIndex][endNeuronIndex] = 0.0;
weightGradients[endLayerIndex][endNeuronIndex] = malloc(sizeof(double) * startNeuronCount);
for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++)
weightGradients[endLayerIndex][endNeuronIndex][startNeuronIndex] = 0.0;
}
}
}
/**
* Feeds the input values of the entry into the input array given.
*
* @param input
* @param entry
* @param inputColumnIndices
* @param inputColumnIndicesCount
*/
void initInput(double input[], const double entry[], const int inputColumnIndices[], int inputColumnIndicesCount) {
for (int inputColumnIndex = 0; inputColumnIndex < inputColumnIndicesCount; inputColumnIndex++) {
int inputColumn = inputColumnIndices[inputColumnIndex];
input[inputColumnIndex] = entry[inputColumn];
}
}
/**
* Feeds the target output values of entry given into the target output array given.
*
* @param targetOutput
* @param entry
* @param outputColumnIndices
* @param outputColumnIndicesCount
*/
void initTargetOutput(double targetOutput[], const double entry[], const int outputColumnIndices[], int outputColumnIndicesCount) {
for (int outputColumnIndex = 0; outputColumnIndex < outputColumnIndicesCount; outputColumnIndex++) {
int outputColumn = outputColumnIndices[outputColumnIndex];
targetOutput[outputColumnIndex] = entry[outputColumn];
}
}
void test(struct Model* model, struct Data* data, int inputColumnIndices[], int outputColumnIndices[], double** predictedOutputs, double costs[]) {
int inputNeuronCount = model->neuronsPerLayer[INPUT_LAYER];
int outputNeuronCount = model->neuronsPerLayer[OUTPUT_LAYER];
for (int entryIndex = 0; entryIndex < data->numberOfEntries; entryIndex++) {
double *entry = data->elements[entryIndex];
double input[inputNeuronCount];
double targetOutput[outputNeuronCount];
initInput(input, entry, inputColumnIndices, inputNeuronCount);
initTargetOutput(targetOutput, entry, outputColumnIndices, outputNeuronCount);
// forward propagation
propagateInputForward(model, input);
double cost = 0.0;
for (int outputIndex = 0; outputIndex < outputNeuronCount; outputIndex++) {
double value = model->values[OUTPUT_LAYER][outputIndex];
predictedOutputs[entryIndex][outputIndex] = value;
double targetValue = targetOutput[outputIndex];
cost += model->getCost(value, targetValue);
}
// Take average cost
cost /= outputNeuronCount;
costs[entryIndex] = cost;
}
}
void freeGradients(struct Model* model, double** weightGradients[], double* biasGradients[]) {
for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
free(biasGradients[endLayerIndex]);
int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
for (int neuronIndex = 0; neuronIndex < endNeuronCount; neuronIndex++)
free(weightGradients[endLayerIndex][neuronIndex]);
// Also free the per-layer array of row pointers allocated in initGradients
free(weightGradients[endLayerIndex]);
}
}
/**
* Trains the model on the given data.
*
* @param model
* @param data Container for the data the model will be trained on.
* @param inputColumnIndices The indices of the columns within {@code data} that are the input columns.
* @param outputColumnIndices The indices of the columns within {@code data} that are the output columns.
*/
void train(struct Model* model, struct Data* data, int inputColumnIndices[], int outputColumnIndices[]) {
// For both weightGradients and biasGradients, index 0 is not occupied.
// [endLayerIndex][endNeuronIndex in layerIndex][startNeuronIndex in layerIndex - 1]
double** weightGradients[NUMBER_OF_LAYERS];
// [endLayerIndex][endNeuronIndex]
double* biasGradients[NUMBER_OF_LAYERS];
// Allocate the storage for the weight and bias gradients and
// initialize them all to 0
initGradients(model, weightGradients, biasGradients);
int inputNeuronCount = model->neuronsPerLayer[INPUT_LAYER];
int outputNeuronCount = model->neuronsPerLayer[OUTPUT_LAYER];
epochIndex++;
// Feed each input into model
for (int entryIndex = 0; entryIndex < data->numberOfEntries; entryIndex++) {
double* entry = data->elements[entryIndex];
double input[inputNeuronCount];
double targetOutput[outputNeuronCount];
// Feed values of entry into input and targetOutput given indices of input and output columns
initInput(input, entry, inputColumnIndices, inputNeuronCount);
initTargetOutput(targetOutput, entry, outputColumnIndices, outputNeuronCount);
// forward propagation
propagateInputForward(model, input);
// update weight and bias gradients based on this entry, part of the batch
updateParameterGradients(model, targetOutput, weightGradients, biasGradients);
}
updateParameterValues(model, weightGradients, biasGradients, data->numberOfEntries);
freeGradients(model, weightGradients, biasGradients);
}
/**
* Allocates the memory for the parameters (weights and biases) of the model, in addition to initializing
* them to their default values.
*
* @param model
*/
void initParameters(struct Model* model) {
// Initialize weights and biases with their configured initial values
for (int endLayerIndex = 1; endLayerIndex < NUMBER_OF_LAYERS; endLayerIndex++) {
int endNeuronCount = model->neuronsPerLayer[endLayerIndex];
int startLayerIndex = endLayerIndex - 1;
int startNeuronCount = model->neuronsPerLayer[startLayerIndex];
model->weights[endLayerIndex] = malloc(sizeof(double*) * endNeuronCount);
// Allocate the bias array once per layer rather than once per neuron
model->biases[endLayerIndex] = malloc(sizeof(double) * endNeuronCount);
for (int endNeuronIndex = 0; endNeuronIndex < endNeuronCount; endNeuronIndex++) {
model->weights[endLayerIndex][endNeuronIndex] = malloc(sizeof(double) * startNeuronCount);
model->biases[endLayerIndex][endNeuronIndex] = model->getInitialBiasValue(startNeuronCount, endNeuronCount);
for (int startNeuronIndex = 0; startNeuronIndex < startNeuronCount; startNeuronIndex++) {
model->weights[endLayerIndex][endNeuronIndex][startNeuronIndex] = model->getInitialWeightValue(startNeuronCount, endNeuronCount);
}
}
}
}
/**
* Allocates the memory for the neuron values of the model.
*
* @param model
*/
void initValues(struct Model* model) {
for (int layerIndex = 0; layerIndex < NUMBER_OF_LAYERS; layerIndex++) {
int neuronsInLayer = model->neuronsPerLayer[layerIndex];
model->values[layerIndex] = malloc(sizeof(double) * neuronsInLayer);
}
}
main.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h> // for chdir
#include <time.h>
#include "model.h"
#include "functions.h"
#include "data.h"
#define EPOCH_COUNT 20000
#define NUMBER_OF_COLUMNS 3
#define TRAIN_ENTRIES_SIZE 4
#define TEST_ENTRIES_SIZE 4
int main() {
time_t currentTime;
time(&currentTime);
srand(currentTime);
struct Model model = {
.neuronsPerLayer = {2, 2, 1},
.learningRate = 0.02,
// Default values
.getActivation = applySigmoid,
.getActivationDerivative = applySigmoidDerivative,
.getCost = getCost,
.getCostDerivative = getCostDerivative,
.getInitialWeightValue = getInitialRandomWeight,
.getInitialBiasValue = getInitialBias,
};
int numberOfInputs = model.neuronsPerLayer[INPUT_LAYER];
int numberOfOutputs = model.neuronsPerLayer[OUTPUT_LAYER];
// Change working directory so data can be referenced relative to parent data folder
chdir("..");
struct Data trainData;
fill(&trainData, "data/xor/train.csv", NUMBER_OF_COLUMNS, TRAIN_ENTRIES_SIZE);
struct Data testData;
fill(&testData, "data/xor/test.csv", NUMBER_OF_COLUMNS, TEST_ENTRIES_SIZE);
int inputColumnIndices[numberOfInputs];
int outputColumnIndices[numberOfOutputs];
inputColumnIndices[0] = 0;
inputColumnIndices[1] = 1;
outputColumnIndices[0] = 2;
initValues(&model);
initParameters(&model);
for (int epochIndex = 0; epochIndex < EPOCH_COUNT; epochIndex++)
train(&model, &trainData, inputColumnIndices, outputColumnIndices);
exit(0);
}
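For completeness: the results quoted at the top were printed by a testing block that I trimmed out of main (right before exit(0)) to keep the post short. Reconstructed from memory, it was essentially the following, so treat the allocation details as approximate; it relies only on test() from model.c:
// Trimmed testing/printing block (sketch)
double* predictedOutputs[TEST_ENTRIES_SIZE];
double costs[TEST_ENTRIES_SIZE];
for (int entryIndex = 0; entryIndex < TEST_ENTRIES_SIZE; entryIndex++)
    predictedOutputs[entryIndex] = malloc(sizeof(double) * numberOfOutputs);
test(&model, &testData, inputColumnIndices, outputColumnIndices, predictedOutputs, costs);
for (int entryIndex = 0; entryIndex < TEST_ENTRIES_SIZE; entryIndex++) {
    printf("Inputs =");
    for (int i = 0; i < numberOfInputs; i++)
        printf(" %f", testData.elements[entryIndex][inputColumnIndices[i]]);
    printf(", Target Outputs =");
    for (int i = 0; i < numberOfOutputs; i++)
        printf(" %f", testData.elements[entryIndex][outputColumnIndices[i]]);
    printf(", Predicted Outputs =");
    for (int i = 0; i < numberOfOutputs; i++)
        printf(" %f", predictedOutputs[entryIndex][i]);
    printf(".\n");
}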
functions.c
#include <stdlib.h>
#include "functions.h"
#include "math.h"
double applySigmoid(double weightedSum) {
// 1 / (1 + e^(-z)); equivalent to e^z / (e^z + 1) but avoids overflow for large z
return 1.0 / (1.0 + exp(-weightedSum));
}
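// Note: receives the already-activated value a = sigmoid(z), not z itself,
// since sigmoid'(z) = a * (1 - a); model.c passes activations from model->values.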
double applySigmoidDerivative(double activationValue) {
return activationValue * (1 - activationValue);
}
double applyReLU(double weightedSum) {
return weightedSum < 0 ? 0 : weightedSum;
}
double applyReLUDerivative(double activationValue) {
return activationValue == 0 ? 0 : 1;
}
double applyTanH(double weightedSum) {
// tanh(x) = 2 * sigmoid(2x) - 1
return 2 * applySigmoid(2 * weightedSum) - 1;
}
double applyTanHDerivative(double activationValue) {
return 1 - pow(activationValue, 2);
}
double getInitialXavierWeight(double previousLayerSize, double layerSize) {
return sqrt(2 / previousLayerSize);
}
double getInitialRandomWeight(double previousLayerSize, double layerSize) {
return ((double) rand() / RAND_MAX) * 0.01;
}
double getInitialBias(double previousLayerSize, double layerSize) {
return 0;
}
double getCost(double neuronValue, double intendedValue) {
double difference = neuronValue - intendedValue;
return 0.5 * pow(difference, 2);
}
double getCostDerivative(double neuronValue, double intendedValue) {
return neuronValue - intendedValue;
}
If any other files are needed, please ask.