Я запрограммировал нейронную сеть на C++, но когда она обучается задаче XOR, в итоге она всегда выдаёт 0,5.
Я выводил формулы для градиентного спуска самостоятельно, поэтому они могут быть ошибочными:
dE/dLi = how much the output of the ith layer affects the final error
dLi/dZi = how much each value of Z = W*x+b affects the output of the activation function in the ith layer
dldl - это просто (dLi / dZi) / (dE / dLi).
dE/dx — это то, как вход i-го слоя влияет на ошибку, и это также то, как выходные данные (i-1)-го слоя влияют на ошибку, поэтому она распространяется обратно на предыдущий слой как dE/dLi-1.
Формулы, используемые для обновления весов, смещений и обратного распространения ошибки, таковы:
w -= learning_rate * (dLi/dZi)/(dE/dLi)*(Xi-1)^T = learning_rate * (dldl)*(Xi-1)^T
b -= learning_rate * (dLi/dZi)/(dE/dLi) = learning_rate * dldl
dE/dX = dE/dLi-1 = (Wi)^T * (dLi/dZi)/(dE/dLi) = (Wi)^T * dldl
Вот код, я включил весь файл .hh и части .cc, которые я считаю актуальными (я могу при необходимости опубликовать код других функций). Я также включу .hh использованного мной класса матрицы:
.hh
#ifndef NEURAL_NETWORK_HH
#define NEURAL_NETWORK_HH
#include <iostream>
#include <vector>
#include <fstream>
#include <exception>
#include <cmath>
#include "matrix.hh"
//----------------------------------------------//
// NEURAL NETWORK EXCEPTIONS CLASS //
//----------------------------------------------//
// Exception type thrown by neural_network on invalid parameters or I/O errors.
// Holds a non-owning pointer to a static error-message string.
class neural_exception : public std::exception {
    const char* message;  // not owned; must outlive the exception object
public:
    neural_exception(const char* error) : message(error) {}
    // Description of the failure; never throws.
    const char* what() const throw() { return message; }
};
/* NOTE: All these functions take a bprop flag.
   If bprop == false, the function itself is applied in place:  m = L(m)
   If bprop == true, the derivative is applied instead:          m = L'(m)
   (for the activation functions the backward result is the diagonal
   Jacobian matrix of the function, evaluated at the entries of m) */
//----------------------------------------------//
//                COST FUNCTIONS                 //
//----------------------------------------------//
/*
index in COST_FUNC:
0: quadratic
*/
double quadratic (matrix& output, const matrix& correct, bool bprop = false);
/* pre: output and correct are single-column matrices with the same shape
   post: returns 0.5 * sum((output[i][j] - correct[i][j])^2) when
   bprop == false, and 0 when bprop == true.
   NOTE: output is modified in BOTH modes — it becomes (output - correct),
   which with bprop == true is the gradient dE/dL of the quadratic cost. */
//----------------------------------------------//
//             ACTIVATION FUNCTIONS              //
//----------------------------------------------//
/*
index in ACT_FUNC:
0: ReLu
1: sigmoid
*/
void ReLu (matrix& m, bool bprop = false);
/* pre: m is a single-column matrix
   post: forward: every element smaller than 0 is clamped to 0, the rest are
   left as they are. backward (bprop): m becomes the diagonal Jacobian with
   1 on the diagonal where the input element was >= 0, and 0 elsewhere. */
void sigmoid (matrix& m, bool bprop = false);
/* pre: m is a single-column matrix
   post: forward: each element x becomes y = 1/(1 + e^(-x)), mapping every
   value into (0, 1). backward (bprop): m becomes the diagonal Jacobian of
   the sigmoid evaluated at the entries of m. */
//----------------------------------------------//
// DATA STRUCTURES //
//----------------------------------------------//
// Aggregates every parameter needed to construct a neural_network.
// (Plain struct instead of the C-style `typedef struct { } name;`.)
struct neural_data {
    int number_of_inputs;                   // length of the input column vector
    std::vector<int> shape;                 // neurons per layer, one entry per layer
    std::vector<int> activation_functions;  // index into ACT_FUNC, one per layer
    int cost_function;                      // index into COST_FUNC
    double value_w;                         // initial weight value (or random range)
    double value_b;                         // initial bias value (or random range)
    bool random_w;                          // initialize weights randomly?
    bool random_b;                          // initialize biases randomly?
};
// All the cost functions available; a neural_network stores an index
// (cost_func) into this table and calls the cost function through it.
const std::vector<double (*)(matrix&, const matrix&, bool)> COST_FUNC =
{
quadratic
};
// Printable names parallel to COST_FUNC; only used in the draw method.
const std::vector<std::string> COST_FUNC_NAMES =
{
"quadratic"
};
// All the activation functions available; each layer stores an index
// (act_func) into this table. 0: ReLu, 1: sigmoid.
const std::vector<void (*)(matrix&, bool)> ACT_FUNC =
{
ReLu,
sigmoid
};
// Printable names parallel to ACT_FUNC; only used in the draw method.
const std::vector<std::string> ACT_FUNC_NAMES =
{
"ReLu",
"sigmoid"
};
//----------------------------------------------//
//             NEURAL NETWORK CLASS              //
//----------------------------------------------//
// Fully-connected feed-forward neural network trained by gradient descent.
class neural_network {
std::vector<matrix> w; // weight matrix of each layer
std::vector<matrix> b; // bias column vector of each layer
std::vector<matrix> x; // cached input of each layer (and the final output) from the last forward pass; filled only when save_x is true
std::vector<int> act_func; // index into ACT_FUNC, one per layer
int cost_func; // index into COST_FUNC
bool save_x; // if true, calculate() caches the results of each layer in x
public:
// CONSTRUCTORS
neural_network(std::string file_path);
/* pre: there is a file with the data from a neural_network object in the
specified path (as written by save_to_file)
post: *this is equal to the neural network saved in the file */
neural_network (neural_data data);
/* pre: true
post: all the data for the neural network is introduced via the data
struct.
Necessary data:
- Number of inputs
- Number of layers
- Size of each layer
- Activation functions (one per layer)
- Cost function
- Range of random values for w (weights)
- Will w be initialized with random values?
- Initial value of b (biases)
- Will b be initialized with random values?
*/
neural_network (const neural_network& n);
/* pre: true
post: *this becomes a copy of n */
// METHODS
matrix calculate (const matrix& x);
/* pre: x is a column matrix with length number_of_inputs
post: the output is the predicted result in the form of a column matrix
with length equal to the number of outputs */
void save_to_file (std::string file_path) const;
/* pre: file_path contains the path and file name in which the data will be
stored
post: all the data of the neural network has been stored in the file. */
void draw (bool detailed = true, int precision = 2) const;
/* pre: precision is a strictly positive integer
post: the whole neural network is drawn, if detailed, with the weights and
biases. */
// MODIFIERS
void backpropagate (matrix x, const matrix& correct, double learning_rate);
/* pre: x and correct are column matrices with length number_of_inputs and
number_of_outputs respectively, and learning_rate is a strictly positive
real
post: one step of the backpropagation algorithm has been applied, updating
every w and b of the network */
void change_activation_function (int layer, int function);
/* pre: 0 <= layer < number_of_layers and function is a valid ACT_FUNC index
post: the activation function of the selected layer will be changed */
// CONSULTORS
double error (const matrix& x, const matrix& correct);
/* pre: x and correct are column matrices with length number_of_inputs and
number_of_outputs respectively
post: the returned value is the cost function value for the calculated
and the correct output provided. */
int number_of_layers () const;
/* pre: true
post: returns the number of layers */
int number_of_inputs () const;
/* pre: true
post: returns the number of inputs */
int number_of_outputs () const;
/* pre: true
post: returns the number of outputs (size of the last layer) */
std::vector<int> shape () const;
/* pre: true
post: returns a vector with the size of each layer */
};
#endif
.cc
#include "neural_network.hh"
using namespace std;
// Error messages passed to neural_exception when a precondition fails.
// (Fixed typos: "foound" -> "found", "id needed" -> "is needed".)
const char* FILE_NOT_FOUND      = "file for constructor initiation not found";
const char* ERROR_CREATING_FILE = "error while creating the file";
const char* NO_ACTIVATION_FUNC  = "activation function not found";
const char* NO_COST_FUNC        = "cost function not found";
const char* BAD_N_ACT_FUNC      = "one activation function per layer is needed";
const char* BAD_SHAPE           = "all values for shape must be bigger than 0";
const char* BAD_INPUT_SHAPE     = "incorrect shape of inputs";
const char* BAD_LAYER           = "layer out of bounds";
// COST FUNCTIONS
// Quadratic (half sum-of-squares) cost.
// Forward  (bprop == false): returns 0.5 * ||output - correct||^2.
// Backward (bprop == true):  returns 0; output holds the gradient dE/dL.
// NOTE: output is overwritten with (output - correct) in BOTH modes.
double quadratic (matrix& output, const matrix& correct, bool bprop)
{
    output -= correct;            // the difference is needed in both modes
    if (bprop)
        return 0;                 // caller only wants the gradient left in output
    // mod() is used here as the Euclidean norm of the difference vector
    const long double norm = output.mod();
    return (0.5 * norm * norm);   // half the sum of the squared errors
}
// ACTIVATION FUNCTIONS
// ReLU activation.
// Forward  (bprop == false): clamps every negative entry of the column
//   vector m to 0, leaving the rest unchanged.
// Backward (bprop == true):  m is replaced by the Jacobian of ReLU — a
//   diagonal matrix with 1 where the input entry was >= 0, 0 elsewhere.
void ReLu (matrix& m, bool bprop)
{
    const int n = m.nrows();
    if (!bprop) {
        for (int row = 0; row < n; row++)
            if (m[row][0] < 0)
                m.change(row, 0, 0);
        return;
    }
    matrix jacobian(n, n, 0);
    for (int row = 0; row < n; row++)
        if (m[row][0] >= 0)
            jacobian.change(row, row, 1);
    m = jacobian;
}
// Sigmoid activation y = 1 / (1 + e^(-x)).
// Forward  (bprop == false): applied elementwise to the column vector m.
// Backward (bprop == true):  m is replaced by the diagonal Jacobian with
//   entries s * (1 - s), where s = sigmoid(x).
// BUG FIX: the derivative was computed as e^x / (1 + e^x)^2 via
// pow(M_E, x), which overflows to inf/inf = NaN for large |x|; the
// algebraically identical s * (1 - s) form saturates to 0 instead.
// std::exp(x) also replaces the slower, less precise pow(M_E, x).
void sigmoid (matrix& m, bool bprop)
{
    if (bprop) {
        matrix result(m.nrows(), m.nrows(), 0);
        for (int i = 0; i < m.nrows(); i++) {
            const double s = 1.0 / (1.0 + std::exp(-m[i][0]));
            result.change(i, i, s * (1.0 - s));
        }
        m = result;
    } else {
        for (int i = 0; i < m.nrows(); i++)
            m.change(i, 0, 1.0 / (1.0 + std::exp(-m[i][0])));
    }
}
// Builds the network described by data: validates every field, then
// creates one weight matrix and one bias column vector per layer.
// Throws neural_exception if any parameter is inconsistent or out of range.
// (Fixes: n_rows was declared uninitialized far from its use; signed/unsigned
// comparisons; number_of_inputs was never validated.)
neural_network::neural_network (neural_data data)
{
    // exactly one activation function per layer is required
    if (data.activation_functions.size() != data.shape.size())
        throw neural_exception(BAD_N_ACT_FUNC);
    // every requested activation function must exist in ACT_FUNC
    const int n_act = static_cast<int>(data.activation_functions.size());
    for (int i = 0; i < n_act; i++) {
        if (data.activation_functions[i] < 0 or
            data.activation_functions[i] >= static_cast<int>(ACT_FUNC.size()))
            throw neural_exception(NO_ACTIVATION_FUNC);
        act_func.push_back(data.activation_functions[i]);
    }
    // the requested cost function must exist in COST_FUNC
    if (data.cost_function < 0 or
        data.cost_function >= static_cast<int>(COST_FUNC.size()))
        throw neural_exception(NO_COST_FUNC);
    cost_func = data.cost_function;
    // the input vector must have a valid (positive) length
    if (data.number_of_inputs < 1)
        throw neural_exception(BAD_SHAPE);
    // create the layers: the first is fed by the network inputs, every
    // later layer by the previous layer's outputs
    int n_cols = data.number_of_inputs;
    const int n_layers = static_cast<int>(data.shape.size());
    for (int i = 0; i < n_layers; i++) {
        if (data.shape[i] < 1) throw neural_exception(BAD_SHAPE);
        if (i != 0)
            n_cols = data.shape[i-1];
        const int n_rows = data.shape[i];
        w.push_back(matrix(n_rows, n_cols, data.value_w, data.random_w));
        b.push_back(matrix(n_rows, 1, data.value_b, data.random_b));
    }
    // save_x is only switched on during backpropagation (optimization)
    save_x = false;
}
// Forward pass: runs the column vector x through every layer, computing
// activation(W*a + b) layer by layer, and returns the last layer's output.
// When save_x is enabled, the input of every layer (plus the final output)
// is cached in this->x for later use by backpropagate.
matrix neural_network::calculate (const matrix& x)
{
    if (x.nrows() != w[0].ncols())
        throw neural_exception(BAD_INPUT_SHAPE);
    this->x.clear();
    matrix a(x);                              // activation flowing through the net
    const int n_layers = w.size();
    for (int layer = 0; layer < n_layers; layer++) {
        if (save_x)
            this->x.push_back(a.gcopy());     // cache this layer's input
        a = w[layer] * a;
        a += b[layer];
        ACT_FUNC[act_func[layer]](a, false);  // apply the layer's activation
    }
    if (save_x)
        this->x.push_back(a.gcopy());         // cache the final output too
    return a;
}
// MODIFIERS
void neural_network::backpropagate (const matrix x, const matrix& correct,
double learning_rate)
{
save_x = true;
matrix dedx = calculate(x);
COST_FUNC[cost_func](dedx, correct, true);
matrix dedw(0,0), dldl(0,0);
for (int i = w.size() - 1; i >= 0; i--) {
// calculate first dl/dl = (dLi/dZi) / (dE/dLi)
dldl = this->x[i+1].gcopy();
ACT_FUNC[act_func[i]](dldl, true);
dldl *= dedx;
// actualize bias: b = b - learning_rate * dl/dl
b[i] -= dldl.gmultn(learning_rate);
// actualize dedx: dE/dXi-1 = (Wi)^T * dl/dl
dedx = w[i].gT() * dldl;
// actualize w: w = w - learning_rate * (dl/dl * (Xi-1)^T)
w[i] -= (dldl * this->x[i].gT()).gmultn(learning_rate);
}
save_x = false;
return;
}
matrix.hh
#ifndef MATRIX_HH
#define MATRIX_HH
#include <iostream>
#include <vector>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <exception>
typedef std::vector<std::vector<double>> vector_matrix;
// CUSTOM EXCEPTION CLASS
// Thrown by matrix operations when a precondition is violated.
class matrix_exception : public std::exception {
    const char* message;  // not owned; must outlive the exception object
public:
    matrix_exception(const char* error) : message(error) {}
    // Human-readable description of the failure; never throws.
    const char* what() const throw() { return message; }
};
// CLASS MATRIX
// Dense matrix of doubles backed by a vector of row vectors.
class matrix {
vector_matrix m; // row-major storage: m[row][col]
int cols, rows;
void copy (const matrix& m);
/* pre: true.
post: the matrix is equal to m. */
public:
// CONSTRUCTORS
matrix (int rows, int cols, double val = 0.0, bool random = false);
/* pre: cols and rows are positive integers
post: a matrix of shape rows*cols, filled with val (0 as a default)
if random = false, and with random values from 0 to val (1 as a
default) otherwise. */
matrix (const matrix& m);
/* pre: true
post: creates a matrix equal to m */
// MODIFIERS
void change (int row, int col, double val);
/* pre: 0 <= col < cols and 0 <= row < rows
post: changes the element in the position row, col to val. */
void read ();
/* pre: there is a series of rows*cols doubles to read in the standard
input channel
post: the matrix has been filled with the given values, row by row:
the first row takes the first row of values and so on */
void T ();
/* pre: true
post: the matrix has been transposed in place. */
matrix gT () const;
/* pre: true
post: returns a transposed copy; *this is unchanged. */
void multn (double n);
/* pre: true
post: the matrix has been multiplied elementwise by the scalar n. */
matrix gmultn (double n) const;
/* pre: true
post: returns a copy multiplied by the scalar n; *this is unchanged. */
matrix operator+ (const matrix& m) const;
/* pre: both matrices have the same shape.
post: the returned value is a matrix with the same shape and whose
elements are the sum of both matrices. */
matrix operator- (const matrix& m) const;
/* pre: both matrices have the same shape.
post: the returned value is a matrix with the same shape and whose
elements are the difference of both matrices. */
matrix operator^ (const matrix& m) const;
/* pre: both matrices have the same shape.
post: the returned value is a matrix with the same shape and whose
elements are the elementwise (Hadamard) product of both matrices.
NOTE: this is NOT exponentiation despite the ^ symbol. */
matrix operator* (const matrix& m) const;
/* pre: the number of cols of this is equal to the number of rows of m.
post: the returned matrix is the result of the matrix product
between this and m. */
bool operator== (const matrix& m) const;
/* pre: true.
post: returns true if both matrices have the same shape and elements. */
void operator= (const matrix& m);
/* pre: true.
post: the left-hand side matrix is equal to the one on the right. */
void operator+=(const matrix& m);
/* pre: both matrices have the same shape.
post: this becomes the elementwise sum of both matrices. */
void operator-= (const matrix& m);
/* pre: both matrices have the same shape.
post: this becomes the elementwise difference of both matrices. */
void operator^= (const matrix& m);
/* pre: both matrices have the same shape.
post: this becomes the elementwise (Hadamard) product of both matrices. */
void operator*= (const matrix& m);
/* pre: the number of cols of this is equal to the number of rows of m.
post: this becomes the result of the matrix product between this
and m. */
std::vector <double> operator[] (int row) const;
/* pre: 0 <= row < rows
post: returns a COPY of the rowth row. NOT SUITABLE FOR CHANGING ITEMS
IN THE MATRIX; for that purpose use change. */
// CONSULTORS
void shape (int& row, int& col) const;
/* pre: true
post: sets row to the number of rows and col to the number of
columns. */
int nrows () const;
/* pre: true
post: returns the number of rows. */
int ncols () const;
/* pre: true
post: returns the number of columns. */
double elm (int row, int col) const;
/* pre: 0 <= col < cols and 0 <= row < rows
post: returns the element in the row, col position. */
long double mod () const;
/* pre: true
post: returns the Euclidean norm of the matrix.
NOTE(review): the original comment said "square of the sum of the
squares", but quadratic() in neural_network.cc squares this value to
obtain the sum of squares, which implies mod() is the square ROOT of
the sum of squares — confirm against the implementation. */
void draw (int margin = 3, int p = 3) const;
/* pre: margin and p are positive integers
post: draws the matrix with margin spaces to the left and precision p.
If an item has no decimal part the .000... is not drawn, but if it
does and is too small to fit in precision p, n.0000... is shown. */
matrix gcopy () const;
/* pre: true
post: returns a copy of the matrix. */
};
#endif