Нейронная сеть в C ++ всегда выдает 0.5 при изучении XOR - PullRequest
0 голосов
/ 26 октября 2019

Я запрограммировал нейронную сеть на C ++, но когда ее обучают изучать XOR, в итоге получается, что она всегда выдает 0,5.

Я пытался вывести формулы для градиентного спуска сам, поэтому они могут быть ошибочными:

dE/dLi = how much the output of the ith layer affects the final error

dLi/dZi = how much each value of Z = W*x+b affects the output of the activation function in the ith layer

dldl - это просто (dLi / dZi) / (dE / dLi).

dE / dx — это то, как вход i-го слоя влияет на ошибку, и это также то, как выходные данные (i-1)-го слоя влияют на ошибку, поэтому эта величина распространяется назад на предыдущий слой как dE / dLi-1.

Формулы, используемые для обновления весов и смещений и для обратного распространения ошибки, таковы:

w -= learning_rate * (dLi/dZi)/(dE/dLi)*(Xi-1)^T = learning_rate * (dldl)*(Xi-1)^T

b -= learning_rate * (dLi/dZi)/(dE/dLi) = learning_rate * dldl

dE/dX = dE/dLi-1 = (Wi)^T * (dLi/dZi)/(dE/dLi) = (Wi)^T * dldl

Вот код, я включил весь файл .hh и части .cc, которые я считаю актуальными (я могу при необходимости опубликовать код других функций). Я также включу .hh использованного мной класса матрицы:

.hh

#ifndef NEURAL_NETWORK_HH
#define NEURAL_NETWORK_HH

#include <iostream>
#include <vector>
#include <fstream>
#include <exception>
#include <cmath>
#include "matrix.hh"

//----------------------------------------------//
//      NEURAL NETWORK EXCEPTIONS CLASS         //
//----------------------------------------------//

class neural_exception: public std::exception {
   // Borrowed pointer to a static error string; the thrower keeps ownership.
   const char* message;

  public:
   // Wraps an error string so it can be thrown and caught as std::exception.
   neural_exception(const char* error) : std::exception(), message(error) {}

   // Returns the message supplied at construction.
   const char* what() const throw() { return message; }
};



/*  NOTE: All these functions have a parameter bprop if bprop = false the
            activation function is performed in the matrix m: m = L(m)
            If bprop, the derivative of the a.f. is performed: m = L'(m)
*/


//----------------------------------------------//
//               COST FUNCTIONS                 //
//----------------------------------------------//
/*
    0: quadratic
*/

double quadratic (matrix& output, const matrix& correct, bool bprop = false);
/*  pre: both output and correct are single column matrices with the same shape
    post: output is REPLACED by (output - correct) in both modes.
            If !bprop, the returned value is the quadratic cost
            0.5 * sum((output[i][j] - correct[i][j])^2).
            If bprop, 0 is returned and output holds the gradient
            dE/doutput for this cost. */


//----------------------------------------------//
//            ACTIVATION FUNCTIONS              //
//----------------------------------------------//
/*  
    0: ReLu
    1: sigmoid
*/

void ReLu (matrix& m, bool bprop = false);
/*  pre: m is a single column matrix
    post: if !bprop, every element smaller than 0 is converted to 0 and the
            others are left as they are.  If bprop, m is replaced by the
            nrows x nrows diagonal Jacobian: 1 where the element was >= 0,
            0 elsewhere. */

void sigmoid (matrix& m, bool bprop = false);
/*  pre: m is a single column matrix
    post: if !bprop, each element x becomes y = 1/(1 + e^(-x)), a value
            between 0 and 1.  If bprop, m is replaced by the nrows x nrows
            diagonal Jacobian with the sigmoid derivative on the diagonal. */


//----------------------------------------------//
//              DATA STRUCTURES                 //
//----------------------------------------------//

// object that contains all the data to create a neural network
// Bundle of all the parameters needed to build a neural_network.
typedef struct {
    int number_of_inputs;                   // size of the input column fed to the first layer
    std::vector<int> shape;                 // neurons per layer, one entry per layer
    std::vector<int> activation_functions;  // index into ACT_FUNC, one per layer
    int cost_function;                      // index into COST_FUNC
    double value_w;                         // initial weight value (or random upper bound — see matrix ctor)
    double value_b;                         // initial bias value (or random upper bound — see matrix ctor)
    bool random_w;                          // if true, weights are initialized with random values
    bool random_b;                          // if true, biases are initialized with random values
} neural_data;

// All available cost functions; a neural_data.cost_function value is an
// index into this table.
const std::vector<double (*)(matrix&, const matrix&, bool)> COST_FUNC =
            {
                quadratic
            };

// Display names parallel to COST_FUNC; only used in the draw method.
const std::vector<std::string> COST_FUNC_NAMES =
            {
                "quadratic"
            };

// All available activation functions; each entry of
// neural_data.activation_functions is an index into this table.
const std::vector<void (*)(matrix&, bool)> ACT_FUNC =
            {
                ReLu,
                sigmoid
            };

// Display names parallel to ACT_FUNC; only used in the draw method.
const std::vector<std::string> ACT_FUNC_NAMES =
            {
                "ReLu",
                "sigmoid"
            };


//----------------------------------------------//
//          NEURAL NETWORK CLASS                //
//----------------------------------------------//


class neural_network {

    std::vector<matrix> w;      // one weight matrix per layer
    std::vector<matrix> b;      // one bias column per layer
    std::vector<matrix> x;      // per-layer inputs/outputs saved by the last
                                // forward pass when save_x is true.  NOTE:
                                // calculate() stores POST-activation values
                                // (the signal before W*X+b of the next layer),
                                // not the raw W*X+b.
    std::vector<int> act_func;  // activation function index (into ACT_FUNC) per layer
    int cost_func;              // cost function index (into COST_FUNC)
    bool save_x;            // if true, the results of each layer will be saved

public:

    //  CONSTRUCTORS

    neural_network(std::string file_path);
    /*  pre: there is a file with the data from a neural_network object in the
                specified path
        post: *this is equal to the neural network saved in the file */

    neural_network (neural_data data);
    /*  pre: true
        post: all the data for the neural network is introduced via the data
                struct.
                Necessary data:

                    - Number of inputs
                    - Number of layers
                    - Size of each layer
                    - Activation functions
                    - Cost function
                    - Range of random values for w (weights)
                    - Will w be initialized with random values?
                    - Initial value of b (biases)
                    - Will b be initialized with random values?

    */

    neural_network (const neural_network& n);
    /*  pre: true
        post: *this becomes a copy of n */


    //  METHODS

    matrix calculate (const matrix& x);
    /*  pre: x is a column matrix with length number_of_inputs
        post: the output is the predicted result in the form of a column matrix
                with length equal to the number of outputs */

    void save_to_file (std::string file_path) const;
    /*  pre: file_path contains the path and file name in which the data will be
                stored
        post: all the data of the neural network has been stored in the file. */

    void draw (bool detailed = true, int precision = 2) const;
    /*  pre: precision is a strictly positive integer
        post: all the neural network is drawn, if detailed, with the weights and
                biases. */


    //  MODIFIERS

    void backpropagate (matrix x, const matrix& correct, double learning_rate);
    /*  pre: x is a column matrix with length number_of_inputs, correct is a
                column matrix matching the network's output shape (the cost
                function requires output and correct to have the same shape),
                and learning_rate is a strictly positive real
        post: the backpropagation algorithm is applied to the neural network */

    void change_activation_function (int layer, int function);
    /*  pre: 0 <= layer < number_of_layers and function is in the list
        post: the activation function of the selected layer will be changed */


    //  CONSULTORS

    double error (const matrix& x, const matrix& correct);
    /*  pre: x is a column matrix with length number_of_inputs and correct
                matches the network's output shape
        post: the returned value is the cost function value for the calculated
                and the correct output provided. */

    int number_of_layers () const;
    /*  pre: true
        post: returns the number of layers */

    int number_of_inputs () const;
    /*  pre: true
        post: returns the number of inputs */

    int number_of_outputs () const;
    /*  pre: true
        post: returns the number of outputs (the size of the last layer) */

    std::vector<int> shape () const;
    /*  pre: true
        post: returns a vector with the size of each layer */

};

#endif

.cc

#include "neural_network.hh"
using namespace std;

//      ERROR MESSAGES FOR THE NEURAL_EXCEPTION CLASS

// Human-readable messages carried by neural_exception.
const char* FILE_NOT_FOUND      = "file for constructor initiation not found";
const char* ERROR_CREATING_FILE = "error while creating the file";
const char* NO_ACTIVATION_FUNC  = "activation function not found";
const char* NO_COST_FUNC        = "cost function not found";            // was "foound"
const char* BAD_N_ACT_FUNC      = "one activation function per layer is needed";  // was "id needed"
const char* BAD_SHAPE           = "all values for shape must be bigger than 0";
const char* BAD_INPUT_SHAPE     = "incorrect shape of inputs";
const char* BAD_LAYER           = "layer out of bounds";

//      COST FUNCTIONS

double quadratic (matrix& output, const matrix& correct, bool bprop)
{
    // In both modes the residual (output - correct) replaces output in place;
    // in bprop mode that residual IS the gradient dE/doutput of this cost.
    output -= correct;

    if (bprop)
        return 0;

    // mod() gives the Euclidean norm of the residual, so the cost is
    // 0.5 * sum of squared differences.
    long double norm = output.mod();
    return 0.5 * norm * norm;
}

//      ACTIVATION FUNCTIONS

void ReLu (matrix& m, bool bprop)
{
    // Applies ReLu (or its derivative as a diagonal Jacobian) to a column
    // matrix in place.
    if (!bprop) {
        // forward pass: clamp negative entries to zero
        for (int r = 0; r < m.nrows(); r++)
            if (m[r][0] < 0)
                m.change(r, 0, 0);
        return;
    }

    // backward pass: replace m with the nrows x nrows diagonal Jacobian,
    // holding 1 wherever the entry was non-negative and 0 elsewhere
    matrix jacobian(m.nrows(), m.nrows(), 0);
    for (int r = 0; r < m.nrows(); r++)
        if (m[r][0] >= 0)
            jacobian.change(r, r, 1);
    m = jacobian;
}


void sigmoid (matrix& m, bool bprop)
{
    // Applies the logistic function (or its derivative as a diagonal
    // Jacobian) to a column matrix in place.
    if (bprop) {
        matrix result(m.nrows(), m.nrows(), 0);

        for (int i = 0; i < m.nrows(); i++) {
            // sigma'(x) = sigma(x) * (1 - sigma(x)).  Algebraically equal to
            // the old e^x / (1 + e^x)^2, but numerically stable: pow(M_E, x)
            // overflowed to inf for x > ~709 and the quotient became NaN,
            // silently corrupting every gradient downstream.
            double s = 1.0 / (1.0 + std::exp(-m[i][0]));
            result.change(i, i, s * (1.0 - s));
        }

        m = result;
    } else {
        // forward pass: y = 1 / (1 + e^(-x)), squashing each entry into (0, 1)
        for (int i = 0; i < m.nrows(); i++)
            m.change(i, 0, 1.0 / (1.0 + std::exp(-m[i][0])));
    }
}

neural_network::neural_network (neural_data data)
{
    // Builds the network topology described by `data`, validating every
    // field before any layer is created.

    // exactly one activation function per layer is required
    if (data.activation_functions.size() != data.shape.size())
        throw neural_exception(BAD_N_ACT_FUNC);

    // every activation function index must refer to an entry of ACT_FUNC
    // (size_t loop avoids the signed/unsigned comparison of the old code)
    for (size_t i = 0; i < data.activation_functions.size(); i++) {
        if (data.activation_functions[i] < 0 or
            data.activation_functions[i] >= (int) ACT_FUNC.size())
            throw neural_exception(NO_ACTIVATION_FUNC);
        act_func.push_back(data.activation_functions[i]);
    }

    // the cost function index must refer to an entry of COST_FUNC
    if (data.cost_function < 0 or data.cost_function >= (int) COST_FUNC.size())
        throw neural_exception(NO_COST_FUNC);
    cost_func = data.cost_function;

    // the input layer must accept at least one value, otherwise the first
    // weight matrix would be degenerate
    if (data.number_of_inputs < 1)
        throw neural_exception(BAD_SHAPE);

    int n_cols = data.number_of_inputs;

    // allocate one weight matrix and one bias column per layer
    for (size_t i = 0; i < data.shape.size(); i++) {
        // every layer needs at least one neuron
        if (data.shape[i] < 1) throw neural_exception(BAD_SHAPE);
        // layer i maps the previous layer's output (the network input for
        // layer 0) onto data.shape[i] neurons
        if (i != 0)
            n_cols = data.shape[i-1];
        int n_rows = data.shape[i];
        w.push_back(matrix(n_rows, n_cols, data.value_w, data.random_w));
        b.push_back(matrix(n_rows, 1, data.value_b, data.random_b));
    }

    // save_x is only enabled inside backpropagate, so ordinary forward
    // passes do not pay for storing every layer's activations
    save_x = false;
}

matrix neural_network::calculate (const matrix& x)
{
    // Forward pass: the signal starts as the input column and is transformed
    // layer by layer into the network's prediction.
    matrix signal(x);
    if (signal.nrows() != w[0].ncols())
        throw neural_exception(BAD_INPUT_SHAPE);

    this->x.clear();

    for (int layer = 0; layer < w.size(); layer++) {
        // while backpropagating, remember the exact input fed to this layer
        if (save_x)
            this->x.push_back(signal.gcopy());

        // affine step Z = W*signal + b, then the layer's activation
        signal = w[layer] * signal;
        signal += b[layer];
        ACT_FUNC[act_func[layer]](signal, false);
    }

    // also remember the final output when tracing for backprop
    if (save_x)
            this->x.push_back(signal.gcopy());

    return signal;
}

//      MODIFIERS

void neural_network::backpropagate (const matrix x, const matrix& correct, 
                                                double learning_rate)
{
    save_x = true;

    matrix dedx = calculate(x);
    COST_FUNC[cost_func](dedx, correct, true);

    matrix dedw(0,0), dldl(0,0);

    for (int i = w.size() - 1; i >= 0; i--) {
        // calculate first dl/dl = (dLi/dZi) / (dE/dLi)
        dldl = this->x[i+1].gcopy();
        ACT_FUNC[act_func[i]](dldl, true);
        dldl *= dedx;

        // actualize bias: b = b - learning_rate * dl/dl
        b[i] -= dldl.gmultn(learning_rate);

        // actualize dedx: dE/dXi-1 = (Wi)^T * dl/dl
        dedx = w[i].gT() * dldl;

        // actualize w: w = w - learning_rate * (dl/dl * (Xi-1)^T)
        w[i] -= (dldl * this->x[i].gT()).gmultn(learning_rate);
    }

    save_x = false;
    return;
}

matrix.hh

#ifndef MATRIX_HH
#define MATRIX_HH

#include <iostream>
#include <vector>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <exception>

typedef std::vector<std::vector<double>> vector_matrix;

//      CUSTOM EXCEPTION CLASS

class matrix_exception: public std::exception {
   // Borrowed pointer to a static error string; the thrower keeps ownership.
   const char* message;

  public:
   // Wraps an error string so it can be thrown and caught as std::exception.
   matrix_exception(const char* error) : std::exception(), message(error) {}

   // Returns the message supplied at construction.
   const char* what() const throw() { return message; }
};


//      CLASS MATRIX

class matrix {

    vector_matrix m;        // row-major storage: m[row][col]

    int cols, rows;

    void copy (const matrix& m);
    /*  pre: true.
        post: the matrix is equal to m. */

public:

    //  CONSTRUCTORS

    matrix (int rows, int cols, double val = 0.0, bool random = false);
    /*  pre: cols and rows are positive integers
        post: a matrix of shape rows*cols, filled with val if random = false
                and with random values from 0 to val otherwise.
                NOTE(review): the old comment claimed defaults of 0 and 1,
                but the signature's only default is val = 0.0 — confirm the
                random range against the implementation. */

    matrix (const matrix& m);
    /*  pre: true
        post: creates a matrix equal to m */



    //  MODIFIERS

    void change (int row, int col, double val);
    /*  pre: 0 <= col < cols and 0 <= row < rows
        post: changes the element in the position row col to val. */

    void read ();
    /*  pre: there is a series of rows*cols doubles to read in the standard
                channel
        post: the matrix has been filled with the given values. The first
                row takes the first row values and so on */

    void T ();
    /*  pre: true
        post: the matrix has been transposed in place. */

    matrix gT () const;
    /*  pre: true
        post: returns a transposed copy; *this is unchanged. */

    void multn (double n);
    /*  pre: true
        post: the matrix has been multiplied by n in place. */

    matrix gmultn (double n) const;
    /*  pre: true
        post: the returned value is a copy of the matrix multiplied by n. */

    matrix operator+ (const matrix& m) const;
    /*  pre: both matrices have the same shape.
        post: the returned value is a matrix with the same shape and whose
                elements are the sum of both matrices. */

    matrix operator- (const matrix& m) const;
    /*  pre: both matrices have the same shape.
        post: the returned value is a matrix with the same shape and whose
                elements are the difference of both matrices. */

    matrix operator^ (const matrix& m) const;
    /*  pre: both matrices have the same shape.
        post: the returned value is a matrix with the same shape and whose
                elements are the elementwise product of both matrices. */

    matrix operator* (const matrix& m) const;
    /*  pre: the number of cols of this is equal to the number of rows of m.
        post: the returned matrix is the result of the matrix product
                between this and m. */

    bool operator== (const matrix& m) const;
    /*  pre: true.
        post: returns true if both matrices have the same shape and elements. */

    void operator= (const matrix& m);
    /*  pre: true.
        post: the left-hand side matrix is equal to the one on the right.
                NOTE(review): returns void, so assignments cannot be chained. */

    void operator+=(const matrix& m);
    /*  pre: both matrices have the same shape.
        post: this matrix becomes the elementwise sum of both matrices. */

    void operator-= (const matrix& m);
    /*  pre: both matrices have the same shape.
        post: this matrix becomes the elementwise difference between both
                matrices. */

    void operator^= (const matrix& m);
    /*  pre: both matrices have the same shape.
        post: this matrix becomes the elementwise multiplication of both
                matrices. */

    void operator*= (const matrix& m);
    /*  pre: the number of cols of this is equal to the number of rows of m.
        post: this becomes the result of the matrix product between this
                and m. */

    std::vector <double> operator[] (int row) const;
    /*  pre: 0 <= row < rows
        post: returns a COPY of the rowth row. NOT SUITABLE FOR CHANGING ITEMS
                IN THE MATRIX; for that purpose use change */

    //  CONSULTORS

    void shape (int& row, int& col) const;
    /*  pre: true
        post: changes the value of row to the number of rows and col to the
        number of columns. */

    int nrows () const;
    /*  pre: true
        post: returns the number of rows. */

    int ncols () const;
    /*  pre: true
        post: returns the number of columns. */

    double elm (int row, int col) const;
    /*  pre: 0 <= col < cols and 0 <= row < rows
        post: returns the element in the row col position. */

    long double mod () const;
    /*  pre: true
        post: NOTE(review): presumably the Euclidean norm — the square ROOT of
                the sum of the squares of all elements.  The original comment
                said "square", but quadratic() squares this value to get the
                sum of squares, which only works for the square root.  Confirm
                against the implementation. */

    void draw (int margin = 3, int p = 3) const;
    /*  pre: margin and p are positive integers
        post: draws the matrix with margin number of spaces to the left and
        precision p. If an item does not have a decimal part the .000... is
        not drawn, but if it does and it is too small to fit in precision p,
        n.0000... will be shown. */

    matrix gcopy () const;
    /*  pre: true
        post: returns a copy of the matrix */
};

#endif
...