С сигмоидной функцией активации на выходном слое сеть обучается решению задачи намного лучше, чем с линейной функцией активации.
Я использую L2-регуляризацию в функции стоимости, у меня заданы скорость обучения и коэффициент момента (momentum), но с сигмоидной функцией активации сеть всё равно обучается лучше.
Что я могу сделать, чтобы улучшить результаты?
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
import numpy as np
import numpy.random as r
import matplotlib.pyplot as plt
import random
def readcsv(filename):
    """Read a comma-delimited numeric file into a list of float rows.

    Args:
        filename: Path of a CSV file in which every cell parses as a float.

    Returns:
        A two-element list ``[data, rownum]``: ``data`` is a list of rows
        (each a list of floats) and ``rownum`` is the number of rows in
        ``data``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a cell cannot be converted to ``float``.
    """
    # newline="" is the mode the csv module asks for; the legacy "rU" mode
    # was removed in Python 3.11.  The context manager closes the file.
    with open(filename, newline="") as ifile:
        reader = csv.reader(ifile, delimiter=",")
        # Blank lines come back as empty rows; skip them so the result stays
        # rectangular when it is later turned into an array.
        data = [[float(cell) for cell in row] for row in reader if row]
    return [data, len(data)]
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``, element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def sigmoid_deriv(x):
    """Return ``x * (1 - x)``, the logistic derivative written in terms of the sigmoid output ``x``."""
    complement = 1 - x
    return x * complement
def initialise_weights(nn_structure):
    """Create the initial parameters of the recurrent network.

    Args:
        nn_structure: Sequence of layer sizes, input layer first.

    Returns:
        A tuple ``(weights, bias, c_weights, context)`` of dicts:

        * ``weights[l]`` -- array of shape
          ``(nn_structure[l], nn_structure[l - 1])`` drawn uniformly from
          ``[-0.09, 0.09]``, for ``l = 1 .. len(nn_structure) - 1``.
        * ``bias[l]`` -- vector of length ``nn_structure[l]`` drawn from
          ``[0, 1)``.
        * ``c_weights[1]`` -- square all-ones matrix for the context
          (recurrent) connections of the first hidden layer.
        * ``context[0]`` -- random initial context vector of that layer.
    """
    weights = {}
    bias = {}
    hidden_size = nn_structure[1]
    c_weights = {1: np.ones((hidden_size, hidden_size))}
    context = {0: r.random_sample((hidden_size))}
    for l in range(1, len(nn_structure)):
        # One row per unit of layer l, one column per unit of layer l - 1.
        # The previous version generated each row but never stored it, so
        # every weight matrix came out empty.
        rows = [
            [random.uniform(-0.09, 0.09) for _ in range(nn_structure[l - 1])]
            for _ in range(nn_structure[l])
        ]
        weights[l] = np.array(rows)
        bias[l] = r.random_sample((nn_structure[l],))
    return weights, bias, c_weights, context
def initialise_weights_changes(nn_structure):
    """Build zero-filled arrays shaped like every weight, bias and context-weight array.

    Returns a tuple ``(deltaweights, deltabias, deltac_weights)`` of dicts
    keyed by layer index, mirroring the layout used for the parameters.
    """
    layer_ids = range(1, len(nn_structure))
    hidden_size = nn_structure[1]
    deltac_weights = {1: np.zeros((hidden_size, hidden_size))}
    deltaweights = {
        k: np.zeros((nn_structure[k], nn_structure[k - 1])) for k in layer_ids
    }
    deltabias = {k: np.zeros((nn_structure[k],)) for k in layer_ids}
    return deltaweights, deltabias, deltac_weights
def feed_forward(x, weights, bias, c_weights, context, i, hidden_layer):
    """Run the network forward for time step ``i``.

    Layer 1 -> 2 is the recurrent layer: it adds the context term
    ``c_weights[1] . context[i]`` and applies the sigmoid.  Every later
    layer is linear.

    Args:
        x: Input vector for this time step.
        weights: Dict of weight matrices keyed by layer index (from 1).
        bias: Dict of bias vectors keyed by layer index.
        c_weights: Dict holding the context weight matrix under key 1.
        context: Dict of context vectors keyed by time step; ``context[i]``
            is read and ``context[i + 1]`` is written.
        i: Index of the current time step.
        hidden_layer: Dict keyed by time step; ``hidden_layer[i]`` is
            replaced with the per-layer outputs of this step.

    Returns:
        ``(hidden_layer, activations)`` where ``activations[l + 1]`` is the
        output of layer ``l + 1`` for this step.
    """
    hidden_layer[i] = {1: x}
    activations = {}
    for l in range(1, len(weights) + 1):
        node_in = hidden_layer[i][l]
        if l == 1:
            layer_out = sigmoid(
                weights[l].dot(node_in) + c_weights[l].dot(context[i]) + bias[l]
            )
            # The hidden state becomes the context of the next time step.
            context[i + 1] = layer_out
        else:
            # Without this else branch the linear expression overwrote the
            # recurrent sigmoid activation computed for layer 1.
            layer_out = weights[l].dot(node_in) + bias[l]
        activations[l + 1] = layer_out
        hidden_layer[i][l + 1] = layer_out
    return hidden_layer, activations
def calculate_out_layer_delta(y, hidden_layer):
    """Return the output-layer error signal, i.e. the negation of ``y - hidden_layer``."""
    residual = y - hidden_layer
    return -residual
def calculate_hidden_delta(delta_plus_1, weights_l):
    """Propagate ``delta_plus_1`` backwards by multiplying it with the transpose of ``weights_l``."""
    transposed = np.transpose(weights_l)
    return transposed.dot(delta_plus_1)
def train_nn(nn_structure, X, y, iter_num=1000, alpha=0.6, momentum=0.4, bptt=5, reg=0.0000009):
    """Train the recurrent network with truncated back-propagation through time.

    Samples are consumed in order.  After every ``bptt`` forward steps the
    errors of that window are propagated backwards -- through the output
    weights and, for every step except the newest one in the window, through
    the context weights -- and a single momentum update is applied.
    Trailing samples that do not fill a complete window are forwarded but
    not trained on.

    NOTE(review): the back-propagation applies the sigmoid derivative to
    every hidden layer, while ``feed_forward`` only applies the sigmoid to
    the first hidden layer, so this is only consistent for a three-layer
    ``nn_structure``.

    Args:
        nn_structure: Sequence of layer sizes, input layer first.
        X: Indexable collection of input vectors, one per time step.
        y: Indexable collection of target vectors aligned with ``X``.
        iter_num: Number of passes over the data.
        alpha: Initial learning rate; shrunk by ``alpha / iter_num`` after
            every pass.
        momentum: Fraction of the previous update carried into the next one.
        bptt: Number of time steps per back-propagation window.
        reg: L2 weight-decay coefficient.

    Returns:
        ``(weights, bias, avg_cost_func, c_weights, context)``;
        ``avg_cost_func`` holds one average cost per pass.

    Raises:
        ValueError: If ``bptt`` is smaller than 1.
    """
    if bptt < 1:
        raise ValueError('bptt must be at least 1')
    weights, bias, c_weights, context = initialise_weights(nn_structure)
    # Velocities live for the whole run: zeroing them after each update (as
    # the previous version did) cancels the momentum term entirely.
    deltaweights, deltabias, deltac_weights = initialise_weights_changes(nn_structure)
    n_layers = len(nn_structure)
    cnt = 0
    avg_cost_func = []
    print('Starting gradient descent for {} iterations'.format(iter_num))
    while cnt < iter_num:
        if cnt % 1000 == 0:
            print('Iteration {} of {}'.format(cnt, iter_num))
        hidden_layer = {}
        total_cost = 0.0
        n_scored = 0
        bp = []
        for i in range(len(y)):
            # perform the feed forward pass and store the per-layer outputs
            # of this time step, to be used in the gradient descent step
            hidden_layer, _ = feed_forward(X[i], weights, bias, c_weights, context, i, hidden_layer)
            bp.append(i)
            if len(bp) < bptt:
                continue

            # Gradients summed over the time steps of this window.
            grad_w, grad_b, grad_c = initialise_weights_changes(nn_structure)
            delta = {}
            newest = bp[-1]
            for j in reversed(bp):
                delta[j] = {}
                output = hidden_layer[j][n_layers]
                delta[j][n_layers] = calculate_out_layer_delta(y[j], output)
                # Same error measure for every step of the window.
                total_cost += np.mean(np.square(y[j] - output))
                n_scored += 1
                # loop from nl-1 to 2 backpropagating the errors
                for l in range(n_layers - 1, 1, -1):
                    back = calculate_hidden_delta(delta[j][l + 1], weights[l])
                    if l == 2 and j != newest:
                        # Error arriving from the next time step through the
                        # context weights.  It is added BEFORE multiplying by
                        # the activation derivative so both paths are scaled.
                        back = back + calculate_hidden_delta(delta[j + 1][l], c_weights[1])
                    delta[j][l] = back * sigmoid_deriv(hidden_layer[j][l])
                for l in range(n_layers - 1, 0, -1):
                    grad_w[l] += np.outer(delta[j][l + 1], hidden_layer[j][l])
                    grad_b[l] += delta[j][l + 1]
                grad_c[1] += np.outer(delta[j][2], context[j])

            # perform the gradient descent step for the weights in each layer:
            # velocity = momentum * velocity - alpha * mean gradient, and the
            # L2 term only ever shrinks the parameters.
            for l in range(n_layers - 1, 0, -1):
                deltaweights[l] = momentum * deltaweights[l] - alpha * grad_w[l] / bptt
                deltabias[l] = momentum * deltabias[l] - alpha * grad_b[l] / bptt
                weights[l] += deltaweights[l] - (reg / bptt) * weights[l]
                bias[l] += deltabias[l] - (reg / bptt) * bias[l]
            deltac_weights[1] = momentum * deltac_weights[1] - alpha * grad_c[1] / bptt
            c_weights[1] += deltac_weights[1] - (reg / bptt) * c_weights[1]
            bp = []

        # complete the average cost calculation: mean squared error over the
        # samples that were trained on, plus the L2 penalty of the weights
        data_cost = total_cost / n_scored if n_scored else 0.0
        penalty = 0.5 * reg * sum(np.sum(np.square(w)) for w in weights.values())
        avg_cost = data_cost + penalty
        avg_cost_func.append(avg_cost)
        if cnt % 1000 == 0:
            print('Error', avg_cost)
        cnt += 1
        # Decay the learning rate a little after every pass.
        alpha = alpha - (alpha / iter_num)
    return weights, bias, avg_cost_func, c_weights, context
def predict_y(weights, bias, X, c_weights, context):
    """Predict the network output for every row of ``X``, in order.

    The forward pass mirrors ``feed_forward``: the first layer is the
    recurrent sigmoid layer and every later layer is linear.  The hidden
    state is carried from one row to the next.

    Args:
        weights: Dict of weight matrices keyed by layer index (from 1).
        bias: Dict of bias vectors keyed by layer index.
        X: Array whose first axis indexes time steps.
        c_weights: Dict holding the context weight matrix under key 1.
        context: Dict of context vectors keyed by time step.  Only read; the
            entry with the largest key is used as the starting state.

    Returns:
        Array of predictions: shape ``(m,)`` when the network has a single
        output unit, ``(m, n_out)`` otherwise.
    """
    m = X.shape[0]
    n_layers = len(weights) + 1
    # NOTE(review): assumes X continues the sequence the network was trained
    # on, so the newest stored context is the right starting state -- confirm
    # against the caller.
    state = context[max(context)]
    outputs = []
    for i in range(m):
        node_in = X[i]
        for l in range(1, n_layers):
            if l == 1:
                node_in = sigmoid(
                    weights[l].dot(node_in) + c_weights[l].dot(state) + bias[l]
                )
                # Carry the hidden state forward to the next row.
                state = node_in
            else:
                # Linear output, matching the layer used during training.
                node_in = weights[l].dot(node_in) + bias[l]
        outputs.append(node_in)
    y = np.array(outputs, dtype=float)
    if y.ndim == 2 and y.shape[1] == 1:
        # Keep the flat (m,) result callers got for single-output networks.
        y = y[:, 0]
    return y
if __name__ == "__main__":
    # load data
    # NOTE(review): StandardScaler is imported by this file but no scaling is
    # applied here -- confirm whether the data was meant to be standardised.
    filename = 'C:/Users/n0762538/Documents/Data/MackeyGlass/MackeyGlass.csv'
    dataset, rownum = readcsv(filename)
    dataset = np.array(dataset)
    # define data: each target is the row that follows its input row
    # (one-step-ahead prediction), split 70 % / 30 % in time order
    no = int(0.70 * len(dataset))
    train_data = dataset[0:no]
    test_data = dataset[no:-1]
    train_output = dataset[1:no + 1]
    test_output = dataset[no + 1:]
    X = train_data
    y = train_output
    # setup the NN structure
    nn_structure = [len(dataset[0]), 3, len(dataset[0])]
    # train the NN
    weights, bias, avg_cost_func, c_weights, context = train_nn(nn_structure, X, y)
    # plot the avg_cost_func (the curve itself was never drawn before)
    plt.plot(avg_cost_func)
    plt.ylabel('Average J')
    plt.xlabel('Iteration number')
    # get the prediction accuracy and print
    y_pred = predict_y(weights, bias, test_data, c_weights, context)
    # The reported figure is the R^2 score expressed as a percentage.
    print('Prediction accuracy is {}%'.format(r2_score(test_output, y_pred) * 100))
    plt.title('Approach 1')
    # Without show() the figure is never displayed when run as a script.
    plt.show()