Я пробовал разные гиперпараметры и разное количество слоёв и узлов, но моя модель ничему не обучается даже после 2000 итераций; я также пробовал окружение MountainCarContinuous-v0, но это тоже не сработало.
Я пробовал другую архитектуру и готовую модель с GitHub, но моя модель всё равно ничему не обучается.
import numpy as np
import tensorflow as tf
import random
import gym
import pylab
import sys
from keras.initializers import RandomUniform
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.layers import Dense, Input, Add, Concatenate, Flatten, GaussianNoise, Lambda
from keras import backend as K
from collections import deque
# TF1-style session setup: allow_growth stops TensorFlow from grabbing all
# GPU memory up front, allocating only as needed.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
# Register the session with Keras so Keras layers and raw tf ops share it.
K.set_session(sess)
# Maximum number of training episodes for the main loop below.
EPISODES = 100000
class Actor(object):
    """DDPG actor (policy) network with a slowly-tracking target copy.

    Maps a state to a deterministic action in [-action_bound, action_bound].
    Training applies the deterministic policy gradient: the weights are moved
    along the critic's dQ/da (negated, because AdamOptimizer minimizes).
    """

    def __init__(self, sess, state_size, action_size, TAU, lr, action_bound=1, load=False):
        self.sess = sess
        self.TAU = TAU                    # soft-update rate for the target network
        self.lr = lr
        self.load = load
        self.action_bound = action_bound  # scale applied after tanh
        self.model, self.weights, self.state = self.build_network(state_size, action_size)
        # BUG FIX: the original unpacked into `self.target_weights` twice, so
        # target_weights ended up holding the target *input placeholder* and
        # the target state tensor was lost.
        self.target_model, self.target_weights, self.target_state = self.build_network(state_size, action_size)
        # Placeholder for dQ/da supplied by the critic; negated so that
        # gradient *descent* performs gradient *ascent* on Q.
        self.q_grads = tf.placeholder(tf.float32, [None, action_size])
        # NOTE(review): gradients are summed over the batch, not averaged, so
        # the effective learning rate scales with batch size — confirm intended.
        self.grads = tf.gradients(self.model.output, self.weights, -self.q_grads)
        self.optimize = tf.train.AdamOptimizer(lr).apply_gradients(zip(self.grads, self.weights))
        self.sess.run(tf.global_variables_initializer())
        if self.load:
            self.model.load_weights("./DDPG_Actor.h5")
            self.target_model.load_weights("./DDPG_Actor_target.h5")

    def train(self, state, grads):
        """Run one policy-gradient step using the critic's dQ/da `grads`."""
        self.sess.run(self.optimize, feed_dict={self.state: state, self.q_grads: grads})

    def update(self):
        """Soft-update the target network: target = TAU*online + (1-TAU)*target."""
        W, target_W = self.model.get_weights(), self.target_model.get_weights()
        for i in range(len(W)):
            target_W[i] = self.TAU * W[i] + (1 - self.TAU) * target_W[i]
        self.target_model.set_weights(target_W)

    def save(self):
        """Persist online and target network weights to disk."""
        self.model.save_weights("./DDPG_Actor.h5")
        self.target_model.save_weights("./DDPG_Actor_target.h5")

    def build_network(self, state_size, action_size):
        """Build the 400-300 MLP policy.

        Returns:
            (model, trainable_weights, input_tensor)
        """
        state_input = Input(shape=[state_size])  # renamed: `input` shadowed the builtin
        X = Dense(400, activation='relu', kernel_initializer='glorot_normal')(state_input)
        X = Dense(300, activation='relu', kernel_initializer='glorot_normal')(X)
        output = Dense(action_size, activation='tanh', kernel_initializer='glorot_normal')(X)
        # Scale the tanh output from [-1, 1] to the environment's action range.
        output = Lambda(lambda a: a * self.action_bound)(output)
        # Use the non-deprecated `inputs`/`outputs` kwargs (consistent with Critic).
        model = Model(inputs=state_input, outputs=output)
        return model, model.trainable_weights, state_input

    # Backward-compatible alias for the original (misspelled) method name.
    bulid_network = build_network
class Critic(object):
    """DDPG critic Q(s, a) network with a slowly-tracking target copy.

    Also exposes dQ/da, which the Actor consumes for the deterministic
    policy gradient.
    """

    def __init__(self, sess, state_size, action_size, TAU, lr, load=False):
        self.sess = sess
        self.TAU = TAU      # soft-update rate for the target network
        self.lr = lr        # used by the Keras Adam optimizer in build_network
        self.load = load
        # NOTE: the original also created an unused tf.train.AdamOptimizer
        # attribute; the Keras Adam passed to compile() is the real optimizer,
        # so the dead attribute was removed.
        self.model, self.state, self.action = self.build_network(state_size, action_size)
        self.target_model, self.target_state, self.target_action = self.build_network(state_size, action_size)
        # Gradient of Q with respect to the action input (fed to the actor).
        self.q_grads = tf.gradients(self.model.output, self.action)
        self.sess.run(tf.global_variables_initializer())
        if self.load:
            self.model.load_weights("./DDPG_Critic.h5")
            self.target_model.load_weights("./DDPG_Critic_target.h5")

    def gradients(self, state, action):
        """Return dQ/da evaluated at the given (state, action) batch."""
        return self.sess.run(self.q_grads, feed_dict={self.state: state, self.action: action})[0]

    def save(self):
        """Persist online and target network weights to disk."""
        self.model.save_weights("./DDPG_Critic.h5")
        self.target_model.save_weights("./DDPG_Critic_target.h5")

    def update(self):
        """Soft-update the target network: target = TAU*online + (1-TAU)*target."""
        W, target_W = self.model.get_weights(), self.target_model.get_weights()
        for i in range(len(W)):
            target_W[i] = self.TAU * W[i] + (1 - self.TAU) * target_W[i]
        self.target_model.set_weights(target_W)

    def build_network(self, state_size, action_size):
        """Build the Q-network: state and action branches merged by addition.

        Returns:
            (model, state_input, action_input)
        """
        S = Input(shape=[state_size])
        A = Input(shape=[action_size])
        X1 = Dense(400, activation='relu', kernel_initializer='glorot_normal')(S)
        X2 = Dense(400, activation='relu', kernel_initializer='glorot_normal')(A)
        X = Add()([X1, X2])
        X = Dense(300, activation='relu', kernel_initializer='glorot_normal')(X)
        # BUG FIX: Q(s, a) is a single scalar per sample. The original used
        # Dense(action_size), which only produced the right shape because
        # action_size == 1 for Pendulum.
        output = Dense(1, activation='linear', kernel_initializer='glorot_normal')(X)
        model = Model(inputs=[S, A], outputs=output)
        model.compile(loss='mse', optimizer=Adam(lr=self.lr))
        return model, S, A
class DDPG(object):
    """DDPG agent: replay memory, OU exploration noise, actor/critic training."""

    def __init__(self, sess, state_size, action_size, action_bound=1, memory_size=5000,
                 batch_size=64, actor_lr=0.0001, critic_lr=0.001, gamma=0.99, TAU=0.001):
        self.sess = sess
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)   # replay buffer (oldest dropped first)
        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma                        # discount factor
        self.TAU = TAU                            # target-network soft-update rate
        self.train_start = 1000                   # min transitions before training begins
        # Exploration scale, annealed linearly per stored transition.
        self.epsilon = 1
        self.epsilon_min = 0.001
        # Ornstein-Uhlenbeck process parameters.
        # NOTE(review): theta=0.01 / sigma=0.1 are much weaker than the common
        # DDPG defaults (theta=0.15, sigma=0.2) — may under-explore; confirm.
        self.mu = 0.0
        self.x = 0.0          # BUG FIX: OU state starts as a float, not int 0
        self.theta = 0.01
        self.sigma = 0.1
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 100000
        self.actor = Actor(sess, state_size, action_size, TAU, actor_lr, action_bound, load=False)
        self.critic = Critic(sess, state_size, action_size, TAU, critic_lr, load=False)

    def append(self, state, action, reward, next_state, done):
        """Store one transition and anneal the exploration scale."""
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

    def reset_noise(self):
        """Reset the OU process to its mean (call at each episode start).

        NEW (backward-compatible): the original never reset the noise state,
        so it drifted across episode boundaries.
        """
        self.x = 0.0

    def OU(self):
        """Advance the Ornstein-Uhlenbeck process one step (correlated noise)."""
        dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(1)
        self.x += dx
        return self.x

    def get_action(self, state):
        """Return the deterministic policy action plus scaled OU noise."""
        action = self.actor.model.predict(state)[0]
        return action + self.OU() * self.epsilon

    def save(self):
        """Persist actor and critic weights."""
        self.actor.save()
        self.critic.save()

    def train(self):
        """One DDPG update: critic regression on the TD target, actor
        policy-gradient step, then soft target updates."""
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)
        states = np.reshape(np.asarray([e[0] for e in mini_batch]), [batch_size, self.state_size])
        actions = np.asarray([e[1] for e in mini_batch])
        rewards = np.asarray([e[2] for e in mini_batch])
        next_states = np.reshape(np.asarray([e[3] for e in mini_batch]), [batch_size, self.state_size])
        done = np.asarray([e[4] for e in mini_batch])
        # TD target: r + gamma * Q_target(s', mu_target(s')); just r at terminals.
        target = np.zeros_like(actions)
        target_q_values = self.critic.target_model.predict(
            [next_states, self.actor.target_model.predict(next_states)])
        for i in range(len(mini_batch)):
            if done[i]:
                target[i] = rewards[i]
            else:
                target[i] = rewards[i] + self.gamma * target_q_values[i]
        self.critic.model.train_on_batch([states, actions], target)
        # Actor update along dQ/da evaluated at the current policy's actions.
        action_for_grad = self.actor.model.predict(states)
        q_grads = self.critic.gradients(states, action_for_grad)
        self.actor.train(states, q_grads)
        self.actor.update()
        self.critic.update()
# ---- training script ----------------------------------------------------
env = gym.make('Pendulum-v0')
state_size = env.observation_space.shape[0]
# BUG FIX: derive the action size from the env instead of hard-coding 1.
action_size = env.action_space.shape[0]
action_bound = env.action_space.high   # Pendulum torque limit (2.0)
agent = DDPG(sess, state_size, action_size, action_bound)
scores, episodes = [], []
for e in range(EPISODES):
    done = False
    score = 0
    state = np.reshape(env.reset(), [1, state_size])
    step = 0
    while not done:
        action = agent.get_action(state)
        # BUG FIX: pass the action array directly. The original wrapped it in
        # another list (env.step([action])), which made Pendulum return an
        # array reward and forced the reward[0] workaround.
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        score += reward
        agent.append(state, action, reward, next_state, done)
        state = next_state
        step += 1
        # BUG FIX: train on every environment step. The original trained only
        # every 20th step, which is far too infrequent for DDPG to learn.
        agent.train()
        if done:
            scores.append(score)
            episodes.append(e)
            pylab.plot(episodes, scores, 'b')
            pylab.savefig("./DDPG_Pendulum.png")
            print("episode:", e, " score:", score, " epsilon:", agent.epsilon)
    if e % 50 == 0:
        agent.save()
Каждый раз я получал суммарную награду около -1450 за эпизод.