Investigation:
For simplicity, I modified your code to run with Pendulum-v0 instead of the custom RocketLander-v0 environment in Google Colab.
These are the changes I made to run Pendulum-v0:
Remove the line: import rocket_lander_gym
Change the line: STATE_DIM, ACT_DIM = 10, 3
to this: STATE_DIM, ACT_DIM = 3, 1
Change the line: env = gym.make('RocketLander-v0')
to this: env = gym.make('Pendulum-v0')
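As a side note, these two dimensions do not have to be hard-coded; they can be read from the environment itself. A minimal sketch, assuming Box observation and action spaces as in Pendulum-v0:

import gym
env = gym.make('Pendulum-v0')
STATE_DIM = env.observation_space.shape[0]  # 3 for Pendulum-v0
ACT_DIM = env.action_space.shape[0]         # 1 for Pendulum-v0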
After making these small but necessary changes to run Pendulum-v0, your code still produces nans in the final print(reward) statement.
This hints that the problem most likely lies in the code and is unlikely to be an issue with the game environment.
Output of the final print(reward) statement before the problems were fixed (contains nans right up to the end of the output):
[-1239.414496251207, -1267.7001978172505, -1247.1635071416315, -1255.8660458301786, -1246.770645397439, -1259.1171723968932, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
Solution:
After a closer look, I found the following problems and made some changes to your code that ultimately fix the nan problem.
(The actual problems that cause the nans are in points 5 & 6: the magic number 2, used as a multiplier for mu, differs from the upper clip bound of 1 in point 6.)
1) Your probability ratio is wrong, so I change this:
ratio = pi.prob(self.tf_advantage) / old_pi.prob(self.tf_advantage)
to this:
ratio = pi.prob(self.tf_action) / old_pi.prob(self.tf_action)
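For reference, the PPO surrogate ratio is evaluated at the sampled action, not at the advantage:
ratio = pi(a | s) / pi_old(a | s)
so pi.prob(self.tf_action) / old_pi.prob(self.tf_action) is the correct expression here.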
2) You have two assignments to self.train_opt:
self.train_opt = tf.train.AdamOptimizer(CR_LR)
self.train_opt = self.train_opt.apply_gradients(zip(gradients, variables))
so I change the 2nd self.train_opt statement to:
self.ctrain_op = self.train_opt.apply_gradients(zip(gradients, variables))
3) self.atrain_operation is the optimizer, so I replace this line:
self.atrain_operation = self.atrain_operation.apply_gradients(zip(gradients, variables))
with:
self.atrain_op = self.atrain_operation.apply_gradients(zip(gradients, variables))
(In TF1, apply_gradients returns a training operation, while AdamOptimizer is the optimizer object itself; keeping them under separate names means sess.run is always given the training op rather than the optimizer.)
4) Accordingly, the commented-out lines are also replaced in the update function:
#[self.tfsess.run(self.atrain_operation, {self.tf_state: state, self.tf_action: action, self.tf_advantage: advantage}) for _ in range(ACTOR_UPDATE_STEPS)]
[self.tfsess.run(self.atrain_op, {self.tf_state: state, self.tf_action: action, self.tf_advantage: advantage}) for _ in range(ACTOR_UPDATE_STEPS)]
#[self.tfsess.run(self.train_opt, {self.tf_state: state, self.tf_dreward: reward}) for _ in range(CRITIC_UPDATE_STEPS)]
[self.tfsess.run(self.ctrain_op, {self.tf_state: state, self.tf_dreward: reward}) for _ in range(CRITIC_UPDATE_STEPS)]
5) In the tinynn function, instead of multiplying by the magic number 2:
mu = 2 * tf.layers.dense(l1, ACT_DIM, tf.nn.tanh, trainable=trainable)
replace it with the following:
mu = self.env.action_space.high * tf.layers.dense(l1, ACT_DIM, tf.nn.tanh, name='mu', trainable=trainable)
6) Instead of this return np.clip(action, -1, 1)
in the choose_action function, use this:
return np.clip(action, self.env.action_space.low, self.env.action_space.high)
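For Pendulum-v0 the action bounds are [-2, 2], which is why the hard-coded clip to [-1, 1] and the multiplier 2 for mu disagree. A quick sanity check (a minimal sketch; the printed bounds are specific to Pendulum-v0):

import gym
env = gym.make('Pendulum-v0')
print(env.action_space.low, env.action_space.high)  # [-2.] [2.]

Using the environment's own low/high both for scaling mu (point 5) and for clipping the action (point 6) keeps the two consistent.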
7) I also pass env into PPO() so that tinynn has access to the environment:
"""
if __name__ == '__main__':
ppo = PPO()
#env = gym.make('RocketLander-v0')
env = gym.make('Pendulum-v0')
reward = ppo.train(env, ppo, 100)
print(reward)
"""
if __name__ == '__main__':
#env = gym.make('RocketLander-v0')
env = gym.make('Pendulum-v0')
ppo = PPO(env)
reward = ppo.train(env, ppo, 100)
print(reward)
Results (tested on Google Colab):
Output of the final print(reward) statement after the problems were fixed (no more nans):
[-1076.4211985938728, -1089.7948555704293, -1115.6341917789869, -1147.7961139172062, -1162.9589624975872, -1193.6444573268725, -1214.9662239699737, -1219.295151702447, -1228.3773779343328, -1211.7559065793157, -1239.1770034164979, -1256.5497739717612, -1248.942050034072, -1251.5809026533057, -1246.350714892043, -1223.1414157442061, -1231.5288547710811, -1223.5475405502032, -1217.095971096193, -1215.639878904649, -1182.084416025169, -1174.3085216226718, -1176.5976104186886, -1188.5439312195451, -1160.6565487872776, -1132.5758139546506, -1148.7299082836548, -1149.1097155137375, -1124.4154423538491, -1100.4411098048593, -1081.2445587548245, -1035.7597376533809, -1039.5657416397464, -1046.8627585876952, -1007.554202371864, -997.4072232047926, -924.0742105089892, -872.5268280283873, -889.6594740458157, -929.8577808816676, -957.1616193294444, -887.3960001717214, -811.6005555799227, -769.4648914456843, -692.6909819129986, -623.7238271047137, -656.6829518032941, -629.9657550649539, -651.9125731231816, -678.5172027274579, -683.0097144683796, -640.7089935328387, -589.4306203212271, -556.3242756529115, -526.881331084439, -539.3604006694065, -511.27673189202727, -526.1856726355412, -512.7768642430646, -514.7892695498354, -527.2777710366902, -516.3731318862425, -504.3876365547384, -466.66983741261095, -446.0724507306932, -414.25670263412803, -449.7266236253488, -471.7990471628901, -492.56922815695845, -455.6665136249609, -436.67493361178475, -393.1425637497276, -445.3335873259794, -440.30325932671377, -437.07634044015583, -406.7068409952513, -379.062809279313, -444.46652386541916, -439.60389029825603, -422.0043960746679, -424.80904663279813, -486.0321568909586, -476.00519893661306, -493.3553901668465, -457.4723683354885, -450.83268159600254, -458.6995892890558, -514.3951245072926, -519.3061062950538, -507.1919061966863, -469.59914342990675, -422.66056322913045, -439.53868966691357, -395.9325190449425, -369.7488471733708, -398.1944563259144, -397.3649275140671, -401.18423175784426, -400.9083352836444, -374.0640183220304]
The entire working modified code (no more nans):
import gym
import numpy as np
import tensorflow as tf
#import rocket_lander_gym

EP_LEN = 200
GAMMA = 0.9
SL_LR = 1e-4
CR_LR = 1e-4
BATCH = 5
ACTOR_UPDATE_STEPS = 20
CRITIC_UPDATE_STEPS = 20
#STATE_DIM, ACT_DIM = 10, 3
STATE_DIM, ACT_DIM = 3, 1

METHOD = [
    dict(name='kl_penalty', kl_target=0.01, lam=0.5),
    dict(name='clip', epsilon=0.2),
][1]

PRINT_DEBUG_MSG = False


class PPO:
    def __init__(self, env):
        self.env = env
        self.tfsess = tf.Session()
        self.tf_state = tf.placeholder(tf.float32, [None, STATE_DIM], 'state')

        # Critic (value network)
        with tf.variable_scope('critic'):
            # Layers
            l1 = tf.layers.dense(self.tf_state, 100, tf.nn.relu)
            # Value
            self.value = tf.layers.dense(l1, 1)
            # Discounted reward: reward in the future
            self.tf_dreward = tf.placeholder(tf.float32, [None, 1], 'discounted_reward')
            # Advantage: determine quality of action
            self.advantage = self.tf_dreward - self.value
            # Loss function: minimize the advantage over time
            # The loss function is a mean squared error
            self.loss = tf.reduce_mean(tf.square(self.advantage))
            # Gradient descent using Adam optimizer
            self.train_opt = tf.train.AdamOptimizer(CR_LR)
            gradients, variables = zip(*self.train_opt.compute_gradients(self.loss))
            gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
            #self.train_opt = self.train_opt.apply_gradients(zip(gradients, variables))
            self.ctrain_op = self.train_opt.apply_gradients(zip(gradients, variables))

        # Actor (policy network)
        pi, pi_params = self.tinynn('pi', trainable=True)
        old_pi, old_pi_params = self.tinynn('old_pi', trainable=False)

        # Sample actions from both the old and the new policy networks
        with tf.variable_scope('sample_action'):
            # Choose an action from the distribution learnt
            self.sample_operation = tf.squeeze(pi.sample(1), axis=0)
        with tf.variable_scope('update_old_pi'):
            # Copy the current policy parameters into the old policy
            self.update_old_pi_operation = [old_pi.assign(p) for p, old_pi in zip(pi_params, old_pi_params)]

        # Placeholder for the action and the advantage
        self.tf_action = tf.placeholder(tf.float32, [None, ACT_DIM], 'action')
        self.tf_advantage = tf.placeholder(tf.float32, [None, 1], 'advantage')

        # Compute loss function
        with tf.variable_scope('loss'):
            with tf.variable_scope('surrogate'):
                #ratio = pi.prob(self.tf_advantage) / old_pi.prob(self.tf_advantage)
                ratio = pi.prob(self.tf_action) / old_pi.prob(self.tf_action)
                surrogate = ratio * self.tf_advantage
            # KL penalty
            if METHOD['name'] == 'kl_penalty':
                # Lambda
                self.tf_lambda = tf.placeholder(tf.float32, None, 'lambda')
                # Compute KL divergence between old and new policy
                kl = tf.contrib.distributions.kl_divergence(old_pi, pi)
                # Get mean
                self.kl_mean = tf.reduce_mean(kl)
                # Compute loss using surrogate
                self.aloss = -(tf.reduce_mean(surrogate - self.tf_lambda * kl))
            else:
                self.aloss = -tf.reduce_mean(tf.minimum(surrogate, tf.clip_by_value(ratio, 1.-METHOD['epsilon'], 1.+METHOD['epsilon']) * self.tf_advantage))

        # Minimize the loss using gradient descent
        with tf.variable_scope('atrain'):
            self.atrain_operation = tf.train.AdamOptimizer(SL_LR)
            gradients, variables = zip(*self.atrain_operation.compute_gradients(self.aloss))
            gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
            #self.atrain_operation = self.atrain_operation.apply_gradients(zip(gradients, variables))
            self.atrain_op = self.atrain_operation.apply_gradients(zip(gradients, variables))

        # Write to disk
        tf.summary.FileWriter("log/", self.tfsess.graph)
        # Run the session
        self.tfsess.run(tf.global_variables_initializer())
    def update(self, state, action, reward):
        self.tfsess.run(self.update_old_pi_operation)
        advantage = self.tfsess.run(self.advantage, {self.tf_state: state, self.tf_dreward: reward})
        # Update actor (policy)
        if METHOD['name'] == 'kl_penalty':
            for _ in range(ACTOR_UPDATE_STEPS):
                _, kl = self.tfsess.run([self.atrain_op, self.kl_mean], {self.tf_state: state, self.tf_action: action, self.tf_advantage: advantage, self.tf_lambda: METHOD['lam']})
                if kl > 4 * METHOD['kl_target']:
                    break
            if kl < METHOD['kl_target'] / 1.5:
                # Adaptive lambda
                METHOD['lam'] /= 2
            elif kl > METHOD['kl_target'] * 1.5:
                METHOD['lam'] *= 2
            # Lambda might explode, we need to clip it
            METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10)
        else:
            #[self.tfsess.run(self.atrain_operation, {self.tf_state: state, self.tf_action: action, self.tf_advantage: advantage}) for _ in range(ACTOR_UPDATE_STEPS)]
            [self.tfsess.run(self.atrain_op, {self.tf_state: state, self.tf_action: action, self.tf_advantage: advantage}) for _ in range(ACTOR_UPDATE_STEPS)]
        # Update critic (value)
        #[self.tfsess.run(self.train_opt, {self.tf_state: state, self.tf_dreward: reward}) for _ in range(CRITIC_UPDATE_STEPS)]
        [self.tfsess.run(self.ctrain_op, {self.tf_state: state, self.tf_dreward: reward}) for _ in range(CRITIC_UPDATE_STEPS)]
    def tinynn(self, name, trainable):
        with tf.variable_scope(name):
            l1 = tf.layers.dense(self.tf_state, 100, tf.nn.relu, trainable=trainable)
            #mu = 2 * tf.layers.dense(l1, ACT_DIM, tf.nn.tanh, trainable=trainable)
            mu = self.env.action_space.high * tf.layers.dense(l1, ACT_DIM, tf.nn.tanh, name='mu', trainable=trainable)
            sigma = tf.layers.dense(l1, ACT_DIM, tf.nn.softplus, trainable=trainable)
            norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return norm_dist, params

    def choose_action(self, state):
        state = state[np.newaxis, :]
        action = self.tfsess.run(self.sample_operation, {self.tf_state: state})[0]
        #return np.clip(action, -1, 1)
        return np.clip(action, self.env.action_space.low, self.env.action_space.high)

    def get_value(self, state):
        if state.ndim < 2: state = state[np.newaxis, :]
        return self.tfsess.run(self.value, {self.tf_state: state})[0, 0]
    def train(self, env, ppo, epochs, render=False):
        # Rewards
        all_ep_r = []
        # Training loop
        for ep in range(epochs):
            # Initial state
            s = env.reset()
            # States, actions and rewards
            buffer_s, buffer_a, buffer_r = [], [], []
            # Initial reward
            ep_r = 0
            # For a single episode
            for t in range(EP_LEN):
                if render:
                    # Render the environment
                    env.render()
                # Choose best action
                a = ppo.choose_action(s)
                # State, reward, done, info
                s_, r, done, _ = env.step(a)
                if PRINT_DEBUG_MSG:
                    print("Action Taken ", a)
                    print("Observation ", s_)
                    print("Reward Gained ", r, end='\n\n')
                # Add to buffers
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append((r + 8) / 8)  # normalize reward, found to be useful
                s = s_
                # Total reward
                ep_r += r
                # Update PPO
                if (t + 1) % BATCH == 0 or t == EP_LEN - 1:
                    # Get value
                    v_s_ = ppo.get_value(s_)
                    # Discounted reward
                    discounted_r = []
                    # Update rewards
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()
                    # Buffer states actions rewards
                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    ppo.update(bs, ba, br)
                # Check if done
                if done:
                    #print("Simulation done.")
                    break
            # Append episode rewards
            if ep == 0: all_ep_r.append(ep_r)
            else: all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        # Close the environment
        env.close()
        # Return all episode rewards
        return all_ep_r
"""
if __name__ == '__main__':
ppo = PPO()
#env = gym.make('RocketLander-v0')
env = gym.make('Pendulum-v0')
reward = ppo.train(env, ppo, 100)
print(reward)
"""
if __name__ == '__main__':
#env = gym.make('RocketLander-v0')
env = gym.make('Pendulum-v0')
ppo = PPO(env)
reward = ppo.train(env, ppo, 100)
print(reward)