I am trying to use TensorFlow, and this is how I compute the loss:
learning_rate = 0.001
g_target_q_t = tf.placeholder(tf.float32, None, name="target_value")
g_action = tf.placeholder(tf.int32, None, name='g_action')
action_one_hot = tf.one_hot(g_action, n_output, 1.0, 0.0, name='action_one_hot')
q_acted = tf.reduce_sum(y * action_one_hot, reduction_indices=1, name='q_acted')   # Q(s, a) of the taken action
g_loss = tf.reduce_mean(tf.square(g_target_q_t - q_acted), name='g_loss')          # mean squared TD error
optim = tf.train.RMSPropOptimizer(learning_rate=var.learning_rate, momentum=0.95, epsilon=0.01).minimize(g_loss)
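To make sure I understand my own loss, here is a minimal NumPy sketch of what it computes (made-up numbers, not part of my graph): the one-hot mask picks out Q(s, a) for the taken action and the loss is the mean squared difference to the target.

import numpy as np

q_values = np.array([[1.0, 2.0, 0.5],    # hypothetical network output y, shape (batch, n_output)
                     [0.2, 0.1, 3.0]])
actions = np.array([1, 2])               # plays the role of g_action
targets = np.array([1.5, 2.5])           # plays the role of g_target_q_t

one_hot = np.eye(q_values.shape[1])[actions]    # same role as action_one_hot
q_acted = np.sum(q_values * one_hot, axis=1)    # Q(s, a) of the taken actions -> [2.0, 3.0]
loss = np.mean((targets - q_acted) ** 2)        # g_loss -> 0.25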
I have already changed the optimizer to:
optim = tf.train.AdamOptimizer(learning_rate=var.learning_rate).minimize(g_loss)
and this is how a sampled mini-batch is trained:
def q_learning_mini_batch(current_agent, current_sess):
    """ Training a sampled mini-batch """
    batch_s_t, batch_s_t_plus_1, batch_action, batch_reward = current_agent.memory.sample()

    if current_agent.double_q:  # double Q-learning
        pred_action = current_sess.run(g_q_action, feed_dict={x: batch_s_t_plus_1})
        q_t_plus_1 = current_sess.run(target_q_with_idx, {x_p: batch_s_t_plus_1, g_target_q_idx: [[idx, pred_a] for idx, pred_a in enumerate(pred_action)]})
        batch_target_q_t = current_agent.discount * q_t_plus_1 + batch_reward
    else:
        q_t_plus_1 = current_sess.run(y_p, {x_p: batch_s_t_plus_1})
        max_q_t_plus_1 = np.max(q_t_plus_1, axis=1)
        batch_target_q_t = current_agent.discount * max_q_t_plus_1 + batch_reward

    _, loss_val = current_sess.run([optim, g_loss], {g_target_q_t: batch_target_q_t, g_action: batch_action, x: batch_s_t})
    return loss_val
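To illustrate what the two branches above compute, here is a small NumPy sketch with made-up rewards and next-state Q-values (it is not taken from the repository):

import numpy as np

discount = 0.99
batch_reward = np.array([1.0, 0.0])
q_target = np.array([[1.0, 2.0, 0.5],    # hypothetical target-network Q-values for s_{t+1}
                     [0.2, 0.1, 3.0]])

# double Q branch: actions picked by the online network, values read from the target network
pred_action = np.array([1, 0])                                                # hypothetical argmax of the online network
double_q = discount * q_target[np.arange(len(pred_action)), pred_action] + batch_reward

# standard branch: max over the target network's own Q-values
standard_q = discount * np.max(q_target, axis=1) + batch_reward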
But the loss value becomes nan, and I also get a very large loss value that does not change at all.
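To narrow down where the nan comes from, I am thinking of adding a sanity check like the sketch below before the sess.run call (the helper name is mine, nothing here is from the repository):

import numpy as np

def assert_finite(name, arr):
    arr = np.asarray(arr, dtype=np.float64)
    if not np.all(np.isfinite(arr)):
        raise ValueError("{} contains nan/inf".format(name))

# e.g. inside q_learning_mini_batch, just before the training step:
# assert_finite("batch_s_t", batch_s_t)
# assert_finite("batch_reward", batch_reward)
# assert_finite("batch_target_q_t", batch_target_q_t)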
The code is here, and the appending is done in the get_state function in agent.py.
What could explain this nan value?