"No gradients provided for any variable" - graph disconnection?
0 votes
March 11, 2020

I am trying to convert this PyTorch script to TensorFlow 2, but when the gradients are computed I get the following error: "No gradients provided for any variable". The variables are watched by the GradientTape, so I assume there must be a disconnection somewhere in my graph, but I cannot find where it is. Here is a minimal reproducible example:

import numpy as np
import random
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.layers import Dense, Input
import gym

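# Two-headed actor-critic network: one head outputs action probabilities, the other
# per-action Q-values; the state value is the probability-weighted sum of the Q-values.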
class ActorCritic(tf.keras.Model):
  def __init__(self, n_actions):
    super(ActorCritic, self).__init__()
    self.hiddenLogits = Dense(256, activation='tanh')
    self.hiddenValue = Dense(256, activation='tanh')
    self.outputLogits = Dense(n_actions, activation='softmax')
    self.outputValue = Dense(n_actions, activation='linear')

  def call(self, inputs):
    x = tf.convert_to_tensor(inputs)
    logits = self.hiddenLogits(x)
    value = self.hiddenValue(x)
    probas = self.outputLogits(logits)
    qvalues = self.outputValue(value)
    value = tf.reduce_sum(probas*qvalues, axis=-1)
    return tf.squeeze(probas), tf.squeeze(qvalues), value


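# ACER-style loss over an n-step rollout: truncated importance sampling with a bias
# correction term, an entropy bonus, and a retrace target for the critic. Uses the
# globals `model` and `optimizer` defined below and applies a single optimizer step.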
def compute_acer_loss(policies, q_values, values, actions, rewards, retrace, masks, behavior_policies, gamma=0.99, truncation_clip=10, entropy_weight=0.0001):
  loss = 0
  var_list = model.trainable_variables
  with tf.GradientTape() as tape:
    tape.watch(var_list)
    for step in reversed(range(len(rewards))):

      importance_weight = tf.stop_gradient(policies[step]) / tf.stop_gradient(behavior_policies[step])

      retrace = rewards[step] + gamma * retrace * masks[step]
      advantage = retrace - values[step]

      log_policy_action = tf.math.log(tf.gather(policies[step], actions[step], axis=1))
      truncated_importance_weight = tf.clip_by_value(tf.gather(importance_weight, actions[step], axis=1), -float('inf'), truncation_clip)

      actor_loss = -tf.reduce_mean(truncated_importance_weight * log_policy_action * tf.stop_gradient(advantage))

      correction_weight = tf.clip_by_value(1 - truncation_clip / importance_weight, 0, float('inf'))
      actor_loss -= tf.reduce_mean(tf.reduce_sum(correction_weight * tf.math.log(policies[step]) * tf.stop_gradient(q_values[step] - values[step]), axis=0))

      entropy = entropy_weight * -tf.reduce_mean(tf.reduce_sum(tf.math.log(policies[step])* policies[step], axis=0))

      q_value = tf.gather(q_values[step], actions[step], axis=1)
      critic_loss = tf.reduce_mean((retrace - q_value) ** 2, axis=0)

      truncated_rho = tf.clip_by_value(tf.gather(importance_weight, actions[step], axis=1), -float('inf'), 1)
      retrace = truncated_rho * (retrace - tf.stop_gradient(q_value)) + tf.stop_gradient(values[step])

      loss += actor_loss + critic_loss - entropy

  grads = tape.gradient(loss, var_list)
  optimizer.apply_gradients(zip(grads, var_list))


env = gym.make("CartPole-v0")
model = ActorCritic(env.action_space.n)
optimizer = tf.keras.optimizers.Adam()

frame_idx    = 0
max_frames   = 10000
num_steps    = 5
state = env.reset()

while frame_idx < max_frames:

  q_values = []
  values   = []
  policies = []
  actions  = []
  rewards  = []
  masks    = []

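  # Collect an n-step rollout with the current policy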
  for step in range(num_steps):
    policy, q_value, value = model(state[None,:])

    # Sample an action; argmax converts the one-hot Multinomial sample to an action index
    action = int(np.argmax(tfp.distributions.Multinomial(1, probs=policy).sample().numpy()))
    next_state, reward, done, _ = env.step(action)
    mask = 1 - done


    q_values.append(q_value)
    policies.append(policy)
    actions.append(action)
    rewards.append(reward)
    values.append(value)
    masks.append(mask)

    state = next_state
    if done:
      state = env.reset()


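  # Bootstrap the retrace estimate with the value of the last state reached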
  _, _, retrace = model(state[None,:])
  retrace = tf.stop_gradient(retrace)
  compute_acer_loss(policies, q_values, values, actions, rewards, retrace, masks, policies)

  frame_idx += num_steps
...
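
For reference, here is a minimal self-contained sketch (a toy model toy_model and random data, not taken from the script above) showing that tf.GradientTape only records the operations executed inside its context: when the forward pass runs outside the tape, tape.gradient returns None for every variable, even if the variables are explicitly watched.

import tensorflow as tf

# Toy model and data, purely illustrative
toy_model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
x = tf.random.normal((4, 3))
y = tf.random.normal((4, 1))

# Forward pass OUTSIDE the tape: no op that uses the model's variables is
# recorded, so every gradient is None and apply_gradients would raise
# "No gradients provided for any variable".
pred = toy_model(x)
with tf.GradientTape() as tape:
  loss = tf.reduce_mean((pred - y) ** 2)
print(tape.gradient(loss, toy_model.trainable_variables))  # [None, None]

# Forward pass INSIDE the tape: the ops are recorded and real gradients flow.
with tf.GradientTape() as tape:
  pred = toy_model(x)
  loss = tf.reduce_mean((pred - y) ** 2)
print(tape.gradient(loss, toy_model.trainable_variables))  # actual tensors

In the second block the tape records the computation performed inside toy_model(x), which is what connects the loss back to the layer's kernel and bias.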