I implemented a simple actor-critic model in TensorFlow 2 to learn the CartPole environment, but it is not learning at all. The average score over every 25 episodes stays below 20. Can anyone point out why the model is not learning?
import numpy as np
import random
from collections import namedtuple, deque, defaultdict
import math
import gym
import tensorflow as tf
from tensorflow.keras import Input, layers, Model
import tensorflow_probability as tfp
env_name = 'CartPole-v0'
env = gym.make(env_name)
input_dims = env.observation_space.shape
output_dims = env.action_space.n
learning_rate = 0.0001
inputs = layers.Input(shape=input_dims)
common = layers.Dense(128, activation=tf.nn.relu)(inputs)
policy_head = layers.Dense(output_dims, activation=tf.nn.softmax)(common) #Actor
value_head = layers.Dense(1)(common) #Critic
model = tf.keras.Model(inputs=inputs, outputs=[policy_head, value_head])
opt = tf.keras.optimizers.Adam(learning_rate = learning_rate)
SavedAction = namedtuple('SavedAction',('log_prob','state_value'))
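# (Not part of my original script: a quick sanity check that the two heads
#  return the expected shapes for a single dummy CartPole observation.)
dummy_probs, dummy_value = model(tf.zeros((1,) + input_dims))
print(dummy_probs.shape, dummy_value.shape)  # expected: (1, 2) and (1, 1)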
def choose_action(obs):
    # Add a batch dimension, run the network, and sample an action from the policy head
    obs = tf.expand_dims(tf.convert_to_tensor(obs), axis=0)
    probs, state_value = model(obs)
    m = tfp.distributions.Categorical(probs=probs)
    action = m.sample()
    return action.numpy()[0], SavedAction(m.log_prob(action), state_value)
def get_loss(saved_actions, rewards, gamma):
    # Compute discounted returns by iterating over the rewards backwards
    returns = []
    R = 0
    for r in rewards[::-1]:
        R = r + gamma * R
        returns.insert(0, R)
    policy_losses, value_losses = [], []
    for (log_prob, value), R in zip(saved_actions, returns):
        advantage = R - value
        # Actor loss: negative log-probability weighted by the advantage
        policy_losses.append(-log_prob * advantage)
        # Critic loss: squared error between the return and the predicted state value
        value_losses.append(tf.keras.losses.mean_squared_error(R, value))
    p = sum(policy_losses)
    v = sum(value_losses)
    loss = p + v
    return loss
def train(n_episodes=1500, gamma=1.0, update_every=25):
    gradient_tape = tf.GradientTape()
    scores = []
    scores_window = deque(maxlen=update_every)
    for episode in range(n_episodes):
        rewards = []
        saved_actions = []
        obs = env.reset()
        done = False
        gradient_tape.__enter__()  # Initialize gradient tape at the start of each episode
        while not done:
            action, saved_action = choose_action(obs)
            new_obs, reward, done, _ = env.step(action)
            rewards.append(reward)
            saved_actions.append(saved_action)
            obs = new_obs
        loss = get_loss(saved_actions, rewards, gamma)
        gradients = gradient_tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(gradients, model.trainable_variables))
        gradient_tape.__exit__(None, None, None)  # Exit gradient tape at the end of each episode
        scores.append(sum(rewards))
        scores_window.append(sum(rewards))
        if episode % update_every == 0:
            print("episode {} AvgScore: {}".format(episode, np.mean(scores_window)))
    return scores
scores = train(n_episodes=500)
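For reference, here is a small standalone snippet (separate from the training code above, with made-up reward values) that shows what the discounted-return loop inside get_loss produces:

example_rewards = [1.0, 1.0, 1.0]   # made-up rewards, just for illustration
example_gamma = 0.99
example_returns = []
R = 0
for r in example_rewards[::-1]:
    R = r + example_gamma * R       # same recursion as in get_loss
    example_returns.insert(0, R)
print(example_returns)              # [2.9701, 1.99, 1.0]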