Эта реализация PPO где-то содержит ошибку, и я не могу понять, что не так.Сеть возвращает нормальное распределение и оценку стоимости от критика.Последний слой актера предоставляет четыре значения действия F.tanh
ed, которые используются в качестве среднего значения для распределения.nn.Parameter(torch.zeros(action_dim))
является стандартным отклонением.
Траектории для 20 параллельных агентов добавляются в одну и ту же память.Длина эпизода равна 1000, и memory.sample()
возвращает np.random.permutation
записей памяти по 20 КБ в виде тензоров с пакетами размером 64. Перед суммированием тензоров пакетов значения сохраняются как (1, -1) тензоров в collection.deque
с.Возвращенные тензоры: detach()
ed.
среда
brain_name = envs.brain_names[0]
env_info = envs.reset(train_mode=True)[brain_name]
env_info = envs.step(actions.cpu().detach().numpy())[brain_name]
next_states = env_info.vector_observations
rewards = env_info.rewards
dones = env_info.local_done
шаг обновления
def clipped_surrogate_update(policy, memory, num_epochs=10, clip_param=0.2, gradient_clip=5, beta=0.001, value_loss_coeff=0.5):
advantages_batch, states_batch, log_probs_old_batch, returns_batch, actions_batch = memory.sample()
advantages_batch = (advantages_batch - advantages_batch.mean()) / advantages_batch.std()
for _ in range(num_epochs):
for i in range(len(advantages_batch)):
advantages_sample = advantages_batch[i]
states_sample = states_batch[i]
log_probs_old_sample = log_probs_old_batch[i]
returns_sample = returns_batch[i]
actions_sample = actions_batch[i]
dist, values = policy(states_sample)
log_probs_new = dist.log_prob(actions_sample.to(device)).sum(-1).unsqueeze(-1)
entropy = dist.entropy().sum(-1).unsqueeze(-1).mean()
ratio = (log_probs_new - log_probs_old_sample).exp()
clipped_ratio = torch.clamp(ratio, 1-clip_param, 1+clip_param)
clipped_surrogate_loss = -torch.min(ratio*advantages_sample, clipped_ratio*advantages_sample).mean()
value_function_loss = (returns_sample - values).pow(2).mean()
Loss = clipped_surrogate_loss - beta * entropy + value_loss_coeff * value_function_loss
optimizer_policy.zero_grad()
Loss.backward()
torch.nn.utils.clip_grad_norm_(policy.parameters(), gradient_clip)
optimizer_policy.step()
del Loss
выборка данных
def collect_trajectories(envs, env_info, policy, memory, tmax=200, nrand=0, gae_tau = 0.95, discount = 0.995):
next_episode = False
states = env_info.vector_observations
n_agents = len(env_info.agents)
state_list=[]
reward_list=[]
prob_list=[]
action_list=[]
value_list=[]
if nrand > 0:
# perform nrand random steps
for _ in range(nrand):
actions = np.random.randn(num_agents, action_size)
actions = np.clip(actions, -1, 1)
env_info = envs.step(actions)[brain_name]
states = env_info.vector_observations
for t in range(tmax):
states = torch.FloatTensor(states).to(device)
dist, values = policy(states)
actions = dist.sample()
probs = dist.log_prob(actions).sum(-1).unsqueeze(-1)
env_info = envs.step(actions.cpu().detach().numpy())[brain_name]
next_states = env_info.vector_observations
rewards = env_info.rewards
dones = env_info.local_done
state_list.append(states)
reward_list.append(rewards)
prob_list.append(probs)
action_list.append(actions)
value_list.append(values)
states = next_states
if np.any(dones):
next_episode = True
break
_, next_value = policy(torch.FloatTensor(states).to(device))
reward_arr = np.array(reward_list)
undiscounted_rewards = np.sum(reward_arr, axis=0)
state_arr = torch.stack(state_list)
prob_arr = torch.stack(prob_list)
action_arr = torch.stack(action_list)
value_arr = torch.stack(value_list)
reward_arr = torch.FloatTensor(reward_arr[:, :, np.newaxis])
advantage_list = []
return_list = []
returns = next_value.detach()
advantages = torch.FloatTensor(np.zeros((n_agents, 1)))
for i in reversed(range(state_arr.shape[0])):
returns = reward_arr[i] + discount * returns
td_error = reward_arr[i] + discount * next_value - value_arr[i]
advantages = advantages * gae_tau * discount + td_error
next_value = value_arr[i]
advantage_list.append(advantages.detach())
return_list.append(returns.detach())
advantage_arr = torch.stack(advantage_list)
return_arr = torch.stack(return_list)
for i in range(state_arr.shape[0]):
memory.add({'advantages': advantage_arr[i],
'states': state_arr[i],
'log_probs_old': prob_arr[i],
'returns': return_arr[i],
'actions': action_arr[i]})
return undiscounted_rewards, next_episode