I have an Actor-Critic TD3 nn model with an LSTM in my AI. For every training step I build batches of sequential data and train my AI on them.
Could any expert please advise whether I need epochs for this AI, and in general how many epochs I could use? Since this code already creates many batches within a single training step, is it even possible to have epochs here? (A sketch of what I mean by an epoch loop is included after the code.)
Below is the code of the training step:
def train(
    self,
    replay_buffer,
    iterations,
    batch_size=50,
    discount=0.99,
    tau=0.005,
    policy_noise=0.2,
    noise_clip=0.5,
    policy_freq=2,
):
    # Buffers that accumulate one sequence per iteration along the batch dimension
    b_state = torch.Tensor([])
    b_next_state = torch.Tensor([])
    b_done = torch.Tensor([])
    b_reward = torch.Tensor([])
    b_action = torch.Tensor([])
    for it in range(iterations):
        # print('it: ', it, ' iterations: ', iterations)
        # Step 4: We sample a batch of transitions (s, s', a, r) from the memory
        (batch_states, batch_next_states, batch_actions,
         batch_rewards, batch_dones) = \
            replay_buffer.sample(batch_size)
        batch_states = batch_states.astype(float)
        batch_next_states = batch_next_states.astype(float)
        batch_actions = batch_actions.astype(float)
        batch_rewards = batch_rewards.astype(float)
        batch_dones = batch_dones.astype(float)
        state = torch.from_numpy(batch_states)
        next_state = torch.from_numpy(batch_next_states)
        action = torch.from_numpy(batch_actions)
        reward = torch.from_numpy(batch_rewards)
        done = torch.from_numpy(batch_dones)
        b_size = 1
        seq_len = state.shape[0]
        batch = b_size
        input_size = state_dim
        # Reshape each sampled batch into one sequence of length seq_len,
        # batch_first layout: (batch, seq_len, feature)
        state = torch.reshape(state, (1, seq_len, state_dim))
        next_state = torch.reshape(next_state, (1, seq_len, state_dim))
        done = torch.reshape(done, (1, seq_len, 1))
        reward = torch.reshape(reward, (1, seq_len, 1))
        action = torch.reshape(action, (1, seq_len, action_dim))
        # Stack the sequences along the batch dimension
        b_state = torch.cat((b_state, state), dim=0)
        b_next_state = torch.cat((b_next_state, next_state), dim=0)
        b_done = torch.cat((b_done, done), dim=0)
        b_reward = torch.cat((b_reward, reward), dim=0)
        b_action = torch.cat((b_action, action), dim=0)
        # Alternative layout (seq_len, batch, feature), kept for reference:
        # state = torch.reshape(state, (seq_len, 1, state_dim))
        # next_state = torch.reshape(next_state, (seq_len, 1, state_dim))
        # done = torch.reshape(done, (seq_len, 1, 1))
        # reward = torch.reshape(reward, (seq_len, 1, 1))
        # action = torch.reshape(action, (seq_len, 1, action_dim))
        # b_state = torch.cat((b_state, state), dim=1)
        # b_next_state = torch.cat((b_next_state, next_state), dim=1)
        # b_done = torch.cat((b_done, done), dim=1)
        # b_reward = torch.cat((b_reward, reward), dim=1)
        # b_action = torch.cat((b_action, action), dim=1)
print("dim state:",b_state.shape)
# for h and c shape (num_layers * num_directions, batch, hidden_size)
ha0 = torch.zeros(lstm_layers, b_state.shape[0], state_dim)
ca0 = torch.zeros(lstm_layers, b_state.shape[0], state_dim)
hc0 = torch.zeros(lstm_layers, b_state.shape[0], state_dim + action_dim)
cc0 = torch.zeros(lstm_layers, b_state.shape[0], state_dim + action_dim)
    # Step 5: From the next state s', the Actor target plays the next action a'
    b_next_action = self.actor_target(b_next_state, (ha0, ca0))
    b_next_action = b_next_action[0]
    # Step 6: We add Gaussian noise to this next action a' and clamp it into
    # the range of values supported by the environment
    noise = torch.Tensor(b_next_action).data.normal_(0, policy_noise)
    noise = noise.clamp(-noise_clip, noise_clip)
    b_next_action = (b_next_action + noise).clamp(-self.max_action, self.max_action)
    # Step 7: The two Critic targets each take the pair (s', a') as input and
    # return two Q-values Qt1(s', a') and Qt2(s', a')
    result = self.critic_target(b_next_state, b_next_action, (hc0, cc0))
    target_Q1 = result[0]
    target_Q2 = result[1]
    # Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
    target_Q = torch.min(target_Q1, target_Q2).double()
    # Step 9: We get the final target of the two Critic models:
    # Qt = r + γ * min(Qt1, Qt2), where γ is the discount factor.
    # detach() keeps gradients from flowing back into the target networks.
    target_Q = b_reward + ((1 - b_done) * discount * target_Q).detach()
    # Step 10: The two Critic models each take the pair (s, a) as input and
    # return two Q-values Q1(s, a) and Q2(s, a)
    b_action_reshape = torch.reshape(b_action, b_next_action.shape)
    result = self.critic(b_state, b_action_reshape, (hc0, cc0))
    current_Q1 = result[0]
    current_Q2 = result[1]
    # Step 11: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
    critic_loss = F.mse_loss(current_Q1, target_Q) \
        + F.mse_loss(current_Q2, target_Q)
    # Step 12: We backpropagate this Critic loss and update the parameters of
    # the two Critic models with the SGD optimizer
    self.critic_optimizer.zero_grad()
    critic_loss.backward()
    self.critic_optimizer.step()
    # Step 13: We update our Actor model by performing gradient ascent on the
    # output of the first Critic model (in the original TD3 this happens only
    # once every policy_freq iterations; here it runs on every call)
    out = self.actor(b_state, (ha0, ca0))
    out = out[0]
    (actor_loss, hx, cx) = self.critic.Q1(b_state, out, (hc0, cc0))
    actor_loss = -1 * actor_loss.mean()
    self.actor_optimizer.zero_grad()
    actor_loss.backward()
    self.actor_optimizer.step()
    # Step 14: We update the weights of the Actor target by Polyak averaging
    for (param, target_param) in zip(self.actor.parameters(),
                                     self.actor_target.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
    # Step 15: We update the weights of the Critic target by Polyak averaging
    for (param, target_param) in zip(self.critic.parameters(),
                                     self.critic_target.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
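For clarity, this is roughly what I mean by "epochs": an outer loop that calls the train() method above several times on the experience collected so far. It is only a sketch of the idea; agent, n_epochs and iterations_per_epoch are placeholder names for my setup, not identifiers from the code above.

# Hypothetical epoch loop (sketch only; names are placeholders)
n_epochs = 10              # number of passes over the stored experience -- the value I am unsure about
iterations_per_epoch = 20  # sequences concatenated into one batch per train() call

for epoch in range(n_epochs):
    agent.train(
        replay_buffer,
        iterations=iterations_per_epoch,
        batch_size=50,
    )

Would repeating train() like this count as epochs, or does sampling fresh batches from the replay buffer on every call already play that role?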