I've been working on the CartPole problem with a DQN for a while now, and I can't get the agent to learn the game. Over 100-ish episodes, the number of timesteps it manages to stay alive is:
[15, 31, 12, 14, 29, 25, 12, 31, 27, 29, 12, 16, 18, 17, 12, 34, 22, 17, 19, 33, 13, 13, 34, 13, 21, 14, 15, 23, 41, 9, 15, 25, 11, 30, 13, 16, 42, 10, 18, 54, 8, 12, 13, 23, 18, 14, 18, 14, 34, 10, 10, 10, 24, 24, 15, 51, 16, 31, 14, 12, 36, 21, 24, 14, 14, 15, 18, 13, 22, 17, 12, 14, 13, 18, 25, 17, 19, 28, 21, 14, 18, 15, 18, 11, 14, 10, 10, 38, 22, 19, 30, 17, 22, 16, 23, 10, 20, 21, 13, 27, 18, 13, 18, 16, 13, 65, 10, 29, 9, 34]
This pretty much means the agent isn't actually learning. I've tried tuning the learning rate and the epsilon decay rate, from 0.01 down to 0.0001, but I haven't seen much of a difference. I've also increased the number of episodes to 300-ish, and the agent still doesn't learn; it only ever survives for a short amount of time.
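To see what the epsilon schedule actually does over time with these settings, here is a throwaway check on the schedule itself (it uses the same exponential formula as the EpsilonGreedyStrategy class in the code below; the step counts are just example values, not taken from a run):

import math

eps_start, eps_end, eps_decay = 1.0, 0.01, 0.0001

def exploration_rate(step):
    # Same decay formula as EpsilonGreedyStrategy.get_exploration_rate below
    return eps_end + (eps_start - eps_end) * math.exp(-step * eps_decay)

for step in (0, 1000, 5000, 20000):
    print(step, round(exploration_rate(step), 3))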
Is there something wrong with the DQN code I've implemented? The code is below; any help is appreciated.
One interesting thing I found is that the computed loss decreased at first but then went back up, and I don't understand why. These are the loss values I recorded, one per episode:
[22.0753, 13.9927, 11.9891, 0.0746, 0.025, 0.0487, 0.0521, 0.0543, 0.057, 0.0687, 0.05, 0.0526, 4.845, 0.0493, 0.0299, 0.0397, 0.0423, 0.039, 0.0423, 0.2675, 0.0493, 0.0553, 0.0405, 0.0457, 0.05, 0.0423, 0.0462, 0.0493, 0.0555, 0.057, 0.0602, 0.098, 0.0698, 0.0727, 0.0853, 0.0866, 0.2486, 0.1247, 0.1271, 0.1313, 0.1358, 0.1621, 0.1721, 0.1783, 0.2131, 0.2257, 0.2675, 0.2827, 0.2955, 0.3042, 0.3574, 0.3952, 0.5354, 0.6258, 0.6349, 2.4508, 0.7996, 0.8465, 1.0056, 1.0829, 1.2275, 1.3019, 1.3446, 3.4537, 1.5911, 1.6456, 1.9513, 1.9767, 2.0278, 2.0476, 2.3613, 2.475, 3.039, 3.0158, 3.1077, 3.6984, 3.8358, 4.7687, 4.6146, 4.73, 5.5007, 5.607, 5.7745, 5.8513, 5.8884, 7.1544, 8.3076, 8.6733, 10.1566, 10.6312, 10.9324, 12.5568, 13.1688, 13.3174, 15.2715, 15.9004, 16.1051, 18.8363, 19.3386, 21.3502, 22.6002, 23.1654, 23.6404, 31.468, 32.1631, 35.1421, 36.6199, 32.6949]
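Each value in that list is the loss recorded at the end of an episode (see the loss_list logic at the bottom of the training loop below). A quick standalone snippet to visualise the dip and the rebound, with the list truncated here for brevity:

import matplotlib.pyplot as plt

# Truncated copy of the loss values printed above, one per episode
loss_values = [22.0753, 13.9927, 11.9891, 0.0746, 0.025, 0.0487]

plt.plot(range(len(loss_values)), loss_values)
plt.xlabel('Episode')
plt.ylabel('Loss at end of episode')
plt.show()

Full script: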
import gym
import time
import math
import random
from collections import namedtuple
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import matplotlib.pyplot as plt
class DQN(nn.Module):
    def __init__(self, observation_space=4):
        # CartPole's observation is 4 values, so the input layer takes 4 features;
        # the output layer gives one Q-value per action (left / right).
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(in_features=observation_space, out_features=24)
        self.fc2 = nn.Linear(in_features=24, out_features=24)
        self.out = nn.Linear(in_features=24, out_features=2)

    def forward(self, t):
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))
        t = self.out(t)
        return t
#Replay Memory
class ReplayMemory():
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.push_count = 0

    def push(self, experience):
        """
        Adds an experience to the overall memory. The buffer is FIFO once full.
        """
        if len(self.memory) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[self.push_count % self.capacity] = experience
        self.push_count += 1

    def sample(self, batch_size):
        """
        Samples 'batch_size' random experiences.
        """
        return random.sample(self.memory, batch_size)

    def can_provide_sample(self, batch_size):
        """
        Checks whether enough experiences are stored, i.e. the memory holds
        at least 'batch_size' entries.
        """
        return len(self.memory) >= batch_size
#Epsilon Greedy Strategy
class EpsilonGreedyStrategy():
    def __init__(self, start, end, decay):
        # Start, end and decay values for the exponential epsilon schedule
        self.start = start
        self.end = end
        self.decay = decay

    def get_exploration_rate(self, current_step):
        """
        Calculates the exploration rate.
        Args:
            current_step (int): The agent's current overall step count
        Returns:
            (float): The exploration rate
        """
        return self.end + (self.start - self.end) * math.exp(-1 * current_step * self.decay)
#Reinforcement Learning Agent
class Agent():
    def __init__(self, strategy, num_actions, device):
        self.current_step = 0
        self.strategy = strategy
        self.num_actions = num_actions
        self.device = device

    def select_action(self, state, policy_net):
        """
        Selects the action to be taken.
        Args:
            state (tensor): The state of the game
            policy_net (DQN): The policy network
        Returns:
            (int): The action to take
        """
        rate = self.strategy.get_exploration_rate(self.current_step)
        self.current_step += 1
        if rate > random.random():
            return random.randrange(self.num_actions)  # explore
        else:
            with torch.no_grad():
                return policy_net(state).argmax(dim=0).item()  # exploit
class QValues():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @staticmethod
    def action_tuple_to_tensor(action):
        """
        Converts the action from a tuple to a tensor.
        (Currently unused and unfinished.)
        Args: action (tuple)
        Returns: action (tensor)
        """
        action_tensor = torch.tensor([1, 0, 0, 0])
        return action_tensor

    @staticmethod
    def get_current(policy_net, states, actions):
        """
        Get the current Q value.
        Args:
            policy_net (DQN): The policy_net
            states (tensor): The states of the game
            actions (int): The action taken from the current state of the game
        Returns:
            (tensor): Output nodes of the policy_net
        """
        action_tensor = torch.tensor([0, 0, 0, 0])
        action_tensor[actions] = actions
        return policy_net(states).gather(dim=0, index=action_tensor)

    @staticmethod
    def get_next(target_net, next_states, actions):
        """
        Get the future Q value.
        Args:
            target_net (DQN): The target_net
            next_states (tensor): The future states of the game
            actions (int): The action taken from the future state of the game
        Returns:
            (tensor): Output nodes of the target_net
        """
        action_tensor = torch.tensor([0, 0, 0, 0])
        action_tensor[actions] = actions
        return target_net(next_states).gather(dim=0, index=action_tensor)
def plot(episode_count_list, step_count_list):
    """
    Plots the number of steps survived per episode.
    Args:
        episode_count_list (List): A list of episode counts
        step_count_list (List): A list of step counts for each episode
    """
    plt.plot(episode_count_list, step_count_list)
    plt.ylabel('Number of steps')
    plt.axis([0, 10, 0, 200])
    plt.show()

Experience = namedtuple(
    'Experience',
    ('state', 'action', 'next_state', 'reward')
)
if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env.reset()
    batch_size = 50
    gamma = 0.999
    eps_start = 1
    eps_end = 0.01
    eps_decay = 0.0001
    target_update = 50  # How often the target network copies the policy network's weights
    memory_size = 10000
    lr = 0.0001
    num_episodes = 300
    num_actions_available = 2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    policy_net = DQN().to(device)
    target_net = DQN().to(device)
    target_net.load_state_dict(policy_net.state_dict())  # Copy the policy net's weights into the target net
    target_net.eval()
    optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)
    strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)
    agent = Agent(strategy, num_actions_available, device)
    replay_memory = ReplayMemory(memory_size)

    step_count_list = []
    loss = None
    loss_list = []

    for i in range(num_episodes):
        observation = env.reset()
        for t in range(200):
            env.render()
            state = torch.from_numpy(observation).float()
            action = agent.select_action(state, policy_net)
            observation, reward, done, info = env.step(action)  # take the selected action
            # Scale the reward: negative when the episode ends, positive otherwise
            if done:
                reward = -reward * 5
            else:
                reward = reward * 5
            replay_memory.push(Experience(state, action, torch.from_numpy(observation).float(), reward))

            if replay_memory.can_provide_sample(batch_size):
                replay_memory.sample(batch_size)
                for j in range(batch_size):
                    current_q_values = QValues.get_current(policy_net, replay_memory.memory[j].state, replay_memory.memory[j].action)
                    next_q_values = QValues.get_next(target_net, replay_memory.memory[j].next_state, replay_memory.memory[j].action)
                    target_q_values = (next_q_values * gamma) + replay_memory.memory[j].reward
                    loss = F.mse_loss(current_q_values.unsqueeze(1), target_q_values.unsqueeze(1))
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            if done:
                if loss: loss_list.append(round(float(loss), 4))
                print("Episode {} finished after {} timesteps".format(i, t + 1))
                step_count_list.append(t)
                break

        if len(replay_memory.memory) % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())

    env.close()
    print(step_count_list)
    print(loss_list)
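For completeness, the plot helper defined above isn't actually called anywhere in the script yet; calling it on the collected data would look roughly like this (the episode index list is just built on the spot, and the hard-coded axis limits in plot would probably need adjusting for longer runs):

    # After the training loop: plot steps survived per episode
    episode_numbers = list(range(len(step_count_list)))
    plot(episode_numbers, step_count_list)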