OpenAI Gym CartPole agent doesn't seem to be learning; the loss appears to increase
0 votes
/ 30 March 2020

I've been working on the CartPole problem with a DQN for a while now, and I can't get the agent to learn the game. Over the first 100-ish episodes, the number of timesteps it manages to stay alive for is listed below (a quick trend check follows right after the list):

[15, 31, 12, 14, 29, 25, 12, 31, 27, 29, 12, 16, 18, 17, 12, 34, 22, 17, 19, 33, 13, 13, 34, 13, 21, 14, 15, 23, 41, 9, 15, 25, 11, 30, 13, 16, 42, 10, 18, 54, 8, 12, 13, 23, 18, 14, 18, 14, 34, 10, 10, 10, 24, 24, 15, 51, 16, 31, 14, 12, 36, 21, 24, 14, 14, 15, 18, 13, 22, 17, 12, 14, 13, 18, 25, 17, 19, 28, 21, 14, 18, 15, 18, 11, 14, 10, 10, 38, 22, 19, 30, 17, 22, 16, 23, 10, 20, 21, 13, 27, 18, 13, 18, 16, 13, 65, 10, 29, 9, 34]
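
Just to illustrate the point: a simple moving average over these counts hovers roughly around 20 from start to finish, with no upward trend. Here is a small standalone snippet (not part of the training script) that shows this:

# quick trend check on the per-episode step counts printed above
# (standalone illustration only, not part of the training code)
step_counts = [15, 31, 12, 14, 29, 25, 12, 31, 27, 29]  # paste the full list from above here

def moving_average(values, window=10):
    # average over a sliding window to smooth out the episode-to-episode noise
    return [sum(values[i:i + window]) / window
            for i in range(len(values) - window + 1)]

print(moving_average(step_counts, window=10))
# a learning agent should push these averages upward over time;
# here they stay roughly flat around 20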

This kind of suggests that the agent isn't really learning. I've tuned the learning rate and the epsilon decay rate from 0.01 to 0.0001, but I haven't seen much of a difference. I've also increased the number of episodes to 300-ish, but the agent still doesn't learn; it only ever seems to survive for a short amount of time.
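
For context, the exploration schedule in my code is end + (start - end) * exp(-decay * step) (see get_exploration_rate below). Here is a small standalone check, not part of the training script, of what the two decay values actually look like over the global step count:

import math

def exploration_rate(step, start=1.0, end=0.01, decay=0.0001):
    # same formula as EpsilonGreedyStrategy.get_exploration_rate below
    return end + (start - end) * math.exp(-step * decay)

for step in [0, 1000, 2000, 10000, 50000]:
    print(step,
          round(exploration_rate(step, decay=0.0001), 3),
          round(exploration_rate(step, decay=0.01), 3))
# with decay=0.0001 epsilon is still ~0.82 after 2000 global steps
# (roughly 100 of these short episodes), so the agent is mostly exploring;
# with decay=0.01 it has already collapsed to ~0.01 by then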

Is there something wrong with the code I've implemented for the DQN? The code is below; any help is appreciated.

One interesting thing I noticed is that the computed loss decreased at first but then rose again. I don't understand why that is. The loss values are listed below, followed by a condensed version of how each loss value is computed.

[22.0753, 13.9927, 11.9891, 0.0746, 0.025, 0.0487, 0.0521, 0.0543, 0.057, 0.0687, 0.05, 0.0526, 4.845, 0.0493, 0.0299, 0.0397, 0.0423, 0.039, 0.0423, 0.2675, 0.0493, 0.0553, 0.0405, 0.0457, 0.05, 0.0423, 0.0462, 0.0493, 0.0555, 0.057, 0.0602, 0.098, 0.0698, 0.0727, 0.0853, 0.0866, 0.2486, 0.1247, 0.1271, 0.1313, 0.1358, 0.1621, 0.1721, 0.1783, 0.2131, 0.2257, 0.2675, 0.2827, 0.2955, 0.3042, 0.3574, 0.3952, 0.5354, 0.6258, 0.6349, 2.4508, 0.7996, 0.8465, 1.0056, 1.0829, 1.2275, 1.3019, 1.3446, 3.4537, 1.5911, 1.6456, 1.9513, 1.9767, 2.0278, 2.0476, 2.3613, 2.475, 3.039, 3.0158, 3.1077, 3.6984, 3.8358, 4.7687, 4.6146, 4.73, 5.5007, 5.607, 5.7745, 5.8513, 5.8884, 7.1544, 8.3076, 8.6733, 10.1566, 10.6312, 10.9324, 12.5568, 13.1688, 13.3174, 15.2715, 15.9004, 16.1051, 18.8363, 19.3386, 21.3502, 22.6002, 23.1654, 23.6404, 31.468, 32.1631, 35.1421, 36.6199, 32.6949]
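
For reference, each of those numbers is the most recent per-transition loss at the end of an episode; condensed into one function, that loss is computed like this (it relies on the QValues helpers and the Experience namedtuple from the full script below, so it is just a restatement, not new logic):

import torch.nn.functional as F

def transition_loss(policy_net, target_net, experience, gamma=0.999):
    # same computation as inside the training loop below, pulled out for readability
    current_q = QValues.get_current(policy_net, experience.state, experience.action)
    next_q = QValues.get_next(target_net, experience.next_state, experience.action)
    target_q = (next_q * gamma) + experience.reward
    return F.mse_loss(current_q.unsqueeze(1), target_q.unsqueeze(1))

The full training script follows.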
import gym
import time
import math
import random
from collections import namedtuple
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import matplotlib.pyplot as plt

class DQN(nn.Module):
#     def __init__(self, img_height, img_width):
    # Food and snake coordinates would be fed into the network, as the board height and width are already established
    def __init__(self, observation_space=4):
        super(DQN,self).__init__()
        self.fc1 = nn.Linear(in_features=observation_space, out_features=24)
        # +1 because a turn would require another input node
        self.fc2 = nn.Linear(in_features=24, out_features=24)
        self.out = nn.Linear(in_features=24, out_features=2)

    def forward(self, t):
        #t = t.flatten(start_dim=1)
        t = F.relu(self.fc1(t))
        # print("fc1",t)
        t = F.relu(self.fc2(t))
        # print("fc2",t)
        t = self.out(t)
        # print("out",t)
        return t

#Replay Memory
class ReplayMemory():
    def __init__(self,capacity):
        self.capacity = capacity
        self.memory = []
        self.push_count = 0

    def push(self, experience):
        """
        Adds expeirence to the oveall memory. Memory buffer follows FIFO when full
        """
        if len(self.memory) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[self.push_count % self.capacity] = experience
        self.push_count += 1

    def sample(self, batch_size):
        """
        Samples 'batch_size' random experiences from the stored memory
        """
        return random.sample(self.memory, batch_size)

    def can_provide_sample(self, batch_size):
        """
        Checks if we have enough memory stored. If the size of the memory is greater
        than batch_size, then there are enough stored experiences.
        """
        return len(self.memory) >= batch_size

#Epsilon Greedy Strategy
class EpsilonGreedyStrategy():
    def __init__(self,start,end,decay):
        #Start, end and decay values
        self.start = start
        self.end = end
        self.decay = decay

    def get_exploration_rate(self, current_step):
        """
        Calculates the exploration rate

        Args:
            current_step (int): The global step count (never reset between episodes)
        Returns:
            (float): The exploration rate
        """
        return self.end + (self.start - self.end)*math.exp(-1*current_step*self.decay)

#Reinforcement Learning Agent
class Agent():
    def __init__(self, strategy, num_actions, device):
        self.current_step = 0
        self.strategy = strategy
        self.num_actions = num_actions
        self.device = device

    def select_action(self, state, policy_net):
        """
        Selects the action to be taken

        Args:
            state (tensor): The state of the game
            policy_net (DQN): The policy network of the NN

        Returns:
            (int): The action to take
        """
        rate = self.strategy.get_exploration_rate(self.current_step)
        self.current_step += 1

        if rate > random.random():
            # print("Exploring")
            return random.randrange(self.num_actions) #explore
        else:
            with torch.no_grad():
                print("Exploiting")
                # print("policy_net",state, type(state), torch.from_numpy(state))
                # print("policy_net", policy_net(state.float()))
                return policy_net(state).argmax(dim=0).item() #exploit

class QValues():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @staticmethod
    def action_tuple_to_tensor(action):
        """
        Converts the action from a tuple to a tensor
        Args: action(tuple)
        Return: action(tensor)
        """
        # note: placeholder implementation; this helper is never called below
        action_tensor = torch.tensor([1, 0, 0, 0])
        return action_tensor

    @staticmethod
    def get_current(policy_net, states, actions):
        """
        Get the current Q value.
        Args:
            policy_net (DQN): The policy_net
            states (tensor): The states of the game
            actions (int): The action to be taken from the current state of the game

        Returns:
            (tensor): Output nodes of the policy_net
        """
        # print(policy_net(states[0]))
        action_tensor = torch.tensor([0,0,0,0])
        action_tensor[actions] = actions
        # print("states",states, type(states),torch.from_numpy(state))
        return policy_net(states).gather(dim=0,index=action_tensor)

    @staticmethod
    def get_next(target_net, next_states, actions):
        """
        Get the future Q value.
        Args:
            target_net (DQN): The target_net
            next_states (tensor): The future states of the game
            actions (int): The action to be taken from the future state of the game

        Returns:
            (tensor): Output nodes of the target_net
        """
        #final_state_locations = next_states.flatten(start_dim=1)
        action_tensor = torch.tensor([0,0,0,0])
        action_tensor[actions] = actions
        return target_net(next_states).gather(dim=0,index=action_tensor)

#         final_state_locations = next_states.flatten(start_dim=1).max(dim=1)[0].eq(0).type(torch.bool)
#         non_final_state_locations = (final_state_locations == False)
#         batch_size = next_states.shape[0]
#         values = torch.zeros(batch_size).to(QValues.device)
#         values[non_final_state_locations] = target_net(non_final_states).max(dim=1)[0].detach()
#         return values

def plot(episode_count_list, step_count_list):
    """
    Plots the graph of episode_count against step_count
    Args:
        episode_count_list (List): A list of episode counts
        step_count_list (List): A list of step counts for each episode
    """
    plt.plot(episode_count_list, step_count_list)
    plt.ylabel('Number of steps')
    plt.axis([0, 10, 0, 200])
    plt.show()

Experience = namedtuple(
    'Experience',
    ('state','action','next_state','reward')
)

if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env.reset()

    batch_size = 50
    gamma = 0.999
    eps_start = 1
    eps_end = 0.01
    eps_decay = 0.0001
    target_update = 50 #How often the target network will update from policy network's weights
    memory_size = 10000
    lr = 0.0001
    num_episodes = 300
    num_actions_available = 2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    policy_net = DQN().to(device)
    target_net = DQN().to(device)
    target_net.load_state_dict(policy_net.state_dict()) # Copies the policy net's weights into the target net
    target_net.eval()
    optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)
    strategy = EpsilonGreedyStrategy(eps_start,eps_end, eps_decay)
    agent = Agent(strategy,num_actions_available,device)
    replay_memory = ReplayMemory(memory_size)

    step_count_list = []
    loss = None
    loss_list = []

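    # main training loop: one episode per iteration, each capped at 200 timesteps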
    for i in range(num_episodes):
        observation = env.reset()
        for t in range(200):
            env.render()
            # action = env.action_space.sample()
            state = torch.from_numpy(observation).float()
            action = agent.select_action(state,policy_net)
            observation, reward, done, info = env.step(action) # take the action chosen by the agent
            if done:
                reward = -reward*5
            else:
                reward = reward*5
                # print("observation", observation)
                # print("reward", reward)
                # print("info", info)
                replay_memory.push(Experience(state, action, torch.from_numpy(observation).float(), reward))
                # print(replay_memory.memory[0].state,len(replay_memory.memory))
            if replay_memory.can_provide_sample(batch_size):
                replay_memory.sample(batch_size)
                for j in range(batch_size):
                    # print("reward",replay_memory.memory[j].reward)
                    current_q_values = QValues.get_current(policy_net, replay_memory.memory[j].state, replay_memory.memory[j].action)
                    next_q_values = QValues.get_next(target_net, replay_memory.memory[j].next_state, replay_memory.memory[j].action)
                    target_q_values = (next_q_values * gamma) + replay_memory.memory[j].reward
                    loss = F.mse_loss(current_q_values.unsqueeze(1), target_q_values.unsqueeze(1))
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
            if done:
                if loss: loss_list.append(round(float(loss),4))
                print("Episode {} finished after {} timesteps".format(i,t+1))
                step_count_list.append(t)
                break

            if len(replay_memory.memory) % target_update == 0:
                target_net.load_state_dict(policy_net.state_dict())
    env.close()
    print(step_count_list)
    print(loss_list)