Оценка, сводимая к минимуму Pytorch NN с проблемой Cartpole - PullRequest
0 голосов
/ 10 октября 2019

Я пытаюсь решить проблему CartPole в тренажерном зале openAI, обучая простой 2-слойный NN в pytorch. Используемый метод - DQN, но результаты сходятся на максимальном балле около 8 или 9 и не улучшаются с течением времени или с тренировкой. Вместо этого оценка становится ниже с тренировкой. Как это можно улучшить / что не так в коде, который заставляет его делать это? Ниже используется код:

import gym
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
from collections import namedtuple
import numpy as np

class network(nn.Module):
    def __init__(self):
        nn.Module.__init__(self)
        # network takes 4 inputs (state, action, next_state, reward), hidden layer then has
        # 256 inputs and the network has 2 outputs (the q value of going left or right)
        # in this network the index of the output references the action.
        self.l1 = nn.Linear(4, 256)
        self.l2 = nn.Linear(256, 2)

def forward(self, x):
    # forward function defines how the model will run
    x = F.relu(self.l1(x))
    x = self.l2(x)
    return (x)

class replay_memory():
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

def save(self, transition):
    # saves all transitions for the environment in a tensor
    self.memory.append(transition)
    if len(self.memory) > self.capacity:
        del self.memory[0]

def sample(self, batch_size):
    # generates a random sample from the memory
    return random.sample(self.memory, batch_size)

def __len__(self):
    return len(self.memory)


class agent():
    def __init__(self, env, model):
        self.epsilon = 1  # exploration rate
        self.epsilon_min = 0.001 # smallest exploration value
        self.epsilon_decay = 0.995 # rate at which exploration occurs
        self.learning_rate = 0.001


def act(self, state, model):
        # define actions, random or optimal based on exploration rate DOES NOT ACCOUNT FOR THE DECAY
    if random.uniform(0, 1) <= self.epsilon:
        action = torch.LongTensor([[random.randrange(2)]])
        action_np = (action.numpy())[0][0]
    else:

        action = model(Variable(torch.FloatTensor([state])).type(torch.FloatTensor)).max(1)[1].view(1,1)
        action_np = (action.numpy())[0][0]


    if self.epsilon > self.epsilon_min:
        self.epsilon *= self.epsilon_decay
    else:
        self.epsilon = self.epsilon_min

    return action, action_np


def trained_act(self, episodes, network, env):
    for e in range (episodes):
        state = env.reset()
        for t in range (200):
            action, action_np = agent.act(state, network)
            next_state, reward, done, info = env.step(action_np)
            env.render()
            if done:
                break
        print(t)

        env.close()

def learn(batch_size, gamma, memory, optimizer):
    BATCH_SIZE = batch_size
    if len(memory) < BATCH_SIZE:
        return
# random transition batch is taken from experience replay memory.
transitions = memory.sample(BATCH_SIZE)
batch_state, batch_action, batch_reward, batch_next_state = zip(*transitions)

batch_state = Variable(torch.cat(batch_state))
batch_action = Variable(torch.cat(batch_action))
batch_reward = Variable(torch.cat(batch_reward))
batch_next_state = Variable(torch.cat(batch_next_state))


current_q_values = network.forward(batch_state).gather(1, batch_action.unsqueeze(-1))

max_next_q_values = network.forward(batch_next_state).detach().max(1)[0]
expected_q_values = batch_reward + (gamma * max_next_q_values)

# loss is measured from error between current and newly expected Q values
loss = F.smooth_l1_loss(expected_q_values, current_q_values)

# backpropagation of loss to NN
optimizer.zero_grad()
loss.backward()
optimizer.step()

return loss



env = gym.make('CartPole-v0')
env.reset()
network = network()
agent = agent(env, network)
batch_size = 50
episode = 500
T = 200
gamma = 0.95
memory = replay_memory(100)
optimizer = optim.SGD(network.parameters(), 0.001)
l = []
s = []

for e in range (episode):
    state = env.reset()
    for t in range (T):
        action, action_np = agent.act(state, network)
        next_state, reward, done, info = env.step(action_np)
        if done:
            reward = -2

    transition = torch.FloatTensor([state]),torch.LongTensor([action]), torch.FloatTensor([reward]),torch.FloatTensor([next_state])

    memory.save(transition)
    state = next_state

    loss = learn(batch_size, gamma, memory, optimizer)
    l.append(loss)
    if done:
        print('Loss = {}, Episode = {}, finsited after {} steps'.format(loss, e, t))
        s.append(t)
        break

1 Ответ

0 голосов
/ 10 октября 2019

Я бы переписал ваш алгоритм обучения:

for e in range (episode):
    state = env.reset()
    done = False
    t = 0
    while not done:
        action, action_np = agent.act(state, network)
        next_state, reward, done, info = env.step(action_np)


        transition = torch.FloatTensor([state]),torch.LongTensor([action]), torch.FloatTensor([reward]),torch.FloatTensor([next_state])

        memory.save(transition)
        state = next_state

        loss = learn(batch_size, gamma, memory, optimizer)
        l.append(loss)
        if t < T:
            t += 1
        else:
           done = True
        if done:
            print('Loss = {}, Episode = {}, finsited after {} steps'.format(loss, e, t))
            s.append(t)
            break
...