Deep Q Network loss keeps increasing

I have been trying to build a DQN for Atari games in TensorFlow. Here is my code:

import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import deque
from skimage import transform
from skimage import io
from skimage.color import rgb2gray
from os import getcwd

class DQN_Agent:
    def DQN(self,x):
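        # two 5x5 conv layers, flatten, two dense hidden layers, then a softmax over the actions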
        layer1 = tf.layers.conv2d(x,32,5,padding='same',activation=tf.nn.relu)
        layer2 = tf.layers.conv2d(layer1,32,5,padding='same',activation=tf.nn.relu)
        layer3 = tf.layers.flatten(layer2)
        layer4 = tf.layers.dense(layer3,24,tf.nn.relu)
        layer5 = tf.layers.dense(layer4,24,tf.nn.relu)
        layer6 = tf.layers.dense(layer5,self.n_actions,tf.nn.softmax)

        return layer6

    def __init__(self,resize_dim,n_actions,replay_memory_size,history_length):
        self.resize_dim = resize_dim
        self.history_length = history_length
        self.history = deque(maxlen=self.history_length)
        self.n_actions = n_actions
        self.memory = deque(maxlen=replay_memory_size)
        self.learning_rate = 0.001
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.1

        self.x = tf.placeholder(tf.float32,[None,self.resize_dim,self.resize_dim,self.history_length])
        self.y = tf.placeholder(tf.float32,[None,self.n_actions])
        self.logits = self.DQN(self.x)
        self.loss = tf.losses.softmax_cross_entropy(self.y,self.logits)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
        self.sess = tf.Session()
        self.saver = tf.train.Saver()
        self.init = tf.global_variables_initializer()

        self.sess.run(self.init)

    def preprocess(self,images):
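        # grayscale and resize every frame in the history, then stack them into a (1, resize_dim, resize_dim, history_length) tensor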
        for i in range(len(images)):
            images[i] = rgb2gray(images[i])
            images[i] = transform.resize(images[i],(self.resize_dim,self.resize_dim))
        images = np.transpose(images,(2,1,0))
        images = transform.rotate(images,-90)
        images = np.array([images])

        return images

    def act(self,x,Testing=False):
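        # epsilon-greedy action selection; always greedy when Testing=True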
        if Testing:
            x = self.preprocess(x)
            Q = self.sess.run(self.logits,feed_dict={self.x:x})
            action = np.argmax(Q[0])

        else:
            if random.random() > self.epsilon:
                x = self.preprocess(x)
                Q = self.sess.run(self.logits,feed_dict={self.x:x})
                action = np.argmax(Q[0])
            else:
                action = random.randrange(0,self.n_actions)

        return action

    def replay(self,batch):
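        # one training step per transition in the sampled batch; returns the average loss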
        losses = []
        for state, next_state, reward, action, done in batch:
            state = self.preprocess(state)
            next_state = self.preprocess(next_state)

            target = np.zeros(self.n_actions)

            if not done:
                next_reward = np.amax(self.sess.run(self.logits,feed_dict={self.x:next_state})[0])
                target[action] = reward + next_reward*self.gamma

            l,_ = self.sess.run([self.loss,self.optimizer],feed_dict={self.x:state, self.y:[target]})
            losses.append(l)
        return sum(losses)/len(losses)

    def save_model(self):
        self.saver.save(self.sess,getcwd()+'/model.ckpt')

    def load_model(self):
        self.saver.restore(self.sess,getcwd()+'/model.ckpt')

    def decrease_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)

    def remember(self,state, next_state, reward, action, done):
        self.memory.append((state, next_state, reward, action, done))

    def sample_memory(self,size):
        return random.sample(self.memory,size)

EPISODES = 2000
env = gym.make('Breakout-v0')
n_actions = env.action_space.n
resize_dim = 84
history_length = 3
replay_memory_size = 190000
batch_size = 32
how_often = 25
losses = []
episode_losses = []

agent = DQN_Agent(resize_dim,n_actions,replay_memory_size,history_length)

with tf.device('/device:GPU:0'):
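    # main training loop: play episodes, store transitions, train on random minibatches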
    for episode in range(EPISODES):
        state = env.reset()
        agent.history.append(state)
        state = agent.history
        episode_reward = 0
        while True:
            action = agent.act(state)
            next_state,reward,done,info = env.step(action)
            agent.history.append(next_state)
            next_state = agent.history
            episode_reward += reward
            agent.remember(state, next_state, reward, action, done)
            state = next_state

            if len(agent.memory) >= batch_size:
                batch = agent.sample_memory(batch_size)
                l = agent.replay(batch)
                print('average loss on batch:',l)
                losses.append(l)

            if done:
                print('episode: {}/{}, episode reward: {}, epsilon: {}'.format(episode+1,EPISODES,episode_reward,agent.epsilon))
                break
        agent.decrease_epsilon()

        if (episode+1)%how_often == 0:
            agent.save_model()

        episode_losses.append(sum(losses)/len(losses))

        plt.plot(range(episode+1),episode_losses)
        plt.ylabel('losses')
        plt.xlabel('episodes')
        plt.savefig(getcwd()+'/loss_plot.png')

The problem is that after training the network for 1000 episodes, the loss went up from about 0.4 to over 0.9. I then tried training for 2000 episodes and feeding the network 3 stacked frames instead of 1, but that changed nothing. Here is the loss plot:

Also, when I try to test the trained network in the environment, the paddle does not even move. Can someone tell me how to fix these problems?
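For reference, the target I am trying to build inside replay() is what I understand to be the standard one-step Q-learning target, target[action] = reward + gamma * max_a' Q(next_state, a'). Below is a minimal NumPy sketch with made-up numbers, just to illustrate my intent (it is not my actual code):

import numpy as np

gamma = 0.99       # same discount factor as in the agent
reward = 1.0       # example reward from env.step (made-up)
action = 2         # example action index (made-up)
n_actions = 4      # example; Breakout-v0 reports 4 actions

# Q-values the network would predict for next_state (made-up numbers)
next_q = np.array([0.1, 0.5, 0.3, 0.2])

# only the taken action gets the Bellman target, the other entries stay 0
target = np.zeros(n_actions)
target[action] = reward + gamma * np.amax(next_q)
print(target)  # the entry for `action` is 1.495, everything else is 0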

...