I have been trying to build a DQN for Atari games in TensorFlow. Here is my code:
import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import deque
from skimage import transform
from skimage import io
from skimage.color import rgb2gray
from os import getcwd
class DQN_Agent:
    def DQN(self,x):
        layer1 = tf.layers.conv2d(x,32,5,padding='same',activation=tf.nn.relu)
        layer2 = tf.layers.conv2d(layer1,32,5,padding='same',activation=tf.nn.relu)
        layer3 = tf.layers.flatten(layer2)
        layer4 = tf.layers.dense(layer3,24,tf.nn.relu)
        layer5 = tf.layers.dense(layer4,24,tf.nn.relu)
        layer6 = tf.layers.dense(layer5,self.n_actions,tf.nn.softmax)
        return layer6

    def __init__(self,resize_dim,n_actions,replay_memory_size,history_length):
        self.resize_dim = resize_dim
        self.history_length = history_length
        self.history = deque(maxlen=self.history_length)
        self.n_actions = n_actions
        self.memory = deque(maxlen=replay_memory_size)
        self.learning_rate = 0.001
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.1
        self.x = tf.placeholder(tf.float32,[None,self.resize_dim,self.resize_dim,self.history_length])
        self.y = tf.placeholder(tf.float32,[None,self.n_actions])
        self.logits = self.DQN(self.x)
        self.loss = tf.losses.softmax_cross_entropy(self.y,self.logits)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
        self.sess = tf.Session()
        self.saver = tf.train.Saver()
        self.init = tf.global_variables_initializer()
        self.sess.run(self.init)

    def preprocess(self,images):
        for i in range(len(images)):
            images[i] = rgb2gray(images[i])
            images[i] = transform.resize(images[i],(self.resize_dim,self.resize_dim))
        images = np.transpose(images,(2,1,0))
        images = transform.rotate(images,-90)
        images = np.array([images])
        return images

    def act(self,x,Testing=False):
        if Testing:
            x = self.preprocess(x)
            Q = self.sess.run(self.logits,feed_dict={self.x:x})
            action = np.argmax(Q[0])
        else:
            if random.random() > self.epsilon:
                x = self.preprocess(x)
                Q = self.sess.run(self.logits,feed_dict={self.x:x})
                action = np.argmax(Q[0])
            else:
                action = random.randrange(0,self.n_actions)
        return action

    def replay(self,batch):
        losses = []
        for state, next_state, reward, action, done in batch:
            state = self.preprocess(state)
            next_state = self.preprocess(next_state)
            target = np.zeros(self.n_actions)
            if not done:
                next_reward = np.amax(self.sess.run(self.logits,feed_dict={self.x:next_state})[0])
                target[action] = reward + next_reward*self.gamma
            l,_ = self.sess.run([self.loss,self.optimizer],feed_dict={self.x:state, self.y:[target]})
            losses.append(l)
        return sum(losses)/len(losses)

    def save_model(self):
        self.saver.save(self.sess,getcwd()+'/model.ckpt')

    def load_model(self):
        self.saver.restore(self.sess,getcwd()+'/model.ckpt')

    def decrease_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)

    def remember(self,state, next_state, reward, action, done):
        self.memory.append((state, next_state, reward, action, done))

    def sample_memory(self,size):
        return random.sample(self.memory,size)
EPISODES = 2000
env = gym.make('Breakout-v0')
n_actions = env.action_space.n
resize_dim = 84
history_length = 3
replay_memory_size = 190000
batch_size = 32
how_often = 25
losses = []
episode_losses = []
agent = DQN_Agent(resize_dim,n_actions,replay_memory_size,history_length)
with tf.device('/device:GPU:0'):
    for episode in range(EPISODES):
        state = env.reset()
        agent.history.append(state)
        state = agent.history
        episode_reward = 0
        while True:
            action = agent.act(state)
            next_state,reward,done,info = env.step(action)
            agent.history.append(next_state)
            next_state = agent.history
            episode_reward += reward
            agent.remember(state, next_state, reward, action, done)
            state = next_state
            if len(agent.memory) >= batch_size:
                batch = agent.sample_memory(batch_size)
                l = agent.replay(batch)
                print('average loss on batch:',l)
                losses.append(l)
            if done:
                print('episode: {}/{}, episode reward: {}, epsilon: {}'.format(episode+1,EPISODES,episode_reward,agent.epsilon))
                break
        agent.decrease_epsilon()
        if (episode+1)%how_often == 0:
            agent.save_model()
        episode_losses.append(sum(losses)/len(losses))
        plt.plot(range(episode+1),episode_losses)
        plt.ylabel('losses')
        plt.xlabel('episodes')
        plt.savefig(getcwd()+'/loss_plot.png')
The problem is that when I trained the network for 1000 episodes, the loss increased from 0.4 to over 0.9. I then tried training for 2000 episodes and feeding the network 3 images instead of 1, but that changed nothing. Here is the loss plot:
Also, when I try to test the network in the environment, the paddle does not even move. Can someone tell me how to fix these problems?
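
For reference, this is my understanding of how the Bellman target is usually built in DQN, as a minimal sketch under my own assumptions (a linear Q-value output trained with a regression loss such as MSE/Huber); the names q_online and q_target are placeholders I made up, not anything from my code above:

# Sketch only: assumes the network's last layer is linear (one Q-value per action)
# and is trained with a regression loss, not softmax cross-entropy.
import numpy as np

def build_targets(q_online, q_target, batch, gamma=0.99):
    # q_online / q_target are placeholder callables: batch of states -> array of Q-values
    states, next_states, rewards, actions, dones = zip(*batch)
    targets = q_online(np.array(states))        # start from the current predictions, so
                                                # actions that were not taken contribute no loss
    next_q = q_target(np.array(next_states))    # bootstrap from a separate (frozen) network
    for i, (r, a, d) in enumerate(zip(rewards, actions, dones)):
        targets[i, a] = r if d else r + gamma * np.max(next_q[i])
    return targets

I am not sure how to reconcile this with the softmax output and cross-entropy loss I am currently using, which is part of what I am asking.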