This is my DQN implementation for Breakout. I used Keras, NumPy, and openai-gym in the code.
import gym
import random
import numpy as np
import tensorflow as tf
from keras.layers import Dense,Conv2D,MaxPooling2D,Flatten,BatchNormalization
from keras.models import Model
from keras.models import Sequential
import cv2
from collections import deque
from keras.optimizers import RMSprop,SGD,Adam
from keras import backend as K
from datetime import datetime
import os.path
import time
from keras.models import load_model
from keras.models import clone_model
from keras.callbacks import TensorBoard
memory = []
env = gym.make('BreakoutDeterministic-v4')
explore_episodes = 10
training_episodes = 10000
ACTIONS = env.action_space.n
def train_mini_batch(model):
    # Sample a random mini-batch from replay memory and do one gradient step.
    batch_size = 32
    mini_batch = random.sample(memory, batch_size)
    s_t = np.zeros((batch_size, 84, 84, 4))
    s_t_1 = np.zeros((batch_size, 84, 84, 4))
    target = np.zeros((batch_size, ACTIONS))
    r = np.zeros((batch_size))
    a = np.zeros((batch_size))
    dead = np.zeros((batch_size))
    for i in range(batch_size):
        s_t[i] = mini_batch[i][0]
        r[i] = mini_batch[i][1]
        s_t_1[i] = mini_batch[i][2]
        a[i] = mini_batch[i][3]
        dead[i] = mini_batch[i][4]
    next_Q_values = model.predict(s_t_1)
    for i in range(batch_size):
        if dead[i]:
            target[i][int(a[i])] = r[i]
        else:
            target[i][int(a[i])] = r[i] + 0.99 * np.amax(next_Q_values[i])
    h = model.fit(s_t, target, epochs=1, batch_size=batch_size, verbose=0)
def get_action(state, model=None, image=None):
    if state == 'explore':
        action = random.randrange(ACTIONS)
    else:
        image = np.expand_dims(image, axis=0)
        action = np.argmax(model.predict(image))
    return action
def pre_processing(image):
    image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    image = cv2.resize(image, (84, 84))
    return image
def create_model():
    model = Sequential()
    model.add(Conv2D(32, (3, 3), input_shape=(84, 84, 4), activation='relu'))
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(Flatten())
    #model.add(Dense(512,activation='relu'))
    model.add(Dense(ACTIONS, activation='softmax'))
    model.summary()
    optimizer = Adam(learning_rate=1e-2, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(optimizer, loss='mse')
    return model
def train_model():
    model = create_model()
    episode = 0
    # Exploration phase: fill replay memory with transitions from random play.
    for i in range(explore_episodes):
        done = False
        episode_reward = 0
        print("episode:", i)
        observation = env.reset()
        observation = pre_processing(observation)
        present_state = np.stack((observation, observation, observation, observation), axis=2)
        while not done:
            env.render()
            action = get_action('explore')
            observation, reward, done, __ = env.step(action)
            episode_reward = episode_reward + reward
            observation = pre_processing(observation)
            next_state = np.stack((present_state[:, :, 1], present_state[:, :, 2], present_state[:, :, 3], observation), axis=2)
            memory.append((present_state, reward, next_state, action, done))
            present_state = next_state
        print("reward:", episode_reward)
    # Training phase: act from the model and train on a mini-batch every step.
    for i in range(training_episodes):
        done = False
        episode_reward = 0
        #print("episode:",i)
        observation = env.reset()
        observation = pre_processing(observation)
        present_state = np.stack((observation, observation, observation, observation), axis=2)
        while not done:
            env.render()
            action = get_action('training', model=model, image=present_state)
            observation, reward, done, __ = env.step(action)
            episode_reward = episode_reward + reward
            observation = pre_processing(observation)
            next_state = np.stack((present_state[:, :, 1], present_state[:, :, 2], present_state[:, :, 3], observation), axis=2)
            memory.append((present_state, reward, next_state, action, done))
            present_state = next_state
            train_mini_batch(model)
        if i % 10 == 0:
            model.save("openai_mycode")
        print("episode:", i)
        print("reward:", episode_reward)
train_model()
During training the agent keeps taking the same action, and the model does not learn. The environment also gets stuck over time during training. I can't figure out why my model isn't training.
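For debugging, a minimal sketch along these lines could be used to check whether the network's predictions already collapse to a single action (the helper name check_q_values is just illustrative and not part of my script; it reuses memory, model, np, and random from the code above):

def check_q_values(model, n=5):
    # Predict Q-values for a few randomly sampled stored states and print
    # which action each prediction would pick.
    sample = random.sample(memory, n)
    states = np.stack([item[0] for item in sample])  # shape (n, 84, 84, 4)
    q_values = model.predict(states)
    for row in q_values:
        print(row, "-> argmax:", np.argmax(row))

If every printed row picks the same index, that would confirm the policy itself has collapsed to one action rather than something being wrong with the environment.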