My DQN network fails to train, and the gym environment gets stuck after the first episode. Could you tell me what the error in my code is?
0 votes
24 March 2020

This is my DQN implementation for Breakout. I used Keras, NumPy, and openai-gym in the code.

import gym
import random
import numpy as np
import tensorflow as tf
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, BatchNormalization
from keras.models import Model
from keras.models import Sequential
import cv2
from collections import deque
from keras.optimizers import RMSprop, SGD, Adam
from keras import backend as K
from datetime import datetime
import os.path
import time
from keras.models import load_model
from keras.models import clone_model
from keras.callbacks import TensorBoard

memory = []
env = gym.make('BreakoutDeterministic-v4')
explore_episodes = 10
training_episodes = 10000
ACTIONS = env.action_space.n
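
# Note: the replay memory above is a plain list and grows without bound.
# DQN implementations usually cap the buffer; a minimal sketch using the
# already-imported deque (the maxlen value is illustrative, not from the
# original code):
#
#     memory = deque(maxlen=100000)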

def train_mini_batch(model):
    # Sample a random mini-batch of stored transitions from replay memory.
    batch_size = 32
    mini_batch = random.sample(memory, batch_size)
    s_t = np.zeros((batch_size, 84, 84, 4))    # current states
    s_t_1 = np.zeros((batch_size, 84, 84, 4))  # next states
    target = np.zeros((batch_size, ACTIONS))   # training targets
    r = np.zeros(batch_size)                   # rewards
    a = np.zeros(batch_size, dtype=int)        # actions taken
    dead = np.zeros(batch_size)                # terminal flags
    for i in range(batch_size):
        s_t[i] = mini_batch[i][0]
        r[i] = mini_batch[i][1]
        s_t_1[i] = mini_batch[i][2]
        a[i] = mini_batch[i][3]
        dead[i] = mini_batch[i][4]

    next_Q_values = model.predict(s_t_1)

    # Bellman targets: terminal transitions get the raw reward, the rest
    # bootstrap from the discounted best next-state Q-value.
    for i in range(batch_size):
        if dead[i]:
            target[i][a[i]] = r[i]
        else:
            target[i][a[i]] = r[i] + 0.99 * np.amax(next_Q_values[i])
    model.fit(s_t, target, epochs=1, batch_size=batch_size, verbose=0)
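
# Note: target is zero-initialized above, so every action the agent did not
# take is trained toward 0 on each update, which tends to flatten the
# Q-function. A common variant (a sketch, not the original code) seeds the
# targets with the model's own predictions so only the taken action's
# Q-value changes:
#
#     target = model.predict(s_t)
#     for i in range(batch_size):
#         if dead[i]:
#             target[i][a[i]] = r[i]
#         else:
#             target[i][a[i]] = r[i] + 0.99 * np.amax(next_Q_values[i])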


def get_action(state, model=None, image=None):
    # 'explore' picks a uniformly random action; anything else acts
    # greedily with respect to the model's Q-value predictions.
    if state == 'explore':
        action = random.randrange(ACTIONS)
    else:
        image = np.expand_dims(image, axis=0)
        action = np.argmax(model.predict(image))
    return action



def pre_processing(image):
    # Convert the RGB frame to grayscale and downscale it to 84x84.
    image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    image = cv2.resize(image, (84, 84))
    return image
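
# Note: the frames stay as raw uint8 values in [0, 255]. DQN implementations
# usually rescale pixels before feeding the network; one option (an
# assumption, not part of the original code) is to return
#
#     image.astype(np.float32) / 255.0
#
# so the network's inputs lie in [0, 1].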


def create_model():
    # Two conv layers over the stacked 84x84x4 frames, then a softmax
    # output over the action space, trained with MSE loss.
    model = Sequential()
    model.add(Conv2D(32, (3, 3), input_shape=(84, 84, 4), activation='relu'))
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(Flatten())
    #model.add(Dense(512, activation='relu'))
    model.add(Dense(ACTIONS, activation='softmax'))
    model.summary()
    optimizer = Adam(learning_rate=1e-2, beta_1=0.9, beta_2=0.999, amsgrad=False)
    model.compile(optimizer, loss='mse')
    return model
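
# Note: Q-values are unbounded regression targets, so DQN networks
# conventionally end in a linear output layer; a softmax squashes each row
# into a probability distribution and cannot fit targets such as
# r + 0.99 * max Q. A sketch of the conventional head (replacing the last
# Dense layer above; the smaller learning rate is an assumption in line
# with typical Atari settings, not from the original code):
#
#     model.add(Dense(ACTIONS))  # linear activation: one Q-value per action
#     optimizer = Adam(learning_rate=1e-4)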

def train_model():
    model = create_model()
    # Phase 1: act randomly for a few episodes to fill the replay memory.
    for i in range(explore_episodes):
        done = False
        episode_reward = 0
        print("episode:", i)
        observation = env.reset()
        observation = pre_processing(observation)
        # Repeat the first frame four times to build the initial stack.
        present_state = np.stack((observation, observation, observation, observation), axis=2)
        while not done:
            env.render()
            action = get_action('explore')
            observation, reward, done, _ = env.step(action)
            episode_reward = episode_reward + reward
            observation = pre_processing(observation)
            # Drop the oldest frame and append the newest one.
            next_state = np.stack((present_state[:, :, 1], present_state[:, :, 2], present_state[:, :, 3], observation), axis=2)
            memory.append((present_state, reward, next_state, action, done))
            present_state = next_state
        print("reward:", episode_reward)
    # Phase 2: act greedily from the model and train on every step.
    for i in range(training_episodes):
        done = False
        episode_reward = 0
        observation = env.reset()
        observation = pre_processing(observation)
        present_state = np.stack((observation, observation, observation, observation), axis=2)

        while not done:
            env.render()
            action = get_action('training', model=model, image=present_state)
            observation, reward, done, _ = env.step(action)
            episode_reward = episode_reward + reward
            observation = pre_processing(observation)
            next_state = np.stack((present_state[:, :, 1], present_state[:, :, 2], present_state[:, :, 3], observation), axis=2)
            memory.append((present_state, reward, next_state, action, done))
            present_state = next_state
            train_mini_batch(model)
        if i % 10 == 0:
            model.save("openai_mycode")
        print("episode:", i)
        print("reward:", episode_reward)

train_model()

During training the agent keeps performing the same single action, and the model never learns. The environment also gets stuck at some point during training. I cannot figure out why my model is not training.
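
A likely cause of the repeated action is that the training phase has no exploration at all: get_action('training', ...) always takes np.argmax over a barely-trained network, which tends to return the same index every step. A minimal epsilon-greedy sketch (the function name and the epsilon schedule are illustrative assumptions, not part of the original code; ACTIONS is the module-level constant defined above):

def get_action_eps(model, image, epsilon):
    # With probability epsilon take a random action, otherwise act greedily.
    if random.random() < epsilon:
        return random.randrange(ACTIONS)
    image = np.expand_dims(image, axis=0)
    return int(np.argmax(model.predict(image)))

# Typical Atari schedules anneal epsilon from 1.0 down to about 0.1 over
# many steps; the exact numbers here are an assumption.
epsilon, epsilon_min, epsilon_step = 1.0, 0.1, 1e-5

As for the environment appearing stuck: in Breakout, action 1 is FIRE in the default action set, and it has to be issued after env.reset() (and after each lost life) before the ball launches. An agent that keeps choosing NOOP stands beside an unlaunched ball forever, which looks like a frozen environment.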
