
DQN with Gridworld doesn't learn

0 votes
/ 15 April 2020

I know there are plenty of ready-made playgrounds for DQN, but I want to build a simple one myself. I use a 20x20 matrix as the environment, and a single agent takes that matrix as input and should learn to find tasks (1 marks the agent, 2 marks a hole, 3 marks a task). I have tried both a Conv2D network and a pure Dense network, but the agent still doesn't learn. I would like to know what is wrong with my code.
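For context, the Conv2D variant I mean is roughly of the following shape (a minimal sketch only: the layer sizes here are illustrative and not the exact network I trained; the point is just that the same 20x20 grid goes in as a one-channel image instead of a flat 400-vector):

from keras.models import Model
from keras.layers import Input, Conv2D, Flatten, Dense
from keras.optimizers import Adam

def conv_act_model():
    i = Input(shape=(20, 20, 1))                  # the 20x20 grid as a one-channel image
    x = Conv2D(16, (3, 3), activation='relu')(i)  # local spatial features
    x = Conv2D(32, (3, 3), activation='relu')(x)
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    y = Dense(4)(x)                               # one Q-value per action
    model = Model(inputs=i, outputs=y)
    model.compile(optimizer=Adam(lr=0.0001), loss='mse')
    return model

The Dense version below is the one I am currently running.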

from collections import deque
from keras.models import Model
from keras.layers import Dense,Input
from keras.optimizers import Adam
import numpy as np
import random
import copy
import pandas as pd

class Consulter:
    def __init__(self,type,width,height):
        self.type = type
        self.actions = list(range(4))
        self.pool = deque(maxlen=100000)
        self.height = height
        self.width = width
        self.model = self.act_model()

        self.batch_size = 64
        self.target_model = self.act_model()
        self.epsilon = 1
        self.state = None

        self.step = 0

    def store_memory(self,s,a,r,s_,over):

        self.pool.append([s,a,r,s_,over])

    def act_model(self):

        i = Input(shape=(400,))   # flattened 20x20 grid
        x = Dense(64, activation='relu')(i)
        x = Dense(16, activation='relu')(x)
        y = Dense(4)(x)           # one Q-value per action

        model = Model(inputs=i,outputs=y)
        model.compile(optimizer=Adam(lr=0.0001),loss='mse')

        return model


    def train_model(self):

        if len(self.pool) < self.batch_size:
            return

        mini_batch = random.sample(self.pool, self.batch_size)

        input_data = np.zeros((self.batch_size, 400))   # current states s
        output_data = np.zeros((self.batch_size, 400))  # next states s_

        action, reward, over = [], [], []

        for i in range(self.batch_size):

            input_data[i] = mini_batch[i][0].flatten()
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            output_data[i] = mini_batch[i][3].flatten()
            over.append(mini_batch[i][4])

        # Double-DQN-style target: the online model picks the best next action,
        # and the target model supplies the Q-value used for bootstrapping.
        result = self.model.predict(input_data)
        result_action = self.model.predict(output_data)
        target_result = self.target_model.predict(output_data)

        for i in range(self.batch_size):
            if over[i]:
                result[i][action[i]] = reward[i]
            else:

                act_model = np.argmax(result_action[i])
                result[i][action[i]] = reward[i] + 0.95 * target_result[i][act_model]

        self.model.fit(input_data, result, batch_size=self.batch_size, epochs=1, verbose=0)

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def select_action(self):
        self.step+=1
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.actions)
        else:

            state = self.state.reshape(1, 400)
            q_value = self.model.predict(state)  # need to figure out what q_value represents

            return np.argmax(q_value[0])

    def save_model(self,path):
        self.model.save(path)


COORD_LIST = [(i,j) for i in range(20) for j in range(20)]

class BuildEnv():
    def __init__(self,):

        self.grid_state = np.zeros(shape=(20, 20))

        self.agent_pos = None
        self.game_over = False

        self.time_step = 0
        self.N_task = 10
        self.total_r = 0


    def reset(self):

        self.time_step = 0
        self.grid_state = np.zeros(shape=(20, 20))

        self.total_r = 0
        self.game_over = False
        # 5 holes, 1 agent, 10 tasks, generated at random positions
        rand = random.sample(COORD_LIST, self.N_task+5+1)
        hole = 5

        for i in range(hole):
            self.grid_state[rand[i][0],rand[i][1]] = 2
        for i  in range(hole,len(rand)):
            if i==len(rand)-1:

                self.grid_state[rand[i][0],rand[i][1]]=1
                self.agent_pos = rand[i]
            else:
                self.grid_state[rand[i][0], rand[i][1]] = 3

        return copy.deepcopy(self.grid_state)

    def get_state(self):
        return copy.deepcopy(self.grid_state)


    def generate_r(self,action):
        x,y = self.agent_pos
        x_,y_ = self.agent_pos

        if action == 0:  # up
            if x > 0:
                x -= 1
        elif action == 1:  # down
            if x < 19:
                x += 1
        elif action == 2:  # left
            if y > 0:
                y -= 1
        elif action == 3:  # right
            if y < 19:
                y += 1

        self.agent_pos = (x,y)
        s = copy.deepcopy(self.grid_state)
        #print(action, self.agent_pos)

        val = self.grid_state[x,y]

        r = 0
        if val == 0 or val == 1:
            r=-1
            self.grid_state[x_,y_]=0
            self.grid_state[x,y]=1

        elif val == 3:
            self.grid_state[x_,y_]=0
            self.grid_state[x,y]= 0
            r = 100
            print('agent solved one task at {}'.format(self.agent_pos))
            if np.count_nonzero(self.grid_state==3)==0:
                self.game_over=True

        elif val ==2 :

            r = -250
            print('agent in a shit hole at {}'.format(self.agent_pos))
            self.game_over = True

        self.time_step += 1
        if self.time_step == 500:
            self.game_over = True
        self.total_r +=r
        return s, r


    def rest_tasks(self):

        print("All tasks =  ",self.N_task,"    rest tasks =    ",np.count_nonzero(self.grid_state==3),
              '    finish rate', round((self.N_task-np.count_nonzero(self.grid_state==3))/self.N_task,2))

        return np.count_nonzero(self.grid_state==3)

    def finished_tasks(self):
        # print(self.N_task,np.count_nonzero(self.grid_state==3))
        # print(self.grid_state)
        return self.N_task - np.count_nonzero(self.grid_state==3)


if __name__ == '__main__':
    header = True

    def initial_data():
        data = {'Episode': [], 'Time_step': [],'Total_R': [], 'Finished_tasks': [],'tasks_num': []}

        return data


    data = initial_data()

    env = BuildEnv()

    episode = 30000
    consulter = Consulter(list(range(4)),20,20)

    for i in range(episode):
        print('this is episode ==========', i, '   epsilon =============   ',consulter.epsilon)
        consulter.state = env.reset()
        while True:

            a = consulter.select_action()
            s, r = env.generate_r(a)

            s_ = env.get_state()

            consulter.store_memory(s.reshape(1,400),a,r,s_.reshape(1,400),env.game_over)
            consulter.state = s_

            if env.time_step%10==0:
                consulter.train_model()

            if consulter.step > 2500 and consulter.epsilon > 0.05:
                consulter.epsilon *= 0.999
                consulter.step -= 2500


            if env.game_over:
                consulter.state = None
                if (i+1)%5==0:
                    consulter.update_target_model()
                data['Episode'].append(i)
                data['Total_R'].append(env.total_r)
                data['tasks_num'].append(env.N_task)
                data['Finished_tasks'].append(env.N_task - env.rest_tasks())
                data['Time_step'].append(env.time_step)
                if (i+1) % 1000 == 0:
                    data = pd.DataFrame(data)
                    mode = 'w' if header else 'a'
                    data.to_csv('./consulter_hist.csv', mode=mode,
                                header=header, index=False)
                    header = False

                    data = initial_data()

                    consulter.save_model('./consulter_model.h5')

                break

...