I know there are plenty of ready-made playgrounds for DQN, but I want to build a simple one myself. I use a 20x20 matrix as the environment, and a single agent takes that matrix as input and should learn to find the tasks (1 represents the agent, 2 represents a hole, 3 represents a task). I have tried both a Conv2D model and a plain Dense network, but the agent still does not learn. I would like to know what the problem with my code is.
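(For reference, only the Dense version of act_model appears in the listing below. A Conv2D version could look roughly like the sketch that follows; the filter counts and kernel sizes here are illustrative assumptions, not the exact ones I used, and states would be reshaped to (batch, 20, 20, 1) instead of (batch, 400).)

from keras.models import Model
from keras.layers import Conv2D, Dense, Flatten, Input
from keras.optimizers import Adam

def conv_act_model():
    # the 20x20 grid is fed as a single-channel "image" instead of a flat 400-vector
    i = Input(shape=(20, 20, 1))
    x = Conv2D(16, (3, 3), activation='relu', padding='same')(i)
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(x)
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    y = Dense(4)(x)  # one Q-value per action
    model = Model(inputs=i, outputs=y)
    model.compile(optimizer=Adam(lr=0.0001), loss='mse')
    return model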
from collections import deque
from keras.models import Model
from keras.layers import Dense,Input
from keras.optimizers import Adam
import numpy as np
import random
import copy
import pandas as pd
class Consulter:
    def __init__(self, type, width, height):
        self.type = type
        self.actions = list(range(4))
        self.pool = deque(maxlen=100000)
        self.height = height
        self.width = width
        self.model = self.act_model()
        self.batch_size = 64
        self.target_model = self.act_model()
        self.epsilon = 1
        self.state = None
        self.step = 0

    def store_memory(self, s, a, r, s_, over):
        self.pool.append([s, a, r, s_, over])
    def act_model(self):
        i = Input(shape=(400,))
        x = Dense(64, activation='relu')(i)  # first hidden layer takes the flattened 20x20 grid
        x = Dense(16, activation='relu')(x)
        y = Dense(4)(x)  # one Q-value per action (up, down, left, right)
        model = Model(inputs=i, outputs=y)
        model.compile(optimizer=Adam(lr=0.0001), loss='mse')
        return model
    def train_model(self):
        if len(self.pool) < self.batch_size:
            return
        mini_batch = random.sample(self.pool, self.batch_size)
        input_data = np.zeros((self.batch_size, 400))   # current states
        output_data = np.zeros((self.batch_size, 400))  # next states
        action, reward, over = [], [], []
        for i in range(self.batch_size):
            input_data[i] = mini_batch[i][0].flatten()
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            output_data[i] = mini_batch[i][3].flatten()
            over.append(mini_batch[i][4])
        result = self.model.predict(input_data)
        result_action = self.model.predict(output_data)
        target_result = self.target_model.predict(output_data)
        for i in range(self.batch_size):
            if over[i]:
                result[i][action[i]] = reward[i]
            else:
                # Double-DQN-style target: the online model picks the next action,
                # the target model evaluates it
                act_model = np.argmax(result_action[i])
                result[i][action[i]] = reward[i] + 0.95 * target_result[i][act_model]
        self.model.fit(input_data, result, batch_size=self.batch_size, epochs=1, verbose=0)

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
    def select_action(self):
        self.step += 1
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.actions)
        else:
            state = self.state.reshape(1, 400)
            q_value = self.model.predict(state)  # q_value[0] holds one estimated return per action
            return np.argmax(q_value[0])

    def save_model(self, path):
        self.model.save(path)
COORD_LIST = [(i,j) for i in range(20) for j in range(20)]
class BuildEnv():
    def __init__(self):
        self.grid_state = np.zeros(shape=(20, 20))
        self.agent_pos = None
        self.game_over = False
        self.time_step = 0
        self.N_task = 10
        self.total_r = 0

    def reset(self):
        self.time_step = 0
        self.grid_state = np.zeros(shape=(20, 20))
        self.total_r = 0
        self.game_over = False
        # 5 holes, 1 agent, 10 tasks, generated at random positions
        rand = random.sample(COORD_LIST, self.N_task + 5 + 1)
        hole = 5
        for i in range(hole):
            self.grid_state[rand[i][0], rand[i][1]] = 2
        for i in range(hole, len(rand)):
            if i == len(rand) - 1:
                self.grid_state[rand[i][0], rand[i][1]] = 1
                self.agent_pos = rand[i]
            else:
                self.grid_state[rand[i][0], rand[i][1]] = 3
        return copy.deepcopy(self.grid_state)

    def get_state(self):
        return copy.deepcopy(self.grid_state)

    def generate_r(self, action):
        x, y = self.agent_pos
        x_, y_ = self.agent_pos
        if action == 0:  # up
            if x > 0:
                x -= 1
        elif action == 1:  # down
            if x < 19:
                x += 1
        elif action == 2:  # left
            if y > 0:
                y -= 1
        elif action == 3:  # right
            if y < 19:
                y += 1
        self.agent_pos = (x, y)
        s = copy.deepcopy(self.grid_state)  # state before the grid is updated
        # print(action, self.agent_pos)
        val = self.grid_state[x, y]
        r = 0
        if val == 0 or val == 1:
            r = -1
            self.grid_state[x_, y_] = 0
            self.grid_state[x, y] = 1
        elif val == 3:
            self.grid_state[x_, y_] = 0
            self.grid_state[x, y] = 0
            r = 100
            print('agent solved one task at {}'.format(self.agent_pos))
            if np.count_nonzero(self.grid_state == 3) == 0:
                self.game_over = True
        elif val == 2:
            r = -250
            print('agent fell into a hole at {}'.format(self.agent_pos))
            self.game_over = True
        self.time_step += 1
        if self.time_step == 500:
            self.game_over = True
        self.total_r += r
        return s, r

    def rest_tasks(self):
        print("All tasks = ", self.N_task, " rest tasks = ", np.count_nonzero(self.grid_state == 3),
              ' finish rate', round((self.N_task - np.count_nonzero(self.grid_state == 3)) / self.N_task, 2))
        return np.count_nonzero(self.grid_state == 3)

    def finished_tasks(self):
        # print(self.N_task, np.count_nonzero(self.grid_state == 3))
        # print(self.grid_state)
        return self.N_task - np.count_nonzero(self.grid_state == 3)
if __name__ == '__main__':
    header = True

    def initial_data():
        data = {'Episode': [], 'Time_step': [], 'Total_R': [], 'Finished_tasks': [], 'tasks_num': []}
        return data

    data = initial_data()
    env = BuildEnv()
    episode = 30000
    consulter = Consulter(list(range(4)), 20, 20)
    for i in range(episode):
        print('this is episode ==========', i, ' epsilon ============= ', consulter.epsilon)
        consulter.state = env.reset()
        while True:
            a = consulter.select_action()
            s, r = env.generate_r(a)
            s_ = env.get_state()
            consulter.store_memory(s.reshape(1, 400), a, r, s_.reshape(1, 400), env.game_over)
            consulter.state = s_
            if env.time_step % 10 == 0:
                consulter.train_model()
            if consulter.step > 2500 and consulter.epsilon > 0.05:
                consulter.epsilon *= 0.999
                consulter.step -= 2500
            if env.game_over:
                consulter.state = None
                if (i + 1) % 5 == 0:
                    consulter.update_target_model()
                data['Episode'].append(i)
                data['Total_R'].append(env.total_r)
                data['tasks_num'].append(env.N_task)
                data['Finished_tasks'].append(env.N_task - env.rest_tasks())
                data['Time_step'].append(env.time_step)
                if (i + 1) % 1000 == 0:
                    data = pd.DataFrame(data)
                    mode = 'w' if header else 'a'
                    data.to_csv('./consulter_hist.csv', mode=mode,
                                header=header, index=False)
                    header = False
                    data = initial_data()
                    consulter.save_model('./consulter_model.h5')
                break
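To check what the trained network actually does, the saved model can be loaded and run greedily (no exploration). This is only a minimal sketch, assuming it is run in the same file as the code above so that BuildEnv and the numpy import are already available:

from keras.models import load_model

eval_model = load_model('./consulter_model.h5')
eval_env = BuildEnv()
state = eval_env.reset()
while not eval_env.game_over:
    q_values = eval_model.predict(state.reshape(1, 400))  # one estimated return per action
    eval_env.generate_r(int(np.argmax(q_values[0])))       # always take the greedy action
    state = eval_env.get_state()
print('greedy episode reward:', eval_env.total_r, 'finished tasks:', eval_env.finished_tasks())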