I was using a DQN for a task and it wasn't working. I simplified the problem down to 2 actions, 0 and 1, where each action always yields the same fixed reward: 0 or -1. Even so, my Q-agent keeps getting confused, assigning the two actions wild values in the thousands. Please, what am I doing wrong?
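To make the setup concrete, the simplified task boils down to something like this (a minimal sketch, not my actual environment code; the class name ToyEnv and the one-step episodes are just assumptions for illustration):

import numpy as np

class ToyEnv:
    """Two actions: action 0 always gives reward 0, action 1 always gives reward -1."""
    def __init__(self):
        self.state = np.zeros(1, dtype=np.float32)   # single constant dummy observation

    def reset(self):
        return self.state

    def step(self, action):
        reward = 0.0 if action == 0 else -1.0
        done = True                                  # one-step episodes (an assumption)
        return self.state, reward, done, {}

The agent code itself is below.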
import numpy as np
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
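# Hyperparameters (memory_size, batch_size, fc1_dims..fc3_dims, learning_rate,
# gamma, epsilon_start, epsilon_dec, epsilon_end, model_name) are defined
# elsewhere in my script and omitted here.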
class ReplayBuffer():
    def __init__(self, input_dims):
        self.mem_size = memory_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_dims),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims),
                                         dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32)

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = 1 - int(done)
        self.mem_cntr += 1

    def sample_buffer(self):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        states = self.state_memory[batch]
        states_ = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        terminal = self.terminal_memory[batch]
        return states, actions, rewards, states_, terminal


def build_dqn(n_actions, input_dims):
    model = keras.Sequential([
        keras.layers.InputLayer(input_shape=input_dims),
        keras.layers.Dense(fc1_dims, activation="relu"),
        keras.layers.Dense(fc2_dims, activation="relu"),
        keras.layers.Dense(fc3_dims, activation="relu"),
        # keras.layers.Dense(fc4_dims, activation="relu"),
        keras.layers.Dense(n_actions, activation=None)])
    model.compile(optimizer=Adam(lr=learning_rate), loss="mean_squared_error")
    return model


class Agent():
    def __init__(self, n_actions, input_dims):
        self.action_space = [i for i in range(n_actions)]
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.eps_dec = epsilon_dec
        self.eps_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = model_name
        self.memory = ReplayBuffer(input_dims)
        self.q_eval = build_dqn(n_actions, input_dims)

    def store_transition(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, observation):
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            state = np.array([observation])
            actions = self.q_eval.predict(state)
            action = np.argmax(actions[0])
            print(actions)
            print(action)
        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, states_, dones = \
            self.memory.sample_buffer()
        q_eval = self.q_eval.predict(states)
        q_next = self.q_eval.predict(states_)
        q_target = np.copy(q_eval)
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        q_target[batch_index, actions] = rewards + \
            self.gamma + np.max(q_next, axis=1)*dones
        self.q_eval.train_on_batch(states, q_target)
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min

    def save_model(self):
        self.q_eval.save(self.model_file)

    def load_model(self):
        self.q_eval = load_model(self.model_file)
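For completeness, I drive the agent with a generic loop along these lines (a rough sketch, not my exact script; env is assumed to expose reset()/step() as in the toy environment above, and n_games is a placeholder):

agent = Agent(n_actions=2, input_dims=(1,))
for episode in range(n_games):
    observation = env.reset()
    done = False
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, _ = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_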
This is a completely standard DQN agent, most of it copied from a tutorial. I can't see where it could be going wrong.