I built a value-based DQN agent with tf.keras (tensorflow == 1.4, python 3.7), but judging by the results on CartPole-v1 the agent is not learning anything:

Round: 20, reward: 16.0
Round: 40, reward: 9.0
Round: 60, reward: 9.0
Round: 80, reward: 9.0
Round: 100, reward: 10.0

What is the problem? This is my code:
import numpy as np
import tensorflow as tf


class SimpleDQN:
    def __init__(self,
                 model: tf.keras.models.Model,
                 action_dim: int,
                 gamma=0.99,
                 replace_point=1024,
                 memory_size=4096,
                 memory_batch=64,
                 init_greedy=0.1,
                 max_greedy=0.9,
                 greedy_increase=1.001):
        # Model
        self.eval_model = model
        self.target_model = model
        self.action_dim = action_dim
        # Setting
        self.gamma = gamma
        self.replace_point = replace_point
        self.replace_pointer = 0
        # Memory
        self.memory = None
        self.memory_batch = memory_batch
        self.memory_size = memory_size
        self.memory_pointer = 0
        # Greedy
        self.greedy = init_greedy
        self.max_greedy = max_greedy
        self.greedy_increase = greedy_increase

    def storage(self, s, a, r, s_, done):
        # Write one transition into the circular replay buffer.
        if self.memory is None:
            self.memory = {
                's': np.zeros((self.memory_size,) + s.shape[1:]),
                'a': np.zeros((self.memory_size, 1), dtype=int),
                'r': np.zeros((self.memory_size, 1), dtype=float),
                's_': np.zeros((self.memory_size,) + s_.shape[1:]),
                'd': np.zeros((self.memory_size, 1), dtype=int),
            }
        i = self.memory_pointer % self.memory_size
        self.memory['s'][i] = s
        self.memory['a'][i] = a
        self.memory['r'][i] = r
        self.memory['s_'][i] = s_
        self.memory['d'][i] = done
        self.memory_pointer += 1

    def choose_action(self, s):
        # Random action when the uniform sample exceeds self.greedy,
        # otherwise the greedy action from eval_model.
        if np.random.uniform() > self.greedy:
            a = np.random.choice(self.action_dim, size=1)
        else:
            a = np.argmax(self.eval_model.predict(s)[0])
        return int(a)

    def train(self) -> float:
        # Sample a random batch from the replay buffer, do one TD update on
        # eval_model, and sync target_model every replace_point updates.
        i = np.random.choice(self.memory_size, size=self.memory_batch, replace=False)
        bs = self.memory['s'][i]
        ba = self.memory['a'][i]
        br = self.memory['r'][i]
        bs_ = self.memory['s_'][i]
        bd = self.memory['d'][i]
        eval_y = self.eval_model.predict(bs)
        target_y = self.target_model.predict(bs_)
        ys = br + self.gamma * np.max(target_y, axis=1).reshape(-1, 1) * (1 - bd)
        for i, a, y in zip(range(self.memory_batch), ba, ys):
            eval_y[i, a] = y
        loss = self.eval_model.train_on_batch(bs, eval_y)
        if self.replace_pointer % self.replace_point == 0:
            self.target_model.set_weights(self.eval_model.get_weights())
        self.replace_pointer += 1
        self.greedy = min(self.greedy * self.greedy_increase, self.max_greedy)
        return loss
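For context, the class is driven by a loop roughly like the one below, which produces the "Round: ..., reward: ..." output above (a reconstructed sketch: the gym wiring, the Keras network, and the print cadence are assumptions rather than the exact script).

import gym
import numpy as np
import tensorflow as tf

env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]   # 4 for CartPole
action_dim = env.action_space.n              # 2 for CartPole

# Simple MLP Q-network; the exact architecture is an assumption.
net = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(state_dim,)),
    tf.keras.layers.Dense(action_dim, activation='linear'),
])
net.compile(optimizer='adam', loss='mse')

agent = SimpleDQN(net, action_dim)

for episode in range(1, 101):
    s = env.reset().reshape(1, -1)
    episode_reward = 0.0
    done = False
    while not done:
        a = agent.choose_action(s)
        s_, r, done, _ = env.step(a)
        s_ = s_.reshape(1, -1)
        agent.storage(s, a, r, s_, done)
        # Start training once at least one batch worth of transitions is stored.
        if agent.memory_pointer >= agent.memory_batch:
            agent.train()
        s = s_
        episode_reward += r
    if episode % 20 == 0:
        print('Round: {}, reward: {}'.format(episode, episode_reward))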