I wrote a Q-learning implementation to solve the OpenAI FrozenLake-v0 environment using a simple neural network.
My neural network looks like this:
Input layer: 16
Output layer: 4
The plain TensorFlow implementation worked very well: after training for 10k episodes, around 70% of episodes were completed successfully.
After that I wanted to write the same algorithm using Keras, but this time it performed very poorly: after 10k episodes only about 5% of episodes were completed.
I assume I made a mistake in my Keras implementation, but I can't figure out what it is.
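Both versions feed the discrete state as a one-hot vector of length 16 and regress the 4 Q-values toward the standard one-step Q-learning target. For clarity, here is a minimal sketch of that shared update rule (the helper names one_hot_state and q_learning_target are only for illustration, they are not part of either script):

import numpy as np

def one_hot_state(s, n_states=16):
    # Discrete state s (0..15) as a one-hot row vector of shape (1, 16),
    # same as np.identity(16)[s:s + 1] used in both scripts below
    return np.identity(n_states)[s:s + 1]

def q_learning_target(q_pred, action, reward, q_next, discount_rate=0.99):
    # One-step Q-learning target: only the taken action's entry changes,
    # target[0, a] = r + gamma * max_a' Q(s', a')
    target = q_pred.copy()
    target[0, action] = reward + discount_rate * np.max(q_next)
    return target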
TensorFlow implementation:
import gym
import numpy as np
import tensorflow as tf
env = gym.make('FrozenLake-v0')
discount_rate = 0.99
random_action_chance = 0.1
num_episodes = 10000
max_episode_step = 100
log_interval = 100
tf.reset_default_graph()
inputs = tf.placeholder(shape=[1, 16], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16, 4], 0, 0.01))
Q = tf.matmul(inputs, W)
predict = tf.argmax(Q, 1)
Qnext = tf.placeholder(shape=[1, 4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(Qnext - Q))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = optimizer.minimize(loss)
init = tf.initialize_all_variables()
rewards_from_episodes = []
with tf.Session() as sess:
    sess.run(init)
    for episode in range(num_episodes):
        observation = env.reset()
        episode_reward = 0
        if episode % log_interval == 0 and episode > 0:
            print('Episode: {}, Reward: {}'.format(episode, sum(rewards_from_episodes[episode - log_interval: episode]) / log_interval))
        W1 = []
        for step in range(max_episode_step):
            # Select action
            action, targetQ = sess.run([predict, Q], feed_dict={inputs: np.identity(16)[observation:observation + 1]})
            if np.random.rand(1) < random_action_chance:
                action[0] = env.action_space.sample()
            new_observation, reward, done, _ = env.step(action[0])
            Qnew = sess.run(Q, feed_dict={inputs: np.identity(16)[new_observation:new_observation + 1]})
            maxQvalue = np.max(Qnew)
            targetQ[0, action[0]] = reward + discount_rate * maxQvalue
            # Train network using target and predicted Q values
            _, W1 = sess.run([updateModel, W], feed_dict={inputs: np.identity(16)[observation:observation + 1],
                                                          Qnext: targetQ})
            episode_reward += reward
            observation = new_observation
            if done:
                random_action_chance = 1. / ((episode / 50) + 10)
                break
        rewards_from_episodes.append(episode_reward)
print("Mean of all episodes: {}%".format(sum(rewards_from_episodes) / num_episodes))
Keras implementation:
import gym
import numpy as np
import tensorflow as tf
import random
from tensorflow.python.keras.layers import *
env = gym.make('FrozenLake-v0')
learning_rate = 0.1
discount_rate = 0.99
random_action_chance = 0.1
num_episodes = 10000
max_episode_step = 100
log_interval = 100
model = tf.keras.Sequential()
model.add(Dense(4, kernel_initializer='uniform'))
model.compile(optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
              loss='mean_squared_error')
rewards_from_episodes = []
for episode in range(num_episodes):
    observation = env.reset()
    episode_reward = 0
    if episode % log_interval == 0 and episode > 0:
        print('Episode: {}, Reward: {}'.format(episode, sum(
            rewards_from_episodes[episode - log_interval: episode]) / log_interval))
    for step in range(max_episode_step):
        # Select action
        targetQ = model.predict(np.identity(16)[observation:observation + 1], batch_size=1)
        action = np.argmax(targetQ)
        if random.random() < random_action_chance:
            action = env.action_space.sample()
        new_observation, reward, done, _ = env.step(action)
        Qnew = model.predict(np.identity(16)[new_observation:new_observation + 1], batch_size=1)
        maxQvalue = np.max(Qnew)
        targetQ[0, action] = reward + discount_rate * maxQvalue
        # Train network using target and predicted Q values
        model.fit(np.identity(16)[observation:observation + 1], targetQ, epochs=1, batch_size=1, verbose=0)
        episode_reward += reward
        observation = new_observation
        if done:
            random_action_chance = 1. / ((episode / 50) + 10)
            break
    rewards_from_episodes.append(episode_reward)
print("Mean of all episodes: {}%".format(sum(rewards_from_episodes) / num_episodes))