The following code is all it takes to load the environment and make the agent perform random actions:
env_name = 'TrainingEnvironment/FP_Esports_Competition' # Name of the Unity environment binary to launch
Load the dependencies:
import matplotlib.pyplot as plt
import numpy as np
import sys
import random
import tensorflow as tf
import tflearn
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig, EngineConfigurationChannel
%matplotlib inline
print("Python version:")
print(sys.version)
# check Python version
if sys.version_info[0] < 3:
    raise Exception("ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3")
Launch the environment:
engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(base_port = 5006, file_name=env_name, side_channels = [engine_configuration_channel])
# Set the resolution of the player window and the time scale of the engine. The agent view is always 128x128; this only affects your view.
# You can call this whenever you like, so you can switch resolutions and timescales on the fly within this notebook.
# There is a strange quirk in the plugin: the timescale becomes 100 as soon as anything is written to the engine configuration channel, and stays at 1 when nothing is written.
# I don't know the cause yet and am still looking into it. Comment out the line below to play in real time.
engine_configuration_channel.set_configuration_parameters(width = 800, height = 800)
#Reset the environment
env.reset()
# Set the default brain to work with
group_name = env.get_agent_groups()[0]
group_spec = env.get_agent_group_spec(group_name)
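To see what the environment actually exposes, I also print the group spec (this assumes the mlagents_envs 0.13/0.14-style API that get_agent_groups / get_agent_group_spec come from, where the spec has these fields):

# Quick sanity check of the observation/action layout reported by the environment
print("Observation shapes:", group_spec.observation_shapes)  # per-observation shapes, e.g. the 128x128 camera and the vector obs
print("Action type:", group_spec.action_type)                # continuous or discrete
print("Action size:", group_spec.action_size)                # number of action components per agent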
Code that makes the agent perform random actions:
env.reset()
step_result = env.get_step_result(group_name)
done = False
episode_rewards = 0
while not done:
    vector_obs = step_result.obs[1]  # vector observation
    vis_obs = step_result.obs[0]     # visual observation (agent camera)
    action = np.random.randn(step_result.n_agents(), group_spec.action_size)
    action[0][0] = 0.5                    # forward speed = 100%
    action[0][1] = random.uniform(-1, 1)  # rotation = random value in [-1, 1]
    env.set_actions(group_name, action)
    env.step()
    step_result = env.get_step_result(group_name)
    episode_rewards += step_result.reward[0]
    done = step_result.done[0]
print("Total reward this episode: {}".format(episode_rewards))
#Close the environment when done
env.close()
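Just to confirm the observations look right, I also plot the last camera frame the agent saw (assuming vis_obs from the loop above is shaped (n_agents, height, width, channels) with values in [0, 1]):

# Optional: display the first agent's camera frame from the last step of the loop above
plt.imshow(vis_obs[0])
plt.title("Agent camera observation")
plt.show()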
Now here is the problem: I have the code for a neural network, but I don't know how to integrate it into this specific reinforcement learning task:
# feed-forward part of the network
observation = tflearn.input_data(shape=[None, 4])  # [batch size, vector observation size]
net = tflearn.fully_connected(observation, 256, activation="relu")
net = tflearn.fully_connected(net, 256, activation="relu")
net = tflearn.fully_connected(net, 256, activation="relu")
out = tflearn.fully_connected(net, 2, activation="softmax")  # 2 = size of the action space | softmax turns the outputs into a probability distribution over actions
reward_holder = tf.placeholder(tf.float32, [None])
action_holder = tf.placeholder(tf.int32, [None])
# flatten `out` and pick, for every step, the probability of the action that was actually taken
# (the stride of 2 in tf.range equals the action space size)
responsible_outputs = tf.gather(tf.reshape(out, [-1]), tf.range(0, tf.shape(out)[0] * tf.shape(out)[1], 2) + action_holder)
loss = -tf.reduce_mean(tf.log(responsible_outputs) * reward_holder)  # policy-gradient loss: log-probability of the chosen action, weighted by its (discounted) reward
optimizer = tf.train.AdamOptimizer()  # Adam adjusts the weights to minimise the loss
update = optimizer.minimize(loss)
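To make sure I understand the tf.gather line, here is what it computes on a toy batch (plain NumPy, nothing from the environment):

import numpy as np

# Toy batch of 3 steps with 2 actions each (what `out` would hold after softmax)
probs = np.array([[0.9, 0.1],
                  [0.3, 0.7],
                  [0.6, 0.4]])
actions = np.array([0, 1, 1])  # action taken at each step

flat = probs.reshape(-1)                                          # [0.9, 0.1, 0.3, 0.7, 0.6, 0.4]
idx = np.arange(0, probs.shape[0] * probs.shape[1], 2) + actions  # [0, 3, 5]
print(flat[idx])  # [0.9 0.7 0.4] -> probability of the chosen action at each step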
gamma = 0.99 # represents how much future rewards are prioritised (higher gamma = taking future more into account)
def discount_reward(rewards):
    """Return the discounted cumulative reward for each step of an episode."""
    running_reward = 0
    result = np.zeros_like(rewards)
    for i in reversed(range(len(rewards))):  # go from the last reward to the first
        running_reward = rewards[i] + gamma * running_reward
        result[i] = running_reward
    return result
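And a quick check of what discount_reward gives, e.g. for a single reward of 1 at the last step with gamma = 0.99:

print(discount_reward(np.array([0.0, 0.0, 1.0])))  # [0.9801 0.99 1.] - earlier steps get credit for the future reward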
engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(base_port = 5006, file_name=env_name, side_channels = [engine_configuration_channel])
# Same window / timescale configuration as above (see the notes on the timescale quirk there).
engine_configuration_channel.set_configuration_parameters(width = 800, height = 800)
#Reset the environment
env.reset()
# Set the default brain to work with
group_name = env.get_agent_groups()[0]
group_spec = env.get_agent_group_spec(group_name)
num_episodes = 15000
max_time = 15000 # max frames per game
all_rewards = []
saver = tf.train.Saver()
train_data = []
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(num_episodes):
        obs = env.reset()  # NOTE: Gym-style call; UnityEnvironment.reset() returns nothing, so this still needs adapting
        episode_reward = 0  # keep track of the reward of the current episode
        ep_history = []  # record of [observation, reward, action] for each frame of this episode
        for j in range(max_time):
            # choose an action by sampling from the network's output probabilities
            a_one_hot = sess.run(out, feed_dict={observation: [obs]}).reshape(2)  # action probabilities for this observation
            action = np.random.choice(a_one_hot, p=a_one_hot)  # sample a probability value according to the distribution
            action = np.argmax(a_one_hot == action)  # convert it back to an action index (0 or 1)
            obs1, r, d, _ = env.step(action)  # NOTE: Gym-style step; the ML-Agents API uses set_actions() + step() + get_step_result()
            ep_history.append([obs, r, action])  # keep a record of this frame
            obs = obs1
            episode_reward += r  # add reward
            # update the network
            if d == True:
                all_rewards.append(episode_reward)  # keep track of all rewards (for graphing)
                ep_history = np.array(ep_history)
                ep_history[:, 1] = discount_reward(ep_history[:, 1])  # replace raw rewards with discounted rewards
                train_data.extend(ep_history)  # store data to be used for training
                if i % 10 == 0 and i != 0:  # train the model after every 10th episode
                    train_data = np.array(train_data)  # format the training data as a numpy array
                    sess.run(update, feed_dict={observation: np.vstack(train_data[:, 0]),
                                                reward_holder: train_data[:, 1],
                                                action_holder: train_data[:, 2]})
                    train_data = []  # reset the training data - gather fresh data from the updated model
                break
        # print the mean reward after every 100 episodes
        if i % 100 == 0 and i != 0:
            print(np.mean(all_rewards[-100:]))
            if np.mean(all_rewards[-100:]) >= 200:  # was '== 200'; an exact float comparison would never trigger
                break
    saver.save(sess, "/tmp/model.ckpt")
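As far as I can tell, env.reset() and env.step(action) in the loop above come from the OpenAI Gym API, while the environment I actually launched uses mlagents_envs. My best guess at bridging a single reset/step, assuming the same layout as in the random-action loop (obs[1] is the vector observation, obs[0] the camera, a single agent), would be helpers like these (hypothetical names, not part of any library). What I can't work out is how to turn the network's discrete choice of 0/1 into the (n_agents, action_size) float array that set_actions expects:

# Hypothetical helpers bridging the Gym-style loop with the mlagents_envs calls used earlier
def ml_agents_reset(env, group_name):
    env.reset()
    step_result = env.get_step_result(group_name)
    return step_result.obs[1][0]  # vector observation of agent 0

def ml_agents_step(env, group_name, continuous_action):
    # continuous_action: float array of shape (n_agents, action_size), like in the random-action loop
    env.set_actions(group_name, continuous_action)
    env.step()
    step_result = env.get_step_result(group_name)
    obs = step_result.obs[1][0]
    reward = step_result.reward[0]
    done = step_result.done[0]
    return obs, reward, done

With something like that, obs = env.reset() would become obs = ml_agents_reset(env, group_name), and obs1, r, d, _ = env.step(action) would become obs1, r, d = ml_agents_step(env, group_name, some_continuous_action) - but the action mapping itself is the part I'm asking about.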
Any help would be greatly appreciated. I know this is a lot to ask.