How do I train an agent using reinforcement learning? - PullRequest
0 votes
/ 21 March 2020

The following code is all that is needed to load the environment and have the agent perform random actions:

env_name = 'TrainingEnvironment/FP_Esports_Competition'  # Name of the Unity environment binary to launch

Load the dependencies:

import matplotlib.pyplot as plt
import numpy as np
import sys
import random
import tensorflow as tf
import tflearn

from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig, EngineConfigurationChannel

%matplotlib inline

print("Python version:")
print(sys.version)

# check Python version
if (sys.version_info[0] < 3):
    raise Exception("ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3")

Launch the environment:

engine_configuration_channel = EngineConfigurationChannel()

env = UnityEnvironment(base_port = 5006, file_name=env_name, side_channels = [engine_configuration_channel])

# Set the resolution of the player window and the time scale of the engine. The agent view is always 128x128, this only impacts your view.
# You can use these parameters when you like, so you can switch between resolutions and timescales on the go within this notebook.
# There is a weird bug with the plugin that makes the timescale 100 when writing anything to the engine configuration channel, and 1 when not writing anything. 
# I do not know the cause of this, but I am finding out. Comment out that line to play in real time.

engine_configuration_channel.set_configuration_parameters(width = 800, height = 800)

#Reset the environment
env.reset()

# Set the default brain to work with
group_name = env.get_agent_groups()[0]
group_spec = env.get_agent_group_spec(group_name)

Code that makes the agent perform random actions:

env.reset()
step_result = env.get_step_result(group_name)
done = False
episode_rewards = 0

while not done:
    vector_obs = step_result.obs[1]
    vis_obs = step_result.obs[0]

    action = np.random.randn(step_result.n_agents(), group_spec.action_size)
    action[0][0] = 0.5 # forward speed = 50%
    action[0][1] = random.uniform(-1, 1) # rotation: random value in [-1, 1]
    env.set_actions(group_name, action)
    env.step()
    step_result = env.get_step_result(group_name)
    episode_rewards += step_result.reward[0]
    done = step_result.done[0]
print("Total reward this episode: {}".format(episode_rewards))

#Close the environment when done
env.close()
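
For reference, a quick check of what the environment returns, using only the variables already defined in the loop above:

# Shapes from the last step of the random-action loop
print("Number of agents:", step_result.n_agents())
print("Visual observation shape:", vis_obs.shape)    # the 128x128 agent view
print("Vector observation shape:", vector_obs.shape) # this length has to match the network input below
print("Action size:", group_spec.action_size)        # number of continuous actions per agent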

Now here is the problem: I have the code for a neural network, but I do not know how to integrate it for this specific reinforcement learning task.

# feed-forward part of network
observation = tflearn.input_data(shape=[None, 4]) # [batch size, length of the vector observation]
net = tflearn.fully_connected(observation, 256, activation="relu")
net = tflearn.fully_connected(net, 256, activation="relu")
net = tflearn.fully_connected(net, 256, activation="relu")
out = tflearn.fully_connected(net, 2, activation="softmax") # 2 = size of the action space | softmax turns the outputs into a probability distribution over the two actions

reward_holder = tf.placeholder(tf.float32, [None])
action_holder = tf.placeholder(tf.int32, [None])

responsible_outputs = tf.gather(tf.reshape(out, [-1]), tf.range(0, tf.shape(out)[0] * tf.shape(out)[1], 2) + action_holder) # for each step, pick out the probability the network assigned to the action that was actually taken
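# Illustration of the gather above (made-up values):
#   out           = [[0.7, 0.3], [0.4, 0.6]]  ->  flattened: [0.7, 0.3, 0.4, 0.6]
#   action_holder = [0, 1]                     ->  indices:   [0 + 0, 2 + 1] = [0, 3]
#   responsible_outputs = [0.7, 0.6]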

loss = -tf.reduce_mean(tf.log(responsible_outputs) * reward_holder) # REINFORCE policy-gradient loss: -mean(log(prob of taken action) * discounted reward)

optimizer = tf.train.AdamOptimizer() # Adam updates the network weights to minimise the policy-gradient loss above
update = optimizer.minimize(loss)
gamma = 0.99 # represents how much future rewards are prioritised (higher gamma = taking future more into account)

def discount_reward(rewards):
    running_reward = 0
    result = np.zeros_like(rewards) # create empty discount reward
    for i in reversed(range(len(rewards))): # go from last reward to first
        running_reward = rewards[i] + gamma * running_reward # return = reward now + discounted future return
        result[i] = running_reward
    return result
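# Example (illustrative, with gamma = 0.99): rewards [1, 1, 1] give discounted returns
#   [1 + 0.99*(1 + 0.99*1), 1 + 0.99*1, 1] = [2.9701, 1.99, 1.0]
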
engine_configuration_channel = EngineConfigurationChannel()

env = UnityEnvironment(base_port = 5006, file_name=env_name, side_channels = [engine_configuration_channel])

# Set the resolution of the player window and the time scale of the engine. The agent view is always 128x128, this only impacts your view.
# You can use these parameters when you like, so you can switch between resolutions and timescales on the go within this notebook.
# There is a weird bug with the plugin that makes the timescale 100 when writing anything to the engine configuration channel, and 1 when not writing anything. 
# I do not know the cause of this, but I am finding out. Comment out that line to play in real time.

engine_configuration_channel.set_configuration_parameters(width = 800, height = 800)

#Reset the environment
env.reset()

# Set the default brain to work with
group_name = env.get_agent_groups()[0]
group_spec = env.get_agent_group_spec(group_name)

num_episodes = 15000
max_time = 15000 # max frames per game
all_rewards = []
saver = tf.train.Saver()
train_data = []

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(num_episodes):
        obs = env.reset()
        episode_reward = 0 # keep track of reward of current episode
        ep_history = [] # keep track of state of the action in the rewards taken at each frame in this episode
        for j in range(max_time):
            # choose an action
            a_one_hot = sess.run(out, feed_dict={observation: [obs]}).reshape(2) # action probabilities from the network
            action = np.random.choice(a_one_hot, p=a_one_hot) # sample one of the two probabilities according to the distribution
            action = np.argmax(a_one_hot == action) # convert the sampled probability back to its index (0 or 1)
            obs1, r, d, _ = env.step(action) # gym-style step - this is the call I do not know how to express with the Unity environment API above
            ep_history.append([obs, r, action]) # keep record of frame
            obs = obs1
            episode_reward += r # add reward
            # update the network
            if d == True:
                all_rewards.append(episode_reward) # keep track of all rewards (for graphing) 
                ep_history = np.array(ep_history)
                ep_history[:, 1] = discount_reward(ep_history[:, 1]) # use function which takes into account future reward
                train_data.extend(ep_history) # store data to be used for future training
                if i % 10 == 0 and i != 0: # train model after every 10th episode
                    train_data = np.array(train_data)  # format training data to numpy array
                    sess.run(update, feed_dict={observation: np.vstack(train_data[:, 0]),
                                                    reward_holder: train_data[:, 1],
                                                    action_holder: train_data[:, 2]})
                    train_data = [] # reset training data - get data from new model
                break

        # print mean of rewards after every 100 episodes      
        if i % 100 == 0 and i != 0:
            print(np.mean(all_rewards[-100:]))
            if np.mean(all_rewards[-100:]) == 200:
                break

    saver.save(sess, "/tmp/model.ckpt")
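
For context, here is an untested sketch of how I imagine the gym-style calls above (env.reset() returning an observation, env.step(action) returning obs, reward, done) would have to be replaced by the mlagents_envs calls from the random-action loop at the top. It assumes the vector observation really has 4 components (to match the input_data shape), that sess, out, observation and ep_history are set up as above, and that the network's two outputs drive only the rotation, with the forward speed fixed at 0.5:

env.reset()
step_result = env.get_step_result(group_name)
obs = step_result.obs[1][0]                # vector observation of agent 0
done = False
episode_reward = 0

while not done:
    probs = sess.run(out, feed_dict={observation: [obs]}).reshape(2)
    a = np.random.choice(2, p=probs)       # sample action index 0 or 1 from the policy

    action = np.zeros((step_result.n_agents(), group_spec.action_size))
    action[0][0] = 0.5                     # forward speed = 50%, as in the random loop
    action[0][1] = -1.0 if a == 0 else 1.0 # map the discrete choice to full-left / full-right rotation

    env.set_actions(group_name, action)
    env.step()
    step_result = env.get_step_result(group_name)

    r = step_result.reward[0]
    done = step_result.done[0]
    ep_history.append([obs, r, a])         # same [state, reward, action] record as before
    obs = step_result.obs[1][0]
    episode_reward += r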

Any help would be appreciated. I know this is a big thing to ask.

...