Визуализация тензорной доски в примере Atari (MsPacman) по изучению глубокого армирования - PullRequest
0 голосов
/ 04 мая 2018

В моей диссертации я описываю пример глубокого обучения (DRL), написанного на python. Я не писал код, я только что запустил обучение на сервере Linux, и все работает отлично.

Теперь я нахожусь в точке, где я хочу визуализировать точность / прогноз, потери, стабильность обучения и так далее с помощью тензорной доски . Я работаю в среде conda virutal, где я установил тренажерный зал, atari-py, Pillow и PyOpenGL. На сервере установлен TensorFlow-GPU. Это - ссылка на хранилище, откуда я получил код. Я смотрел учебники о тензорной доске, и я понял, но я не могу включить эти переменные, такие как tf.summary.histogram() в мой код. Я не знаю, где именно их разместить и какие переменные могут быть важны для визуализации.

Я уже сделал так, что тензорная доска визуализирует всю сеть с помощью средства записи файлов в виде графика, который выглядит как this .

Но теперь я застрял. Каждый раз, когда я хочу включить гистограммы переменных, код выдает ошибки. Кажется, что я поставил их в неправильные позиции, или я пытаюсь визуализировать неправильные переменные. (Я не очень знаком с кодом Python, возможно, я неправильно понял синтаксис.) Было бы очень приятно, если бы кто-то мог мне помочь.

Этот является ссылкой на туториал по тензорной доске, на который я ориентирован.

Ниже вы видите код. Пример состоит только из этого файла.

from __future__ import division, print_function, unicode_literals

# Handle arguments (before slow imports so --help can be fast)
import argparse

parser = argparse.ArgumentParser(
    description="Train a DQN net to play MsMacman.")
parser.add_argument("-n", "--number-steps", type=int, default=4000000,
                    help="total number of training steps")
parser.add_argument("-l", "--learn-iterations", type=int, default=4,
                    help="number of game iterations between each training step")
parser.add_argument("-s", "--save-steps", type=int, default=1000,
                    help="number of training steps between saving checkpoints")
parser.add_argument("-c", "--copy-steps", type=int, default=10000,
                    help="number of training steps between copies of online DQN to target DQN")
parser.add_argument("-r", "--render", action="store_true", default=False,
                    help="render the game during training or testing")
parser.add_argument("-p", "--path", default="my_dqn.ckpt",
                    help="path of the checkpoint file")
parser.add_argument("-t", "--test", action="store_true", default=False,
                    help="test (no learning and minimal epsilon)")
parser.add_argument("-v", "--verbosity", action="count", default=0,
                    help="increase output verbosity")
args = parser.parse_args()

from collections import deque
import gym
import numpy as np
import os
import tensorflow as tf

writer = tf.summary.FileWriter("/home/maggie/tbfiles/1")

env = gym.make("MsPacman-v0")
done = True  # env needs to be reset

# First let's build the two DQNs (online & target)
input_height = 88
input_width = 80
input_channels = 1
conv_n_maps = [32, 64, 64]
conv_kernel_sizes = [(8, 8), (4, 4), (3, 3)]
conv_strides = [4, 2, 1]
conv_paddings = ["SAME"] * 3
conv_activation = [tf.nn.relu] * 3

# TESTING ACTIVATION VIS TENSORBOARD
#tf.summary.histogram("activation", conv_activation)
n_hidden_in = 64 * 11 * 10  # conv3 has 64 maps of 11x10 each
n_hidden = 512
hidden_activation = tf.nn.relu
n_outputs = env.action_space.n  # 9 discrete actions are available
initializer = tf.contrib.layers.variance_scaling_initializer()


def q_network(X_state, name):
    prev_layer = X_state
    with tf.variable_scope(name) as scope:
        for n_maps, kernel_size, strides, padding, activation in zip(
                conv_n_maps, conv_kernel_sizes, conv_strides,
                conv_paddings, conv_activation):
            prev_layer = tf.layers.conv2d(
                prev_layer, filters=n_maps, kernel_size=kernel_size,
                strides=strides, padding=padding, activation=activation,
                kernel_initializer=initializer)

        last_conv_layer_flat = tf.reshape(prev_layer, shape=[-1, n_hidden_in])
        hidden = tf.layers.dense(last_conv_layer_flat, n_hidden,
                                 activation=hidden_activation,
                                 kernel_initializer=initializer)
        outputs = tf.layers.dense(hidden, n_outputs,
                                  kernel_initializer=initializer)
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope=scope.name)
    trainable_vars_by_name = {var.name[len(scope.name):]: var
                              for var in trainable_vars}
    return outputs, trainable_vars_by_name


X_state = tf.placeholder(tf.float32, shape=[None, input_height, input_width,
                                            input_channels], name="x_state")
online_q_values, online_vars = q_network(X_state, name="q_networks/online")
target_q_values, target_vars = q_network(X_state, name="q_networks/target")

# We need an operation to copy the online DQN to the target DQN
copy_ops = [target_var.assign(online_vars[var_name])
            for var_name, target_var in target_vars.items()]
copy_online_to_target = tf.group(*copy_ops)

# Now for the training operations
learning_rate = 0.001
momentum = 0.95

with tf.variable_scope("train"):
    X_action = tf.placeholder(tf.int32, shape=[None], name="x_action")
    y = tf.placeholder(tf.float32, shape=[None, 1], name="labels")
    q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, n_outputs),
                            axis=1, keep_dims=True)
    error = tf.abs(y - q_value)
    clipped_error = tf.clip_by_value(error, 0.0, 1.0)
    linear_error = 2 * (error - clipped_error)
    loss = tf.reduce_mean(tf.square(clipped_error) + linear_error, name="loss")

    global_step = tf.Variable(0, trainable=False, name='global_step')
    optimizer = tf.train.MomentumOptimizer(
        learning_rate, momentum, use_nesterov=True, name="xent")
    training_op = optimizer.minimize(loss, global_step=global_step)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

# Let's implement a simple replay memory
replay_memory_size = 20000
replay_memory = deque([], maxlen=replay_memory_size)


def sample_memories(batch_size):
    indices = np.random.permutation(len(replay_memory))[:batch_size]
    cols = [[], [], [], [], []]  # state, action, reward, next_state, continue
    for idx in indices:
        memory = replay_memory[idx]
        for col, value in zip(cols, memory):
            col.append(value)
    cols = [np.array(col) for col in cols]
    return (cols[0], cols[1], cols[2].reshape(-1, 1), cols[3],
            cols[4].reshape(-1, 1))


# And on to the epsilon-greedy policy with decaying epsilon
eps_min = 0.1
eps_max = 1.0 if not args.test else eps_min
eps_decay_steps = args.number_steps // 2


def epsilon_greedy(q_values, step):
    epsilon = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)  # random action
    else:
        return np.argmax(q_values)  # optimal action


# We need to preprocess the images to speed up training
mspacman_color = np.array([210, 164, 74]).mean()


def preprocess_observation(obs):
    img = obs[1:176:2, ::2]  # crop and downsize
    img = img.mean(axis=2)  # to greyscale
    img[img == mspacman_color] = 0  # Improve contrast
    img = (img - 128) / 128 - 1  # normalize from -1. to 1.
    return img.reshape(88, 80, 1)


# TensorFlow - Execution phase
training_start = 10000  # start training after 10,000 game iterations
discount_rate = 0.99
skip_start = 90  # Skip the start of every game (it's just waiting time).
batch_size = 50
iteration = 0  # game iterations
done = True  # env needs to be reset

# We will keep track of the max Q-Value over time and compute the mean per game
loss_val = np.infty
game_length = 0
total_max_q = 0
mean_max_q = 0.0

with tf.Session() as sess:
    if os.path.isfile(args.path + ".index"):
        saver.restore(sess, args.path)
    else:
        init.run()
        copy_online_to_target.run()
    while True:
        step = global_step.eval()
        if step >= args.number_steps:
            break
        iteration += 1
        if args.verbosity > 0:
            print("\rIteration {}   Training step {}/{} ({:.1f})%   "
                  "Loss {:5f}    Mean Max-Q {:5f}   ".format(
                iteration, step, args.number_steps, step * 100 / args.number_steps,
                loss_val, mean_max_q), end="")
        if done:  # game over, start again
            obs = env.reset()
            for skip in range(skip_start):  # skip the start of each game
                obs, reward, done, info = env.step(0)
            state = preprocess_observation(obs)

        if args.render:
            env.render()

        # Online DQN evaluates what to do
        q_values = online_q_values.eval(feed_dict={X_state: [state]})
        action = epsilon_greedy(q_values, step)

        # Online DQN plays
        obs, reward, done, info = env.step(action)
        next_state = preprocess_observation(obs)

        # Let's memorize what happened
        replay_memory.append((state, action, reward, next_state, 1.0 - done))
        state = next_state

        if args.test:
            continue

        # Compute statistics for tracking progress (not shown in the book)
        total_max_q += q_values.max()
        game_length += 1
        if done:
            mean_max_q = total_max_q / game_length
            total_max_q = 0.0
            game_length = 0

        if iteration < training_start or iteration % args.learn_iterations != 0:
            continue  # only train after warmup period and at regular intervals

        # Sample memories and use the target DQN to produce the target Q-Value
        X_state_val, X_action_val, rewards, X_next_state_val, continues = (
            sample_memories(batch_size))
        next_q_values = target_q_values.eval(
            feed_dict={X_state: X_next_state_val})
        max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
        y_val = rewards + continues * discount_rate * max_next_q_values

        # Train the online DQN
        _, loss_val = sess.run([training_op, loss], feed_dict={
            X_state: X_state_val, X_action: X_action_val, y: y_val})

        # Regularly copy the online DQN to the target DQN
        if step % args.copy_steps == 0:
            copy_online_to_target.run()

        # And save regularlys
        if step % args.save_steps == 0:
            saver.save(sess, os.path.join(os.getcwd(), 'my_dqn.ckpt'))

        merged_summary = tf.summary.merge_all()
        writer.add_graph(sess.graph)

Это ошибка, которую я получаю от терминала, когда комментирую строку #tf.summary.histogram("activation", conv_activation) in. (Строка 48)

(pacman) maggie@neuronalresearch:~/Documents/AI/my_project_folder/my_project$ python tiny_dqn.py -v --number-steps 1000
Traceback (most recent call last):
  File "/opt/anaconda3/envs/pacman/lib/python3.5/site-packages/tensorflow/python/framework/tensor_util.py", line 468, in make_tensor_proto
    str_values = [compat.as_bytes(x) for x in proto_values]
  File "/opt/anaconda3/envs/pacman/lib/python3.5/site-packages/tensorflow/python/framework/tensor_util.py", line 468, in <listcomp>
    str_values = [compat.as_bytes(x) for x in proto_values]
  File "/opt/anaconda3/envs/pacman/lib/python3.5/site-packages/tensorflow/python/util/compat.py", line 65, in as_bytes
    (bytes_or_text,))
TypeError: Expected binary or unicode string, got <function relu at 0x7f8999cca048>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "tiny_dqn.py", line 48, in <module>
    tf.summary.histogram("activation", conv_activation)
  File "/opt/anaconda3/envs/pacman/lib/python3.5/site-packages/tensorflow/python/summary/summary.py", line 192, in histogram
    tag=tag, values=values, name=scope)
  File "/opt/anaconda3/envs/pacman/lib/python3.5/site-packages/tensorflow/python/ops/gen_logging_ops.py", line 188, in _histogram_summary
    "HistogramSummary", tag=tag, values=values, name=name)
  File "/opt/anaconda3/envs/pacman/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 513, in _apply_op_helper
    raise err
  File "/opt/anaconda3/envs/pacman/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 510, in _apply_op_helper
    preferred_dtype=default_dtype)
  File "/opt/anaconda3/envs/pacman/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 926, in internal_convert_to_tensor
    ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
  File "/opt/anaconda3/envs/pacman/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py", line 229, in _constant_tensor_conversion_function
    return constant(v, dtype=dtype, name=name)
  File "/opt/anaconda3/envs/pacman/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py", line 208, in constant
    value, dtype=dtype, shape=shape, verify_shape=verify_shape))
  File "/opt/anaconda3/envs/pacman/lib/python3.5/site-packages/tensorflow/python/framework/tensor_util.py", line 472, in make_tensor_proto
    "supported type." % (type(values), values))
TypeError: Failed to convert object of type <class 'list'> to Tensor. Contents: [<function relu at 0x7f8999cca048>, <function relu at 0x7f8999cca048>, <function relu at 0x7f8999cca048>]. Consider casting elements to a supported type.
(pacman) maggie@neuronalresearch:~/Documents/AI/my_project_folder/my_project$ 
...