Why does a saved DDPG model, once restored, give a result that differs so much from the result at save time?
0 votes
08 April 2020

I save the trained model after a certain number of episodes with a custom save() method of the DDPG class (the network is saved once the episode reward reaches zero), but when I restore the model again with saver.restore(), the network yields a reward of roughly -1800. Why does this happen? Am I doing something wrong? My network:

import tensorflow as tf
import numpy as np
import gym

episode_steps = 500

# learning rate for actor
lr_a = 0.001

# learning rate for critic
lr_c = 0.002
gamma = 0.9
alpha = 0.01
memory = 10000
batch_size = 32
render = True


class DDPG(object):
    def __init__(self, no_of_actions, no_of_states, a_bound):
        self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)

        # initialize pointer to point to our experience buffer
        self.pointer = 0

        self.sess = tf.Session()

        self.noise_variance = 3.0

        self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound

        self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')
        self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')
        self.reward = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self.build_actor_network(self.state, scope='eval', trainable=True)
            a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)

        with tf.variable_scope('Critic'):
            q = self.build_crtic_network(self.state, self.a, scope='eval', trainable=True)
            q_ = self.build_crtic_network(self.next_state, a_, scope='target', trainable=False)

        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')

        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # update target value
        self.soft_replace = [
            [tf.assign(at, (1 - alpha) * at + alpha * ae), tf.assign(ct, (1 - alpha) * ct + alpha * ce)]
            for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]

        q_target = self.reward + gamma * q_

        # TD error: mean squared error between the target value and the critic's estimate
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)

        self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name="adam-ink", var_list=self.ce_params)

        a_loss = - tf.reduce_mean(q)

        # train the actor network with adam optimizer for minimizing the loss
        self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)

        tf.summary.FileWriter("logs2", self.sess.graph)

        # initialize all variables

        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        self.saver.restore(self.sess, "Pendulum/nn.ckpt")


    def choose_action(self, s):
        a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]
        a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)

        return a

    def learn(self):
        # soft target replacement
        self.sess.run(self.soft_replace)

        indices = np.random.choice(memory, size=batch_size)
        batch_transition = self.memory[indices, :]
        batch_states = batch_transition[:, :self.no_of_states]
        batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]
        batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]
        batch_next_state = batch_transition[:, -self.no_of_states:]

        self.sess.run(self.atrain, {self.state: batch_states})
        self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards,
                                    self.next_state: batch_next_state})

    # we define a function store_transition which stores all the transition information in the buffer
    def store_transition(self, s, a, r, s_):
        trans = np.hstack((s, a, [r], s_))

        index = self.pointer % memory
        self.memory[index, :] = trans
        self.pointer += 1

        if self.pointer > memory:
            self.noise_variance *= 0.99995
            self.learn()

    # we define build_actor_network for building the actor network; the critic network follows
    def build_actor_network(self, s, scope, trainable):
        with tf.variable_scope(scope):
            l1 = tf.layers.dense(s, 30, activation=tf.nn.tanh, name='l1', trainable=trainable)
            a = tf.layers.dense(l1, self.no_of_actions, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name="scaled_a")               

    def build_crtic_network(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.tanh(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)

            q = tf.layers.dense(net, 1, trainable=trainable)
            return q

    def save(self):
        self.saver.save(self.sess, "Pendulum/nn.ckpt")

env = gym.make("Pendulum-v0")
env = env.unwrapped
env.seed(1)

no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]

a_bound = env.action_space.high
ddpg = DDPG(no_of_actions, no_of_states, a_bound)

total_reward = []

no_of_episodes = 300
# for each episode
for i in range(no_of_episodes):
    # initialize the environment
    s = env.reset()

    # episodic reward
    ep_reward = 0

    for j in range(episode_steps):

        env.render()

        # select an action by adding exploration noise (Gaussian here, not an OU process)
        a = ddpg.choose_action(s)

        # perform the action and move to the next state s_
        s_, r, done, info = env.step(a)

        # store the transition in our experience buffer
        # sample some minibatch of experience and train the network
        ddpg.store_transition(s, a, r, s_)

        # update current state as next state
        s = s_

        # add episodic rewards
        ep_reward += r

        if int(ep_reward) == 0 and i > 200:
            ddpg.save()
            print("save")
            quit()

        if j == episode_steps - 1:
            total_reward.append(ep_reward)
            print('Episode:', i, ' Reward: %i' % int(ep_reward))

            break
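
A minimal evaluation sketch for checking the restored model, assuming the DDPG class above is saved in its own importable file (the module name ddpg_pendulum is hypothetical), the checkpoint path "Pendulum/nn.ckpt" from the code, and the variable name Actor/eval/l1/kernel implied by the scopes above. It first verifies that the restored weights match the checkpoint, then runs the actor deterministically, bypassing choose_action() and the Gaussian noise it would add with the freshly reset noise_variance = 3.0:

import numpy as np
import tensorflow as tf
import gym

# hypothetical module: the DDPG class above saved on its own in ddpg_pendulum.py
# (if the training loop stays in the same file, it would run on import)
from ddpg_pendulum import DDPG

env = gym.make("Pendulum-v0")
env = env.unwrapped
env.seed(1)

no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]
a_bound = env.action_space.high

# __init__ builds the graph and calls saver.restore(self.sess, "Pendulum/nn.ckpt")
agent = DDPG(no_of_actions, no_of_states, a_bound)

# sanity check: the live weights should match the checkpoint exactly after restore
reader = tf.train.load_checkpoint("Pendulum/nn.ckpt")
kernel = tf.get_default_graph().get_tensor_by_name("Actor/eval/l1/kernel:0")
print("weights match checkpoint:", np.allclose(agent.sess.run(kernel),
                                               reader.get_tensor("Actor/eval/l1/kernel")))

# evaluate the restored policy deterministically: feed states straight through the
# actor instead of calling choose_action(), which adds np.random.normal noise
for episode in range(5):
    s = env.reset()
    ep_reward = 0.0
    for _ in range(500):
        a = agent.sess.run(agent.a, {agent.state: s[np.newaxis, :]})[0]
        s, r, done, info = env.step(a)
        ep_reward += r
    print('Evaluation episode:', episode, ' Reward: %i' % int(ep_reward))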