DDPG - Actor-Critic network does not converge with a continuous action space
0 votes
16 October 2018

I'm a bit lost right now. I have implemented an Actor-Critic network, and depending on the setup it either starts to converge a little but produces values that are far from correct, or it keeps producing almost identical loss values over and over while producing values that are sort of correct right from the start. I really have no idea how that is possible. This is my current model, which produces values but does not converge:

# imports used by the code below
import random
import warnings
from collections import deque
from copy import copy
from datetime import timedelta

import numpy as np
import tensorflow as tf
from keras.layers import Input, Dense, Add, BatchNormalization, LeakyReLU
from keras.models import Model
from keras.optimizers import Adam

def create_actor_model(self):
    state_input = Input(shape=self.observation_space_shape)
    h1 = Dense(18, activation='linear')(state_input)
    h1l = LeakyReLU(alpha=0.01)(h1)
    h3 = Dense(18, activation='tanh')(h1l)
    h3n = BatchNormalization()(h3)
    output = Dense(self.action_space_shape[0], activation='tanh')(h3n)
    model = Model(input=state_input, output=output)
    adam = Adam(lr=self.learning_rate)
    model.compile(loss="mse", optimizer=adam)
    return state_input, model

def create_critic_model(self):
    state_input = Input(shape=self.observation_space_shape)
    state_h1 = Dense(18, activation='relu')(state_input)
    state_h2 = Dense(36)(state_h1)

    action_input = Input(shape=self.action_space_shape)
    action_h1 = Dense(36)(action_input)

    merged = Add()([state_h2, action_h1])
    l_h1 = LeakyReLU(alpha=0.01)(merged)
    merged_h1 = Dense(18, activation='tanh')(l_h1)
    h1n = BatchNormalization()(merged_h1)
    output = Dense(1, activation='tanh')(h1n)
    model = Model(input=[state_input, action_input], output=output)

    adam = Adam(lr=self.learning_rate)
    model.compile(loss="mse", optimizer=adam, metrics=['mae', 'mse', 'msle'])
    return state_input, action_input, model


def _train_actor_batch(self, batch_size, s_batch, a_batch, r_batch, s2_batch):
    predicted_action = self.actor_model.predict_on_batch(s_batch)
    grads = self.sess.run(self.critic_grads, feed_dict={
        self.critic_state_input: s_batch,
        self.critic_action_input: predicted_action
    })

    self.sess.run(self.optimize, feed_dict={
        self.actor_state_input: s_batch,
        self.actor_critic_grad: grads[0]
    })
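
# What I think these two sess.run calls do (following the DDPG paper): the first one
# evaluates the critic's gradient dQ(s, a)/da at a = actor(s); the second feeds that
# gradient in as the "upstream" gradient of the tf.gradients call in __init__, so the
# actor's weights are pushed in the direction that increases the critic's Q-value.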

def _train_critic_batch(self, batch_size, s_batch, a_batch, r_batch, s2_batch):
    target_action = self.target_actor_model.predict_on_batch(s2_batch)
    future_reward = self.target_critic_model.predict_on_batch([s2_batch, target_action])

    rewards = []
    for k in range(batch_size):
        this_future_reward = future_reward[k] if batch_size > 1 else future_reward
        rewards.append(r_batch[k] + self.gamma * this_future_reward)

    return self.critic_model.train_on_batch([s_batch, a_batch], np.reshape(rewards, batch_size))
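
# The loop above is supposed to build the standard one-step DDPG target for every
# sample in the batch,
#     y_k = r_k + gamma * Q_target(s2_k, mu_target(s2_k)),
# and the critic is then regressed onto y with the MSE loss set in compile().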

def replay(self, batch_size):
    memory_length = len(self.memory)
    if memory_length < batch_size:
        samples = random.sample(self.memory, memory_length)
    else:
        samples = random.sample(self.memory, batch_size)

    s_batch = np.array([cur_state[0] for cur_state, _, _, _ in samples])
    a_batch = np.array([float(action[0]) for _, action, _, _ in samples])
    r_batch = np.array([reward[0] for _, _, reward, _ in samples])
    s2_batch = np.array([new_state[0] for _, _, _, new_state in samples])

    critic_loss = self._train_critic_batch(len(s_batch), s_batch, a_batch, r_batch, s2_batch)
    self._train_actor_batch(len(s_batch), s_batch, a_batch, r_batch, s2_batch)

    self.update_target()
    return critic_loss

def _update_actor_target(self):
    actor_model_weights = self.actor_model.get_weights()
    actor_target_weights = self.target_actor_model.get_weights()

    for i in range(len(actor_target_weights)):
        actor_target_weights[i] = actor_model_weights[i] * self.tau + actor_target_weights[i] * (1 - self.tau)
    self.target_actor_model.set_weights(actor_target_weights)

def _update_critic_target(self):
    critic_model_weights = self.critic_model.get_weights()
    critic_target_weights = self.target_critic_model.get_weights()

    for i in range(len(critic_target_weights)):
        critic_target_weights[i] = critic_model_weights[i] * self.tau + critic_target_weights[i] * (1 - self.tau)
    self.target_critic_model.set_weights(critic_target_weights)

def update_target(self):
    self._update_actor_target()
    self._update_critic_target()
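
# Both helpers above do the soft ("Polyak") target update from the DDPG paper,
#     theta_target <- tau * theta + (1 - tau) * theta_target,
# so the target networks only slowly track the online networks.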



def __init__(self):
    self.memory = deque(maxlen=2000)
    self.actor_state_input, self.actor_model = self.create_actor_model()
    _, self.target_actor_model = self.create_actor_model()

    self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.action_space_shape[0]])

    actor_model_weights = self.actor_model.trainable_weights
    self.actor_grads = tf.gradients(self.actor_model.output,
                                    actor_model_weights, -self.actor_critic_grad)
    grads = zip(self.actor_grads, actor_model_weights)
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)

    self.critic_state_input, self.critic_action_input, \
    self.critic_model = self.create_critic_model()
    _, _, self.target_critic_model = self.create_critic_model()

    self.critic_grads = tf.gradients(self.critic_model.output,
                                     self.critic_action_input)

    self.sess.run(tf.initialize_all_variables())
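
My understanding is that the gradient plumbing in __init__ is meant to implement the deterministic policy gradient from the DDPG paper,

    \nabla_\theta J \approx \mathbb{E}\big[ \nabla_a Q(s, a)\big|_{a=\mu(s)} \, \nabla_\theta \mu(s) \big]

where critic_grads supplies \nabla_a Q and the minus sign on actor_critic_grad turns AdamOptimizer's minimization into ascent on Q. If I am misreading that part, it could of course already explain the behaviour.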

I then train with the following method, which is called once per epoch (the memory that gets cleared at the end is the experience replay buffer):

def train(self, states, epoch, env, is_new_epoch):
    train_size = int(len(states) * 0.70)

    train = dict(list(states.items())[0:train_size])
    test = dict(list(states.items())[train_size:len(states)])
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        working_states = copy(train)

        critic_eval = list()
        rewards = dict()
        for last_day, (last_state_vec, _, last_action) in working_states.items():
            this_day = last_day + timedelta(days=1)
            if this_day in working_states:
                (new_state_vec, _, _) = working_states.get(this_day)
                rewards[last_day] = env.get_reward_by_states(last_state_vec, new_state_vec)

        amt = len(working_states)
        i = 0
        for last_day, (last_state_vec, _, last_action) in working_states.items():
            i += 1
            this_day = last_day + timedelta(days=1)
            if this_day in working_states:
                (new_state_vec, _, _) = working_states.get(this_day)
                reward = np.reshape(rewards[last_day], [1, ])
                self.remember(last_state_vec, [last_action], reward, new_state_vec)
                new_eval = self.replay(env.batch_size)
                critic_eval.append(new_eval)
        self.memory.clear()

These are the loss values I got over 15 epochs: [image: Actor and Critic loss values over 15 epochs]

One sample taken from the memory:

state
  [8 79 48246 53607 29 34 37 Decimal('1.0000000000') 6]
action
  0.85
reward
  0.2703302
next state
  [9 79 48074 57869 27 28 32 Decimal('1.0000000000') 0]
...