My deterministic policy gradient model learns nothing even after 2000 iterations - PullRequest
1 vote
June 20, 2019

I have tried different hyperparameters and different numbers of layers and nodes, but my model learns nothing even after 2000 iterations. I also tried the MountainCarContinuous-v0 environment, but that did not work either.

I have also tried a different architecture and a model from GitHub, but my model still learns nothing.
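For reference, this is the update I am trying to implement (my own understanding of DDPG, so the arrays below are purely illustrative): the critic is trained towards y = r + gamma * Q'(s', mu'(s')) computed from the target networks, and both target networks are moved by a soft update theta_target <- TAU*theta + (1 - TAU)*theta_target. A small standalone NumPy sketch of just those two steps:

# Standalone sketch of the two updates the code below is supposed to perform
# (illustrative numbers only, not data from my runs).
import numpy as np

gamma, tau = 0.99, 0.001

# Critic target: y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1})), no bootstrap on terminal steps.
rewards = np.array([[-1.3], [-0.7]])          # batch of rewards
next_target_q = np.array([[-55.0], [-48.0]])  # Q'(s', mu'(s')) from the target critic
dones = np.array([[0.0], [1.0]])              # 1.0 marks a terminal transition
y = rewards + gamma * next_target_q * (1.0 - dones)
print(y)  # [[-55.75], [-0.7]] -> what critic.model.train_on_batch([states, actions], y) should fit

# Soft target update: theta_target <- tau*theta + (1 - tau)*theta_target
weights = [np.ones((2, 2))]
target_weights = [np.zeros((2, 2))]
target_weights = [tau * w + (1 - tau) * tw for w, tw in zip(weights, target_weights)]
print(target_weights[0][0, 0])  # 0.001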

import numpy as np
import tensorflow as tf
import random
import gym
import pylab
import sys

from keras.initializers import RandomUniform
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.layers import Dense, Input, Add, Concatenate, Flatten, GaussianNoise, Lambda
from keras import backend as K
from collections import deque

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)

EPISODES = 100000

class Actor(object):

    def __init__(self, sess, state_size, action_size, TAU, lr, action_bound=1, load=False):
        self.sess = sess
        self.TAU = TAU
        self.lr = lr
        self.load = load
        self.action_bound = action_bound

        # Online and target actor networks; the target copy is moved towards the
        # online weights by the soft update in update().
        self.model, self.weights, self.state = self.build_network(state_size, action_size)
        self.target_model, self.target_weights, self.target_state = self.build_network(state_size, action_size)
        # Deterministic policy gradient: dQ/da from the critic is fed in (negated)
        # as the initial gradient, so Adam ascends Q with respect to the actor weights.
        self.q_grads = tf.placeholder(tf.float32, [None, action_size])
        self.grads = tf.gradients(self.model.output, self.weights, -self.q_grads)
        self.optimize = tf.train.AdamOptimizer(lr).apply_gradients(zip(self.grads, self.weights))
        self.sess.run(tf.global_variables_initializer())

        if self.load:
            self.model.load_weights("./DDPG_Actor.h5")
            self.target_model.load_weights("./DDPG_Actor_target.h5")

    def train(self, state, grads):
        self.sess.run(self.optimize, feed_dict={self.state : state, self.q_grads : grads})

    def update(self):
        W, target_W = self.model.get_weights(), self.target_model.get_weights()
        for i in range(len(W)):
            target_W[i] = self.TAU*W[i] + (1 - self.TAU)*target_W[i]
        self.target_model.set_weights(target_W)

    def save(self):
        self.model.save_weights("./DDPG_Actor.h5")
        self.target_model.save_weights("./DDPG_Actor_target.h5")

    def build_network(self, state_size, action_size):
        # Actor: state -> action in [-action_bound, action_bound].
        state_input = Input(shape=[state_size])
        X = Dense(400, activation='relu', kernel_initializer='glorot_normal')(state_input)
        X = Dense(300, activation='relu', kernel_initializer='glorot_normal')(X)
        output = Dense(action_size, activation='tanh', kernel_initializer='glorot_normal')(X)
        output = Lambda(lambda i: i * self.action_bound)(output)
        model = Model(inputs=state_input, outputs=output)
        return model, model.trainable_weights, state_input


class Critic(object):

    def __init__(self, sess, state_size, action_size, TAU, lr, load=False):
        self.sess = sess
        self.TAU = TAU
        self.lr = lr
        self.load = load
        self.optimizer = tf.train.AdamOptimizer(lr)

        self.model, self.state, self.action = self.build_network(state_size, action_size)
        self.target_model, self.target_state, self.target_action = self.build_network(state_size, action_size)
        # dQ(s, a)/da from the online critic; fed to the actor as its policy-gradient signal.
        self.q_grads = tf.gradients(self.model.output, self.action)
        self.sess.run(tf.global_variables_initializer())

        if self.load:
            self.model.load_weights("./DDPG_Critic.h5")
            self.target_model.load_weights("./DDPG_Critic_target.h5")

    def gradients(self, state, action):
        return self.sess.run(self.q_grads, feed_dict={self.state : state, self.action : action})[0]

    def save(self):
        self.model.save_weights("./DDPG_Critic.h5")
        self.target_model.save_weights("./DDPG_Critic_target.h5")

    def update(self):
        W, target_W = self.model.get_weights(), self.target_model.get_weights()
        for i in range(len(W)):
            target_W[i] = self.TAU*W[i] + (1 - self.TAU)*target_W[i]
        self.target_model.set_weights(target_W)        

    def build_network(self, state_size, action_size):
        # Critic: (state, action) -> Q-value (action_size is 1 here, so the output is a single Q per sample).
        S = Input(shape=[state_size])
        A = Input(shape=[action_size])
        X1 = Dense(400, activation='relu', kernel_initializer='glorot_normal')(S)
        X2 = Dense(400, activation='relu', kernel_initializer='glorot_normal')(A)
        X = Add()([X1,X2])
        X = Dense(300, activation='relu', kernel_initializer='glorot_normal')(X)
        output = Dense(action_size, activation='linear', kernel_initializer='glorot_normal')(X)
        model = Model(inputs=[S, A], outputs=output)
        model.compile(loss='mse', optimizer=Adam(lr=self.lr))
        return model, S, A


class DDPG(object):
    def __init__(self, sess, state_size, action_size, action_bound=1, memory_size=5000, batch_size=64, actor_lr=0.0001, critic_lr=0.001, gamma=0.99, TAU=0.001):
        self.sess = sess
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.TAU = TAU
        self.train_start = 1000
        # Exploration: OU noise scaled by epsilon, annealed linearly from 1 to 0.001 over 100000 steps.
        self.epsilon = 1
        self.epsilon_min = 0.001
        self.mu = 0.0
        self.x = 0
        self.theta = 0.01
        self.sigma = 0.1
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 100000


        self.actor = Actor(sess, state_size, action_size, TAU, actor_lr,action_bound, load=False)
        self.critic = Critic(sess, state_size, action_size, TAU, critic_lr, load=False)

    def append(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

    def OU(self):
        # Ornstein-Uhlenbeck process: temporally correlated exploration noise around mu.
        dx = self.theta*(self.mu - self.x) + self.sigma*np.random.randn(1)
        self.x += dx
        return self.x

    def get_action(self, state):
        action =  self.actor.model.predict(state)[0]
        return action + self.OU()*self.epsilon

    def save(self):
        self.actor.save()
        self.critic.save()

    def train(self):
        if len(self.memory) < self.train_start:
            return

        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        states = np.asarray([e[0] for e in mini_batch])
        states = np.reshape(states, [batch_size, self.state_size])
        actions = np.asarray([e[1] for e in mini_batch])
        rewards = np.asarray([e[2] for e in mini_batch])
        next_states = np.asarray([e[3] for e in mini_batch])
        next_states = np.reshape(next_states, [batch_size, self.state_size])
        done = np.asarray([e[4] for e in mini_batch])
        target = np.zeros_like(actions)

        # Bellman targets from the target networks: y = r + gamma * Q'(s', mu'(s')).
        target_q_values = self.critic.target_model.predict([next_states, self.actor.target_model.predict(next_states)])
        for i in range(len(mini_batch)):
            if done[i]:
                target[i] = rewards[i]
            else:
                target[i] = rewards[i] + self.gamma*target_q_values[i]

        loss = self.critic.model.train_on_batch([states, actions], target)
        # Actor update: ascend Q(s, mu(s)) via dQ/da from the critic, then soft-update both target nets.
        action_for_grad = self.actor.model.predict(states)
        q_grads = self.critic.gradients(states, action_for_grad)
        self.actor.train(states, q_grads)
        self.actor.update()
        self.critic.update()


env = gym.make('Pendulum-v0')

state_size = env.observation_space.shape[0]
action_size = 1
action_bound = env.action_space.high
agent = DDPG(sess, state_size, action_size, action_bound)


scores, episodes = [], []

for e in range(EPISODES):
    done = False
    score = 0
    state = env.reset()
    state = np.reshape(state, [1,state_size])
    step = 0

    while not done:

        action = agent.get_action(state)
        #print(action)
        next_state, reward, done, _ = env.step([action])
        next_state = np.reshape(next_state,[1,state_size])

        score += reward[0]
        agent.append(state, action, reward, next_state, done)
        state = next_state

        step += 1
        if step % 20 == 0:
            agent.train()

        if done:

            scores.append(score)
            episodes.append(e)
            pylab.plot(episodes, scores, 'b')
            pylab.savefig("./DDPG_Pendulum.png")
            print("episode:", e, " score:", score, " epsilon:", agent.epsilon)

            #if np.mean(scores[-min(10, len(scores)) :]) > -120 :
                #sys.exit()


    if e % 50 == 0:
        agent.save()

I always get a return of around -1450 per episode.
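As far as I can tell, a purely random policy on Pendulum-v0 also scores roughly in this range, so the agent does not seem to be improving at all. A quick baseline check would be something like this (same classic gym API as the code above):

# Random-policy baseline on Pendulum-v0, to compare against the ~-1450 scores above.
import gym
import numpy as np

env = gym.make('Pendulum-v0')
returns = []
for _ in range(10):
    state = env.reset()
    done, total = False, 0.0
    while not done:
        action = env.action_space.sample()             # uniform random torque
        state, reward, done, _ = env.step(action)
        total += float(np.asarray(reward).ravel()[0])  # reward may come back as an array
    returns.append(total)
print("random policy, mean return over 10 episodes:", np.mean(returns))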

...