Difficulty balancing a pendulum using a deep learning algorithm

I am trying to balance an inverted pendulum using the DDPG algorithm, but I cannot get the expected results: with the trained actor's weights, the pendulum should balance itself in the upright position and stay there for at least 3 seconds (dt = 0.01, i.e. 300 time steps), starting from a randomly initialized state.

To set this up, I used OpenAI Gym to build my environment and to define the dynamics equation and the reward function. As mentioned above, I use the DDPG algorithm with a sequential (replay) memory and an Ornstein-Uhlenbeck random process for exploration.

import gym
import numpy as np
from gym import error, spaces, utils
from gym.utils import seeding
from os import path
import random


class InvPendulumEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 30
    }


    def __init__(self):
        self.max_theta = np.pi / 8  # rad
        self.max_thetadot = 0.5     # rad/sec
        self.max_torque = 300       # N-m
        self.dt = 0.01
        self.viewer = None

        bounds = np.array([self.max_theta, self.max_thetadot])

        self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1,), dtype=np.float32)

        self.observation_space = spaces.Box(low=-bounds,high=bounds, dtype=np.float32)
        self.seed()

    def seed(self, seed=None):
        _, seed = seeding.np_random(seed)
        return [seed]

    def step(self, tor):

        #print(tor, "Action provided for the next timestep")

        th, thdot = self.state
        #print("Theta", "Thetadot", th, thdot,'\n')

        tor_prev = self.action      # Action at time t-1
        #print("previous timestep torque", tor_prev)

        g = 9.8             # acceleration due to gravity
        m = 65              # Mass
        l = 1.1             # length
        dt = self.dt        # Time step
        a = 0.83            # Filtering factor
        b = 0.8             # damping constant
        k = 8               # stiffness constant
        c = np.sqrt(40)     # noise amplitude
        rmax = 1


        tor_con = np.clip(tor, -self.max_torque, self.max_torque)[0] + c * np.random.normal(0, 1, 1)[0]
        # Torque applied by the controller with additive white Gaussian noise
        #print(tor_con, "torque by controller \n")

        tor_t = a * tor_con + (1 - a)*tor_prev
        # Torque at time t with filtering

        #print(tor_t, "torque at time t\n")

        I = m * (l ** 2)
        # Moment of Inertia

        newthdot = thdot + (tor_t + m * g * l * np.sin(th) - b * thdot - k * thdot) / I * dt
        # dynamical equation solved by euler method
        #print(newthdot, "newthetadot")

        newth = th + newthdot * dt

        newthdot = np.clip(newthdot, -self.max_thetadot, self.max_thetadot)
        # Clipping the value of angular velocity
        #print("New thetadot and theta", newthdot, newth)

        self.state = np.array([newth, newthdot])

        self.action = tor_t

        done = bool(abs(newth) > self.max_theta)    # episode terminates once the pendulum leaves [-pi/8, pi/8]

        reward = rmax*np.exp(-(newth/(self.max_theta/5))**2 - (newthdot/(self.max_thetadot/5))**2)

        return self.state, reward, done, {}

    def reset(self):
        init_th = ((random.random() - 0.5) * 2) * 5           # initial angle, uniform in [-5, 5] degrees
        init_thr = init_th * np.pi / 180                      # converted to radians
        init_thdotr = ((random.random() - 0.5) * 2) * 0.0625  # initial angular velocity, uniform in [-0.0625, 0.0625] rad/s
        self.state = np.array([init_thr, init_thdotr])
        #print(self.state, "Initial State")
        self.action = 0
        return self.state

    def render(self, mode='human'):

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(500, 500)
            self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2)

            surface = rendering.Line(start=(-1.2, -0.05), end=(1.2, -0.05))

            self.viewer.add_geom(surface)

            bob = rendering.make_circle(0.15, filled=True)
            bob.set_color(.8, .3, .2)
            attributes = rendering.Transform(translation=(0.0, 1.0))
            bob.add_attr(attributes)

            rod = rendering.FilledPolygon([(-0.025, 0), (-0.025, 1.0 - 0.15), (0.025, 1.0 - 0.15), (0.025, 0)])
            rod.set_color(0.2, 0.2, 0.7)

            pendulum = rendering.Compound([bob, rod])
            pendulum.set_color(0.4, 0.5, 1)
            translate = rendering.Transform(translation=(0.0, -0.05))
            pendulum.add_attr(translate)
            self.pole_transform = rendering.Transform()
            pendulum.add_attr(self.pole_transform)
            self.viewer.add_geom(pendulum)

            axle_fill = rendering.make_circle(radius=.1, res=30, filled=True)
            axle_fill.set_color(1, 1, 1)

            axle = rendering.make_circle(radius=0.1, res=30, filled=False)
            semi = rendering.Transform(translation=(0.0, -0.05))
            axle_fill.add_attr(semi)
            axle.add_attr(semi)
            axle.set_color(0, 0, 0)

            self.viewer.add_geom(axle_fill)
            self.viewer.add_geom(axle)

            pivot = rendering.make_circle(0.02, filled=True)
            self.viewer.add_geom(pivot)

            hide = rendering.FilledPolygon([(-2.2, -0.07), (-2.2, -2.2), (2.2, -2.2), (2.2, -0.07)])
            hide.set_color(1, 1, 1)
            self.viewer.add_geom(hide)

            fname = path.join(path.dirname(__file__), "clockwise.png")
            self.img = rendering.Image(fname, 0.5, 0.5)
            self.imgtrans = rendering.Transform()
            self.img.add_attr(self.imgtrans)

        self.viewer.add_onetime(self.img)
        self.pole_transform.set_rotation(self.state[0])
        if self.action != 0:
            self.imgtrans.scale = (-self.action / 8, np.abs(self.action) / 8)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

    def close(self):
        if self.viewer:
            self.viewer.close()
            self.viewer = None
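
Before training, it can help to sanity-check the environment with a random policy: under random torques the episode should terminate quickly, and the reward should only approach 1 near the upright state. The following rollout script is not part of the original post, just a minimal sketch that exercises the InvPendulumEnv class defined above (saved, as in the agent script, in Inv_pendulum.py).

import numpy as np
from Inv_pendulum import InvPendulumEnv

# Sanity check (sketch): roll the environment forward with random torques and
# report episode length and return, to verify dynamics, reward and termination.
env = InvPendulumEnv()
for episode in range(5):
    obs = env.reset()
    total_reward, steps, done = 0.0, 0, False
    while not done and steps < 300:            # 300 steps = 3 s at dt = 0.01
        action = env.action_space.sample()     # random torque in [-300, 300] N*m
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        steps += 1
    print("episode", episode, "steps", steps, "return", round(float(total_reward), 3))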

Below is my code for the DDPG agent.

import numpy as np
import gym
import h5py

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate, ELU
from keras.optimizers import Adam
from keras import backend as K

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

from Inv_pendulum import InvPendulumEnv

env = InvPendulumEnv()

#ENV_NAME = 'Inverted_Pendulum-v0'


# Get the environment and extract the number of actions.
#env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]

# Next, we build a very simple model.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(16, activation="relu"))
actor.add(Dense(16, activation="relu"))
actor.add(Dense(16, activation="relu"))
actor.add(Dense(nb_actions, activation="linear"))
print(actor.summary())


action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in
# Keras optimizer and even the metrics!

memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
              memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
              random_process=random_process, gamma=.99, target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['rmse'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=60000, visualize=True, verbose=1, nb_max_episode_steps=300)

# After training is done, we save the final weights.
#agent.save_weights('ddpg_weights.hdf5', overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=300)
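
To check the 3-second criterion (300 steps at dt = 0.01) more directly, one option is to run the trained actor on the environment by hand and record how long each episode lasts. This is only a sketch, not part of the original evaluation; it reuses the actor, env and np defined above and assumes the (batch, window_length=1, obs_dim) input shape that follows from the SequentialMemory / Flatten configuration.

# Hand-rolled evaluation of the trained actor (sketch, not from the original post).
# With window_length=1 the actor expects inputs of shape (batch, 1, 2).
episode_lengths = []
for episode in range(10):
    obs = env.reset()
    done, steps = False, 0
    while not done and steps < 300:                         # 300 steps = 3 s at dt = 0.01
        action = actor.predict(obs.reshape(1, 1, -1))[0]    # deterministic policy, no OU noise
        obs, reward, done, _ = env.step(action)
        steps += 1
    episode_lengths.append(steps)
print("mean episode length:", np.mean(episode_lengths), "out of 300 steps")

This duplicates what agent.test already does, but it makes the episode lengths easy to inspect or plot when comparing hyperparameter settings.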

With the trained actor's weights, I expect the pendulum to balance itself in the upright position and stay there for at least 3 seconds (dt = 0.01), starting from a randomly initialized state.

But in my case the episode ends before 3 seconds have passed, i.e. the agent dies: the pendulum falls outside the range from -pi/8 to pi/8.

How can I improve my algorithm or tune its parameters so that I get the desired result?

Should I change my network architecture for the actor, and if so, what would be a better one?
