I am trying to balance an inverted pendulum using the DDPG algorithm. I cannot get the expected result: with the trained actor's weights, the pendulum should balance itself in the upright position and stay there for at least 3 seconds (dt = 0.01, i.e. 300 time steps), starting from a randomly initialized state.
I used OpenAI Gym to set up my environment and to define the dynamical equation and the reward function. As mentioned above, I use the DDPG algorithm with a sequential replay memory and an Ornstein-Uhlenbeck random process for exploration.
import gym
import numpy as np
from gym import error, spaces, utils
from gym.utils import seeding
from os import path
import random


class InvPendulumEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 30
    }

    def __init__(self):
        self.max_theta = np.pi / 8   # rad
        self.max_thetadot = 0.5      # rad/sec
        self.max_torque = 300        # N-m
        self.dt = 0.01
        self.viewer = None
        bounds = np.array([self.max_theta, self.max_thetadot])
        self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(low=-bounds, high=bounds, dtype=np.float32)
        self.seed()

    def seed(self, seed=None):
        # Store the seeded RNG instead of discarding it
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
    def step(self, tor):
        th, thdot = self.state
        tor_prev = self.action  # action applied at time t-1
        g = 9.8          # acceleration due to gravity
        m = 65           # mass
        l = 1.1          # length
        dt = self.dt     # time step
        a = 0.83         # filtering factor
        b = 0.8          # damping constant
        k = 8            # stiffness constant
        c = np.sqrt(40)  # noise amplitude
        rmax = 1
        # Torque applied by the controller, with additive white Gaussian noise
        tor_con = np.clip(tor, -self.max_torque, self.max_torque)[0] + c * np.random.normal(0, 1, 1)[0]
        # Torque at time t, low-pass filtered against the previous action
        tor_t = a * tor_con + (1 - a) * tor_prev
        I = m * (l ** 2)  # moment of inertia
        # Dynamical equation, integrated by the Euler method
        newthdot = thdot + (tor_t + m * g * l * np.sin(th) - b * thdot - k * thdot) / I * dt
        newth = th + newthdot * dt
        # Clipping the value of the angular velocity
        newthdot = np.clip(newthdot, -self.max_thetadot, self.max_thetadot)
        self.state = np.array([newth, newthdot])
        self.action = tor_t
        done = bool(newth > self.max_theta or newth < -self.max_theta)
        reward = rmax * np.exp(-(newth / (self.max_theta / 5)) ** 2 - (newthdot / (self.max_thetadot / 5)) ** 2)
        return self.state, reward, done, {}
    def reset(self):
        init_th = ((random.random() - 0.5) * 2) * 5            # initial angle in [-5, 5] degrees
        init_thr = init_th * np.pi / 180                       # converted to radians
        init_thdotr = ((random.random() - 0.5) * 2) * 0.0625   # initial angular velocity in [-0.0625, 0.0625] rad/s
        self.state = np.array([init_thr, init_thdotr])
        self.action = 0
        return self.state
    def render(self, mode='human'):
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(500, 500)
            self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2)
            surface = rendering.Line(start=(-1.2, -0.05), end=(1.2, -0.05))
            self.viewer.add_geom(surface)
            bob = rendering.make_circle(0.15, filled=True)
            bob.set_color(.8, .3, .2)
            attributes = rendering.Transform(translation=(0.0, 1.0))
            bob.add_attr(attributes)
            rod = rendering.FilledPolygon([(-0.025, 0), (-0.025, 1.0 - 0.15), (0.025, 1.0 - 0.15), (0.025, 0)])
            rod.set_color(0.2, 0.2, 0.7)
            pendulum = rendering.Compound([bob, rod])
            pendulum.set_color(0.4, 0.5, 1)
            translate = rendering.Transform(translation=(0.0, -0.05))
            pendulum.add_attr(translate)
            self.pole_transform = rendering.Transform()
            pendulum.add_attr(self.pole_transform)
            self.viewer.add_geom(pendulum)
            axle_fill = rendering.make_circle(radius=.1, res=30, filled=True)
            axle_fill.set_color(1, 1, 1)
            axle = rendering.make_circle(radius=0.1, res=30, filled=False)
            semi = rendering.Transform(translation=(0.0, -0.05))
            axle_fill.add_attr(semi)
            axle.add_attr(semi)
            axle.set_color(0, 0, 0)
            self.viewer.add_geom(axle_fill)
            self.viewer.add_geom(axle)
            pivot = rendering.make_circle(0.02, filled=True)
            self.viewer.add_geom(pivot)
            hide = rendering.FilledPolygon([(-2.2, -0.07), (-2.2, -2.2), (2.2, -2.2), (2.2, -0.07)])
            hide.set_color(1, 1, 1)
            self.viewer.add_geom(hide)
            fname = path.join(path.dirname(__file__), "clockwise.png")
            self.img = rendering.Image(fname, 0.5, 0.5)
            self.imgtrans = rendering.Transform()
            self.img.add_attr(self.imgtrans)

        self.viewer.add_onetime(self.img)
        self.pole_transform.set_rotation(self.state[0])
        if self.action != 0:
            self.imgtrans.scale = (-self.action / 8, np.abs(self.action) / 8)
        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

    def close(self):
        if self.viewer:
            self.viewer.close()
            self.viewer = None
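One way to sanity-check the environment in isolation is to drive it with random torques and see how long an episode lasts. This is a minimal sketch (assuming the class above is saved as Inv_pendulum.py, matching the import in the agent script below):

# Minimal sanity check: drive the environment with random torques and
# report how long the episode lasts under the +/- pi/8 termination rule.
from Inv_pendulum import InvPendulumEnv

env = InvPendulumEnv()
obs = env.reset()
for t in range(300):  # 300 steps * dt = 0.01 s -> the 3-second target
    action = env.action_space.sample()  # random torque in [-300, 300] N-m
    obs, reward, done, _ = env.step(action)
    if done:
        print("Pendulum fell after", (t + 1) * env.dt, "seconds")
        break
else:
    print("Episode survived the full 3 seconds")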
Below is my code for the DDPG agent:
import numpy as np
import gym
import h5py
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate, ELU
from keras.optimizers import Adam
from keras import backend as K
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess
from Inv_pendulum import InvPendulumEnv
env = InvPendulumEnv()
#ENV_NAME = 'Inverted_Pendulum-v0'
# Get the environment and extract the number of actions.
#env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]
# Next, we build a very simple model.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(16, activation="relu"))
actor.add(Dense(16, activation="relu"))
actor.add(Dense(16, activation="relu"))
actor.add(Dense(nb_actions, activation="linear"))
print(actor.summary())
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape,
                          name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())
# Finally, we configure and compile our agent. You can use every built-in
# Keras optimizer and even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['rmse'])
# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=60000, visualize=True, verbose=1, nb_max_episode_steps=300)
# After training is done, we save the final weights.
#agent.save_weights('ddpg_weights.hdf5', overwrite=True)
# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=300)
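To measure the 3-second criterion directly rather than only watching the visualization, the trained actor can also be rolled out by hand. This is a minimal sketch under my assumptions (with window_length=1 the actor expects a batch of shape (1, 1, 2)); it counts how many steps pass before the done flag fires:

# Roll out the trained actor greedily (no exploration noise) and measure
# how long the pendulum stays within the +/- pi/8 bounds.
obs = env.reset()
for t in range(300):  # 300 steps * dt = 0.01 s -> the 3-second target
    action = actor.predict(obs.reshape(1, 1, -1))[0]  # batch of one single-step window
    obs, reward, done, _ = env.step(action)
    if done:
        print("Fell after", (t + 1) * env.dt, "seconds")
        break
else:
    print("Balanced for the full 3 seconds")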
I expect that with the trained actor's weights the pendulum will balance itself in the upright position and stay there for at least 3 seconds (dt = 0.01), starting from a randomly initialized state.
But in my case the episode ends before 3 seconds have passed, i.e. the agent dies: the pendulum falls outside the range from -pi/8 to pi/8.
How can I improve my algorithm or tune its parameters so that I get the desired result?
Should I change my network architecture for the actor, and if so, what would be a better one?