Difficulty balancing a pendulum with a deep reinforcement learning algorithm

Date: 2019-04-23 10:29:27

Tags: keras deep-learning reinforcement-learning openai-gym

I am trying to balance an inverted pendulum with the DDPG algorithm. I am not getting the expected result, namely that the weights of the trained actor keep the pendulum balanced around the vertical position for at least 3 seconds (dt = 0.01), starting from a randomly initialized state.

To do this, I set up the environment with OpenAI Gym and defined the dynamical equations and the reward function. As mentioned above, I use the DDPG algorithm with sequential memory and an Ornstein-Uhlenbeck random process.

import gym
import numpy as np
from gym import error, spaces, utils
from gym.utils import seeding
from os import path
import random


class InvPendulumEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 30
    }


    def __init__(self):
        self.max_theta = np.pi / 8  # rad
        self.max_thetadot = 0.5     # rad/sec
        self.max_torque = 300       # N-m
        self.dt = 0.01
        self.viewer = None

        bounds = np.array([self.max_theta, self.max_thetadot])

        self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1,), dtype=np.float32)

        self.observation_space = spaces.Box(low=-bounds,high=bounds, dtype=np.float32)
        self.seed()

    def seed(self, seed=None):
        _, seed = seeding.np_random(seed)
        return [seed]

    def step(self, tor):

        #print(tor, "Action provided for the next timestep")

        th, thdot = self.state
        #print("Theta", "Thetadot", th, thdot,'\n')

        tor_prev = self.action      # Action at time t-1
        #print("previous timestep torque", tor_prev)

        g = 9.8             # acceleration due to gravity
        m = 65              # Mass
        l = 1.1             # length
        dt = self.dt        # Time step
        a = 0.83            # Filtering factor
        b = 0.8             # damping constant
        k = 8               # stiffness constant
        c = np.sqrt(40)     # noise amplitude
        rmax = 1


        tor_con = np.clip(tor, -self.max_torque, self.max_torque)[0] + c*np.random.normal(0, 1, 1)[0]
        # Torque applied by the controller with additive white Gaussian noise
        #print(tor_con, "torque by controller \n")

        tor_t = a * tor_con + (1 - a)*tor_prev
        # Torque at time t with filtering

        #print(tor_t, "torque at time t\n")

        I = m * (l ** 2)
        # Moment of Inertia

        newthdot = thdot + (tor_t + m * g * l * np.sin(th) - b * thdot - k * thdot) / I * dt
        # dynamical equation solved by euler method
        #print(newthdot, "newthetadot")

        newth = th + newthdot * dt

        newthdot = np.clip(newthdot, -self.max_thetadot, self.max_thetadot)
        # Clipping the value of angular velocity
        #print("New thetadot and theta", newthdot, newth)

        self.state = np.array([newth, newthdot])

        self.action = tor_t

        done = bool(newth > np.pi/8 or newth < -np.pi/8)

        reward = rmax*np.exp(-(newth/(self.max_theta/5))**2 - (newthdot/(self.max_thetadot/5))**2)

        return self.state, reward, done, {}

    def reset(self):
        init_th = ((random.random() - 0.5) * 2) * 5
        init_thr = init_th * np.pi / 180
        init_thdotr = ((random.random() - 0.5) * 2) * 0.0625
        self.state = np.array([init_thr, init_thdotr])
        #print(self.state, "Initial State")
        self.action = 0
        return self.state

    def render(self, mode='human'):

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(500, 500)
            self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2)

            surface = rendering.Line(start=(-1.2, -0.05), end=(1.2, -0.05))

            self.viewer.add_geom(surface)

            bob = rendering.make_circle(0.15, filled=True)
            bob.set_color(.8, .3, .2)
            attributes = rendering.Transform(translation=(0.0, 1.0))
            bob.add_attr(attributes)

            rod = rendering.FilledPolygon([(-0.025, 0), (-0.025, 1.0 - 0.15), (0.025, 1.0 - 0.15), (0.025, 0)])
            rod.set_color(0.2, 0.2, 0.7)

            pendulum = rendering.Compound([bob, rod])
            pendulum.set_color(0.4, 0.5, 1)
            translate = rendering.Transform(translation=(0.0, -0.05))
            pendulum.add_attr(translate)
            self.pole_transform = rendering.Transform()
            pendulum.add_attr(self.pole_transform)
            self.viewer.add_geom(pendulum)

            axle_fill = rendering.make_circle(radius=.1, res=30, filled=True)
            axle_fill.set_color(1, 1, 1)

            axle = rendering.make_circle(radius=0.1, res=30, filled=False)
            semi = rendering.Transform(translation=(0.0, -0.05))
            axle_fill.add_attr(semi)
            axle.add_attr(semi)
            axle.set_color(0, 0, 0)

            self.viewer.add_geom(axle_fill)
            self.viewer.add_geom(axle)

            pivot = rendering.make_circle(0.02, filled=True)
            self.viewer.add_geom(pivot)

            hide = rendering.FilledPolygon([(-2.2, -0.07), (-2.2, -2.2), (2.2, -2.2), (2.2, -0.07)])
            hide.set_color(1, 1, 1)
            self.viewer.add_geom(hide)

            fname = path.join(path.dirname(__file__), "clockwise.png")
            self.img = rendering.Image(fname, 0.5, 0.5)
            self.imgtrans = rendering.Transform()
            self.img.add_attr(self.imgtrans)

        self.viewer.add_onetime(self.img)
        self.pole_transform.set_rotation(self.state[0])
        if self.action != 0:
            self.imgtrans.scale = (-self.action / 8, np.abs(self.action) / 8)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

    def close(self):
        if self.viewer:
            self.viewer.close()
            self.viewer = None
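
For reference, the environment can be sanity-checked on its own with a short random-action rollout before any training. The snippet below is only a sketch (it assumes the class above is saved as Inv_pendulum.py, matching the import used in the agent code further down); it reports how long a random policy keeps theta inside the +/- pi/8 termination band:

import numpy as np

from Inv_pendulum import InvPendulumEnv

env = InvPendulumEnv()
obs = env.reset()
total_reward = 0.0

# Step the environment with uniformly random torques until the episode
# terminates or 300 steps (3 seconds at dt = 0.01) have elapsed.
for t in range(300):
    action = env.action_space.sample()
    obs, reward, done, _ = env.step(action)
    total_reward += reward
    if done:
        break

print("random policy survived %d steps (%.2f s), total reward %.3f"
      % (t + 1, (t + 1) * env.dt, total_reward))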

Below is the code for my DDPG agent:

import numpy as np
import gym
import h5py

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate, ELU
from keras.optimizers import Adam
from keras import backend as K

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

from Inv_pendulum import InvPendulumEnv

env = InvPendulumEnv()

#ENV_NAME = 'Inverted_Pendulum-v0'


# Get the environment and extract the number of actions.
#env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]

# Next, we build a very simple model.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(16, activation="relu"))
actor.add(Dense(16, activation="relu"))
actor.add(Dense(16, activation="relu"))
actor.add(Dense(nb_actions, activation="linear"))
print(actor.summary())


action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in
# Keras optimizer and even the metrics!

memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
              memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
              random_process=random_process, gamma=.99, target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['rmse'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=60000, visualize=True, verbose=1, nb_max_episode_steps=300)

# After training is done, we save the final weights.
#agent.save_weights('ddpg_weights.hdf5', overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=300)

I expect that, with the weights of the trained actor, the pendulum stays balanced around the vertical position for at least 3 seconds (300 steps at dt = 0.01), starting from a randomly initialized state.

But in my case the episode ends well before 3 seconds, i.e. the agent "dies": the pendulum leaves the range -pi/8 to pi/8.
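
To make this failure concrete, the trained actor can be rolled out greedily (without the Ornstein-Uhlenbeck exploration noise) right after agent.fit; the sketch below assumes the env and actor objects from the agent script above and simply counts surviving steps:

# Greedy rollout of the trained actor (no exploration noise), counting how
# many steps the pendulum stays inside the +/- pi/8 band.
obs = env.reset()
for t in range(300):
    # keras-rl feeds the actor observations of shape (batch, window_length, obs_dim),
    # here (1, 1, 2), so the raw observation is reshaped accordingly.
    action = actor.predict(obs.reshape(1, 1, -1))[0]
    obs, reward, done, _ = env.step(action)
    if done:
        break

print("survived %d steps = %.2f s (target: 300 steps = 3 s)" % (t + 1, (t + 1) * env.dt))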

How can I improve the algorithm or tune the hyperparameters so that I get the desired result?

If so, should I change my network architecture for the actor?

0 Answers:

No answers yet.