我正在学习强化学习。 此代码会引起一些问题。 当运行train()函数时,其策略和值函数输出均为Nan。 这是我的运行简单环境的代码Cartpole。
这是导入设置。
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
这是超级参数。
learning_rate = 0.0005
gamma = 0.98
lmbda = 0.95
K = 3
epsilon = 0.1
使用PPO和LSTM的代理代码。
class PPO_lstm(nn.Module):
def __init__(self):
super(PPO_lstm, self).__init__()
self.log_list = []
self.fc1 = nn.Linear(4, 32) # state encoder, value and policy function use it
self.lstm = nn.LSTM(32, 64) # LSTM, value and policy function use it
self.fc2_pi = nn.Linear(64, 2) # policy function
self.fc2_v = nn.Linear(64, 1) # value function
self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
def pi(self, x, hidden_state): # get policy
x = F.relu(self.fc1(x)) # x size : [32] or [batch, 32]
x = x.view(1, -1, 32) # x size : [1, batch, 32]
x, new_hidden = self.lstm(x, hidden_state) # x size : [1, batch, 64]
x = x.squeeze(0) # x size : [batch, 64]
pi = F.log_softmax(self.fc2_pi(x), dim=1)
return pi, new_hidden # pi size : [batch, 2], new_hidden size : [1, batch, 64]
def v(self, x, hidden_state): # get state value
x = F.relu(self.fc1(x)) # x size : [32] or [batch, 32]
x = x.view(1, -1, 32) # x size : [1, batch, 32]
x, new_hidden = self.lstm(x, hidden_state) # x size : [1, batch, 64]
x = x.squeeze(0) # x size : [batch, 64]
v = self.fc2_v(x) # v size : [batch, 1]
return v # v size : [batch, 1]
def add_log(self, log): # add transition which used train function
self.log_list.append(log) # s, a, r, s', action_prob, done, hidden(h, c), hidden_prime(h, c)
def make_batch(self): # make batch file using log_list for training
s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst, h_h_lst, h_c_lst, h_prime_h_lst, h_prime_c_lst = [], [], [], [], [], [], [], [], [], []
for i in self.log_list:
s, a, r, s_prime, done, action_prob, h, h_prime = i
s_lst.append(s) # s_lst size : [batch, 4]
a_lst.append([a]) # a_lst size : [batch, 1]
r_lst.append([r]) # r_lst size : [batch, 1]
s_prime_lst.append(s_prime) # s_prime_lst size : [batch, 4]
prob_a_lst.append([action_prob]) # prob_a_lst size : [batch, 1]
done_lst.append([0 if done else 1]) # done_lst size : [batch, 1]
h_h, h_c = h # hidden_h, hidden_c size : [1, 1, 64]
h_h_lst.append(h_h.detach().numpy()) # hidden1_c_lst size : [batch] inside : [1, 1, 64]
h_c_lst.append(h_c.detach().numpy()) # hidden1_h_lst size : [batch] inside : [1, 1, 64]
h_prime_h, h_prime_c = h_prime
h_prime_h_lst.append(h_prime_h.detach().numpy()) # hidden2_c_lst size : [batch] inside : [1, 1, 64]
h_prime_c_lst.append(h_prime_c.detach().numpy()) # hidden2_h_lst size : [batch] inside : [1, 1, 64]
s_lst = torch.tensor(s_lst, dtype=torch.float)
a_lst = torch.tensor(a_lst)
r_lst = torch.tensor(r_lst)
s_prime_lst = torch.tensor(s_prime_lst, dtype=torch.float)
prob_a_lst = torch.tensor(prob_a_lst, dtype=torch.float)
done_lst = torch.tensor(done_lst)
h_h_lst = torch.tensor(h_h_lst, dtype=torch.float).squeeze().unsqueeze(0)
h_c_lst = torch.tensor(h_c_lst, dtype=torch.float).squeeze().unsqueeze(0)
h_prime_h_lst = torch.tensor(h_prime_h_lst, dtype=torch.float).squeeze().unsqueeze(0)
h_prime_c_lst = torch.tensor(h_prime_c_lst, dtype=torch.float).squeeze().unsqueeze(0) # size : [1, batch, 64]
self.log_list = []
return s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst, h_h_lst, h_c_lst, h_prime_h_lst, h_prime_c_lst
使用批处理(s,a,r,s',完成,隐藏,隐藏')的训练功能
def train(self):
s, a, r, s_prime, prob_a, done_mask, h_h, h_c, h_prime_h, h_prime_c = self.make_batch()
h = (h_h.detach(), h_c.detach()) # h size : ([1, batch, 64], [1, batch, 64])
h_prime = (h_prime_h.detach(), h_prime_c.detach()) # h_prime size : ([1, batch, 64], [1, batch, 64])
for k in range(K): # PPO Algo. K iteration
td_target = r + gamma * self.v(s_prime, h_prime) * done_mask # td_target size : [batch, 1]
delta = td_target - self.v(s, h) # 기존 Advantage : Q(s, a) - V(s) == delta : r + gamma * V(s') - V(s)
delta = delta.detach().numpy()
advantage_lst = [] # GAE
advantage = 0.0
for delta_t in delta[::-1]:
advantage = gamma * lmbda * advantage + delta_t[0]
advantage_lst.append([advantage])
advantage_lst.reverse() # advantage_lst size : [batch, 1]
advantage = torch.tensor(advantage_lst, dtype=torch.float)
curr_pi, _ = self.pi(s, h) # curr_pi size : [batch, 2]
curr_pi_a = curr_pi.gather(1, a) # curr_pi_a size : [batch, 2].gather(1, [batch, 1]) = [batch, 1]
past_pi = prob_a # past_pi size : [batch, 1]
ratio = torch.exp(torch.log(curr_pi_a) - torch.log(past_pi)) # a/b == exp(log(a)-log(b))
# ratio size : [batch, 1]
surr_1 = ratio * advantage
surr_2 = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantage
loss = -torch.min(surr_1, surr_2) + F.smooth_l1_loss(self.v(s, h), td_target.detach())
# actor loss + critic loss
self.optimizer.zero_grad()
loss.mean().backward()
self.optimizer.step()
它是运行openAI Gym卡特彼勒环境的主要功能。
def main():
env = gym.make('CartPole-v1')
agent = PPO_lstm()
score = 0.0
print_interval = 20
for episode in range(10000):
hidden = (torch.zeros([1, 1, 64], dtype=torch.float), torch.zeros([1, 1, 64], dtype=torch.float)) # lstm init hidden state
s = env.reset()
done = False
while not done:
for step in range(20):
pi, hidden_prime = agent.pi(torch.from_numpy(s).float(), hidden) # pi size : [1, 2]
pi = pi.squeeze() # pi size : [2]
m = Categorical(pi)
action = m.sample().item()
s_prime, reward, done, info = env.step(action)
agent.add_log((s, action, reward / 100.0, s_prime, pi[action].item(), done, hidden, hidden_prime))
# hidden dim : ([1, 1, 64], [1, 1, 64])
hidden = hidden_prime
s = s_prime
score += reward
if done:
break
agent.train()
if episode % print_interval == 0 and episode != 0:
print("episode {}'s avg score : {}".format(episode, score/print_interval))
score = 0.0
env.close()
if __name__ == '__main__':
main()
当我在不使用LSTM(使用MLP策略和值函数)的情况下尝试此代码时,它运行良好。 您能告诉我使用LSTM或隐藏状态有什么问题吗?