Question: my Advantage Actor-Critic model (PyTorch + gym) simply will not converge during training, no matter how long I run it. Can anyone take a look and help me figure out why?
Source code (can be copy-pasted and run directly):
# -*- coding: utf-8 -*-
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from torch.distributions import Categorical

# discount factor
GAMMA = 0.99
# learning rates for the actor and the critic
LR_a = 0.001
LR_c = 0.01
# total number of training episodes
EPISODE = 800
# maximum steps per episode, so a single rollout cannot run forever
STEP = 3000
# evaluation runs 10 test episodes and averages the reward
TEST = 10


# feature layer shared by the actor and the critic (see note 1 after the listing)
class share_layer(nn.Module):
    def __init__(self):
        super(share_layer, self).__init__()
        self.linear1 = nn.Linear(4, 32)  # 4 is the dimensionality of the state
        # initialize weights from a normal distribution, biases to the constant 0.1
        nn.init.normal_(self.linear1.weight, 0, 0.1)
        nn.init.constant_(self.linear1.bias, 0.1)

    def forward(self, out):
        out = self.linear1(out)
        out = F.relu(out)
        return out

class PGNetwork(nn.Module):
    def __init__(self, sl):
        super(PGNetwork, self).__init__()
        self.sl = sl
        # policy head: outputs action probabilities
        self.fc2 = nn.Linear(32, 2)
        # initialize weights from a normal distribution
        nn.init.normal_(self.fc2.weight, 0, 0.1)
        # initialize biases to the constant 0.1
        nn.init.constant_(self.fc2.bias, 0.1)

    def forward(self, state):
        x = self.sl(state)
        x = self.fc2(x)
        action_p = F.softmax(x, dim=1)
        return action_p

class Actor(object):
    def __init__(self, env, sl):
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.network = PGNetwork(sl)
        # Adam optimizer
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR_a)

    def choose_action(self, observation):
        """Pick an action; return the action and the log-probability of that action."""
        # reshape the observation into a batch of size 1
        observation = torch.from_numpy(observation).float().unsqueeze(0)
        # forward pass through the policy network
        probs = self.network(observation)
        # sample an action from the predicted distribution
        m = Categorical(probs)
        action = m.sample()
        # log-probability of the sampled action (see note 2 after the listing)
        log_prob = torch.log(probs.squeeze().gather(0, action))
        return action.item(), log_prob

    def learn(self, log_prob, td_error):
        # is the minus sign needed here?
        loss_a = -log_prob * td_error
        self.optimizer.zero_grad()
        loss_a.backward()
        self.optimizer.step()

class QNetwork(nn.Module):
    def __init__(self, sl):
        super(QNetwork, self).__init__()
        self.sl = sl
        # value head: outputs a single state value (1x1)
        self.fc2 = nn.Linear(32, 1)
        nn.init.normal_(self.fc2.weight, 0, 0.1)
        nn.init.constant_(self.fc2.bias, 0.1)

    def forward(self, state):
        x = self.sl(state)
        value = self.fc2(x)
        return value

class Critic(object):
    def __init__(self, env, sl):
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.network = QNetwork(sl)
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR_c)
        self.loss_func = nn.MSELoss()

    def train(self, state, reward, next_state):
        """Train the critic; the returned td_error drives the policy-network update."""
        s, s_ = torch.from_numpy(state).float(), torch.from_numpy(next_state).float()
        v = self.network(s)
        v_ = self.network(s_)
        loss_q = self.loss_func(v, reward + GAMMA * v_)
        self.optimizer.zero_grad()
        loss_q.backward()
        self.optimizer.step()
        with torch.no_grad():
            # compute the TD error
            td_error = reward + v_ - v
        return td_error

def main():
    env = gym.make("CartPole-v1")
    sl = share_layer()
    actor = Actor(env, sl)
    critic = Critic(env, sl)
    for episode in range(EPISODE):
        state = env.reset()
        for step in range(STEP):
            action, log_prob = actor.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            # ----- the raw reward is hard to get to converge, so the reward is reshaped
            # ----- here (see note 3 after the listing); comment this block out if you
            # ----- think it is unnecessary
            x, x_dot, theta, theta_dot = next_state
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            # ---------------------------------------------------------------------------
            td_error = critic.train(state, reward, next_state)
            actor.learn(log_prob, td_error)
            state = next_state
            if done:
                break
        # evaluate the policy every 50 episodes
        if episode % 50 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    # env.render()  # visualize
                    action, _ = actor.choose_action(state)
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print("episode:", episode, ";Evaluation Average Reward:", ave_reward)

if __name__ == "__main__":
    time_start = time.time()
    main()
    time_end = time.time()
    print("Total time is ", time_end - time_start, 's')