cinbol 2022-06-17 01:14 · Acceptance rate: 50%
52 views
Closed

A2C model training does not converge

Question: My Advantage Actor-Critic (PyTorch + gym) model simply will not converge during training, no matter what I try. Can anyone take a look and tell me why?

Source code (copy-paste and it runs directly):

# -*- coding: utf-8 -*-
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from torch.distributions import Categorical

# Discount factor
GAMMA = 0.99
# Learning rates for the actor and the critic
LR_a = 0.001
LR_c = 0.01
# Total number of episodes
EPISODE = 800
# Maximum steps per episode, to avoid running forever
STEP = 3000
# Average over 10 test runs
TEST = 10


# Shared network layer
class share_layer(nn.Module):
    def __init__(self):
        super(share_layer, self).__init__()
        self.linear1 = nn.Linear(4, 32)  # 4 is the state dimension
        nn.init.normal_(self.linear1.weight, 0, 0.1)
        nn.init.constant_(self.linear1.bias, 0.1)

    def forward(self, out):
        out = self.linear1(out)
        out = F.relu(out)
        return out


class PGNetwork(nn.Module):
    def __init__(self, sl):
        super(PGNetwork, self).__init__()
        self.sl = sl
        # Policy network head, outputs action probabilities
        self.fc2 = nn.Linear(32, 2)
        # Initialize weights from a normal distribution
        nn.init.normal_(self.fc2.weight, 0, 0.1)
        # Initialize biases to the constant 0.1
        nn.init.constant_(self.fc2.bias, 0.1)

    def forward(self, state):
        x = self.sl(state)
        x = self.fc2(x)
        action_p = F.softmax(x, dim=1)
        return action_p


class Actor(object):
    def __init__(self, env, sl):
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.network = PGNetwork(sl)
        # Adam optimizer
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR_a)

    def choose_action(self, observation):
        """选择动作输出,返回动作以及动作对应概率取对数"""
        # 调整张量shape
        observation = torch.from_numpy(observation).float().unsqueeze(0)
        # 策略网络预测
        probs = self.network(observation)
        # 随机选择动作
        m = Categorical(probs)
        action = m.sample()
        # 计算动作对应概率的对数值
        log_prob = torch.log(probs.squeeze().gather(0, action))
        return action.item(), log_prob

    def learn(self, log_prob, td_error):
        # Should there be a minus sign here?
        loss_a = -log_prob * td_error

        self.optimizer.zero_grad()
        loss_a.backward()
        self.optimizer.step()


class QNetwork(nn.Module):
    def __init__(self, sl):
        super(QNetwork, self).__init__()
        self.sl = sl
        # Value network head, outputs a 1x1 state value
        self.fc2 = nn.Linear(32, 1)
        nn.init.normal_(self.fc2.weight, 0, 0.1)
        nn.init.constant_(self.fc2.bias, 0.1)

    def forward(self, state):
        x = self.sl(state)
        value = self.fc2(x)
        return value


class Critic(object):
    def __init__(self, env, sl):
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.network = QNetwork(sl)
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR_c)
        self.loss_func = nn.MSELoss()

    def train(self, state, reward, next_state):
        """训练critic,返回值td_error用于策略网络进化"""
        s, s_ = torch.from_numpy(state).float(), torch.from_numpy(next_state).float()
        v = self.network(s)
        v_ = self.network(s_)
        loss_q = self.loss_func(v, reward + GAMMA * v_)

        self.optimizer.zero_grad()
        loss_q.backward()
        self.optimizer.step()

        with torch.no_grad():
            # Compute td_error
            td_error = reward + v_ - v

        return td_error


def main():
    env = gym.make("CartPole-v1")
    sl = share_layer()
    actor = Actor(env, sl)
    critic = Critic(env, sl)

    for episode in range(EPISODE):
        state = env.reset()

        for step in range(STEP):
            action, log_prob = actor.choose_action(state)
            next_state, reward, done, _ = env.step(action)

            # --------- The original reward makes convergence hard, so the reward is reshaped here; comment this block out if you think it is unnecessary ---------
            x, x_dot, theta, theta_dot = next_state
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            # ----------------------------------------------------------------------------------

            td_error = critic.train(state, reward, next_state)
            actor.learn(log_prob, td_error)
            state = next_state

            if done:
                break
        # Evaluate every 50 episodes
        if episode % 50 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    # env.render()  # render the environment
                    action, _ = actor.choose_action(state)
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print("episode:", episode, ";Evaluation Average Reward:", ave_reward)


if __name__ == "__main__":
    time_start = time.time()
    main()
    time_end = time.time()
    print("Total time is ", time_end - time_start, 's')

1 answer

  • 白驹_过隙 (Rising Star Creator in Algorithms) 2022-06-17 08:18

    Try lowering the learning rates a bit and see.
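
    For reference, a minimal sketch of what that suggestion could look like, replacing the two constants near the top of the script above. The specific values below are illustrative assumptions, not tuned results:

    # Illustrative values only, not tuned: smaller learning rates, especially for the
    # critic, are a common first thing to try when actor-critic training diverges.
    LR_a = 0.0001  # actor learning rate, was 0.001
    LR_c = 0.001   # critic learning rate, was 0.01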

    This answer was accepted by the asker as the best answer.

Question timeline

  • Closed by the system on June 25
  • Answer accepted on June 17
  • Question created on June 17
