¥YRQ¥ 2024-03-22 11:35

PPO reinforcement learning training does not converge

Training with the PPO algorithm does not converge.

[screenshot of the training output]

import os
import glob
import time
import random
from datetime import datetime

import torch
import numpy as np

import gym
from gym import spaces

from PPODemo import PPO
from OrbitalTransfer import *  # modified to use normalized inputs
from OrbitCore import *


class Environment(gym.Env):
    def __init__(self):
        self.min_action = -1
        self.max_action = 1
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(12,),
                                            dtype=float)  # chaser r, v and escaper r, v
        self.action_space = spaces.Box(low=self.min_action, high=self.max_action, shape=(3,),
                                       dtype=float)  # velocity-increment direction along x, y, z
        self.chase = None   # normalized orbital elements of the chaser
        self.escape = None  # normalized orbital elements of the escaper
        self.time = 1
        # torch.set_default_dtype(torch.float64)

    def reset(self):
        self._initialize_positions()           # elements normalized to [0, 1]
        observation = self._get_observation()  # normalized observation
        return observation

    def _initialize_positions(self):  # initialize the spacecraft states
        # seed the random number generator with the current timestamp
        random.seed(int(time.time()))
        # Initialization: draw a, e, i, omega, w, M0 in [0, 1];
        # a, i and omega are shared so the two orbits start out similar.
        a = random.random()
        i = random.random()
        omega = random.random()
        self.chase = torch.tensor([a, random.random(), i, omega, random.random(), random.random()])
        self.escape = torch.tensor([a, random.random(), i, omega, random.random(), random.random()])


    def step(self, action):  # takes a tensor action; returns observation, reward, done flag, info
        # scale and clamp the action so each velocity increment stays within +/-0.0001
        action = torch.clamp(action * 0.0001, min=-0.0001, max=0.0001)
        self.chase = self.update_state(self.chase, action)
        self.escape = self.update_state(self.escape, torch.tensor([0, 0, 0]))  # the escaper does not maneuver
        observation = self._get_observation()
        reward = self.reward()
        done = self.check_termination()
        return observation, reward, done, None

    def update_state(self, state, action):  # takes a normalized state, returns the propagated normalized state
        S = self.inverseTrans(state)
        Core = OrbitCore()
        Transfer = OrbitalTransfer()
        r, v = Core.Orbit_Element_2_State_rv(S)
        v = v + action
        Predict = OrbitPredict()
        rv = torch.cat((r.unsqueeze(0), v.unsqueeze(0)), dim=1).view(-1)

        # Alternative propagation (disabled):
        # r, v = Predict.J2OrbitRV(rv, 50)           # propagate for 50 s, returns r, v
        # coe = Core.State_rv_2_Orbit_Element(r, v)  # this function still has an issue and needs fixing
        # state = self.trans(coe)                    # normalize back to [0, 1]

        coe = Core.State_rv_2_Orbit_Element(r, v)
        if torch.isnan(coe[5]):
            print("error in main update_state State_rv_2_Orbit_Element")
        # e > 1 can occur here, hence the thrust limit of 0.0001

        coe = Predict.J2Orbit(coe, 50)  # propagate for 50 s
        if torch.isnan(coe[5]):
            print("error in main update_state J2Orbit")
        state = self.trans(coe)

        return state

    def _get_observation(self):
        observation = torch.cat((self.chase.unsqueeze(0),self.escape.unsqueeze(0)), dim=1)
        return observation

    def check_termination(self):
        terminate = False
        chase = self.inverseTrans(self.chase)    # convert normalized values back to orbital elements
        escape = self.inverseTrans(self.escape)
        distance = self.distance(chase,escape)
        if distance <= 0.5:
            terminate = True
        return terminate

    def trans(self, state):  # map orbital elements to roughly [0, 1]
        a = (state[0]-30000)/15000
        e = state[1]
        i = state[2]/180
        omega = state[3]/180
        w = state[4]/360
        M0 = state[5]/360
        return torch.tensor([a, e, i, omega, w, M0])

    def inverseTrans(self, state):  # map normalized values back to orbital elements
        a = state[0]*15000 + 30000
        e = state[1]
        i = state[2]*180
        omega = state[3]*180
        w = state[4]*360
        M0 = state[5]*360
        return torch.tensor([a,e,i,omega,w,M0], dtype=torch.float64)

    def distance(self,state1,state2):
        Core = OrbitCore()
        r1, v1 = Core.Orbit_Element_2_State_rv(state1)  # modified to support tensors; state is orbital elements
        r2, v2 = Core.Orbit_Element_2_State_rv(state2)
        distance = torch.norm(r1 - r2)
        return distance

    def reward(self):
        coe_chase = self.inverseTrans(self.chase)
        coe_escape = self.inverseTrans(self.escape)
        reward = -1 * self.distance(coe_chase,coe_escape)
        return reward

################################### Training ###################################
def train():
    # Predict = OrbitPredict()
    print("============================================================================================")

    ####### initialize environment hyperparameters ######
    env_name = "train-orbit-cw"

    has_continuous_action_space = True  # continuous action space; else discrete

    max_ep_len = 1000  # max timesteps in one episode
    max_training_timesteps = int(3e6)  # break training loop if timesteps > max_training_timesteps

    print_freq = max_ep_len * 10  # print avg reward in the interval (in num timesteps)
    log_freq = max_ep_len * 2  # log avg reward in the interval (in num timesteps)
    save_model_freq = int(1e5)  # save model frequency (in num timesteps)

    action_std = 0.6  # starting std for action distribution (Multivariate Normal)
    action_std_decay_rate = 0.05  # linearly decay action_std (action_std = action_std - action_std_decay_rate)
    min_action_std = 0.1  # minimum action_std (stop decay after action_std <= min_action_std)
    action_std_decay_freq = int(2.5e5)  # action_std decay frequency (in num timesteps)
    #####################################################

    ## Note : print/log frequencies should be greater than max_ep_len

    ################ PPO hyperparameters ################
    update_timestep = max_ep_len * 4  # update policy every n timesteps
    K_epochs = 80  # update policy for K epochs in one PPO update

    eps_clip = 0.2  # clip parameter for PPO
    gamma = 0.99  # discount factor

    lr_actor = 0.0003  # learning rate for actor network
    lr_critic = 0.001  # learning rate for critic network

    random_seed = 0  # set random seed if required (0 = no random seed)
    #####################################################

    print("training environment name : " + env_name)

    # env = gym.make(env_name)

    env = Environment()

    # state space dimension
    state_dim = env.observation_space.shape[0]

    # action space dimension
    if has_continuous_action_space:
        action_dim = env.action_space.shape[0]
    else:
        action_dim = env.action_space.n

    ###################### logging ######################

    #### log files for multiple runs are NOT overwritten
    log_dir = "PPO_logs"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_dir = log_dir + '/' + env_name + '/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    #### get number of log files in log directory
    run_num = 0
    current_num_files = next(os.walk(log_dir))[2]
    run_num = len(current_num_files)

    #### create new log file for each run

    log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"

    print("current logging run number for " + env_name + " : ", run_num)
    print("logging at : " + log_f_name)
    #####################################################

    ################### checkpointing ###################
    log_dir = "PPO_preTrained"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_dir = log_dir + '/' + env_name + '/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    run_num_pretrained = 0  #### change this to prevent overwriting weights in same env_name folder

    current_num_files = next(os.walk(log_dir))[2]
    run_num_pretrained = len(current_num_files)

    directory = "PPO_preTrained"
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = directory + '/' + env_name + '/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
    print("save checkpoint path : " + checkpoint_path)
    #####################################################

    ############# print all hyperparameters #############
    print("--------------------------------------------------------------------------------------------")
    print("max training timesteps : ", max_training_timesteps)
    print("max timesteps per episode : ", max_ep_len)
    print("model saving frequency : " + str(save_model_freq) + " timesteps")
    print("log frequency : " + str(log_freq) + " timesteps")
    print("printing average reward over episodes in last : " + str(print_freq) + " timesteps")
    print("--------------------------------------------------------------------------------------------")
    print("state space dimension : ", state_dim)
    print("action space dimension : ", action_dim)
    print("--------------------------------------------------------------------------------------------")
    if has_continuous_action_space:
        print("Initializing a continuous action space policy")
        print("--------------------------------------------------------------------------------------------")
        print("starting std of action distribution : ", action_std)
        print("decay rate of std of action distribution : ", action_std_decay_rate)
        print("minimum std of action distribution : ", min_action_std)
        print("decay frequency of std of action distribution : " + str(action_std_decay_freq) + " timesteps")
    else:
        print("Initializing a discrete action space policy")
    print("--------------------------------------------------------------------------------------------")
    print("PPO update frequency : " + str(update_timestep) + " timesteps")
    print("PPO K epochs : ", K_epochs)
    print("PPO epsilon clip : ", eps_clip)
    print("discount factor (gamma) : ", gamma)
    print("--------------------------------------------------------------------------------------------")
    print("optimizer learning rate actor : ", lr_actor)
    print("optimizer learning rate critic : ", lr_critic)
    if random_seed:
        print("--------------------------------------------------------------------------------------------")
        print("setting random seed to ", random_seed)
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)
    #####################################################

    print("============================================================================================")

    ################# training procedure ################

    # initialize a PPO agent
    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space,
                    action_std)

    # track total training time
    start_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)

    print("============================================================================================")

    # logging file
    log_f = open(log_f_name, "w+")
    log_f.write('episode,timestep,reward\n')

    # printing and logging variables
    print_running_reward = 0
    print_running_episodes = 0

    log_running_reward = 0
    log_running_episodes = 0

    time_step = 0
    i_episode = 0

    # training loop
    while time_step <= max_training_timesteps:

        state = env.reset()
        current_ep_reward = 0

        for t in range(1, max_ep_len + 1):

            # select action with policy
            action = ppo_agent.select_action(state.float())
            # print(action)
            state, reward, done, _ = env.step(action)

            # saving reward and is_terminals
            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done)

            time_step += 1
            current_ep_reward += reward

            # update PPO agent
            if time_step % update_timestep == 0:
                ppo_agent.update()

            # if continuous action space, decay the action std of the output action distribution
            if has_continuous_action_space and time_step % action_std_decay_freq == 0:
                ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)

            # log in logging file
            if time_step % log_freq == 0:
                # log average reward till last episode
                log_avg_reward = log_running_reward / log_running_episodes
                # log_avg_reward = round(log_avg_reward, 4)
                log_avg_reward = torch.round(log_avg_reward)

                log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                log_f.flush()

                log_running_reward = 0
                log_running_episodes = 0

            # printing average reward
            if time_step % print_freq == 0:
                # print average reward till last episode
                print_avg_reward = print_running_reward / print_running_episodes
                # print_avg_reward = round(print_avg_reward, 2)
                print_avg_reward = torch.round(print_avg_reward)

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step,
                                                                                        print_avg_reward))

                print_running_reward = 0
                print_running_episodes = 0

            # save model weights
            if time_step % save_model_freq == 0:
                print("--------------------------------------------------------------------------------------------")
                print("saving model at : " + checkpoint_path)
                ppo_agent.save(checkpoint_path)
                print("model saved")
                print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
                print("--------------------------------------------------------------------------------------------")

            # break; if the episode is over
            if done:
                break

        print_running_reward += current_ep_reward
        print_running_episodes += 1

        log_running_reward += current_ep_reward
        log_running_episodes += 1

        i_episode += 1

    log_f.close()
    env.close()

    # print total training time
    print("============================================================================================")
    end_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)
    print("Finished training at (GMT) : ", end_time)
    print("Total training time  : ", end_time - start_time)
    print("============================================================================================")


if __name__ == '__main__':
    train()

The code above is the training script (it also calls several other modules in the project).
The full project is here: https://github.com/YRQhit/PPOTrain/tree/main
The goal is to train a network that outputs velocity increments in three directions so that the chaser approaches a target, but training does not converge.
The training curve is shown below.

[training reward curve]

22 answers

  • 专家-赤兔 (quality creator, full-stack) 2024-03-22 11:35

    Quoted from the free WeChat mini-program: 皆我百晓生

    From the training log and code you provided, you are using the PPO (Proximal Policy Optimization) algorithm to train a reinforcement-learning model for a spacecraft orbital-transfer task, but the run does not converge: the average reward does not improve toward the goal and instead fluctuates within a large negative range.

    For a PPO run that does not converge, you can troubleshoot and optimize along the following lines:

    1. Environment design

      • Check whether your reward function is well designed. In your environment the reward is based on the distance between the two spacecraft; if positive reward only appears when the chaser is already near the target and most steps yield negative reward, learning becomes difficult. Consider adding intermediate rewards that guide the policy in the right direction, or penalties for moving away from the target (see the sketch after this list).
    2. Action scale

      • The action is clamped to the range -0.0001 to 0.0001, which may be too small for a real orbital-transfer task and makes it hard to produce meaningful changes. Adjust the initial action_std and its decay rate so that the output actions have a noticeable effect on the environment state.
    3. Exploration vs. exploitation

      • In PPO, exploration is controlled through action_std; check whether the current setting is reasonable. Too small an action_std makes the policy overly conservative and unlikely to discover better strategies, while too large a value can destabilize training. Try increasing the initial action_std and lowering its decay rate somewhat.
    4. Update frequency and K_epochs

      • A PPO update is performed only every update_timestep = max_ep_len * 4 timesteps, and each update iterates for 80 epochs, which may be inefficient. Adjust update_timestep and K_epochs to your situation: update more frequently or reduce the number of epochs to speed up training and convergence.
    5. Model architecture and parameters

      • Check whether the PPO network architecture and hyperparameters such as the learning rates (lr_actor, lr_critic) are appropriate. Too large a learning rate can make training unstable, while too small a rate slows convergence. Tune these parameters to find a good configuration.
    6. Training time

      • The log shows training ran for several hours without reaching a satisfactory state. RL problems usually need long training, so you may need more training steps or better tuning of the other parameters to improve convergence.
    7. Environment randomness

      • At every episode reset the initial positions and velocities are completely random, so the state distribution seen during training is very broad, which makes learning harder. Consider a more stable initialization, or one that gradually increases in difficulty.
    8. Discretization and clamping

      • Although the action space is continuous, operations such as torch.clamp() restrict the actions to a very small range, which may limit effective exploration. Confirm whether these operations are really necessary.
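
    To make points 1 and 2 concrete, here is a minimal sketch of a progress-based reward combined with a configurable action scale for the Environment above. This is only an illustration under assumptions: MAX_DV, the per-step progress reward and the terminal bonus are not part of the original repository.

import torch

MAX_DV = 0.01  # assumed maximum velocity increment per step; tune to the orbit scale


def shaped_step(env, action):
    """Drop-in variant of Environment.step (sketch only).

    Rescales the raw policy output instead of hard-clamping it to 1e-4 and
    returns the progress made toward the target this step plus an assumed
    terminal bonus, so the agent receives a useful signal at every timestep
    instead of a large negative distance.
    """
    action = torch.clamp(action, -1.0, 1.0) * MAX_DV

    prev_d = env.distance(env.inverseTrans(env.chase), env.inverseTrans(env.escape))
    env.chase = env.update_state(env.chase, action)
    env.escape = env.update_state(env.escape, torch.zeros(3))  # the escaper does not maneuver
    new_d = env.distance(env.inverseTrans(env.chase), env.inverseTrans(env.escape))

    reward = prev_d - new_d         # positive when the chaser closed the gap this step
    done = bool(new_d <= 0.5)
    if done:
        reward = reward + 100.0     # assumed interception bonus

    return env._get_observation(), reward, done, {}

    The progress term keeps the episode return bounded by the total distance covered, which is usually much easier for the critic to fit than a sum of large negative distances accumulated over up to 1000 steps.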

    To diagnose the problem further, it is recommended that you:

    • Visualize the reward function across the state space to verify that the reward signal can effectively guide the agent's learning.
    • Try a more stable baseline algorithm for comparison (for example DQN, or another on-policy algorithm) and check whether the same problem appears.
    • Use a tool such as TensorBoard to record and inspect the loss curves, the evolution of the policy distribution, and the reward curve during training, so the cause of the non-convergence can be analyzed visually (a minimal sketch follows below).

    While debugging, change one or a few of the factors above at a time and watch how the training curve responds, so you can isolate the most likely cause.
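
    Following the TensorBoard suggestion above, here is a minimal logging sketch that could sit alongside the existing CSV file. The tag names and the log directory are arbitrary choices, and reading action_std from the agent is an assumption about PPODemo, so check the actual attribute name there.

from torch.utils.tensorboard import SummaryWriter

# Sketch only: scalar logging next to the existing CSV log.
writer = SummaryWriter(log_dir="PPO_logs/train-orbit-cw/tensorboard")


def log_episode(writer, i_episode, episode_return, action_std=None):
    """Write one episode's return (and optionally the exploration std) as scalars."""
    writer.add_scalar("episode/return", float(episode_return), i_episode)
    if action_std is not None:  # assumes the agent exposes its current action_std
        writer.add_scalar("train/action_std", float(action_std), i_episode)


# Usage inside train(), at the end of each episode:
#   log_episode(writer, i_episode, current_ep_reward, getattr(ppo_agent, "action_std", None))
# and writer.close() once the training loop finishes.

    Running tensorboard --logdir PPO_logs then shows the reward and exploration curves without having to post-process the CSV file.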



Question events

  • Closed by the system on March 30
  • A bounty of ¥50 was added on March 22
  • Question created on March 22
