¥YRQ¥ 2024-03-24 10:20

PPO training with a discrete action space does not converge


import os
import glob
import time
import random
from datetime import datetime

import torch
import numpy as np

import gym
from gym import spaces

from PPODemo import PPO
from OrbitalTransfer import *  # modified to use normalized inputs
from OrbitCore import *


class Environment(gym.Env):
    def __init__(self):
        self.min_action = -1
        self.max_action = 1
        # observation: normalized states of chaser and evader (6 orbital elements each)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(12,),
                                            dtype=float)
        # action space: 7 discrete actions (+/- impulse along x, y, z, plus no thrust)
        self.action_space = spaces.Discrete(7)
        # self.action_space = spaces.Box(low=self.min_action, high=self.max_action, shape=(3,),
        #                                dtype=float)  # continuous alternative: velocity increments along x, y, z
        self.chase = None  # normalized chaser state
        self.escape = None  # normalized evader state
        self.time = 1
        # torch.set_default_dtype(torch.float64)

    def reset(self):
        self._initialize_positions()  # initial states normalized to [0, 1]
        observation = self._get_observation()  # values in [0, 1]
        return observation

    def _initialize_positions(self):  # initialize the spacecraft states
        # seed the RNG with the current timestamp
        random.seed(int(time.time()))
        '''
        Initialization: set a, e, i, omega, w, M0 (normalized to [0, 1]),
        with a, i and omega nearly identical for chaser and evader.
        '''
        a = random.random()
        i = random.random()
        omega = random.random()
        w = random.random()
        M0 = random.random()
        self.chase = torch.tensor([a, random.random() * 0.00001, i, omega, w - 0.0001 * random.random(),
                                   M0 - 0.0001 * random.random()])  # kept in [0, 1]
        self.escape = torch.tensor([a, random.random() * 0.00001, i, omega, w, M0])
        self.withoutImpluse = self.chase  # chaser state propagated without any impulse, kept for comparison
        # print("the coe of escape is", self.inverseTrans(self.escape))
        # print("the coe of chase is", self.inverseTrans(self.chase))
        # print("the coe of chase without Impluse is", self.inverseTrans(self.withoutImpluse))
        # print("initial distance is", self.distance_input01(self.chase, self.escape))

    def step(self, action):  # takes a discrete action index, returns observation, reward, done flag, info
        # earlier continuous-action variants, kept for reference (they clamp the action magnitude):
        # action = torch.clamp(torch.tensor(action) * 0.0001, min=-0.0001, max=0.0001)
        # action = torch.clamp(action * 0.0001, min=-0.0001, max=0.0001)
        # action = torch.clamp(action, min=-1, max=1)
        # action = torch.tensor([0, 0, 0])
        # print("action is", action)
        # print("the origin coe of chase is", self.inverseTrans(self.chase))
        # print("the origin coe of chase without Impluse is", self.inverseTrans(self.withoutImpluse))
        action = self.chooseAction(action)  # map the discrete action index to a small impulse vector
        # torch.set_printoptions(precision=20)
        # print("action is ", action)

        self.chase = self.update_state(self.chase, action)
        action = torch.tensor([0, 0, 0])  # the evader and the no-impulse reference coast without thrust
        self.escape = self.update_state(self.escape, action)
        self.withoutImpluse = self.update_state(self.withoutImpluse, action)

        observation = self._get_observation()
        reward = self.reward()
        done = self.check_termination()
        # print("update distance is", self.distance_input01(self.chase, self.escape))
        # print("update distance of noImpluse is", self.distance_input01(self.withoutImpluse, self.escape))
        # print("update of the coe of chase is", self.inverseTrans(self.chase))
        # print("update of the coe of chase without Impluse is", self.inverseTrans(self.withoutImpluse))
        # print("error between of two is", self.inverseTrans(self.chase) - self.inverseTrans(self.withoutImpluse))
        return observation, reward, done, {}

    def chooseAction(self, action):  # map a discrete action index to a small velocity impulse vector
        if action == 0:
            output = torch.tensor([1, 0, 0])
        elif action == 1:
            output = torch.tensor([0, 1, 0])
        elif action == 2:
            output = torch.tensor([0, 0, 1])
        elif action == 3:
            output = torch.tensor([0, 0, 0])
        elif action == 4:
            output = torch.tensor([-1, 0, 0])
        elif action == 5:
            output = torch.tensor([0, -1, 0])
        elif action == 6:
            output = torch.tensor([0, 0, -1])
        else:
            raise ValueError("invalid discrete action: {}".format(action))
        return output * 0.0001  # impulse magnitude limited to 1e-4


    def update_state(self, state, action):  # propagate a normalized state by one time step and return it
        S = self.inverseTrans(state)
        Core = OrbitCore()
        Transfer = OrbitalTransfer()
        r, v = Core.Orbit_Element_2_State_rv(S)
        v = v + action
        Predict = OrbitPredict()
        rv = torch.cat((r.unsqueeze(0), v.unsqueeze(0)), dim=1).view(-1)
        # print("rv", rv)

        # r, v = Predict.J2OrbitRV(rv, 50)  # propagate for 50 s, returns r, v
        # coe = Core.State_rv_2_Orbit_Element(r, v)  # this function had issues and needed fixing
        # coe2 = Core.State_rv_2_Orbit_Element(r.numpy(), v.numpy())
        # state = self.trans(coe)  # normalize to [0, 1]

        coe = Core.State_rv_2_Orbit_Element(r, v)
        if torch.isnan(coe[5]):
            print("error in main update_state State_rv_2_Orbit_Element")
        # print("coe", coe)  # e > 1 can occur here, so keep the impulse below 0.0001

        coe = Predict.J2Orbit(coe, 50)  # J2 propagation for 50 s
        if torch.isnan(coe[5]):
            print("error in main update_state J2Orbit")
        state = self.trans(coe)

        return state

    def _get_observation(self):
        # note: torch.cat along dim=1 yields shape (1, 12), while observation_space is declared as (12,)
        observation = torch.cat((self.chase.unsqueeze(0), self.escape.unsqueeze(0)), dim=1)
        return observation

    def check_termination(self):
        terminate = False
        chase = self.inverseTrans(self.chase)  # convert from [0, 1] back to the six orbital elements
        escape = self.inverseTrans(self.escape)
        distance = self.distance(chase, escape)
        if distance <= 0.5:  # capture when the distance drops below 0.5 km
            terminate = True
        return terminate

    def trans(self, state):  # normalize the six classical orbital elements to roughly [0, 1]
        a = (state[0] - 30000) / 15000
        e = state[1]
        i = state[2] / 180
        omega = state[3] / 180
        w = state[4] / 360
        M0 = state[5] / 360
        return torch.tensor([a, e, i, omega, w, M0])

    def inverseTrans(self, state):  # map normalized values back to the six orbital elements
        a = state[0] * 15000 + 30000
        e = state[1]
        i = state[2] * 180
        omega = state[3] * 180
        w = state[4] * 360
        M0 = state[5] * 360
        return torch.tensor([a, e, i, omega, w, M0], dtype=torch.float64)

    def distance(self, state1, state2):
        Core = OrbitCore()
        r1, v1 = Core.Orbit_Element_2_State_rv(state1)  # tensor-compatible version; the states are orbital element sets
        r2, v2 = Core.Orbit_Element_2_State_rv(state2)
        distance = torch.norm(r1 - r2)
        return distance

    def distance_input01(self, chase, escape):  # distance computed from normalized states
        coe_chase = self.inverseTrans(chase)
        coe_escape = self.inverseTrans(escape)
        return self.distance(coe_chase, coe_escape)

    def reward(self):  # reward is the negative chaser-evader distance
        coe_chase = self.inverseTrans(self.chase)
        coe_escape = self.inverseTrans(self.escape)
        reward = -1 * self.distance(coe_chase, coe_escape)
        return reward


################################### Training ###################################
def train():
    # Predict = OrbitPredict()
    print("============================================================================================")

    ####### initialize environment hyperparameters ######
    env_name = "train-orbit-discrete"

    has_continuous_action_space = False  # continuous action space; else discrete

    max_ep_len = 500  # max timesteps in one episode (episode length reduced to 500)
    max_training_timesteps = int(3e6)  # break training loop if timesteps > max_training_timesteps

    print_freq = max_ep_len * 10  # print avg reward in the interval (in num timesteps)
    log_freq = max_ep_len * 2  # log avg reward in the interval (in num timesteps)
    save_model_freq = int(1e5)  # save model frequency (in num timesteps)

    action_std = 0.6  # starting std for action distribution (Multivariate Normal)
    action_std_decay_rate = 0.05  # linearly decay action_std (action_std = action_std - action_std_decay_rate)
    min_action_std = 0.1  # minimum action_std (stop decay after action_std <= min_action_std)
    action_std_decay_freq = int(2.5e5)  # action_std decay frequency (in num timesteps)
    #####################################################

    ## Note : print/log frequencies should be greater than max_ep_len

    ################ PPO hyperparameters ################
    update_timestep = max_ep_len * 4  # update policy every n timesteps
    K_epochs = 80  # update policy for K epochs in one PPO update

    eps_clip = 0.2  # clip parameter for PPO
    gamma = 0.99  # discount factor

    lr_actor = 0.0001  # learning rate for actor network
    lr_critic = 0.0005  # learning rate for critic network

    random_seed = 0  # set random seed if required (0 = no random seed)
    #####################################################

    print("training environment name : " + env_name)

    # env = gym.make(env_name)

    env = Environment()

    # state space dimension
    state_dim = env.observation_space.shape[0]

    # action space dimension
    if has_continuous_action_space:
        action_dim = env.action_space.shape[0]
    else:
        action_dim = env.action_space.n

    ###################### logging ######################

    #### log files for multiple runs are NOT overwritten
    log_dir = "PPO_logs"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_dir = log_dir + '/' + env_name + '/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    #### get number of log files in log directory
    run_num = 0
    current_num_files = next(os.walk(log_dir))[2]
    run_num = len(current_num_files)

    #### create new log file for each run

    log_f_name = log_dir + 'PPO_' + env_name + "_log_" + str(run_num) + ".csv"

    print("current logging run number for " + env_name + " : ", run_num)
    print("logging at : " + log_f_name)
    #####################################################

    ################### checkpointing ###################
    log_dir = "PPO_preTrained"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_dir = log_dir + '/' + env_name + '/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    run_num_pretrained = 0  #### change this to prevent overwriting weights in same env_name folder

    current_num_files = next(os.walk(log_dir))[2]
    run_num_pretrained = len(current_num_files)

    # the checkpoint directory PPO_preTrained/<env_name>/ was already created above as log_dir
    checkpoint_path = log_dir + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
    print("save checkpoint path : " + checkpoint_path)
    #####################################################

    ############# print all hyperparameters #############
    print("--------------------------------------------------------------------------------------------")
    print("max training timesteps : ", max_training_timesteps)
    print("max timesteps per episode : ", max_ep_len)
    print("model saving frequency : " + str(save_model_freq) + " timesteps")
    print("log frequency : " + str(log_freq) + " timesteps")
    print("printing average reward over episodes in last : " + str(print_freq) + " timesteps")
    print("--------------------------------------------------------------------------------------------")
    print("state space dimension : ", state_dim)
    print("action space dimension : ", action_dim)
    print("--------------------------------------------------------------------------------------------")
    if has_continuous_action_space:
        print("Initializing a continuous action space policy")
        print("--------------------------------------------------------------------------------------------")
        print("starting std of action distribution : ", action_std)
        print("decay rate of std of action distribution : ", action_std_decay_rate)
        print("minimum std of action distribution : ", min_action_std)
        print("decay frequency of std of action distribution : " + str(action_std_decay_freq) + " timesteps")
    else:
        print("Initializing a discrete action space policy")
    print("--------------------------------------------------------------------------------------------")
    print("PPO update frequency : " + str(update_timestep) + " timesteps")
    print("PPO K epochs : ", K_epochs)
    print("PPO epsilon clip : ", eps_clip)
    print("discount factor (gamma) : ", gamma)
    print("--------------------------------------------------------------------------------------------")
    print("optimizer learning rate actor : ", lr_actor)
    print("optimizer learning rate critic : ", lr_critic)
    if random_seed:
        print("--------------------------------------------------------------------------------------------")
        print("setting random seed to ", random_seed)
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)
    #####################################################

    print("============================================================================================")

    ################# training procedure ################

    # initialize a PPO agent
    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space,
                    action_std)

    # track total training time
    start_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)

    print("============================================================================================")

    # logging file
    log_f = open(log_f_name, "w+")
    log_f.write('episode,timestep,reward\n')

    # printing and logging variables
    print_running_reward = 0
    print_running_episodes = 0

    log_running_reward = 0
    log_running_episodes = 0

    time_step = 0
    i_episode = 0

    # training loop
    while time_step <= max_training_timesteps:

        state = env.reset()
        current_ep_reward = 0

        for t in range(1, max_ep_len + 1):

            # select action with policy
            action = ppo_agent.select_action(state.float())
            # print(action)
            state, reward, done, _ = env.step(action)

            # saving reward and is_terminals
            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done)

            time_step += 1
            current_ep_reward += reward

            # update PPO agent
            if time_step % update_timestep == 0:
                ppo_agent.update()

            # if continuous action space; then decay action std of output action distribution
            if has_continuous_action_space and time_step % action_std_decay_freq == 0:
                ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)

            # log in logging file
            if time_step % log_freq == 0:
                # log average reward till last episode
                log_avg_reward = log_running_reward / log_running_episodes
                # log_avg_reward = round(log_avg_reward, 4)
                log_avg_reward = torch.round(log_avg_reward)

                log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                log_f.flush()

                log_running_reward = 0
                log_running_episodes = 0

            # printing average reward
            if time_step % print_freq == 0:
                # print average reward till last episode
                print_avg_reward = print_running_reward / print_running_episodes
                # print_avg_reward = round(print_avg_reward, 2)
                print_avg_reward = torch.round(print_avg_reward)

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step,
                                                                                        print_avg_reward))

                print_running_reward = 0
                print_running_episodes = 0

            # save model weights
            if time_step % save_model_freq == 0:
                print("--------------------------------------------------------------------------------------------")
                print("saving model at : " + checkpoint_path)
                ppo_agent.save(checkpoint_path)
                print("model saved")
                print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
                print("--------------------------------------------------------------------------------------------")

            # break; if the episode is over
            if done:
                break

        print_running_reward += current_ep_reward
        print_running_episodes += 1

        log_running_reward += current_ep_reward
        log_running_episodes += 1

        i_episode += 1

    log_f.close()
    env.close()

    # print total training time
    print("============================================================================================")
    end_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)
    print("Finished training at (GMT) : ", end_time)
    print("Total training time  : ", end_time - start_time)
    print("============================================================================================")

if __name__ == '__main__':
    train()

I have already changed the code to use a discrete action space, but training still does not converge.

Reward curve:

[image: reward curve]
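
For reference on the reward scale: reward() above returns the raw negative chaser-evader distance in kilometres, which for these initial conditions is typically tens of kilometres per step, far from the order-1 range PPO value targets usually sit in. Below is a minimal, hypothetical sketch of scaling the reward before it reaches the PPO buffer; the wrapper itself and the 50 km scale are illustrative assumptions, not part of the code above, and it assumes the old gym 4-tuple step API used in this script.

import gym


class ScaledRewardWrapper(gym.Wrapper):
    """Divide the raw km-scale reward by a fixed constant (illustrative only)."""

    def __init__(self, env, reward_scale=50.0):
        super().__init__(env)
        self.reward_scale = reward_scale  # assumed: roughly the typical initial separation in km

    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        return observation, float(reward) / self.reward_scale, done, info


# usage (hypothetical): env = ScaledRewardWrapper(Environment(), reward_scale=50.0)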


2 replies

  • 码农阿豪@新空间代码工作室 2024-03-24 10:21
    Let 阿豪 help you with this one. This reply was written with reference to ChatGPT 3.5; if you still have questions, feel free to comment or leave a message.
    The code implements an environment class within a deep reinforcement learning framework, providing an environment for an agent to learn in.
    Questions:
    1. Which attributes and methods does this environment class provide?
    2. What is this environment class for, and how is it implemented?
    Answers:
    1. The environment class provides the following attributes and methods:
       • min_action: minimum action value
       • max_action: maximum action value
       • observation_space: the observation space, describing the environment information the agent can observe
       • action_space: the action space, describing the actions the agent can take
       • chase: state of the chasing spacecraft
       • escape: state of the escaping spacecraft
       • time: time
       • reset(): reset the environment state
       • _initialize_positions(): initialize the spacecraft states
       • _get_observation(): get the observation
       • step(action): execute an action and return the next state, the reward, and the done flag
    2. The purpose of this environment class is to provide a simulated environment in which an agent can perform reinforcement learning. It describes the agent-environment interaction through a framework of states, actions, rewards, and terminal states, and makes that framework concrete by implementing the attributes and methods above. In this implementation, the states of the chasing and escaping spacecraft serve as the environment state; after an action is chosen, the state is updated and the new state, reward, and done flag are returned. On this basis, the agent can learn and make decisions with techniques such as deep reinforcement learning. A minimal interaction sketch follows.
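
    A minimal sketch of the interaction loop described above, using the Environment class from the question with random discrete actions (no PPO agent involved). It assumes the question's script, including the OrbitCore / OrbitalTransfer modules, is importable; the 50-step horizon is arbitrary.

    # Minimal random-action rollout against the Environment class from the question.
    # Only reset() / step() are exercised here.
    env = Environment()
    state = env.reset()

    for t in range(50):  # 50 steps, chosen arbitrarily for illustration
        action = env.action_space.sample()        # random index in {0, ..., 6}
        state, reward, done, info = env.step(action)
        print("step", t, "reward", float(reward), "done", done)
        if done:
            break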
