¥YRQ¥ 2024-03-24 10:20

PPO training with a discrete action space does not converge


import os
import glob
import time
import random
from datetime import datetime

import torch
import numpy as np

import gym
from gym import spaces

from PPODemo import PPO
from OrbitalTransfer import *  # modified to use normalized inputs
from OrbitCore import *


class Environment(gym.Env):
    def __init__(self):
        self.min_action = -1
        self.max_action = 1
        # observation: normalized states of chaser and evader (6 orbital elements each)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(12,),
                                            dtype=float)
        # action space: 7 discrete actions (+/- impulse along x, y, z, plus no thrust)
        self.action_space = spaces.Discrete(7)
        # self.action_space = spaces.Box(low=self.min_action, high=self.max_action, shape=(3,),
        #                                dtype=float)  # continuous alternative: velocity increments along x, y, z
        self.chase = None  # normalized chaser state
        self.escape = None  # normalized evader state
        self.time = 1
        # torch.set_default_dtype(torch.float64)

    def reset(self):
        self._initialize_positions()  # initial states normalized to [0, 1]
        observation = self._get_observation()  # values in [0, 1]
        return observation

    def _initialize_positions(self):  # initialize the spacecraft states
        # seed the RNG with the current timestamp
        random.seed(int(time.time()))
        '''
        Initialization: set a, e, i, omega, w, M0 (normalized to [0, 1]),
        with a, i and omega nearly identical for chaser and evader.
        '''
        a = random.random()
        i = random.random()
        omega = random.random()
        w = random.random()
        M0 = random.random()
        self.chase = torch.tensor([a, random.random() * 0.00001, i, omega, w - 0.0001 * random.random(),
                                   M0 - 0.0001 * random.random()])  # kept in [0, 1]
        self.escape = torch.tensor([a, random.random() * 0.00001, i, omega, w, M0])
        self.withoutImpluse = self.chase  # chaser state propagated without any impulse, kept for comparison
        # print("the coe of escape is", self.inverseTrans(self.escape))
        # print("the coe of chase is", self.inverseTrans(self.chase))
        # print("the coe of chase without Impluse is", self.inverseTrans(self.withoutImpluse))
        # print("initial distance is", self.distance_input01(self.chase, self.escape))

    def step(self, action):  # takes a discrete action index, returns observation, reward, done flag, info
        # earlier continuous-action variants, kept for reference (they clamp the action magnitude):
        # action = torch.clamp(torch.tensor(action) * 0.0001, min=-0.0001, max=0.0001)
        # action = torch.clamp(action * 0.0001, min=-0.0001, max=0.0001)
        # action = torch.clamp(action, min=-1, max=1)
        # action = torch.tensor([0, 0, 0])
        # print("action is", action)
        # print("the origin coe of chase is", self.inverseTrans(self.chase))
        # print("the origin coe of chase without Impluse is", self.inverseTrans(self.withoutImpluse))
        action = self.chooseAction(action)  # map the discrete action index to a small impulse vector
        # torch.set_printoptions(precision=20)
        # print("action is ", action)

        self.chase = self.update_state(self.chase, action)
        action = torch.tensor([0, 0, 0])  # the evader and the no-impulse reference coast without thrust
        self.escape = self.update_state(self.escape, action)
        self.withoutImpluse = self.update_state(self.withoutImpluse, action)

        observation = self._get_observation()
        reward = self.reward()
        done = self.check_termination()
        # print("update distance is", self.distance_input01(self.chase, self.escape))
        # print("update distance of noImpluse is", self.distance_input01(self.withoutImpluse, self.escape))
        # print("update of the coe of chase is", self.inverseTrans(self.chase))
        # print("update of the coe of chase without Impluse is", self.inverseTrans(self.withoutImpluse))
        # print("error between of two is", self.inverseTrans(self.chase) - self.inverseTrans(self.withoutImpluse))
        return observation, reward, done, {}

    def chooseAction(self, action):  # map a discrete action index to a small velocity impulse vector
        if action == 0:
            output = torch.tensor([1, 0, 0])
        elif action == 1:
            output = torch.tensor([0, 1, 0])
        elif action == 2:
            output = torch.tensor([0, 0, 1])
        elif action == 3:
            output = torch.tensor([0, 0, 0])
        elif action == 4:
            output = torch.tensor([-1, 0, 0])
        elif action == 5:
            output = torch.tensor([0, -1, 0])
        elif action == 6:
            output = torch.tensor([0, 0, -1])
        else:
            raise ValueError("invalid discrete action: {}".format(action))
        return output * 0.0001  # impulse magnitude limited to 1e-4


    def update_state(self, state, action):  # propagate a normalized state by one time step and return it
        S = self.inverseTrans(state)
        Core = OrbitCore()
        Transfer = OrbitalTransfer()
        r, v = Core.Orbit_Element_2_State_rv(S)
        v = v + action
        Predict = OrbitPredict()
        rv = torch.cat((r.unsqueeze(0), v.unsqueeze(0)), dim=1).view(-1)
        # print("rv", rv)

        # r, v = Predict.J2OrbitRV(rv, 50)  # propagate for 50 s, returns r, v
        # coe = Core.State_rv_2_Orbit_Element(r, v)  # this function had issues and needed fixing
        # coe2 = Core.State_rv_2_Orbit_Element(r.numpy(), v.numpy())
        # state = self.trans(coe)  # normalize to [0, 1]

        coe = Core.State_rv_2_Orbit_Element(r, v)
        if torch.isnan(coe[5]):
            print("error in main update_state State_rv_2_Orbit_Element")
        # print("coe", coe)  # e > 1 can occur here, so keep the impulse below 0.0001

        coe = Predict.J2Orbit(coe, 50)  # J2 propagation for 50 s
        if torch.isnan(coe[5]):
            print("error in main update_state J2Orbit")
        state = self.trans(coe)

        return state

    def _get_observation(self):
        # note: torch.cat along dim=1 yields shape (1, 12), while observation_space is declared as (12,)
        observation = torch.cat((self.chase.unsqueeze(0), self.escape.unsqueeze(0)), dim=1)
        return observation

    def check_termination(self):
        terminate = False
        chase = self.inverseTrans(self.chase)  # convert from [0, 1] back to the six orbital elements
        escape = self.inverseTrans(self.escape)
        distance = self.distance(chase, escape)
        if distance <= 0.5:  # capture when the distance drops below 0.5 km
            terminate = True
        return terminate

    def trans(self, state):  # normalize the six classical orbital elements to roughly [0, 1]
        a = (state[0] - 30000) / 15000
        e = state[1]
        i = state[2] / 180
        omega = state[3] / 180
        w = state[4] / 360
        M0 = state[5] / 360
        return torch.tensor([a, e, i, omega, w, M0])

    def inverseTrans(self, state):  # map normalized values back to the six orbital elements
        a = state[0] * 15000 + 30000
        e = state[1]
        i = state[2] * 180
        omega = state[3] * 180
        w = state[4] * 360
        M0 = state[5] * 360
        return torch.tensor([a, e, i, omega, w, M0], dtype=torch.float64)

    def distance(self, state1, state2):
        Core = OrbitCore()
        r1, v1 = Core.Orbit_Element_2_State_rv(state1)  # tensor-compatible version; the states are orbital element sets
        r2, v2 = Core.Orbit_Element_2_State_rv(state2)
        distance = torch.norm(r1 - r2)
        return distance

    def distance_input01(self, chase, escape):  # distance computed from normalized states
        coe_chase = self.inverseTrans(chase)
        coe_escape = self.inverseTrans(escape)
        return self.distance(coe_chase, coe_escape)

    def reward(self):  # reward is the negative chaser-evader distance
        coe_chase = self.inverseTrans(self.chase)
        coe_escape = self.inverseTrans(self.escape)
        reward = -1 * self.distance(coe_chase, coe_escape)
        return reward


################################### Training ###################################
def train():
    # Predict = OrbitPredict()
    print("============================================================================================")

    ####### initialize environment hyperparameters ######
    env_name = "train-orbit-discrete"

    has_continuous_action_space = False  # continuous action space; else discrete

    max_ep_len = 500  # max timesteps in one episode (episode length reduced to 500)
    max_training_timesteps = int(3e6)  # break training loop if timesteps > max_training_timesteps

    print_freq = max_ep_len * 10  # print avg reward in the interval (in num timesteps)
    log_freq = max_ep_len * 2  # log avg reward in the interval (in num timesteps)
    save_model_freq = int(1e5)  # save model frequency (in num timesteps)

    action_std = 0.6  # starting std for action distribution (Multivariate Normal)
    action_std_decay_rate = 0.05  # linearly decay action_std (action_std = action_std - action_std_decay_rate)
    min_action_std = 0.1  # minimum action_std (stop decay after action_std <= min_action_std)
    action_std_decay_freq = int(2.5e5)  # action_std decay frequency (in num timesteps)
    #####################################################

    ## Note : print/log frequencies should be greater than max_ep_len

    ################ PPO hyperparameters ################
    update_timestep = max_ep_len * 4  # update policy every n timesteps
    K_epochs = 80  # update policy for K epochs in one PPO update

    eps_clip = 0.2  # clip parameter for PPO
    gamma = 0.99  # discount factor

    lr_actor = 0.0001  # learning rate for actor network
    lr_critic = 0.0005  # learning rate for critic network

    random_seed = 0  # set random seed if required (0 = no random seed)
    #####################################################

    print("training environment name : " + env_name)

    # env = gym.make(env_name)

    env = Environment()

    # state space dimension
    state_dim = env.observation_space.shape[0]

    # action space dimension
    if has_continuous_action_space:
        action_dim = env.action_space.shape[0]
    else:
        action_dim = env.action_space.n

    ###################### logging ######################

    #### log files for multiple runs are NOT overwritten
    log_dir = "PPO_logs"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_dir = log_dir + '/' + env_name + '/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    #### get number of log files in log directory
    run_num = 0
    current_num_files = next(os.walk(log_dir))[2]
    run_num = len(current_num_files)

    #### create new log file for each run

    log_f_name = log_dir + 'PPO_' + env_name + "_log_" + str(run_num) + ".csv"

    print("current logging run number for " + env_name + " : ", run_num)
    print("logging at : " + log_f_name)
    #####################################################

    ################### checkpointing ###################
    log_dir = "PPO_preTrained"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_dir = log_dir + '/' + env_name + '/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    run_num_pretrained = 0  #### change this to prevent overwriting weights in same env_name folder

    current_num_files = next(os.walk(log_dir))[2]
    run_num_pretrained = len(current_num_files)

    # the checkpoint directory PPO_preTrained/<env_name>/ was already created above as log_dir
    checkpoint_path = log_dir + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
    print("save checkpoint path : " + checkpoint_path)
    #####################################################

    ############# print all hyperparameters #############
    print("--------------------------------------------------------------------------------------------")
    print("max training timesteps : ", max_training_timesteps)
    print("max timesteps per episode : ", max_ep_len)
    print("model saving frequency : " + str(save_model_freq) + " timesteps")
    print("log frequency : " + str(log_freq) + " timesteps")
    print("printing average reward over episodes in last : " + str(print_freq) + " timesteps")
    print("--------------------------------------------------------------------------------------------")
    print("state space dimension : ", state_dim)
    print("action space dimension : ", action_dim)
    print("--------------------------------------------------------------------------------------------")
    if has_continuous_action_space:
        print("Initializing a continuous action space policy")
        print("--------------------------------------------------------------------------------------------")
        print("starting std of action distribution : ", action_std)
        print("decay rate of std of action distribution : ", action_std_decay_rate)
        print("minimum std of action distribution : ", min_action_std)
        print("decay frequency of std of action distribution : " + str(action_std_decay_freq) + " timesteps")
    else:
        print("Initializing a discrete action space policy")
    print("--------------------------------------------------------------------------------------------")
    print("PPO update frequency : " + str(update_timestep) + " timesteps")
    print("PPO K epochs : ", K_epochs)
    print("PPO epsilon clip : ", eps_clip)
    print("discount factor (gamma) : ", gamma)
    print("--------------------------------------------------------------------------------------------")
    print("optimizer learning rate actor : ", lr_actor)
    print("optimizer learning rate critic : ", lr_critic)
    if random_seed:
        print("--------------------------------------------------------------------------------------------")
        print("setting random seed to ", random_seed)
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)
    #####################################################

    print("============================================================================================")

    ################# training procedure ################

    # initialize a PPO agent
    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space,
                    action_std)

    # track total training time
    start_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)

    print("============================================================================================")

    # logging file
    log_f = open(log_f_name, "w+")
    log_f.write('episode,timestep,reward\n')

    # printing and logging variables
    print_running_reward = 0
    print_running_episodes = 0

    log_running_reward = 0
    log_running_episodes = 0

    time_step = 0
    i_episode = 0

    # training loop
    while time_step <= max_training_timesteps:

        state = env.reset()
        current_ep_reward = 0

        for t in range(1, max_ep_len + 1):

            # select action with policy
            action = ppo_agent.select_action(state.float())
            # print(action)
            state, reward, done, _ = env.step(action)

            # saving reward and is_terminals
            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done)

            time_step += 1
            current_ep_reward += reward

            # update PPO agent
            if time_step % update_timestep == 0:
                ppo_agent.update()

            # if continuous action space; then decay action std of output action distribution
            if has_continuous_action_space and time_step % action_std_decay_freq == 0:
                ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)

            # log in logging file
            if time_step % log_freq == 0:
                # log average reward till last episode
                log_avg_reward = log_running_reward / log_running_episodes
                # log_avg_reward = round(log_avg_reward, 4)
                log_avg_reward = torch.round(log_avg_reward)

                log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                log_f.flush()

                log_running_reward = 0
                log_running_episodes = 0

            # printing average reward
            if time_step % print_freq == 0:
                # print average reward till last episode
                print_avg_reward = print_running_reward / print_running_episodes
                # print_avg_reward = round(print_avg_reward, 2)
                print_avg_reward = torch.round(print_avg_reward)

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step,
                                                                                        print_avg_reward))

                print_running_reward = 0
                print_running_episodes = 0

            # save model weights
            if time_step % save_model_freq == 0:
                print("--------------------------------------------------------------------------------------------")
                print("saving model at : " + checkpoint_path)
                ppo_agent.save(checkpoint_path)
                print("model saved")
                print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
                print("--------------------------------------------------------------------------------------------")

            # break; if the episode is over
            if done:
                break

        print_running_reward += current_ep_reward
        print_running_episodes += 1

        log_running_reward += current_ep_reward
        log_running_episodes += 1

        i_episode += 1

    log_f.close()
    env.close()

    # print total training time
    print("============================================================================================")
    end_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)
    print("Finished training at (GMT) : ", end_time)
    print("Total training time  : ", end_time - start_time)
    print("============================================================================================")

if __name__ == '__main__':
    train()

I have already changed the code to use a discrete action space, but training still does not converge.

Reward curve:

[image: reward curve]
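
For reference on the reward scale: reward() above returns the raw negative chaser-evader distance in kilometres, which for these initial conditions is typically tens of kilometres per step, far from the order-1 range PPO value targets usually sit in. Below is a minimal, hypothetical sketch of scaling the reward before it reaches the PPO buffer; the wrapper itself and the 50 km scale are illustrative assumptions, not part of the code above, and it assumes the old gym 4-tuple step API used in this script.

import gym


class ScaledRewardWrapper(gym.Wrapper):
    """Divide the raw km-scale reward by a fixed constant (illustrative only)."""

    def __init__(self, env, reward_scale=50.0):
        super().__init__(env)
        self.reward_scale = reward_scale  # assumed: roughly the typical initial separation in km

    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        return observation, float(reward) / self.reward_scale, done, info


# usage (hypothetical): env = ScaledRewardWrapper(Environment(), reward_scale=50.0)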


2 replies

  • 码农阿豪@新空间代码工作室 2024-03-24 10:21
    Let 阿豪 help you with this one. This reply was written with reference to ChatGPT 3.5; if you still have questions, feel free to comment or leave a message.
    The code implements an environment class within a deep reinforcement learning framework, providing an environment for an agent to learn in.
    Questions:
    1. Which attributes and methods does this environment class provide?
    2. What is this environment class for, and how is it implemented?
    Answers:
    1. The environment class provides the following attributes and methods:
       • min_action: minimum action value
       • max_action: maximum action value
       • observation_space: the observation space, describing the environment information the agent can observe
       • action_space: the action space, describing the actions the agent can take
       • chase: state of the chasing spacecraft
       • escape: state of the escaping spacecraft
       • time: time
       • reset(): reset the environment state
       • _initialize_positions(): initialize the spacecraft states
       • _get_observation(): get the observation
       • step(action): execute an action and return the next state, the reward, and the done flag
    2. The purpose of this environment class is to provide a simulated environment in which an agent can perform reinforcement learning. It describes the agent-environment interaction through a framework of states, actions, rewards, and terminal states, and makes that framework concrete by implementing the attributes and methods above. In this implementation, the states of the chasing and escaping spacecraft serve as the environment state; after an action is chosen, the state is updated and the new state, reward, and done flag are returned. On this basis, the agent can learn and make decisions with techniques such as deep reinforcement learning. A minimal interaction sketch follows.
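
    A minimal sketch of the interaction loop described above, using the Environment class from the question with random discrete actions (no PPO agent involved). It assumes the question's script, including the OrbitCore / OrbitalTransfer modules, is importable; the 50-step horizon is arbitrary.

    # Minimal random-action rollout against the Environment class from the question.
    # Only reset() / step() are exercised here.
    env = Environment()
    state = env.reset()

    for t in range(50):  # 50 steps, chosen arbitrarily for illustration
        action = env.action_space.sample()        # random index in {0, ..., 6}
        state, reward, done, info = env.step(action)
        print("step", t, "reward", float(reward), "done", done)
        if done:
            break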
