¥YRQ¥ 2024-03-24 11:06

PPO does not converge on a discretized 2D pursuit problem


import os
import glob
import time
from datetime import datetime

import torch
import numpy as np

import gym

from PPODemo import PPO

from gym import spaces

from OrbitalTransfer import *
# modified to use normalized inputs

import random

from OrbitCore import *

# Build a simple environment to check whether the training code converges
class Environment(gym.Env):
    def __init__(self):
        self.min_action = -1
        self.max_action = 1
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(4,),
                                            dtype=float)  # chaser (x, y) and escaper (x, y)
        # discrete action space; note that chooseAction defines five moves (0-4),
        # while Discrete(4) only ever samples actions 0-3, so the -x move is never chosen
        self.action_space = spaces.Discrete(4)
        self.chase = None  # chaser position
        self.escape = None  # escaper position
        self.time = 1

    def reset(self):

        self._initialize_positions()  # random integer positions on the grid
        observation = self._get_observation()
        return observation

    def _initialize_positions(self):  # initialize the spacecraft positions
        # seed the random number generator with the current timestamp
        current_time = int(time.time())
        random.seed(current_time)
        self.chase = torch.tensor([random.randint(1, 60), random.randint(1, 60)])
        self.escape = torch.tensor([random.randint(1, 60), random.randint(1, 60)])

    def step(self, action):  # takes a discrete action (tensor) and returns observation, reward, done flag
        action = self.chooseAction(action)  # map the discrete action to a move vector

        self.chase = self.update_state(self.chase, action)
        action = torch.tensor([0, 0])  # the escaper does not move
        self.escape = self.update_state(self.escape, action)
        print("escape is", self.escape, "chase is ", self.chase)

        observation = self._get_observation()
        reward = self.reward()
        done = self.check_termination()
        return observation, reward, done, None

    def chooseAction(self, action):
        if (action == 0):
            output = torch.tensor([1, 0])
        elif (action == 1):
            output = torch.tensor([0, 1])
        elif (action == 2):
            output = torch.tensor([0, 0])
        elif (action == 3):
            output = torch.tensor([0, -1])
        elif (action == 4):
            output = torch.tensor([-1, 0])
        return output



    def update_state(self, state, action):  # apply the move to the given position and return it
        state = state + action
        return state

    def _get_observation(self):
        # concatenates the two positions into a (1, 4) tensor (note: observation_space is declared as shape (4,))
        observation = torch.cat((self.chase.unsqueeze(0), self.escape.unsqueeze(0)), dim=1)
        return observation

    def check_termination(self):
        terminate = False
        if self.escape[0] == self.chase[0] and self.escape[1] == self.chase[1]:
            terminate = True

        return terminate

    # the two methods below are leftovers from the orbital version and are not used by this grid environment
    # (inverseTrans is not defined in this class)
    def distance(self, state1, state2):
        Core = OrbitCore()
        r1, v1 = Core.Orbit_Element_2_State_rv(state1)  # modified to support tensors; state is six orbital elements
        r2, v2 = Core.Orbit_Element_2_State_rv(state2)
        distance = torch.norm(r1 - r2)
        return distance

    def distance_input01(self, chase, escape):
        coe_chase = self.inverseTrans(chase)
        coe_escape = self.inverseTrans(escape)
        reward = self.distance(coe_chase, coe_escape)
        return reward

    def reward(self):
        # negative Manhattan distance between chaser and escaper
        reward = -1 * (torch.abs(self.chase[0] - self.escape[0]) + torch.abs(self.chase[1] - self.escape[1]))
        return reward


################################### Training ###################################
def train():
    # Predict = OrbitPredict()
    print("============================================================================================")

    ####### initialize environment hyperparameters ######
    env_name = "train-orbit-discrete"

    has_continuous_action_space = False  # continuous action space; else discrete

    max_ep_len = 100  # max timesteps in one episode
    max_training_timesteps = int(3e6)  # break training loop if timesteps > max_training_timesteps

    print_freq = max_ep_len * 10  # print avg reward in the interval (in num timesteps)
    log_freq = max_ep_len * 2  # log avg reward in the interval (in num timesteps)
    save_model_freq = int(1e5)  # save model frequency (in num timesteps)

    action_std = 0.6  # starting std for action distribution (Multivariate Normal)
    action_std_decay_rate = 0.05  # linearly decay action_std (action_std = action_std - action_std_decay_rate)
    min_action_std = 0.1  # minimum action_std (stop decay after action_std <= min_action_std)
    action_std_decay_freq = int(2.5e5)  # action_std decay frequency (in num timesteps)
    #####################################################

    ## Note : print/log frequencies should be greater than max_ep_len

    ################ PPO hyperparameters ################
    update_timestep = max_ep_len * 4  # update policy every n timesteps
    K_epochs = 80  # update policy for K epochs in one PPO update

    eps_clip = 0.2  # clip parameter for PPO
    gamma = 0.99  # discount factor

    lr_actor = 0.0001  # learning rate for actor network
    lr_critic = 0.0005  # learning rate for critic network

    random_seed = 0  # set random seed if required (0 = no random seed)
    #####################################################

    print("training environment name : " + env_name)

    # env = gym.make(env_name)

    env = Environment()

    # state space dimension
    state_dim = env.observation_space.shape[0]

    # action space dimension
    if has_continuous_action_space:
        action_dim = env.action_space.shape[0]
    else:
        action_dim = env.action_space.n

    ###################### logging ######################

    #### log files for multiple runs are NOT overwritten
    log_dir = "PPO_logs"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_dir = log_dir + '/' + env_name + '/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    #### get number of log files in log directory
    run_num = 0
    current_num_files = next(os.walk(log_dir))[2]
    run_num = len(current_num_files)

    #### create new log file for each run

    log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"

    print("current logging run number for " + env_name + " : ", run_num)
    print("logging at : " + log_f_name)
    #####################################################

    ################### checkpointing ###################
    log_dir = "PPO_preTrained"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_dir = log_dir + '/' + env_name + '/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    run_num_pretrained = 0  #### change this to prevent overwriting weights in same env_name folder

    current_num_files = next(os.walk(log_dir))[2]
    run_num_pretrained = len(current_num_files)

    directory = "PPO_preTrained"
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = directory + '/' + env_name + '/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
    print("save checkpoint path : " + checkpoint_path)
    #####################################################

    ############# print all hyperparameters #############
    print("--------------------------------------------------------------------------------------------")
    print("max training timesteps : ", max_training_timesteps)
    print("max timesteps per episode : ", max_ep_len)
    print("model saving frequency : " + str(save_model_freq) + " timesteps")
    print("log frequency : " + str(log_freq) + " timesteps")
    print("printing average reward over episodes in last : " + str(print_freq) + " timesteps")
    print("--------------------------------------------------------------------------------------------")
    print("state space dimension : ", state_dim)
    print("action space dimension : ", action_dim)
    print("--------------------------------------------------------------------------------------------")
    if has_continuous_action_space:
        print("Initializing a continuous action space policy")
        print("--------------------------------------------------------------------------------------------")
        print("starting std of action distribution : ", action_std)
        print("decay rate of std of action distribution : ", action_std_decay_rate)
        print("minimum std of action distribution : ", min_action_std)
        print("decay frequency of std of action distribution : " + str(action_std_decay_freq) + " timesteps")
    else:
        print("Initializing a discrete action space policy")
    print("--------------------------------------------------------------------------------------------")
    print("PPO update frequency : " + str(update_timestep) + " timesteps")
    print("PPO K epochs : ", K_epochs)
    print("PPO epsilon clip : ", eps_clip)
    print("discount factor (gamma) : ", gamma)
    print("--------------------------------------------------------------------------------------------")
    print("optimizer learning rate actor : ", lr_actor)
    print("optimizer learning rate critic : ", lr_critic)
    if random_seed:
        print("--------------------------------------------------------------------------------------------")
        print("setting random seed to ", random_seed)
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)
    #####################################################

    print("============================================================================================")

    ################# training procedure ################

    # initialize a PPO agent
    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space,
                    action_std)

    # track total training time
    start_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)

    print("============================================================================================")

    # logging file
    log_f = open(log_f_name, "w+")
    log_f.write('episode,timestep,reward\n')

    # printing and logging variables
    print_running_reward = 0
    print_running_episodes = 0

    log_running_reward = 0
    log_running_episodes = 0

    time_step = 0
    i_episode = 0

    # training loop
    while time_step <= max_training_timesteps:

        state = env.reset()
        current_ep_reward = 0

        for t in range(1, max_ep_len + 1):

            # select action with policy
            action = ppo_agent.select_action(state.float())
            # print(action)
            state, reward, done, _ = env.step(action)

            # saving reward and is_terminals
            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done)

            time_step += 1
            current_ep_reward += reward

            # update PPO agent
            if time_step % update_timestep == 0:
                ppo_agent.update()

            # if continuous action space; then decay action std of output action distribution
            if has_continuous_action_space and time_step % action_std_decay_freq == 0:
                ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)

            # log in logging file
            if time_step % log_freq == 0:
                # log average reward till last episode
                log_avg_reward = log_running_reward / log_running_episodes
                # log_avg_reward = round(log_avg_reward, 4)
                log_avg_reward = torch.round(log_avg_reward)

                log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                log_f.flush()

                log_running_reward = 0
                log_running_episodes = 0

            # printing average reward
            if time_step % print_freq == 0:
                # print average reward till last episode
                print_avg_reward = print_running_reward / print_running_episodes
                # print_avg_reward = round(print_avg_reward, 2)
                print_avg_reward = torch.round(print_avg_reward)

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step,
                                                                                        print_avg_reward))

                print_running_reward = 0
                print_running_episodes = 0

            # save model weights
            if time_step % save_model_freq == 0:
                print("--------------------------------------------------------------------------------------------")
                print("saving model at : " + checkpoint_path)
                ppo_agent.save(checkpoint_path)
                print("model saved")
                print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
                print("--------------------------------------------------------------------------------------------")

            # break; if the episode is over
            if done:
                break

        print_running_reward += current_ep_reward
        print_running_episodes += 1

        log_running_reward += current_ep_reward
        log_running_episodes += 1

        i_episode += 1

    log_f.close()
    env.close()

    # print total training time
    print("============================================================================================")
    end_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)
    print("Finished training at (GMT) : ", end_time)
    print("Total training time  : ", end_time - start_time)
    print("============================================================================================")


if __name__ == '__main__':
    train()

I changed this into a grid-space pursuit problem, but it still does not converge. Could someone help me figure out why the reward does not improve?
I want the blue dot to catch the red dot.

[figure: trajectory plot, blue chaser vs. red escaper]


This is the reward curve; it never converges.

[figure: reward curve over training]

22 answers

  • 专家-赤兔 [online] · quality creator in the full-stack domain · 2024-03-24 11:06

    Quoted from the free WeChat mini-program: 皆我百晓生

    Regarding the non-convergence of the PPO algorithm on your 2D pursuit problem: we first need to analyze the setup and identify the likely causes. From the code you provided, the following points may explain the poor or non-converging training:

    1. Reward design: the reward method of the Environment class returns the negative sum of absolute coordinate differences (a negative Manhattan distance). This can work, but it may not guide the agent strongly enough toward the target. In a pursuit problem you usually want the reward to increase as the chaser closes in on the escaper and decrease as it falls behind. You could try a reward based on the Euclidean distance between the two, making sure to cast to float before calling torch.norm (the positions are integer tensors):
    def reward(self):
        # reward grows (toward zero) as the chaser approaches the escaper
        reward = -torch.norm((self.chase - self.escape).float())
        return reward
    
    2. Action discretization: the action space is currently discrete, with only five fixed move vectors to choose from (and, as flagged in the code comment, the space is declared Discrete(4), so the fifth move, -x, is never sampled). This limits the chaser's flexibility and can make it hard to home in on the escaper precisely. Consider a finer discretization or a continuous action space, and make sure the size of the action space matches the moves you define; see the sketch after this list.

    3. Update frequency and training steps: with your settings the policy is only updated every 4 x max_ep_len timesteps. For a low-complexity problem that may be enough, but for a pursuit task with some difficulty it can be too sparse. Try lowering update_timestep to update more often.

    4. Exploration vs. exploitation: the PPO agent does not appear to use any explicit exploration mechanism for the discrete action space (for example an entropy bonus or epsilon-greedy). Adding some exploration helps the policy learn the environment and usually improves convergence; a generic sketch is given after the summary below.

    5. State normalization: the comments mention normalizing the inputs, but the implementation never does it; the observations are raw grid coordinates in [1, 60]. Normalizing the observed state helps the optimization; see the sketch after this list.

    6. Environment complexity: the current environment is very simple, as each action is just added to the current position. A real pursuit problem involves physical dynamics and nonlinear effects, so a more realistic motion model may eventually be needed.

    In summary, start with the points above: adjust the reward function, exploration, and update frequency step by step and watch how the training results change. Also double-check the environment itself to make sure it reasonably reflects the problem you want to solve. If you need further help, please share the training curves and logs after tuning so the analysis can be more targeted.
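
    On point 4, the internals of the PPODemo.PPO class are not shown here, so the following is only a generic sketch of how an entropy bonus is usually added to the clipped PPO objective for a discrete (Categorical) policy; all tensor and argument names are placeholders to adapt to the buffers in your PPO implementation:

    import torch
    from torch.distributions import Categorical

    def ppo_loss(logits, old_logprobs, actions, advantages, returns, values,
                 eps_clip=0.2, value_coef=0.5, entropy_coef=0.01):
        dist = Categorical(logits=logits)      # current policy over the discrete actions
        logprobs = dist.log_prob(actions)
        ratios = torch.exp(logprobs - old_logprobs)

        # clipped surrogate objective
        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
        policy_loss = -torch.min(surr1, surr2).mean()

        value_loss = torch.nn.functional.mse_loss(values, returns)
        entropy_bonus = dist.entropy().mean()  # higher entropy = more exploration

        # subtracting the entropy term rewards more exploratory policies
        return policy_loss + value_coef * value_loss - entropy_coef * entropy_bonus

    If your PPO class already computes an entropy term in its update step (as many reference implementations do), increasing its coefficient is the simplest way to get the same effect.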

