¥YRQ¥ 2024-03-24 11:06

PPO does not converge on a discretized 2D pursuit problem


import os
import glob
import time
from datetime import datetime

import torch
import numpy as np

import gym

from PPODemo import PPO

from gym import spaces

from OrbitalTransfer import *
# modified to use normalized inputs

import random

from OrbitCore import *

# Build a simple environment to check whether the training code converges
class Environment(gym.Env):
    def __init__(self):
        self.min_action = -1
        self.max_action = 1
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(4,),
                                            dtype=float)  # chaser (x, y) and escaper (x, y)
        # discrete action space; note that chooseAction defines five moves (0-4),
        # while Discrete(4) only ever samples actions 0-3, so the -x move is never chosen
        self.action_space = spaces.Discrete(4)
        self.chase = None  # chaser position
        self.escape = None  # escaper position
        self.time = 1

    def reset(self):

        self._initialize_positions()  # random integer positions on the grid
        observation = self._get_observation()
        return observation

    def _initialize_positions(self):  # initialize the spacecraft positions
        # seed the random number generator with the current timestamp
        current_time = int(time.time())
        random.seed(current_time)
        self.chase = torch.tensor([random.randint(1, 60), random.randint(1, 60)])
        self.escape = torch.tensor([random.randint(1, 60), random.randint(1, 60)])

    def step(self, action):  # takes a discrete action (tensor) and returns observation, reward, done flag
        action = self.chooseAction(action)  # map the discrete action to a move vector

        self.chase = self.update_state(self.chase, action)
        action = torch.tensor([0, 0])  # the escaper does not move
        self.escape = self.update_state(self.escape, action)
        print("escape is", self.escape, "chase is ", self.chase)

        observation = self._get_observation()
        reward = self.reward()
        done = self.check_termination()
        return observation, reward, done, None

    def chooseAction(self, action):
        if (action == 0):
            output = torch.tensor([1, 0])
        elif (action == 1):
            output = torch.tensor([0, 1])
        elif (action == 2):
            output = torch.tensor([0, 0])
        elif (action == 3):
            output = torch.tensor([0, -1])
        elif (action == 4):
            output = torch.tensor([-1, 0])
        return output



    def update_state(self, state, action):  # apply the move to the given position and return it
        state = state + action
        return state

    def _get_observation(self):
        # concatenates the two positions into a (1, 4) tensor (note: observation_space is declared as shape (4,))
        observation = torch.cat((self.chase.unsqueeze(0), self.escape.unsqueeze(0)), dim=1)
        return observation

    def check_termination(self):
        terminate = False
        if self.escape[0] == self.chase[0] and self.escape[1] == self.chase[1]:
            terminate = True

        return terminate

    # the two methods below are leftovers from the orbital version and are not used by this grid environment
    # (inverseTrans is not defined in this class)
    def distance(self, state1, state2):
        Core = OrbitCore()
        r1, v1 = Core.Orbit_Element_2_State_rv(state1)  # modified to support tensors; state is six orbital elements
        r2, v2 = Core.Orbit_Element_2_State_rv(state2)
        distance = torch.norm(r1 - r2)
        return distance

    def distance_input01(self, chase, escape):
        coe_chase = self.inverseTrans(chase)
        coe_escape = self.inverseTrans(escape)
        reward = self.distance(coe_chase, coe_escape)
        return reward

    def reward(self):
        # negative Manhattan distance between chaser and escaper
        reward = -1 * (torch.abs(self.chase[0] - self.escape[0]) + torch.abs(self.chase[1] - self.escape[1]))
        return reward


################################### Training ###################################
def train():
    # Predict = OrbitPredict()
    print("============================================================================================")

    ####### initialize environment hyperparameters ######
    env_name = "train-orbit-discrete"

    has_continuous_action_space = False  # continuous action space; else discrete

    max_ep_len = 100  # max timesteps in one episode
    max_training_timesteps = int(3e6)  # break training loop if timesteps > max_training_timesteps

    print_freq = max_ep_len * 10  # print avg reward in the interval (in num timesteps)
    log_freq = max_ep_len * 2  # log avg reward in the interval (in num timesteps)
    save_model_freq = int(1e5)  # save model frequency (in num timesteps)

    action_std = 0.6  # starting std for action distribution (Multivariate Normal)
    action_std_decay_rate = 0.05  # linearly decay action_std (action_std = action_std - action_std_decay_rate)
    min_action_std = 0.1  # minimum action_std (stop decay after action_std <= min_action_std)
    action_std_decay_freq = int(2.5e5)  # action_std decay frequency (in num timesteps)
    #####################################################

    ## Note : print/log frequencies should be greater than max_ep_len

    ################ PPO hyperparameters ################
    update_timestep = max_ep_len * 4  # update policy every n timesteps
    K_epochs = 80  # update policy for K epochs in one PPO update

    eps_clip = 0.2  # clip parameter for PPO
    gamma = 0.99  # discount factor

    lr_actor = 0.0001  # learning rate for actor network
    lr_critic = 0.0005  # learning rate for critic network

    random_seed = 0  # set random seed if required (0 = no random seed)
    #####################################################

    print("training environment name : " + env_name)

    # env = gym.make(env_name)

    env = Environment()

    # state space dimension
    state_dim = env.observation_space.shape[0]

    # action space dimension
    if has_continuous_action_space:
        action_dim = env.action_space.shape[0]
    else:
        action_dim = env.action_space.n

    ###################### logging ######################

    #### log files for multiple runs are NOT overwritten
    log_dir = "PPO_logs"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_dir = log_dir + '/' + env_name + '/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    #### get number of log files in log directory
    run_num = 0
    current_num_files = next(os.walk(log_dir))[2]
    run_num = len(current_num_files)

    #### create new log file for each run

    log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"

    print("current logging run number for " + env_name + " : ", run_num)
    print("logging at : " + log_f_name)
    #####################################################

    ################### checkpointing ###################
    log_dir = "PPO_preTrained"
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_dir = log_dir + '/' + env_name + '/'
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    run_num_pretrained = 0  #### change this to prevent overwriting weights in same env_name folder

    current_num_files = next(os.walk(log_dir))[2]
    run_num_pretrained = len(current_num_files)

    directory = "PPO_preTrained"
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = directory + '/' + env_name + '/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
    print("save checkpoint path : " + checkpoint_path)
    #####################################################

    ############# print all hyperparameters #############
    print("--------------------------------------------------------------------------------------------")
    print("max training timesteps : ", max_training_timesteps)
    print("max timesteps per episode : ", max_ep_len)
    print("model saving frequency : " + str(save_model_freq) + " timesteps")
    print("log frequency : " + str(log_freq) + " timesteps")
    print("printing average reward over episodes in last : " + str(print_freq) + " timesteps")
    print("--------------------------------------------------------------------------------------------")
    print("state space dimension : ", state_dim)
    print("action space dimension : ", action_dim)
    print("--------------------------------------------------------------------------------------------")
    if has_continuous_action_space:
        print("Initializing a continuous action space policy")
        print("--------------------------------------------------------------------------------------------")
        print("starting std of action distribution : ", action_std)
        print("decay rate of std of action distribution : ", action_std_decay_rate)
        print("minimum std of action distribution : ", min_action_std)
        print("decay frequency of std of action distribution : " + str(action_std_decay_freq) + " timesteps")
    else:
        print("Initializing a discrete action space policy")
    print("--------------------------------------------------------------------------------------------")
    print("PPO update frequency : " + str(update_timestep) + " timesteps")
    print("PPO K epochs : ", K_epochs)
    print("PPO epsilon clip : ", eps_clip)
    print("discount factor (gamma) : ", gamma)
    print("--------------------------------------------------------------------------------------------")
    print("optimizer learning rate actor : ", lr_actor)
    print("optimizer learning rate critic : ", lr_critic)
    if random_seed:
        print("--------------------------------------------------------------------------------------------")
        print("setting random seed to ", random_seed)
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)
    #####################################################

    print("============================================================================================")

    ################# training procedure ################

    # initialize a PPO agent
    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space,
                    action_std)

    # track total training time
    start_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)

    print("============================================================================================")

    # logging file
    log_f = open(log_f_name, "w+")
    log_f.write('episode,timestep,reward\n')

    # printing and logging variables
    print_running_reward = 0
    print_running_episodes = 0

    log_running_reward = 0
    log_running_episodes = 0

    time_step = 0
    i_episode = 0

    # training loop
    while time_step <= max_training_timesteps:

        state = env.reset()
        current_ep_reward = 0

        for t in range(1, max_ep_len + 1):

            # select action with policy
            action = ppo_agent.select_action(state.float())
            # print(action)
            state, reward, done, _ = env.step(action)

            # saving reward and is_terminals
            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done)

            time_step += 1
            current_ep_reward += reward

            # update PPO agent
            if time_step % update_timestep == 0:
                ppo_agent.update()

            # if continuous action space; then decay action std of output action distribution
            if has_continuous_action_space and time_step % action_std_decay_freq == 0:
                ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)

            # log in logging file
            if time_step % log_freq == 0:
                # log average reward till last episode
                log_avg_reward = log_running_reward / log_running_episodes
                # log_avg_reward = round(log_avg_reward, 4)
                log_avg_reward = torch.round(log_avg_reward)

                log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                log_f.flush()

                log_running_reward = 0
                log_running_episodes = 0

            # printing average reward
            if time_step % print_freq == 0:
                # print average reward till last episode
                print_avg_reward = print_running_reward / print_running_episodes
                # print_avg_reward = round(print_avg_reward, 2)
                print_avg_reward = torch.round(print_avg_reward)

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step,
                                                                                        print_avg_reward))

                print_running_reward = 0
                print_running_episodes = 0

            # save model weights
            if time_step % save_model_freq == 0:
                print("--------------------------------------------------------------------------------------------")
                print("saving model at : " + checkpoint_path)
                ppo_agent.save(checkpoint_path)
                print("model saved")
                print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
                print("--------------------------------------------------------------------------------------------")

            # break; if the episode is over
            if done:
                break

        print_running_reward += current_ep_reward
        print_running_episodes += 1

        log_running_reward += current_ep_reward
        log_running_episodes += 1

        i_episode += 1

    log_f.close()
    env.close()

    # print total training time
    print("============================================================================================")
    end_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)
    print("Finished training at (GMT) : ", end_time)
    print("Total training time  : ", end_time - start_time)
    print("============================================================================================")


if __name__ == '__main__':
    train()

I changed this into a grid-space pursuit problem, but it still does not converge. Could someone help me figure out why the reward does not improve?
I want the blue dot to catch the red dot.

[figure: trajectory plot, blue chaser vs. red escaper]


This is the reward curve; it never converges.

[figure: reward curve over training]

22 answers

  • 专家-赤兔 [online] · quality creator in the full-stack domain · 2024-03-24 11:06

    Quoted from the free WeChat mini-program: 皆我百晓生

    Regarding the non-convergence of the PPO algorithm on your 2D pursuit problem: we first need to analyze the setup and identify the likely causes. From the code you provided, the following points may explain the poor or non-converging training:

    1. Reward design: the reward method of the Environment class returns the negative sum of absolute coordinate differences (a negative Manhattan distance). This can work, but it may not guide the agent strongly enough toward the target. In a pursuit problem you usually want the reward to increase as the chaser closes in on the escaper and decrease as it falls behind. You could try a reward based on the Euclidean distance between the two, making sure to cast to float before calling torch.norm (the positions are integer tensors):
    def reward(self):
        # reward grows (toward zero) as the chaser approaches the escaper
        reward = -torch.norm((self.chase - self.escape).float())
        return reward
    
    2. Action discretization: the action space is currently discrete, with only five fixed move vectors to choose from (and, as flagged in the code comment, the space is declared Discrete(4), so the fifth move, -x, is never sampled). This limits the chaser's flexibility and can make it hard to home in on the escaper precisely. Consider a finer discretization or a continuous action space, and make sure the size of the action space matches the moves you define; see the sketch after this list.

    3. Update frequency and training steps: with your settings the policy is only updated every 4 x max_ep_len timesteps. For a low-complexity problem that may be enough, but for a pursuit task with some difficulty it can be too sparse. Try lowering update_timestep to update more often.

    4. Exploration vs. exploitation: the PPO agent does not appear to use any explicit exploration mechanism for the discrete action space (for example an entropy bonus or epsilon-greedy). Adding some exploration helps the policy learn the environment and usually improves convergence; a generic sketch is given after the summary below.

    5. State normalization: the comments mention normalizing the inputs, but the implementation never does it; the observations are raw grid coordinates in [1, 60]. Normalizing the observed state helps the optimization; see the sketch after this list.

    6. Environment complexity: the current environment is very simple, as each action is just added to the current position. A real pursuit problem involves physical dynamics and nonlinear effects, so a more realistic motion model may eventually be needed.

    In summary, start with the points above: adjust the reward function, exploration, and update frequency step by step and watch how the training results change. Also double-check the environment itself to make sure it reasonably reflects the problem you want to solve. If you need further help, please share the training curves and logs after tuning so the analysis can be more targeted.
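
    On point 4, the internals of the PPODemo.PPO class are not shown here, so the following is only a generic sketch of how an entropy bonus is usually added to the clipped PPO objective for a discrete (Categorical) policy; all tensor and argument names are placeholders to adapt to the buffers in your PPO implementation:

    import torch
    from torch.distributions import Categorical

    def ppo_loss(logits, old_logprobs, actions, advantages, returns, values,
                 eps_clip=0.2, value_coef=0.5, entropy_coef=0.01):
        dist = Categorical(logits=logits)      # current policy over the discrete actions
        logprobs = dist.log_prob(actions)
        ratios = torch.exp(logprobs - old_logprobs)

        # clipped surrogate objective
        surr1 = ratios * advantages
        surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
        policy_loss = -torch.min(surr1, surr2).mean()

        value_loss = torch.nn.functional.mse_loss(values, returns)
        entropy_bonus = dist.entropy().mean()  # higher entropy = more exploration

        # subtracting the entropy term rewards more exploratory policies
        return policy_loss + value_coef * value_loss - entropy_coef * entropy_bonus

    If your PPO class already computes an entropy term in its update step (as many reference implementations do), increasing its coefficient is the simplest way to get the same effect.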

