fellatai  2024-07-25 12:09

A big problem with the LSPI algorithm

import sys
import time
import random
import numpy as np
import gymnasium as gym
from frozenlake import *

def lspi(D, pi0, model, argmaxuQ, gamma, p):
    dtheta_converge = p.get("dtheta_converge", 1e-9)
    iter_max = p.get("iter_max", 100)
    online_printing = p.get("online_printing", True)

    Y = D["Y"]
    U = D["U"]
    R = D["R"]
    Yn = D["Yn"]
    dimY = Y.shape[0]
    N = Y.shape[1]
    dimU = U.shape[0]

    Un = np.zeros((dimU, N))
    pi = pi0

    for n_iter in range(iter_max):
        for n in range(N):
            Un[:, n] = pi(Yn[:, n])

        model = lstd0q(Y, U, R.T, Yn, Un, model, gamma, [])
        def Q(y, u): return -predict_linear(np.concatenate((y, u), axis=0), model)
        pi = argmaxuQ(model["w"])

        if online_printing:
            Qp = np.empty(N)
            Qpn = np.empty(N)
            for n in range(N):
                Qp[n] = Q(Y[:, n], U[:, n])[0]
                Qpn[n] = Q(Yn[:, n], Un[:, n])[0]
            rmstde = get_rmstde(Qp, Qpn, R, gamma)[0]
            print(f"Iteration = {n_iter}, RMSTDE = {rmstde:.6f}, Qp = {np.mean(Qp):.6f} +/- {np.std(Qp):.6f}")

        if n_iter > 0:
            dtheta = np.linalg.norm(np.ravel(model["w"]) - np.ravel(modelp["w"]))
            if dtheta < dtheta_converge:
                print('Minimum parameter change threshold reached.')
                break
        modelp = {"w": np.copy(model["w"])}  # copy the weights; lstd0q mutates `model` in place

    if n_iter == iter_max - 1 and online_printing:
        print('Max. number of iterations reached.')
    return pi, Q, model, n_iter

def lstd0q(Y, U, R, Yn, Un, model, gamma, p):
    Z = np.concatenate((Y, U), axis=0)
    Zn = np.concatenate((Yn, Un), axis=0)
    model = lstd0(Z, R, Zn, model, gamma, p)
    return model

def lstd0(Y, R, Yn, model, gamma, p):
    N = Y.shape[1]
    dimPhi = model['phi'](Y[:, 0]).shape[0]

    Phi = np.empty((N, dimPhi))
    Phin = np.empty((N, dimPhi))
    for n in range(N):
        Phi[n, :] = model['phi'](Y[:, n]).ravel()
        Phin[n, :] = model['phi'](Yn[:, n]).ravel()

    model['w'] = wlstd0(Phi, Phin, R, gamma)
    return model

def wlstd0(Phi, Phin, R, gamma):
    A = np.dot(Phi.T, Phi - gamma * Phin) + 0.01 * np.eye(Phi.shape[1])
    b = np.dot(Phi.T, R)

    lambda_identity = 0.0001 * np.eye(Phi.shape[1])
    A += lambda_identity
    theta, _, _, _ = np.linalg.lstsq(A, b, rcond=None)
    return theta

def epsilon_greedy_policy(state, Q, epsilon=0.1):
    if random.uniform(0, 1) < epsilon:
        return [environment.action_space.sample()]  # randomly select an action
    else:
        return [np.argmax(Q(state))]

def evaluate_policy(environment, Q, K, epsilon=0.1):
    y = []
    u = []
    l = []

    state, info = environment.reset()
    done = False
    for i in range(K):
        action = epsilon_greedy_policy(state, Q, epsilon)
        next_state, cost, done, truncated, info = environment.step(action[0])

        y.append(state)
        u.append(action)
        l.append(cost)
        state = next_state
        if done or truncated:
            break
    y = np.array(y)  # Convert the list of states to a numpy array
    l = np.array(l)  # Convert the list of costs to a numpy array
    u = np.array(u)  # Convert the list of actions to a numpy array
    return y, u, l

def evaluate_lspi(environment):
    """Evaluate the LSPI algorithm on the given environment and data."""
    p = dict()
    gamma = 0.9  # Discount factor, adjust according to your needs
    p["iter_max"] = 100  # Maximum number of iterations
    p["dtheta_converge"] = 0.01  # Parameter convergence threshold

    # Function to find the action that maximizes Q-value given theta
    def fnArgmaxuQ(_theta):
        return argmaxuQ_f_basis_frozenlake(_theta, environment.unwrapped)

    # Basis function for feature representation
    def phi(z):
        return f_basis_frozenlake(z[0], z[1], environment.unwrapped)

    # Initialize the model
    model = dict()
    model["phi"] = phi
    model["w"] = np.zeros(environment.unwrapped.observation_space.n * environment.unwrapped.action_space.n)

    # Initial policy that always selects action 0
    def pi0(x):
        return [0]

    # Generate training data
    data = {"Y": [], "U": [], "R": [], "Yn": []}
    num_episodes = 1000  # Number of episodes to generate data
    for _ in range(num_episodes):
        state, info = environment.reset()
        done = False
        truncated = False
        while not (done or truncated):
            action = environment.action_space.sample()  # Sample a random action
            next_state, reward, done, truncated, info = environment.step(action)  # Step in the environment
            data["Y"].append(state)  # Append current state to data
            data["U"].append(action)  # Append action to data
            data["R"].append(reward)  # Append reward to data
            data["Yn"].append(next_state)  # Append next state to data
            state = next_state  # Update state to next state

    for key in data:
        data[key] = np.array(data[key]).reshape(1, -1)  # Convert lists to numpy arrays

    D = data  # Training data

    # Train LSPI to get the learned policy
    pi, Q, model, n_iter = lspi(D, pi0, model, fnArgmaxuQ, gamma, p)

    # Evaluate the learned policy
    episodes = 100  # Number of evaluation episodes
    total_costs = []
    for _ in range(episodes):
        y, u, l = evaluate_policy(environment, pi, 10)  # Evaluate for a maximum of 10 steps per episode
        total_costs.append(sum(l))  # Append the total cost (sum of rewards) for this episode

    avg_cost = np.mean(total_costs)  # Calculate average cost
    std_cost = np.std(total_costs)  # Calculate standard deviation of cost

    # Print evaluation results for debugging
    print(f"Average Cost: {avg_cost}, Standard Deviation of Cost: {std_cost}")

    # Return the learned policy and the number of iterations
    return pi, n_iter

def main():
    random.seed(1)
    np.random.seed(1)

    environment = gym.make('FrozenLake-v1', desc=["SFFF", "FFFF", "FFFF", "HHHG"], map_name="4x4", is_slippery=False,
                           render_mode="human")

    policy, n_iter = evaluate_lspi(environment)

    # Evaluate the learned policy with a maximum of 5 steps
    y, u, l = evaluate_policy(environment, policy, 5)
    print("u =", u)
    print("y =", y)
    print("l =", l)
    print('Cost (evaluated on plant) = ' + str(sum(l)))

if __name__ == "__main__":
    main()


My question is: why does the elf (the agent) keep moving forever? I have clearly set things up so that it should stop once it reaches the goal and the program should end.


  • 阿里嘎多学长 2024-07-25 12:09

    The following content was generated jointly by AIGC and 阿里嘎多学长; hope it is useful and that you will accept it:


    Judging from the code and problem description you provided, the elf may keep moving for several reasons:

    1. The environment state is not updated correctly: if state is not updated to next_state after each loop iteration, the elf can get stuck in an endless loop.

    2. The done flag is not handled correctly: if done is not set to True when the elf reaches the goal state, the loop may never terminate.

    3. The policy-evaluation function is implemented incorrectly: if epsilon_greedy_policy does not select actions according to the current policy, the elf may never take the actions needed to reach the goal.

    4. Problems in the LSPI implementation: if the LSPI algorithm itself is implemented incorrectly, the learned policy may not be optimal and therefore cannot guide the elf to the goal.

    To resolve this, the following checks and modifications can be made:

    1. Check the environment's done and truncated flags

    Make sure that after the call to environment.step(action[0]) the done and truncated flags correctly reflect whether the elf has reached the goal or the episode should otherwise stop.
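
    As a quick sanity check, a minimal rollout loop written against gymnasium's five-value step API could look like the sketch below (a self-contained example assuming the standard FrozenLake-v1 environment; what the question's code calls done corresponds to the terminated flag):

    import gymnasium as gym

    env = gym.make("FrozenLake-v1", is_slippery=False)
    state, info = env.reset()
    while True:
        action = env.action_space.sample()  # random action, just to exercise the flags
        state, reward, terminated, truncated, info = env.step(action)
        if terminated or truncated:  # goal or hole reached, or the time limit hit
            break
    env.close()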

    2. Check the evaluate_policy function

    Make sure that the loop inside evaluate_policy terminates as soon as done or truncated is True.

    3. Check the epsilon_greedy_policy function

    Make sure that epsilon_greedy_policy selects the correct action according to the current policy.
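
    One possible fix, sketched here under the assumption that the learned policy already returns a one-element list containing the chosen action (as pi does elsewhere in the question's code), is to pass the environment in explicitly instead of relying on a global, and to call the policy directly rather than applying np.argmax to its output:

    import random

    def epsilon_greedy_policy(environment, state, policy, epsilon=0.1):
        # Explore with probability epsilon, otherwise follow the learned policy.
        if random.uniform(0, 1) < epsilon:
            return [environment.action_space.sample()]  # random exploratory action
        return policy(state)  # the policy already returns a one-element action list

    The callers in evaluate_policy and main would then need to pass the environment and the policy through accordingly.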

    4. Check the LSPI implementation

    Make sure the LSPI algorithm is implemented correctly and can actually learn a good policy.
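
    For reference, the weight update in LSTD-Q boils down to solving a small regularized linear system; a self-contained sketch of the same idea as the wlstd0 function above (assuming Phi and Phin are N x dimPhi feature matrices and R is an N x 1 reward column) is:

    import numpy as np

    def lstdq_weights(Phi, Phin, R, gamma, lam=1e-4):
        # Solve A w = b with A = Phi^T (Phi - gamma * Phin) + lam * I and b = Phi^T R.
        A = Phi.T @ (Phi - gamma * Phin) + lam * np.eye(Phi.shape[1])
        b = Phi.T @ R
        return np.linalg.solve(A, b)

    If the weights barely change between iterations but the greedy policy still never reaches the goal, the problem is more likely in the feature map phi or in the collected data than in this solve.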

    Example code modification

    Below is one possible modification of the evaluate_policy function that ensures the loop terminates correctly once the elf reaches the goal:

    def evaluate_policy(environment, Q, K, epsilon=0.1):
        y = []
        u = []
        l = []
        state, info = environment.reset()
        for i in range(K):  # limit the maximum number of steps
            action = epsilon_greedy_policy(state, Q, epsilon)
            next_state, cost, done, truncated, info = environment.step(action[0])

            # Record the transition first, so the final (goal) reward is not lost
            y.append(state)
            u.append(action)
            l.append(cost)
            state = next_state

            # Stop as soon as the goal is reached or the episode is truncated
            if done or truncated:
                break

        y = np.array(y)
        l = np.array(l)
        u = np.array(u)
        return y, u, l
    

    Assumed data and example input/output

    Suppose we have a simple FrozenLake environment in which the elf starts at (0, 0) and the goal is at (3, 3). The example input is a random policy, and the example output is the sequence of states, actions, and costs the elf goes through until it reaches the goal or hits the maximum number of steps.

    • Example input: a random policy (a random action is chosen at each step)
    • Example output: the elf's state sequence, action sequence, and cost sequence

    For example, if the elf starts at (0, 0) and the goal is at (3, 3), the output might be (see also the sketch after this list):

    • State sequence: [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (3, 2), (3, 3)]
    • Action sequence: [0, 1, 1, 1, 1, 1, 1] (assuming action 0 moves right and action 1 moves down)
    • Cost sequence: [0, -1, -1, -1, -1, -1, 0] (assuming each move costs -1 and reaching the goal costs 0)
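
    Note that FrozenLake-v1 actually returns a single integer observation rather than a (row, col) pair; a minimal sketch of the mapping assumed by the coordinates above (row-major layout with 4 columns) is:

    def state_to_coords(s, ncols=4):
        # FrozenLake encodes a cell as s = row * ncols + col.
        return (s // ncols, s % ncols)  # e.g. s = 15 -> (3, 3), the goal cell on a 4x4 map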

    Please note that these modifications and examples are for reference only; you may need to adapt them to your specific environment and problem. If you need further help, please provide more information, such as the concrete implementation of the environment or the specific issue in your LSPI implementation.

