James_deut
2021-05-20 22:05
采纳率: 20%
浏览 73

强化学习,python

目前在用python做强化学习,刚入门,有个问题不是很明白。   

QLearning的代码是:

import numpy as np
import math

class QLearning(object):
    def __init__(self, state_dim: object, action_dim: object, cfg: object) -> object:
        self.action_dim = action_dim  # dimension of acgtion
        self.lr = cfg.lr  # learning rate
        self.gamma = cfg.gamma  
        self.epsilon = 0 
        self.sample_count = 0  
        self.epsilon_start = cfg.epsilon_start
        self.epsilon_end = cfg.epsilon_end
        self.epsilon_decay = cfg.epsilon_decay
        self.Q_table = np.zeros((state_dim, action_dim)) # Q表
        
    def choose_action(self, state):
        self.sample_count += 1
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.sample_count / self.epsilon_decay)
        if np.random.uniform(0, 1) > self.epsilon:  # 随机选取0-1之间的值,如果大于epsilon就按照贪心策略选取action,否则随机选取
            action = self.predict(state)
        else:
            action = np.random.choice(self.action_dim)  #有一定概率随机探索选取一个动作
        return action

    def predict(self, state):
        '''根据输入观测值,采样输出的动作值,带探索,测试模型时使用
        '''
        Q_list = self.Q_table[state, :]
        Q_max = np.max(Q_list)
        action_list = np.where(Q_list == Q_max)[0]  
        action = np.random.choice(action_list) # Q_max可能对应多个 action ,可以随机抽取一个
        return action
            
    def update(self, state, action, reward, next_state, done):
        Q_predict = self.Q_table[state, action]
        if done:
            Q_target = reward  # 没有下一个状态了
        else:
            Q_target = reward + self.gamma * np.max(
                self.Q_table[next_state, :])  # Q_table-learning
        self.Q_table[state, action] += self.lr * (Q_target - Q_predict)  # 修正q
    def save(self,path):
        np.save(path+"Q_table.npy", self.Q_table)
    def load(self, path):
        self.Q_table = np.load(path+"Q_table.npy")


做测试的代码是:

import gym
from QLearning.agent1 import QLearning
from envs.gridworld_env import CliffWalkingWapper

env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
env = CliffWalkingWapper(env)
agent = QLearning(
    obs_dim=env.observation_space.n,
    action_dim=env.action_space.n,
    learning_rate = cfg.lr,
    gamma = cfg.gamma,
    epsilon_start = cfg.epsilon_start, epsilon_end = cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay)
render = False  # 是否打开GUI画面
rewards = []  # 记录所有episode的reward
MA_rewards = []  # 记录滑动平均的reward
steps = []  # 记录所有episode的steps
for i_episode in range(1, cfg.max_episodes + 1):
    ep_reward = 0  # 记录每个episode的reward
    ep_steps = 0  # 记录每个episode走了多少step
    obs = env.reset()  # 重置环境, 重新开一局(即开始新的一个episode)
    while True:
        action = agent.sample(obs)  # 根据算法选择一个动作
        next_obs, reward, done, _ = env.step(action)  # 与环境进行一个交互
        # 训练 Q-learning算法
        agent.learn(obs, action, reward, next_obs, done)  # 不需要下一步的action
        obs = next_obs  # 存储上一个观察值
        ep_reward += reward
        ep_steps += 1  # 计算step数
        if render:
            env.render()  # 渲染新的一帧图形
        if done:
            break
    steps.append(ep_steps)
    rewards.append(ep_reward)
    # 计算滑动平均的reward
    if i_episode == 1:
        MA_rewards.append(ep_reward)
    else:
        MA_rewards.append(
            0.9 * MA_rewards[-1] + 0.1 * ep_reward)
    print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' % (i_episode, ep_steps,
                                                                      ep_reward, agent.epsilon))
agent.save()  # 训练结束,保存模型

我用的是pycharm, 为什么cfg.是报错呢(下划线为红色)? 我始终想不明白

 

  • 写回答
  • 关注问题
  • 收藏
  • 邀请回答

1条回答 默认 最新

  • 皮皮宽 2021-05-20 22:46
    已采纳

    我寻思你也没定义cfg啊。。第一个代码段的那个cfg只是形参。cfg包含哪些东西,值都是多少,你这里都没有定义。

    一般是自己写一个配置文件,然后python读取进来,或者自己定义一个cfg。

    已采纳该答案
    打赏 评论

相关推荐 更多相似问题