caohaogyl 2022-04-09 21:11

tensorflow 2.x DQN does not converge

I've recently been following 莫凡 (Mofan)'s reinforcement learning course. The DQN part of the course is written in tensorflow 1.x, so I tried rewriting it myself in tensorflow 2.x, but training runs for a long time without ever converging.
Could anyone help me analyze the cause? Thanks a lot.


```python

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras

class DeepQNetwork:
    def __init__(self, n_actions, n_features, learning_rate=0.01,
                 reward_decay=0.9, e_greedy=0.9, replace_target_iter=300,
                 memory_size=500, batch_size=64, e_greedy_increment=None):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        if e_greedy_increment is None:
            self.epsilon = self.epsilon_max
        else:
            self.epsilon = 0

        self.learn_step_counter = 0

        # each row stores one transition laid out as [s, a, r, done, s_]
        self.memory = np.zeros((self.memory_size, n_features*2+3))
        self.eval_net = self._build_net('eval_net')
        # note: the optimizer here uses a hard-coded learning_rate=0.1 (not self.lr)
        # and the loss is mean absolute error
        self.eval_net.compile(loss=keras.losses.MeanAbsoluteError(),
                              optimizer=keras.optimizers.Adam(learning_rate=0.1),
                              metrics=[keras.metrics.MeanAbsoluteError()])
        self.target_net = self._build_net('target_net')
        self.target_net.set_weights(self.eval_net.get_weights())
        # early-stopping callback (defined here, but never passed to fit() in learn())
        self.early_stopping = keras.callbacks.EarlyStopping(monitor='loss', patience=10)

    def _build_net(self, name):
        net = keras.Sequential(name=name)
        net.add(keras.layers.Dense(32, activation='relu'))
        net.add(keras.layers.Dense(16, activation='relu'))
        net.add(keras.layers.Dense(self.n_actions))
        net.build((None, self.n_features))
        # restore the latest checkpoint from ./save (if one exists) into the new net
        checkpoint = tf.train.Checkpoint(model=net)
        checkpoint.restore(tf.train.latest_checkpoint('./save'))
        return net
    
    def store_transition(self, s, a, r, done, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((s, [a, r, int(done)], s_))
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation, train=True):
        observation = observation[np.newaxis, :]
        # act greedily when evaluating, or with probability epsilon during training
        if not train or np.random.uniform() < self.epsilon:
            actions_values = self.eval_net(observation)
            action = np.argmax(actions_values)
        else:
            action = np.random.choice(self.n_actions)
        return action

    def learn(self):
        # periodically copy the eval net's weights into the target net
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.target_net.set_weights(self.eval_net.get_weights())
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]
        
        # Q(s, .) from the eval net and Q(s', .) from the target net
        q_eval = self.eval_net(batch_memory[:, :self.n_features])
        q_next = self.target_net(batch_memory[:, -self.n_features:])

        # build the target: copy q_eval, then overwrite only the taken action's value
        # with a soft update of the form q + self.lr * (r [+ gamma * max_q'] - q)
        q_target = np.array(q_eval)
        for i in range(self.batch_size):
            action, reward, done = batch_memory[i, self.n_features], batch_memory[i, self.n_features+1], bool(batch_memory[i, self.n_features+2])
            action = int(action)
            max_q = np.max(q_next[i, :])
            if done:
                q_target[i, action] = q_eval[i, action] + self.lr * (reward - q_eval[i, action])
            else:
                q_target[i, action] = q_eval[i, action] + self.lr * (reward + self.gamma * max_q - q_eval[i, action])
        # print one sample of q_eval / q_target the first time learn() runs
        if not hasattr(self, 'debug'):
            self.debug = True
            print('q_eval : \n', q_eval)
            print('q_target : \n', q_target)
        # fit 10 epochs on the same sampled batch every time learn() is called
        self.eval_net.fit(batch_memory[:, :self.n_features], q_target,
                          batch_size=self.batch_size, epochs=10, verbose=0)

        if self.epsilon_increment is not None and self.epsilon < self.epsilon_max:
            print('increase e-greedy : %f\n' % self.epsilon)
            self.epsilon += self.epsilon_increment
        # print('e-greedy : %d\n' % self.epsilon)
        self.learn_step_counter += 1

    def save(self):
        checkpoint = tf.train.Checkpoint(model=self.eval_net)
        manager = tf.train.CheckpointManager(checkpoint, directory='./save', max_to_keep=1)
        path = manager.save(checkpoint_number=0)
        print('model save to %s \n' % path)

```
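For reference, most DQN write-ups build the regression target as r + gamma * max_a' Q_target(s', a') (just r at a terminal state) and leave the step size entirely to the optimizer, i.e. without the extra `self.lr` factor that the loop in `learn()` above folds into the target. Below is a minimal, vectorized sketch of that standard target under the same memory layout as `store_transition()`; the function name `build_q_target` and its signature are illustrative, not part of the original code.

```python
import numpy as np

def build_q_target(q_eval, q_next, batch_memory, n_features, gamma):
    """Plain Bellman target: r + gamma * max_a' Q_target(s', a'), or just r if done."""
    q_target = np.array(q_eval)                          # copy, so non-taken actions keep their predictions
    actions = batch_memory[:, n_features].astype(int)    # memory layout: [s, a, r, done, s_]
    rewards = batch_memory[:, n_features + 1]
    dones   = batch_memory[:, n_features + 2].astype(bool)
    max_q_next = np.max(np.array(q_next), axis=1)
    # drop the bootstrap term at terminal states
    q_target[np.arange(len(actions)), actions] = rewards + gamma * max_q_next * (~dones)
    return q_target
```

And a hypothetical driver loop (none of this is in the original post), assuming the pre-0.26 `gym` API and `CartPole-v1`, just to show how the class would typically be exercised:

```python
import gym

env = gym.make('CartPole-v1')
agent = DeepQNetwork(n_actions=env.action_space.n,
                     n_features=env.observation_space.shape[0],
                     e_greedy_increment=0.001)

total_steps = 0
for episode in range(300):
    observation = env.reset()                 # old gym API: reset() returns only the observation
    while True:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, done, observation_)
        if total_steps > 200:                 # start learning once some memory has accumulated
            agent.learn()
        observation = observation_
        total_steps += 1
        if done:
            break
agent.save()
```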


0 answers


    Question events

    • Closed by the system on Apr 17
    • Question created on Apr 9
