最近看莫凡的 强化学习 课程,DQN部分莫凡使用的是tensorflow1.x 书写。我想着自己用tensorflow2.x写一遍,但发现训练很久都没有收敛。
有没有能帮忙分析下原因的,非常感谢
```python
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
class DeepQNetwork:
def __init__(self, n_actions, n_features, learning_rate=0.01,
reward_decay=0.9, e_greedy=0.9, replace_target_iter=300,
memory_size=500, batch_size=64, e_greedy_increment=None):
self.n_actions = n_actions
self.n_features = n_features
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.epsilon_increment = e_greedy_increment
if e_greedy_increment is None:
self.epsilon = self.epsilon_max
else:
self.epsilon = 0
self.learn_step_counter = 0
self.memory = np.zeros((self.memory_size, n_features*2+3))
self.eval_net = self._build_net('eval_net')
self.eval_net.compile(loss=keras.losses.MeanAbsoluteError(), optimizer=keras.optimizers.Adam(learning_rate=0.1), metrics=[keras.metrics.MeanAbsoluteError()])
self.target_net = self._build_net('target_net')
self.target_net.set_weights(self.eval_net.get_weights())
self.early_stopping=keras.callbacks.EarlyStopping(monitor='loss', patience=10)
def _build_net(self, name):
net = keras.Sequential(name=name)
net.add(keras.layers.Dense(32, activation='relu'))
net.add(keras.layers.Dense(16, activation='relu'))
net.add(keras.layers.Dense(self.n_actions))
net.build((None, self.n_features))
checkpoint = tf.train.Checkpoint(model=net)
checkpoint.restore(tf.train.latest_checkpoint('./save'))
return net
def store_transition(self, s, a, r, done, s_):
if not hasattr(self, 'memory_counter'):
self.memory_counter = 0
transition = np.hstack((s, [a, r, int(done)], s_))
index = self.memory_counter % self.memory_size
self.memory[index, :] = transition
self.memory_counter += 1
def choose_action(self, observation, train=True):
observation = observation[np.newaxis, :]
if train == False or np.random.uniform() < self.epsilon:
actions_values = self.eval_net(observation)
action = np.argmax(actions_values)
else:
action = np.random.choice(self.n_actions)
return action
def learn(self):
if self.learn_step_counter % self.replace_target_iter == 0:
self.target_net.set_weights(self.eval_net.get_weights())
if self.memory_counter > self.memory_size:
sample_index = np.random.choice(self.memory_size, size=self.batch_size)
else:
sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
batch_memory = self.memory[sample_index, :]
q_eval = self.eval_net(batch_memory[:, :self.n_features])
q_next = self.target_net(batch_memory[:, -self.n_features:])
q_target = np.array(q_eval)
for i in range(self.batch_size):
action, reward, done = batch_memory[i, self.n_features], batch_memory[i, self.n_features+1], bool(batch_memory[i, self.n_features+2])
action = int(action)
max_q = np.max(q_next[i, :])
if done:
q_target[i, action] = q_eval[i, action] + self.lr * (reward - q_eval[i, action])
else:
q_target[i, action] = q_eval[i, action] + self.lr * (reward + self.gamma * max_q - q_eval[i, action])
if hasattr(self, 'debug') == False:
self.debug = True
print('q_eval : \n', q_eval)
print('q_target : \n', q_target)
self.eval_net.fit(batch_memory[:, :self.n_features], q_target, batch_size=self.batch_size, epochs=10, verbose=0)
if self.epsilon_increment is not None and self.epsilon < self.epsilon_max:
print('increase e-greedy : %f\n' % self.epsilon)
self.epsilon += self.epsilon_increment
# print('e-greedy : %d\n' % self.epsilon)
self.learn_step_counter += 1
def save(self):
checkpoint = tf.train.Checkpoint(model=self.eval_net)
manager = tf.train.CheckpointManager(checkpoint, directory='./save', max_to_keep=1)
path = manager.save(checkpoint_number=0)
print('model save to %s \n' % path)
```