if resume:
    # creates a generic neural network architecture
    model = Sequential()
    # hidden layer takes a pre-processed frame as input, and has 200 units
    model.add(Dense(units=200, input_dim=80*80, activation='relu', kernel_initializer='glorot_uniform'))
    # output layer: a single sigmoid unit giving the probability of moving the paddle up
    model.add(Dense(units=1, activation='sigmoid', kernel_initializer='RandomNormal'))
    # compile the model using traditional Machine Learning losses and optimizers
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # print model
    model.summary()
    if os.path.isfile('Basic_Rl_weights.h5'):
        # load pre-trained model weights
        print("loading previous weights")
        model.load_weights('Basic_Rl_weights.h5')
else:
    # creates a generic neural network architecture
    model = Sequential()
    # hidden layer takes a pre-processed frame as input, and has 200 units
    model.add(Dense(units=200, input_dim=80*80, activation='relu', kernel_initializer='glorot_uniform'))
    # output layer: a single sigmoid unit giving the probability of moving the paddle up
    model.add(Dense(units=1, activation='sigmoid', kernel_initializer='RandomNormal'))
    # compile the model using traditional Machine Learning losses and optimizers
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # print model
    model.summary()
    # save model
    # model.save_weights('my_model_weights.h5')
log_dir = './log' + datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
tb_callback = callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0,
                                    write_graph=True, write_images=True)
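# the metrics logged by the callback can be inspected by pointing TensorBoard at the
# directory above, e.g.: tensorboard --logdir=<log_dir>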
# gym initialization
env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None # used in computing the difference frame
running_reward = None
# initialization of variables used in the main loop
x_train, y_train, rewards = [],[],[]
reward_sum = 0
episode_number = 0
# main loop
while True:
    if render: env.render()
    # preprocess the observation, set input as difference between images
    cur_x = prepro(observation)
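    # prepro() is defined elsewhere in this post; it is assumed to crop, downsample and
    # binarize the raw Atari frame into a flat vector of 80*80 values, matching input_dim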
    # i = np.expand_dims(cur_x, axis=0)
    # print(i.shape)
    # print(cur_x.shape)
    if prev_x is not None:
        x = cur_x - prev_x
    else:
        x = np.zeros(Input_dim)
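        # on the first frame of an episode there is no previous frame,
        # so a zero difference frame is fed to the network instead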
    # print(x.shape)
    # print(np.expand_dims(cur_x, axis=0).shape)
    prev_x = cur_x
    # forward the policy network and sample action according to the proba distribution
    # two ways to calculate returned probability
    # print(x.shape)
    prob = model.predict(np.expand_dims(x, axis=1).T)
    # aprob = model.predict(np.expand_dims(x, axis=1).T)
    if np.random.uniform() < prob:
        action = action_up
    else:
        action = action_down
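    # sampling the action from the predicted probability (instead of always taking the
    # most likely one) keeps the agent exploring during training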
    # 0 and 1 labels (fake labels used so that the backpropagation algorithm can be applied)
    if action == action_up:
        y = 1
    else:
        y = 0
    # log the input and label to train later
    x_train.append(x)
    y_train.append(y)
    # do one step in our environment
    observation, reward, done, info = env.step(action)
    rewards.append(reward)
    reward_sum += reward
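    # in Pong the reward is +1 when the opponent misses the ball, -1 when the agent
    # misses it, and 0 for every other frame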
    # end of an episode
    if done:
        print('At the end of episode', episode_number, 'the total reward was:', reward_sum)
        # increment episode number
        episode_number += 1
        # training
        # history = LossHistory()
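        # weighting each frame's cross-entropy loss by its discounted return turns this
        # supervised fit() call into a REINFORCE-style policy-gradient update: actions
        # followed by good outcomes are made more likely, the others less likely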
        model.fit(x=np.vstack(x_train),
                  y=np.vstack(y_train),
                  verbose=1,
                  sample_weight=discount_rewards(rewards),
                  callbacks=[tb_callback])
        if episode_number % 100 == 0:
            model.save_weights('Basic_Rl_weights' + datetime.now().strftime("%Y%m%d-%H%M%S") + '.h5')
        # Log the reward
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
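        # running_reward is an exponential moving average of the per-episode reward,
        # which gives a smoother picture of progress than the raw episode score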
        # if episode_number % 10 == 0:
        tflog('running_reward', running_reward, custom_dir=log_dir)
        # Reinitialization
        x_train, y_train, rewards = [], [], []
        observation = env.reset()
        reward_sum = 0
        prev_x = None