
Commit 2436323

fix bug in deep reinforcement learning
1 parent 478f32b commit 2436323

File tree

2 files changed: +16 −6 lines
  • source/_static/code


source/_static/code/en/model/rl/rl.py

+8-3
@@ -54,16 +54,21 @@ def predict(self, inputs):
                 action = action[0]
             next_state, reward, done, info = env.step(action)  # Let the environment execute the action; get the next state, the reward for the action, whether the game is over, and extra information.
             reward = -10. if done else reward  # Give a large negative reward if the game is over.
-            replay_buffer.append((state, action, reward, next_state, done))  # Put the (state, action, reward, next_state) quad into the experience replay pool.
+            replay_buffer.append((state, action, reward, next_state, 1 if done else 0))  # Put the (state, action, reward, next_state) quad (plus a 0/1 done flag) into the experience replay pool.
             state = next_state

             if done:  # Exit this loop and start the next episode if the game is over.
                 print("episode %d, epsilon %f, score %d" % (episode_id, epsilon, t))
                 break

             if len(replay_buffer) >= batch_size:
-                batch_state, batch_action, batch_reward, batch_next_state, batch_done = \
-                    [np.array(a, dtype=np.float32) for a in zip(*random.sample(replay_buffer, batch_size))]  # Randomly sample a batch of quads from the experience replay pool.
+                # Randomly sample a batch of quads from the experience replay pool and convert each field to a NumPy array.
+                batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
+                    *random.sample(replay_buffer, batch_size))
+                batch_state, batch_reward, batch_next_state, batch_done = \
+                    [np.array(a, dtype=np.float32) for a in [batch_state, batch_reward, batch_next_state, batch_done]]
+                batch_action = np.array(batch_action, dtype=np.int32)
+
                 q_value = model(tf.constant(batch_next_state, dtype=tf.float32))
                 y = batch_reward + (gamma * tf.reduce_max(q_value, axis=1)) * (1 - batch_done)  # Compute y according to the method in the paper.
                 with tf.GradientTape() as tape:
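
Why the dtype split matters: the target a few lines below, y = batch_reward + gamma * max_a Q(next_state, a) * (1 - batch_done), needs batch_done as a numeric 0/1 array so it can be converted to float32, while the sampled actions have to stay integers for the training step. Below is a minimal, self-contained sketch of the fixed sampling path; the toy replay buffer, the stand-in model, and the constants (batch_size, gamma, num_actions, state_dim) are placeholders for illustration, not the values defined in rl.py.

import random
from collections import deque

import numpy as np
import tensorflow as tf

# Toy stand-ins (hypothetical values; rl.py defines its own hyperparameters and QNetwork).
batch_size, gamma, num_actions, state_dim = 4, 0.99, 2, 4
model = tf.keras.Sequential([tf.keras.layers.Dense(num_actions)])

# Fill a replay buffer with dummy transitions shaped like the ones stored in the diff:
# (state, action, reward, next_state, done), with done stored as 1 or 0 rather than a bool.
replay_buffer = deque(maxlen=10000)
for _ in range(16):
    state = np.random.rand(state_dim).astype(np.float32)
    next_state = np.random.rand(state_dim).astype(np.float32)
    done = random.random() < 0.1
    replay_buffer.append((state, random.randrange(num_actions), 1.0, next_state, 1 if done else 0))

# Sample a batch and convert each field separately, mirroring the fixed lines above:
# float32 for states, rewards and done flags; int32 for the discrete actions.
batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
    *random.sample(replay_buffer, batch_size))
batch_state, batch_reward, batch_next_state, batch_done = \
    [np.array(a, dtype=np.float32) for a in [batch_state, batch_reward, batch_next_state, batch_done]]
batch_action = np.array(batch_action, dtype=np.int32)

# (1 - batch_done) zeroes the bootstrap term for terminal transitions, which is why
# done has to end up as a numeric 0./1. array.
q_value = model(tf.constant(batch_next_state, dtype=tf.float32))
y = batch_reward + (gamma * tf.reduce_max(q_value, axis=1)) * (1 - batch_done)
print(y.shape)  # (batch_size,)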

source/_static/code/zh/model/rl/rl.py

+8-3
@@ -54,16 +54,21 @@ def predict(self, inputs):
                 action = action[0]
             next_state, reward, done, info = env.step(action)  # Let the environment execute the action; get the next state after the action, the action's reward, whether the game has ended, and extra information.
             reward = -10. if done else reward  # Give a large negative reward if the game is over.
-            replay_buffer.append((state, action, reward, next_state, done))  # Put the (state, action, reward, next_state) quad (plus a done flag marking whether the episode has ended) into the experience replay pool.
+            replay_buffer.append((state, action, reward, next_state, 1 if done else 0))  # Put the (state, action, reward, next_state) quad (plus a done flag marking whether the episode has ended) into the experience replay pool.
             state = next_state

             if done:  # Exit this loop and move on to the next episode if the game is over.
                 print("episode %d, epsilon %f, score %d" % (episode_id, epsilon, t))
                 break

             if len(replay_buffer) >= batch_size:
-                batch_state, batch_action, batch_reward, batch_next_state, batch_done = \
-                    [np.array(a, dtype=np.float32) for a in zip(*random.sample(replay_buffer, batch_size))]  # Randomly sample a batch of quads from the experience replay pool.
+                # Randomly sample a batch of quads from the experience replay pool and convert each field to a NumPy array.
+                batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
+                    *random.sample(replay_buffer, batch_size))
+                batch_state, batch_reward, batch_next_state, batch_done = \
+                    [np.array(a, dtype=np.float32) for a in [batch_state, batch_reward, batch_next_state, batch_done]]
+                batch_action = np.array(batch_action, dtype=np.int32)
+
                 q_value = model(tf.constant(batch_next_state, dtype=tf.float32))
                 y = batch_reward + (gamma * tf.reduce_max(q_value, axis=1)) * (1 - batch_done)  # Compute the y value as in the paper.
                 with tf.GradientTape() as tape:
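
The reason batch_action is pulled out and kept as int32 shows up in the training step that follows this hunk (not part of the diff): the sampled actions are typically turned into a one-hot mask to select each transition's predicted Q-value, and tf.one_hot only accepts integer indices, so converting the actions to float32 along with the other fields would fail there. A hypothetical sketch of that kind of loss, with y, q_value, and the batch contents as made-up placeholders rather than the exact code in rl.py:

import numpy as np
import tensorflow as tf

batch_size, num_actions = 4, 2
batch_action = np.array([0, 1, 1, 0], dtype=np.int32)      # int32, as produced by the fix
y = tf.constant([1.5, 0.7, -10.0, 1.2], dtype=tf.float32)  # targets from the previous step
q_value = tf.random.uniform((batch_size, num_actions))     # stand-in for model(batch_state)

# tf.one_hot requires integer indices; this is what breaks if the actions are
# converted to float32 together with the other fields.
predicted = tf.reduce_sum(q_value * tf.one_hot(batch_action, depth=num_actions), axis=1)
loss = tf.keras.losses.mean_squared_error(y_true=y, y_pred=predicted)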

0 commit comments
