
Commit 2436323

fix bug in deep reinforcement learning
1 parent 478f32b commit 2436323

File tree

2 files changed: +16 −6 lines
  • source/_static/code


source/_static/code/en/model/rl/rl.py

+8-3
@@ -54,16 +54,21 @@ def predict(self, inputs):
                 action = action[0]
             next_state, reward, done, info = env.step(action)  # Let the environment execute the action; get the next state, the reward for the action, whether the game is over, and extra information.
             reward = -10. if done else reward  # Give a large negative reward if the game is over.
-            replay_buffer.append((state, action, reward, next_state, done))  # Put the (state, action, reward, next_state) quad into the experience replay pool.
+            replay_buffer.append((state, action, reward, next_state, 1 if done else 0))  # Put the (state, action, reward, next_state) quad (plus a 0/1 done flag) into the experience replay pool.
             state = next_state

             if done:  # Exit this loop and start the next episode if the game is over.
                 print("episode %d, epsilon %f, score %d" % (episode_id, epsilon, t))
                 break

             if len(replay_buffer) >= batch_size:
-                batch_state, batch_action, batch_reward, batch_next_state, batch_done = \
-                    [np.array(a, dtype=np.float32) for a in zip(*random.sample(replay_buffer, batch_size))]  # Randomly sample a batch of quads from the experience replay pool.
+                # Randomly sample a batch of quads from the experience replay pool and convert each field to a NumPy array.
+                batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
+                    *random.sample(replay_buffer, batch_size))
+                batch_state, batch_reward, batch_next_state, batch_done = \
+                    [np.array(a, dtype=np.float32) for a in [batch_state, batch_reward, batch_next_state, batch_done]]
+                batch_action = np.array(batch_action, dtype=np.int32)
+
                 q_value = model(tf.constant(batch_next_state, dtype=tf.float32))
                 y = batch_reward + (gamma * tf.reduce_max(q_value, axis=1)) * (1 - batch_done)  # Compute y according to the method in the paper.
                 with tf.GradientTape() as tape:
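
Why the dtype split matters: the target a few lines below, y = batch_reward + gamma * max_a Q(next_state, a) * (1 - batch_done), needs batch_done as a numeric 0/1 array so it can be converted to float32, while the sampled actions have to stay integers for the training step. Below is a minimal, self-contained sketch of the fixed sampling path; the toy replay buffer, the stand-in model, and the constants (batch_size, gamma, num_actions, state_dim) are placeholders for illustration, not the values defined in rl.py.

import random
from collections import deque

import numpy as np
import tensorflow as tf

# Toy stand-ins (hypothetical values; rl.py defines its own hyperparameters and QNetwork).
batch_size, gamma, num_actions, state_dim = 4, 0.99, 2, 4
model = tf.keras.Sequential([tf.keras.layers.Dense(num_actions)])

# Fill a replay buffer with dummy transitions shaped like the ones stored in the diff:
# (state, action, reward, next_state, done), with done stored as 1 or 0 rather than a bool.
replay_buffer = deque(maxlen=10000)
for _ in range(16):
    state = np.random.rand(state_dim).astype(np.float32)
    next_state = np.random.rand(state_dim).astype(np.float32)
    done = random.random() < 0.1
    replay_buffer.append((state, random.randrange(num_actions), 1.0, next_state, 1 if done else 0))

# Sample a batch and convert each field separately, mirroring the fixed lines above:
# float32 for states, rewards and done flags; int32 for the discrete actions.
batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
    *random.sample(replay_buffer, batch_size))
batch_state, batch_reward, batch_next_state, batch_done = \
    [np.array(a, dtype=np.float32) for a in [batch_state, batch_reward, batch_next_state, batch_done]]
batch_action = np.array(batch_action, dtype=np.int32)

# (1 - batch_done) zeroes the bootstrap term for terminal transitions, which is why
# done has to end up as a numeric 0./1. array.
q_value = model(tf.constant(batch_next_state, dtype=tf.float32))
y = batch_reward + (gamma * tf.reduce_max(q_value, axis=1)) * (1 - batch_done)
print(y.shape)  # (batch_size,)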

source/_static/code/zh/model/rl/rl.py

+8-3
@@ -54,16 +54,21 @@ def predict(self, inputs):
                 action = action[0]
             next_state, reward, done, info = env.step(action)  # Let the environment execute the action; get the next state after the action, the action's reward, whether the game has ended, and extra information.
             reward = -10. if done else reward  # Give a large negative reward if the game is over.
-            replay_buffer.append((state, action, reward, next_state, done))  # Put the (state, action, reward, next_state) quad (plus a done flag marking whether the episode has ended) into the experience replay pool.
+            replay_buffer.append((state, action, reward, next_state, 1 if done else 0))  # Put the (state, action, reward, next_state) quad (plus a done flag marking whether the episode has ended) into the experience replay pool.
             state = next_state

             if done:  # Exit this loop and move on to the next episode if the game is over.
                 print("episode %d, epsilon %f, score %d" % (episode_id, epsilon, t))
                 break

             if len(replay_buffer) >= batch_size:
-                batch_state, batch_action, batch_reward, batch_next_state, batch_done = \
-                    [np.array(a, dtype=np.float32) for a in zip(*random.sample(replay_buffer, batch_size))]  # Randomly sample a batch of quads from the experience replay pool.
+                # Randomly sample a batch of quads from the experience replay pool and convert each field to a NumPy array.
+                batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
+                    *random.sample(replay_buffer, batch_size))
+                batch_state, batch_reward, batch_next_state, batch_done = \
+                    [np.array(a, dtype=np.float32) for a in [batch_state, batch_reward, batch_next_state, batch_done]]
+                batch_action = np.array(batch_action, dtype=np.int32)
+
                 q_value = model(tf.constant(batch_next_state, dtype=tf.float32))
                 y = batch_reward + (gamma * tf.reduce_max(q_value, axis=1)) * (1 - batch_done)  # Compute the y value as in the paper.
                 with tf.GradientTape() as tape:
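
The reason batch_action is pulled out and kept as int32 shows up in the training step that follows this hunk (not part of the diff): the sampled actions are typically turned into a one-hot mask to select each transition's predicted Q-value, and tf.one_hot only accepts integer indices, so converting the actions to float32 along with the other fields would fail there. A hypothetical sketch of that kind of loss, with y, q_value, and the batch contents as made-up placeholders rather than the exact code in rl.py:

import numpy as np
import tensorflow as tf

batch_size, num_actions = 4, 2
batch_action = np.array([0, 1, 1, 0], dtype=np.int32)      # int32, as produced by the fix
y = tf.constant([1.5, 0.7, -10.0, 1.2], dtype=tf.float32)  # targets from the previous step
q_value = tf.random.uniform((batch_size, num_actions))     # stand-in for model(batch_state)

# tf.one_hot requires integer indices; this is what breaks if the actions are
# converted to float32 together with the other fields.
predicted = tf.reduce_sum(q_value * tf.one_hot(batch_action, depth=num_actions), axis=1)
loss = tf.keras.losses.mean_squared_error(y_true=y, y_pred=predicted)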

0 commit comments
