taxi_q_agent_template.py
import gym
import gym.spaces
import matplotlib.pyplot as plt
import numpy as np
from qlearning_template import QLearningAgent
from sarsa import SarsaAgent
from cliff_walking import CliffWalkingEnv
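
# NOTE: This template follows the classic gym (< 0.26) API, where env.reset()
# returns the initial state and env.step(action) returns (next_state, reward,
# done, info). Newer gym / gymnasium releases return extra values and would
# need small adjustments below.
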
def play_and_train(env, agent, t_max=10 ** 4):
    """Run one full episode (at most t_max steps) with actions given by the agent,
    train the agent whenever possible, and return the total reward.
    """
    s = env.reset()
    total_reward = 0.0

    for i in range(t_max):
        # Choose an action, apply it, and observe the outcome.
        a = agent.get_action(s)
        new_s, r, is_done, _ = env.step(a)
        total_reward += r

        # Train the agent on the observed transition.
        agent.update(s, a, new_s, r)

        if is_done:
            break

        # Decay agent epsilon
        # agent.epsilon = ?
        s = new_s

    return total_reward
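

# The imported QLearningAgent / SarsaAgent classes are not shown in this file.
# play_and_train only assumes they expose get_action(state),
# update(state, action, next_state, reward), and an `epsilon` attribute.
# A minimal tabular Q-learning sketch of that assumed interface (illustration
# only -- not the template's actual implementation):
class SketchQLearningAgent:
    def __init__(self, alpha, epsilon, discount, get_legal_actions):
        self.alpha = alpha            # learning rate
        self.epsilon = epsilon        # exploration probability
        self.discount = discount      # reward discount factor
        self.get_legal_actions = get_legal_actions
        self._qvalues = {}            # (state, action) -> Q-value

    def get_qvalue(self, state, action):
        return self._qvalues.get((state, action), 0.0)

    def get_action(self, state):
        # Epsilon-greedy action selection.
        actions = list(self.get_legal_actions(state))
        if np.random.random() < self.epsilon:
            return np.random.choice(actions)
        return max(actions, key=lambda a: self.get_qvalue(state, a))

    def update(self, state, action, next_state, reward):
        # Q(s,a) <- (1 - alpha) * Q(s,a) + alpha * (r + gamma * max_a' Q(s',a'))
        best_next = max(self.get_qvalue(next_state, a)
                        for a in self.get_legal_actions(next_state))
        target = reward + self.discount * best_next
        self._qvalues[(state, action)] = (
            (1 - self.alpha) * self.get_qvalue(state, action) + self.alpha * target
        )
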

def getActionRange(state):
    # All actions are legal in every state; relies on the global `env`
    # created in the __main__ block below.
    return range(env.nA)

if __name__ == '__main__':
    max_iterations = 100000000000
    visualize = True

    # Create the environment. To use the Taxi task instead, swap in:
    # env = gym.make('Taxi-v2')
    env = CliffWalkingEnv()
    env.reset()
    env.render()

    n_states = env.nS
    n_actions = env.nA
    print('Number of states = %i, number of actions = %i' % (n_states, n_actions))

    # Create a Q-learning agent and a SARSA agent with identical hyperparameters.
    alpha = 0.5
    epsilon = 0.2
    discount = 0.99
    # getActionRange above serves as get_legal_actions; an equivalent local
    # definition would be: get_legal_actions = lambda s: range(n_actions)
    agent = QLearningAgent(alpha, epsilon, discount, getActionRange)
    agent2 = SarsaAgent(alpha, epsilon, discount, getActionRange)

    plt.figure(figsize=[10, 4])
    rewards1 = []  # per-episode returns for the Q-learning agent
    rewards2 = []  # per-episode returns for the SARSA agent

    # Training loop: one episode per iteration for each agent.
    for i in range(max_iterations):
        rewards1.append(play_and_train(env, agent))
        rewards2.append(play_and_train(env, agent2))

        # Decay exploration every 100 episodes.
        if (i + 1) % 100 == 0:
            agent.epsilon = max(agent.epsilon * 0.99, 0.00001)
            agent2.epsilon = max(agent2.epsilon * 0.99, 0.00001)
            # agent.alpha = max(agent.alpha * 0.99, 0.00001)
            # agent2.alpha = max(agent2.alpha * 0.99, 0.00001)

        if i % 100 == 0:
            print('Iteration {}, Q-learning avg reward {:.2f}, SARSA avg reward {:.2f}, Epsilon {:.3f}'.format(
                i, np.mean(rewards1[-100:]), np.mean(rewards2[-100:]), agent.epsilon))

            if visualize:
                # Left panel: per-episode returns; right panel: their distribution.
                plt.subplot(1, 2, 1)
                plt.plot(rewards1, color='r', label='Q-learning')
                plt.plot(rewards2, color='b', label='SARSA')
                plt.xlabel('Iterations')
                plt.ylabel('Total Reward')
                plt.legend()

                plt.subplot(1, 2, 2)
                plt.hist(rewards1, bins=20, range=[-700, +20], color='red', alpha=0.5, label='Q-learning')
                plt.hist(rewards2, bins=20, range=[-700, +20], color='blue', alpha=0.5, label='SARSA')
                plt.xlabel('Reward')
                plt.ylabel('p(Reward)')
                plt.legend()

                plt.draw()
                plt.pause(0.05)
                # Clear the whole figure so the next redraw starts clean.
                plt.clf()
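
    # Reached only if the loop above completes all max_iterations episodes;
    # keep the final figure on screen in that case.
    plt.show()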