import gym
import gym.spaces
import gym.wrappers
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable

from tensorboardX import SummaryWriter
import datetime


def test_define_network(environment, net):
    s = environment.reset()
    assert tuple(net(Variable(torch.FloatTensor([s] * 3))).size()) == (3, n_actions), \
        'please make sure your model maps state s -> [Q(s,a0), ..., Q(s, a_last)]'

    assert isinstance(list(net.modules())[-1], nn.Linear), \
        'please make sure you predict q-values without nonlinearity (ignore if you know what you are doing)'
    assert isinstance(get_action(s), int), \
        'get_action(s) must return int, not %s. try int(action)' % (type(get_action(s)))

    print('Test #1: define_network() & get_action() functions: OK!')


def test_eps_greedy_strategy():
    # Test epsilon-greedy exploration
    for eps in [0., 0.1, 0.5, 1.0]:
        state_frequencies = np.bincount([get_action(s, epsilon=eps) for i in range(10000)], minlength=n_actions)
        best_action = state_frequencies.argmax()
        assert abs(state_frequencies[best_action] - 10000 * (1 - eps + eps / n_actions)) < 200
        for other_action in range(n_actions):
            if other_action != best_action:
                assert abs(state_frequencies[other_action] - 10000 * (eps / n_actions)) < 200
        print('eps=%.1f tests passed' % eps)
    print('Test #2: epsilon greedy exploration: OK!')


def test_td_loss(environment, net):
    s = environment.reset()
    a = environment.action_space.sample()
    next_s, r, done, _ = environment.step(a)  # use the passed-in environment, not the global env
    loss = compute_td_loss([s], [a], [r], [next_s], [done], check_shapes=False)
    loss.backward()

    # assert isinstance(loss, Variable) and tuple(loss.data.size()) == (1,), \
    #     'you must return scalar loss - mean over batch'
    assert np.any(next(net.parameters()).grad.data.numpy() != 0), \
        'loss must be differentiable w.r.t. network weights'

    print('Test #3: compute_td_loss() function: OK!')


def to_one_hot(y, n_dims=None):
    """ helper #1: take an integer vector (tensor or Variable) and convert it to a 1-hot matrix. """
    y_tensor = y.data if isinstance(y, Variable) else y
    y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)
    n_dims = n_dims if n_dims is not None else int(torch.max(y_tensor)) + 1
    y_one_hot = torch.zeros(y_tensor.size()[0], n_dims).scatter_(1, y_tensor, 1)
    return Variable(y_one_hot) if isinstance(y, Variable) else y_one_hot


def where(cond, x_1, x_2):
    """ helper #2: like np.where but in PyTorch. """
    return (cond * x_1) + ((1 - cond) * x_2)
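
# Illustrative behaviour of the helpers above (sanity-check examples, not used by the code):
#   to_one_hot(torch.LongTensor([0, 2]), 3)  -> [[1, 0, 0], [0, 0, 1]]
#   where(cond, x, y)                        -> x where cond == 1, y where cond == 0 (elementwise)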


# < YOUR CODE HERE >
def define_network(state_dim, n_actions):
    # The CartPole state is a flat vector, so a small fully-connected network is enough;
    # a Conv2d layer here would fail on [batch_size, state_dim] inputs.
    network = nn.Sequential()
    network.add_module('layer1', nn.Linear(state_dim[0], 40))
    network.add_module('layer2', nn.ReLU())
    network.add_module('layer3', nn.Linear(40, 40))
    network.add_module('layer4', nn.ReLU())
    network.add_module('layer5', nn.Linear(40, n_actions))
    return network
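
# For CartPole-v0 (state_dim == (4,), n_actions == 2) this yields an MLP 4 -> 40 -> 40 -> 2
# that outputs raw Q-values with no output nonlinearity, as test_define_network() expects.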


# < YOUR CODE HERE >
def get_action(state, epsilon=0):
    """
    sample actions with epsilon-greedy policy
    recap: with probability = epsilon pick random action, else pick action with highest Q(s,a)
    """
    state = Variable(torch.FloatTensor(state))
    q_values = network(state).data.numpy()

    # explore uniformly with probability epsilon, otherwise act greedily w.r.t. Q(s, a)
    if np.random.random() < epsilon:
        return int(env.action_space.sample())
    else:
        return int(np.argmax(q_values))
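
# TD loss computed below (one-step Q-learning target):
#     Q_target(s, a) = r(s, a) + gamma * max_a' Q(s', a'),  or just r(s, a) if s' is terminal
#     loss = mean over the batch of (Q(s, a) - Q_target(s, a))^2
# The target is detached, so gradients only flow through the predicted Q(s, a).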


# < YOUR CODE HERE >
def compute_td_loss(states, actions, rewards, next_states, is_done, gamma=0.99, check_shapes=False):
    """ Compute td loss using torch operations only."""
    states = Variable(torch.FloatTensor(states))            # shape: [batch_size, state_size]
    actions = Variable(torch.IntTensor(actions))            # shape: [batch_size]
    rewards = Variable(torch.FloatTensor(rewards))          # shape: [batch_size]
    next_states = Variable(torch.FloatTensor(next_states))  # shape: [batch_size, state_size]
    is_done = Variable(torch.FloatTensor(is_done))          # shape: [batch_size]

    # get q-values for all actions in current states
    predicted_qvalues = network(states)  # < YOUR CODE HERE >

    # select q-values for chosen actions
    predicted_qvalues_for_actions = torch.sum(predicted_qvalues.cpu() * to_one_hot(actions, n_actions), dim=1)

    # compute q-values for all actions in next states
    predicted_next_qvalues = network(next_states)  # < YOUR CODE HERE >

    # compute V*(next_states) using predicted next q-values
    next_state_values, _ = torch.max(predicted_next_qvalues, dim=1)  # < YOUR CODE HERE >

    assert isinstance(next_state_values.data, torch.FloatTensor)

    # compute 'target q-values' for loss
    target_qvalues_for_actions = rewards + gamma * next_state_values  # < YOUR CODE HERE >

    # at the last state we shall use simplified formula: Q(s,a) = r(s,a) since s' doesn't exist
    target_qvalues_for_actions = where(is_done, rewards, target_qvalues_for_actions).cpu()

    # Mean Squared Error loss to minimize
    loss = torch.mean((predicted_qvalues_for_actions - target_qvalues_for_actions.detach()) ** 2)

    if check_shapes:
        assert predicted_next_qvalues.data.dim() == 2, \
            'make sure you predicted q-values for all actions in next state'
        assert next_state_values.data.dim() == 1, \
            'make sure you computed V(s-prime) as maximum over just the actions axis and not all axes'
        assert target_qvalues_for_actions.data.dim() == 1, \
            'there is something wrong with target q-values, they must be a vector'

    return loss


def generate_session(t_max=1000, epsilon=0, train=False):
    """Play env with approximate q-learning agent and train it at the same time"""
    total_reward = 0
    s = env.reset()

    for t in range(t_max):
        # a = <get_action_a> from agent  # < YOUR CODE HERE >
        a = get_action(s, epsilon)
        next_s, r, done, _ = env.step(a)
        if train:
            opt.zero_grad()
            loss = compute_td_loss([s], [a], [r], [next_s], [done])
            loss.backward()
            opt.step()

        total_reward += r
        s = next_s
        if done:
            break

    return total_reward
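
# Note: each environment step triggers one gradient update on a single transition
# (online one-step Q-learning); there is no replay buffer or target network here.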


if __name__ == '__main__':
    dump_logs = True
    record_video = False
    env = gym.make("CartPole-v0").env
    s = env.reset()
    n_actions = env.action_space.n
    state_dim = env.observation_space.shape

    print('Actions number = %i, State example = %s' % (n_actions, s))
    print('State space upper bound: %s' % env.observation_space.high)
    print('State space lower bound: %s' % env.observation_space.low)

    # Complete define_network() & get_action() functions
    network = define_network(state_dim, n_actions)

    # test_define_network(env, network)
    # test_eps_greedy_strategy()

    # Complete compute_td_loss function
    test_td_loss(env, network)

    # Create Adam optimizer with lr=1e-4
    opt = torch.optim.Adam(network.parameters(), lr=1e-4)
    epsilon = 0.5
    max_epochs = 1000
    if dump_logs:
        log_path = './logs/{:%Y_%m_%d_%H_%M}'.format(datetime.datetime.now())
        writer = SummaryWriter(log_path)

    for i in range(max_epochs):
        session_rewards = [generate_session(epsilon=epsilon, train=True) for _ in range(100)]
        print('Epoch #{}\t Mean reward = {:.3f}\t Epsilon = {:.3f}'.format(i, np.mean(session_rewards), epsilon))
        if dump_logs:
            writer.add_scalar('Mean Reward', np.mean(session_rewards), i)

        # Code Epsilon decay <HERE>
        if epsilon > 0.1:
            epsilon *= 0.99
        else:
            epsilon = max(epsilon * 0.999, 1e-4)
        assert epsilon >= 1e-4, 'Make sure epsilon is always nonzero during training'

        if np.mean(session_rewards) > 300:
            print('You Win!')
            break

    if record_video:
        env = gym.wrappers.Monitor(gym.make('CartPole-v0').env, directory='videos', force=True)
        sessions = [generate_session(epsilon=0, train=False) for _ in range(100)]
    env.close()