
Commit cb38967: initial

0 parents, commit cb38967
12 files changed: +1900 -0 lines

README.md

+1
https://gitlab.com/donfaq/reinforcement-learning-classes

approx_cross_entropy.py

+151

import gym
import gym.spaces
import time
import numpy as np
from sklearn.neural_network import MLPClassifier
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)


def generate_session(agent, t_max=10**5):
    """Play one episode with the given agent and record the visited states, actions and total reward."""
    states, actions = [], []
    total_reward = 0

    s = env.reset()

    for t in range(t_max):
        # Choose an action from the policy: sample according to the predicted action probabilities.
        a = np.random.choice(n_actions, p=agent.predict_proba(s.reshape(1, -1))[0])

        # Perform action `a` to obtain new_state, reward, is_done.
        new_s, r, is_done, _ = env.step(a)

        # Record the state and action, and add the reward to the running total.
        states.append(s)
        actions.append(a)
        total_reward += r

        # Update s for the next iteration.
        s = new_s

        if is_done:
            break

    return states, actions, total_reward


def select_elites(states_batch, actions_batch, rewards_batch, percentile=50):
    """
    Select states and actions from games whose total reward is >= the given percentile.

    :param states_batch: list of lists of states, states_batch[session_i][t]
    :param actions_batch: list of lists of actions, actions_batch[session_i][t]
    :param rewards_batch: list of total rewards, rewards_batch[session_i]

    :returns: elite_states, elite_actions -- 1D lists of states and the respective actions
              from the elite sessions

    Elite states and actions are returned in their original order,
    i.e. sorted by session number and by timestep within a session.
    Do not assume that states are integers (they will be different later).
    """
    states_batch, actions_batch, rewards_batch = map(np.array, [states_batch, actions_batch, rewards_batch])

    # Compute the reward threshold for the given percentile.
    reward_threshold = np.percentile(rewards_batch, percentile)

    # Keep only the sessions whose total reward reaches the threshold.
    elite_states = states_batch[rewards_batch >= reward_threshold]
    elite_actions = actions_batch[rewards_batch >= reward_threshold]

    # Flatten the per-session lists into single 1D sequences.
    elite_states, elite_actions = map(np.concatenate, [elite_states, elite_actions])

    return elite_states, elite_actions


def rl_approx_cross_entropy(nn_agent):
    n_sessions = 100
    percentile = 70
    total_iterations = 100
    log = []

    for i in range(total_iterations):
        # Generate n_sessions for further analysis.
        sessions = [generate_session(nn_agent) for _ in range(n_sessions)]
        states_batch, actions_batch, rewards_batch = map(np.array, zip(*sessions))

        # Select elite states & actions.
        elite_states, elite_actions = select_elites(states_batch, actions_batch, rewards_batch, percentile)

        # Update the policy: fit the classifier on the elite (state, action) pairs.
        nn_agent.fit(elite_states, elite_actions)

        # Info for debugging.
        mean_reward = np.mean(rewards_batch)
        threshold = np.percentile(rewards_batch, percentile)
        log.append([mean_reward, threshold])

        print('Iteration = %i, Mean Reward = %.3f, Threshold = %.3f' % (i, mean_reward, threshold))

        if mean_reward > 195:
            print('You Win! :)')
            break


def test_rl_approx_cross_entropy(nn_agent):
    s = env.reset()
    total_reward = 0
    for t in range(1000):
        # Choose the greedy action from nn_agent.
        a = nn_agent.predict(s.reshape(1, -1))[0]

        # Perform action `a` to obtain new_state, reward, is_done.
        new_s, r, is_done, _ = env.step(a)

        if is_done:
            break

        env.render()
        time.sleep(0.07)
        total_reward += r

        # Update s for the next iteration.
        s = new_s

    print('Reward of Test agent = %.3f' % total_reward)


if __name__ == '__main__':
    # Create the 'CartPole-v0' environment.
    env = gym.make('CartPole-v0')
    s = env.reset()

    # Number of actions in this environment.
    n_actions = env.action_space.n

    print('Actions number = %i' % n_actions)

    # Create a neural network with 2 hidden layers of 10 neurons each and tanh activations,
    # using MLPClassifier from scikit-learn.
    agent = MLPClassifier(hidden_layer_sizes=(10, 10), activation='tanh')

    # Initialize the agent with one dummy fit call on the state dimension and all actions,
    # so that predict_proba knows every action class.
    agent.fit([s] * n_actions, range(n_actions))

    # Train the neural network with the approximate cross-entropy method.
    rl_approx_cross_entropy(agent)

    # Test the trained agent and see how it performs.
    test_rl_approx_cross_entropy(agent)

    # Close the environment when everything is done.
    env.close()
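
A quick way to sanity-check select_elites (a sketch, not part of the commit; it assumes the function is importable from approx_cross_entropy.py): with two toy sessions and percentile=50, the threshold is 5.5, so only the higher-reward session survives and its states and actions come back flattened in order. Toy integer states are used here purely for readability; real states are CartPole observation vectors.

# Hypothetical sanity check, not part of the commit.
from approx_cross_entropy import select_elites

states_batch = [[0, 1], [2, 3]]    # two sessions, two timesteps each (toy integer states)
actions_batch = [[1, 0], [0, 1]]
rewards_batch = [1, 10]            # one total reward per session

elite_states, elite_actions = select_elites(states_batch, actions_batch, rewards_batch, percentile=50)
print(elite_states)   # [2 3]  -- only the reward-10 session passes the 5.5 threshold
print(elite_actions)  # [0 1]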

approx_q_learning_template.py

+213

import gym
import gym.spaces
import gym.wrappers
import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable

from tensorboardX import SummaryWriter
import datetime


def test_define_network(environment, net):
    s = environment.reset()
    assert tuple(net(Variable(torch.FloatTensor([s] * 3))).size()) == (3, n_actions), \
        'please make sure your model maps state s -> [Q(s,a0), ..., Q(s, a_last)]'
    assert isinstance(list(net.modules())[-1], nn.Linear), \
        'please make sure you predict q-values without nonlinearity (ignore if you know what you are doing)'
    assert isinstance(get_action(s), int), \
        'get_action(s) must return int, not %s. try int(action)' % type(get_action(s))
    print('Test #1: define_network() & get_action() functions: OK!')


def test_eps_greedy_strategy():
    # Test epsilon-greedy exploration.
    for eps in [0., 0.1, 0.5, 1.0]:
        state_frequencies = np.bincount([get_action(s, epsilon=eps) for i in range(10000)], minlength=n_actions)
        best_action = state_frequencies.argmax()
        assert abs(state_frequencies[best_action] - 10000 * (1 - eps + eps / n_actions)) < 200
        for other_action in range(n_actions):
            if other_action != best_action:
                assert abs(state_frequencies[other_action] - 10000 * (eps / n_actions)) < 200
        print('eps=%.1f tests passed' % eps)
    print('Test #2: epsilon greedy exploration: OK!')


def test_td_loss(environment, net):
    s = environment.reset()
    a = environment.action_space.sample()
    next_s, r, done, _ = environment.step(a)
    loss = compute_td_loss([s], [a], [r], [next_s], [done], check_shapes=False)
    loss.backward()

    # assert isinstance(loss, Variable) and tuple(loss.data.size()) == (1,), \
    #     'you must return scalar loss - mean over batch'
    assert np.any(next(net.parameters()).grad.data.numpy() != 0), \
        'loss must be differentiable w.r.t. network weights'

    print('Test #3: compute_td_loss() function: OK!')


def to_one_hot(y, n_dims=None):
    """Helper #1: take an integer vector (tensor or Variable) and convert it to a one-hot matrix."""
    y_tensor = y.data if isinstance(y, Variable) else y
    y_tensor = y_tensor.type(torch.LongTensor).view(-1, 1)
    n_dims = n_dims if n_dims is not None else int(torch.max(y_tensor)) + 1
    y_one_hot = torch.zeros(y_tensor.size()[0], n_dims).scatter_(1, y_tensor, 1)
    return Variable(y_one_hot) if isinstance(y, Variable) else y_one_hot


def where(cond, x_1, x_2):
    """Helper #2: like np.where, but in PyTorch."""
    return (cond * x_1) + ((1 - cond) * x_2)


# < YOUR CODE HERE >
def define_network(state_dim, n_actions):
    # Simple feed-forward network: state -> [Q(s, a0), ..., Q(s, a_last)].
    network = nn.Sequential()
    network.add_module('layer1', nn.Linear(state_dim[0], 40))
    network.add_module('layer2', nn.ReLU())
    network.add_module('layer3', nn.Linear(40, 40))
    network.add_module('layer4', nn.ReLU())
    network.add_module('layer5', nn.Linear(40, n_actions))
    return network


# < YOUR CODE HERE >
def get_action(state, epsilon=0):
    """
    Sample actions with an epsilon-greedy policy.
    Recap: with probability epsilon pick a random action, else pick the action with the highest Q(s,a).
    """
    state = Variable(torch.FloatTensor(state))
    q_values = network(state).data.numpy()

    # Explore with probability epsilon, otherwise act greedily.
    if np.random.random() < epsilon:
        return int(env.action_space.sample())
    return int(np.argmax(q_values))


# < YOUR CODE HERE >
def compute_td_loss(states, actions, rewards, next_states, is_done, gamma=0.99, check_shapes=False):
    """Compute the TD loss using torch operations only."""
    states = Variable(torch.FloatTensor(states))            # shape: [batch_size, state_size]
    actions = Variable(torch.IntTensor(actions))            # shape: [batch_size]
    rewards = Variable(torch.FloatTensor(rewards))          # shape: [batch_size]
    next_states = Variable(torch.FloatTensor(next_states))  # shape: [batch_size, state_size]
    is_done = Variable(torch.FloatTensor(is_done))          # shape: [batch_size]

    # Q-values for all actions in the current states.
    predicted_qvalues = network(states)

    # Q-values for the chosen actions only.
    predicted_qvalues_for_actions = torch.sum(predicted_qvalues.cpu() * to_one_hot(actions, n_actions), dim=1)

    # Q-values for all actions in the next states.
    predicted_next_qvalues = network(next_states)

    # V*(next_states): maximum over the actions axis of the predicted next Q-values.
    next_state_values, _ = torch.max(predicted_next_qvalues, dim=1)

    assert isinstance(next_state_values.data, torch.FloatTensor)

    # 'Target Q-values' for the loss: r + gamma * V*(s').
    target_qvalues_for_actions = rewards + gamma * next_state_values

    # At the last state use the simplified formula Q(s,a) = r(s,a), since s' doesn't exist.
    target_qvalues_for_actions = where(is_done, rewards, target_qvalues_for_actions).cpu()

    # Mean squared TD error to minimize; the target is detached so gradients flow
    # only through the predicted Q-values.
    loss = torch.mean((predicted_qvalues_for_actions - target_qvalues_for_actions.detach()) ** 2)

    if check_shapes:
        assert predicted_next_qvalues.data.dim() == 2, \
            'make sure you predicted q-values for all actions in next state'
        assert next_state_values.data.dim() == 1, \
            'make sure you computed V(s-prime) as maximum over just the actions axis and not all axes'
        assert target_qvalues_for_actions.data.dim() == 1, \
            'there is something wrong with target q-values, they must be a vector'

    return loss


def generate_session(t_max=1000, epsilon=0, train=False):
    """Play the environment with the approximate q-learning agent and train it at the same time."""
    total_reward = 0
    s = env.reset()

    for t in range(t_max):
        # Choose an action with the epsilon-greedy policy.
        a = get_action(s, epsilon)
        next_s, r, done, _ = env.step(a)
        if train:
            opt.zero_grad()
            loss = compute_td_loss([s], [a], [r], [next_s], [done])
            loss.backward()
            opt.step()

        total_reward += r
        s = next_s
        if done:
            break

    return total_reward


if __name__ == '__main__':
    dump_logs = True
    record_video = False

    env = gym.make("CartPole-v0").env
    s = env.reset()
    state_dim = env.observation_space.shape
    n_actions = env.action_space.n

    print('Actions number = %i , State example = %s ' % (n_actions, s))
    print('State space upper bound: %s' % env.observation_space.high)
    print('State space lower bound: %s' % env.observation_space.low)

    # Complete the define_network() & get_action() functions.
    network = define_network(state_dim, n_actions)

    # test_define_network(env, network)
    # test_eps_greedy_strategy()

    # Complete the compute_td_loss() function.
    test_td_loss(env, network)

    # Create an Adam optimizer with lr=1e-4.
    opt = torch.optim.Adam(network.parameters(), lr=1e-4)
    epsilon = 0.5
    max_epochs = 1000
    if dump_logs:
        log_path = './logs/{:%Y_%m_%d_%H_%M}'.format(datetime.datetime.now())
        writer = SummaryWriter(log_path)

    for i in range(max_epochs):
        session_rewards = [generate_session(epsilon=epsilon, train=True) for _ in range(100)]
        print('Epoch #{}\tMean reward = {:.3f}\tEpsilon = {:.3f}'.format(i, np.mean(session_rewards), epsilon))
        if dump_logs:
            writer.add_scalar('Mean Reward', np.mean(session_rewards), i)

        # Epsilon decay: multiplicative, with a floor so exploration never fully stops.
        if epsilon > 0.1:
            epsilon *= 0.99
        else:
            epsilon = max(epsilon * 0.999, 1e-4)
        assert epsilon >= 1e-4, 'Make sure epsilon is always nonzero during training'

        if np.mean(session_rewards) > 300:
            print('You Win!')
            break

    if record_video:
        env = gym.wrappers.Monitor(gym.make('CartPole-v0').env, directory='videos', force=True)
        sessions = [generate_session(epsilon=0, train=False) for _ in range(100)]

    env.close()
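
For reference, the quantity compute_td_loss builds for each transition is the one-step TD target r + gamma * max_a' Q(s', a'), falling back to just r when the episode has ended. A minimal NumPy sketch of that arithmetic for a single transition (illustrative only; the values are made up and are not produced by the code above):

import numpy as np

gamma = 0.99
q_next = np.array([0.2, 1.4])   # hypothetical Q(s', a) for the two CartPole actions
r, done = 1.0, False

# TD(0) target: r + gamma * max_a' Q(s', a'), or just r at a terminal state.
target = r if done else r + gamma * q_next.max()

q_sa = 0.9                      # hypothetical current estimate Q(s, a) for the action taken
td_error_squared = (q_sa - target) ** 2   # the per-transition term the MSE loss averages
print(target, td_error_squared)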
