import gym
import gym.spaces
import time
import numpy as np
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


def generate_session(agent, t_max=10**5):
    """Play one episode with `agent` and record the visited states, actions and total reward."""
    states, actions = [], []
    total_reward = 0
    s = env.reset()
    for t in range(t_max):
        # Sample an action from the policy: predict_proba returns a (1, n_actions)
        # array of action probabilities for the current state.
        a = np.random.choice(env.action_space.n, p=agent.predict_proba(s.reshape(1, -1))[0])
        # Apply action `a` to obtain new_state, reward and the done flag.
        new_s, r, is_done, _ = env.step(a)
        # Record the state and action, and add the reward to the running total.
        states.append(s)
        actions.append(a)
        total_reward += r
        # Move to the new state for the next iteration.
        s = new_s
        if is_done:
            break
    return states, actions, total_reward
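
# A quick usage sketch (illustrative only, assuming `env` and a fitted `agent`
# already exist as in the __main__ block below):
#     states, actions, reward = generate_session(agent, t_max=200)
#     print(len(states), reward)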


def select_elites(states_batch, actions_batch, rewards_batch, percentile=50):
    """
    Select states and actions from sessions whose total reward is >= the given percentile.

    :param states_batch: list of lists of states, states_batch[session_i][t]
    :param actions_batch: list of lists of actions, actions_batch[session_i][t]
    :param rewards_batch: list of total rewards, rewards_batch[session_i]
    :returns: elite_states, elite_actions -- 1D lists of states and the respective
        actions taken in the elite sessions.

    Elite states and actions are returned in their original order,
    i.e. sorted by session number and by timestep within a session.
    If you're confused, see the toy walkthrough below. Please don't assume that
    states are integers (in CartPole they are observation vectors).
    """
    states_batch, actions_batch, rewards_batch = map(np.array, [states_batch, actions_batch, rewards_batch])
    # Compute the reward threshold at the requested percentile.
    reward_threshold = np.percentile(rewards_batch, percentile)
    # Keep only the sessions whose total reward reaches the threshold.
    elite_states = states_batch[rewards_batch >= reward_threshold]
    elite_actions = actions_batch[rewards_batch >= reward_threshold]
    # Flatten the per-session lists into single 1D sequences of states and actions.
    elite_states, elite_actions = map(np.concatenate, [elite_states, elite_actions])
    return elite_states, elite_actions
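
# A toy walkthrough of select_elites (illustration only, not part of the original
# assignment; scalar "states" are used purely for readability):
def _demo_select_elites():
    toy_states = [[10, 11], [20, 21], [30, 31]]
    toy_actions = [[0, 1], [1, 0], [1, 1]]
    toy_rewards = [3, 1, 5]
    # The 50th percentile of [3, 1, 5] is 3, so sessions 0 and 2 are elite.
    elite_s, elite_a = select_elites(toy_states, toy_actions, toy_rewards, percentile=50)
    print(elite_s.tolist())  # [10, 11, 30, 31]
    print(elite_a.tolist())  # [0, 1, 1, 1]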


def rl_approx_cross_entropy(nn_agent):
    n_sessions = 100
    percentile = 70
    total_iterations = 100
    log = []
    for i in range(total_iterations):
        # Generate n_sessions rollouts with the current policy.
        sessions = [generate_session(nn_agent) for _ in range(n_sessions)]
        states_batch, actions_batch, rewards_batch = map(np.array, zip(*sessions))
        # Select elite states & actions.
        elite_states, elite_actions = select_elites(states_batch, actions_batch, rewards_batch, percentile)
        # Update the policy: refit the classifier to predict elite actions from elite states.
        nn_agent.fit(elite_states, elite_actions)
        # Info for debugging.
        mean_reward = np.mean(rewards_batch)
        threshold = np.percentile(rewards_batch, percentile)
        log.append([mean_reward, threshold])
        print('Iteration = %i, Mean Reward = %.3f, Threshold = %.3f' % (i, mean_reward, threshold))
        # CartPole-v0 counts as solved once the mean reward exceeds 195.
        if mean_reward > 195:
            print('You Win! :)')
            break
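
# Optional helper (not part of the original script): a minimal sketch of how the
# per-iteration [mean_reward, threshold] pairs collected in `log` above could be
# plotted, assuming the caller keeps a reference to that list and matplotlib is installed.
def plot_training_log(log):
    import matplotlib.pyplot as plt  # imported lazily so the rest of the script does not need matplotlib
    mean_rewards, thresholds = zip(*log)
    plt.plot(mean_rewards, label='mean reward')
    plt.plot(thresholds, label='reward threshold')
    plt.xlabel('iteration')
    plt.ylabel('reward')
    plt.legend()
    plt.show()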


def test_rl_approx_cross_entropy(nn_agent):
    s = env.reset()
    total_reward = 0
    for t in range(1000):
        # Act greedily: take the most probable action according to the trained policy.
        a = nn_agent.predict(s.reshape(1, -1))[0]
        # Apply action `a` to obtain new_state, reward and the done flag.
        new_s, r, is_done, _ = env.step(a)
        if is_done:
            break
        env.render()
        time.sleep(0.07)
        total_reward += r
        # Move to the new state for the next iteration.
        s = new_s
    print('Reward of Test agent = %.3f' % total_reward)


if __name__ == '__main__':
    # Create the 'CartPole-v0' environment.
    env = gym.make('CartPole-v0')
    s = env.reset()
    # Read off the number of actions for this environment.
    n_actions = env.action_space.n
    print('Actions number = %i' % n_actions)
    # Create a neural network with 2 hidden layers of 10 neurons each and tanh activations,
    # using MLPClassifier from scikit-learn.
    agent = MLPClassifier(hidden_layer_sizes=(10, 10), activation='tanh')
    # Initialize the agent with a dummy fit so it knows the state dimension and all action classes.
    agent.fit([s] * n_actions, list(range(n_actions)))
    # Train the neural network policy with the approximate cross-entropy method.
    rl_approx_cross_entropy(agent)
    # Test the trained agent and see how it performs.
    test_rl_approx_cross_entropy(agent)
    # Close the environment when everything is done.
    env.close()