qlearning_template.py
from collections import defaultdict
import random


class QLearningAgent:
    """
    Q-Learning Agent

    Instance variables you have access to
      - self.epsilon (exploration probability)
      - self.alpha (learning rate)
      - self.discount (discount rate, aka gamma)

    Functions you should use
      - self.get_legal_actions(state)
        which returns the legal actions for a state
      - self.get_q_value(state, action)
        which returns Q(state, action)
      - self.set_q_value(state, action, value)
        which sets Q(state, action) := value

    !!!Important!!!
    NOTE: please avoid using self._qValues directly to keep the code clean.
    """

    def __init__(self, alpha, epsilon, discount, get_legal_actions):
        """We initialize the agent and its Q-values here."""
        self.get_legal_actions = get_legal_actions
        self._qValues = defaultdict(lambda: defaultdict(lambda: 0))
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount = discount

    def get_q_value(self, state, action):
        """
        Returns Q(state, action)
        """
        return self._qValues[state][action]

    def set_q_value(self, state, action, value):
        """
        Sets the Q-value for (state, action) to the given value.
        """
        self._qValues[state][action] = value

    ################ YOUR CODE is here! ##################

    def get_state_value(self, state):
        """
        Returns max_action Q(state, action),
        where the max is taken over legal actions.
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return 0.0
        if len(possible_actions) == 0:
            return 0.0

        # V(s) = Q(s, argmax_a Q(s, a)) = max_a Q(s, a)
        return self.get_q_value(state, self.get_best_policy_action(state))

    def get_best_policy_action(self, state):
        """
        Compute the best action to take in a state according to the current policy.
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return None
        if len(possible_actions) == 0:
            return None

        # Pick the action with the highest Q-value (ties broken arbitrarily)
        best_action = max(possible_actions, key=lambda a: self.get_q_value(state, a))
        return best_action

    def get_action(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon we take a random action,
        otherwise the best policy action (self.get_best_policy_action).

        HINT: You might want to use random.random() or random.choice(list)
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return None
        if len(possible_actions) == 0:
            return None

        # Explore with probability epsilon, otherwise exploit
        if random.random() < self.epsilon:
            return random.choice(list(possible_actions))
        return self.get_best_policy_action(state)

    def update(self, state, action, next_state, reward):
        """
        You should do your Q-value update here.

        NOTE: you should never call this function inside the class;
        it will be called externally.
        """
        # TD target: r + gamma * max_a' Q(s', a')
        q_hat = reward + self.discount * self.get_state_value(next_state)

        # Move Q(s, a) towards the TD target:
        # Q(s, a) := (1 - alpha) * Q(s, a) + alpha * q_hat
        new_q = self.alpha * q_hat + (1 - self.alpha) * self.get_q_value(state, action)
        self.set_q_value(state, action, new_q)
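

# --- Usage sketch (not part of the original template) ---
# A minimal, hypothetical demo of how the agent above could be trained.
# The two-state MDP, its rewards, and all the helper names below are made up
# purely for illustration.
if __name__ == "__main__":
    # Hypothetical toy MDP: states "A" and "B", actions "left"/"right".
    # Taking "right" in "A" moves to "B" with reward 1; anything else
    # returns to "A" with reward 0.
    def toy_legal_actions(state):
        return ["left", "right"]

    def toy_step(state, action):
        if state == "A" and action == "right":
            return "B", 1.0
        return "A", 0.0

    agent = QLearningAgent(alpha=0.5, epsilon=0.1, discount=0.9,
                           get_legal_actions=toy_legal_actions)

    for _episode in range(100):
        state = "A"
        for _t in range(10):
            action = agent.get_action(state)
            next_state, reward = toy_step(state, action)
            agent.update(state, action, next_state, reward)
            state = next_state

    # After training, "right" should score higher than "left" in state "A".
    print(agent.get_q_value("A", "right"), agent.get_q_value("A", "left"))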