train.py
from commons.policies import reverseAlgoDict, AlgoDict, mask_function, ActionMasker
from commons.quartoenv import RandomOpponentEnv, RandomOpponentEnv_V1, RandomOpponentEnv_V2, CustomOpponentEnv_V3
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
from sb3_contrib.ppo_mask import MaskablePPO
from stable_baselines3.common.callbacks import CheckpointCallback, EveryNTimesteps
from commons.utils import WinPercentageCallback, UpdateOpponentCallback
from itertools import compress
import numpy as np
import wandb
from wandb.integration.sb3 import WandbCallback
import argparse

# not saving 5e6 checkpoints along the way, to present results with reduced randomness
trainsteps_dict = {
    3000: "3e3",
    1e6: "1e6",
    3e6: "3e6",
    5e6: "5e6",
    20e6: "20e6",
    50e6: "50e6",
    100e6: "100e6"
}

def boolean_string(s):
    if s.lower() not in {'false', 'true'}:
        raise ValueError('Not a valid boolean string')
    return s.lower() == 'true'
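
# illustrative behaviour of boolean_string (used as an argparse type below):
#   boolean_string("True")  -> True
#   boolean_string("false") -> False
#   boolean_string("yes")   -> ValueError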

def parse_args() -> argparse.Namespace:
    """Parse command-line arguments.

    Returns:
        (argparse.Namespace): parsed arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--algorithm", default="PPO", type=str, help="RL algorithm. One in ['PPO', 'A2C', 'maskedPPO']")
    parser.add_argument("--verbose", default=0, type=int, help="Verbosity value")
    parser.add_argument("--train-timesteps", default=1e5, type=float, help="Number of timesteps to train the RL algorithm for")
    parser.add_argument("--evaluate_while_training", default=True, type=boolean_string, help="Whether or not to evaluate the RL algorithm while training")
    parser.add_argument("--store-checkpoints", default=True, type=boolean_string, help="Whether or not to store partially-trained models. Recommended True for long trainings (>1e6 ts)")
    parser.add_argument("--evaluation-frequency", default=1e3, type=float, help="Frequency with which to evaluate the policy against a random fair opponent")
    parser.add_argument("--test-episodes", default=50, type=int, help="Number of test matches the agent plays during periodic evaluation")
    parser.add_argument("--action-masking", default=False, type=boolean_string, help="Whether or not to perform action masking during training")
    parser.add_argument("--losing-penalty", default=True, type=boolean_string, help="Whether or not to enforce a penalty (negative reward) for losing")
    parser.add_argument("--duration-penalty", default=True, type=boolean_string, help="Whether or not to enforce a penalty (negative reward) on long games")
    parser.add_argument("--show-progressbar", default=True, type=boolean_string, help="Whether or not to display a progress bar during training")
    parser.add_argument("--save-model", default=False, type=boolean_string, help="Whether or not to save the trained model")
    parser.add_argument("--resume-training", default=False, type=boolean_string, help="Whether or not to load and keep training an already trained model")
    parser.add_argument("--model-path", default=None, type=str, help="Path at which the model to incrementally train is stored")
    parser.add_argument("--use-symmetries", default=False, type=boolean_string, help="Whether or not to let the agent exploit the game symmetries")
    parser.add_argument("--self-play", default=False, type=boolean_string, help="Whether or not to let the agent play against checkpointed copies of itself")
    parser.add_argument("--logwandb", default=True, type=boolean_string, help="Whether or not to log the training process on wandb")
    parser.add_argument("--default", default=True, type=boolean_string, help="Default mode, ignoring all other configurations")
    return parser.parse_args()
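
# Illustrative invocation (note that --default, which is True by default, overrides
# most of these flags with the hard-coded configuration below):
#   python train.py --default false --algorithm maskedPPO --action-masking true --train-timesteps 1e6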

args = parse_args()

algorithm = args.algorithm
verbose = args.verbose
train_timesteps = args.train_timesteps
evaluate_while_training = args.evaluate_while_training
store_checkpoints = args.store_checkpoints
evaluation_frequency = args.evaluation_frequency
test_episodes = args.test_episodes
action_masking = args.action_masking
losing_penalty = args.losing_penalty
duration_penalty = args.duration_penalty
show_progressbar = args.show_progressbar
save_model = args.save_model
resume_training = args.resume_training
model_path = args.model_path
use_symmetries = args.use_symmetries
self_play = args.self_play
logwandb = args.logwandb

# default mode: ignore the CLI configuration and use the hard-coded one below
if args.default:
    algorithm = "maskedPPO"
    verbose = 2
    train_timesteps = 100_000_000
    evaluate_while_training = True
    store_checkpoints = True
    evaluation_frequency = 1000
    test_episodes = 100
    action_masking = True
    losing_penalty = True
    duration_penalty = True
    show_progressbar = True
    save_model = True
    use_symmetries = False
    self_play = False
    model_path = None
    logwandb = True

def main():
    # no seed is set, but one can easily be set by uncommenting the following lines
    seed = None
    # np.random.seed(seed)
    # random.seed(seed)
    checkpoint_frequency = 250_000
    opponent_update_frequency = 500_000

    # input sanity check
    if not action_masking:
        if algorithm.upper() not in ["PPO", "A2C"]:
            print(f"Prompted algorithm (upper): {algorithm.upper()}")
            raise ValueError("Without action masking, only ['PPO', 'A2C'] are currently supported!")
    # create the environment in which the agent plays against a random-playing opponent
    if losing_penalty:
        env = RandomOpponentEnv_V1()
        version = "v1"
        if duration_penalty:
            env = RandomOpponentEnv_V2()
            version = "v2"
            if use_symmetries:
                env = CustomOpponentEnv_V3()
                version = "v3"
                # creating an opponent from the model given in model_path - the opponent always plays legit moves
                opponent = MaskablePPO.load(model_path, env=env, custom_objects={'learning_rate': 0.0, "clip_range": 0.0, "lr_schedule": 0.0})
                opponent.set_env(env=env)
                # using this opponent to perform adversarial learning
                env.update_opponent(new_opponent=opponent)
    else:
        env = RandomOpponentEnv()
        version = "v0"
    if action_masking:
        # masking the action space to the actions actually available
        env = ActionMasker(env, mask_function)
        # maskable PPO object
        model = MaskablePPO(
            MaskableActorCriticPolicy,
            env=env,
            verbose=verbose,
            seed=seed,
            tensorboard_log="logs/tensorboard.id"
        )
    else:
        model_function = reverseAlgoDict[algorithm.upper()]
        model = model_function("MlpPolicy", env=env, verbose=verbose, seed=seed)
    model_name = algorithm.upper() + version + "_" + trainsteps_dict[train_timesteps]

    # saving the model every checkpoint_frequency (2.5e5) timesteps
    checkpoint_save = CheckpointCallback(
        save_freq=checkpoint_frequency, save_path="checkpoints/", name_prefix=f"{algorithm}"
    )
    # logging the percentage of wins the model achieves over test_episodes matches
    winpercentage = WinPercentageCallback(env=env, n_episodes=test_episodes, logfile=f"logs/{model_name}_logfile.txt")
    # levelling up competitiveness during training
    update_opponent = UpdateOpponentCallback(checkpoints_dir="checkpoints/")
    # evaluating the policy periodically, every evaluation_frequency timesteps
    evaluation_callback = EveryNTimesteps(n_steps=evaluation_frequency, callback=winpercentage)
    # updating the competitiveness of the agent's environment during training
    selfplay_callback = EveryNTimesteps(n_steps=opponent_update_frequency, callback=update_opponent)

    callback_list = [
        checkpoint_save,
        evaluation_callback,
        selfplay_callback
    ]
    callback_mask = [store_checkpoints, evaluate_while_training, self_play]
    # masking callbacks according to the script input
    callback_list = list(compress(callback_list, callback_mask))
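    # e.g. with callback_mask == [True, False, True], compress keeps only
    # checkpoint_save and selfplay_callback (illustrative)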

    training_config = {
        "version": version,
        "model": algorithm.upper(),
        "total_timesteps": train_timesteps,
        "losing_penalty": losing_penalty,
        "duration_penalty": duration_penalty,
        "use_symmetries": use_symmetries,
        "action_masking": action_masking,
        "incremental": resume_training
    }
    if logwandb:
        run = wandb.init(
            project="QuartoRL-v2 seedless training",
            config=training_config,
            sync_tensorboard=True,
            monitor_gym=True,
            save_code=True
        )
        # using W&B during training
        wandb_callback = WandbCallback(verbose=2, gradient_save_freq=500)
        callback_list.append(wandb_callback)

    # training the model for train_timesteps
    if not resume_training:
        print(f"Training: {model_name}")
        model.learn(total_timesteps=train_timesteps, callback=callback_list, progress_bar=show_progressbar)
    elif resume_training and action_masking:
        # resuming training is only implemented for action-masked (MaskablePPO) models
        remaining_training_steps = int(float(input("Please enter the new number of training steps: ")))
        model = MaskablePPO.load(model_path, tensorboard_log="logs/tensorboard.id")
        env = CustomOpponentEnv_V3()
        version = "v3"
        env = ActionMasker(env, mask_function)
        model.set_env(env=env)
        # creating an opponent from the model given in model_path - the opponent always plays legit moves
        opponent = MaskablePPO.load(
            model_path,
            env=env,
            custom_objects={'learning_rate': 0.0, "clip_range": 0.0, "lr_schedule": 0.0}
        )
        opponent.set_env(env=env)
        # using this opponent to perform adversarial learning
        env.update_opponent(new_opponent=opponent)
        model_name = algorithm.upper() + version + "_" + trainsteps_dict[train_timesteps]
        print("'Resuming' training...")
        model.learn(
            remaining_training_steps,
            reset_num_timesteps=False,
            callback=callback_list,
            progress_bar=show_progressbar
        )
    else:
        raise ValueError("Resume-training is implemented for MaskablePPO only!")

    if save_model:
        model.save(f"commons/trainedmodels/{model_name}.zip")


if __name__ == "__main__":
    main()