@@ -39,12 +39,12 @@
       help="checkpoint path for initializing the model.")
 
 # Optimization config
-flags.DEFINE_float("learning_rate", default=2.5e-4,
+flags.DEFINE_float("learning_rate", default=1e-4,
       help="Maximum learning rate.")
-flags.DEFINE_float("clip", default=0.25,
+flags.DEFINE_float("clip", default=1.0,
       help="Gradient clipping value.")
 # for cosine decay
-flags.DEFINE_float("min_lr_ratio", default=0.004,
+flags.DEFINE_float("min_lr_ratio", default=0.001,
       help="Minimum ratio learning rate.")
 flags.DEFINE_integer("warmup_steps", default=0,
       help="Number of steps for linear lr warmup.")
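The `# for cosine decay` comment above refers to the schedule these flags feed: the rate warms up linearly for `warmup_steps`, then follows a cosine decay down to `min_lr_ratio * learning_rate`. A minimal sketch of that schedule under the new defaults; the actual implementation lives in the training loop, not in this diff:

```python
import math

def lr_at_step(step, max_lr=1e-4, warmup_steps=0,
               train_steps=100000, min_lr_ratio=0.001):
    """Linear warmup, then cosine decay to min_lr_ratio * max_lr."""
    if warmup_steps > 0 and step < warmup_steps:
        return max_lr * step / warmup_steps  # linear warmup phase
    # fraction of the decay phase completed, in [0, 1]
    progress = (step - warmup_steps) / max(1, train_steps - warmup_steps)
    min_lr = max_lr * min_lr_ratio
    return min_lr + (max_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))

print(lr_at_step(0))      # 1e-4 (no warmup steps, so decay starts at max_lr)
print(lr_at_step(50000))  # ~5.0e-05, halfway down the cosine
```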
@@ -56,13 +56,13 @@
       help="weight decay")
 
 # Training config
-flags.DEFINE_integer("train_batch_size", default=60,
+flags.DEFINE_integer("train_batch_size", default=16,
       help="Size of train batch.")
 flags.DEFINE_integer("train_steps", default=100000,
       help="Total number of training steps.")
-flags.DEFINE_integer("iterations", default=500,
+flags.DEFINE_integer("iterations", default=1000,
       help="Number of iterations per repeat loop.")
-flags.DEFINE_integer("save_steps", default=10000,
+flags.DEFINE_integer("save_steps", default=None,
       help="number of steps for model checkpointing.")
 
 # Data config
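For the training-config hunk above: `save_steps=None` turns off periodic checkpointing, and `iterations` is the number of steps run per inner "repeat loop" before control returns for logging or saving. A sketch of how the three flags typically interact (assumed control flow; the real loop is elsewhere in the file):

```python
def run_training(train_steps=100000, iterations=1000, save_steps=None):
    step = 0
    while step < train_steps:
        # one "repeat loop": run `iterations` steps back to back
        for _ in range(min(iterations, train_steps - step)):
            step += 1  # stand-in for a single optimizer step
        print("finished %d / %d steps" % (step, train_steps))
        # with save_steps=None, periodic checkpointing is skipped entirely
        if save_steps is not None and step % save_steps == 0:
            print("would checkpoint at step %d" % step)

run_training(train_steps=3000, iterations=1000)  # reports progress 3 times
```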
@@ -73,7 +73,7 @@
       "Could be half of seq_len")
 flags.DEFINE_bool("bi_data", default=True,
       help="Use bidirectional data streams, i.e., forward & backward.")
-flags.DEFINE_integer("mask_alpha", default=2,
+flags.DEFINE_integer("mask_alpha", default=6,
       help="How many tokens to form a group.")
 flags.DEFINE_integer("mask_beta", default=1,
       help="How many tokens to mask within each group.")
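The two masking flags set the masking density: tokens are grouped into runs of `mask_alpha`, and `mask_beta` tokens of each group are masked. The old defaults (2, 1) mask every other token (~50%); the new defaults (6, 1) mask about 1 token in 6 (~17%). A toy illustration that masks the first `mask_beta` positions of each group (the real sampler picks positions randomly):

```python
def toy_mask(n_tokens, mask_alpha=6, mask_beta=1):
    """0/1 mask with mask_beta targets out of every mask_alpha tokens."""
    return [1 if i % mask_alpha < mask_beta else 0 for i in range(n_tokens)]

mask = toy_mask(12)
print(mask)                   # [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
print(sum(mask) / len(mask))  # 0.1666... = mask_beta / mask_alpha
```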
@@ -86,7 +86,7 @@
 flags.DEFINE_integer("n_token", 32000, help="Vocab size")
 
 # Model config
-flags.DEFINE_integer("mem_len", default=70,
+flags.DEFINE_integer("mem_len", default=0,
       help="Number of steps to cache")
 flags.DEFINE_bool("same_length", default=False,
       help="Same length attention")
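`mem_len` is the Transformer-XL-style recurrence cache ("number of steps to cache"): the last `mem_len` hidden states of the previous segment are prepended to the current segment's attention context, so the new default of 0 disables recurrence. A shape-level sketch (hypothetical helper, not the file's implementation):

```python
import numpy as np

def context_with_memory(h_cur, mems, mem_len=0):
    """Prepend up to mem_len cached states to the current segment."""
    if mem_len > 0 and mems is not None:
        context = np.concatenate([mems[-mem_len:], h_cur], axis=0)
    else:
        context = h_cur  # mem_len=0: attend over the current segment only
    new_mems = context[-mem_len:] if mem_len > 0 else None
    return context, new_mems

h = np.zeros((128, 32))  # [seq_len, d_model] hidden states of one segment
ctx, mems = context_with_memory(h, None, mem_len=0)
print(ctx.shape)         # (128, 32): nothing cached, nothing prepended
```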
@@ -95,23 +95,23 @@
 
 flags.DEFINE_integer("n_layer", default=6,
       help="Number of layers.")
-flags.DEFINE_integer("d_model", default=500,
+flags.DEFINE_integer("d_model", default=32,
       help="Dimension of the model.")
-flags.DEFINE_integer("d_embed", default=500,
+flags.DEFINE_integer("d_embed", default=32,
       help="Dimension of the embeddings.")
-flags.DEFINE_integer("n_head", default=10,
+flags.DEFINE_integer("n_head", default=4,
       help="Number of attention heads.")
-flags.DEFINE_integer("d_head", default=50,
+flags.DEFINE_integer("d_head", default=8,
       help="Dimension of each attention head.")
-flags.DEFINE_integer("d_inner", default=1000,
+flags.DEFINE_integer("d_inner", default=32,
       help="Dimension of inner hidden size in positionwise feed-forward.")
-flags.DEFINE_float("dropout", default=0.1,
+flags.DEFINE_float("dropout", default=0.0,
       help="Dropout rate.")
-flags.DEFINE_float("dropatt", default=0.1,
+flags.DEFINE_float("dropatt", default=0.0,
       help="Attention dropout rate.")
 flags.DEFINE_bool("untie_r", default=False,
       help="Untie r_w_bias and r_r_bias")
-flags.DEFINE_string("summary_type", default="attn",
+flags.DEFINE_string("summary_type", default="last",
       help="Method used to summarize a sequence into a compact vector.")
 flags.DEFINE_string("ff_activation", default="relu",
       help="Activation type used in position-wise feed-forward.")
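The new size defaults shrink the network to a tiny configuration (plausibly for cheap debugging runs) while staying self-consistent: the heads tile the model width exactly (`n_head * d_head = 4 * 8 = 32 = d_model`), `d_embed` matches `d_model`, and `d_inner` drops from the usual ~4x multiple to a flat 32. A quick sanity check one might keep beside such a config (illustrative, not part of the commit):

```python
d_model, d_embed, n_head, d_head, d_inner = 32, 32, 4, 8, 32

# multi-head attention projects d_model -> n_head * d_head and back,
# so the head dimensions must tile the model width exactly
assert n_head * d_head == d_model, "n_head * d_head must equal d_model"
# embeddings feed the first layer directly, so the widths should agree
assert d_embed == d_model, "d_embed must equal d_model"
print("config OK: %d heads x %d = %d dims" % (n_head, d_head, n_head * d_head))
```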