kozistr · hatonosuke · Feb 17, 2025 · Feb 19, 2025 · Feb 19, 2025 · Feb 19, 2025
@@ -132,21 +132,15 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 if len(state) == 0:
                     state['z'] = p.clone()
 
-                self.apply_weight_decay(
-                    p=p,
-                    grad=grad,
-                    lr=lr,
-                    weight_decay=group['weight_decay'],
-                    weight_decouple=group['weight_decouple'],
-                    fixed_decay=group['fixed_decay'],
-                )
-
                 z = state['z']
 
+                grad.mul_(lr)
+                grad.add_(p, alpha=group['weight_decay'] * (1.0 if group['fixed_decay'] else lr))
+
                 p.lerp_(z, weight=checkpoint)
-                p.add_(grad, alpha=lr * (momentum * (1.0 - checkpoint) - 1))
+                p.add_(grad, alpha=momentum * (1.0 - checkpoint) - 1)
 
-                z.sub_(grad, alpha=lr)
+                z.sub_(grad)
 
         return loss
 
@@ -259,9 +253,9 @@ def step(self, closure: CLOSURE = None) -> LOSS:
 
             beta1, beta2 = group['betas']
 
-            bias_correction2_sq: float = math.sqrt(1.0 - beta2 ** group['step'])
+            bias_correction2: float = 1.0 - beta2 ** group['step']
 
-            lr: float = group['lr'] * schedule * bias_correction2_sq
+            lr: float = group['lr'] * schedule
             lr_max = group['lr_max'] = max(lr, group['lr_max'])
 
             weight = (group['step'] ** group['r']) * (lr_max ** group['weight_lr_power'])
@@ -271,7 +265,9 @@ def step(self, closure: CLOSURE = None) -> LOSS:
 
             if group['use_palm']:
                 beta2: float = 1.0 - group['step'] ** -0.8
-                debias: float = (1.0 - beta2) / (1.0 - beta2 ** group['step'])
+                debias: float = 1.0 - (1.0 - beta2) / (1.0 - beta2 ** group['step'])
+                # bias correction is unnecessary when PaLM-style beta2 scheduling is used
+                bias_correction2 = 1.0
             else:
                 debias: float = beta2
 
@@ -289,31 +285,27 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     state['z'] = p.clone()
                     state['exp_avg_sq'] = torch.zeros_like(p)
 
-                self.apply_weight_decay(
-                    p=p,
-                    grad=grad,
-                    lr=lr,
-                    weight_decay=group['weight_decay'],
-                    weight_decouple=group['weight_decouple'],
-                    fixed_decay=group['fixed_decay'],
-                )
+                if not group['weight_decouple']:
+                    grad.add_(p, alpha=group['weight_decay'])
 
                 z, exp_avg_sq = state['z'], state['exp_avg_sq']
                 exp_avg_sq.mul_(debias).addcmul_(grad, grad, value=1.0 - debias)
-
                 de_nom = self.apply_ams_bound(
                     ams_bound=group['ams_bound'],
-                    exp_avg_sq=exp_avg_sq,
+                    exp_avg_sq=exp_avg_sq.div(bias_correction2),
                     max_exp_avg_sq=state.get('max_exp_avg_sq', None),
                     eps=group['eps'],
                 )
 
                 grad.div_(de_nom)
+                grad.mul_(lr)
+                if group['weight_decouple']:
+                    grad.add_(p, alpha=group['weight_decay'] * (1.0 if group['fixed_decay'] else lr))
 
                 p.lerp_(z, weight=checkpoint)
-                p.add_(grad, alpha=lr * (beta1 * (1.0 - checkpoint) - 1))
+                p.add_(grad, alpha=beta1 * (1.0 - checkpoint) - 1)
 
-                z.sub_(grad, alpha=lr)
+                z.sub_(grad)
 
         return loss
 
@@ -428,19 +420,22 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 n_sma_threshold=4,
                 degenerated_to_sgd=group['degenerated_to_sgd'],
             )
+            if n_sma > 4:
+                # cancel bias correction2
+                lr = lr / bias_correction2_sq
 
-            lr_max = group['lr_max'] = max(lr, group['lr_max'])
+            lr_max = group['lr_max'] = max(lr, group['lr_max'], 0.0)
 
             weight = (group['step'] ** group['r']) * (lr_max ** group['weight_lr_power'])
             weight_sum = group['weight_sum'] = group['weight_sum'] + weight
 
             checkpoint: float = weight / weight_sum if weight_sum != 0.0 else 0.0
 
-            adaptive_y_lr: float = lr * (beta1 * (1.0 - checkpoint) - 1.0)
-
             if group['use_palm']:
                 beta2: float = 1.0 - group['step'] ** -0.8
-                debias: float = (1.0 - beta2) / (1.0 - beta2 ** group['step'])
+                debias: float = 1.0 - (1.0 - beta2) / (1.0 - beta2 ** group['step'])
+                # bias correction is unnecessary when PaLM-style beta2 scheduling is used
+                bias_correction2_sq = 1.0
             else:
                 debias: float = beta2
 
@@ -458,25 +453,24 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     state['z'] = p.clone()
                     state['exp_avg_sq'] = torch.zeros_like(p)
 
+                if not group['weight_decouple']:
+                    grad.add_(p, alpha=group['weight_decay'])
+
                 z, exp_avg_sq = state['z'], state['exp_avg_sq']
                 exp_avg_sq.mul_(debias).addcmul_(grad, grad, value=1.0 - debias)
 
                 if n_sma > 4.0:
                     de_nom = exp_avg_sq.sqrt().div_(bias_correction2_sq).add_(group['eps'])
                     grad.div_(de_nom)
 
-                self.apply_weight_decay(
-                    p=p,
-                    grad=grad,
-                    lr=lr,
-                    weight_decay=group['weight_decay'],
-                    weight_decouple=group['weight_decouple'],
-                    fixed_decay=group['fixed_decay'],
-                )
+                if lr > 0.0:
+                    grad.mul_(lr)
+                    if group['weight_decouple']:
+                        grad.add_(p, alpha=group['weight_decay'] * (1.0 if group['fixed_decay'] else lr))
 
-                p.lerp_(z, weight=checkpoint)
-                p.add_(grad, alpha=adaptive_y_lr)
+                    p.lerp_(z, weight=checkpoint)
+                    p.add_(grad, alpha=beta1 * (1.0 - checkpoint) - 1.0)
 
-                z.sub_(grad, alpha=lr)
+                    z.sub_(grad)
 
         return loss
@@ -502,7 +502,9 @@
     (Adalite, {'lr': 1e0, 'weight_decay': 1e-3}, 5),
     (ScheduleFreeSGD, {'lr': 1e0, 'weight_decay': 1e-3}, 5),
     (ScheduleFreeAdamW, {'lr': 1e0, 'weight_decay': 1e-3}, 5),
-    (ScheduleFreeAdamW, {'lr': 1e-2, 'weight_decay': 1e-3, 'use_palm': True}, 5),
+    (ScheduleFreeAdamW, {'lr': 1e0, 'weight_decay': 1e-3, 'use_palm': True}, 5),
+    (ScheduleFreeRAdam, {'lr': 5e0, 'weight_decay': 1e-3}, 10),
+    (ScheduleFreeRAdam, {'lr': 5e0, 'weight_decay': 1e-3, 'use_palm': True}, 10),
     (ScheduleFreeRAdam, {'lr': 1e0, 'weight_decay': 1e-3, 'degenerated_to_sgd': True}, 5),
     (ScheduleFreeRAdam, {'lr': 1e0, 'weight_decay': 1e-3, 'use_palm': True, 'degenerated_to_sgd': True}, 5),
     (FAdam, {'lr': 1e0, 'weight_decay': 1e-3}, 5),