
Commit 197601e

ErezYosef and janeyx99 authored and committed
Add Support for Tracking Parameter Names (named_parameters) in Optimizer State Dict (pytorch#134107)
A proposal addressing Issue pytorch#1489: **Optimizer should track parameter names and not id.** (also mentioned in here: [[RFC] Introducing FQNs/clarity eyeglasses to optim state_dict](https://dev-discuss.pytorch.org/t/rfc-introducing-fqns-clarity-to-optim-state-dict/1552) ## Summary This PR introduces a backward-compatible enhancement where optimizers track parameter names instead of just their id. Optimizers can be initialized with `named_parameters()` as: ```python optimizer = optim.SGD(model.named_parameters(), lr=0.01, momentum=0.9) ``` This allows for greater clarity and ease when handling optimizers, as the parameters' names are preserved within the optimizer’s `state_dict` as: ``` state_dict = { 'state': { 0: {'momentum_buffer': tensor(...), ...}, 1: {'momentum_buffer': tensor(...), ...}, }, 'param_groups': [ { 'lr': 0.01, 'weight_decay': 0, ... 'params': [0,1] 'param_names' ['layer.weight', 'layer.bias'] (optional) } ] } ``` Loading `state_dict` is not changed (backward-compatible) and the `param_names` key will be ignored. ## Key Features #### Named Parameters in Optimizer Initialization: Optimizers can accept the output of `model.named_parameters()` during initialization, allowing them to store parameter names directly. #### Parameter Names in `state_dict`: The parameter names are saved as a list in the optimizer’s `state_dict` with key `param_names`, alongside the `params` indices, ensuring seamless tracking of both names and parameters. ## Backward Compatibility #### No Breaking Changes: This change is fully backward-compatible. The added `param_names` key in the optimizer's `state_dict` is ignored when loading a state to the optimizer. #### Customization with Hooks: For more control, the loaded state_dict can be modified using a custom `register_load_state_dict_pre_hook`, providing flexibility for different design needs. ## Documentation Updates Please refer to the documentation changes for more details on how this feature is implemented and how it can be used effectively. ## Solution Example: A suggested solution to the problem mentioned in pytorch#1489, for the same parameters but in a different order. The following `register_load_state_dict_pre_hook` should be added to the optimizer before loading to enable loading the state dict : ```python def adapt_state_dict_ids(optimizer, state_dict): # assuming a single param group. current_state_group = optimizer.state_dict()['param_groups'][0] loaded_state_group = state_dict['param_groups'][0] # same number of params, same names, only different ordering current_state_name_to_id_mapping = {} # mapping -- param_name: id for i, name in enumerate(current_state_group['param_names']): current_state_name_to_id_mapping[name] = current_state_group['params'][i] # changing the ids of the loaded state dict to match the order of the given state dict. for i, name in enumerate(current_state_group['param_names']): loaded_state_group['params'][i] = current_state_name_to_id_mapping[name] return state_dict ``` In this code, the loaded `state_dict` ids are adapted to match the order of the current optimizer `state_dict`. Both the previous and the current optimizers are required to be initiated with `named_parameters()` to have the 'param_names' key in the dict. ### Note This is my first contribution to PyTorch, and I wish to receive feedback or suggestions for improvement. Pull Request resolved: pytorch#134107 Approved by: https://github.com/janeyx99 Co-authored-by: Jane (Yuan) Xu <31798555+janeyx99@users.noreply.github.com>
1 parent 4470339 commit 197601e

17 files changed: +395 −39 lines changed

docs/source/optim.rst

+188 −1

@@ -13,14 +13,20 @@ Constructing it
 ^^^^^^^^^^^^^^^
 
 To construct an :class:`Optimizer` you have to give it an iterable containing the
-parameters (all should be :class:`~torch.autograd.Variable` s) to optimize. Then,
+parameters (all should be :class:`~torch.nn.Parameter` s) or named parameters
+(tuples of (str, :class:`~torch.nn.Parameter`)) to optimize. Then,
 you can specify optimizer-specific options such as the learning rate, weight decay, etc.
 
 Example::
 
     optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
     optimizer = optim.Adam([var1, var2], lr=0.0001)
 
+Named parameters example::
+
+    optimizer = optim.SGD(model.named_parameters(), lr=0.01, momentum=0.9)
+    optimizer = optim.Adam([('layer0', var1), ('layer1', var2)], lr=0.0001)
+
 Per-parameter options
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -38,6 +44,11 @@ For example, this is very useful when one wants to specify per-layer learning ra
                 {'params': model.classifier.parameters()}
             ], lr=1e-3, momentum=0.9)
 
+    optim.SGD([
+                {'params': model.base.named_parameters(), 'lr': 1e-2},
+                {'params': model.classifier.named_parameters()}
+            ], lr=1e-3, momentum=0.9)
+
 This means that ``model.base``'s parameters will use a learning rate of ``1e-2``, whereas
 ``model.classifier``'s parameters will stick to the default learning rate of ``1e-3``.
 Finally a momentum of ``0.9`` will be used for all parameters.
@@ -303,6 +314,182 @@ algorithms.
     lr_scheduler.OneCycleLR
     lr_scheduler.CosineAnnealingWarmRestarts
 
+How to utilize named parameters to load optimizer state dict
+------------------------------------------------------------
+
+The function :func:`~Optimizer.load_state_dict` stores the optional ``param_names`` content from the
+loaded state dict if present. However, the process of loading the optimizer state is not affected,
+as the order of the parameters is what is used to maintain compatibility (in case of different ordering).
+To utilize the parameter names from the loaded state dict, a custom ``register_load_state_dict_pre_hook``
+needs to be implemented according to the desired behavior.
+
+This can be useful, for instance, when the model architecture changes, but the weights and optimizer states need to
+remain unchanged. The following example demonstrates how to implement this customization.
+
+Example::
+
+    class OneLayerModel(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.fc = nn.Linear(3, 4)
+
+        def forward(self, x):
+            return self.fc(x)
+
+    model = OneLayerModel()
+    optimizer = optim.SGD(model.named_parameters(), lr=0.01, momentum=0.9)
+    # training..
+    torch.save(optimizer.state_dict(), PATH)
+
+Let's say that ``model`` implements an expert (MoE), and we want to duplicate it and resume training
+for two experts, both initialized the same way as the ``fc`` layer. For the following ``model2`` we create
+two layers identical to ``fc`` and resume training by loading the model weights and optimizer states from
+``model`` into both ``fc1`` and ``fc2`` of ``model2`` (and adjust them accordingly)::
+
+    class TwoLayerModel(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.fc1 = nn.Linear(3, 4)
+            self.fc2 = nn.Linear(3, 4)
+
+        def forward(self, x):
+            return (self.fc1(x) + self.fc2(x)) / 2
+
+    model2 = TwoLayerModel()
+    # adapt and load model weights..
+    optimizer2 = optim.SGD(model2.named_parameters(), lr=0.01, momentum=0.9)
+
+To load the state dict for ``optimizer2`` with the state dict of the previous optimizer such that both
+``fc1`` and ``fc2`` will be initialized with a copy of ``fc``'s optimizer states
+(to resume training for each layer from ``fc``), we can use the following hook::
+
+    def adapt_state_dict_ids(optimizer, state_dict):
+        adapted_state_dict = deepcopy(optimizer.state_dict())
+        # Copy setup parameters (lr, weight_decay, etc.), in case they differ in the loaded state dict.
+        for k, v in state_dict['param_groups'][0].items():
+            if k not in ['params', 'param_names']:
+                adapted_state_dict['param_groups'][0][k] = v
+
+        lookup_dict = {
+            'fc1.weight': 'fc.weight',
+            'fc1.bias': 'fc.bias',
+            'fc2.weight': 'fc.weight',
+            'fc2.bias': 'fc.bias'
+        }
+        clone_deepcopy = lambda d: {k: (v.clone() if isinstance(v, torch.Tensor) else deepcopy(v)) for k, v in d.items()}
+        for param_id, param_name in zip(
+                optimizer.state_dict()['param_groups'][0]['params'],
+                optimizer.state_dict()['param_groups'][0]['param_names']):
+            name_in_loaded = lookup_dict[param_name]
+            index_in_loaded_list = state_dict['param_groups'][0]['param_names'].index(name_in_loaded)
+            id_in_loaded = state_dict['param_groups'][0]['params'][index_in_loaded_list]
+            # Copy the state of the corresponding parameter
+            if id_in_loaded in state_dict['state']:
+                adapted_state_dict['state'][param_id] = clone_deepcopy(state_dict['state'][id_in_loaded])
+
+        return adapted_state_dict
+
+    optimizer2.register_load_state_dict_pre_hook(adapt_state_dict_ids)
+    optimizer2.load_state_dict(torch.load(PATH))  # the state dict saved from the previous optimizer
+
+This ensures that the adapted state_dict with the correct states for the layers of ``model2`` will be used
+during model loading.
+Note that this code is designed specifically for this example (e.g., assuming a single parameter group),
+and other cases might require different adaptations.
+
+The following example shows how to handle missing parameters in a loaded
+``state_dict`` when the model structure changes.
+The ``Model_bypass`` adds a new ``bypass`` layer, which is not present in the original ``Model1``.
+To resume training, a custom ``adapt_state_dict_missing_param`` hook is used to adapt the optimizer's ``state_dict``,
+ensuring existing parameters are mapped correctly, while missing ones (like the bypass layer) remain unchanged
+(as initialized in this example).
+This approach enables smooth loading and resuming of the optimizer state despite model changes.
+The new bypass layer will be trained from scratch::
+
+    class Model1(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.fc = nn.Linear(5, 5)
+
+        def forward(self, x):
+            return self.fc(x) + x
+
+
+    model = Model1()
+    optimizer = optim.SGD(model.named_parameters(), lr=0.01, momentum=0.9)
+    # training..
+    torch.save(optimizer.state_dict(), PATH)
+
+    class Model_bypass(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.fc = nn.Linear(5, 5)
+            self.bypass = nn.Linear(5, 5, bias=False)
+            torch.nn.init.eye_(self.bypass.weight)
+
+        def forward(self, x):
+            return self.fc(x) + self.bypass(x)
+
+    model2 = Model_bypass()
+    optimizer2 = optim.SGD(model2.named_parameters(), lr=0.01, momentum=0.9)
+
+    def adapt_state_dict_missing_param(optimizer, state_dict):
+        adapted_state_dict = deepcopy(optimizer.state_dict())
+        # Copy setup parameters (lr, weight_decay, etc.), in case they differ in the loaded state dict.
+        for k, v in state_dict['param_groups'][0].items():
+            if k not in ['params', 'param_names']:
+                adapted_state_dict['param_groups'][0][k] = v
+
+        lookup_dict = {
+            'fc.weight': 'fc.weight',
+            'fc.bias': 'fc.bias',
+            'bypass.weight': None,
+        }
+
+        clone_deepcopy = lambda d: {k: (v.clone() if isinstance(v, torch.Tensor) else deepcopy(v)) for k, v in d.items()}
+        for param_id, param_name in zip(
+                optimizer.state_dict()['param_groups'][0]['params'],
+                optimizer.state_dict()['param_groups'][0]['param_names']):
+            name_in_loaded = lookup_dict[param_name]
+            if name_in_loaded in state_dict['param_groups'][0]['param_names']:
+                index_in_loaded_list = state_dict['param_groups'][0]['param_names'].index(name_in_loaded)
+                id_in_loaded = state_dict['param_groups'][0]['params'][index_in_loaded_list]
+                # Copy the state of the corresponding parameter
+                if id_in_loaded in state_dict['state']:
+                    adapted_state_dict['state'][param_id] = clone_deepcopy(state_dict['state'][id_in_loaded])
+
+        return adapted_state_dict
+
+    optimizer2.register_load_state_dict_pre_hook(adapt_state_dict_missing_param)
+    optimizer2.load_state_dict(torch.load(PATH))  # the state dict saved from the previous optimizer
+
+
+As a third example, instead of loading a state according to the order of parameters (the default approach),
+this hook can be used to load according to the parameters' names::
+
+    def names_matching(optimizer, state_dict):
+        assert len(state_dict['param_groups']) == len(optimizer.state_dict()['param_groups'])
+        adapted_state_dict = deepcopy(optimizer.state_dict())
+        for g_ind in range(len(state_dict['param_groups'])):
+            assert len(state_dict['param_groups'][g_ind]['params']) == len(
+                optimizer.state_dict()['param_groups'][g_ind]['params'])
+
+            for k, v in state_dict['param_groups'][g_ind].items():
+                if k not in ['params', 'param_names']:
+                    adapted_state_dict['param_groups'][g_ind][k] = v
+
+            for param_id, param_name in zip(
+                    optimizer.state_dict()['param_groups'][g_ind]['params'],
+                    optimizer.state_dict()['param_groups'][g_ind]['param_names']):
+                index_in_loaded_list = state_dict['param_groups'][g_ind]['param_names'].index(param_name)
+                id_in_loaded = state_dict['param_groups'][g_ind]['params'][index_in_loaded_list]
+                # Copy the state of the corresponding parameter
+                if id_in_loaded in state_dict['state']:
+                    adapted_state_dict['state'][param_id] = deepcopy(state_dict['state'][id_in_loaded])
+
+        return adapted_state_dict
+
+
 Weight Averaging (SWA and EMA)
 ------------------------------
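Editorial usage note for the third documentation example above (not part of the diff): the `names_matching` hook is registered like the other hooks before calling `load_state_dict`. A minimal sketch, assuming `optimizer2` was built from `model2.named_parameters()`, the hook above is in scope, and the checkpoint path is a placeholder:

```python
PATH = "optimizer_state.pt"  # placeholder path; checkpoint saved from a named-parameter optimizer

# Match loaded states to current parameters by name rather than by position.
optimizer2.register_load_state_dict_pre_hook(names_matching)
optimizer2.load_state_dict(torch.load(PATH))
```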

test/test_optim.py

+104 −9

@@ -1341,8 +1341,12 @@ def test_optimizer_can_be_printed(self, device, dtype, optim_info):
             optimizer = optim_cls(params, **optim_input.kwargs)
             optimizer.__repr__()
 
+    @parametrize("is_named_optim0", [True, False])
+    @parametrize("is_named_optim1", [True, False])
     @optims(optim_db, dtypes=[torch.float32])
-    def test_state_dict_deterministic(self, device, dtype, optim_info):
+    def test_state_dict_deterministic(
+        self, device, dtype, optim_info, is_named_optim0, is_named_optim1
+    ):
         optim_cls = optim_info.optim_cls
 
         # Skip differentiable testing for now, see https://github.com/pytorch/pytorch/issues/116490
@@ -1356,6 +1360,17 @@ def test_state_dict_deterministic(self, device, dtype, optim_info):
         input = torch.randn(3, requires_grad=True, device=device, dtype=dtype)
         params = [weight, bias]
 
+        def make_named_param(param, is_named):
+            if not is_named:
+                return param
+            return [(f"name{i}", p) for i, p in enumerate(param)]
+
+        def without_param_names(state_dict):
+            new_state_dict = deepcopy(state_dict)
+            for pg in new_state_dict["param_groups"]:
+                pg.pop("param_names", None)
+            return new_state_dict
+
         def fwd_bwd(optim, w, b, i):
             optim.zero_grad()
             loss = (w.mv(i) + b).pow(2).sum()
@@ -1368,7 +1383,8 @@ def fwd_bwd(optim, w, b, i):
             return loss
 
         for optim_input in all_optim_inputs:
-            optimizer = optim_cls(params, **optim_input.kwargs)
+            params_in = make_named_param(params, is_named=is_named_optim0)
+            optimizer = optim_cls(params_in, **optim_input.kwargs)
             closure = functools.partial(fwd_bwd, optimizer, weight, bias, input)
 
             # Prime the optimizer
@@ -1383,8 +1399,8 @@ def fwd_bwd(optim, w, b, i):
             with torch.no_grad():
                 weight_c = Parameter(weight.clone())
                 bias_c = Parameter(bias.clone())
-
-            optimizer_c = optim_cls([weight_c, bias_c], **optim_input.kwargs)
+            params_c = make_named_param([weight_c, bias_c], is_named=is_named_optim1)
+            optimizer_c = optim_cls(params_c, **optim_input.kwargs)
             closure_c = functools.partial(fwd_bwd, optimizer_c, weight_c, bias_c, input)
 
             # Load the state dict from the original optimizer into the new one
@@ -1405,13 +1421,17 @@ def fwd_bwd(optim, w, b, i):
             self.assertEqual(bias, bias_c)
 
             # Make sure state dict is deterministic with equal (not identical) parameters
-            self.assertEqual(optimizer.state_dict(), optimizer_c.state_dict())
+            # Param names are optional and do not need to be consistent.
+            self.assertEqual(
+                without_param_names(optimizer.state_dict()),
+                without_param_names(optimizer_c.state_dict()),
+            )
 
             # Make sure repeated parameters have identical representation (see #36831)
             optimizer_c.param_groups.extend(optimizer_c.param_groups)
             self.assertEqual(
-                optimizer.state_dict()["param_groups"][-1],
-                optimizer_c.state_dict()["param_groups"][-1],
+                without_param_names(optimizer.state_dict())["param_groups"][-1],
+                without_param_names(optimizer_c.state_dict())["param_groups"][-1],
             )
 
     @optims(optim_db, dtypes=[torch.float32])
@@ -1462,8 +1482,77 @@ def fwd_bwd(optim, mod, i):
                 fwd_bwd(optimizer, model, input)
                 optimizer.step()
 
+    @parametrize("is_named_optim0", [True, False])
+    @parametrize("is_named_optim1", [True, False])
+    @optims(
+        [o for o in optim_db if not o.only_supports_sparse_grads],
+        dtypes=[torch.float32],
+    )
+    def test_can_load_from_to_named_state_dict(
+        self, device, dtype, optim_info, is_named_optim0, is_named_optim1
+    ):
+        optim_cls = optim_info.optim_cls
+
+        # Skip differentiable testing for now, see https://github.com/pytorch/pytorch/issues/116490
+        all_optim_inputs = _get_optim_inputs_including_global_cliquey_kwargs(
+            device, dtype, optim_info, skip=("differentiable",)
+        )
+        for optim_input in all_optim_inputs:
+            torch.manual_seed(1)
+            model = torch.nn.Sequential(
+                torch.nn.Conv2d(4, 2, 1, stride=2),
+                torch.nn.BatchNorm2d(2, eps=1e-05, momentum=0.1),
+            )
+            model.to(dtype=dtype, device=device)
+            input = torch.rand(1, 4, 16, 16, device=device, dtype=dtype)
+
+            def fwd_bwd(optim, mod, i):
+                optim.zero_grad()
+                loss = mod(i).sum()
+                loss.backward()
+                return loss
+
+            # test for parameters, named_parameters, and 2 groups:
+            params_to_optimizer = (
+                model.named_parameters() if is_named_optim0 else model.parameters()
+            )
+            optimizer = optim_cls(params_to_optimizer, **optim_input.kwargs)
+
+            for _ in range(3):
+                if optim_info.step_requires_closure:
+                    optimizer.step(functools.partial(fwd_bwd, optimizer, model, input))
+                else:
+                    fwd_bwd(optimizer, model, input)
+                    optimizer.step()
+
+            # old_state_dict has all new flags del'd
+            old_state_dict = deepcopy(optimizer.state_dict())
+
+            params_to_optimizer2 = (
+                model.named_parameters() if is_named_optim1 else model.parameters()
+            )
+            optimizer2 = optim_cls(params_to_optimizer2, **optim_input.kwargs)
+            optimizer2.load_state_dict(old_state_dict)
+
+            # Make sure we can still step
+            if optim_info.step_requires_closure:
+                optimizer2.step(functools.partial(fwd_bwd, optimizer2, model, input))
+            else:
+                fwd_bwd(optimizer2, model, input)
+                optimizer2.step()
+
+            # Make sure that param_names are preserved when provided to at least one of the optimizers
+            if is_named_optim0 or is_named_optim1:
+                self.assertEqual(
                    optimizer2.state_dict()["param_groups"][0]["param_names"],
+                    ["0.weight", "0.bias", "1.weight", "1.bias"],
+                )
+
+    @parametrize("is_named_optim", [True, False])
     @optims(optim_db, dtypes=[torch.float32])
-    def test_save_load_equality_with_weights_only(self, device, dtype, optim_info):
+    def test_save_load_equality_with_weights_only(
+        self, device, dtype, optim_info, is_named_optim
+    ):
         optim_cls = optim_info.optim_cls
 
         # Skip differentiable testing for now, see https://github.com/pytorch/pytorch/issues/116490
@@ -1477,6 +1566,11 @@ def test_save_load_equality_with_weights_only(self, device, dtype, optim_info):
         input = torch.randn(3, requires_grad=True, device=device, dtype=dtype)
         params = [weight, bias]
 
+        def make_named_param(param, is_named):
+            if not is_named:
+                return param
+            return [(f"name{i}", p) for i, p in enumerate(param)]
+
        def fwd_bwd(optim, w, b, i):
             optim.zero_grad()
             loss = (w.mv(i) + b).pow(2).sum()
@@ -1487,7 +1581,8 @@ def fwd_bwd(optim, w, b, i):
             return loss
 
         for optim_input in all_optim_inputs:
-            optimizer = optim_cls(params, **optim_input.kwargs)
+            params_in = make_named_param(params, is_named=is_named_optim)
+            optimizer = optim_cls(params_in, **optim_input.kwargs)
             closure = functools.partial(fwd_bwd, optimizer, weight, bias, input)
 
             # Prime the optimizer
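Editor's illustration (not part of the commit): a condensed sketch of the behavior these tests exercise. The layer names follow `nn.Sequential`'s default numbering, and the `param_names` entry appears only because the first optimizer was built from `named_parameters()`:

```python
import torch.nn as nn
import torch.optim as optim

model = nn.Sequential(nn.Linear(4, 2), nn.ReLU(), nn.Linear(2, 1))

# Constructing from named_parameters() records parameter names in the state dict.
named_opt = optim.SGD(model.named_parameters(), lr=0.01, momentum=0.9)
sd = named_opt.state_dict()
print(sd["param_groups"][0]["param_names"])  # ['0.weight', '0.bias', '2.weight', '2.bias']

# Loading is unchanged: a plain (unnamed) optimizer accepts the named state dict,
# and the 'param_names' entry is not used to match parameters.
plain_opt = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
plain_opt.load_state_dict(sd)
```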
