
Commit c13177f

edpizzi authored and pytorchmergebot committed
[FSDP] Propagate requires_grad attribute to unsharded params (pytorch#109892)
Summary: This preserves `requires_grad` in the case where all parameters within a `FlatParameter` have the same `requires_grad` value. Currently, unsharded parameters have `requires_grad=True` in some cases where the `FlatParameter` and all original parameters have `requires_grad=False`. This could be extended to support `FlatParameter`s with a mix of `requires_grad` states by extending `ParamInfo` to capture `requires_grad` for each parameter.

Test Plan: test added

Differential Revision: D49517155

Pull Request resolved: pytorch#109892

Approved by: https://github.com/awgu
1 parent ebb30bd commit c13177f
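
To make the case described in the summary concrete, here is a minimal, non-distributed sketch (not FSDP's actual flattening code; the `nn.Linear` module and hand-rolled flat tensor are illustrative assumptions) of the invariant being preserved: when every original parameter shares one `requires_grad` value, a flat parameter built from them can carry that value directly.

import torch
import torch.nn as nn

module = nn.Linear(4, 4)
for p in module.parameters():
    p.requires_grad = False  # freeze all original parameters

# All original parameters share a single requires_grad value...
uniform = len({p.requires_grad for p in module.parameters()}) == 1
# ...so a flattened parameter built from them can carry that value directly,
# which is what the commit preserves when handing back unsharded views.
flat_data = torch.cat([p.detach().reshape(-1) for p in module.parameters()])
flat_param = nn.Parameter(
    flat_data, requires_grad=next(module.parameters()).requires_grad
)

assert uniform and not flat_param.requires_grad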

File tree

2 files changed: +14 −2 lines changed


test/distributed/fsdp/test_fsdp_freezing_weights.py

+4
@@ -165,6 +165,10 @@ def test_freezing_weights(
             msg="FullyShardedDataParallel states didn't match PyTorch DDP states",
         )
 
+        if freezing_method == FreezingMethod.RequiresGrad:
+            for ddp_param, fsdp_param in zip(ddp_state, fsdp_state):
+                self.assertEqual(ddp_param.requires_grad, fsdp_param.requires_grad)
+
 
 instantiate_parametrized_tests(TestFreezingWeights)
 

torch/distributed/fsdp/flat_param.py

+10 −2
@@ -1807,13 +1807,21 @@ def _use_unsharded_views(self, as_params: bool) -> None:
                     # A `DTensor` `view` is not compatible with assigning
                     # `param.data = view`, so we cannot preserve the parameter
                     # variable.
-                    self._setattr_param(module, param_name, nn.Parameter(view))
+                    self._setattr_param(
+                        module,
+                        param_name,
+                        nn.Parameter(view, requires_grad=flat_param.requires_grad),
+                    )
                     continue
                 param = self.flat_param._params[i]
                 self._setattr_param(module, param_name, param)
                 param.data = view
             elif as_params:
-                self._setattr_param(module, param_name, nn.Parameter(view))
+                self._setattr_param(
+                    module,
+                    param_name,
+                    nn.Parameter(view, requires_grad=flat_param.requires_grad),
+                )
             else:  # `as_params=False`
                 param_var: Tensor = view
                 if self._use_orig_params: