
Commit f25c7c9

bdhirsh authored and pytorchmergebot committed
functionalize storage resizing, minimal ppFSDP traceable forward (pytorch#122434)
More details further down, but first a more high-level description of "how do we functionalize storage resizing".

Today, dynamo converts `param.untyped_storage().resize_(x)` calls that it sees from fsdp into a custom op, `ops.inductor.resize_storage_bytes_(x)`. Given this setup, there are 3 main cases that we want to handle:

(1) The graph input starts with a real storage size and gets resized down to zero in the graph.
(2) The graph input starts with 0 storage size and gets resized up in the graph.
(3) The graph input starts with 0 storage size, gets resized up, is used in some compute, and is then resized back down to 0.

For case (1) we need to emit a `resize_storage_bytes_` at the end of the graph, similar to how we emit `copy_()` for data mutations. For case (2), we need to emit a `resize_storage_bytes_` in the graph, and we **also** need to emit a `copy_()` (the input had its storage resized up and filled in with data, which we need to reflect as an input mutation). For case (3), the net effect is that the input had no data on entry and exit of the function, so we don't need to emit any mutable ops at the end of the graph.

The main thing to call out is that we need to write a functionalization rule for `resize_storage_bytes_` (`FunctionalTensorWrapper::storage_resize_()`), and this rule actually does very little. We would like to **not** emit any new ops in the graph (like, say, a functional resize op). Instead, we expect / rely on the fact that any resize up will be immediately followed by a `copy_()`/`foreach_copy_`/`out=` op that fills in the data of the tensor. So a `FunctionalTensor` can temporarily live in a state where its data is invalid, until the `x.copy_(y)` "updates" its data with the new tensor. Effectively, all that this rule does is:

(1) It stores metadata on the storage indicating that the tensor was resized, as well as the updated storage size. We need this info in AOTAutograd so it knows whether to emit a mutable `resize_()` op in the graph epilogue.
(2) There is also a corner case: if we are resizing down to zero, but our tensor had **previously** had a zero-size storage, then we update `value_` to point to the original value of the tensor. The reason this seems safe is that if we have a zero-storage-sized tensor `x`, resize it up, use it in some compute, resize it back down to zero, and use it somewhere, we would want the functional version of this code to use the original `x` after the second resize. For FSDP this is important because we end up saving parameters (graph inputs) for backward, and we want to make sure that the thing we save (and the output of the forward graph) is the original, zero-storage-sized parameter, and not "version 2" of the parameter after the first `resize_()`.
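To make the three cases concrete, here is a minimal sketch (not taken from this PR's test suite) of the eager patterns they correspond to; the byte counts and surrounding compute are made up for illustration, and the comments describe what AOTAutograd is expected to emit in each case:

```python
import torch

def case1_resize_down(param: torch.Tensor) -> torch.Tensor:
    # (1) starts with real storage, resized down to zero inside the graph:
    # a mutable resize_storage_bytes_(param, 0) must be emitted in the epilogue.
    out = param * 2
    param.untyped_storage().resize_(0)
    return out

def case2_resize_up(param: torch.Tensor, src: torch.Tensor) -> torch.Tensor:
    # (2) starts with zero storage, resized up and filled in:
    # both a resize_storage_bytes_ and a copy_() (input mutation) must be emitted.
    param.untyped_storage().resize_(src.numel() * src.element_size())
    with torch.no_grad():
        param.copy_(src)
    return param.sum()

def case3_roundtrip(param: torch.Tensor, src: torch.Tensor) -> torch.Tensor:
    # (3) zero storage on entry and exit: the resizes cancel out, so no mutable
    # ops need to be emitted at the end of the graph.
    param.untyped_storage().resize_(src.numel() * src.element_size())
    with torch.no_grad():
        param.copy_(src)
    out = param @ param
    param.untyped_storage().resize_(0)
    return out
```

Under torch.compile, dynamo traces the `untyped_storage().resize_()` calls above into the `ops.inductor.resize_storage_bytes_` custom op described earlier.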
I think a good order to look at the changes in this PR would be:

(1) `test_aotdispatch.py` shows the 3 main cases I focused on, as well as the expected functionalized graphs.
(2) In `FunctionalStorageImpl.h/cpp`, I had to add a notion of "original base" and "original/curr_size". The first is so I can re-use the zero-size tensor after multiple resizes, and the second is so I can tell in AOTAutograd whether any resizes canceled each other out into a no-op.
(3) `FunctionalTensorWrapper.h/cpp` has the new resize functionalization rule plus some extra utils.
(4) `_functorch/_autograd`: the main changes in this folder were around adding the logic at trace time to detect when we need to put a `resize_()` in the graph. I also have some assertions to check that any inputs that experience storage resizing will **always be in the graph** and not the opaque epilogue, and I also limited the `resize_()` mutation case so that you can only ever start with zero storage or end with zero storage (you can't do e.g. `torch.ones(2).storage().resize_(3)`), and banned it on tensor subclasses.
(5) `fake_tensor.py`/`meta_utils.py`: we now need to be able to fakeify tensors with zero storage, so I added a quick version of it in meta_utils.py. This also has ramifications for fake tensor caching that I need to fix (include the storage size on the cache key, maybe?).

------------------

This PR subsumes pytorch#120971. It is enough to **almost** get a simple ppFSDP forward pass tracing with a functionalized `resize_()` properly. It also attempts the updated version from @jansel, where we don't have any notion of `resize_()` in the graph at all, post functionalization. It would probably be good to test it with @yf225's FSDP changes and see how many of the FX passes it allows us to remove. I think that, in theory, it should allow us to remove all FX passes that affect the forward graph / partitioner, **except** the one that forces views to be recomputed in the backward (more details below).

There are a few things worth calling out:

(1) Failed attempt at functionalizing `aten.copy_()`. I originally wanted to get a version that takes these operations:

```
param.storage().resize_(all_gather_size)
param.copy_(all_gather_buffer)
out = aten.matmul(param, param)
```

and functionalizes them into:

```
out = aten.matmul(all_gather_buffer, all_gather_buffer)
```

This would involve getting functionalization to turn `x.copy_(y)` into a giant no-op that just returns `y`. Unfortunately, we can't actually do this in a reasonable way within functionalization (instead, there's a functional `aten.copy` in the graph - see the test case graph expecttest for details). Why? In order for that transformation to be safe, `x` and `y` need to have the same metadata. However, it's possible for `x` and `y` to be subclasses of different types. This is not something we can easily tell from within functionalization, and it would be a layering violation. So for now I'm leaving it to downstream code to optimize away the `aten.copy` (this is already the case today, so I think inductor can handle this).

(2) The forward doesn't **actually** run successfully in this PR (see the `assertRaisesRegex` in the test). Why? The final forward graph looks like this:

```
def forward(self, primals_1, primals_2):
    _foreach_copy = torch.ops.aten._foreach_copy.default([primals_1], [primals_2]);  primals_2 = None
    getitem = _foreach_copy[0];  _foreach_copy = None
    mm = torch.ops.aten.mm.default(getitem, getitem);  getitem = None
    t_1 = torch.ops.aten.t.default(primals_1);  primals_1 = None
    return [mm, t_1]
```

Here `primals_1` starts out as a secretly-zero-storage-size parameter, and gets resized up and back down within the forward (these resizes are functionalized away). Importantly, the matmul happens on the result of the `_foreach_copy`, **but** the activation that we save for backward (`t_1`) is the result of transposing the **original parameter** (the zero-storage-size param). This is exactly the optimization in fsdp that allows us to have good peak memory usage. The problem is that the min-cut partitioner decides to save `t_1` for backward. Running this code in eager breaks, because the kernel for `aten.permute(x)` is not happy when `x` has secretly-zero-sized storage.
The real problem here is that in eager mode the `permute` kernel runs during the backward, after the backward hooks have properly resized the saved activation; here, we are running the transpose in the forward. One option would be to turn off the checks in our view kernels and allow them to work on zero-storage-sized tensors, which feels pretty bad. Another option is to tweak the partitioner (or use one of Will's FX passes) to force the partitioner to not save views for backward, and allow the views to be recomputed in the backward. This seems kind of silly, but is also probably harmless.

(3) The backward is still broken. To be fair, this issue is pretty separable from "functionalizing storage resize calls" and can be fixed later (either by a real fix to our tracing infra, or via another hacky FX pass). More detail on this problem is given at issue (8) of my PR description in pytorch#120971.

(4) I only added support for "full graph" resizing: basically, the limited case where a param starts with zero storage size and gets resized up and back down. I think we can add support for the graph-break case, but we can keep that add-on separate from this PR unless we need it immediately. I also added asserts so we should fail loudly when we hit this case.

(5) I have a change to FakeTensor creation when inputs have zero storage size that is probably OK. But I also removed FakeTensor caching on view ops, which I probably need to fix before I can land this PR.

(6) I added a notion of "original_base" to `FunctionalStorageImpl`. More details are in the comments, but my rationale for this was that we basically need it to ensure that autograd saves the **original**, zero-storage-sized param for backward, after resizing up and back down.

(7) I had to update our eager kernels for `aten.copy` and `aten._foreach_copy` to handle the case where the `self` argument has secretly-zero storage. Inductor can probably generate correct code for this case, but we need these ops to work properly in this situation for the `aot_eager` backend to do the right thing.
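For intuition, here is a rough Python rendering (the helper name is made up; the real change is to the C++ `copy`/`_foreach_copy` kernels in `aten/src/ATen/native/Copy.cpp` below) of what the updated eager `copy` kernel does when `self` has secretly-zero storage:

```python
import torch

def functional_copy_sketch(self_t: torch.Tensor, src: torch.Tensor, non_blocking: bool = False) -> torch.Tensor:
    # Illustrative sketch of the eager copy() kernel behavior added in this PR.
    if self_t.untyped_storage().nbytes() == 0:
        # self has "secretly zero" storage: we can't clone it, so materialize an
        # empty tensor with the same sizes/strides and let copy_() fill it in.
        r = torch.empty_strided(
            self_t.size(), self_t.stride(), dtype=self_t.dtype, device=self_t.device
        )
    else:
        r = self_t.clone()  # the C++ kernel uses clone_preserve_strides()
    r.copy_(src, non_blocking=non_blocking)
    return r
```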
Pull Request resolved: pytorch#122434
Approved by: https://github.com/jansel

1 parent f42ea14 commit f25c7c9

File tree

43 files changed: +704 -64 lines changed


aten/src/ATen/FunctionalStorageImpl.cpp

+10 -1

@@ -97,7 +97,16 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
       /*resizable=*/true
     ),
     base_(base)
-  {
+  {
+  // SparseTensorImpl has no storage, so we cannot query its nbytes.
+  // (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse)
+  // Same for XLA
+  if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) {
+    original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
+  } else {
+    original_storage_size_ = -1;
+  }
+  curr_storage_size_ = original_storage_size_;
   TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
 }

aten/src/ATen/FunctionalStorageImpl.h

+24

@@ -105,6 +105,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
     frozen_ = true;
   }

+  c10::SymInt get_storage_size(bool before) {
+    if (before) {
+      return original_storage_size_;
+    } else {
+      return curr_storage_size_;
+    }
+  }
+
   ~FunctionalStorageImpl() override = default;

   void mark_mutation() {
@@ -132,6 +140,15 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
     return mutation_counter_ <= mutation_counter_hidden_from_autograd_;
   }

+  void mark_inductor_storage_resize(c10::SymInt new_size) {
+    inductor_storage_resized_ = true;
+    curr_storage_size_ = new_size;
+  }
+
+  bool was_inductor_storage_resized() {
+    return inductor_storage_resized_;
+  }
+
 private:
  // NB: base_ should always point to a tensor BELOW the current
  // functionalization layer. This is mainly to avoid reference cycles. e.g.
@@ -172,6 +189,13 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
  uint64_t mutation_counter_during_no_grad_or_inference_mode_ = 0;
  uint64_t mutation_counter_ = 0;
  uint64_t mutation_counter_hidden_from_autograd_ = 0;
+
+ // Used to tell if:
+ // (1) There were any storage resizes on a graph input
+ // (2) The original/curr storage size tell us if these resizes result in a nop
+ bool inductor_storage_resized_ = false;
+ c10::SymInt original_storage_size_;
+ c10::SymInt curr_storage_size_;
 };

} // namespace at::functionalization
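To make the intent of these accessors concrete, here is a hypothetical sketch of the decision a consumer like AOTAutograd needs to make with this information when building the graph epilogue; the class and function names below are illustrative only, not the actual AOTAutograd code:

```python
from dataclasses import dataclass

@dataclass
class InputStorageInfo:
    # Hypothetical mirror of the C++ state tracked above, for illustration only.
    was_resized: bool      # mirrors was_inductor_storage_resized()
    original_nbytes: int   # mirrors get_storage_size(/*before=*/true)
    curr_nbytes: int       # mirrors get_storage_size(/*before=*/false)

def needs_epilogue_resize(info: InputStorageInfo) -> bool:
    # No resize calls at all, or resizes that cancel out (same size on entry
    # and exit of the graph, e.g. zero -> N -> zero), require nothing extra.
    if not info.was_resized:
        return False
    return info.original_nbytes != info.curr_nbytes
```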

aten/src/ATen/FunctionalTensorWrapper.cpp

+26

@@ -276,6 +276,32 @@ void FunctionalTensorWrapper::set__impl(const FunctionalTensorWrapper* other) {
   set_sizes_and_strides(sizes_, strides_, storage_offset_);
 }

+void FunctionalTensorWrapper::storage_resize_(c10::SymInt new_size) {
+  auto curr_storage_size = value_.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
+  // storage resizing is severely limited: we only support resizing either to zero, or from zero bytes.
+  TORCH_CHECK(new_size == 0 || curr_storage_size == 0, "new_size: ", new_size, ". curr_storage_size: ", curr_storage_size);
+  // The "functionalization rule" for storage resizing is a giant no-op, mainly because we don't want
+  // resize_() calls to actually emit any ops in the functional graph.
+  // How does it work?
+  // Resizing up (old size == 0):
+  //   We do nothing in this case.
+  //   The expectation is that, for the user code to be valid, the next op that runs against the current tensor "x"
+  //   will be a x.copy_(y) (or similar) that fully overwrites the data of x.
+  //   If there are any outstanding aliases of x, we expect them not to be used until after the copy_() call
+  //   (otherwise the eager code would be invalid),
+  //   and therefore functionalization will regenerate the aliases off of the result of `x.copy(y)`.
+  // Resizing down (new size == 0):
+  //   We also do nothing in this case. The assumption is that after resizing a tensor down,
+  //   it is fully unused in the program (unless it is first resized back up and has data copied in),
+  //   although it might be saved for backward, which happens in FSDP.
+  //   The expected pattern is that the param will then be resized back up from zero in the backward.

+  // Mark the tensor as having its storage resized.
+  // This is so we can detect it for inputs in AOTAutograd and error / emit
+  // an input mutation resize_() appropriately
+  functional_storage_impl()->mark_inductor_storage_resize(new_size);
+}
+
 void FunctionalTensorWrapper::maybe_replace_storage(const Tensor& other) {
   // Note [resize_() in functionalization pass]
   // resize_() is a special operator in functionalization because it can reallocate its underlying storage.

aten/src/ATen/FunctionalTensorWrapper.h

+13

@@ -141,6 +141,9 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
   // Custom implementation of self.set_(src)
   void set__impl(const FunctionalTensorWrapper* other);

+  // Custom implementation of resize_storage_bytes_(self, new_size)
+  void storage_resize_(c10::SymInt new_size);
+
   // Returns whether the current tensor's data was ever mutated
   bool has_data_mutation();
   //
@@ -150,6 +153,16 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
     return was_storage_changed_;
   }

+  c10::SymInt get_storage_size(bool before) {
+    return functional_storage_impl()->get_storage_size(before);
+  }
+
+  // Returns whether the FunctionalTensor experienced an
+  // untyped_storage().resize_() call
+  bool was_inductor_storage_resized() {
+    return functional_storage_impl()->was_inductor_storage_resized();
+  }
+
   // The functionalization pass can be used to remove mutations.
   // It does so by replacing any mutation op with it's corresponding
   // out-of-place op, followed by a call to replace_(). e.g:

aten/src/ATen/FunctionalizeFallbackKernel.cpp

+3

@@ -335,6 +335,9 @@ static at::Tensor& set__functionalize(at::Tensor& self, const at::Tensor& src) {
   TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(src));
   auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
   auto src_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(src);
+  // See Note [Ordering of resize_() and set_()]
+  TORCH_CHECK(!self_impl->was_inductor_storage_resized(),
+    "storage_resize_() followed by set_() in torch.compile is not supported today");
   self_impl->set__impl(src_impl);
   return self;
 }
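In user-code terms, the pattern this new check rejects under torch.compile is roughly the following (illustrative sketch):

```python
import torch

def rejected_pattern(param: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
    # Resize the (functionalized) storage, then swap the tensor's storage with set_().
    # Under torch.compile, functionalization now hits the TORCH_CHECK added above and
    # raises, since this ordering is not supported today.
    param.untyped_storage().resize_(other.numel() * other.element_size())
    param.set_(other)
    return param
```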

aten/src/ATen/native/Copy.cpp

+36 -1

@@ -1,5 +1,6 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/native/Copy.h>
+#include <ATen/native/Copy.h>

 #include <ATen/core/Tensor.h>
 #include <ATen/Dispatch.h>
@@ -25,8 +26,12 @@
 #include <ATen/ops/_copy_from.h>
 #include <ATen/ops/_propagate_xla_data.h>
 #include <ATen/ops/_propagate_xla_data_native.h>
+#include <ATen/ops/copy.h>
 #include <ATen/ops/copy_native.h>
+#include <ATen/ops/_foreach_copy.h>
+#include <ATen/ops/_foreach_copy_native.h>
 #include <ATen/ops/empty.h>
+#include <ATen/ops/empty_strided.h>
 #include <ATen/ops/expand_copy.h>
 #endif

@@ -303,15 +308,45 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking)
   return self;
 }

+Tensor copy_meta(const Tensor& self, const Tensor& src, bool non_blocking) {
+  // Must directly use self(), so we can dispatch properly if self is a subclass
+  auto r = clone_preserve_strides(self);
+  r.copy_(src, non_blocking);
+  return r;
+}
+
 Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) {
+  at::Tensor r;
   // copy() is the "functional" form of copy_(). It exists so we can properly functionalize copy_(), but:
   // (1) It isn't exposed to the frontend (no python bindings)
   // (2) It isn't exposed to the backend (it's a composite, that decomposes into to() and expand_as() calls.
-  auto r = clone_preserve_strides(self);
+  auto self_storage = self.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl();
+  // If self has no real storage, we can't actually clone it.
+  // Instead, generate an empty tensor with the right sizes/strides, since we should be able to assume
+  // that copy_() will fully overwrite all data with that of src
+  if (self_storage->nbytes() == 0) {
+    r = at::empty_strided(self.sizes(), self.strides());
+  } else {
+    r = clone_preserve_strides(self);
+  }
   r.copy_(src, non_blocking);
   return r;
 }

+::std::vector<at::Tensor> _foreach_copy(at::TensorList self, at::TensorList src, bool non_blocking) {
+  std::vector<at::Tensor> outs;
+  outs.reserve(self.size());
+  // This is a very slow implementation, but it needs to directly call the copy() kernel above to handle
+  // the case where self has zero storage.
+  // This kernel should never really be run, except when debugging with compile(backend="aot_eager")
+  for (const auto i : c10::irange(src.size())) {
+    auto curr_src = src[i];
+    auto curr_self = self[i];
+    outs.push_back(at::copy(curr_self, curr_src, non_blocking));
+  }
+  return outs;
+}
+
 Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) {
   auto maybe_outnames = namedinference::compute_broadcast_outnames(self, src);
   {

aten/src/ATen/native/native_functions.yaml

+8 -1

@@ -1750,6 +1750,7 @@
 - func: copy(Tensor self, Tensor src, bool non_blocking=False) -> Tensor
   variants: function
   dispatch:
+    Meta: copy_meta
     CompositeExplicitAutogradNonFunctional: copy
   tags: core

@@ -11357,7 +11358,13 @@
   dispatch:
     CPU: foreach_tensor_copy_list_kernel_slow_
     CUDA: foreach_tensor_copy_list_kernel_cuda_
-  autogen: _foreach_copy, _foreach_copy.out
+  autogen: _foreach_copy.out
+
+- func: _foreach_copy(Tensor[] self, Tensor[] src, bool non_blocking=False) -> Tensor[] self_out
+  device_check: NoCheck
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: _foreach_copy

 - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor
   dispatch:
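The new functional `_foreach_copy` overload is what appears in the traced forward graph; called directly, it returns fresh outputs instead of mutating `self`. A small usage sketch (assuming a build that includes this change):

```python
import torch

a = torch.zeros(4, 4)
b = torch.randn(4, 4)

# The functional variant returns new tensors rather than writing into `a`.
(out,) = torch.ops.aten._foreach_copy.default([a], [b])
assert torch.equal(out, b)
assert torch.equal(a, torch.zeros(4, 4))  # `a` is left untouched
```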

test/dynamo/test_repros.py

+89

@@ -160,6 +160,32 @@ def shapes_to_tensor(x, device=None):
     return torch.as_tensor(x, device=device)


+fw_graph = [None]
+bw_graph = [None]
+
+
+def aot_graph_capture_backend(gm, args):
+    from functorch.compile import min_cut_rematerialization_partition
+    from torch._functorch.aot_autograd import aot_module_simplified
+
+    def fw_compiler(gm, _):
+        fw_graph[0] = gm
+        return gm
+
+    def bw_compiler(gm, _):
+        bw_graph[0] = gm
+        return gm
+
+    return aot_module_simplified(
+        gm,
+        args,
+        fw_compiler,
+        bw_compiler,
+        partition_fn=min_cut_rematerialization_partition,
+        keep_inference_input_mutations=True,
+    )
+
+
 class Boxes:
     # from detectron2 poolers.py
     def __init__(self, tensor: torch.Tensor):
@@ -4644,6 +4670,69 @@ def fn(instances):
         self.assertEqual(type(actual), type(expected))
         self.assertEqual(actual.__dict__, expected.__dict__)

+    def test_storage_resize_forward_full_graph(self):
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.param = torch.nn.Parameter(torch.randn(4, 4))
+
+            def forward(self, x):
+                self.param.untyped_storage().resize_(
+                    self.param.numel() * self.param.itemsize
+                )
+                with torch.no_grad():
+                    torch._foreach_copy_([self.param], [x])
+                out = torch.matmul(self.param, self.param)
+                self.param.untyped_storage().resize_(0)
+                return out
+
+        def post_accumulate_grad_hook(param):
+            param.untyped_storage().resize_(0)
+
+        # Beginning of backward, resize and put data into the param
+        def pre_backward_hook(module, grad) -> None:
+            module.param.untyped_storage().resize_(
+                self.param.numel() * self.param.itemsize
+            )
+            with torch.no_grad():
+                # simulates loading data into param from allgather
+                module.param.fill_(2)
+
+        def post_forward_hook(module, args, output):
+            output.register_hook(functools.partial(pre_backward_hook, module))
+
+        x = torch.randn(4, 4)
+
+        mod_ref = TestModule()
+        mod_test = deepcopy(mod_ref)
+
+        # Start the param off with zero storage size to mimic fsdp
+        mod_ref.param.untyped_storage().resize_(0)
+        mod_test.param.untyped_storage().resize_(0)
+
+        # Resize storage at beginning of backward
+        # Free storage at end of backward
+        mod_ref.register_forward_hook(post_forward_hook, prepend=False)
+        mod_ref.param.register_post_accumulate_grad_hook(post_accumulate_grad_hook)
+        mod_test.register_forward_hook(post_forward_hook, prepend=False)
+        mod_test.param.register_post_accumulate_grad_hook(post_accumulate_grad_hook)
+
+        mod_test = torch.compile(mod_test, backend=aot_graph_capture_backend)
+
+        out_ref = mod_ref(x)
+        out_test = mod_test(x)
+        self.assertExpectedInline(
+            str(fw_graph[0].code.strip()),
+            """\
+def forward(self, primals_1, primals_2):
+    _foreach_copy = torch.ops.aten._foreach_copy.default([primals_1], [primals_2]);  primals_1 = primals_2 = None
+    getitem = _foreach_copy[0];  _foreach_copy = None
+    mm = torch.ops.aten.mm.default(getitem, getitem)
+    t_1 = torch.ops.aten.t.default(getitem);  getitem = None
+    return [mm, t_1]""",
+        )
+        self.assertEqual(out_ref, out_test)
+
     def test_super_in_staticmethod(self):
         class A:
             @staticmethod

The following 17 files contain whitespace-only changes:

test/dynamo_expected_failures/FakeTensorTest.test_embedding_bag_meta
test/dynamo_expected_failures/TestNN.test_linear_autograd_device_cpu_bias_weightCSC
test/dynamo_expected_failures/TestNN.test_linear_autograd_device_cpu_bias_weightCSR
test/dynamo_expected_failures/TestNN.test_linear_autograd_device_cuda_bias_weightCOO
test/dynamo_expected_failures/TestNN.test_linear_autograd_device_cuda_bias_weightCSC
test/dynamo_expected_failures/TestNN.test_linear_autograd_device_cuda_bias_weightCSR
test/dynamo_expected_failures/TestNN.test_linear_autograd_device_cuda_nobias_weightCOO
test/dynamo_expected_failures/TestNN.test_swap_module_params_fails_after_forward
test/dynamo_expected_failures/TestNNParametrizationDeviceCPU.test_weight_norm_parametrization_swap_False_cpu
test/dynamo_expected_failures/TestNNParametrizationDeviceCPU.test_weight_norm_parametrization_swap_True_cpu
test/dynamo_expected_failures/TestNNParametrizationDeviceCUDA.test_weight_norm_parametrization_swap_False_cuda
test/dynamo_expected_failures/TestNNParametrizationDeviceCUDA.test_weight_norm_parametrization_swap_True_cuda
test/dynamo_expected_failures/TestNestedTensorDeviceTypeCPU.test_embedding_jagged_cpu
test/dynamo_skips/TestConvolutionNN.test_ConvTranspose2d_output_size_downsample_upsample
test/dynamo_skips/TestConvolutionNNDeviceTypeCPU.test_conv2d_no_grad_cpu_float32
test/dynamo_skips/TestNNParametrization.test_new_spectral_norm_dim_swap_False
test/dynamo_skips/TestVmapOperators.test_conv2d
