huggingface · fxmarty · Apr 8, 2024 · Apr 5, 2024 · Apr 5, 2024 · Apr 8, 2024
diff --git a/src/transformers/models/vitdet/modeling_vitdet.py b/src/transformers/models/vitdet/modeling_vitdet.py
@@ -94,11 +94,12 @@ def get_absolute_positions(self, abs_pos_embeddings, has_cls_token, height, widt
         if has_cls_token:
             abs_pos_embeddings = abs_pos_embeddings[:, 1:]
         num_position = abs_pos_embeddings.shape[1]
-        size = int(math.sqrt(num_position))
+        size = int(math.sqrt(num_position))  # This is a constant and can be recorded as such in the ONNX export.
         if size * size != num_position:
             raise ValueError("Absolute position embeddings must be a square number.")
 
-        if size != height or size != width:
+        if torch.jit.is_tracing() or (size != height or size != width):
+            # nn.functional.interpolate is a no-op when size == height and size == width, but while tracing we always take this branch so that jit.trace records the interpolation.
             new_abs_pos_embeddings = nn.functional.interpolate(
                 abs_pos_embeddings.reshape(1, size, size, -1).permute(0, 3, 1, 2),
                 size=(height, width),
@@ -131,7 +132,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
 
         return embeddings
 
-
+@torch.jit.script_if_tracing  # nn.functional.interpolate's `size` needs to be dynamic.
 def get_rel_pos(q_size, k_size, rel_pos):
     """
     Get relative positional embeddings according to the relative positions of query and key sizes.
@@ -405,8 +406,7 @@ def window_partition(hidden_state, window_size):
 
     pad_height = (window_size - height % window_size) % window_size
     pad_width = (window_size - width % window_size) % window_size
-    if pad_height > 0 or pad_width > 0:
-        hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height))
+    hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height))  # Noop in case pad_width == 0 and pad_height == 0.
     patch_height, patch_width = height + pad_height, width + pad_width
 
     hidden_state = hidden_state.view(
@@ -441,8 +441,8 @@ def window_unpartition(windows, window_size, pad_height_width, height_width):
     )
     hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch_size, patch_height, patch_width, -1)
 
-    if patch_height > height or patch_width > width:
-        hidden_state = hidden_state[:, :height, :width, :].contiguous()
+    # We always have height <= patch_height and width <= patch_width, so this slice is a no-op when no padding was added.
+    hidden_state = hidden_state[:, :height, :width, :].contiguous()
     return hidden_state