
Commit 0245619

ariG23498 and xenova authored
Add ViTPose ONNX export (#2183)
* Add ONNX export support for ViTPose
* Build dummy inputs for ViTPose
* Move the VitPose config to a custom class
* Move input generators
* Patch VitPose models with num_experts > 1
* Formatting
* Add ViTPose export unit tests

Co-authored-by: Joshua Lochner <admin@xenova.com>
1 parent 414afab commit 0245619

File tree

7 files changed: +123 −70 lines changed

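Taken together, these changes register ViTPose under a new keypoint-detection export task. A minimal sketch of the resulting export flow, assuming an optimum build that includes this commit (the output directory name is illustrative):

from optimum.exporters.onnx import main_export

# Export a ViTPose checkpoint to ONNX via the newly registered task.
main_export(
    "usyd-community/vitpose-plus-small",
    output="vitpose_onnx",
    task="keypoint-detection",
)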

optimum/exporters/onnx/base.py

+3 −0

@@ -159,6 +159,9 @@ class OnnxConfig(ExportConfig, ABC):
         "image-to-image": OrderedDict(
             {"reconstruction": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}}
         ),
+        "keypoint-detection": OrderedDict(
+            {"heatmaps": {0: "batch_size", 1: "num_keypoints", 2: "height", 3: "width"}}
+        ),
         "mask-generation": OrderedDict({"logits": {0: "batch_size"}}),
         "masked-im": OrderedDict(
             {"reconstruction" if is_transformers_version(">=", "4.29.0") else "logits": {0: "batch_size"}}

optimum/exporters/onnx/model_configs.py

+20 −70

@@ -24,6 +24,7 @@
 from ...utils import (
     DEFAULT_DUMMY_SHAPES,
     BloomDummyPastKeyValuesGenerator,
+    Dinov2DummyInputGenerator,
     DummyAudioInputGenerator,
     DummyCodegenDecoderTextInputGenerator,
     DummyDecisionTransformerInputGenerator,
@@ -63,6 +64,8 @@
     NormalizedTextConfigWithGQA,
     NormalizedTimeSeriesForecastingConfig,
     NormalizedVisionConfig,
+    PerceiverDummyInputGenerator,
+    VitPoseDummyInputGenerator,
     is_diffusers_available,
     is_diffusers_version,
     is_transformers_version,
@@ -93,6 +96,7 @@
     SentenceTransformersTransformerPatcher,
     SpeechT5ModelPatcher,
     VisionEncoderDecoderPatcher,
+    VitPoseModelPatcher,
     WavLMModelPatcher,
 )

@@ -847,6 +851,22 @@ def outputs(self) -> Dict[str, Dict[int, str]]:
         return common_outputs


+class VitPoseOnnxConfig(ViTOnnxConfig):
+    DUMMY_INPUT_GENERATOR_CLASSES = (VitPoseDummyInputGenerator,)
+    ATOL_FOR_VALIDATION = 1e-4
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        return {"pixel_values": {0: "batch_size"}}
+
+    # Some VitPose models use multiple experts, which requires dataset_index to be provided.
+    # So, we need to patch the model for export to provide the dataset_index.
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        return VitPoseModelPatcher(self, model, model_kwargs=model_kwargs)
+
+
 class CvTOnnxConfig(ViTOnnxConfig):
     DEFAULT_ONNX_OPSET = 13
     ATOL_FOR_VALIDATION = 1e-2
@@ -892,41 +912,6 @@ class VitMSNOnnxConfig(ViTOnnxConfig):
     DEFAULT_ONNX_OPSET = 14


-class Dinov2DummyInputGenerator(DummyVisionInputGenerator):
-    def __init__(
-        self,
-        task: str,
-        normalized_config: NormalizedVisionConfig,
-        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
-        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
-        width: int = DEFAULT_DUMMY_SHAPES["width"],
-        height: int = DEFAULT_DUMMY_SHAPES["height"],
-        **kwargs,
-    ):
-        super().__init__(
-            task=task,
-            normalized_config=normalized_config,
-            batch_size=batch_size,
-            num_channels=num_channels,
-            width=width,
-            height=height,
-            **kwargs,
-        )
-
-        from transformers.onnx.utils import get_preprocessor
-
-        preprocessor = get_preprocessor(normalized_config._name_or_path)
-        if preprocessor is not None and hasattr(preprocessor, "crop_size"):
-            self.height = preprocessor.crop_size.get("height", self.height)
-            self.width = preprocessor.crop_size.get("width", self.width)
-
-    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
-        input_ = super().generate(
-            input_name=input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype
-        )
-        return input_
-
-
 class Dinov2OnnxConfig(ViTOnnxConfig):
     DUMMY_INPUT_GENERATOR_CLASSES = (Dinov2DummyInputGenerator,)

@@ -1606,41 +1591,6 @@ class Data2VecAudioOnnxConfig(AudioOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedConfig


-class PerceiverDummyInputGenerator(DummyVisionInputGenerator):
-    def __init__(
-        self,
-        task: str,
-        normalized_config: NormalizedVisionConfig,
-        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
-        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
-        width: int = DEFAULT_DUMMY_SHAPES["width"],
-        height: int = DEFAULT_DUMMY_SHAPES["height"],
-        **kwargs,
-    ):
-        super().__init__(
-            task=task,
-            normalized_config=normalized_config,
-            batch_size=batch_size,
-            num_channels=num_channels,
-            width=width,
-            height=height,
-            **kwargs,
-        )
-
-        from transformers.onnx.utils import get_preprocessor
-
-        preprocessor = get_preprocessor(normalized_config._name_or_path)
-        if preprocessor is not None and hasattr(preprocessor, "size"):
-            self.height = preprocessor.size.get("height", self.height)
-            self.width = preprocessor.size.get("width", self.width)
-
-    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
-        input_ = super().generate(
-            input_name=input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype
-        )
-        return input_
-
-
 class PerceiverOnnxConfig(TextAndVisionOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
     DUMMY_INPUT_GENERATOR_CLASSES = (
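A quick way to inspect what VitPoseOnnxConfig exposes, following the OnnxConfig(config, task=...) construction pattern used throughout this file (a sketch, not part of the commit):

from transformers import AutoConfig
from optimum.exporters.onnx.model_configs import VitPoseOnnxConfig

config = AutoConfig.from_pretrained("usyd-community/vitpose-plus-small")
onnx_config = VitPoseOnnxConfig(config, task="keypoint-detection")
print(onnx_config.inputs)   # {"pixel_values": {0: "batch_size"}}
print(onnx_config.outputs)  # the "keypoint-detection" mapping added in base.py above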

optimum/exporters/onnx/model_patcher.py

+15 −0

@@ -1338,3 +1338,18 @@ def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         if is_transformers_version(">=", "4.43"):
             CLIPSdpaAttention.forward = self.original_sdpa_forward
+
+
+class VitPoseModelPatcher(ModelPatcher):
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        # Set dataset_index (defaulting to COCO=0), otherwise we will get an error like:
+        # ValueError: dataset_index must be provided when using multiple experts (num_experts=6). Please provide dataset_index to the forward pass.
+        if model.config.backbone_config.num_experts > 1:
+            model_kwargs["dataset_index"] = torch.tensor(0, device=model.device)
+
+        super().__init__(config, model, model_kwargs)
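The patcher bakes a default dataset_index into the traced forward pass so tracing succeeds on mixture-of-experts checkpoints. In eager mode the same argument must be passed explicitly; a hedged sketch (the input resolution and per-sample index shape are assumptions):

import torch
from transformers import VitPoseForPoseEstimation

model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-plus-small")
pixel_values = torch.randn(1, 3, 256, 192)
# Without dataset_index, checkpoints with num_experts > 1 raise the ValueError quoted above;
# index 0 selects the COCO expert, mirroring the patcher's default.
outputs = model(pixel_values=pixel_values, dataset_index=torch.tensor([0]))
print(outputs.heatmaps.shape)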

optimum/exporters/tasks.py

+2 −0

@@ -329,6 +329,7 @@ class TasksManager:
         ("pt", "visual-bert", "question-answering"): ("transformers", "VisualBertForQuestionAnswering"),
         # VisionEncoderDecoderModel is not registered in AutoModelForDocumentQuestionAnswering
         ("pt", "vision-encoder-decoder", "document-question-answering"): ("transformers", "VisionEncoderDecoderModel"),
+        ("pt", "vitpose", "keypoint-detection"): ("transformers", "VitPoseForPoseEstimation"),
     }

     _ENCODER_DECODER_TASKS = (
@@ -1241,6 +1242,7 @@ class TasksManager:
             "image-classification",
             onnx="VitMSNOnnxConfig",
         ),
+        "vitpose": supported_tasks_mapping("feature-extraction", "keypoint-detection", onnx="VitPoseOnnxConfig"),
         "vits": supported_tasks_mapping(
             "text-to-audio",
             onnx="VitsOnnxConfig",

optimum/utils/__init__.py

+3 −0

@@ -56,6 +56,7 @@
     DEFAULT_DUMMY_SHAPES,
     DTYPE_MAPPER,
     BloomDummyPastKeyValuesGenerator,
+    Dinov2DummyInputGenerator,
     DummyAudioInputGenerator,
     DummyBboxInputGenerator,
     DummyCodegenDecoderTextInputGenerator,
@@ -90,6 +91,8 @@
     MCTCTDummyAudioInputGenerator,
     MistralDummyPastKeyValuesGenerator,
     MultiQueryPastKeyValuesGenerator,
+    PerceiverDummyInputGenerator,
+    VitPoseDummyInputGenerator,
 )
 from .modeling_utils import recurse_getattr, recurse_setattr
 from .normalized_config import (
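These re-exports make the relocated generators importable from the package root, e.g.:

from optimum.utils import (
    Dinov2DummyInputGenerator,
    PerceiverDummyInputGenerator,
    VitPoseDummyInputGenerator,
)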

optimum/utils/input_generators.py

+78 −0

@@ -1592,3 +1592,81 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
             return self.random_float_tensor(shape, min_value=-1, max_value=1, framework=framework, dtype=float_dtype)

         return super().generate(input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype)
+
+
+class Dinov2DummyInputGenerator(DummyVisionInputGenerator):
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"],
+        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        **kwargs,
+    ):
+        super().__init__(
+            task=task,
+            normalized_config=normalized_config,
+            batch_size=batch_size,
+            num_channels=num_channels,
+            width=width,
+            height=height,
+            **kwargs,
+        )
+
+        from transformers.onnx.utils import get_preprocessor
+
+        preprocessor = get_preprocessor(normalized_config._name_or_path)
+        if preprocessor is not None and hasattr(preprocessor, "crop_size"):
+            self.height = preprocessor.crop_size.get("height", self.height)
+            self.width = preprocessor.crop_size.get("width", self.width)
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        input_ = super().generate(
+            input_name=input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype
+        )
+        return input_
+
+
+class DummyVisionStaticInputGenerator(DummyVisionInputGenerator):
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedVisionConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
+        width: int = DEFAULT_DUMMY_SHAPES["width"],
+        height: int = DEFAULT_DUMMY_SHAPES["height"],
+        **kwargs,
+    ):
+        super().__init__(
+            task=task,
+            normalized_config=normalized_config,
+            batch_size=batch_size,
+            num_channels=num_channels,
+            width=width,
+            height=height,
+            **kwargs,
+        )
+
+        from transformers.onnx.utils import get_preprocessor
+
+        preprocessor = get_preprocessor(normalized_config._name_or_path)
+        if preprocessor is not None and hasattr(preprocessor, "size"):
+            self.height = preprocessor.size.get("height", self.height)
+            self.width = preprocessor.size.get("width", self.width)
+
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
+        input_ = super().generate(
+            input_name=input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype
+        )
+        return input_
+
+
+class PerceiverDummyInputGenerator(DummyVisionStaticInputGenerator):
+    pass
+
+
+class VitPoseDummyInputGenerator(DummyVisionStaticInputGenerator):
+    pass
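The new DummyVisionStaticInputGenerator pins the dummy height/width to whatever the checkpoint's image processor declares, instead of the generic DEFAULT_DUMMY_SHAPES. A rough sketch of the effect, assuming the checkpoint ships an image processor with a size dict (the NormalizedVisionConfig wrapping follows the pattern used elsewhere in optimum):

from transformers import AutoConfig
from optimum.utils import NormalizedVisionConfig, VitPoseDummyInputGenerator

config = AutoConfig.from_pretrained("usyd-community/vitpose-plus-small")
generator = VitPoseDummyInputGenerator(
    task="keypoint-detection",
    normalized_config=NormalizedVisionConfig(config),
)
dummy = generator.generate("pixel_values", framework="pt")
print(dummy.shape)  # height/width taken from the preprocessor's size dict when available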

tests/exporters/exporters_utils.py

+2 −0

@@ -174,6 +174,7 @@
     "vit-mae": "hf-internal-testing/tiny-random-ViTMAEModel",
     "vit-msn": "hf-internal-testing/tiny-random-ViTMSNForImageClassification",
     "vits": "echarlaix/tiny-random-vits",
+    "vitpose": "hf-internal-testing/tiny-random-VitPoseForPoseEstimation",
     "yolos": "hf-internal-testing/tiny-random-YolosModel",
     "whisper": "optimum-internal-testing/tiny-random-whisper",
     "hubert": "hf-internal-testing/tiny-random-HubertModel",
@@ -299,6 +300,7 @@
     "vit": "google/vit-base-patch16-224",
     "vit-mae": "facebook/vit-mae-base",
     "vit-msn": "facebook/vit-msn-small",
+    "vitpose": "usyd-community/vitpose-plus-small",
     "yolos": "hustvl/yolos-tiny",
     "whisper": "openai/whisper-tiny.en",
     "hubert": "facebook/hubert-base-ls960",
