Fix timestep export shapes in SD3 and Flux, and fix tests with diffusers 0.32 (#1094)

eaidova · web-flow · commit 014a8406de47 · 2024-12-26T17:43:06.000+04:00
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
@@ -1806,7 +1806,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
 
 @register_in_tasks_manager("unet", *["semantic-segmentation"], library_name="diffusers")
 @register_in_tasks_manager("unet-2d-condition", *["semantic-segmentation"], library_name="diffusers")
-class UnetOpenVINOConfig(UNetOnnxConfig):
+class UNetOpenVINOConfig(UNetOnnxConfig):
     DUMMY_INPUT_GENERATOR_CLASSES = (
         DummyUnetVisionInputGenerator,
         DummyUnetTimestepInputGenerator,
@@ -1821,10 +1821,10 @@ def inputs(self) -> Dict[str, Dict[int, str]]:
 
 @register_in_tasks_manager("sd3-transformer", *["semantic-segmentation"], library_name="diffusers")
 @register_in_tasks_manager("sd3-transformer-2d", *["semantic-segmentation"], library_name="diffusers")
-class SD3TransformerOpenVINOConfig(UNetOnnxConfig):
+class SD3TransformerOpenVINOConfig(UNetOpenVINOConfig):
     DUMMY_INPUT_GENERATOR_CLASSES = (
         (DummyTransformerTimestpsInputGenerator,)
-        + UNetOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES
+        + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES
         + (PooledProjectionsDummyInputGenerator,)
     )
     NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(
diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py
@@ -218,8 +218,8 @@ def test_shape(self, model_arch: str):
                         ),
                     )
                 else:
-                    packed_height = height // pipeline.vae_scale_factor
-                    packed_width = width // pipeline.vae_scale_factor
+                    packed_height = height // pipeline.vae_scale_factor // 2
+                    packed_width = width // pipeline.vae_scale_factor // 2
                     channels = pipeline.transformer.config.in_channels
                     self.assertEqual(outputs.shape, (batch_size, packed_height * packed_width, channels))
 
@@ -426,7 +426,7 @@ def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_
             height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type
         )
 
-        if "flux" == model_type:
+        if model_type in ["flux", "stable-diffusion-3"]:
             inputs["height"] = height
             inputs["width"] = width
 
@@ -529,8 +529,8 @@ def test_shape(self, model_arch: str):
                             ),
                         )
                     else:
-                        packed_height = height // pipeline.vae_scale_factor
-                        packed_width = width // pipeline.vae_scale_factor
+                        packed_height = height // pipeline.vae_scale_factor // 2
+                        packed_width = width // pipeline.vae_scale_factor // 2
                         channels = pipeline.transformer.config.in_channels
                         self.assertEqual(outputs.shape, (batch_size, packed_height * packed_width, channels))
 
@@ -780,8 +780,8 @@ def test_shape(self, model_arch: str):
                             ),
                         )
                     else:
-                        packed_height = height // pipeline.vae_scale_factor
-                        packed_width = width // pipeline.vae_scale_factor
+                        packed_height = height // pipeline.vae_scale_factor // 2
+                        packed_width = width // pipeline.vae_scale_factor // 2
                         channels = pipeline.transformer.config.in_channels
                         self.assertEqual(outputs.shape, (batch_size, packed_height * packed_width, channels))