fix generation for statically reshaped diffusion pipeline (huggingface#1199)

eaidova · web-flow · commit 5ac35448427b · 2025-03-17T14:45:19.000+01:00
* fix generation for statically reshaped diffusion pipeline

* add test

* fix sana

* add check warnings
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
@@ -889,9 +889,7 @@ def reshape(
             )
 
         if self.text_encoder_3 is not None:
-            self.text_encoder_3.model = self._reshape_text_encoder(
-                self.text_encoder_3.model, batch_size, getattr(self.tokenizer_3, "model_max_length", -1)
-            )
+            self.text_encoder_3.model = self._reshape_text_encoder(self.text_encoder_3.model, batch_size, -1)
 
         self.clear_requests()
         return self
@@ -973,6 +971,63 @@ def __call__(self, *args, **kwargs):
         for k, v in kwargs.items():
             kwargs[k] = np_to_pt_generators(v, self.device)
 
+        height, width = None, None
+        height_idx, width_idx = None, None
+        shapes_overriden = False
+        sig = inspect.signature(self.auto_model_class.__call__)
+        sig_height_idx = list(sig.parameters).index("height") if "height" in sig.parameters else len(sig.parameters)
+        sig_width_idx = list(sig.parameters).index("width") if "width" in sig.parameters else len(sig.parameters)
+        if "height" in kwargs:
+            height = kwargs["height"]
+        elif len(args) > sig_height_idx:
+            height = args[sig_height_idx]
+            height_idx = sig_height_idx
+
+        if "width" in kwargs:
+            width = kwargs["width"]
+        elif len(args) > sig_width_idx:
+            width = args[sig_width_idx]
+            width_idx = sig_width_idx
+
+        if self.height != -1:
+            if height is not None and height != self.height:
+                logger.warning(f"Incompatible height argument provided {height}. Pipeline only support {self.height}.")
+                height = self.height
+            else:
+                height = self.height
+
+            if height_idx is not None:
+                args[height_idx] = height
+            else:
+                kwargs["height"] = height
+
+            shapes_overriden = True
+
+        # self.width != -1: the pipeline width is fixed, so it overrides whatever the caller passed
+        if self.width != -1:
+            if width is not None and width != self.width:
+                logger.warning(f"Incompatible width argument provided {width}. Pipeline only support {self.width}.")
+            # enforce the pipeline width whether or not the caller supplied one
+            width = self.width
+
+            if width_idx is not None:
+                args[width_idx] = width
+            else:
+                kwargs["width"] = width
+            shapes_overriden = True
+
+        # By default Sana generates images on a specific resolution grid and then resizes them to the requested size, which may conflict with the pipeline height / width.
+        # Disable this behavior when the pipeline height / width was enforced above.
+        if self.auto_model_class.__name__.startswith("Sana") and shapes_overriden:
+            sig_resolution_bining_idx = (
+                list(sig.parameters).index("use_resolution_binning")
+                if "use_resolution_binning" in sig.parameters
+                else len(sig.parameters)
+            )
+            if len(args) > sig_resolution_bining_idx:
+                args[sig_resolution_bining_idx] = False
+            else:
+                kwargs["use_resolution_binning"] = False
         # we use auto_model_class.__call__ here because we can't call super().__call__
         # as OptimizedModel already defines a __call__ which is the first in the MRO
         return self.auto_model_class.__call__(self, *args, **kwargs)
diff --git a/tests/openvino/test_diffusion.py b/tests/openvino/test_diffusion.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 
 import json
+import logging
 import unittest
 from pathlib import Path
 
@@ -438,6 +439,33 @@ def test_load_custom_weight_variant(self):
 
         np.testing.assert_allclose(ov_images, diffusers_images, atol=1e-4, rtol=1e-2)
 
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @require_diffusers
+    def test_static_shape_image_generation(self, model_arch):
+        pipeline = self.OVMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], compile=False)
+        pipeline.reshape(batch_size=1, height=32, width=32)
+        pipeline.compile()
+        # generation with a requested size (64x64) that conflicts with the reshaped 32x32 pipeline
+        height, width, batch_size = 64, 64, 1
+        inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size)
+        inputs["output_type"] = "pil"
+        from optimum.intel.openvino.modeling_diffusion import logger as diffusers_logger
+
+        with self.assertLogs(diffusers_logger, logging.WARN) as warning_log:
+            image = pipeline(**inputs).images[0]
+            self.assertTrue(
+                any(
+                    "Incompatible width argument provided" in log or "Incompatible height argument provided" in log
+                    for log in warning_log.output
+                )
+            )
+        self.assertTupleEqual(image.size, (32, 32))
+        # generation without height / width provided must still produce the reshaped 32x32 size
+        inputs.pop("height")
+        inputs.pop("width")
+        image = pipeline(**inputs).images[0]
+        self.assertTupleEqual(image.size, (32, 32))
+
 
 class OVPipelineForImage2ImageTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"]