Apply suggestions from code review

eaidova · IlyasMoutawwakil · web-flow · commit 8ab6a84ff14b · 2025-03-04T15:00:43.000+04:00
Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
@@ -1243,7 +1243,7 @@ def merge_vision_text_embeddings(
 
                 # Whether to turn off right padding
                 # 1. Create a mask to know where special image tokens are
-                special_image_token_mask = torch.tensor(input_ids == image_token_index)
+                special_image_token_mask = input_ids == image_token_index
                 # special_image_token_mask: [bsz, seqlen]
                 num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
                 # num_special_image_tokens: [bsz]
@@ -1336,7 +1336,7 @@ def merge_vision_text_embeddings(
             final_attention_mask |= image_to_overwrite
             position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
         else:
-            special_image_mask = torch.tensor((input_ids == image_token_index)).unsqueeze(-1).expand_as(inputs_embeds)
+            special_image_mask = (input_ids == image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
             image_features = image_features.to(inputs_embeds.dtype)
             final_embedding = inputs_embeds.masked_scatter(special_image_mask, image_features)
             final_attention_mask = attention_mask