@@ -349,13 +349,13 @@ def __init__(
349
349
language_model : ov .Model ,
350
350
text_embeddings : ov .Model ,
351
351
vision_embeddings : ov .Model ,
352
- lm_head : Optional [ov .Model ] = None ,
353
352
config : PretrainedConfig = None ,
354
353
device : str = "CPU" ,
355
354
dynamic_shapes : bool = True ,
356
355
ov_config : Optional [Dict [str , str ]] = None ,
357
356
model_save_dir : Optional [Union [str , Path , TemporaryDirectory ]] = None ,
358
357
quantization_config : Union [OVWeightQuantizationConfig , Dict ] = None ,
358
+ lm_head : Optional [ov .Model ] = None ,
359
359
** kwargs ,
360
360
):
361
361
self .config = config
@@ -717,6 +717,9 @@ def components(self):
717
717
def _submodel_names(self):
    """Return the attribute names of the OpenVINO submodels composing this model.

    The three core parts (language model, text embeddings, vision embeddings)
    are always present. Each entry of ``self.additional_parts`` contributes a
    ``<part>_model`` name only when the corresponding attribute exists and is
    not ``None``; ``lm_head`` is special-cased because it may be stored
    directly under its ``lm_head_model`` attribute.
    """
    names = ["lm_model", "text_embeddings_model", "vision_embeddings_model"]
    for part in self.additional_parts:
        model_attr = part + "_model"
        # lm_head may live under the ``*_model`` attribute itself — probe it first.
        if part == "lm_head" and getattr(self, model_attr, None) is not None:
            names.append(model_attr)
        # Fall through (as the original ``continue`` did): every other part —
        # and an lm_head whose ``lm_head_model`` is unset — is keyed by the
        # bare part name.
        elif getattr(self, part, None) is not None:
            names.append(model_attr)
    return names
@@ -2472,6 +2475,7 @@ def generate_image(
2472
2475
image_token_num_per_image : int = 576 ,
2473
2476
img_size : int = 384 ,
2474
2477
patch_size : int = 16 ,
2478
+ generator = None
2475
2479
):
2476
2480
from PIL import Image
2477
2481
@@ -2520,7 +2524,7 @@ def generate_image(
2520
2524
logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond )
2521
2525
probs = torch .softmax (logits / temperature , dim = - 1 )
2522
2526
2523
- next_token = torch .multinomial (probs , num_samples = 1 )
2527
+ next_token = torch .multinomial (probs , num_samples = 1 ) if generator is None else torch . multinomial ( probs , num_samples = 1 , generator = generator )
2524
2528
generated_tokens [:, i ] = next_token .squeeze (dim = - 1 )
2525
2529
2526
2530
next_token = torch .cat ([next_token .unsqueeze (dim = 1 ), next_token .unsqueeze (dim = 1 )], dim = 1 ).view (- 1 )
@@ -2563,11 +2567,10 @@ def preprocess_inputs(
2563
2567
},
2564
2568
{"role" : "<|Assistant|>" , "content" : "" },
2565
2569
]
2566
- prompt = None
2570
+ prepare_inputs = processor ( conversations = conversation , images = [ image ], force_batchify = True )
2567
2571
else :
2568
- conversation = None
2569
- prompt = text
2570
- prepare_inputs = processor (prompt = prompt , conversations = conversation , images = [image ], force_batchify = True )
2572
+ tokenizer = tokenizer if tokenizer is not None else processor .tokenizer
2573
+ prepare_inputs = tokenizer (text , return_tensors = "pt" )
2571
2574
required_keys = ["input_ids" , "pixel_values" , "images_seq_mask" , "images_emb_mask" ]
2572
2575
inputs = {}
2573
2576
for key in required_keys :
0 commit comments