@@ -3109,6 +3109,50 @@ def preprocess_inputs(
         return processed_inputs


+class _OVGotOCR2ForCausalLM(OVModelForVisualCausalLM):
+    def get_vision_embeddings(self, pixel_values, input_ids, **kwargs):
+        if input_ids is not None and input_ids.shape[1] == 1 and kwargs.get("past_key_values") is not None:
+            return None
+        return self.vision_embeddings(pixel_values).last_hidden_state
+
+    def merge_vision_text_embeddings(
+        self, vision_embeds, inputs_embeds, input_ids=None, attention_mask=None, position_ids=None, **kwargs
+    ):
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/got_ocr2/modeling_got_ocr2.py#L836-L845
+        image_features = torch.from_numpy(vision_embeds) if isinstance(vision_embeds, np.ndarray) else vision_embeds
+        inputs_embeds = torch.from_numpy(inputs_embeds) if isinstance(inputs_embeds, np.ndarray) else inputs_embeds
+        n_image_tokens = (input_ids == self.config.image_token_index).sum()
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if n_image_tokens != n_image_features:
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
+        special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
+        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        return inputs_embeds, attention_mask, position_ids
+
+    @staticmethod
+    def preprocess_inputs(
+        text: Optional[str] = None,
+        image: Optional["Image"] = None,
+        processor: Optional[AutoImageProcessor] = None,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        config: Optional[PretrainedConfig] = None,
+        video: Optional["VideoInput"] = None,
+    ):
+        if processor is None:
+            raise ValueError("processor is required")
+        if video is not None:
+            raise ValueError("Video input is not supported")
+        if image is None:
+            raise ValueError("Image is required")
+        processed_inputs = processor(image, return_tensors="pt")
+        return processed_inputs
+
+
 MODEL_TYPE_TO_CLS_MAPPING = {
     "llava": _OVLlavaForCausalLM,
     "llava_next": _OVLlavaNextForCausalLM,
@@ -3120,4 +3164,5 @@ def preprocess_inputs(
     "internvl_chat": _OVInternVLForCausalLM,
     "qwen2_vl": _OVQwen2VLForCausalLM,
     "qwen2_5_vl": _OVQwen2_5_VLForCausalLM,
+    "got_ocr2": _OVGotOCR2ForCausalLM,
 }
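
Outside the diff itself, here is a minimal usage sketch of the newly registered "got_ocr2" entry. It is not part of the change; it assumes the usual optimum-intel loading flow (OVModelForVisualCausalLM.from_pretrained with export=True), and the checkpoint id below is illustrative only.

# Minimal sketch (assumption: standard optimum-intel API; checkpoint id is illustrative).
from PIL import Image
from transformers import AutoProcessor
from optimum.intel import OVModelForVisualCausalLM

model_id = "stepfun-ai/GOT-OCR-2.0-hf"  # illustrative GOT-OCR2 checkpoint (model_type "got_ocr2")
processor = AutoProcessor.from_pretrained(model_id)
model = OVModelForVisualCausalLM.from_pretrained(model_id, export=True)  # dispatches via MODEL_TYPE_TO_CLS_MAPPING

image = Image.open("document.png")  # any image containing text to recognize
inputs = model.preprocess_inputs(image=image, processor=processor)  # _OVGotOCR2ForCausalLM.preprocess_inputs

generated = model.generate(**inputs, max_new_tokens=128)
# Strip the prompt tokens before decoding the recognized text.
new_tokens = generated[:, inputs["input_ids"].shape[1]:]
print(processor.batch_decode(new_tokens, skip_special_tokens=True)[0])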