25
25
from huggingface_hub .constants import HUGGINGFACE_HUB_CACHE
26
26
from openvino .preprocess import PrePostProcessor
27
27
from openvino .runtime import Core , Tensor , Type
28
+ from packaging .version import Version
28
29
from transformers import AutoModelForCausalLM , PretrainedConfig
29
30
from transformers .file_utils import add_start_docstrings , add_start_docstrings_to_model_forward
30
31
from transformers .generation import GenerationMixin
38
39
39
40
from ...exporters .openvino import ensure_stateful_is_available , main_export , patch_stateful
40
41
from ...exporters .openvino .stateful import model_has_state
41
- from ..utils .import_utils import is_nncf_available , is_transformers_version
42
+ from ..utils .import_utils import compare_versions , is_nncf_available , is_transformers_version
42
43
from ..utils .modeling_utils import MULTI_QUERY_ATTN_MODELS
43
44
from .configuration import (
44
45
OVConfig ,
51
52
52
53
53
54
if TYPE_CHECKING :
55
+ from transformers .generation .streamers import BaseStreamer
54
56
from transformers .modeling_utils import PreTrainedModel
55
- from transformers .streamers import BaseStreamer
56
57
57
58
58
59
logger = logging .getLogger (__name__ )
@@ -404,7 +405,10 @@ def prepare_inputs(
404
405
** kwargs ,
405
406
) -> Dict :
406
407
batch_size = input_ids .shape [0 ]
407
- if self .config .model_type == "bloom" :
408
+ model_transformers_version = Version (
409
+ self .model .rt_info ["optimum" ]["transformers_version" ].value if "optimum" in self .model .rt_info else "0.0.0"
410
+ )
411
+ if self .config .model_type == "bloom" and compare_versions (model_transformers_version , "<" , "4.44" ):
408
412
batch_size *= self .config .num_attention_heads
409
413
410
414
inputs = {}
@@ -619,7 +623,10 @@ def _deduplicate_inputs(self, model_inputs: Dict):
619
623
shape = input_tensor .shape if isinstance (input_tensor , Tensor ) else list (input_tensor .shape )
620
624
dtype = input_tensor .element_type if isinstance (input_tensor , Tensor ) else Type (input_tensor .dtype )
621
625
upd_batch_size = indicies .shape [0 ]
622
- if self .config .model_type == "bloom" :
626
+ export_transformers_version = Version (self .model .rt_info ["optimum" ]["transformers_version" ].value )
627
+ if self .config .model_type == "bloom" and compare_versions (
628
+ export_transformers_version , "<" , "4.44"
629
+ ):
623
630
upd_batch_size *= self .config .num_attention_heads
624
631
shape [
625
632
(
@@ -631,10 +638,11 @@ def _deduplicate_inputs(self, model_inputs: Dict):
631
638
upd_model_inputs [input_name ] = Tensor (dtype , shape )
632
639
upd_model_inputs ["input_ids" ] = unique_input_ids
633
640
if "beam_idx" in model_inputs :
641
+ export_transformers_version = Version (self .model .rt_info ["optimum" ]["transformers_version" ].value )
634
642
beam_range = (
635
- unique_input_ids .shape [0 ]
636
- if self .config .model_type != "bloom"
637
- else unique_input_ids .shape [0 ] * self . config . num_attention_heads
643
+ unique_input_ids .shape [0 ] * self . config . num_attention_heads
644
+ if ( self .config .model_type == "bloom" and compare_versions ( export_transformers_version , "<" , "4.44" ))
645
+ else unique_input_ids .shape [0 ]
638
646
)
639
647
beam_idx = np .arange (beam_range , dtype = int )
640
648
upd_model_inputs ["beam_idx" ] = beam_idx
@@ -781,7 +789,10 @@ def _from_pretrained(
781
789
model = cls .load_model (model_cache_path )
782
790
783
791
model_type = config .model_type .replace ("_" , "-" )
784
- if model_type == "bloom" :
792
+ export_transformers_version = Version (
793
+ model .rt_info ["optimum" ]["transformers_version" ].value if "optimum" in model .rt_info else "0.0.0"
794
+ )
795
+ if model_type == "bloom" and compare_versions (export_transformers_version , "<" , "4.44" ):
785
796
init_cls = OVBloomForCausalLM
786
797
elif model_type == "gpt-bigcode" :
787
798
init_cls = OVGPTBigCodeForCausalLM
0 commit comments