Skip to content

Commit d777896

Browse files
committed
Merge branch 'main' into ea/qwen25vl
2 parents 107d7ef + 8c94f53 commit d777896

12 files changed

+710
-546
lines changed

docs/source/openvino/models.mdx

+1
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ Here is the list of the supported architectures :
103103
- Qwen2(Qwen1.5, Qwen2.5)
104104
- Qwen2MoE
105105
- Qwen2VL
106+
- Qwen2.5VL
106107
- ResNet
107108
- Roberta
108109
- Roformer

notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb

+420-506
Large diffs are not rendered by default.

optimum/exporters/openvino/__main__.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
2424
from requests.exceptions import ConnectionError as RequestsConnectionError
25-
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
25+
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase, ProcessorMixin
2626
from transformers.utils import is_torch_available
2727

2828
from openvino.runtime import Core, Type, save_model
@@ -531,10 +531,15 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro
531531

532532
if is_openvino_tokenizers_available():
533533
if library_name != "diffusers" and preprocessors:
534+
processor_chat_template = None
534535
tokenizer = next(filter(lambda it: isinstance(it, PreTrainedTokenizerBase), preprocessors), None)
536+
if len(preprocessors) > 1:
537+
for processor in preprocessors:
538+
if isinstance(processor, ProcessorMixin) and hasattr(processor, "chat_template"):
539+
processor_chat_template = processor.chat_template
535540
if tokenizer:
536541
try:
537-
export_tokenizer(tokenizer, output, task=task)
542+
export_tokenizer(tokenizer, output, task=task, processor_chat_template=processor_chat_template)
538543
except Exception as exception:
539544
logger.warning(
540545
"Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer "

optimum/exporters/openvino/convert.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
remove_none_from_dummy_inputs,
7272
save_config,
7373
save_preprocessors,
74+
set_simplified_chat_template,
7475
)
7576

7677

@@ -825,6 +826,7 @@ def export_tokenizer(
825826
output: Union[str, Path],
826827
suffix: Optional[str] = "",
827828
task: Optional[str] = None,
829+
processor_chat_template: Optional[str] = None,
828830
):
829831
# avoid circular imports
830832
from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME
@@ -849,7 +851,7 @@ def export_tokenizer(
849851

850852
if (
851853
task is not None
852-
and task.startswith("text-generation")
854+
and (task.startswith("text-generation") or task == "image-text-to-text")
853855
and compare_versions("openvino-tokenizers", ">=", "2024.3.0.0")
854856
):
855857
logger.info(f"Set tokenizer padding side to left for `{task}` task.")
@@ -858,6 +860,8 @@ def export_tokenizer(
858860

859861
try:
860862
converted = convert_tokenizer(tokenizer, with_detokenizer=True)
863+
set_simplified_chat_template(converted[0], processor_chat_template)
864+
861865
except NotImplementedError:
862866
logger.info("Detokenizer is not supported, convert tokenizer only.")
863867
converted = convert_tokenizer(tokenizer, with_detokenizer=False)

optimum/exporters/openvino/utils.py

+33
Large diffs are not rendered by default.

optimum/intel/openvino/configuration.py

+8
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,14 @@ class OVQuantizationMethod(str, Enum):
210210
"quant_method": OVQuantizationMethod.AWQ,
211211
"scale_estimation": True,
212212
},
213+
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B": {
214+
"bits": 4,
215+
"sym": False,
216+
"group_size": 64,
217+
"ratio": 0.8,
218+
"dataset": "wikitext2",
219+
"quant_method": OVQuantizationMethod.AWQ,
220+
},
213221
}
214222

215223
_DEFAULT_4BIT_CONFIG = {

optimum/intel/openvino/modeling.py

+25-25
Original file line numberDiff line numberDiff line change
@@ -174,9 +174,9 @@ def forward(
174174

175175
np_inputs = isinstance(input_ids, np.ndarray)
176176
if not np_inputs:
177-
input_ids = np.array(input_ids)
178-
attention_mask = np.array(attention_mask)
179-
token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids
177+
input_ids = input_ids.cpu().numpy()
178+
attention_mask = attention_mask.cpu().numpy()
179+
token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids
180180

181181
inputs = {
182182
"input_ids": input_ids,
@@ -239,9 +239,9 @@ def forward(
239239

240240
np_inputs = isinstance(input_ids, np.ndarray)
241241
if not np_inputs:
242-
input_ids = np.array(input_ids)
243-
attention_mask = np.array(attention_mask)
244-
token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids
242+
input_ids = input_ids.cpu().numpy()
243+
attention_mask = attention_mask.cpu().numpy()
244+
token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids
245245

246246
inputs = {
247247
"input_ids": input_ids,
@@ -308,9 +308,9 @@ def forward(
308308

309309
np_inputs = isinstance(input_ids, np.ndarray)
310310
if not np_inputs:
311-
input_ids = np.array(input_ids)
312-
attention_mask = np.array(attention_mask)
313-
token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids
311+
input_ids = input_ids.cpu().numpy()
312+
attention_mask = attention_mask.cpu().numpy()
313+
token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids
314314

315315
inputs = {
316316
"input_ids": input_ids,
@@ -379,9 +379,9 @@ def forward(
379379

380380
np_inputs = isinstance(input_ids, np.ndarray)
381381
if not np_inputs:
382-
input_ids = np.array(input_ids)
383-
attention_mask = np.array(attention_mask)
384-
token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids
382+
input_ids = input_ids.cpu().numpy()
383+
attention_mask = attention_mask.cpu().numpy()
384+
token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids
385385

386386
inputs = {
387387
"input_ids": input_ids,
@@ -448,9 +448,9 @@ def forward(
448448

449449
np_inputs = isinstance(input_ids, np.ndarray)
450450
if not np_inputs:
451-
input_ids = np.array(input_ids)
452-
attention_mask = np.array(attention_mask)
453-
token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids
451+
input_ids = input_ids.cpu().numpy()
452+
attention_mask = attention_mask.cpu().numpy()
453+
token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids
454454

455455
inputs = {
456456
"input_ids": input_ids,
@@ -581,7 +581,7 @@ def forward(
581581

582582
np_inputs = isinstance(pixel_values, np.ndarray)
583583
if not np_inputs:
584-
pixel_values = np.array(pixel_values)
584+
pixel_values = pixel_values.cpu().numpy()
585585

586586
inputs = {
587587
"pixel_values": pixel_values,
@@ -640,8 +640,8 @@ def forward(
640640

641641
np_inputs = isinstance(input_values, np.ndarray)
642642
if not np_inputs:
643-
input_values = np.array(input_values)
644-
attention_mask = np.array(attention_mask) if attention_mask is not None else attention_mask
643+
input_values = input_values.cpu().numpy()
644+
attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask
645645

646646
inputs = {
647647
"input_values": input_values,
@@ -711,8 +711,8 @@ def forward(
711711
):
712712
np_inputs = isinstance(input_values, np.ndarray)
713713
if not np_inputs:
714-
input_values = np.array(input_values)
715-
attention_mask = np.array(attention_mask) if attention_mask is not None else attention_mask
714+
input_values = input_values.cpu().numpy()
715+
attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask
716716

717717
inputs = {
718718
"input_values": input_values,
@@ -791,8 +791,8 @@ def forward(
791791
):
792792
np_inputs = isinstance(input_values, np.ndarray)
793793
if not np_inputs:
794-
input_values = np.array(input_values)
795-
attention_mask = np.array(attention_mask) if attention_mask is not None else attention_mask
794+
input_values = input_values.cpu().numpy()
795+
attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask
796796

797797
inputs = {
798798
"input_values": input_values,
@@ -867,8 +867,8 @@ def forward(
867867
):
868868
np_inputs = isinstance(input_values, np.ndarray)
869869
if not np_inputs:
870-
input_values = np.array(input_values)
871-
attention_mask = np.array(attention_mask) if attention_mask is not None else attention_mask
870+
input_values = input_values.cpu().numpy()
871+
attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask
872872

873873
inputs = {
874874
"input_values": input_values,
@@ -929,7 +929,7 @@ def forward(self, **kwargs):
929929
np_inputs = isinstance(next(iter(kwargs.values())), np.ndarray)
930930
inputs = {}
931931
for input_name in self.input_names:
932-
inputs[input_name] = np.array(kwargs.pop(input_name)) if not np_inputs else kwargs.pop(input_name)
932+
inputs[input_name] = kwargs.pop(input_name).cpu().numpy() if not np_inputs else kwargs.pop(input_name)
933933

934934
outputs = self._inference(inputs)
935935
model_outputs = {}

optimum/intel/openvino/modeling_decoder.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121
import openvino
2222
import torch
2323
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
24+
from openvino import Core, Tensor, Type
2425
from openvino.preprocess import PrePostProcessor
25-
from openvino.runtime import Core, Tensor, Type
2626
from transformers import AutoModelForCausalLM, PretrainedConfig
2727
from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
2828
from transformers.generation import GenerationMixin
@@ -492,11 +492,11 @@ def prepare_inputs(
492492
self.next_beam_idx = np.arange(batch_size, dtype=int)
493493
self._past_length = 0
494494
past_len = self._get_past_length(past_key_values)
495-
inputs["input_ids"] = np.array(input_ids)
495+
inputs["input_ids"] = input_ids.cpu().numpy()
496496
# Add the attention_mask inputs when needed
497497
if "attention_mask" in self.input_names or "position_ids" in self.input_names:
498498
if attention_mask is not None:
499-
attention_mask = np.array(attention_mask)
499+
attention_mask = attention_mask.cpu().numpy()
500500
else:
501501
attention_mask = np.ones(
502502
(input_ids.shape[0], input_ids.shape[1] + past_len), dtype=inputs["input_ids"].dtype
@@ -507,7 +507,7 @@ def prepare_inputs(
507507

508508
if "position_ids" in self.input_names:
509509
if position_ids is not None:
510-
position_ids = np.array(position_ids)
510+
position_ids = position_ids.cpu().numpy()
511511
else:
512512
position_ids = np.cumsum(attention_mask, axis=1) - 1
513513
position_ids[attention_mask == 0] = 1

optimum/intel/openvino/modeling_sentence_transformers.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ def forward(self, inputs: Dict[str, torch.Tensor]):
4343

4444
np_inputs = isinstance(input_ids, np.ndarray)
4545
if not np_inputs:
46-
input_ids = np.array(input_ids)
47-
attention_mask = np.array(attention_mask)
48-
token_type_ids = np.array(token_type_ids) if token_type_ids is not None else token_type_ids
46+
input_ids = input_ids.cpu().numpy()
47+
attention_mask = attention_mask.cpu().numpy()
48+
token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids
4949

5050
inputs = {
5151
"input_ids": input_ids,

optimum/intel/openvino/modeling_visual_language.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ def prepare_inputs(
150150
# Add the attention_mask inputs when needed
151151
if "attention_mask" in self.input_names or "position_ids" in self.input_names:
152152
if attention_mask is not None:
153-
attention_mask = np.array(attention_mask)
153+
attention_mask = attention_mask.cpu().numpy()
154154
else:
155155
attention_mask = np.ones((inputs_embeds.shape[0], inputs_embeds.shape[1] + past_len), dtype=int)
156156

@@ -159,7 +159,7 @@ def prepare_inputs(
159159

160160
if "position_ids" in self.input_names:
161161
if position_ids is not None:
162-
position_ids = np.array(position_ids)
162+
position_ids = position_ids.cpu().numpy()
163163
else:
164164
position_ids = np.cumsum(attention_mask, axis=1) - 1
165165
position_ids[attention_mask == 0] = 1

0 commit comments

Comments (0)