Commit a51d02a

Enable OpenVINO export of loaded model

1 parent 0ece48b · commit a51d02a

3 files changed: +314, -213 lines

optimum/exporters/openvino/__main__.py (+92, -207)
@@ -13,27 +13,22 @@
 # limitations under the License.
 
 import logging
-import os
 from pathlib import Path
 from typing import Any, Callable, Dict, Optional, Union
 
 from requests.exceptions import ConnectionError as RequestsConnectionError
-from transformers import AutoConfig, PreTrainedTokenizerBase
+from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
 
 from optimum.exporters import TasksManager
-from optimum.exporters.onnx import __main__ as optimum_main
-from optimum.exporters.onnx.base import OnnxConfig, OnnxConfigWithPast
-from optimum.utils import DEFAULT_DUMMY_SHAPES
-from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors
+from optimum.exporters.onnx.base import OnnxConfig
+from optimum.utils.save_utils import maybe_load_preprocessors
 
 from ...intel.utils.import_utils import (
-    is_nncf_available,
     is_openvino_tokenizers_available,
     is_optimum_version,
     is_transformers_version,
 )
-from .convert import export_models, export_tokenizer
-from .stateful import ensure_export_task_support_stateful
+from .convert import export_from_model, export_tokenizer
 
 
 if is_optimum_version(">=", "1.16.0"):
@@ -45,8 +40,6 @@
     "whisper",
 ]
 
-OV_XML_FILE_NAME = "openvino_model.xml"
-_MAX_UNCOMPRESSED_SIZE = 1e9
 
 logger = logging.getLogger(__name__)
 
@@ -143,70 +136,11 @@ def main_export(
     >>> main_export("gpt2", output="gpt2_onnx/")
     ```
     """
-    if (
-        compression_option is not None
-        and compression_option != "fp16"
-        and compression_option != "fp32"
-        and not is_nncf_available()
-    ):
-        raise ImportError(
-            f"Compression of the weights to {compression_option} requires nncf, please install it with `pip install nncf`"
-        )
-
-    model_kwargs = model_kwargs or {}
-
-    output = Path(output)
-    if not output.exists():
-        output.mkdir(parents=True)
 
     original_task = task
     task = TasksManager.map_from_synonym(task)
-
-    # Patch the modules to export of GPTQ models w/o GPU
-    do_gptq_patching = False
-    try:
-        config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
-        model_type = config.model_type.replace("_", "-")
-        config_dict = config.to_dict()
-        quantization_config = config_dict.get("quantization_config", None)
-        do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
-    except Exception:
-        model_type = None
-        pass
-
-    if do_gptq_patching:
-        import torch
-
-        torch.set_default_dtype(torch.float32)
-        orig_cuda_check = torch.cuda.is_available
-        torch.cuda.is_available = lambda: True
-
-        from optimum.gptq import GPTQQuantizer
-
-        orig_post_init_model = GPTQQuantizer.post_init_model
-
-        def post_init_model(self, model):
-            from auto_gptq import exllama_set_max_input_length
-
-            class StoreAttr(object):
-                pass
-
-            model.quantize_config = StoreAttr()
-            model.quantize_config.desc_act = self.desc_act
-            if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
-                model = exllama_set_max_input_length(model, self.max_input_length)
-            return model
-
-        GPTQQuantizer.post_init_model = post_init_model
-
     framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework)
-
-    # get the shapes to be used to generate dummy inputs
-    input_shapes = {}
-    for input_name in DEFAULT_DUMMY_SHAPES.keys():
-        input_shapes[input_name] = (
-            kwargs_shapes[input_name] if input_name in kwargs_shapes else DEFAULT_DUMMY_SHAPES[input_name]
-        )
+    library_name = TasksManager.infer_library_from_model(model_name_or_path, subfolder=subfolder)
 
     if task == "auto":
         try:
@@ -220,9 +154,44 @@ class StoreAttr(object):
                 f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
             )
 
+    if convert_tokenizer and not is_openvino_tokenizers_available():
+        logger.warning(
+            "`convert_tokenizer` requires openvino-tokenizers, please install it with `pip install optimum-intel[openvino-tokenizers]`"
+        )
+        convert_tokenizer = False
+
+    custom_architecture = False
     loading_kwargs = {}
-    if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED:
-        loading_kwargs["attn_implementation"] = "eager"
+    if library_name == "transformers":
+        config = AutoConfig.from_pretrained(
+            model_name_or_path,
+            subfolder=subfolder,
+            revision=revision,
+            cache_dir=cache_dir,
+            use_auth_token=use_auth_token,
+            local_files_only=local_files_only,
+            force_download=force_download,
+            trust_remote_code=trust_remote_code,
+        )
+        model_type = config.model_type.replace("_", "-")
+
+        if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
+            custom_architecture = True
+        elif task not in TasksManager.get_supported_tasks_for_model_type(
+            model_type, exporter="onnx", library_name=library_name
+        ):
+            if original_task == "auto":
+                autodetected_message = " (auto-detected)"
+            else:
+                autodetected_message = ""
+            model_tasks = TasksManager.get_supported_tasks_for_model_type(
+                model_type, exporter="onnx", library_name=library_name
+            )
+            raise ValueError(
+                f"Asked to export a {model_type} model for the task {task}{autodetected_message}, but the Optimum OpenVINO exporter only supports the tasks {', '.join(model_tasks.keys())} for {model_type}. Please use a supported task. Please open an issue at https://github.com/huggingface/optimum/issues if you would like the task {task} to be supported in the ONNX export for {model_type}."
+            )
+        if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED:
+            loading_kwargs["attn_implementation"] = "eager"
 
     model = TasksManager.get_model_from_task(
         task,
@@ -239,37 +208,35 @@ class StoreAttr(object):
         **loading_kwargs,
     )
 
-    custom_architecture = False
-    is_stable_diffusion = "stable-diffusion" in task
-    model_type = "stable-diffusion" if is_stable_diffusion else model.config.model_type.replace("_", "-")
-
-    if not is_stable_diffusion:
-        if model_type in TasksManager._UNSUPPORTED_CLI_MODEL_TYPE:
-            raise ValueError(
-                f"{model_type} is not supported yet. Only {TasksManager._SUPPORTED_CLI_MODEL_TYPE} are supported. "
-                f"If you want to support {model_type} please propose a PR or open up an issue."
-            )
-        if model.config.model_type.replace("-", "_") not in TasksManager.get_supported_model_type_for_task(
-            task, exporter="onnx"
-        ):
-            custom_architecture = True
+    needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None
 
-    if custom_architecture and custom_onnx_configs is None:
-        raise ValueError(
-            "Trying to export a model with a custom architecture, but no custom onnx configuration was passed as `custom_onnx_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models."
-        )
+    if needs_pad_token_id:
+        if pad_token_id is not None:
+            model.config.pad_token_id = pad_token_id
+        else:
+            tok = AutoTokenizer.from_pretrained(model_name_or_path)
+            pad_token_id = getattr(tok, "pad_token_id", None)
+            if pad_token_id is None:
+                raise ValueError(
+                    "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
+                )
+            model.config.pad_token_id = pad_token_id
 
-    if custom_architecture and original_task == "auto":
-        raise ValueError(
-            f'Automatic task detection is not supported with custom architectures. Please specify the `task` argument. Suggestion: task="{task}" (or task="{task}-with-past" if the model is decoder-based and supports KV cache)'
-        )
+    if "stable-diffusion" in task:
+        model_type = "stable-diffusion"
+    elif hasattr(model.config, "export_model_type"):
+        model_type = model.config.export_model_type.replace("_", "-")
+    else:
+        model_type = model.config.model_type.replace("_", "-")
 
     if (
         not custom_architecture
-        and not is_stable_diffusion
-        and task + "-with-past" in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx")
+        and library_name != "diffusers"
+        and task + "-with-past"
+        in TasksManager.get_supported_tasks_for_model_type(model_type, exporter="onnx", library_name=library_name)
     ):
-        if original_task == "auto":  # Make -with-past the default if --task was not explicitely specified
+        # Make -with-past the default if --task was not explicitely specified
+        if original_task == "auto":
             task = task + "-with-past"
         else:
             logger.info(
@@ -286,127 +253,45 @@ class StoreAttr(object):
             possible_synonyms = ""
         logger.info(f"Automatic task detection to {task}{possible_synonyms}.")
 
-    task_support_stateful = ensure_export_task_support_stateful(task)
-    stateful = stateful and task_support_stateful
-
     preprocessors = maybe_load_preprocessors(
         model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
     )
 
-    onnx_config, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs(
+    export_from_model(
         model=model,
+        output=output,
         task=task,
-        monolith=False,
-        custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {},
-        custom_architecture=custom_architecture,
+        compression_option=compression_option,
+        compression_ratio=compression_ratio,
+        stateful=stateful,
+        model_kwargs=model_kwargs,
+        custom_onnx_configs=custom_onnx_configs,
         fn_get_submodels=fn_get_submodels,
         preprocessors=preprocessors,
-        _variant="default",
-        legacy=False,
+        device=device,
+        **kwargs_shapes,
     )
 
-    if compression_option is None:
-        num_parameters = model.num_parameters() if not is_stable_diffusion else model.unet.num_parameters()
-        if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
-            if is_nncf_available():
-                compression_option = "int8"
-                logger.info("The model weights will be quantized to int8.")
-            else:
-                logger.warning(
-                    "The model will be converted with no weights quantization. Quantization of the weights to int8 requires nncf."
-                    "please install it with `pip install nncf`"
-                )
-
-    if not is_stable_diffusion:
-        needs_pad_token_id = (
-            isinstance(onnx_config, OnnxConfigWithPast)
-            and getattr(model.config, "pad_token_id", None) is None
-            and task in ["text-classification"]
-        )
-
-        tokenizer = next(
-            (preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)), None
-        )
+    if convert_tokenizer:
+        if library_name != "diffusers":
+            tokenizer = next(
+                (preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)),
+                None,
+            )
 
-        if needs_pad_token_id:
-            if pad_token_id is not None:
-                model.config.pad_token_id = pad_token_id
-            elif tokenizer is not None:
+            if tokenizer is not None:
                 try:
-                    model.config.pad_token_id = tokenizer.pad_token_id
-                except Exception:
-                    raise ValueError(
-                        "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
+                    export_tokenizer(tokenizer, output)
+                except Exception as exception:
+                    logger.warning(
+                        "Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer "
+                        f"models won't be generated. Exception: {exception}"
                     )
-        # Saving the model config and preprocessor as this is needed sometimes.
-        model.config.save_pretrained(output)
-        generation_config = getattr(model, "generation_config", None)
-        if generation_config is not None:
-            generation_config.save_pretrained(output)
-        maybe_save_preprocessors(model_name_or_path, output)
-
-        if convert_tokenizer and tokenizer is not None and is_openvino_tokenizers_available():
-            try:
-                export_tokenizer(tokenizer, output)
-            except Exception as exception:
-                logger.warning(
-                    "Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer "
-                    f"models won't be generated. Exception: {exception}"
-                )
-
-        if model.config.is_encoder_decoder and task.startswith("text-generation"):
-            raise ValueError(
-                f"model.config.is_encoder_decoder is True and task is `{task}`, which are incompatible. If the task was auto-inferred, please fill a bug report"
-                f"at https://github.com/huggingface/optimum, if --task was explicitely passed, make sure you selected the right task for the model,"
-                f" referring to `optimum.exporters.tasks.TaskManager`'s `_TASKS_TO_AUTOMODELS`."
-            )
-
-        files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_onnx_configs.keys()]
-    else:
-        # save the subcomponent configuration
-        for model_name in models_and_onnx_configs:
-            subcomponent = models_and_onnx_configs[model_name][0]
-            if hasattr(subcomponent, "save_config"):
-                subcomponent.save_config(output / model_name)
-            elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"):
-                subcomponent.config.save_pretrained(output / model_name)
-
-        files_subpaths = [os.path.join(name_dir, OV_XML_FILE_NAME) for name_dir in models_and_onnx_configs]
-
-        # Saving the additional components needed to perform inference.
-        model.scheduler.save_pretrained(output.joinpath("scheduler"))
-
-        feature_extractor = getattr(model, "feature_extractor", None)
-        if feature_extractor is not None:
-            feature_extractor.save_pretrained(output.joinpath("feature_extractor"))
-
-        tokenizer = getattr(model, "tokenizer", None)
-        if tokenizer is not None:
-            tokenizer.save_pretrained(output.joinpath("tokenizer"))
-            if convert_tokenizer and is_openvino_tokenizers_available():
+        else:
+            tokenizer = getattr(model, "tokenizer", None)
+            if tokenizer is not None:
                 export_tokenizer(tokenizer, output)
 
-        tokenizer_2 = getattr(model, "tokenizer_2", None)
-        if tokenizer_2 is not None:
-            tokenizer_2.save_pretrained(output.joinpath("tokenizer_2"))
-            if convert_tokenizer and is_openvino_tokenizers_available():
-                export_tokenizer(tokenizer, output, suffix="_2")
-
-    model.save_config(output)
-
-    export_models(
-        models_and_onnx_configs=models_and_onnx_configs,
-        output_dir=output,
-        output_names=files_subpaths,
-        input_shapes=input_shapes,
-        device=device,
-        compression_option=compression_option,
-        compression_ratio=compression_ratio,
-        stateful=stateful,
-        model_kwargs=model_kwargs,
-    )
-
-    # Unpatch modules after GPTQ export
-    if do_gptq_patching:
-        torch.cuda.is_available = orig_cuda_check
-        GPTQQuantizer.post_init_model = orig_post_init_model
+            tokenizer_2 = getattr(model, "tokenizer_2", None)
+            if tokenizer_2 is not None:
+                export_tokenizer(tokenizer_2, output, suffix="_2")
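
With this refactor, main_export delegates conversion to the export_from_model helper imported from .convert, which is what makes it possible to export a model object that is already loaded in memory, as the commit title says. Below is a minimal sketch of that usage; it assumes export_from_model accepts the same keyword arguments main_export forwards in this diff (model, output, task, stateful, ...), and the model name, output directory, and task are illustrative only.

    # Sketch: export an already-loaded Transformers model to OpenVINO IR by
    # calling export_from_model directly, mirroring the call main_export makes above.
    from transformers import AutoModelForCausalLM

    from optimum.exporters.openvino.convert import export_from_model

    # Any supported architecture works; "gpt2" is only an example.
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    export_from_model(
        model=model,
        output="gpt2_openvino",            # directory that will receive the OpenVINO IR files
        task="text-generation-with-past",  # set explicitly, since no model ID is available for task inference
        stateful=True,                     # same flag main_export forwards in this diff
    )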

0 commit comments
