18
18
from typing import Any , Callable , Dict , Optional , Union
19
19
20
20
from requests .exceptions import ConnectionError as RequestsConnectionError
21
- from transformers import AutoConfig , AutoTokenizer
21
+ from transformers import AutoConfig , PreTrainedTokenizerBase
22
22
23
23
from optimum .exporters import TasksManager
24
24
from optimum .exporters .onnx import __main__ as optimum_main
25
25
from optimum .exporters .onnx .base import OnnxConfig , OnnxConfigWithPast
26
26
from optimum .utils import DEFAULT_DUMMY_SHAPES
27
27
from optimum .utils .save_utils import maybe_load_preprocessors , maybe_save_preprocessors
28
28
29
- from ...intel .utils .import_utils import is_nncf_available , is_optimum_version , is_transformers_version
30
- from .convert import export_models
29
+ from ...intel .utils .import_utils import (
30
+ is_nncf_available ,
31
+ is_openvino_tokenizers_available ,
32
+ is_optimum_version ,
33
+ is_transformers_version ,
34
+ )
35
+ from .convert import export_models , export_tokenizer
31
36
from .stateful import ensure_export_task_support_stateful
32
37
33
38
41
46
]
42
47
43
48
OV_XML_FILE_NAME = "openvino_model.xml"
44
-
45
49
_MAX_UNCOMPRESSED_SIZE = 1e9
46
50
47
51
logger = logging .getLogger (__name__ )
@@ -67,6 +71,7 @@ def main_export(
67
71
compression_option : Optional [str ] = None ,
68
72
compression_ratio : Optional [float ] = None ,
69
73
stateful : bool = True ,
74
+ convert_tokenizer : bool = False ,
70
75
** kwargs_shapes ,
71
76
):
72
77
"""
@@ -318,13 +323,17 @@ class StoreAttr(object):
318
323
and getattr (model .config , "pad_token_id" , None ) is None
319
324
and task in ["text-classification" ]
320
325
)
326
+
327
+ tokenizer = next (
328
+ (preprocessor for preprocessor in preprocessors if isinstance (preprocessor , PreTrainedTokenizerBase )), None
329
+ )
330
+
321
331
if needs_pad_token_id :
322
332
if pad_token_id is not None :
323
333
model .config .pad_token_id = pad_token_id
324
- else :
334
+ elif tokenizer is not None :
325
335
try :
326
- tok = AutoTokenizer .from_pretrained (model_name_or_path )
327
- model .config .pad_token_id = tok .pad_token_id
336
+ model .config .pad_token_id = tokenizer .pad_token_id
328
337
except Exception :
329
338
raise ValueError (
330
339
"Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
@@ -336,6 +345,15 @@ class StoreAttr(object):
336
345
generation_config .save_pretrained (output )
337
346
maybe_save_preprocessors (model_name_or_path , output )
338
347
348
+ if convert_tokenizer and tokenizer is not None and is_openvino_tokenizers_available ():
349
+ try :
350
+ export_tokenizer (tokenizer , output )
351
+ except Exception as exception :
352
+ logger .warning (
353
+ "Could not convert the tokenizer to OpenVINO format. OpenVINO tokenizer/detokenizer "
354
+ f"models won't be generated. Exception: { exception } "
355
+ )
356
+
339
357
if model .config .is_encoder_decoder and task .startswith ("text-generation" ):
340
358
raise ValueError (
341
359
f"model.config.is_encoder_decoder is True and task is `{ task } `, which are incompatible. If the task was auto-inferred, please fill a bug report"
@@ -365,10 +383,14 @@ class StoreAttr(object):
365
383
tokenizer = getattr (model , "tokenizer" , None )
366
384
if tokenizer is not None :
367
385
tokenizer .save_pretrained (output .joinpath ("tokenizer" ))
386
+ if convert_tokenizer and is_openvino_tokenizers_available ():
387
+ export_tokenizer (tokenizer , output )
368
388
369
389
tokenizer_2 = getattr (model , "tokenizer_2" , None )
370
390
if tokenizer_2 is not None :
371
391
tokenizer_2 .save_pretrained (output .joinpath ("tokenizer_2" ))
392
+ if convert_tokenizer and is_openvino_tokenizers_available ():
393
+ export_tokenizer (tokenizer_2 , output , suffix = "_2" )  # export the second tokenizer itself, not a duplicate of the first
372
394
373
395
model .save_config (output )
374
396
0 commit comments