Skip to content

Commit 2be2e75

Browse files
apaniukov and slyalin
authored
Add OpenVINO Tokenizers (#513)
* Convert tokenizers with openvino_tokenizers * Update optimum/exporters/openvino/__main__.py * Refactor and Add Tests * Fix t5 Test * Add Warning * Return Tests * Move export_tokenizer to convert.py Reuse existing preprocessors * Avoid Double Tokenizer Save * Fix Style * Refactor After Review * Skip Tokenizers Tests If No Package Installed Check logs from tokenizers test * Style Fix * Fix OV Tokenizers Check * Fix Tests * Add Missing return * Turn off tokenizer message if not installed * Move tokenizers to OV dependencies * Check OV Compatibility * Bump OV Version * Move OpenVINO Tokenizers To Optional Dependencies * Add --convert-tokenizer Option to CLI * Fix SD Tokenizer --------- Co-authored-by: Sergey Lyalin <sergey.lyalin@intel.com>
1 parent 1c14957 commit 2be2e75

File tree

9 files changed

+182
-13
lines changed

9 files changed

+182
-13
lines changed

.github/workflows/test_openvino.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ jobs:
3232
python -m pip install --upgrade pip
3333
# install PyTorch CPU version to avoid installing CUDA packages on GitHub runner without GPU
3434
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
35-
pip install .[openvino,nncf,tests,diffusers]
35+
pip install .[openvino,openvino-tokenizers,nncf,tests,diffusers]
3636
- name: Test with Pytest
3737
run: |
3838
pytest tests/openvino/ --ignore test_modeling_basic

optimum/commands/export/openvino.py

+6
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,11 @@ def parse_args_openvino(parser: "ArgumentParser"):
103103
"OpenVINO native inference code that expects kv-cache inputs and outputs in the model."
104104
),
105105
)
106+
optional_group.add_argument(
107+
"--convert-tokenizer",
108+
action="store_true",
109+
help="Add converted tokenizer and detokenizer with OpenVINO Tokenizers",
110+
)
106111

107112

108113
class OVExportCommand(BaseOptimumCLICommand):
@@ -151,5 +156,6 @@ def run(self):
151156
compression_option=self.args.weight_format,
152157
compression_ratio=self.args.ratio,
153158
stateful=not self.args.disable_stateful,
159+
convert_tokenizer=self.args.convert_tokenizer,
154160
# **input_shapes,
155161
)

optimum/exporters/openvino/__main__.py

+29-7
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,21 @@
1818
from typing import Any, Callable, Dict, Optional, Union
1919

2020
from requests.exceptions import ConnectionError as RequestsConnectionError
21-
from transformers import AutoConfig, AutoTokenizer
21+
from transformers import AutoConfig, PreTrainedTokenizerBase
2222

2323
from optimum.exporters import TasksManager
2424
from optimum.exporters.onnx import __main__ as optimum_main
2525
from optimum.exporters.onnx.base import OnnxConfig, OnnxConfigWithPast
2626
from optimum.utils import DEFAULT_DUMMY_SHAPES
2727
from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors
2828

29-
from ...intel.utils.import_utils import is_nncf_available, is_optimum_version, is_transformers_version
30-
from .convert import export_models
29+
from ...intel.utils.import_utils import (
30+
is_nncf_available,
31+
is_openvino_tokenizers_available,
32+
is_optimum_version,
33+
is_transformers_version,
34+
)
35+
from .convert import export_models, export_tokenizer
3136
from .stateful import ensure_export_task_support_stateful
3237

3338

@@ -41,7 +46,6 @@
4146
]
4247

4348
OV_XML_FILE_NAME = "openvino_model.xml"
44-
4549
_MAX_UNCOMPRESSED_SIZE = 1e9
4650

4751
logger = logging.getLogger(__name__)
@@ -67,6 +71,7 @@ def main_export(
6771
compression_option: Optional[str] = None,
6872
compression_ratio: Optional[float] = None,
6973
stateful: bool = True,
74+
convert_tokenizer: bool = False,
7075
**kwargs_shapes,
7176
):
7277
"""
@@ -318,13 +323,17 @@ class StoreAttr(object):
318323
and getattr(model.config, "pad_token_id", None) is None
319324
and task in ["text-classification"]
320325
)
326+
327+
tokenizer = next(
328+
(preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)), None
329+
)
330+
321331
if needs_pad_token_id:
322332
if pad_token_id is not None:
323333
model.config.pad_token_id = pad_token_id
324-
else:
334+
elif tokenizer is not None:
325335
try:
326-
tok = AutoTokenizer.from_pretrained(model_name_or_path)
327-
model.config.pad_token_id = tok.pad_token_id
336+
model.config.pad_token_id = tokenizer.pad_token_id
328337
except Exception:
329338
raise ValueError(
330339
"Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
@@ -336,6 +345,15 @@ class StoreAttr(object):
336345
generation_config.save_pretrained(output)
337346
maybe_save_preprocessors(model_name_or_path, output)
338347

348+
if convert_tokenizer and tokenizer is not None and is_openvino_tokenizers_available():
349+
try:
350+
export_tokenizer(tokenizer, output)
351+
except Exception as exception:
352+
logger.warning(
353+
"Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer "
354+
f"models won't be generated. Exception: {exception}"
355+
)
356+
339357
if model.config.is_encoder_decoder and task.startswith("text-generation"):
340358
raise ValueError(
341359
f"model.config.is_encoder_decoder is True and task is `{task}`, which are incompatible. If the task was auto-inferred, please fill a bug report"
@@ -365,10 +383,14 @@ class StoreAttr(object):
365383
tokenizer = getattr(model, "tokenizer", None)
366384
if tokenizer is not None:
367385
tokenizer.save_pretrained(output.joinpath("tokenizer"))
386+
if convert_tokenizer and is_openvino_tokenizers_available():
387+
export_tokenizer(tokenizer, output)
368388

369389
tokenizer_2 = getattr(model, "tokenizer_2", None)
370390
if tokenizer_2 is not None:
371391
tokenizer_2.save_pretrained(output.joinpath("tokenizer_2"))
392+
if convert_tokenizer and is_openvino_tokenizers_available():
393+
export_tokenizer(tokenizer, output, suffix="_2")
372394

373395
model.save_config(output)
374396

optimum/exporters/openvino/convert.py

+52
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@
1919
from pathlib import Path
2020
from typing import Any, Dict, List, Optional, Tuple, Union
2121

22+
from transformers import T5Tokenizer, T5TokenizerFast
2223
from transformers.utils import is_tf_available, is_torch_available
2324

2425
from openvino.runtime import PartialShape, save_model
26+
from openvino.runtime.exceptions import OVTypeError
2527
from openvino.runtime.utils.types import get_element_type
2628
from openvino.tools.ovc import convert_model
2729
from optimum.exporters.onnx.base import OnnxConfig
@@ -536,3 +538,53 @@ def export_models(
536538

537539
outputs = list(map(list, zip(*outputs)))
538540
return outputs
541+
542+
543+
UNSUPPORTED_TOKENIZER_CLASSES = (
544+
T5Tokenizer,
545+
T5TokenizerFast,
546+
)
547+
548+
549+
def export_tokenizer(
550+
tokenizer,
551+
output: Union[str, Path],
552+
suffix: Optional[str] = "",
553+
):
554+
from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME # avoid circular imports
555+
556+
if isinstance(tokenizer, UNSUPPORTED_TOKENIZER_CLASSES):
557+
logger.info(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.")
558+
return
559+
560+
try:
561+
from openvino_tokenizers import convert_tokenizer
562+
except ModuleNotFoundError:
563+
# avoid this message before tokenizers are part of the openvino dependencies
564+
# logger.info(
565+
# "Run `pip install openvino-tokenizers[transformers]` to get OpenVINO tokenizer/detokenizer models."
566+
# )
567+
return
568+
569+
if not isinstance(output, Path):
570+
output = Path(output)
571+
572+
try:
573+
converted = convert_tokenizer(tokenizer, with_detokenizer=True)
574+
except NotImplementedError:
575+
logger.warning("Detokenizer is not supported, convert tokenizer only.")
576+
converted = convert_tokenizer(tokenizer, with_detokenizer=False)
577+
except OVTypeError:
578+
logger.warning(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.")
579+
return
580+
except Exception as exception:
581+
logger.warning(
582+
f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported. Exception: {exception}"
583+
)
584+
return
585+
586+
if not isinstance(converted, tuple):
587+
converted = (converted,)
588+
589+
for model, file_name in zip(converted, (OV_TOKENIZER_NAME, OV_DETOKENIZER_NAME)):
590+
save_model(model, output / file_name.format(suffix))

optimum/intel/openvino/__init__.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,14 @@
1414
import logging
1515

1616
from ..utils.import_utils import is_diffusers_available, is_nncf_available
17-
from .utils import OV_DECODER_NAME, OV_DECODER_WITH_PAST_NAME, OV_ENCODER_NAME, OV_XML_FILE_NAME
17+
from .utils import (
18+
OV_DECODER_NAME,
19+
OV_DECODER_WITH_PAST_NAME,
20+
OV_DETOKENIZER_NAME,
21+
OV_ENCODER_NAME,
22+
OV_TOKENIZER_NAME,
23+
OV_XML_FILE_NAME,
24+
)
1825

1926

2027
if is_nncf_available():

optimum/intel/openvino/utils.py

+3
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131
OV_DECODER_NAME = "openvino_decoder_model.xml"
3232
OV_DECODER_WITH_PAST_NAME = "openvino_decoder_with_past_model.xml"
3333

34+
OV_TOKENIZER_NAME = "openvino_tokenizer{}.xml"
35+
OV_DETOKENIZER_NAME = "openvino_detokenizer{}.xml"
36+
3437
ONNX_WEIGHTS_NAME = "model.onnx"
3538
ONNX_ENCODER_NAME = "encoder_model.onnx"
3639
ONNX_DECODER_NAME = "decoder_model.onnx"

optimum/intel/utils/import_utils.py

+34-3
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import importlib.util
16+
import logging
1617
import operator as op
1718
import sys
1819
from collections import OrderedDict
@@ -27,6 +28,8 @@
2728
import importlib.metadata as importlib_metadata
2829

2930

31+
logger = logging.getLogger(__name__)
32+
3033
STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}
3134

3235
_optimum_version = importlib_metadata.version("optimum")
@@ -75,13 +78,38 @@
7578
version = get_version()
7679
# avoid invalid format
7780
if "-" in version:
78-
major_version, dev_info = version.split("-", 1)
81+
ov_major_version, dev_info = version.split("-", 1)
7982
commit_id = dev_info.split("-")[0]
80-
version = f"{major_version}-{commit_id}"
83+
version = f"{ov_major_version}-{commit_id}"
8184
_openvino_version = version
8285
except ImportError:
8386
_openvino_available = False
8487

88+
_openvino_tokenizers_available = importlib.util.find_spec("openvino_tokenizers") is not None and _openvino_available
89+
_openvino_tokenizers_version = "N/A"
90+
if _openvino_tokenizers_available:
91+
try:
92+
_openvino_tokenizers_version = importlib_metadata.version("openvino_tokenizers")
93+
except importlib_metadata.PackageNotFoundError:
94+
_openvino_tokenizers_available = False
95+
96+
if _openvino_tokenizers_available and _openvino_tokenizers_version != "N/A":
97+
_compatible_openvino_version = next(
98+
(
99+
requirement.split("==")[-1]
100+
for requirement in importlib_metadata.requires("openvino-tokenizers")
101+
if requirement.startswith("openvino==")
102+
),
103+
"",
104+
)
105+
_openvino_tokenizers_available = _compatible_openvino_version == ov_major_version
106+
if not _openvino_tokenizers_available:
107+
logger.warning(
108+
"OpenVINO Tokenizer version is not compatible with OpenVINO version. "
109+
f"Installed OpenVINO version: {ov_major_version},"
110+
f"OpenVINO Tokenizers requires {_compatible_openvino_version}. "
111+
f"OpenVINO Tokenizers models will not be added during export."
112+
)
85113

86114
_nncf_available = importlib.util.find_spec("nncf") is not None
87115
_nncf_version = "N/A"
@@ -91,7 +119,6 @@
91119
except importlib_metadata.PackageNotFoundError:
92120
_nncf_available = False
93121

94-
95122
_diffusers_available = importlib.util.find_spec("diffusers") is not None
96123
_diffusers_version = "N/A"
97124
if _diffusers_available:
@@ -135,6 +162,10 @@ def is_openvino_available():
135162
return _openvino_available
136163

137164

165+
def is_openvino_tokenizers_available():
166+
return _openvino_tokenizers_available
167+
168+
138169
def is_nncf_available():
139170
return _nncf_available
140171

setup.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,14 @@
4444
"onnxruntime<1.15.0",
4545
"transformers>=4.34.0",
4646
],
47-
"openvino": ["openvino>=2023.2", "onnx", "onnxruntime", "transformers>=4.36.0", "optimum>=1.16.1"],
47+
"openvino": [
48+
"openvino>=2023.3",
49+
"onnx",
50+
"onnxruntime",
51+
"transformers>=4.36.0",
52+
"optimum>=1.16.1",
53+
],
54+
"openvino-tokenizers": ["openvino-tokenizers[transformers]"],
4855
"nncf": ["nncf @ git+https://github.com/openvinotoolkit/nncf.git"],
4956
"ipex": ["intel-extension-for-pytorch", "onnx"],
5057
"diffusers": ["diffusers"],

tests/openvino/test_exporters_cli.py

+41
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414
import subprocess
1515
import unittest
16+
from pathlib import Path
1617
from tempfile import TemporaryDirectory
1718

1819
from parameterized import parameterized
@@ -38,6 +39,7 @@
3839
OVStableDiffusionXLPipeline,
3940
)
4041
from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS
42+
from optimum.intel.utils.import_utils import is_openvino_tokenizers_available
4143

4244

4345
class OVCLIExportTestCase(unittest.TestCase):
@@ -61,6 +63,19 @@ class OVCLIExportTestCase(unittest.TestCase):
6163
("stable-diffusion-xl", "stable-diffusion-xl"),
6264
("stable-diffusion-xl", "stable-diffusion-xl-refiner"),
6365
)
66+
EXPECTED_NUMBER_OF_TOKENIZER_MODELS = {
67+
"gpt2": 2,
68+
"t5": 0, # failed internal sentencepiece check - no <s> token in the vocab
69+
"albert": 0, # not supported yet
70+
"distilbert": 1, # no detokenizer
71+
"roberta": 2,
72+
"vit": 0, # no tokenizer for image model
73+
"wav2vec2": 0, # no tokenizer
74+
"bert": 1, # no detokenizer
75+
"blenderbot": 2,
76+
"stable-diffusion": 0, # not supported
77+
"stable-diffusion-xl": 0, # not supported
78+
}
6479

6580
SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),)
6681

@@ -98,6 +113,32 @@ def test_exporters_cli(self, task: str, model_type: str):
98113
model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {}
99114
eval(_HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]).from_pretrained(tmpdir, **model_kwargs)
100115

116+
@parameterized.expand(
117+
arch
118+
for arch in SUPPORTED_ARCHITECTURES
119+
if not arch[0].endswith("-with-past") and not arch[1].endswith("-refiner")
120+
)
121+
@unittest.skipIf(not is_openvino_tokenizers_available(), reason="OpenVINO Tokenizers not available")
122+
def test_exporters_cli_tokenizers(self, task: str, model_type: str):
123+
with TemporaryDirectory() as tmpdir:
124+
output = subprocess.check_output(
125+
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --convert-tokenizer --task {task} {tmpdir}",
126+
shell=True,
127+
stderr=subprocess.STDOUT,
128+
).decode()
129+
save_dir = Path(tmpdir)
130+
number_of_tokenizers = sum("tokenizer" in file for file in map(str, save_dir.rglob("*.xml")))
131+
self.assertEqual(
132+
self.EXPECTED_NUMBER_OF_TOKENIZER_MODELS[model_type],
133+
number_of_tokenizers,
134+
f"OVT: {is_openvino_tokenizers_available() }",
135+
)
136+
137+
if number_of_tokenizers == 1:
138+
self.assertTrue("Detokenizer is not supported, convert tokenizer only." in output, output)
139+
elif number_of_tokenizers == 0 and task not in ("image-classification", "audio-classification"):
140+
self.assertTrue(("OpenVINO Tokenizer export for" in output and "is not supported." in output), output)
141+
101142
@parameterized.expand(SUPPORTED_ARCHITECTURES)
102143
def test_exporters_cli_fp16(self, task: str, model_type: str):
103144
with TemporaryDirectory() as tmpdir:

0 commit comments

Comments
 (0)