
Commit 0d943f8

Convert Tokenizers By Default (#580)
* Convert Tokenizers By Default
* Add Warning to Deprecated Option
* Update OV Tokenizers Availability Check
* Move openvino-tokenizers to openvino dependencies
* Make Style
* Change Imports to Absolute
* Check openvino-nightly compatibility
* Change model skip explanation
* Update OV Tokenizers Availability Check
* Add Check for OpenVINO Nightly and Archive
* Add linux distros compatibility message
* Address Review Comments
* Address Review Comments
* Address Review Comments
* Fix Style
* Change Warnings to Debug Level
* Fix Tests Debug Message
* Fix Style
* Fix Style
1 parent 4651ac2 commit 0d943f8

File tree

6 files changed, +107 -70 lines changed


optimum/commands/export/openvino.py (+10, -2)
@@ -126,10 +126,15 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "OpenVINO native inference code that expects kv-cache inputs and outputs in the model."
         ),
     )
+    optional_group.add_argument(
+        "--disable-convert-tokenizer",
+        action="store_true",
+        help="Do not add converted tokenizer and detokenizer OpenVINO models.",
+    )
     optional_group.add_argument(
         "--convert-tokenizer",
         action="store_true",
-        help="Add converted tokenizer and detokenizer with OpenVINO Tokenizers",
+        help="[Deprecated] Add converted tokenizer and detokenizer with OpenVINO Tokenizers.",
     )
 
     optional_group.add_argument(
@@ -247,6 +252,9 @@ def run(self):
             model.save_pretrained(self.args.output)
 
         else:
+            if self.args.convert_tokenizer:
+                logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.")
+
             # TODO : add input shapes
             main_export(
                 model_name_or_path=self.args.model,
@@ -258,7 +266,7 @@ def run(self):
                 pad_token_id=self.args.pad_token_id,
                 ov_config=ov_config,
                 stateful=not self.args.disable_stateful,
-                convert_tokenizer=self.args.convert_tokenizer,
+                convert_tokenizer=not self.args.disable_convert_tokenizer,
                 library_name=library_name,
                 # **input_shapes,
             )
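For context, a minimal usage sketch of the updated CLI. The flag names come from the diff above; the model name and output directories are illustrative placeholders, and the subprocess pattern mirrors the test suite further down.

# Illustrative only: "gpt2" and the output directories are placeholder values.
import subprocess

# Default behavior: exports the model plus OpenVINO tokenizer/detokenizer models.
subprocess.check_call("optimum-cli export openvino --model gpt2 gpt2_ov", shell=True)

# Opt out of tokenizer conversion with the new flag.
subprocess.check_call(
    "optimum-cli export openvino --model gpt2 --disable-convert-tokenizer gpt2_ov_no_tokenizer",
    shell=True,
)

# The deprecated flag still works but only triggers the warning added above.
subprocess.check_call("optimum-cli export openvino --model gpt2 --convert-tokenizer gpt2_ov_legacy", shell=True)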

optimum/exporters/openvino/__main__.py (+5, -10)
@@ -22,11 +22,10 @@
 from optimum.exporters import TasksManager
 from optimum.exporters.onnx.base import OnnxConfig
 from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
+from optimum.exporters.openvino.convert import export_from_model, export_tokenizer
+from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
 from optimum.utils.save_utils import maybe_load_preprocessors
 
-from ...intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
-from .convert import export_from_model, export_tokenizer
-
 
 if TYPE_CHECKING:
     from optimum.intel.openvino.configuration import OVConfig
@@ -187,12 +186,6 @@ def main_export(
                 f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
             )
 
-    if convert_tokenizer and not is_openvino_tokenizers_available():
-        logger.warning(
-            "`convert_tokenizer` requires openvino-tokenizers, please install it with `pip install optimum-intel[openvino-tokenizers]`"
-        )
-        convert_tokenizer = False
-
     do_gptq_patching = False
     custom_architecture = False
     loading_kwargs = {}
@@ -348,7 +341,7 @@ class StoreAttr(object):
         **kwargs_shapes,
     )
 
-    if convert_tokenizer:
+    if convert_tokenizer and is_openvino_tokenizers_available():
         if library_name != "diffusers":
             tokenizer = next(
                 (preprocessor for preprocessor in preprocessors if isinstance(preprocessor, PreTrainedTokenizerBase)),
@@ -371,6 +364,8 @@ class StoreAttr(object):
             tokenizer_2 = getattr(model, "tokenizer_2", None)
             if tokenizer_2 is not None:
                 export_tokenizer(tokenizer_2, output, suffix="_2")
+    elif convert_tokenizer and not is_openvino_tokenizers_available():
+        logger.warning("Tokenizer won't be converted.")
 
     # Unpatch modules after GPTQ export
     if do_gptq_patching:
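A minimal sketch of the Python-level entry point after this change. The argument names follow the CLI call shown earlier and the import path is assumed from the package layout; the model name, output path and task are illustrative. With openvino-tokenizers importable and binary compatible, tokenizer models are exported alongside the main model; otherwise only a warning is logged and the export still succeeds.

# Sketch only: model name, output path and task are placeholders.
from optimum.exporters.openvino import main_export

main_export(
    model_name_or_path="gpt2",
    output="gpt2_ov",
    task="text-generation-with-past",
    convert_tokenizer=True,  # the default the CLI now passes unless --disable-convert-tokenizer is given
)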

optimum/exporters/openvino/convert.py (+3, -11)
@@ -20,7 +20,6 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
-from transformers import T5Tokenizer, T5TokenizerFast
 from transformers.utils import is_tf_available, is_torch_available
 
 from openvino.runtime import PartialShape, save_model
@@ -49,9 +48,6 @@
 )
 
 
-UNSUPPORTED_TOKENIZER_CLASSES = (T5Tokenizer, T5TokenizerFast)
-
-
 logger = logging.getLogger(__name__)
 
 if is_torch_available():
@@ -662,10 +658,6 @@ def export_tokenizer(
 ):
     from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME  # avoid circular imports
 
-    if isinstance(tokenizer, UNSUPPORTED_TOKENIZER_CLASSES):
-        logger.info(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.")
-        return
-
     try:
         from openvino_tokenizers import convert_tokenizer
     except ModuleNotFoundError:
@@ -681,13 +673,13 @@ def export_tokenizer(
     try:
         converted = convert_tokenizer(tokenizer, with_detokenizer=True)
     except NotImplementedError:
-        logger.warning("Detokenizer is not supported, convert tokenizer only.")
+        logger.info("Detokenizer is not supported, convert tokenizer only.")
         converted = convert_tokenizer(tokenizer, with_detokenizer=False)
     except OVTypeError:
-        logger.warning(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.")
+        logger.debug(f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported.")
        return
     except Exception as exception:
-        logger.warning(
+        logger.debug(
             f"OpenVINO Tokenizer export for {type(tokenizer).__name__} is not supported. Exception: {exception}"
         )
         return
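Roughly, this is what export_tokenizer delegates to once openvino_tokenizers is importable; a standalone sketch under assumptions: the gpt2 model is a placeholder, and the output file names assume OV_TOKENIZER_NAME and OV_DETOKENIZER_NAME resolve to openvino_tokenizer.xml and openvino_detokenizer.xml.

# Standalone sketch; requires transformers, openvino and openvino-tokenizers to be installed.
from pathlib import Path

from openvino.runtime import save_model
from openvino_tokenizers import convert_tokenizer
from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model
ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)

output = Path("gpt2_ov")
output.mkdir(exist_ok=True)
save_model(ov_tokenizer, str(output / "openvino_tokenizer.xml"))  # assumed value of OV_TOKENIZER_NAME
save_model(ov_detokenizer, str(output / "openvino_detokenizer.xml"))  # assumed value of OV_DETOKENIZER_NAME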

optimum/intel/utils/import_utils.py (+75, -28)
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import functools
 import importlib.util
 import logging
 import operator as op
@@ -95,32 +95,6 @@
     except ImportError:
         _openvino_available = False
 
-_openvino_tokenizers_available = importlib.util.find_spec("openvino_tokenizers") is not None and _openvino_available
-_openvino_tokenizers_version = "N/A"
-if _openvino_tokenizers_available:
-    try:
-        _openvino_tokenizers_version = importlib_metadata.version("openvino_tokenizers")
-    except importlib_metadata.PackageNotFoundError:
-        _openvino_tokenizers_available = False
-
-if _openvino_tokenizers_available and _openvino_tokenizers_version != "N/A":
-    _compatible_openvino_version = next(
-        (
-            requirement.split("==")[-1]
-            for requirement in importlib_metadata.requires("openvino-tokenizers")
-            if requirement.startswith("openvino==")
-        ),
-        "",
-    )
-    _openvino_tokenizers_available = _compatible_openvino_version == ov_major_version
-    if not _openvino_tokenizers_available:
-        logger.warning(
-            "OpenVINO Tokenizer version is not compatible with OpenVINO version. "
-            f"Installed OpenVINO version: {ov_major_version},"
-            f"OpenVINO Tokenizers requires {_compatible_openvino_version}. "
-            f"OpenVINO Tokenizers models will not be added during export."
-        )
-
 _nncf_available = importlib.util.find_spec("nncf") is not None
 _nncf_version = "N/A"
 if _nncf_available:
@@ -196,8 +170,81 @@ def is_openvino_available():
     return _openvino_available
 
 
+@functools.lru_cache(1)
 def is_openvino_tokenizers_available():
-    return _openvino_tokenizers_available
+    if not is_openvino_available():
+        return False
+
+    if importlib.util.find_spec("openvino_tokenizers") is None:
+        logger.info(
+            "OpenVINO Tokenizers is not available. To deploy models in production "
+            "with C++ code, please follow installation instructions: "
+            "https://github.com/openvinotoolkit/openvino_tokenizers?tab=readme-ov-file#installation\n"
+        )
+        return False
+
+    try:
+        pip_metadata_version = importlib_metadata.version("openvino")
+    except importlib_metadata.PackageNotFoundError:
+        pip_metadata_version = False
+    try:
+        pip_metadata_version = importlib_metadata.version("openvino-nightly")
+        is_nightly = True
+    except importlib_metadata.PackageNotFoundError:
+        is_nightly = False
+
+    try:
+        import openvino_tokenizers
+
+        openvino_tokenizers._get_factory()
+    except RuntimeError:
+        tokenizers_version = openvino_tokenizers.__version__
+
+        if tokenizers_version == "0.0.0.0":
+            try:
+                tokenizers_version = importlib_metadata.version("openvino_tokenizers") or tokenizers_version
+            except importlib_metadata.PackageNotFoundError:
+                pass
+        message = (
+            "OpenVINO and OpenVINO Tokenizers versions are not binary compatible.\n"
+            f"OpenVINO version: {_openvino_version}\n"
+            f"OpenVINO Tokenizers version: {tokenizers_version}\n"
+            "First 3 numbers should be the same. Update OpenVINO Tokenizers to compatible version. "
+        )
+        if not pip_metadata_version:
+            message += (
+                "For archive installation of OpenVINO try to build OpenVINO Tokenizers from source: "
+                "https://github.com/openvinotoolkit/openvino_tokenizers/tree/master?tab=readme-ov-file"
+                "#build-and-install-from-source"
+            )
+            if sys.platform == "linux":
+                message += (
+                    "\nThe PyPI version of OpenVINO Tokenizers is built on CentOS and may not be compatible with other "
+                    "Linux distributions; rebuild OpenVINO Tokenizers from source."
+                )
+        else:
+            message += (
+                "It is recommended to use the same day builds for pre-release version. "
+                "To install both OpenVINO and OpenVINO Tokenizers release version perform:\n"
+            )
+            if is_nightly:
+                message += "pip uninstall -y openvino-nightly && "
+            message += "pip install --force-reinstall openvino openvino-tokenizers\n"
+            if is_nightly:
+                message += (
+                    "openvino-nightly package will be deprecated in the future - use pre-release drops instead. "
+                )
+            message += "To update both OpenVINO and OpenVINO Tokenizers to the latest pre-release version perform:\n"
+            if is_nightly:
+                message += "pip uninstall -y openvino-nightly && "
+            message += (
+                "pip install --pre -U openvino openvino-tokenizers "
+                "--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly"
+            )
+        logger.warning(message)
+        return False
+
+    return True
 
 
 def is_nncf_available():
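A short usage sketch of the reworked check (the caller code is illustrative): because the function is wrapped in functools.lru_cache(1), the potentially verbose compatibility warning is logged at most once per process, and callers can simply gate tokenizer export on the boolean result.

# Sketch of how callers are expected to use the cached availability check.
from optimum.intel.utils.import_utils import is_openvino_tokenizers_available

if is_openvino_tokenizers_available():
    print("openvino-tokenizers is installed and binary compatible; tokenizers will be converted")
else:
    # The helper has already logged why (missing package, archive build, nightly mismatch, ...).
    print("skipping tokenizer conversion")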

setup.py (+2, -7)
@@ -58,13 +58,8 @@
 QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"]
 
 EXTRAS_REQUIRE = {
-    "neural-compressor": [
-        "neural-compressor>=2.2.0",
-        "onnxruntime<1.15.0",
-        "accelerate",
-    ],
-    "openvino": ["openvino>=2023.3", "nncf>=2.8.1"],
-    "openvino-tokenizers": ["openvino-tokenizers[transformers]"],
+    "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"],
+    "openvino": ["openvino>=2023.3", "nncf>=2.8.1", "openvino-tokenizers[transformers]"],
     "nncf": ["nncf>=2.8.1"],
     "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<4.39.0"],
     "diffusers": ["diffusers"],

tests/openvino/test_exporters_cli.py (+12, -12)
@@ -66,7 +66,7 @@ class OVCLIExportTestCase(unittest.TestCase):
     )
     EXPECTED_NUMBER_OF_TOKENIZER_MODELS = {
         "gpt2": 2,
-        "t5": 0,  # failed internal sentencepiece check - no <s> token in the vocab
+        "t5": 0,  # no .model file in the repository
         "albert": 0,  # not supported yet
         "distilbert": 1,  # no detokenizer
         "roberta": 2,
@@ -125,26 +125,26 @@ def test_exporters_cli(self, task: str, model_type: str):
         for arch in SUPPORTED_ARCHITECTURES
         if not arch[0].endswith("-with-past") and not arch[1].endswith("-refiner")
     )
-    @unittest.skipIf(not is_openvino_tokenizers_available(), reason="OpenVINO Tokenizers not available")
     def test_exporters_cli_tokenizers(self, task: str, model_type: str):
         with TemporaryDirectory() as tmpdir:
             output = subprocess.check_output(
-                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --convert-tokenizer --task {task} {tmpdir}",
+                f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} {tmpdir}",
                 shell=True,
                 stderr=subprocess.STDOUT,
             ).decode()
-            save_dir = Path(tmpdir)
-            number_of_tokenizers = sum("tokenizer" in file for file in map(str, save_dir.rglob("*.xml")))
-            self.assertEqual(
-                self.EXPECTED_NUMBER_OF_TOKENIZER_MODELS[model_type],
-                number_of_tokenizers,
-                f"OVT: {is_openvino_tokenizers_available() }",
-            )
+            if not is_openvino_tokenizers_available():
+                self.assertTrue(
+                    "OpenVINO Tokenizers is not available." in output
+                    or "OpenVINO and OpenVINO Tokenizers versions are not binary compatible." in output,
+                    msg=output,
+                )
+                return
+
+            number_of_tokenizers = sum("tokenizer" in file for file in map(str, Path(tmpdir).rglob("*.xml")))
+            self.assertEqual(self.EXPECTED_NUMBER_OF_TOKENIZER_MODELS[model_type], number_of_tokenizers, output)
 
             if number_of_tokenizers == 1:
                 self.assertTrue("Detokenizer is not supported, convert tokenizer only." in output, output)
-            elif number_of_tokenizers == 0 and task not in ("image-classification", "audio-classification"):
-                self.assertTrue(("OpenVINO Tokenizer export for" in output and "is not supported." in output), output)
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_exporters_cli_fp16(self, task: str, model_type: str):
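The same check the updated test performs can be reproduced by hand; a minimal sketch (the export directory is a placeholder for whatever the CLI wrote to):

# Count the exported tokenizer/detokenizer IR files, as the test does.
from pathlib import Path

export_dir = Path("gpt2_ov")  # placeholder output directory
number_of_tokenizers = sum("tokenizer" in file for file in map(str, export_dir.rglob("*.xml")))
print(number_of_tokenizers)  # 2 expected for gpt2: tokenizer and detokenizer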
