Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OV Tokenizers Leftovers #697

Merged
merged 9 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions optimum/commands/export/openvino.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,17 @@ def run(self):
)
model.save_pretrained(self.args.output)

# not export when using other exporters
from ...exporters.openvino.convert import export_tokenizer

output = Path(self.args.output)
tokenizer = getattr(model, "tokenizer", None)
if tokenizer is not None:
export_tokenizer(tokenizer, output / "tokenizer")

tokenizer_2 = getattr(model, "tokenizer_2", None)
if tokenizer_2 is not None:
export_tokenizer(tokenizer_2, output / "tokenizer_2")
else:
if self.args.convert_tokenizer:
logger.warning("`--convert-tokenizer` option is deprecated. Tokenizer will be converted by default.")
Expand Down
13 changes: 9 additions & 4 deletions optimum/exporters/openvino/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from optimum.exporters import TasksManager
from optimum.exporters.onnx.base import OnnxConfig
from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
from optimum.exporters.openvino.convert import export_from_model, export_tokenizer
from optimum.exporters.openvino.convert import export_from_model
from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
from optimum.utils.save_utils import maybe_load_preprocessors

Expand Down Expand Up @@ -355,6 +355,11 @@ class StoreAttr(object):
**kwargs_shapes,
)

# hide openvino import when using other exporters
# avoid circular import
from optimum.exporters.openvino.convert import export_tokenizer
from optimum.intel.openvino.utils import OV_TOKENIZER_FLOLDER

if convert_tokenizer and is_openvino_tokenizers_available():
if library_name != "diffusers":
tokenizer = next(
Expand All @@ -364,7 +369,7 @@ class StoreAttr(object):

if tokenizer is not None:
try:
export_tokenizer(tokenizer, output)
export_tokenizer(tokenizer, output / OV_TOKENIZER_FLOLDER)
except Exception as exception:
logger.warning(
"Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer "
Expand All @@ -373,11 +378,11 @@ class StoreAttr(object):
else:
tokenizer = getattr(model, "tokenizer", None)
if tokenizer is not None:
export_tokenizer(tokenizer, output)
export_tokenizer(tokenizer, output / "tokenizer")

tokenizer_2 = getattr(model, "tokenizer_2", None)
if tokenizer_2 is not None:
export_tokenizer(tokenizer_2, output, suffix="_2")
export_tokenizer(tokenizer_2, output / "tokenizer_2")
elif convert_tokenizer and not is_openvino_tokenizers_available():
logger.warning("Tokenizer won't be converted.")

Expand Down
11 changes: 6 additions & 5 deletions optimum/exporters/openvino/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,20 +667,21 @@ def export_tokenizer(
output: Union[str, Path],
suffix: Optional[str] = "",
):
from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME # avoid circular imports
# avoid circular imports
from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME
from optimum.intel.openvino.utils import maybe_convert_tokenizer_to_fast

try:
from openvino_tokenizers import convert_tokenizer
except ModuleNotFoundError:
# avoid this message before tokenizers are part of the openvino dependencies
# logger.info(
# "Run `pip install openvino-tokenizers[transformers]` to get OpenVINO tokenizer/detokenizer models."
# )
return

if not isinstance(output, Path):
output = Path(output)

if output.exists():
tokenizer = maybe_convert_tokenizer_to_fast(tokenizer, output)

try:
converted = convert_tokenizer(tokenizer, with_detokenizer=True)
except NotImplementedError:
Expand Down
24 changes: 24 additions & 0 deletions optimum/intel/openvino/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@
import logging
import os
from glob import glob
from pathlib import Path
from typing import List, Union

import numpy as np
from huggingface_hub import model_info
from openvino.runtime import Core, Type, properties
from transformers import AutoTokenizer, CLIPTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers.onnx.utils import ParameterFormat, compute_serialized_parameters_size


Expand All @@ -31,6 +34,7 @@
OV_DECODER_NAME = "openvino_decoder_model.xml"
OV_DECODER_WITH_PAST_NAME = "openvino_decoder_with_past_model.xml"

OV_TOKENIZER_FLOLDER = "openvino_tokenizer"
OV_TOKENIZER_NAME = "openvino_tokenizer{}.xml"
OV_DETOKENIZER_NAME = "openvino_detokenizer{}.xml"

Expand Down Expand Up @@ -107,6 +111,26 @@
}


# Tokenizer classes that `maybe_convert_tokenizer_to_fast` tries to reload through
# `AutoTokenizer.from_pretrained` when the given tokenizer is not already a `PreTrainedTokenizerFast`.
# NOTE(review): `type(PreTrainedTokenizer)` evaluates to the metaclass of `PreTrainedTokenizer`, so the
# annotation does not describe "a list of tokenizer classes"; `typing.Type[PreTrainedTokenizer]` is presumably
# what was meant. Be careful when changing it: `Type` in this module is imported from `openvino.runtime`.
NEED_CONVERT_TO_FAST_TOKENIZER: List[type(PreTrainedTokenizer)] = [
CLIPTokenizer,
]


def maybe_convert_tokenizer_to_fast(
    hf_tokenizer: PreTrainedTokenizer, tokenizer_path: Path
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """
    Reload a tokenizer through `AutoTokenizer` when its class is listed in `NEED_CONVERT_TO_FAST_TOKENIZER`.

    Args:
        hf_tokenizer (`PreTrainedTokenizer`):
            The tokenizer to inspect. Returned unchanged when it is already a `PreTrainedTokenizerFast` or when
            its class is not listed in `NEED_CONVERT_TO_FAST_TOKENIZER`.
        tokenizer_path (`Path`):
            Location handed to `AutoTokenizer.from_pretrained` to reload the tokenizer.

    Returns:
        `Union[PreTrainedTokenizer, PreTrainedTokenizerFast]`: The tokenizer returned by
        `AutoTokenizer.from_pretrained(tokenizer_path)` (presumably the fast variant when one is available), or
        `hf_tokenizer` itself when no reload is needed or the reload fails.
    """
    if isinstance(hf_tokenizer, PreTrainedTokenizerFast):
        return hf_tokenizer

    # `isinstance` with an empty tuple is False, so an empty list simply disables the conversion.
    if not isinstance(hf_tokenizer, tuple(NEED_CONVERT_TO_FAST_TOKENIZER)):
        return hf_tokenizer

    try:
        return AutoTokenizer.from_pretrained(tokenizer_path)
    except Exception as exception:
        # Best effort: keep the original tokenizer, but leave a trace of why the reload did not work
        # instead of silently discarding the error.
        logging.getLogger(__name__).debug(
            "Could not reload %s from %s, keeping the original tokenizer: %s",
            type(hf_tokenizer).__name__,
            tokenizer_path,
            exception,
        )
        return hf_tokenizer


def use_external_data_format(num_parameters: int) -> bool:
"""
Returns whether or not the model requires using external data format for the ONNX export
Expand Down
4 changes: 2 additions & 2 deletions tests/openvino/test_exporters_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ class OVCLIExportTestCase(unittest.TestCase):
"wav2vec2": 0, # no tokenizer
"bert": 1, # no detokenizer
"blenderbot": 2,
"stable-diffusion": 0, # not supported
"stable-diffusion-xl": 0, # not supported
"stable-diffusion": 2,
"stable-diffusion-xl": 4,
}

SUPPORTED_SD_HYBRID_ARCHITECTURES = (
Expand Down
Loading