Skip to content

Commit 0ff2bbe

Browse files
eaidova and echarlaix authored
Resolve complicated chat templates during tokenizer saving (#1151)
* resolve complicated chat templates during tokenizer saving * Apply suggestions from code review * improve template selection logic and add tests * add updated deepseek template * Update tests/openvino/test_exporters_cli.py * deepseek llama * add comparing templated strings * fix space for minicpm3 template * Update optimum/exporters/openvino/convert.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
1 parent b1cc208 commit 0ff2bbe

File tree

4 files changed

+243
-4
lines changed

4 files changed

+243
-4
lines changed

optimum/exporters/openvino/__main__.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
2424
from requests.exceptions import ConnectionError as RequestsConnectionError
25-
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
25+
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase, ProcessorMixin
2626
from transformers.utils import is_torch_available
2727

2828
from openvino.runtime import Core, Type, save_model
@@ -531,10 +531,15 @@ def maybe_convert_tokenizers(library_name: str, output: Path, model=None, prepro
531531

532532
if is_openvino_tokenizers_available():
533533
if library_name != "diffusers" and preprocessors:
534+
processor_chat_template = None
534535
tokenizer = next(filter(lambda it: isinstance(it, PreTrainedTokenizerBase), preprocessors), None)
536+
if len(preprocessors) > 1:
537+
for processor in preprocessors:
538+
if isinstance(processor, ProcessorMixin) and hasattr(processor, "chat_template"):
539+
processor_chat_template = processor.chat_template
535540
if tokenizer:
536541
try:
537-
export_tokenizer(tokenizer, output, task=task)
542+
export_tokenizer(tokenizer, output, task=task, processor_chat_template=processor_chat_template)
538543
except Exception as exception:
539544
logger.warning(
540545
"Could not load tokenizer using specified model ID or path. OpenVINO tokenizer/detokenizer "

optimum/exporters/openvino/convert.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
remove_none_from_dummy_inputs,
7272
save_config,
7373
save_preprocessors,
74+
set_simplified_chat_template,
7475
)
7576

7677

@@ -825,6 +826,7 @@ def export_tokenizer(
825826
output: Union[str, Path],
826827
suffix: Optional[str] = "",
827828
task: Optional[str] = None,
829+
processor_chat_template: Optional[str] = None,
828830
):
829831
# avoid circular imports
830832
from optimum.intel.openvino import OV_DETOKENIZER_NAME, OV_TOKENIZER_NAME
@@ -849,7 +851,7 @@ def export_tokenizer(
849851

850852
if (
851853
task is not None
852-
and task.startswith("text-generation")
854+
and (task.startswith("text-generation") or task == "image-text-to-text")
853855
and compare_versions("openvino-tokenizers", ">=", "2024.3.0.0")
854856
):
855857
logger.info(f"Set tokenizer padding side to left for `{task}` task.")
@@ -858,6 +860,8 @@ def export_tokenizer(
858860

859861
try:
860862
converted = convert_tokenizer(tokenizer, with_detokenizer=True)
863+
set_simplified_chat_template(converted[0], processor_chat_template)
864+
861865
except NotImplementedError:
862866
logger.info("Detokenizer is not supported, convert tokenizer only.")
863867
converted = convert_tokenizer(tokenizer, with_detokenizer=False)

0 commit comments

Comments (0)