diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp
index faa15145b3..a8a90ebb6c 100644
--- a/src/cpp/include/openvino/genai/tokenizer.hpp
+++ b/src/cpp/include/openvino/genai/tokenizer.hpp
@@ -23,7 +23,22 @@ struct TokenizedInputs {
 };
 
 /**
-* @brief class is used to encode prompts and decode resulting tokens
+ * @brief The class is used to encode prompts and decode resulting tokens
+ *
+ * Chat template is initialized from sources in the following order,
+ * overriding the previous value:
+ * 1. chat_template entry from tokenizer_config.json
+ * 2. chat_template entry from processor_config.json
+ * 3. chat_template entry from chat_template.json
+ * 4. chat_template entry from rt_info section of ov::Model
+ * 5. If the template is known to be unsupported by GenAI, it's
+ *    replaced with a simplified supported version.
+ * 6. Patch chat_template, replacing unsupported instructions with
+ *    equivalents.
+ * 7. If the template was not in the list of unsupported GenAI
+ *    templates from (5), it's blindly replaced with the
+ *    simplified_chat_template entry from rt_info section of
+ *    ov::Model if the entry exists.
 */
 class OPENVINO_GENAI_EXPORTS Tokenizer {
 public:
diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp
index 0d0c3aeb0d..2e4e85ab34 100644
--- a/src/cpp/src/tokenizer.cpp
+++ b/src/cpp/src/tokenizer.cpp
@@ -70,6 +70,61 @@ const std::pair<std::string, std::string> chat_template_fallback_map[] = {
     }
 };
 
+std::optional<std::string> remap_template(const std::string& chat_template) {
+    for (const auto& [known, fallback] : chat_template_fallback_map) {
+        if (chat_template == known) {
+            return fallback;
+        }
+    }
+    return std::nullopt;
+}
+
+void parse_if_exists(const std::filesystem::path& path, std::string& value) {
+    if (std::filesystem::exists(path)) {
+        ov::genai::utils::read_json_param(nlohmann::json::parse(std::ifstream{path}), "chat_template", value);
+    }
+}
+
+template <typename T>
+const T& find_or_fallback(const ov::AnyMap& rt_info, const char name[], const T& fallback) {
+    auto iter = rt_info.find(name);
+    if (rt_info.end() == iter) {
+        return fallback;
+    }
+    return iter->second.as<T>();
+}
+
+std::string patch_template(std::string&& chat_template) {
+    // Replace what jinja2cpp doesn't support
+    std::pair<std::string, std::string> replace_str_map[] = {
+        {"'}", "' }"},
+        {"{'", "{ '"},
+        {".strip()", ""},
+        {"is not none", "is defined"},
+        {"is none", "is undefined"},
+        {"= none", "= undefined"},
+        // Jinja2Cpp does not support Python-style slicing, e.g. [1:].
+        // If chat template contains such slicing, we replace it with
+        // a placeholder at the moment.
+        {"messages[1:]", "slice(messages, 1)"},
+    };
+
+    for (const auto& [from, to] : replace_str_map) {
+        size_t pos = 0;
+        while ((pos = chat_template.find(from, pos)) != std::string::npos) {
+            chat_template.replace(pos, from.size(), to);
+            pos += to.size();
+        }
+    }
+    return chat_template;
+}
+
+std::string remap_and_patch(const std::string& chat_template) {
+    return patch_template(
+        remap_template(chat_template).value_or(chat_template)
+    );
+}
+
 } // namespace
 
 namespace ov {
@@ -195,11 +250,10 @@ class Tokenizer::TokenizerImpl {
         read_special_tokens_map(models_path);
         // Try to read tokenizer_config if some token ids or token str are not defined.
         read_tokenizer_config_if_necessary(models_path);
+        parse_if_exists(models_path / "tokenizer_config.json", m_chat_template);
+        parse_if_exists(models_path / "processor_config.json", m_chat_template);
+        parse_if_exists(models_path / "chat_template.json", m_chat_template);
         setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);
-        m_chat_template = chat_template_from_file_if_exists(models_path, "tokenizer_config.json");
-        if (m_chat_template.empty()) {
-            m_chat_template = chat_template_from_file_if_exists(models_path, "chat_template.json");
-        }
     }
 
     void setup_tokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
@@ -209,10 +263,8 @@ class Tokenizer::TokenizerImpl {
         auto core = get_core_singleton();
         std::string device = "CPU"; // only CPU is supported for now
 
-        std::string version_str;
-        utils::read_rt_info(ov_tokenizer != nullptr ? ov_tokenizer: ov_detokenizer , "openvino_tokenizers_version", version_str);
-        // Saving IR version was added only in 24.5, so if it's empty, then it's older than 24.5
-        m_older_than_24_5 = version_str.empty();
+        // Saving IR version was added only in 24.5, so if it's missing, then it's older than 24.5
+        m_older_than_24_5 = (ov_tokenizer ? ov_tokenizer: ov_detokenizer)->get_rt_info().count("openvino_tokenizers_version") == 0;
 
         if (ov_tokenizer) {
             ov::pass::Manager manager;
@@ -227,6 +279,18 @@ class Tokenizer::TokenizerImpl {
                 [this]() -> ov::InferRequest {
                     return std::move(this->m_tokenizer.create_infer_request());
                 });
+
+            const ov::AnyMap& rt_info = ov_tokenizer->get_rt_info();
+            m_pad_token_id = find_or_fallback(rt_info, "pad_token_id", m_pad_token_id);
+            m_bos_token_id = find_or_fallback(rt_info, "bos_token_id", m_bos_token_id);
+            m_eos_token_id = find_or_fallback(rt_info, "eos_token_id", m_eos_token_id);
+
+            m_chat_template = find_or_fallback(rt_info, "chat_template", m_chat_template);
+            std::optional<std::string> fallback = remap_template(m_chat_template);
+            m_chat_template = patch_template(fallback.value_or(m_chat_template));
+            if (!fallback.has_value()) {
+                m_chat_template = find_or_fallback(rt_info, "simplified_chat_template", m_chat_template);
+            }
         }
 
         if (ov_detokenizer) {
@@ -241,6 +305,14 @@ class Tokenizer::TokenizerImpl {
                 [this]() -> ov::InferRequest {
                     return std::move(this->m_detokenizer.create_infer_request());
                 });
+
+            // Unset/-1 token causes exception in SentencePiece detokenization.
+            if (m_pad_token_id != -1 && m_pad_token.empty())
+                m_pad_token = decode(std::vector{m_pad_token_id}, {ov::genai::add_special_tokens(true)});
+            if (m_bos_token_id != -1 && m_bos_token.empty())
+                m_bos_token = decode(std::vector{m_bos_token_id}, {ov::genai::add_special_tokens(true)});
+            if (m_eos_token_id != -1 && m_eos_token.empty())
+                m_eos_token = decode(std::vector{m_eos_token_id}, {ov::genai::add_special_tokens(true)});
         }
 
         // Initialize tokenizer's cache to save time later.
@@ -251,24 +323,6 @@ class Tokenizer::TokenizerImpl {
         if (m_detokenizer) {
             decode({1, 33, 199, 42, 42});
         }
-
-        if (m_tokenizer) {
-            utils::read_rt_info(ov_tokenizer, "chat_template", m_chat_template);
-            utils::read_rt_info(ov_tokenizer, "pad_token_id", m_pad_token_id);
-            utils::read_rt_info(ov_tokenizer, "bos_token_id", m_bos_token_id);
-            utils::read_rt_info(ov_tokenizer, "eos_token_id", m_eos_token_id);
-        }
-
-        m_chat_template = patch_chat_template(m_chat_template);
-        if (m_detokenizer) {
-            // Unset/-1 token causes exception in SentencePiece detokenization.
-            if (m_pad_token_id != -1 && m_pad_token.empty())
-                m_pad_token = decode(std::vector{m_pad_token_id}, {ov::genai::add_special_tokens(true)});
-            if (m_bos_token_id != -1 && m_bos_token.empty())
-                m_bos_token = decode(std::vector{m_bos_token_id}, {ov::genai::add_special_tokens(true)});
-            if (m_eos_token_id != -1 && m_eos_token.empty())
-                m_eos_token = decode(std::vector{m_eos_token_id}, {ov::genai::add_special_tokens(true)});
-        }
     }
 
     // load special tokens ids from config.json
@@ -495,53 +549,10 @@ class Tokenizer::TokenizerImpl {
         return std::vector(res_data, res_data + res.get_shape()[0]);
     }
 
-    std::string patch_chat_template(std::string template_str) const {
-        for (const auto& [chat_template, fallback] : chat_template_fallback_map) {
-            if (template_str == chat_template) {
-                return fallback;
-            }
-        }
-
-        // Replace what jinja2cpp doesn't support
-        std::pair<std::string, std::string> replace_str_map[] = {
-            {"'}", "' }"},
-            {"{'", "{ '"},
-            {".strip()", ""},
-            {"is not none", "is defined"},
-            {"is none", "is undefined"},
-            {"= none", "= undefined"},
-            // Jinja2Cpp does not support Python-style slicing, e.g. [1:].
-            // If chat template contains such slicing, we replace it with
-            // a placeholder at the moment.
-            {"messages[1:]", "slice(messages, 1)"},
-        };
-
-        for (const auto& [from, to] : replace_str_map) {
-            size_t pos = 0;
-            while ((pos = template_str.find(from, pos)) != std::string::npos) {
-                template_str.replace(pos, from.size(), to);
-                pos += to.size();
-            }
-        }
-        return template_str;
-    }
-
-    std::string chat_template_from_file_if_exists(const std::filesystem::path& path, const std::string& file_name) {
-        auto tokenizer_config_file_path = path / file_name;
-        if (!std::filesystem::exists(tokenizer_config_file_path))
-            return "";
-        std::ifstream file(tokenizer_config_file_path);
-
-        std::string res;
-        ov::genai::utils::read_json_param(nlohmann::json::parse(file), "chat_template", res);
-
-        return patch_chat_template(res);
-    }
-
     std::string apply_chat_template(ChatHistory history,
                                     bool add_generation_prompt,
                                     const std::string& chat_template) const {
-        std::string chat_tpl = chat_template.empty() ? m_chat_template : patch_chat_template(chat_template);
+        std::string chat_tpl = chat_template.empty() ? m_chat_template : remap_and_patch(chat_template);
         OPENVINO_ASSERT(!chat_tpl.empty(),
                         "Chat template wasn't found. This may indicate that the model wasn't trained for chat scenario."
                         " Please add 'chat_template' to tokenizer_config.json to use the model in chat scenario."
@@ -599,7 +610,7 @@ class Tokenizer::TokenizerImpl {
     }
 
     void set_chat_template(const std::string& chat_template) {
-        m_chat_template = patch_chat_template(chat_template);
+        m_chat_template = remap_and_patch(chat_template);
     }
 
     std::string get_chat_template() {
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index e137a85ff4..7f64baac7a 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -283,23 +283,6 @@ void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model
     }
 }
 
-template <typename T>
-void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value) {
-    if (!model)
-        return;
-    if (model->get_rt_info().count(name) == 0)
-        return;
-    auto str_value = model->get_rt_info().at(name).as<std::string>();
-    if constexpr (std::is_same<T, int64_t>::value) {
-        value = std::stoll(str_value);
-    } else if constexpr (std::is_same<T, std::string>::value) {
-        value = str_value;
-    }
-}
-
-template void read_rt_info(std::shared_ptr<ov::Model>&, const char*, int64_t&);
-template void read_rt_info(std::shared_ptr<ov::Model>&, const char*, std::string&);
-
 ov::Core singleton_core() {
     static ov::Core core;
     return core;
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index 25747b24a0..56ac1c66ef 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -111,9 +111,6 @@ void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model
 
 ov::Core singleton_core();
 
-template <typename T>
-void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value);
-
 size_t get_first_history_difference(const ov::Tensor& encoded_history, const std::vector<int64_t> tokenized_history, std::set<int64_t> stop_tokens);
 
 size_t get_seq_len_axis(std::shared_ptr<const ov::Model> model);
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index a518012239..e9a5a9c07e 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -1713,8 +1713,23 @@ class TokenizedInputs:
         ...
 class Tokenizer:
     """
-    openvino_genai.Tokenizer object is used to initialize Tokenizer
-    if it's located in a different path than the main model.
+
+    The class is used to encode prompts and decode resulting tokens
+
+    Chat template is initialized from sources in the following order,
+    overriding the previous value:
+    1. chat_template entry from tokenizer_config.json
+    2. chat_template entry from processor_config.json
+    3. chat_template entry from chat_template.json
+    4. chat_template entry from rt_info section of openvino.Model
+    5. If the template is known to be unsupported by GenAI, it's
+       replaced with a simplified supported version.
+    6. Patch chat_template, replacing unsupported instructions with
+       equivalents.
+    7. If the template was not in the list of unsupported GenAI
+       templates from (5), it's blindly replaced with the
+       simplified_chat_template entry from rt_info section of
+       openvino.Model if the entry exists.
     """
     chat_template: str
     def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}, **kwargs) -> None:
diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp
index 32318e360d..fdff99c422 100644
--- a/src/python/py_tokenizer.cpp
+++ b/src/python/py_tokenizer.cpp
@@ -13,6 +13,29 @@
 
 #include "py_utils.hpp"
 
+namespace {
+
+constexpr char class_docstring[] = R"(
+    The class is used to encode prompts and decode resulting tokens
+
+    Chat template is initialized from sources in the following order,
+    overriding the previous value:
+    1. chat_template entry from tokenizer_config.json
+    2. chat_template entry from processor_config.json
+    3. chat_template entry from chat_template.json
+    4. chat_template entry from rt_info section of openvino.Model
+    5. If the template is known to be unsupported by GenAI, it's
+       replaced with a simplified supported version.
+    6. Patch chat_template, replacing unsupported instructions with
+       equivalents.
+    7. If the template was not in the list of unsupported GenAI
+       templates from (5), it's blindly replaced with the
+       simplified_chat_template entry from rt_info section of
+       openvino.Model if the entry exists.
+)";
+
+} // namespace
+
 namespace py = pybind11;
 namespace pyutils = ov::genai::pybind::utils;
 
@@ -26,9 +49,7 @@ void init_tokenizer(py::module_& m) {
         .def_readwrite("input_ids", &TokenizedInputs::input_ids)
         .def_readwrite("attention_mask", &TokenizedInputs::attention_mask);
 
-    py::class_<ov::genai::Tokenizer>(m, "Tokenizer",
-        R"(openvino_genai.Tokenizer object is used to initialize Tokenizer
-           if it's located in a different path than the main model.)")
+    py::class_<ov::genai::Tokenizer>(m, "Tokenizer", class_docstring)
         .def(py::init([](const std::filesystem::path& tokenizer_path, const std::map<std::string, py::object>& properties, const py::kwargs& kwargs) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py
index 73e70a6d66..67107f1c47 100644
--- a/tests/python_tests/test_kv_cache_eviction.py
+++ b/tests/python_tests/test_kv_cache_eviction.py
@@ -117,6 +117,7 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
     generation_config = GenerationConfig()  # expecting default greedy sampling
     generation_config.num_return_sequences = 1
     generation_config.max_new_tokens = test_struct.max_new_tokens
+    generation_config.apply_chat_template = False
 
     scheduler_config_opt = get_scheduler_config(test_struct.num_kv_blocks)
     scheduler_config_opt.use_cache_eviction = test_struct.use_cache_eviction
diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py
index 6e7f53c79d..b27d0e8e62 100644
--- a/tests/python_tests/test_tokenizer.py
+++ b/tests/python_tests/test_tokenizer.py
@@ -5,6 +5,9 @@
 import sys
 import pytest
 import numpy as np
+import dataclasses
+import openvino
+import typing
 from transformers import AutoTokenizer
 from typing import Dict, Tuple, List
 from openvino_genai import Tokenizer
@@ -444,3 +447,68 @@ def test_load_special_tokens_from_special_tokens_map_json_with_string_repr(model
 
     if 'eos_token' in token_str_int_map:
         assert tok.get_eos_token_id() == token_str_int_map['eos_token']
+
+@dataclasses.dataclass(frozen=True)
+class ChatTemplates:
+    reference: typing.Optional[str]
+    rt_simplified: typing.Optional[str]
+    rt_template: typing.Optional[str]
+    chat_template_json: typing.Optional[str]
+    processor_config_json: typing.Optional[str]
+    tokenizer_config_json: typing.Optional[str]
+
+
+def generate_tokenizer(tmp_path, chat_templates):
+    input_ids = openvino.op.Constant(openvino.Type.i64, openvino.Shape([0, 0]), []).output(0)
+    input_ids.get_tensor().set_names({"input_ids"})
+    attention_mask = openvino.op.Constant(openvino.Type.i64, openvino.Shape([0, 0]), []).output(0)
+    attention_mask.get_tensor().set_names({"attention_mask"})
+    model = openvino.Model(
+        [openvino.op.Result(input_ids), openvino.op.Result(attention_mask)],
+        [openvino.op.Parameter(openvino.Type.string, openvino.Shape([1]))]
+    )
+    if chat_templates.rt_simplified is not None:
+        model.set_rt_info(chat_templates.rt_simplified, "simplified_chat_template")
+    if chat_templates.rt_template is not None:
+        model.set_rt_info(chat_templates.rt_template, "chat_template")
+    if chat_templates.chat_template_json is not None:
+        with open(tmp_path / "chat_template.json", "w") as file:
+            json.dump({"chat_template": chat_templates.chat_template_json}, file)
+    if chat_templates.processor_config_json is not None:
+        with open(tmp_path / "processor_config.json", "w") as file:
+            json.dump({"chat_template": chat_templates.processor_config_json}, file)
+    if chat_templates.tokenizer_config_json is not None:
+        with open(tmp_path / "tokenizer_config.json", "w") as file:
+            json.dump({"chat_template": chat_templates.tokenizer_config_json}, file)
+    openvino.save_model(model, tmp_path / "openvino_tokenizer.xml")
+    return Tokenizer(tmp_path)
+
+
+QWEN2_VL_2B = "{% if messages is string %}{{ messages }}{% else %}{% for content in messages %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}"
+
+
+SIMPLIFIED_QWEN2_VL_2B = "{% for message in messages %}{{ message['content'] }}{% endfor %}"
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_set_special_runtime_template(tmp_path):
+    tokenizer = generate_tokenizer(tmp_path, ChatTemplates(None, None, None, None, None, None))
+    tokenizer.chat_template = QWEN2_VL_2B
+    assert tokenizer.chat_template == SIMPLIFIED_QWEN2_VL_2B
+
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize("chat_templates", [
+    ChatTemplates("correct template", "correct template", "", "", "", ""),
+    ChatTemplates("correct template", None, "correct template", "", "", ""),
+    ChatTemplates("correct template", None, None, "correct template", "", ""),
+    ChatTemplates("correct template", None, None, None, "correct template", ""),
+    ChatTemplates("correct template", None, None, None, None, "correct template"),
+    ChatTemplates(SIMPLIFIED_QWEN2_VL_2B, "", QWEN2_VL_2B, "", "", ""),
+])
+def test_template_priorities(tmp_path, chat_templates):
+    generate_tokenizer(tmp_path, chat_templates)
+    tokenizer = generate_tokenizer(tmp_path, chat_templates)
+    assert tokenizer.chat_template == chat_templates.reference
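
Usage note (not part of the patch): a minimal sketch of how the resolved template surfaces through the Python API, based only on the chat_template property shown in the stub above and the behavior exercised by test_set_special_runtime_template. The directory path is a hypothetical placeholder; it is assumed to contain openvino_tokenizer.xml and, optionally, the JSON configs listed in the new docstring.

    import openvino_genai

    # chat_template holds the value resolved from the JSON configs and the
    # model's rt_info, already remapped/patched for Jinja2Cpp support.
    tokenizer = openvino_genai.Tokenizer("./tokenizer_dir")  # hypothetical path
    print(tokenizer.chat_template)

    # Assigning a template runs the same remap/patch step as loading
    # (set_chat_template -> remap_and_patch in the C++ change above), so a
    # template known to be unsupported is stored as its simplified fallback.
    tokenizer.chat_template = "{% for message in messages %}{{ message['content'] }}{% endfor %}"
    print(tokenizer.chat_template)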