
Commit 982a0dd

tokenizer: read simplified_chat_template (#1712)
Depends on huggingface/optimum-intel#1151. Closes #1663. Ticket: 161313.
1 parent cfd220e commit 982a0dd

File tree

8 files changed (+208 -97 lines)


src/cpp/include/openvino/genai/tokenizer.hpp

+16 -1
@@ -23,7 +23,22 @@ struct TokenizedInputs {
 };
 
 /**
- * @brief class is used to encode prompts and decode resulting tokens
+ * @brief The class is used to encode prompts and decode resulting tokens.
+ *
+ * The chat template is initialized from the following sources, in order,
+ * with each later source overriding the previous value:
+ * 1. chat_template entry from tokenizer_config.json
+ * 2. chat_template entry from processor_config.json
+ * 3. chat_template entry from chat_template.json
+ * 4. chat_template entry from the rt_info section of ov::Model
+ * 5. If the template is known to be unsupported by GenAI, it's
+ *    replaced with a simplified supported version.
+ * 6. The chat template is patched to replace unsupported instructions
+ *    with equivalents.
+ * 7. If the template was not in the list of unsupported GenAI
+ *    templates from (5), it's blindly replaced with the
+ *    simplified_chat_template entry from the rt_info section of
+ *    ov::Model, if that entry exists.
 */
 class OPENVINO_GENAI_EXPORTS Tokenizer {
 public:
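
To make the precedence above concrete, here is a minimal, self-contained C++ sketch of the override chain from steps 1-4: each later source, if present, overrides the accumulated value. The pick_template() helper and the sample template strings are illustrative only, not part of the GenAI API.

#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Later sources win: walk the list in order and let every present entry
// override whatever was collected before it.
std::string pick_template(const std::vector<std::optional<std::string>>& sources) {
    std::string value;  // empty means "no template found yet"
    for (const auto& source : sources) {
        if (source.has_value()) {
            value = *source;
        }
    }
    return value;
}

int main() {
    // Sources listed in the same order as steps 1-4 of the doc comment.
    std::vector<std::optional<std::string>> sources{
        std::string{"{{ tokenizer_config_template }}"},  // tokenizer_config.json
        std::nullopt,                                    // processor_config.json: absent
        std::nullopt,                                    // chat_template.json: absent
        std::string{"{{ rt_info_template }}"},           // rt_info of ov::Model
    };
    std::cout << pick_template(sources) << '\n';  // prints the rt_info template
}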

src/cpp/src/tokenizer.cpp

+82 -71
@@ -70,6 +70,61 @@ const std::pair<std::string, std::string> chat_template_fallback_map[] = {
     }
 };
 
+std::optional<std::string> remap_template(const std::string& chat_template) {
+    for (const auto& [known, fallback] : chat_template_fallback_map) {
+        if (chat_template == known) {
+            return fallback;
+        }
+    }
+    return std::nullopt;
+}
+
+void parse_if_exists(const std::filesystem::path& path, std::string& value) {
+    if (std::filesystem::exists(path)) {
+        ov::genai::utils::read_json_param(nlohmann::json::parse(std::ifstream{path}), "chat_template", value);
+    }
+}
+
+template <typename T>
+const T& find_or_fallback(const ov::AnyMap& rt_info, const char name[], const T& fallback) {
+    auto iter = rt_info.find(name);
+    if (rt_info.end() == iter) {
+        return fallback;
+    }
+    return iter->second.as<T>();
+}
+
+std::string patch_template(std::string&& chat_template) {
+    // Replace what jinja2cpp doesn't support
+    std::pair<std::string, std::string> replace_str_map[] = {
+        {"'}", "' }"},
+        {"{'", "{ '"},
+        {".strip()", ""},
+        {"is not none", "is defined"},
+        {"is none", "is undefined"},
+        {"= none", "= undefined"},
+        // Jinja2Cpp does not support Python-style slicing, e.g. [1:].
+        // If the chat template contains such slicing, we replace it
+        // with a placeholder for now.
+        {"messages[1:]", "slice(messages, 1)"},
+    };
+
+    for (const auto& [from, to] : replace_str_map) {
+        size_t pos = 0;
+        while ((pos = chat_template.find(from, pos)) != std::string::npos) {
+            chat_template.replace(pos, from.size(), to);
+            pos += to.size();
+        }
+    }
+    return chat_template;
+}
+
+std::string remap_and_patch(const std::string& chat_template) {
+    return patch_template(
+        remap_template(chat_template).value_or(chat_template)
+    );
+}
+
 } // namespace
 
 namespace ov {
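
The patch_template() helper added above is pure string rewriting, so its behavior is easy to exercise in isolation. Below is a self-contained sketch that applies the same find/replace pairs to a made-up Jinja template; the sample input is hypothetical, and the function takes its argument by value here purely to keep the sketch standalone.

#include <iostream>
#include <string>
#include <utility>

std::string patch_template(std::string chat_template) {  // by value, for the sketch
    // Same replacement pairs as in the diff above.
    const std::pair<std::string, std::string> replace_str_map[] = {
        {"'}", "' }"},
        {"{'", "{ '"},
        {".strip()", ""},
        {"is not none", "is defined"},
        {"is none", "is undefined"},
        {"= none", "= undefined"},
        {"messages[1:]", "slice(messages, 1)"},
    };
    for (const auto& [from, to] : replace_str_map) {
        size_t pos = 0;
        while ((pos = chat_template.find(from, pos)) != std::string::npos) {
            chat_template.replace(pos, from.size(), to);
            pos += to.size();  // resume after the replacement, not inside it
        }
    }
    return chat_template;
}

int main() {
    std::cout << patch_template(
        "{% if messages[0]['role'].strip() is not none %}{{ messages[1:] }}{% endif %}")
              << '\n';
    // Prints:
    // {% if messages[0]['role'] is defined %}{{ slice(messages, 1) }}{% endif %}
}

Note that "is not none" is listed before "is none", so the longer pattern is rewritten first and the shorter one never matches inside its replacement.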
@@ -195,11 +250,10 @@ class Tokenizer::TokenizerImpl {
         read_special_tokens_map(models_path);
         // Try to read tokenizer_config if some token ids or token str are not defined.
         read_tokenizer_config_if_necessary(models_path);
+        parse_if_exists(models_path / "tokenizer_config.json", m_chat_template);
+        parse_if_exists(models_path / "processor_config.json", m_chat_template);
+        parse_if_exists(models_path / "chat_template.json", m_chat_template);
         setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);
-        m_chat_template = chat_template_from_file_if_exists(models_path, "tokenizer_config.json");
-        if (m_chat_template.empty()) {
-            m_chat_template = chat_template_from_file_if_exists(models_path, "chat_template.json");
-        }
     }
 
     void setup_tokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
@@ -209,10 +263,8 @@ class Tokenizer::TokenizerImpl {
         auto core = get_core_singleton();
         std::string device = "CPU"; // only CPU is supported for now
 
-        std::string version_str;
-        utils::read_rt_info(ov_tokenizer != nullptr ? ov_tokenizer: ov_detokenizer , "openvino_tokenizers_version", version_str);
-        // Saving IR version was added only in 24.5, so if it's empty, then it's older than 24.5
-        m_older_than_24_5 = version_str.empty();
+        // Saving IR version was added only in 24.5, so if it's missing, then it's older than 24.5
+        m_older_than_24_5 = (ov_tokenizer ? ov_tokenizer: ov_detokenizer)->get_rt_info().count("openvino_tokenizers_version") == 0;
 
         if (ov_tokenizer) {
             ov::pass::Manager manager;
@@ -227,6 +279,18 @@ class Tokenizer::TokenizerImpl {
                 [this]() -> ov::InferRequest {
                     return std::move(this->m_tokenizer.create_infer_request());
                 });
+
+            const ov::AnyMap& rt_info = ov_tokenizer->get_rt_info();
+            m_pad_token_id = find_or_fallback(rt_info, "pad_token_id", m_pad_token_id);
+            m_bos_token_id = find_or_fallback(rt_info, "bos_token_id", m_bos_token_id);
+            m_eos_token_id = find_or_fallback(rt_info, "eos_token_id", m_eos_token_id);
+
+            m_chat_template = find_or_fallback(rt_info, "chat_template", m_chat_template);
+            std::optional<std::string> fallback = remap_template(m_chat_template);
+            m_chat_template = patch_template(fallback.value_or(m_chat_template));
+            if (!fallback.has_value()) {
+                m_chat_template = find_or_fallback(rt_info, "simplified_chat_template", m_chat_template);
+            }
         }
 
         if (ov_detokenizer) {
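
find_or_fallback() keeps the caller's current member value whenever the rt_info key is absent, which is what makes the rt_info entries optional overrides rather than hard requirements. A minimal standalone sketch of that pattern, with std::map standing in for ov::AnyMap so it compiles without OpenVINO headers:

#include <cstdint>
#include <iostream>
#include <map>
#include <string>

// Return the mapped value when the key exists, the fallback otherwise.
template <typename T>
const T& find_or_fallback(const std::map<std::string, T>& rt_info,
                          const std::string& name, const T& fallback) {
    auto iter = rt_info.find(name);
    return iter == rt_info.end() ? fallback : iter->second;
}

int main() {
    std::map<std::string, std::int64_t> rt_info{{"eos_token_id", 2}};
    std::int64_t pad_token_id = -1;  // current value, kept when the key is missing
    std::int64_t eos_token_id = -1;

    pad_token_id = find_or_fallback(rt_info, "pad_token_id", pad_token_id);
    eos_token_id = find_or_fallback(rt_info, "eos_token_id", eos_token_id);

    std::cout << pad_token_id << ' ' << eos_token_id << '\n';  // prints: -1 2
}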
@@ -241,6 +305,14 @@ class Tokenizer::TokenizerImpl {
                 [this]() -> ov::InferRequest {
                     return std::move(this->m_detokenizer.create_infer_request());
                 });
+
+            // Unset/-1 token causes exception in SentencePiece detokenization.
+            if (m_pad_token_id != -1 && m_pad_token.empty())
+                m_pad_token = decode(std::vector{m_pad_token_id}, {ov::genai::add_special_tokens(true)});
+            if (m_bos_token_id != -1 && m_bos_token.empty())
+                m_bos_token = decode(std::vector{m_bos_token_id}, {ov::genai::add_special_tokens(true)});
+            if (m_eos_token_id != -1 && m_eos_token.empty())
+                m_eos_token = decode(std::vector{m_eos_token_id}, {ov::genai::add_special_tokens(true)});
         }
 
         // Initialize tokenizer's cache to save time later.
@@ -251,24 +323,6 @@ class Tokenizer::TokenizerImpl {
         if (m_detokenizer) {
             decode({1, 33, 199, 42, 42});
         }
-
-        if (m_tokenizer) {
-            utils::read_rt_info(ov_tokenizer, "chat_template", m_chat_template);
-            utils::read_rt_info(ov_tokenizer, "pad_token_id", m_pad_token_id);
-            utils::read_rt_info(ov_tokenizer, "bos_token_id", m_bos_token_id);
-            utils::read_rt_info(ov_tokenizer, "eos_token_id", m_eos_token_id);
-        }
-
-        m_chat_template = patch_chat_template(m_chat_template);
-        if (m_detokenizer) {
-            // Unset/-1 token causes exception in SentencePiece detokenization.
-            if (m_pad_token_id != -1 && m_pad_token.empty())
-                m_pad_token = decode(std::vector{m_pad_token_id}, {ov::genai::add_special_tokens(true)});
-            if (m_bos_token_id != -1 && m_bos_token.empty())
-                m_bos_token = decode(std::vector{m_bos_token_id}, {ov::genai::add_special_tokens(true)});
-            if (m_eos_token_id != -1 && m_eos_token.empty())
-                m_eos_token = decode(std::vector{m_eos_token_id}, {ov::genai::add_special_tokens(true)});
-        }
     }
 
     // load special tokens ids from config.json
@@ -495,53 +549,10 @@ class Tokenizer::TokenizerImpl {
         return std::vector<std::string>(res_data, res_data + res.get_shape()[0]);
     }
 
-    std::string patch_chat_template(std::string template_str) const {
-        for (const auto& [chat_template, fallback] : chat_template_fallback_map) {
-            if (template_str == chat_template) {
-                return fallback;
-            }
-        }
-
-        // Replace what jinja2cpp doesn't support
-        std::pair<std::string, std::string> replace_str_map[] = {
-            {"'}", "' }"},
-            {"{'", "{ '"},
-            {".strip()", ""},
-            {"is not none", "is defined"},
-            {"is none", "is undefined"},
-            {"= none", "= undefined"},
-            // Jinja2Cpp does not support Python-style slicing, e.g. [1:].
-            // If chat template contains such slicing, we replace it with
-            // a placeholder at the moment.
-            {"messages[1:]", "slice(messages, 1)"},
-        };
-
-        for (const auto& [from, to] : replace_str_map) {
-            size_t pos = 0;
-            while ((pos = template_str.find(from, pos)) != std::string::npos) {
-                template_str.replace(pos, from.size(), to);
-                pos += to.size();
-            }
-        }
-        return template_str;
-    }
-
-    std::string chat_template_from_file_if_exists(const std::filesystem::path& path, const std::string& file_name) {
-        auto tokenizer_config_file_path = path / file_name;
-        if (!std::filesystem::exists(tokenizer_config_file_path))
-            return "";
-        std::ifstream file(tokenizer_config_file_path);
-
-        std::string res;
-        ov::genai::utils::read_json_param(nlohmann::json::parse(file), "chat_template", res);
-
-        return patch_chat_template(res);
-    }
-
     std::string apply_chat_template(ChatHistory history,
                                     bool add_generation_prompt,
                                     const std::string& chat_template) const {
-        std::string chat_tpl = chat_template.empty() ? m_chat_template : patch_chat_template(chat_template);
+        std::string chat_tpl = chat_template.empty() ? m_chat_template : remap_and_patch(chat_template);
         OPENVINO_ASSERT(!chat_tpl.empty(),
                         "Chat template wasn't found. This may indicate that the model wasn't trained for chat scenario."
                         " Please add 'chat_template' to tokenizer_config.json to use the model in chat scenario."
@@ -599,7 +610,7 @@ class Tokenizer::TokenizerImpl {
     }
 
     void set_chat_template(const std::string& chat_template) {
-        m_chat_template = patch_chat_template(chat_template);
+        m_chat_template = remap_and_patch(chat_template);
    }
 
     std::string get_chat_template() {
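
Taken together, these changes only affect how the chat template is resolved and stored; the public API is unchanged. A minimal usage sketch, assuming a model directory exported with openvino_tokenizers: the path is a placeholder, and the call relies on the Tokenizer path constructor and the apply_chat_template(history, add_generation_prompt, chat_template) overload shown above, with the template argument left at its default (empty, meaning "use the stored template").

#include <iostream>

#include "openvino/genai/tokenizer.hpp"

int main() {
    // The constructor resolves, remaps, and patches the chat template from
    // the sources listed in the tokenizer.hpp doc comment above.
    ov::genai::Tokenizer tokenizer("path/to/exported/model");  // placeholder path

    ov::genai::ChatHistory history{
        {{"role", "user"}, {"content", "What is OpenVINO?"}},
    };

    // Uses the stored template; passing a non-empty template as the third
    // argument would run it through remap_and_patch() instead.
    std::string prompt = tokenizer.apply_chat_template(history, /*add_generation_prompt=*/true);
    std::cout << prompt << '\n';
}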

src/cpp/src/utils.cpp

-17
@@ -283,23 +283,6 @@ void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model)
     }
 }
 
-template <typename T>
-void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value) {
-    if (!model)
-        return;
-    if (model->get_rt_info().count(name) == 0)
-        return;
-    auto str_value = model->get_rt_info().at(name).as<std::string>();
-    if constexpr (std::is_same<T, int64_t>::value) {
-        value = std::stoll(str_value);
-    } else if constexpr (std::is_same<T, std::string>::value) {
-        value = str_value;
-    }
-}
-
-template void read_rt_info<int64_t>(std::shared_ptr<ov::Model>&, const char*, int64_t&);
-template void read_rt_info<std::string>(std::shared_ptr<ov::Model>&, const char*, std::string&);
-
 ov::Core singleton_core() {
     static ov::Core core;
     return core;

src/cpp/src/utils.hpp

-3
@@ -111,9 +111,6 @@ void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model)
 
 ov::Core singleton_core();
 
-template <typename T>
-void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value);
-
 size_t get_first_history_difference(const ov::Tensor& encoded_history, const std::vector<int64_t> tokenized_history, std::set<int64_t> stop_tokens);
 
 size_t get_seq_len_axis(std::shared_ptr<const ov::Model> model);

src/python/openvino_genai/py_openvino_genai.pyi

+17 -2
@@ -1713,8 +1713,23 @@ class TokenizedInputs:
     ...
 class Tokenizer:
     """
-    openvino_genai.Tokenizer object is used to initialize Tokenizer
-    if it's located in a different path than the main model.
+
+    The class is used to encode prompts and decode resulting tokens.
+
+    The chat template is initialized from the following sources, in order,
+    with each later source overriding the previous value:
+    1. chat_template entry from tokenizer_config.json
+    2. chat_template entry from processor_config.json
+    3. chat_template entry from chat_template.json
+    4. chat_template entry from the rt_info section of openvino.Model
+    5. If the template is known to be unsupported by GenAI, it's
+       replaced with a simplified supported version.
+    6. The chat template is patched to replace unsupported instructions
+       with equivalents.
+    7. If the template was not in the list of unsupported GenAI
+       templates from (5), it's blindly replaced with the
+       simplified_chat_template entry from the rt_info section of
+       openvino.Model, if that entry exists.
     """
     chat_template: str
     def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}, **kwargs) -> None:

src/python/py_tokenizer.cpp

+24 -3
@@ -13,6 +13,29 @@
 
 #include "py_utils.hpp"
 
+namespace {
+
+constexpr char class_docstring[] = R"(
+The class is used to encode prompts and decode resulting tokens.
+
+The chat template is initialized from the following sources, in order,
+with each later source overriding the previous value:
+1. chat_template entry from tokenizer_config.json
+2. chat_template entry from processor_config.json
+3. chat_template entry from chat_template.json
+4. chat_template entry from the rt_info section of openvino.Model
+5. If the template is known to be unsupported by GenAI, it's
+   replaced with a simplified supported version.
+6. The chat template is patched to replace unsupported instructions
+   with equivalents.
+7. If the template was not in the list of unsupported GenAI
+   templates from (5), it's blindly replaced with the
+   simplified_chat_template entry from the rt_info section of
+   openvino.Model, if that entry exists.
+)";
+
+} // namespace
+
 namespace py = pybind11;
 namespace pyutils = ov::genai::pybind::utils;
 

@@ -26,9 +49,7 @@ void init_tokenizer(py::module_& m) {
         .def_readwrite("input_ids", &TokenizedInputs::input_ids)
         .def_readwrite("attention_mask", &TokenizedInputs::attention_mask);
 
-    py::class_<ov::genai::Tokenizer>(m, "Tokenizer",
-        R"(openvino_genai.Tokenizer object is used to initialize Tokenizer
-        if it's located in a different path than the main model.)")
+    py::class_<ov::genai::Tokenizer>(m, "Tokenizer", class_docstring)
 
         .def(py::init([](const std::filesystem::path& tokenizer_path, const std::map<std::string, py::object>& properties, const py::kwargs& kwargs) {
             ScopedVar env_manager(pyutils::ov_tokenizers_module_path());

tests/python_tests/test_kv_cache_eviction.py

+1
@@ -117,6 +117,7 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
     generation_config = GenerationConfig() # expecting default greedy sampling
     generation_config.num_return_sequences = 1
     generation_config.max_new_tokens = test_struct.max_new_tokens
+    generation_config.apply_chat_template = False
 
     scheduler_config_opt = get_scheduler_config(test_struct.num_kv_blocks)
     scheduler_config_opt.use_cache_eviction = test_struct.use_cache_eviction
