
Commit c58ba64

Add/skip special tokens in runtime (openvinotoolkit#859)
CVS-152371

In Python:

```python
tok = pipe.get_tokenizer()
res_genai = tok.encode(prompt, add_special_tokens=False).input_ids
```

In C++:

```cpp
auto tok = pipe.get_tokenizer();
auto res_genai = tok.encode(prompt, ov::genai::add_special_tokens(false)).input_ids;
```
1 parent 4bb683e commit c58ba64

8 files changed: +226 −31 lines changed

.github/workflows/causal_lm_cpp.yml (+1 −1)

```diff
@@ -665,7 +665,7 @@ jobs:
 output.write('question:\n')
 chat_history.append(gen_prompt(prompt))
 chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
-tokenized = tokenizer(chat_prompt, return_tensors='pt')
+tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
 answer = model.generate(**tokenized, max_length=1000, do_sample=False)
 answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
 chat_history.append(gen_answer(answer_str))
```

src/cpp/include/openvino/genai/tokenizer.hpp (+34 −5)

```diff
@@ -9,6 +9,7 @@
 
 #include "openvino/runtime/tensor.hpp"
 #include "openvino/genai/visibility.hpp"
+#include <openvino/runtime/properties.hpp>
 
 namespace ov {
 namespace genai {
@@ -33,19 +34,44 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
 
     /**
      * @brief encode a single prompt
+     * @param prompt std::string with input prompt
+     * @param tokenization_params AnyMap with tokenization parameters, e.g. {'add_special_tokens', false}
      * @return pair of [input_ids, attention_mask]
      */
-    TokenizedInputs encode(const std::string prompt);
+    TokenizedInputs encode(const std::string prompt, const ov::AnyMap& tokenization_params = {});
 
     /**
      * @brief encode batch of prompts. Left padding will be applied by default
      * @param prompts vector storing batch of prompts
+     * @param tokenization_params AnyMap with tokenization parameters, e.g. {'add_special_tokens', false}
      * @return pair of [input_ids, attention_mask]
      */
-    TokenizedInputs encode(std::vector<std::string>& prompts);
-    TokenizedInputs encode(std::vector<std::string>&& prompts);
-    TokenizedInputs encode(std::initializer_list<std::string>& prompts);
-
+    TokenizedInputs encode(std::vector<std::string>& prompt, const ov::AnyMap& tokenization_params = {});
+    TokenizedInputs encode(std::vector<std::string>&& prompts, const ov::AnyMap& tokenization_params = {});
+    TokenizedInputs encode(std::initializer_list<std::string>& prompts, const ov::AnyMap& tokenization_params = {});
+
+    /**
+     * @brief encode a single prompt
+     * @param prompt std::string with input prompt
+     * @param properties tokenization properties, e.g. ov::genai::add_special_tokens(false)
+     * @return pair of [input_ids, attention_mask]
+     */
+    template <typename... Properties>
+    util::EnableIfAllStringAny<TokenizedInputs, Properties...> encode(std::string& prompt, Properties&&... properties) {
+        return encode(prompt, AnyMap{std::forward<Properties>(properties)...});
+    }
+
+    /**
+     * @brief encode batch of prompts. Left padding will be applied by default
+     * @param prompts vector storing batch of prompts
+     * @param properties tokenization properties, e.g. ov::genai::add_special_tokens(false)
+     * @return pair of [input_ids, attention_mask]
+     */
+    template <typename... Properties>
+    util::EnableIfAllStringAny<TokenizedInputs, Properties...> encode(std::vector<std::string>& prompts, Properties&&... properties) {
+        return encode(prompts, AnyMap{std::forward<Properties>(properties)...});
+    }
+
     /**
      * @brief decode sequence of tokens
      * @param tokens vector storing tokens
@@ -103,5 +129,8 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
     class TokenizerImpl;
     std::shared_ptr<TokenizerImpl> m_pimpl;
 };
+
+static constexpr ov::Property<bool> add_special_tokens{"add_special_tokens"};
+
 } // namespace genai
 } // namespace ov
```
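With these overloads in place, a C++ caller can opt out of special tokens per call. A minimal usage sketch, assuming a model has already been exported for OpenVINO GenAI (the `models/llama` directory name is only a placeholder):

```cpp
#include <string>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Placeholder path to an exported OpenVINO GenAI model directory.
    ov::genai::LLMPipeline pipe("models/llama", "CPU");
    ov::genai::Tokenizer tok = pipe.get_tokenizer();

    std::string prompt = "Why is the Sun yellow?";

    // Default behaviour: special tokens (e.g. BOS) are added.
    ov::genai::TokenizedInputs with_special = tok.encode(prompt);

    // New in this commit: skip special tokens for this call only.
    ov::genai::TokenizedInputs without_special = tok.encode(prompt, ov::genai::add_special_tokens(false));

    return 0;
}
```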

src/cpp/src/llm_pipeline.cpp (+3 −2)

```diff
@@ -148,11 +148,12 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         m_history.push_back({{"role", "user"}, {"content", prompt}});
         constexpr bool add_generation_prompt = true;
         auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt);
-        auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history);
+        bool add_special_tokens_ = false;  // Do not add special tokens in chat scenario.
+        auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_));
         if (m_is_cache_empty) {
             encoded_input = new_chat_tokens;
         } else {
-            auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history);
+            auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_));
             encoded_input = subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
         }
         m_templated_chat_history = new_templated_chat_history;
```
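`subtract_chat_tokenized_inputs` itself is not part of this diff; the idea is to re-encode the whole templated history (with special tokens disabled) and keep only the tokens that were not already fed to the model. A rough illustration of that idea, not the library's actual implementation, assuming the new encoding extends the previous one as a prefix:

```cpp
#include <cstdint>
#include <vector>

// Illustration only: keep the suffix of new_ids that goes beyond prev_ids.
std::vector<int64_t> keep_new_suffix(const std::vector<int64_t>& new_ids,
                                     const std::vector<int64_t>& prev_ids) {
    return {new_ids.begin() + prev_ids.size(), new_ids.end()};
}
```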
src/cpp/src/make_combine_segments_stateful.cpp (new file, +46)

```diff
@@ -0,0 +1,46 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "make_combine_segments_stateful.hpp"
+#include "openvino/op/constant.hpp"
+#include "openvino/op/select.hpp"
+#include "openvino/op/read_value.hpp"
+#include "openvino/op/assign.hpp"
+
+
+using namespace ov;
+using namespace ov::op;
+
+bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) {
+
+    std::shared_ptr<ov::Node> combine_seg_node;
+    for (auto node: model->get_ordered_ops()) {
+        if (strcmp(node->get_type_info().name, "CombineSegments") == 0) {
+            combine_seg_node = node;
+        }
+    }
+    if (!combine_seg_node || combine_seg_node->input_value(1).get_element_type() != ov::element::i32) {
+        return false;
+    }
+
+    std::shared_ptr<v0::Constant> input_1_const = std::dynamic_pointer_cast<v0::Constant>(combine_seg_node->get_input_node_shared_ptr(1));
+    if (!input_1_const) {
+        return false;
+    }
+
+    op::util::VariableInfo var_info{ov::Shape{}, ov::element::boolean, ADD_SPECIAL_TOKENS_VAR_ID};
+    auto variable = std::make_shared<op::util::Variable>(var_info);
+
+    // Default mode is add_special_tokens.
+    auto default_mode_const = std::make_shared<v0::Constant>(ov::element::boolean, ov::Shape{}, std::vector{true});
+    auto read_value = std::make_shared<v6::ReadValue>(default_mode_const, variable);
+    auto zero_constant = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{}, std::vector{0});
+    auto select_node = std::make_shared<v1::Select>(read_value, input_1_const, zero_constant);
+    combine_seg_node->input(1).replace_source_output(select_node->output(0));
+
+    auto assign = std::make_shared<v6::Assign>(read_value, variable);
+
+    model->add_sinks({assign});
+    model->add_variables({variable});
+    return true;
+}
```
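The pass replaces the constant feeding input 1 of `CombineSegments` with `Select(ReadValue(flag), original_constant, 0)`, so the flag shows up as an ordinary variable state on the compiled tokenizer. A minimal sketch of flipping it directly on an `ov::CompiledModel` (this is essentially what `set_state_if_necessary` in tokenizer.cpp does through the infer-request queue):

```cpp
#include "openvino/openvino.hpp"

// Assumes `compiled` was built from the tokenizer model after this pass has run.
void disable_special_tokens(ov::CompiledModel& compiled) {
    ov::InferRequest request = compiled.create_infer_request();

    ov::Tensor flag(ov::element::boolean, ov::Shape{});
    *flag.data<bool>() = false;  // false -> Select routes the zero constant, so no special tokens

    for (auto& state : request.query_state()) {
        if (state.get_name().find("add_special_tokens") != std::string::npos) {
            state.set_state(flag);
            break;
        }
    }
}
```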
src/cpp/src/make_combine_segments_stateful.hpp (new file, +44)

```diff
@@ -0,0 +1,44 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "openvino/op/constant.hpp"
+#include "openvino/pass/pass.hpp"
+
+namespace ov {
+namespace genai {
+
+/**
+ * @brief This pass modifies the tokenizer ov::Model so that adding special tokens
+ * can be enabled or disabled depending on a stateful value.
+ *
+ *   +--------------+
+ *   | DefaultMode  |
+ *   +--------------+
+ *          |
+ *          |
+ *          v
+ *   +--------------+     +--------+     +------------------+
+ *   |  ReadValue   |     |  ends  |     | const value = 0  |
+ *   +--------------+     +--------+     +------------------+
+ *           \                 |                 /
+ *            \                |                /
+ *             v               v               v
+ *              +-------------------------+
+ *              |         Select          |
+ *              +-------------------------+
+ *                          |
+ *                          v
+ *              +-------------------------+
+ *              |     CombineSegments     |
+ *              +-------------------------+
+ **/
+class MakeCombineSegmentsSatateful : public ov::pass::ModelPass {
+public:
+    OPENVINO_RTTI("MakeCombineSegmentsSatateful", "0");
+    bool run_on_model(const std::shared_ptr<ov::Model>& model) override;
+};
+
+const std::string ADD_SPECIAL_TOKENS_VAR_ID = "add_special_tokens";
+
+} // namespace genai
+} // namespace ov
```

src/cpp/src/tokenizer.cpp (+57 −16)

```diff
@@ -4,16 +4,17 @@
 #include <filesystem>
 #include <fstream>
 #include <memory>
-
 #include <jinja2cpp/template.h>
 #include <jinja2cpp/template_env.h>
 #include <jinja2cpp/user_callable.h>
 #include <jinja2cpp/generic_list.h>
 #include <jinja2cpp/generic_list_iterator.h>
 
+#include "openvino/pass/manager.hpp"
 #include "openvino/runtime/core.hpp"
 #include "openvino/genai/tokenizer.hpp"
 
+#include "make_combine_segments_stateful.hpp"
 #include "tokenizers_path.hpp"
 #include "circular_buffer_queue.hpp"
 #include "utils.hpp"
@@ -69,7 +70,10 @@ class Tokenizer::TokenizerImpl {
 
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_tokenizer;
     std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_detokenizer;
-
+    // To change the add_special_tokens mode we use a stateful subgraph;
+    // this flag holds the current state value of the CompiledModel.
+    bool m_add_special_tokens = true;
+
     int64_t m_pad_token_id = -1;
     int64_t m_bos_token_id = -1;
     int64_t m_eos_token_id = -1;
@@ -80,6 +84,29 @@ class Tokenizer::TokenizerImpl {
 
     std::string m_chat_template = "";
 
+    void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, bool add_special_tokens) {
+        // If the user requested an add_special_tokens mode different from the current one,
+        // the state variable needs to be set.
+        // If the requested mode matches the stored state, don't touch the states.
+        if (add_special_tokens == m_add_special_tokens) {
+            return;
+        }
+
+        // auto states = m_ireq_queue_tokenizer->get(0).query_state();
+        ov::Tensor add_special_tensor = ov::Tensor(ov::element::boolean, {});
+        *add_special_tensor.data<bool>() = add_special_tokens;
+
+        for (auto& state: infer_request_guard.get().query_state()) {
+            if (state.get_name().find(ov::genai::ADD_SPECIAL_TOKENS_VAR_ID) == std::string::npos) {
+                // It's not the add_special_tokens flag state.
+                continue;
+            }
+            state.set_state(add_special_tensor);
+            break;
+        }
+        m_add_special_tokens = add_special_tokens;
+    }
+
     TokenizerImpl() = default;
 
     TokenizerImpl(std::filesystem::path tokenizer_path, const ov::AnyMap& plugin_config)
@@ -99,13 +126,18 @@ class Tokenizer::TokenizerImpl {
         read_tokenizer_config_if_necessary(tokenizer_path);
 
         auto device = "CPU"; // currently openvino_tokenizer supports only CPU
-        m_tokenizer = core.compile_model(tokenizer_path / "openvino_tokenizer.xml",
-                                         device, plugin_config);
+        auto ov_tokenizer = core.read_model(tokenizer_path / "openvino_tokenizer.xml");
+
+        ov::pass::Manager manager;
+        manager.register_pass<MakeCombineSegmentsSatateful>();
+        manager.run_passes(ov_tokenizer);
+
+        m_tokenizer = core.compile_model(ov_tokenizer, device, plugin_config);
         if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) {
-            m_detokenizer = core.compile_model(tokenizer_path / "openvino_detokenizer.xml",
-                                               device, plugin_config);
+            m_detokenizer = core.compile_model(tokenizer_path / "openvino_detokenizer.xml", device, plugin_config);
         }
 
+
         const size_t INFER_REQUEST_QUEUE_SIZE = m_tokenizer.get_property(ov::optimal_number_of_infer_requests);
         m_ireq_queue_tokenizer = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
             INFER_REQUEST_QUEUE_SIZE,
@@ -256,8 +288,12 @@ class Tokenizer::TokenizerImpl {
         get_id_from_str(m_eos_token, m_eos_token_id);
     }
 
-    TokenizedInputs encode(std::string prompt) {
+    TokenizedInputs encode(std::string prompt, const ov::AnyMap& tokenization_params = {}) {
+        bool add_special_tokens_flag = true;
+        ov::genai::utils::read_anymap_param(tokenization_params, add_special_tokens.name(), add_special_tokens_flag);
+
         CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
+        set_state_if_necessary(infer_request_guard, add_special_tokens_flag);
         size_t batch_size = 1;
         infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt});
         infer_request_guard.get().start_async();
@@ -268,10 +304,15 @@ class Tokenizer::TokenizerImpl {
         );
     }
 
-    TokenizedInputs encode(std::vector<std::string>& prompts) {
+    TokenizedInputs encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params = {}) {
+
         TokenizedInputs unpadded;
         {
+            bool add_special_tokens_flag = true;
+            ov::genai::utils::read_anymap_param(tokenization_params, add_special_tokens.name(), add_special_tokens_flag);
+
             CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get());
+            set_state_if_necessary(infer_request_guard, add_special_tokens_flag);
             infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()});
             auto size_ = infer_request_guard.get().get_input_tensor().get_shape();
             infer_request_guard.get().start_async();
@@ -454,20 +495,20 @@ Tokenizer::Tokenizer(const std::string& tokenizer_path, const ov::AnyMap& plugin_config) {
     m_pimpl = std::make_shared<TokenizerImpl>(tokenizer_path, plugin_config);
 }
 
-TokenizedInputs Tokenizer::encode(const std::string prompt) {
-    return m_pimpl->encode(std::move(prompt));
+TokenizedInputs Tokenizer::encode(const std::string prompt, const ov::AnyMap& tokenization_params) {
+    return m_pimpl->encode(std::move(prompt), tokenization_params);
 }
 
-TokenizedInputs Tokenizer::encode(std::vector<std::string>& prompts) {
-    return m_pimpl->encode(prompts);
+TokenizedInputs Tokenizer::encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params) {
+    return m_pimpl->encode(prompts, tokenization_params);
 }
 
-TokenizedInputs Tokenizer::encode(std::vector<std::string>&& prompts) {
-    return m_pimpl->encode(prompts);
+TokenizedInputs Tokenizer::encode(std::vector<std::string>&& prompts, const ov::AnyMap& tokenization_params) {
+    return m_pimpl->encode(prompts, tokenization_params);
 }
 
-TokenizedInputs Tokenizer::encode(std::initializer_list<std::string>& text) {
-    return encode(std::vector<std::string>(text.begin(), text.end()));
+TokenizedInputs Tokenizer::encode(std::initializer_list<std::string>& text, const ov::AnyMap& tokenization_params) {
+    return encode(std::vector<std::string>(text.begin(), text.end()), tokenization_params);
 }
 
 std::string Tokenizer::decode(std::vector<int64_t> tokens) {
```
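The Python binding below builds the `ov::AnyMap` explicitly; the same AnyMap form is also usable from C++ when the property syntax is not convenient. A small sketch, assuming `tok` is an already constructed `ov::genai::Tokenizer`:

```cpp
#include <string>
#include "openvino/genai/tokenizer.hpp"

// Encode a prompt with special tokens disabled via the AnyMap overload.
ov::genai::TokenizedInputs encode_without_special_tokens(ov::genai::Tokenizer& tok, const std::string& prompt) {
    ov::AnyMap tokenization_params;
    tokenization_params[ov::genai::add_special_tokens.name()] = false;
    return tok.encode(prompt, tokenization_params);
}
```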

src/python/py_generate_pipeline.cpp (+13 −4)

```diff
@@ -520,12 +520,21 @@ PYBIND11_MODULE(py_generate_pipeline, m) {
             return std::make_unique<ov::genai::Tokenizer>(tokenizer_path, utils::properties_to_any_map(plugin_config));
         }), py::arg("tokenizer_path"), py::arg("plugin_config") = ov::AnyMap({}))
 
-        .def("encode", [](Tokenizer& tok, std::vector<std::string>& prompts) { return tok.encode(prompts); },
+        .def("encode", [](Tokenizer& tok, std::vector<std::string>& prompts, bool add_special_tokens) {
+                ov::AnyMap tokenization_params;
+                tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens;
+                return tok.encode(prompts, tokenization_params);
+            },
             py::arg("prompts"),
+            py::arg("add_special_tokens") = true,
             R"(Encodes a list of prompts into tokenized inputs.)")
-
-        .def("encode", py::overload_cast<const std::string>(&Tokenizer::encode),
-            py::arg("prompt"),
+
+        .def("encode", [](Tokenizer& tok, const std::string prompt, bool add_special_tokens) {
+                ov::AnyMap tokenization_params;
+                tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens;
+                return tok.encode(prompt, tokenization_params);
+            },
+            py::arg("prompt"), py::arg("add_special_tokens") = true,
             R"(Encodes a single prompt into tokenized input.)")
 
         .def(
```

tests/python_tests/test_chat_generate_api.py (+28 −3)

```diff
@@ -42,9 +42,8 @@ def test_chat_compare_with_HF(model_descr, generation_config: Dict):
     chat_history_ov = []
     chat_prompt = ''
 
-    # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True.
-    # Need to regenerate openvino_tokenizer/detokenizer.
-    model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False)
+    # Will set add_special_tokens=False inside pipeline when start_chat() is called.
+    model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
 
     pipe.start_chat()
     for prompt in quenstions:
@@ -197,3 +196,29 @@ def test_set_chat_template():
     pipe.finish_chat()
     reference = pipe.generate("a", max_new_tokens=1)
     assert generated == reference
+
+prompts = [
+    '1+1=',
+    'What is the previous answer?',
+    'Why is the Sun yellow?',
+    'What was my first question?',
+    ['Why is the Sun yellow?'],
+    "若我有一亿美元,在人工智能盛行的今天,我怎样投资才能收益最大化?",
+    "מחרוזת בדיקה",
+    "Multiline\nstring!\nWow!",
+]
+
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize("add_special_tokens", [True, False])
+@pytest.mark.parametrize("prompt", prompts)
+def test_add_special_tokens(add_special_tokens, prompt):
+    import numpy as np
+    model_descr = get_chat_models_list()[0]
+    model_id, path, hf_tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
+    genai_tokenizer = pipe.get_tokenizer()
+
+    # Calling encode with add_special_tokens will set the state flag.
+    res_genai = genai_tokenizer.encode(prompt, add_special_tokens).input_ids.data
+    res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"]
+    assert np.all(res_genai == res_hf)
```
