openvinotoolkit
diff --git a/‎.github/dependency_review.yml
-2 b/‎.github/dependency_review.yml
-2
diff --git a/‎README.md
+42-42 b/‎README.md
+42-42
diff --git a/‎python/openvino_tokenizers/cli.py
+23-1 b/‎python/openvino_tokenizers/cli.py
+23-1
diff --git a/‎python/openvino_tokenizers/convert_tokenizer.py
+1-1 b/‎python/openvino_tokenizers/convert_tokenizer.py
+1-1
diff --git a/‎python/openvino_tokenizers/tokenizer_pipeline.py
+2-9 b/‎python/openvino_tokenizers/tokenizer_pipeline.py
+2-9
diff --git a/‎src/combine_segments.cpp
+1-1 b/‎src/combine_segments.cpp
+1-1
diff --git a/‎src/regex_normalization.cpp
+21-5 b/‎src/regex_normalization.cpp
+21-5
@@ -2,11 +2,9 @@ fail-on-severity: 'low'
 allow-licenses:
   - 'BSD-2-Clause'
   - 'BSD-3-Clause'
-  - 'BSD-2-Clause AND BSD-3-Clause'
   - 'MIT'
   - 'Apache-2.0'
   - 'ISC'
-  - 'Apache-2.0 AND MIT'
   - 'BlueOak-1.0.0'
   - '0BSD'
   - 'Python-2.0'
 
@@ -459,7 +459,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
   <tbody>
     <tr>
       <td >BPE</td>
-      <td >97.10</td>
+      <td >97.18</td>
       <td >4544</td>
     </tr>
     <tr>
@@ -567,7 +567,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td >BPE</td>
       <td >laion/CLIP-ViT-bigG-14-laion2B-39B-b160k</td>
-      <td >98.47</td>
+      <td >100.00</td>
       <td >261</td>
     </tr>
     <tr>
@@ -603,163 +603,163 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     <tr>
       <td >SentencePiece</td>
       <td >NousResearch/Llama-2-13b-hf</td>
-      <td >96.73</td>
+      <td >97.55</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >NousResearch/Llama-2-13b-hf_legacy</td>
-      <td >95.92</td>
+      <td >NousResearch/Llama-2-13b-hf_legacy_sp_backend</td>
+      <td >97.55</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
       <td >NousResearch/Llama-2-13b-hf_sp_backend</td>
-      <td >95.10</td>
+      <td >94.29</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
       <td >TinyLlama/TinyLlama-1.1B-Chat-v1.0</td>
-      <td >96.76</td>
+      <td >94.33</td>
       <td >247</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_legacy</td>
+      <td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_legacy_sp_backend</td>
       <td >95.14</td>
       <td >247</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
       <td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_sp_backend</td>
-      <td >94.33</td>
+      <td >96.76</td>
       <td >247</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >baichuan-inc/Baichuan2-7B-Chat_legacy</td>
+      <td >baichuan-inc/Baichuan2-7B-Chat_legacy_sp_backend</td>
       <td >100.00</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >camembert-base</td>
-      <td >52.24</td>
+      <td >camembert-base_legacy_sp_backend</td>
+      <td >75.51</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >camembert-base_legacy</td>
-      <td >75.51</td>
+      <td >camembert-base_sp_backend</td>
+      <td >52.24</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >facebook/musicgen-small</td>
-      <td >83.67</td>
+      <td >facebook/musicgen-small_legacy_sp_backend</td>
+      <td >78.37</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >facebook/musicgen-small_legacy</td>
-      <td >78.37</td>
+      <td >facebook/musicgen-small_sp_backend</td>
+      <td >83.67</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
       <td >microsoft/Phi-3-mini-128k-instruct</td>
-      <td >95.95</td>
+      <td >95.14</td>
       <td >247</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >microsoft/Phi-3-mini-128k-instruct_legacy</td>
+      <td >microsoft/Phi-3-mini-128k-instruct_legacy_sp_backend</td>
       <td >94.33</td>
       <td >247</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
       <td >microsoft/Phi-3-mini-128k-instruct_sp_backend</td>
-      <td >95.14</td>
+      <td >95.95</td>
       <td >247</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >microsoft/deberta-v3-base</td>
-      <td >96.73</td>
+      <td >microsoft/deberta-v3-base_legacy_sp_backend</td>
+      <td >100.00</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >microsoft/deberta-v3-base_legacy</td>
-      <td >100.00</td>
+      <td >microsoft/deberta-v3-base_sp_backend</td>
+      <td >96.73</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
       <td >mlx-community/quantized-gemma-7b-it</td>
-      <td >96.76</td>
+      <td >97.57</td>
       <td >247</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >mlx-community/quantized-gemma-7b-it_legacy</td>
+      <td >mlx-community/quantized-gemma-7b-it_legacy_sp_backend</td>
       <td >97.57</td>
       <td >247</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
       <td >mlx-community/quantized-gemma-7b-it_sp_backend</td>
-      <td >97.57</td>
+      <td >96.76</td>
       <td >247</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >rinna/bilingual-gpt-neox-4b</td>
-      <td >82.04</td>
+      <td >rinna/bilingual-gpt-neox-4b_legacy_sp_backend</td>
+      <td >86.12</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >rinna/bilingual-gpt-neox-4b_legacy</td>
-      <td >86.12</td>
+      <td >rinna/bilingual-gpt-neox-4b_sp_backend</td>
+      <td >80.41</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >t5-base</td>
-      <td >85.31</td>
+      <td >t5-base_legacy_sp_backend</td>
+      <td >80.00</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >t5-base_legacy</td>
-      <td >80.00</td>
+      <td >t5-base_sp_backend</td>
+      <td >85.31</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >xlm-roberta-base</td>
+      <td >xlm-roberta-base_legacy_sp_backend</td>
       <td >95.10</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >xlm-roberta-base_legacy</td>
+      <td >xlm-roberta-base_sp_backend</td>
       <td >95.10</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >xlnet-base-cased</td>
-      <td >64.49</td>
+      <td >xlnet-base-cased_legacy_sp_backend</td>
+      <td >57.96</td>
       <td >245</td>
     </tr>
     <tr>
       <td >SentencePiece</td>
-      <td >xlnet-base-cased_legacy</td>
-      <td >57.96</td>
+      <td >xlnet-base-cased_sp_backend</td>
+      <td >64.49</td>
       <td >245</td>
     </tr>
     <tr>
 
@@ -2,7 +2,7 @@
 # Copyright (C) 2023-2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-from argparse import Action, ArgumentParser
+from argparse import Action, ArgumentError, ArgumentParser
 from pathlib import Path
 
 from openvino import Type, save_model
@@ -21,6 +21,13 @@ def __call__(self, parser, namespace, values, option_string=None) -> None:
         setattr(namespace, self.dest, self.string_to_type_dict[values])
 
 
+def check_positive_int(value: str) -> int:
+    """Parse a command-line value as a strictly positive integer.
+
+    Used as an argparse ``type=`` callable.
+
+    :param value: raw string to parse.
+    :return: the parsed integer, always greater than zero.
+    :raises ValueError: if ``value`` is not a valid integer literal.
+    :raises ArgumentTypeError: if the parsed integer is zero or negative.
+    """
+    # ArgumentError requires an (argument, message) pair, so building it from a
+    # message alone raises TypeError and the text below never reaches the user.
+    # ArgumentTypeError is the exception argparse expects from a type= callable.
+    # Imported locally so this function does not depend on the module import line.
+    from argparse import ArgumentTypeError
+
+    int_value = int(value)
+    if int_value <= 0:
+        raise ArgumentTypeError(f"Value must be positive integer, got: {value}")
+    return int_value
+
+
 class TrueOrPositiveIntAction(Action):
     def __call__(self, parser, namespace, values, option_string=None) -> None:
         if values.isnumeric():
@@ -104,6 +111,17 @@ def get_parser() -> ArgumentParser:
             "Not supported for Sentencepiece-based tokenizers."
         ),
     )
+    parser.add_argument(
+        "--max_length",
+        "--max-length",
+        required=False,
+        type=check_positive_int,
+        help=(
+            "Set max_length to the tokenizer for truncation operation. "
+            "Tokenizer won't produce output longer than max_length. "
+            "The value will be replaced by the max_padding option if set."
+        ),
+    )
     skip_special_group = parser.add_mutually_exclusive_group()
     skip_special_group.add_argument(
         "--not-skip-special-tokens",
@@ -250,9 +268,13 @@ def convert_hf_tokenizer() -> None:
 
     print("Loading Huggingface Tokenizer...")
     hf_tokenizer = AutoTokenizer.from_pretrained(args.name, **tokenizer_init_kwargs)
+
     if isinstance(args.max_padding, int) and args.max_padding is not True:
         print(f"Set max_length to: {args.max_padding}")
         hf_tokenizer.model_max_length = args.max_padding
+    elif args.max_length:
+        print(f"Set max_length to: {args.max_length}")
+        hf_tokenizer.model_max_length = args.max_length
 
     print("Converting Huggingface Tokenizer to OpenVINO...")
     converted = convert_tokenizer(
 
@@ -16,8 +16,8 @@
     TokenzierConversionParams,
     change_inputs_type,
     change_outputs_type,
-    update_rt_info_with_params,
     update_rt_info_with_environment,
+    update_rt_info_with_params,
 )
 
 
 
@@ -218,7 +218,7 @@ def replace_spaces_metaspace(cls, replace_term=r"▁") -> "RegexNormalizationSte
 
     @classmethod
     def prepend_regex(cls, string: str) -> "RegexNormalizationStep":
-        return cls(regex_search_pattern=r"(^)(.+)", replace_term=rf"{string}$2")
+        return cls(regex_search_pattern=r"(^)(.)", replace_term=rf"{string}$2")
 
     @classmethod
     def prepend_with_check_regex(cls, string: str, check_string: str) -> "RegexNormalizationStep":
@@ -231,13 +231,6 @@ def del_control_chars_regex(cls) -> "RegexNormalizationStep":
             replace_term="",
         )
 
-    @classmethod
-    def clean_up_tokenization_spaces(cls) -> "RegexNormalizationStep":
-        return cls(
-            regex_search_pattern=r" ([\.\?\!\,])| ('[ms])| (') | ('[rv]e)",
-            replace_term="$1",
-        )
-
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         input_nodes.extend(
             (
@@ -1076,7 +1069,7 @@ class RegexDecodingStep(DecodingStep):
     @classmethod
     def clean_up_tokenization_spaces(cls) -> "RegexDecodingStep":
         return cls(
-            regex_search_pattern=r" ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)",
+            regex_search_pattern=r"(?| ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't))",
             replace_term=r"$1",
         )
 
 
@@ -11,7 +11,7 @@ void CombineSegments::validate_and_infer_types() {
     OPENVINO_ASSERT(get_input_size() > 0);
     OPENVINO_ASSERT((get_input_size() - 1)%3 == 0);
 
-    // First come several ragged tensors each represented as 3 regular tesors
+    // First come several ragged tensors each represented as 3 regular tensors
     size_t num_inputs = (get_input_size() - 1)/3;
     PartialShape ps = PartialShape::dynamic();
     element::Type et = element::dynamic;
 
@@ -18,8 +18,20 @@ namespace {
  * @return std::string Reformatted replace pattern
  */
 std::string reformat_replace_pattern(std::string replace_pattern) {
-    return std::regex_replace(replace_pattern, std::regex(R"((\\)([0-9]+))"), R"($$2)");
+    return std::regex_replace(replace_pattern, std::regex(R"((?:\\)([0-9]+))"), R"($$1)");
+}
 
+/**
+ * @brief Upgrade a known legacy search pattern to its current form
+ *
+ * Exactly one pattern is rewritten: the old clean-up alternation, which is returned
+ * wrapped in a `(?|...)` group. Any other pattern is returned unchanged.
+ *
+ * @param search_pattern Search pattern to check
+ * @return std::string Pattern to use in place of the input
+ */
+std::string fix_search_pattern(std::string search_pattern) {
+    static const char* const legacy_cleanup_pattern = R"( ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't))";
+    static const char* const wrapped_cleanup_pattern = R"((?| ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)))";
+
+    // Only an exact, whole-string match of the legacy pattern is rewritten.
+    const bool is_legacy = search_pattern.compare(legacy_cleanup_pattern) == 0;
+    if (!is_legacy) {
+        return search_pattern;
+    }
+    return wrapped_cleanup_pattern;
 }
 
 } // namespace
@@ -36,8 +48,10 @@ m_global_replace(global_replace) {
     auto replace_pattern_const = as_type_ptr<Constant>(arguments[pattern_input + 1].get_node_shared_ptr());
     auto search_pattern_buf = static_cast<const char*>(search_pattern_const->get_data_ptr());
     auto replace_pattern_buf = static_cast<const char*>(replace_pattern_const->get_data_ptr());
-    auto search_pattern = std::string(search_pattern_buf, search_pattern_const->get_byte_size());
-    m_replace_pattern = std::string(replace_pattern_buf, replace_pattern_const->get_byte_size());
+    auto search_pattern = fix_search_pattern(std::string(search_pattern_buf, search_pattern_const->get_byte_size()));
+    m_replace_pattern = reformat_replace_pattern(
+        std::string(replace_pattern_buf, replace_pattern_const->get_byte_size())
+    );
 
     m_search_pattern_pcre2 = std::make_shared<PCRE2Wrapper>(search_pattern);
 
@@ -66,7 +80,7 @@ RegexNormalization::RegexNormalization(
         if (m_search_pattern_pcre2 == nullptr) {
             search_pattern_buf = static_cast<const char*>(search_pattern_const->get_data_ptr());
             replace_pattern_buf = static_cast<const char*>(replace_pattern_const->get_data_ptr());
-            search_pattern = std::string(search_pattern_buf, search_pattern_const->get_byte_size());
+            search_pattern = fix_search_pattern(std::string(search_pattern_buf, search_pattern_const->get_byte_size()));
             m_replace_pattern = std::string(replace_pattern_buf, replace_pattern_const->get_byte_size());
             m_replace_pattern = reformat_replace_pattern(m_replace_pattern);
             m_search_pattern_pcre2 = std::make_shared<PCRE2Wrapper>(search_pattern);
@@ -100,7 +114,9 @@ bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVec
         // Write to common trie structures should be protected to prevent race conditions.
         std::lock_guard<std::mutex> lock(m_mutex);
         if (m_search_pattern_pcre2 == nullptr) {
-            std::string search_pattern = std::string(inputs[pattern_input].data<const char>(), inputs[pattern_input].get_size());
+            std::string search_pattern = fix_search_pattern(
+                std::string(inputs[pattern_input].data<const char>(), inputs[pattern_input].get_size())
+            );
             m_replace_pattern = std::string(inputs[pattern_input + 1].data<const char>(), inputs[pattern_input + 1].get_size());
             m_replace_pattern = reformat_replace_pattern(m_replace_pattern);
             m_search_pattern_pcre2 = std::make_shared<PCRE2Wrapper>(search_pattern);
Original file line number	Diff line number	Diff line change
`@@ -16,8 +16,8 @@`
`16`	`16`	`TokenzierConversionParams,`
`17`	`17`	`change_inputs_type,`
`18`	`18`	`change_outputs_type,`
`19`		`- update_rt_info_with_params,`
`20`	`19`	`update_rt_info_with_environment,`
	`20`	`+ update_rt_info_with_params,`
`21`	`21`	`)`
`22`	`22`
`23`	`23`