Skip to content

Commit e30c99f

Browse files
apaniukov and mryzhov authored
Port Fixes from Master to Release (#315)
* Fix Split Operation For Merging Modes (#311) * Fix Split Operation For Merging Modes * Format Tests * Update Stats * Update Stats (cherry picked from commit 45f441a) * Add max_length Option to CLI Convert Tool (#309) * Add max_length Option to CLI Convert Tool * Add max_length Option to CLI Convert Tool * Add max_length Option to CLI Convert Tool * Add max_length Option to CLI Convert Tool (cherry picked from commit 5c61c1a) * Update Regex For Clean Tokenization Spaces * Fix Replace Pattern Rewrite * Update Pass Rate * fixed sdl issue (#313) (cherry picked from commit c30086c) * Del Unused Regex --------- Co-authored-by: Mikhail Ryzhov <mikhail.ryzhov@intel.com>
1 parent 92bec55 commit e30c99f

13 files changed

+4416
-4336
lines changed

.github/dependency_review.yml

-2
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,9 @@ fail-on-severity: 'low'
22
allow-licenses:
33
- 'BSD-2-Clause'
44
- 'BSD-3-Clause'
5-
- 'BSD-2-Clause AND BSD-3-Clause'
65
- 'MIT'
76
- 'Apache-2.0'
87
- 'ISC'
9-
- 'Apache-2.0 AND MIT'
108
- 'BlueOak-1.0.0'
119
- '0BSD'
1210
- 'Python-2.0'

README.md

+42-42
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
459459
<tbody>
460460
<tr>
461461
<td >BPE</td>
462-
<td >97.10</td>
462+
<td >97.18</td>
463463
<td >4544</td>
464464
</tr>
465465
<tr>
@@ -567,7 +567,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
567567
<tr>
568568
<td >BPE</td>
569569
<td >laion/CLIP-ViT-bigG-14-laion2B-39B-b160k</td>
570-
<td >98.47</td>
570+
<td >100.00</td>
571571
<td >261</td>
572572
</tr>
573573
<tr>
@@ -603,163 +603,163 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
603603
<tr>
604604
<td >SentencePiece</td>
605605
<td >NousResearch/Llama-2-13b-hf</td>
606-
<td >96.73</td>
606+
<td >97.55</td>
607607
<td >245</td>
608608
</tr>
609609
<tr>
610610
<td >SentencePiece</td>
611-
<td >NousResearch/Llama-2-13b-hf_legacy</td>
612-
<td >95.92</td>
611+
<td >NousResearch/Llama-2-13b-hf_legacy_sp_backend</td>
612+
<td >97.55</td>
613613
<td >245</td>
614614
</tr>
615615
<tr>
616616
<td >SentencePiece</td>
617617
<td >NousResearch/Llama-2-13b-hf_sp_backend</td>
618-
<td >95.10</td>
618+
<td >94.29</td>
619619
<td >245</td>
620620
</tr>
621621
<tr>
622622
<td >SentencePiece</td>
623623
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0</td>
624-
<td >96.76</td>
624+
<td >94.33</td>
625625
<td >247</td>
626626
</tr>
627627
<tr>
628628
<td >SentencePiece</td>
629-
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_legacy</td>
629+
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_legacy_sp_backend</td>
630630
<td >95.14</td>
631631
<td >247</td>
632632
</tr>
633633
<tr>
634634
<td >SentencePiece</td>
635635
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_sp_backend</td>
636-
<td >94.33</td>
636+
<td >96.76</td>
637637
<td >247</td>
638638
</tr>
639639
<tr>
640640
<td >SentencePiece</td>
641-
<td >baichuan-inc/Baichuan2-7B-Chat_legacy</td>
641+
<td >baichuan-inc/Baichuan2-7B-Chat_legacy_sp_backend</td>
642642
<td >100.00</td>
643643
<td >245</td>
644644
</tr>
645645
<tr>
646646
<td >SentencePiece</td>
647-
<td >camembert-base</td>
648-
<td >52.24</td>
647+
<td >camembert-base_legacy_sp_backend</td>
648+
<td >75.51</td>
649649
<td >245</td>
650650
</tr>
651651
<tr>
652652
<td >SentencePiece</td>
653-
<td >camembert-base_legacy</td>
654-
<td >75.51</td>
653+
<td >camembert-base_sp_backend</td>
654+
<td >52.24</td>
655655
<td >245</td>
656656
</tr>
657657
<tr>
658658
<td >SentencePiece</td>
659-
<td >facebook/musicgen-small</td>
660-
<td >83.67</td>
659+
<td >facebook/musicgen-small_legacy_sp_backend</td>
660+
<td >78.37</td>
661661
<td >245</td>
662662
</tr>
663663
<tr>
664664
<td >SentencePiece</td>
665-
<td >facebook/musicgen-small_legacy</td>
666-
<td >78.37</td>
665+
<td >facebook/musicgen-small_sp_backend</td>
666+
<td >83.67</td>
667667
<td >245</td>
668668
</tr>
669669
<tr>
670670
<td >SentencePiece</td>
671671
<td >microsoft/Phi-3-mini-128k-instruct</td>
672-
<td >95.95</td>
672+
<td >95.14</td>
673673
<td >247</td>
674674
</tr>
675675
<tr>
676676
<td >SentencePiece</td>
677-
<td >microsoft/Phi-3-mini-128k-instruct_legacy</td>
677+
<td >microsoft/Phi-3-mini-128k-instruct_legacy_sp_backend</td>
678678
<td >94.33</td>
679679
<td >247</td>
680680
</tr>
681681
<tr>
682682
<td >SentencePiece</td>
683683
<td >microsoft/Phi-3-mini-128k-instruct_sp_backend</td>
684-
<td >95.14</td>
684+
<td >95.95</td>
685685
<td >247</td>
686686
</tr>
687687
<tr>
688688
<td >SentencePiece</td>
689-
<td >microsoft/deberta-v3-base</td>
690-
<td >96.73</td>
689+
<td >microsoft/deberta-v3-base_legacy_sp_backend</td>
690+
<td >100.00</td>
691691
<td >245</td>
692692
</tr>
693693
<tr>
694694
<td >SentencePiece</td>
695-
<td >microsoft/deberta-v3-base_legacy</td>
696-
<td >100.00</td>
695+
<td >microsoft/deberta-v3-base_sp_backend</td>
696+
<td >96.73</td>
697697
<td >245</td>
698698
</tr>
699699
<tr>
700700
<td >SentencePiece</td>
701701
<td >mlx-community/quantized-gemma-7b-it</td>
702-
<td >96.76</td>
702+
<td >97.57</td>
703703
<td >247</td>
704704
</tr>
705705
<tr>
706706
<td >SentencePiece</td>
707-
<td >mlx-community/quantized-gemma-7b-it_legacy</td>
707+
<td >mlx-community/quantized-gemma-7b-it_legacy_sp_backend</td>
708708
<td >97.57</td>
709709
<td >247</td>
710710
</tr>
711711
<tr>
712712
<td >SentencePiece</td>
713713
<td >mlx-community/quantized-gemma-7b-it_sp_backend</td>
714-
<td >97.57</td>
714+
<td >96.76</td>
715715
<td >247</td>
716716
</tr>
717717
<tr>
718718
<td >SentencePiece</td>
719-
<td >rinna/bilingual-gpt-neox-4b</td>
720-
<td >82.04</td>
719+
<td >rinna/bilingual-gpt-neox-4b_legacy_sp_backend</td>
720+
<td >86.12</td>
721721
<td >245</td>
722722
</tr>
723723
<tr>
724724
<td >SentencePiece</td>
725-
<td >rinna/bilingual-gpt-neox-4b_legacy</td>
726-
<td >86.12</td>
725+
<td >rinna/bilingual-gpt-neox-4b_sp_backend</td>
726+
<td >80.41</td>
727727
<td >245</td>
728728
</tr>
729729
<tr>
730730
<td >SentencePiece</td>
731-
<td >t5-base</td>
732-
<td >85.31</td>
731+
<td >t5-base_legacy_sp_backend</td>
732+
<td >80.00</td>
733733
<td >245</td>
734734
</tr>
735735
<tr>
736736
<td >SentencePiece</td>
737-
<td >t5-base_legacy</td>
738-
<td >80.00</td>
737+
<td >t5-base_sp_backend</td>
738+
<td >85.31</td>
739739
<td >245</td>
740740
</tr>
741741
<tr>
742742
<td >SentencePiece</td>
743-
<td >xlm-roberta-base</td>
743+
<td >xlm-roberta-base_legacy_sp_backend</td>
744744
<td >95.10</td>
745745
<td >245</td>
746746
</tr>
747747
<tr>
748748
<td >SentencePiece</td>
749-
<td >xlm-roberta-base_legacy</td>
749+
<td >xlm-roberta-base_sp_backend</td>
750750
<td >95.10</td>
751751
<td >245</td>
752752
</tr>
753753
<tr>
754754
<td >SentencePiece</td>
755-
<td >xlnet-base-cased</td>
756-
<td >64.49</td>
755+
<td >xlnet-base-cased_legacy_sp_backend</td>
756+
<td >57.96</td>
757757
<td >245</td>
758758
</tr>
759759
<tr>
760760
<td >SentencePiece</td>
761-
<td >xlnet-base-cased_legacy</td>
762-
<td >57.96</td>
761+
<td >xlnet-base-cased_sp_backend</td>
762+
<td >64.49</td>
763763
<td >245</td>
764764
</tr>
765765
<tr>

python/openvino_tokenizers/cli.py

+23-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Copyright (C) 2023-2024 Intel Corporation
33
# SPDX-License-Identifier: Apache-2.0
44

5-
from argparse import Action, ArgumentParser
5+
from argparse import Action, ArgumentError, ArgumentParser
66
from pathlib import Path
77

88
from openvino import Type, save_model
@@ -21,6 +21,13 @@ def __call__(self, parser, namespace, values, option_string=None) -> None:
2121
setattr(namespace, self.dest, self.string_to_type_dict[values])
2222

2323

24+
def check_positive_int(value: str) -> int:
    """Parse a command-line value as a strictly positive integer.

    Intended for use as an argparse ``type=`` callable.

    :param value: Raw string taken from the command line.
    :return: The parsed integer, guaranteed to be greater than zero.
    :raises ValueError: If ``value`` is not a valid integer literal.
    :raises argparse.ArgumentTypeError: If the parsed integer is zero or negative.
    """
    # ArgumentError requires (argument, message); calling it with a single string
    # raises TypeError and loses the message. ArgumentTypeError is the exception
    # argparse expects from a `type=` callable and its message is shown to the user.
    from argparse import ArgumentTypeError

    int_value = int(value)
    if int_value <= 0:
        raise ArgumentTypeError(f"Value must be positive integer, got: {value}")
    return int_value
29+
30+
2431
class TrueOrPositiveIntAction(Action):
2532
def __call__(self, parser, namespace, values, option_string=None) -> None:
2633
if values.isnumeric():
@@ -104,6 +111,17 @@ def get_parser() -> ArgumentParser:
104111
"Not supported for Sentencepiece-based tokenizers."
105112
),
106113
)
114+
parser.add_argument(
115+
"--max_length",
116+
"--max-length",
117+
required=False,
118+
type=check_positive_int,
119+
help=(
120+
"Set max_length to the tokenizer for truncation operation. "
121+
"Tokenizer won't produce output longer than max_length. "
122+
"The value will be replaced by the max_padding option if set."
123+
),
124+
)
107125
skip_special_group = parser.add_mutually_exclusive_group()
108126
skip_special_group.add_argument(
109127
"--not-skip-special-tokens",
@@ -250,9 +268,13 @@ def convert_hf_tokenizer() -> None:
250268

251269
print("Loading Huggingface Tokenizer...")
252270
hf_tokenizer = AutoTokenizer.from_pretrained(args.name, **tokenizer_init_kwargs)
271+
253272
if isinstance(args.max_padding, int) and args.max_padding is not True:
254273
print(f"Set max_length to: {args.max_padding}")
255274
hf_tokenizer.model_max_length = args.max_padding
275+
elif args.max_length:
276+
print(f"Set max_length to: {args.max_length}")
277+
hf_tokenizer.model_max_length = args.max_length
256278

257279
print("Converting Huggingface Tokenizer to OpenVINO...")
258280
converted = convert_tokenizer(

python/openvino_tokenizers/convert_tokenizer.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
TokenzierConversionParams,
1717
change_inputs_type,
1818
change_outputs_type,
19-
update_rt_info_with_params,
2019
update_rt_info_with_environment,
20+
update_rt_info_with_params,
2121
)
2222

2323

python/openvino_tokenizers/tokenizer_pipeline.py

+2-9
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def replace_spaces_metaspace(cls, replace_term=r"▁") -> "RegexNormalizationSte
218218

219219
@classmethod
220220
def prepend_regex(cls, string: str) -> "RegexNormalizationStep":
221-
return cls(regex_search_pattern=r"(^)(.+)", replace_term=rf"{string}$2")
221+
return cls(regex_search_pattern=r"(^)(.)", replace_term=rf"{string}$2")
222222

223223
@classmethod
224224
def prepend_with_check_regex(cls, string: str, check_string: str) -> "RegexNormalizationStep":
@@ -231,13 +231,6 @@ def del_control_chars_regex(cls) -> "RegexNormalizationStep":
231231
replace_term="",
232232
)
233233

234-
@classmethod
235-
def clean_up_tokenization_spaces(cls) -> "RegexNormalizationStep":
236-
return cls(
237-
regex_search_pattern=r" ([\.\?\!\,])| ('[ms])| (') | ('[rv]e)",
238-
replace_term="$1",
239-
)
240-
241234
def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
242235
input_nodes.extend(
243236
(
@@ -1076,7 +1069,7 @@ class RegexDecodingStep(DecodingStep):
10761069
@classmethod
10771070
def clean_up_tokenization_spaces(cls) -> "RegexDecodingStep":
10781071
return cls(
1079-
regex_search_pattern=r" ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)",
1072+
regex_search_pattern=r"(?| ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't))",
10801073
replace_term=r"$1",
10811074
)
10821075

src/combine_segments.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ void CombineSegments::validate_and_infer_types() {
1111
OPENVINO_ASSERT(get_input_size() > 0);
1212
OPENVINO_ASSERT((get_input_size() - 1)%3 == 0);
1313

14-
// First come several ragged tensors each represented as 3 regular tesors
14+
// First come several ragged tensors each represented as 3 regular tensors
1515
size_t num_inputs = (get_input_size() - 1)/3;
1616
PartialShape ps = PartialShape::dynamic();
1717
element::Type et = element::dynamic;

src/regex_normalization.cpp

+21-5
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,20 @@ namespace {
1818
* @return std::string Reformatted replace pattern
1919
*/
2020
std::string reformat_replace_pattern(std::string replace_pattern) {
    // Matches a backslash-style group reference such as `\1` or `\12`;
    // only the digits are captured (group 1). Compiled once and reused.
    static const std::regex backslash_group_ref(R"((?:\\)([0-9]+))");

    // In the ECMAScript replacement syntax `$$` emits a literal `$` and `$1`
    // emits the captured digits, so `\N` is rewritten to `$N`.
    // A format of `$$1` would instead emit the literal text "$1" for every
    // reference, turning `\2`, `\3`, ... into `$1`.
    return std::regex_replace(replace_pattern, backslash_group_ref, "$$$1");
}
2223

24+
/**
 * @brief Upgrade a known legacy search pattern to its current form
 *
 * Only one exact legacy pattern string is recognised; it is replaced by the
 * same alternation wrapped in a `(?| ... )` group. Every other pattern is
 * returned unchanged.
 *
 * @param search_pattern Search pattern read from the model
 * @return std::string Pattern to compile
 */
std::string fix_search_pattern(std::string search_pattern) {
    const char* const legacy_pattern = R"( ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't))";
    const char* const upgraded_pattern = R"((?| ([\\.\\?\\!,])| ('[ms])| (') | ('[rv]e)| (n't)))";

    const bool is_legacy = (search_pattern == legacy_pattern);
    if (!is_legacy) {
        return search_pattern;
    }
    return upgraded_pattern;
}
2436

2537
} // namespace
@@ -36,8 +48,10 @@ m_global_replace(global_replace) {
3648
auto replace_pattern_const = as_type_ptr<Constant>(arguments[pattern_input + 1].get_node_shared_ptr());
3749
auto search_pattern_buf = static_cast<const char*>(search_pattern_const->get_data_ptr());
3850
auto replace_pattern_buf = static_cast<const char*>(replace_pattern_const->get_data_ptr());
39-
auto search_pattern = std::string(search_pattern_buf, search_pattern_const->get_byte_size());
40-
m_replace_pattern = std::string(replace_pattern_buf, replace_pattern_const->get_byte_size());
51+
auto search_pattern = fix_search_pattern(std::string(search_pattern_buf, search_pattern_const->get_byte_size()));
52+
m_replace_pattern = reformat_replace_pattern(
53+
std::string(replace_pattern_buf, replace_pattern_const->get_byte_size())
54+
);
4155

4256
m_search_pattern_pcre2 = std::make_shared<PCRE2Wrapper>(search_pattern);
4357

@@ -66,7 +80,7 @@ RegexNormalization::RegexNormalization(
6680
if (m_search_pattern_pcre2 == nullptr) {
6781
search_pattern_buf = static_cast<const char*>(search_pattern_const->get_data_ptr());
6882
replace_pattern_buf = static_cast<const char*>(replace_pattern_const->get_data_ptr());
69-
search_pattern = std::string(search_pattern_buf, search_pattern_const->get_byte_size());
83+
search_pattern = fix_search_pattern(std::string(search_pattern_buf, search_pattern_const->get_byte_size()));
7084
m_replace_pattern = std::string(replace_pattern_buf, replace_pattern_const->get_byte_size());
7185
m_replace_pattern = reformat_replace_pattern(m_replace_pattern);
7286
m_search_pattern_pcre2 = std::make_shared<PCRE2Wrapper>(search_pattern);
@@ -100,7 +114,9 @@ bool RegexNormalization::evaluate(ov::TensorVector& outputs, const ov::TensorVec
100114
// Write to common trie structures should be protected to prevent race conditions.
101115
std::lock_guard<std::mutex> lock(m_mutex);
102116
if (m_search_pattern_pcre2 == nullptr) {
103-
std::string search_pattern = std::string(inputs[pattern_input].data<const char>(), inputs[pattern_input].get_size());
117+
std::string search_pattern = fix_search_pattern(
118+
std::string(inputs[pattern_input].data<const char>(), inputs[pattern_input].get_size())
119+
);
104120
m_replace_pattern = std::string(inputs[pattern_input + 1].data<const char>(), inputs[pattern_input + 1].get_size());
105121
m_replace_pattern = reformat_replace_pattern(m_replace_pattern);
106122
m_search_pattern_pcre2 = std::make_shared<PCRE2Wrapper>(search_pattern);

0 commit comments

Comments
 (0)