Commit f1f171c
Add Unigram Tokenizer Implementation (openvinotoolkit#431)
* Fix Shape Estimation in RegexSplit
* Fix Shape Estimation in RegexSplit
* Add Unigram Tokenizer Support
  - Add the UnigramTokenizer operation
  - Change the default conversion behaviour for fast tokenizers to use the new Unigram implementation instead of the SentencePiece backend
  - Add support for the Strip normalization operation
  - Separate SentencePiece backend tests from our implementation of BPE and Unigram
* Ruff Check/Format
* Update Tests
* Update Tests
* Fix Split Parsing
* Fix Pass Rate
Parent: 485b90a

20 files changed (+3732, -4193 lines)
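For context on what the new UnigramTokenizer operation computes: a Unigram tokenizer assigns each vocabulary piece a log-probability and returns the segmentation of the input whose piece scores sum highest, typically found with Viterbi dynamic programming. Below is a minimal illustrative sketch of that search — the vocabulary and scores are made up for the example, and this is not the repository's implementation:

```python
# Illustrative Unigram segmentation via Viterbi search over token log-probs.
# The vocabulary and scores are hypothetical; a real tokenizer loads them
# from the converted model.
import math

vocab = {"▁h": -3.0, "▁he": -2.5, "llo": -2.0, "l": -4.0, "o": -4.0, "▁hello": -1.5}

def unigram_segment(text: str) -> list[str]:
    n = len(text)
    # best[i] = (score of the best segmentation of text[:i], start of its last token)
    best = [(-math.inf, 0)] * (n + 1)
    best[0] = (0.0, 0)
    for i in range(1, n + 1):
        for j in range(max(0, i - 10), i):  # cap candidate piece length at 10 chars
            piece = text[j:i]
            if piece in vocab and best[j][0] + vocab[piece] > best[i][0]:
                best[i] = (best[j][0] + vocab[piece], j)
    # Backtrack from the end to recover the winning token sequence.
    tokens, i = [], n
    while i > 0:
        j = best[i][1]
        tokens.append(text[j:i])
        i = j
    return tokens[::-1]

print(unigram_segment("▁hello"))  # ['▁hello'] — one piece (-1.5) beats '▁he' + 'llo' (-4.5)
```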

README.md (+102, -73)
```diff
@@ -446,7 +446,7 @@ int main(int argc, char* argv[]) {
 |---------------------------------|----------------------|----------|-----------|
 | Fast | WordPiece | ✅ | ✅ |
 | | BPE | ✅ | ✅ |
-| | Unigram | ❌ | ❌ |
+| | Unigram | ✅ | ✅ |
 | | WordLevel* | ✅ | ✅ |
 | Legacy | SentencePiece .model | ✅ | ✅ |
 | Custom | tiktoken | ✅ | ✅ |
@@ -469,19 +469,24 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 <tbody>
 <tr>
 <td >BPE</td>
-<td >99.61</td>
-<td >4560</td>
+<td >99.46</td>
+<td >5546</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >89.19</td>
-<td >6633</td>
+<td >89.82</td>
+<td >5157</td>
 </tr>
 <tr>
 <td >Tiktoken</td>
 <td >96.56</td>
 <td >524</td>
 </tr>
+<tr>
+<td >Unigram</td>
+<td >95.24</td>
+<td >1470</td>
+</tr>
 <tr>
 <td >WordLevel</td>
 <td >98.96</td>
@@ -507,6 +512,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 </tr>
 </thead>
 <tbody>
+<tr>
+<td >BPE</td>
+<td >NousResearch/Llama-2-13b-hf</td>
+<td >97.55</td>
+<td >245</td>
+</tr>
 <tr>
 <td >BPE</td>
 <td >NousResearch/Meta-Llama-3-8B-Instruct</td>
@@ -519,6 +530,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 <td >100.00</td>
 <td >261</td>
 </tr>
+<tr>
+<td >BPE</td>
+<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0</td>
+<td >100.00</td>
+<td >247</td>
+</tr>
 <tr>
 <td >BPE</td>
 <td >Xenova/gpt-4o</td>
@@ -585,12 +602,24 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 <td >100.00</td>
 <td >261</td>
 </tr>
+<tr>
+<td >BPE</td>
+<td >microsoft/Phi-3-mini-128k-instruct</td>
+<td >100.00</td>
+<td >247</td>
+</tr>
 <tr>
 <td >BPE</td>
 <td >microsoft/deberta-base</td>
 <td >100.00</td>
 <td >245</td>
 </tr>
+<tr>
+<td >BPE</td>
+<td >mlx-community/quantized-gemma-7b-it</td>
+<td >97.57</td>
+<td >247</td>
+</tr>
 <tr>
 <td >BPE</td>
 <td >roberta-base</td>
@@ -617,22 +646,28 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >NousResearch/Llama-2-13b-hf</td>
-<td >97.55</td>
+<td >BAAI/bge-reranker-v2-m3</td>
+<td >96.73</td>
 <td >245</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >NousResearch/Llama-2-13b-hf_legacy_sp_backend</td>
-<td >97.55</td>
+<td >BAAI/bge-reranker-v2-m3_legacy</td>
+<td >96.73</td>
 <td >245</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >NousResearch/Llama-2-13b-hf_sp_backend</td>
+<td >NousResearch/Llama-2-13b-hf</td>
 <td >94.29</td>
 <td >245</td>
 </tr>
+<tr>
+<td >SentencePiece</td>
+<td >NousResearch/Llama-2-13b-hf_legacy</td>
+<td >97.55</td>
+<td >245</td>
+</tr>
 <tr>
 <td >SentencePiece</td>
 <td >TinyLlama/TinyLlama-1.1B-Chat-v1.0</td>
@@ -641,153 +676,147 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_legacy_sp_backend</td>
+<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_legacy</td>
 <td >98.38</td>
 <td >247</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_sp_backend</td>
+<td >baichuan-inc/Baichuan2-7B-Chat_legacy</td>
 <td >100.00</td>
-<td >247</td>
+<td >245</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >baichuan-inc/Baichuan2-7B-Chat_legacy_sp_backend</td>
-<td >100.00</td>
+<td >camembert-base</td>
+<td >55.10</td>
 <td >245</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >camembert-base_legacy_sp_backend</td>
-<td >75.51</td>
+<td >camembert-base_legacy</td>
+<td >78.37</td>
 <td >245</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >camembert-base_sp_backend</td>
-<td >52.24</td>
+<td >facebook/musicgen-small</td>
+<td >82.45</td>
 <td >245</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >facebook/musicgen-small_legacy_sp_backend</td>
-<td >78.37</td>
+<td >facebook/musicgen-small_legacy</td>
+<td >77.14</td>
 <td >245</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >facebook/musicgen-small_sp_backend</td>
-<td >83.67</td>
+<td >google/flan-t5-xxl</td>
+<td >75.92</td>
 <td >245</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >microsoft/Phi-3-mini-128k-instruct</td>
-<td >100.00</td>
-<td >247</td>
+<td >google/flan-t5-xxl_legacy</td>
+<td >75.51</td>
+<td >245</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >microsoft/Phi-3-mini-128k-instruct_legacy_sp_backend</td>
-<td >97.57</td>
+<td >microsoft/Phi-3-mini-128k-instruct</td>
+<td >99.19</td>
 <td >247</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >microsoft/Phi-3-mini-128k-instruct_sp_backend</td>
-<td >99.19</td>
+<td >microsoft/Phi-3-mini-128k-instruct_legacy</td>
+<td >97.57</td>
 <td >247</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >microsoft/deberta-v3-base_legacy_sp_backend</td>
-<td >100.00</td>
+<td >microsoft/deberta-v3-base</td>
+<td >95.10</td>
 <td >245</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >microsoft/deberta-v3-base_sp_backend</td>
-<td >96.73</td>
+<td >microsoft/deberta-v3-base_legacy</td>
+<td >98.37</td>
 <td >245</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
 <td >mlx-community/quantized-gemma-7b-it</td>
-<td >97.57</td>
+<td >96.76</td>
 <td >247</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >mlx-community/quantized-gemma-7b-it_legacy_sp_backend</td>
+<td >mlx-community/quantized-gemma-7b-it_legacy</td>
 <td >97.57</td>
 <td >247</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >mlx-community/quantized-gemma-7b-it_sp_backend</td>
-<td >96.76</td>
-<td >247</td>
+<td >rinna/bilingual-gpt-neox-4b</td>
+<td >83.67</td>
+<td >245</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
-<td >rinna/bilingual-gpt-neox-4b_legacy_sp_backend</td>
-<td >86.12</td>
+<td >rinna/bilingual-gpt-neox-4b_legacy</td>
+<td >89.39</td>
 <td >245</td>
 </tr>
 <tr>
-<td >SentencePiece</td>
-<td >rinna/bilingual-gpt-neox-4b_sp_backend</td>
-<td >80.41</td>
-<td >245</td>
+<td >Tiktoken</td>
+<td >Qwen/Qwen-14B-Chat</td>
+<td >100.00</td>
+<td >261</td>
 </tr>
 <tr>
-<td >SentencePiece</td>
-<td >t5-base_legacy_sp_backend</td>
-<td >80.00</td>
-<td >245</td>
+<td >Tiktoken</td>
+<td >THUDM/glm-4-9b-chat</td>
+<td >93.16</td>
+<td >263</td>
 </tr>
 <tr>
-<td >SentencePiece</td>
-<td >t5-base_sp_backend</td>
-<td >85.31</td>
+<td >Unigram</td>
+<td >BAAI/bge-reranker-v2-m3</td>
+<td >98.37</td>
 <td >245</td>
 </tr>
 <tr>
-<td >SentencePiece</td>
-<td >xlm-roberta-base_legacy_sp_backend</td>
-<td >95.10</td>
+<td >Unigram</td>
+<td >camembert-base</td>
+<td >84.49</td>
 <td >245</td>
 </tr>
 <tr>
-<td >SentencePiece</td>
-<td >xlm-roberta-base_sp_backend</td>
-<td >95.10</td>
+<td >Unigram</td>
+<td >facebook/musicgen-small</td>
+<td >98.37</td>
 <td >245</td>
 </tr>
 <tr>
-<td >SentencePiece</td>
-<td >xlnet-base-cased_legacy_sp_backend</td>
-<td >57.96</td>
+<td >Unigram</td>
+<td >google/flan-t5-xxl</td>
+<td >91.84</td>
 <td >245</td>
 </tr>
 <tr>
-<td >SentencePiece</td>
-<td >xlnet-base-cased_sp_backend</td>
-<td >64.49</td>
+<td >Unigram</td>
+<td >microsoft/deberta-v3-base</td>
+<td >98.37</td>
 <td >245</td>
 </tr>
 <tr>
-<td >Tiktoken</td>
-<td >Qwen/Qwen-14B-Chat</td>
+<td >Unigram</td>
+<td >rinna/bilingual-gpt-neox-4b</td>
 <td >100.00</td>
-<td >261</td>
-</tr>
-<tr>
-<td >Tiktoken</td>
-<td >THUDM/glm-4-9b-chat</td>
-<td >93.16</td>
-<td >263</td>
+<td >245</td>
 </tr>
 <tr>
 <td >WordLevel</td>
```

python/openvino_tokenizers/__init__.py (+1, -1)
```diff
@@ -2,6 +2,7 @@
 # Copyright (C) 2018-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 import functools
+import logging
 import os
 import site
 import sys
@@ -11,7 +12,6 @@
 
 import openvino
 from openvino.utils.node_factory import NodeFactory
-import logging
 
 
 logger = logging.getLogger(__name__)
```

python/openvino_tokenizers/build_tokenizer.py (+1, -2)
```diff
@@ -1,7 +1,6 @@
 from typing import Iterable, Tuple
 
-from openvino import Model, PartialShape, Type
-from openvino import op
+from openvino import Model, PartialShape, Type, op
 from openvino import opset12 as opset
 from openvino.utils.types import make_constant_node
 
```

python/openvino_tokenizers/convert_tokenizer.py (+2, -4)
```diff
@@ -10,9 +10,9 @@
 
 from openvino import Model, Type
 from openvino.exceptions import OVTypeError
-from openvino_tokenizers.tokenizer_transformations import add_second_input
 
 from openvino_tokenizers.constants import UTF8ReplaceMode
+from openvino_tokenizers.tokenizer_transformations import add_second_input
 from openvino_tokenizers.utils import (
     TokenzierConversionParams,
     change_inputs_type,
@@ -112,7 +112,6 @@ def convert_tokenizer(
         convert_fast_tokenizer,
         convert_sentencepiece_model_tokenizer,
         convert_tiktoken_model_tokenizer,
-        is_sentencepiece_bpe_model,
         is_sentencepiece_model,
         is_tiktoken_model,
     )
@@ -122,9 +121,8 @@ def convert_tokenizer(
         tokenizer_object.model_max_length = params.max_length
 
     can_use_sentencepiece = is_sentencepiece_model(tokenizer_object)
-    is_unigram = can_use_sentencepiece and not is_sentencepiece_bpe_model(tokenizer_object)
     if isinstance(tokenizer_object, PreTrainedTokenizerBase):
-        if can_use_sentencepiece and (is_unigram or not tokenizer_object.is_fast or params.use_sentencepiece_backend):
+        if can_use_sentencepiece and (not tokenizer_object.is_fast or params.use_sentencepiece_backend):
             logger.info("Convert tokenizer using SentencePiece .model file.")
             ov_tokenizers = convert_sentencepiece_model_tokenizer(tokenizer_object, params)
         elif is_tiktoken_model(tokenizer_object):
```
