@@ -446,7 +446,7 @@ int main(int argc, char* argv[]) {
446
446
|---------------------------------|----------------------|----------|-----------|
447
447
| Fast | WordPiece | ✅ | ✅ |
448
448
| | BPE | ✅ | ✅ |
449
- | | Unigram | ❌ | ❌ |
449
+ | | Unigram | ✅ | ✅ |
450
450
| | WordLevel* | ✅ | ✅ |
451
451
| Legacy | SentencePiece .model | ✅ | ✅ |
452
452
| Custom | tiktoken | ✅ | ✅ |
@@ -469,19 +469,24 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
469
469
<tbody>
470
470
<tr>
471
471
<td >BPE</td>
472
- <td >99.61 </td>
473
- <td >4560 </td>
472
+ <td >99.46 </td>
473
+ <td >5546 </td>
474
474
</tr>
475
475
<tr>
476
476
<td >SentencePiece</td>
477
- <td >89.19 </td>
478
- <td >6633 </td>
477
+ <td >89.82 </td>
478
+ <td >5157 </td>
479
479
</tr>
480
480
<tr>
481
481
<td >Tiktoken</td>
482
482
<td >96.56</td>
483
483
<td >524</td>
484
484
</tr>
485
+ <tr>
486
+ <td >Unigram</td>
487
+ <td >95.24</td>
488
+ <td >1470</td>
489
+ </tr>
485
490
<tr>
486
491
<td >WordLevel</td>
487
492
<td >98.96</td>
@@ -507,6 +512,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
507
512
</tr>
508
513
</thead>
509
514
<tbody>
515
+ <tr>
516
+ <td >BPE</td>
517
+ <td >NousResearch/Llama-2-13b-hf</td>
518
+ <td >97.55</td>
519
+ <td >245</td>
520
+ </tr>
510
521
<tr>
511
522
<td >BPE</td>
512
523
<td >NousResearch/Meta-Llama-3-8B-Instruct</td>
@@ -519,6 +530,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
519
530
<td >100.00</td>
520
531
<td >261</td>
521
532
</tr>
533
+ <tr>
534
+ <td >BPE</td>
535
+ <td >TinyLlama/TinyLlama-1.1B-Chat-v1.0</td>
536
+ <td >100.00</td>
537
+ <td >247</td>
538
+ </tr>
522
539
<tr>
523
540
<td >BPE</td>
524
541
<td >Xenova/gpt-4o</td>
@@ -585,12 +602,24 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
585
602
<td >100.00</td>
586
603
<td >261</td>
587
604
</tr>
605
+ <tr>
606
+ <td >BPE</td>
607
+ <td >microsoft/Phi-3-mini-128k-instruct</td>
608
+ <td >100.00</td>
609
+ <td >247</td>
610
+ </tr>
588
611
<tr>
589
612
<td >BPE</td>
590
613
<td >microsoft/deberta-base</td>
591
614
<td >100.00</td>
592
615
<td >245</td>
593
616
</tr>
617
+ <tr>
618
+ <td >BPE</td>
619
+ <td >mlx-community/quantized-gemma-7b-it</td>
620
+ <td >97.57</td>
621
+ <td >247</td>
622
+ </tr>
594
623
<tr>
595
624
<td >BPE</td>
596
625
<td >roberta-base</td>
@@ -617,22 +646,28 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
617
646
</tr>
618
647
<tr>
619
648
<td >SentencePiece</td>
620
- <td >NousResearch/Llama-2-13b-hf </td>
621
- <td >97.55 </td>
649
+ <td >BAAI/bge-reranker-v2-m3 </td>
650
+ <td >96.73 </td>
622
651
<td >245</td>
623
652
</tr>
624
653
<tr>
625
654
<td >SentencePiece</td>
626
- <td >NousResearch/Llama-2-13b-hf_legacy_sp_backend </td>
627
- <td >97.55 </td>
655
+ <td >BAAI/bge-reranker-v2-m3_legacy </td>
656
+ <td >96.73 </td>
628
657
<td >245</td>
629
658
</tr>
630
659
<tr>
631
660
<td >SentencePiece</td>
632
- <td >NousResearch/Llama-2-13b-hf_sp_backend </td>
661
+ <td >NousResearch/Llama-2-13b-hf </td>
633
662
<td >94.29</td>
634
663
<td >245</td>
635
664
</tr>
665
+ <tr>
666
+ <td >SentencePiece</td>
667
+ <td >NousResearch/Llama-2-13b-hf_legacy</td>
668
+ <td >97.55</td>
669
+ <td >245</td>
670
+ </tr>
636
671
<tr>
637
672
<td >SentencePiece</td>
638
673
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0</td>
@@ -641,153 +676,147 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
641
676
</tr>
642
677
<tr>
643
678
<td >SentencePiece</td>
644
- <td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_legacy_sp_backend </td>
679
+ <td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_legacy </td>
645
680
<td >98.38</td>
646
681
<td >247</td>
647
682
</tr>
648
683
<tr>
649
684
<td >SentencePiece</td>
650
- <td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_sp_backend </td>
685
+ <td >baichuan-inc/Baichuan2-7B-Chat_legacy </td>
651
686
<td >100.00</td>
652
- <td >247 </td>
687
+ <td >245 </td>
653
688
</tr>
654
689
<tr>
655
690
<td >SentencePiece</td>
656
- <td >baichuan-inc/Baichuan2-7B-Chat_legacy_sp_backend </td>
657
- <td >100.00 </td>
691
+ <td >camembert-base </td>
692
+ <td >55.10 </td>
658
693
<td >245</td>
659
694
</tr>
660
695
<tr>
661
696
<td >SentencePiece</td>
662
- <td >camembert-base_legacy_sp_backend </td>
663
- <td >75.51 </td>
697
+ <td >camembert-base_legacy </td>
698
+ <td >78.37 </td>
664
699
<td >245</td>
665
700
</tr>
666
701
<tr>
667
702
<td >SentencePiece</td>
668
- <td >camembert-base_sp_backend </td>
669
- <td >52.24 </td>
703
+ <td >facebook/musicgen-small </td>
704
+ <td >82.45 </td>
670
705
<td >245</td>
671
706
</tr>
672
707
<tr>
673
708
<td >SentencePiece</td>
674
- <td >facebook/musicgen-small_legacy_sp_backend </td>
675
- <td >78.37 </td>
709
+ <td >facebook/musicgen-small_legacy </td>
710
+ <td >77.14 </td>
676
711
<td >245</td>
677
712
</tr>
678
713
<tr>
679
714
<td >SentencePiece</td>
680
- <td >facebook/musicgen-small_sp_backend </td>
681
- <td >83.67 </td>
715
+ <td >google/flan-t5-xxl </td>
716
+ <td >75.92 </td>
682
717
<td >245</td>
683
718
</tr>
684
719
<tr>
685
720
<td >SentencePiece</td>
686
- <td >microsoft/Phi-3-mini-128k-instruct </td>
687
- <td >100.00 </td>
688
- <td >247 </td>
721
+ <td >google/flan-t5-xxl_legacy </td>
722
+ <td >75.51 </td>
723
+ <td >245 </td>
689
724
</tr>
690
725
<tr>
691
726
<td >SentencePiece</td>
692
- <td >microsoft/Phi-3-mini-128k-instruct_legacy_sp_backend </td>
693
- <td >97.57 </td>
727
+ <td >microsoft/Phi-3-mini-128k-instruct </td>
728
+ <td >99.19 </td>
694
729
<td >247</td>
695
730
</tr>
696
731
<tr>
697
732
<td >SentencePiece</td>
698
- <td >microsoft/Phi-3-mini-128k-instruct_sp_backend </td>
699
- <td >99.19 </td>
733
+ <td >microsoft/Phi-3-mini-128k-instruct_legacy </td>
734
+ <td >97.57 </td>
700
735
<td >247</td>
701
736
</tr>
702
737
<tr>
703
738
<td >SentencePiece</td>
704
- <td >microsoft/deberta-v3-base_legacy_sp_backend </td>
705
- <td >100.00 </td>
739
+ <td >microsoft/deberta-v3-base </td>
740
+ <td >95.10 </td>
706
741
<td >245</td>
707
742
</tr>
708
743
<tr>
709
744
<td >SentencePiece</td>
710
- <td >microsoft/deberta-v3-base_sp_backend </td>
711
- <td >96.73 </td>
745
+ <td >microsoft/deberta-v3-base_legacy </td>
746
+ <td >98.37 </td>
712
747
<td >245</td>
713
748
</tr>
714
749
<tr>
715
750
<td >SentencePiece</td>
716
751
<td >mlx-community/quantized-gemma-7b-it</td>
717
- <td >97.57 </td>
752
+ <td >96.76 </td>
718
753
<td >247</td>
719
754
</tr>
720
755
<tr>
721
756
<td >SentencePiece</td>
722
- <td >mlx-community/quantized-gemma-7b-it_legacy_sp_backend </td>
757
+ <td >mlx-community/quantized-gemma-7b-it_legacy </td>
723
758
<td >97.57</td>
724
759
<td >247</td>
725
760
</tr>
726
761
<tr>
727
762
<td >SentencePiece</td>
728
- <td >mlx-community/quantized-gemma-7b-it_sp_backend </td>
729
- <td >96.76 </td>
730
- <td >247 </td>
763
+ <td >rinna/bilingual-gpt-neox-4b </td>
764
+ <td >83.67 </td>
765
+ <td >245 </td>
731
766
</tr>
732
767
<tr>
733
768
<td >SentencePiece</td>
734
- <td >rinna/bilingual-gpt-neox-4b_legacy_sp_backend </td>
735
- <td >86.12 </td>
769
+ <td >rinna/bilingual-gpt-neox-4b_legacy </td>
770
+ <td >89.39 </td>
736
771
<td >245</td>
737
772
</tr>
738
773
<tr>
739
- <td >SentencePiece </td>
740
- <td >rinna/bilingual-gpt-neox-4b_sp_backend </td>
741
- <td >80.41 </td>
742
- <td >245 </td>
774
+ <td >Tiktoken </td>
775
+ <td >Qwen/Qwen-14B-Chat </td>
776
+ <td >100.00 </td>
777
+ <td >261 </td>
743
778
</tr>
744
779
<tr>
745
- <td >SentencePiece </td>
746
- <td >t5-base_legacy_sp_backend </td>
747
- <td >80.00 </td>
748
- <td >245 </td>
780
+ <td >Tiktoken </td>
781
+ <td >THUDM/glm-4-9b-chat </td>
782
+ <td >93.16 </td>
783
+ <td >263 </td>
749
784
</tr>
750
785
<tr>
751
- <td >SentencePiece </td>
752
- <td >t5-base_sp_backend </td>
753
- <td >85.31 </td>
786
+ <td >Unigram </td>
787
+ <td >BAAI/bge-reranker-v2-m3 </td>
788
+ <td >98.37 </td>
754
789
<td >245</td>
755
790
</tr>
756
791
<tr>
757
- <td >SentencePiece </td>
758
- <td >xlm-roberta-base_legacy_sp_backend </td>
759
- <td >95.10 </td>
792
+ <td >Unigram </td>
793
+ <td >camembert-base </td>
794
+ <td >84.49 </td>
760
795
<td >245</td>
761
796
</tr>
762
797
<tr>
763
- <td >SentencePiece </td>
764
- <td >xlm-roberta-base_sp_backend </td>
765
- <td >95.10 </td>
798
+ <td >Unigram </td>
799
+ <td >facebook/musicgen-small </td>
800
+ <td >98.37 </td>
766
801
<td >245</td>
767
802
</tr>
768
803
<tr>
769
- <td >SentencePiece </td>
770
- <td >xlnet-base-cased_legacy_sp_backend </td>
771
- <td >57.96 </td>
804
+ <td >Unigram </td>
805
+ <td >google/flan-t5-xxl </td>
806
+ <td >91.84 </td>
772
807
<td >245</td>
773
808
</tr>
774
809
<tr>
775
- <td >SentencePiece </td>
776
- <td >xlnet-base-cased_sp_backend </td>
777
- <td >64.49 </td>
810
+ <td >Unigram </td>
811
+ <td >microsoft/deberta-v3-base </td>
812
+ <td >98.37 </td>
778
813
<td >245</td>
779
814
</tr>
780
815
<tr>
781
- <td >Tiktoken </td>
782
- <td >Qwen/Qwen-14B-Chat </td>
816
+ <td >Unigram </td>
817
+ <td >rinna/bilingual-gpt-neox-4b </td>
783
818
<td >100.00</td>
784
- <td >261</td>
785
- </tr>
786
- <tr>
787
- <td >Tiktoken</td>
788
- <td >THUDM/glm-4-9b-chat</td>
789
- <td >93.16</td>
790
- <td >263</td>
819
+ <td >245</td>
791
820
</tr>
792
821
<tr>
793
822
<td >WordLevel</td>
0 commit comments