
Commit c615ec5

Cherry Pick For Release (#174)
* Fix Tokenization of Special Tokens in Sentencepiece (cherry picked from commit 6093bd1)
* Add Left Padding and Padding to Max Length (cherry picked from commit 128f7fc)
1 parent e5cb83b
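The two cherry-picked changes are behavioral: a special token embedded in input text should survive SentencePiece tokenization as its single reserved id rather than being split into subword pieces, and tokenizers gain left padding plus padding to a fixed maximum length. Below is a minimal sketch of both behaviors using the Hugging Face `transformers` API as a stand-in; the converted tokenizer's own API is not shown in this diff, and the model names are only examples.

```python
# Illustrative only: Hugging Face `transformers` stands in for the converted
# tokenizer, whose API this diff does not show.
from transformers import AutoTokenizer

# 1. Special tokens in SentencePiece: "</s>" should become one reserved id,
#    not a sequence of subword pieces.
t5 = AutoTokenizer.from_pretrained("t5-small")  # SentencePiece-based
ids = t5("hello </s> world", add_special_tokens=False)["input_ids"]
assert t5.eos_token_id in ids  # the special token survives as a single id

# 2. Left padding and padding to max length.
gpt2 = AutoTokenizer.from_pretrained("gpt2")
gpt2.pad_token = gpt2.eos_token   # gpt2 defines no pad token by default
gpt2.padding_side = "left"        # pad ids go before the text
batch = gpt2(
    ["short", "a somewhat longer input"],
    padding="max_length",         # pad every row up to max_length
    max_length=8,
)
print(batch["input_ids"])         # padding appears at the start of each row
print(batch["attention_mask"])    # 0 over padding, 1 over real tokens
```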

10 files changed: +454 −149 lines

README.md (+82 −82)
@@ -336,8 +336,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 <tbody>
 <tr>
 <td >BPE</td>
-<td >96.57</td>
-<td >4991</td>
+<td >94.45</td>
+<td >5535</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
@@ -346,13 +346,13 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 </tr>
 <tr>
 <td >Tiktoken</td>
-<td >98.17</td>
-<td >218</td>
+<td >93.98</td>
+<td >266</td>
 </tr>
 <tr>
 <td >WordPiece</td>
-<td >94.97</td>
-<td >1053</td>
+<td >91.31</td>
+<td >1301</td>
 </tr>
 </tbody>
 </table>
@@ -372,140 +372,140 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 <tr>
 <td >BPE</td>
 <td >EleutherAI/gpt-j-6b</td>
-<td >98.16</td>
-<td >217</td>
+<td >95.18</td>
+<td >249</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >EleutherAI/gpt-neo-125m</td>
-<td >98.16</td>
-<td >217</td>
+<td >95.18</td>
+<td >249</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >EleutherAI/gpt-neox-20b</td>
-<td >97.24</td>
-<td >217</td>
+<td >95.71</td>
+<td >233</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >EleutherAI/pythia-12b-deduped</td>
-<td >97.24</td>
-<td >217</td>
+<td >95.71</td>
+<td >233</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >KoboldAI/fairseq-dense-13B</td>
-<td >98.16</td>
-<td >217</td>
+<td >96.57</td>
+<td >233</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >NousResearch/Meta-Llama-3-8B-Instruct</td>
-<td >97.24</td>
-<td >217</td>
+<td >95.71</td>
+<td >233</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >Salesforce/codegen-16B-multi</td>
-<td >99.08</td>
-<td >217</td>
+<td >95.98</td>
+<td >249</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >Xenova/gpt-4o</td>
-<td >97.24</td>
-<td >217</td>
+<td >94.38</td>
+<td >249</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >ai-forever/rugpt3large_based_on_gpt2</td>
-<td >96.31</td>
-<td >217</td>
+<td >90.36</td>
+<td >249</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >bigscience/bloom</td>
-<td >99.08</td>
-<td >217</td>
+<td >97.42</td>
+<td >233</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >databricks/dolly-v2-3b</td>
-<td >97.24</td>
-<td >217</td>
+<td >95.71</td>
+<td >233</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >facebook/bart-large-mnli</td>
-<td >98.16</td>
-<td >217</td>
+<td >95.18</td>
+<td >249</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >facebook/galactica-120b</td>
-<td >97.24</td>
-<td >217</td>
+<td >95.71</td>
+<td >233</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >facebook/opt-66b</td>
-<td >98.16</td>
-<td >217</td>
+<td >96.57</td>
+<td >233</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >gpt2</td>
-<td >98.16</td>
-<td >217</td>
+<td >95.18</td>
+<td >249</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >laion/CLIP-ViT-bigG-14-laion2B-39B-b160k</td>
-<td >70.97</td>
-<td >217</td>
+<td >74.70</td>
+<td >249</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >microsoft/deberta-base</td>
-<td >98.16</td>
-<td >217</td>
+<td >96.57</td>
+<td >233</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >roberta-base</td>
-<td >98.16</td>
-<td >217</td>
+<td >95.18</td>
+<td >249</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >sentence-transformers/all-roberta-large-v1</td>
-<td >98.16</td>
-<td >217</td>
+<td >95.18</td>
+<td >249</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >stabilityai/stablecode-completion-alpha-3b-4k</td>
-<td >97.24</td>
-<td >217</td>
+<td >95.71</td>
+<td >233</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >stabilityai/stablelm-2-1_6b</td>
-<td >97.24</td>
-<td >217</td>
+<td >95.71</td>
+<td >233</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >stabilityai/stablelm-tuned-alpha-7b</td>
-<td >97.24</td>
-<td >217</td>
+<td >95.71</td>
+<td >233</td>
 </tr>
 <tr>
 <td >BPE</td>
 <td >tiiuae/falcon-7b</td>
-<td >97.24</td>
-<td >217</td>
+<td >94.38</td>
+<td >249</td>
 </tr>
 <tr>
 <td >SentencePiece</td>
@@ -630,92 +630,92 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 <tr>
 <td >Tiktoken</td>
 <td >Qwen/Qwen-14B-Chat</td>
-<td >98.17</td>
-<td >109</td>
+<td >92.91</td>
+<td >141</td>
 </tr>
 <tr>
 <td >Tiktoken</td>
 <td >Salesforce/xgen-7b-8k-base</td>
-<td >98.17</td>
-<td >109</td>
+<td >95.20</td>
+<td >125</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >ProsusAI/finbert</td>
-<td >97.53</td>
-<td >81</td>
+<td >91.43</td>
+<td >105</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >bert-base-multilingual-cased</td>
-<td >97.53</td>
-<td >81</td>
+<td >91.43</td>
+<td >105</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >bert-base-uncased</td>
-<td >97.53</td>
-<td >81</td>
+<td >91.43</td>
+<td >105</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >cointegrated/rubert-tiny2</td>
-<td >91.36</td>
-<td >81</td>
+<td >91.43</td>
+<td >105</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >distilbert-base-uncased-finetuned-sst-2-english</td>
-<td >97.53</td>
-<td >81</td>
+<td >91.43</td>
+<td >105</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >google/electra-base-discriminator</td>
-<td >97.53</td>
-<td >81</td>
+<td >91.43</td>
+<td >105</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >google/mobilebert-uncased</td>
-<td >97.53</td>
-<td >81</td>
+<td >94.38</td>
+<td >89</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >jhgan/ko-sbert-sts</td>
-<td >87.65</td>
-<td >81</td>
+<td >91.43</td>
+<td >105</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >prajjwal1/bert-mini</td>
-<td >97.53</td>
-<td >81</td>
+<td >94.38</td>
+<td >89</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >rajiv003/ernie-finetuned-qqp</td>
-<td >97.53</td>
-<td >81</td>
+<td >94.38</td>
+<td >89</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >rasa/LaBSE</td>
-<td >90.12</td>
-<td >81</td>
+<td >80.00</td>
+<td >105</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >sentence-transformers/all-MiniLM-L6-v2</td>
-<td >87.65</td>
-<td >81</td>
+<td >91.43</td>
+<td >105</td>
 </tr>
 <tr>
 <td >WordPiece</td>
 <td >squeezebert/squeezebert-uncased</td>
-<td >97.53</td>
-<td >81</td>
+<td >94.38</td>
+<td >89</td>
 </tr>
 </tbody>
 </table>
