Commit e9d708d

Cherry-Pick From Master (#112)
* Parse QWEN EOD_TOKEN_ID (cherry picked from commit 029a341)
* Fix Parse (cherry picked from commit 0298fea)
* Fix Test (cherry picked from commit fd220db)
1 parent 78c2899 · commit e9d708d

File tree

3 files changed (+17 lines, -8 lines)

python/openvino_tokenizers/hf_parser.py (+8, -7)

@@ -156,8 +156,8 @@ def parse(
             ),
         ]:
             add_steps()
-        self.pipeline.eos_token_id = getattr(self.original_tokenizer, "eos_token_id", None)

+        self.pipeline.eos_token_id = self.pipeline.get_eos_token_id(self.original_tokenizer)
         return self.pipeline

     normalizers_map: Dict[
@@ -522,8 +522,9 @@ def convert_sentencepiece_model_tokenizer(
     tokenizer = Model(outputs, [input_node], TOKENIZER_NAME)
     tokenizer.validate_nodes_and_infer_types()

-    if hf_tokenizer.eos_token_id is not None:
-        tokenizer.set_rt_info(hf_tokenizer.eos_token_id, EOS_TOKEN_ID_NAME)
+    eos_token_id = TokenizerPipeline.get_eos_token_id(hf_tokenizer)
+    if eos_token_id is not None:
+        tokenizer.set_rt_info(eos_token_id, EOS_TOKEN_ID_NAME)

     if not with_detokenizer:
         return tokenizer
@@ -537,8 +538,8 @@ def convert_sentencepiece_model_tokenizer(
         clean_up_tokenization_spaces=clean_up_tokenization_spaces,
     )

-    if hf_tokenizer.eos_token_id is not None:
-        detokenizer.set_rt_info(hf_tokenizer.eos_token_id, EOS_TOKEN_ID_NAME)
+    if eos_token_id is not None:
+        detokenizer.set_rt_info(eos_token_id, EOS_TOKEN_ID_NAME)

     return tokenizer, detokenizer

@@ -613,9 +614,9 @@ def convert_tiktoken_model_tokenizer(
     if clean_up_tokenization_spaces:
         pipeline.add_steps(RegexDecodingStep.clean_up_tokenization_spaces())

+    pipeline.eos_token_id = pipeline.get_eos_token_id(hf_tokenizer)
+
     if not with_detokenizer:
         return pipeline.get_tokenizer_ov_subgraph()

-    pipeline.eos_token_id = hf_tokenizer.eos_token_id
-
     return pipeline.get_tokenizer_ov_subgraph(), pipeline.get_detokenizer_ov_subgraph()
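
With the call sites above delegating to the shared helper, converting a Qwen tokenizer (which exposes eod_id rather than eos_token_id) now records the EOS id as well. A minimal usage sketch, with an illustrative checkpoint name, using the package's public convert_tokenizer entry point:

from transformers import AutoTokenizer

from openvino_tokenizers import convert_tokenizer

# Checkpoint name is illustrative; trust_remote_code is needed because Qwen
# ships its own tiktoken-based tokenizer class, which leaves eos_token_id unset.
hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)

# The tiktoken conversion path now falls back to hf_tokenizer.eod_id when
# eos_token_id is missing (see the last hunk above).
ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)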

python/openvino_tokenizers/tokenizer_pipeline.py (+8)

@@ -901,6 +901,14 @@ def _(self, steps: list) -> None:
     def __getitem__(self, item: int) -> BasePipelineStep:
         return self.steps[item]

+    @staticmethod
+    def get_eos_token_id(hf_tokenizer) -> Optional[int]:
+        if hf_tokenizer.eos_token_id is not None:
+            return hf_tokenizer.eos_token_id
+
+        # qwen uses the eod_id attribute
+        return getattr(hf_tokenizer, "eod_id", None)
+
     def get_tokenizer_ov_subgraph(self) -> Model:
         string_inputs = [op.Parameter(Type.string, PartialShape(["?"])) for _ in range(self.number_of_inputs)]
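
The helper's fallback behaviour in isolation, as a small self-contained sketch; the SimpleNamespace stand-ins and the id value 151643 are purely illustrative, not real tokenizers:

from types import SimpleNamespace

from openvino_tokenizers.tokenizer_pipeline import TokenizerPipeline

# Stand-ins for illustration: a Qwen-style tokenizer exposing only eod_id,
# a regular tokenizer with eos_token_id, and one with neither id set.
qwen_like = SimpleNamespace(eos_token_id=None, eod_id=151643)
regular = SimpleNamespace(eos_token_id=2)
without_eos = SimpleNamespace(eos_token_id=None)

assert TokenizerPipeline.get_eos_token_id(qwen_like) == 151643  # falls back to eod_id
assert TokenizerPipeline.get_eos_token_id(regular) == 2         # eos_token_id takes priority
assert TokenizerPipeline.get_eos_token_id(without_eos) is None  # nothing to report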

tests/tokenizers_test.py (+1, -1)

@@ -498,7 +498,7 @@ def test_eos_token_id_rt_info_bpe(hf_bpe_tokenizers):


 def test_eos_token_id_rt_info_tiktoken(hf_tiktoken_tokenizers):
-    eos_token_id = hf_tiktoken_tokenizers.eos_token_id
+    eos_token_id = hf_tiktoken_tokenizers.eos_token_id or hf_tiktoken_tokenizers.eod_id
     ov_tokenizer, ov_detokenizer = convert_tokenizer(
         hf_tiktoken_tokenizers,
         with_detokenizer=True,
