Commit a0d8203

Support GLM-4 Tokenizer (#202)

* Support GLM-4 Tokenizer (cherry picked from commit 9046fa9)
* Support GLM-4 Detokenizer (cherry picked from commit 5c93773)
* Add Reduce Size Build Example

1 parent e5b51db commit a0d8203

8 files changed (+223 −13)
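
For orientation, a minimal sketch of how the newly supported tokenizer can be converted after this change, using the library's `convert_tokenizer` entry point. The checkpoint name is taken from the test report below; `trust_remote_code=True` is assumed to be needed because GLM-4 ships a custom tokenizer class:

```python
# Minimal conversion sketch (assumes transformers and openvino_tokenizers
# are installed and the checkpoint's custom tokenizer code is trusted).
from transformers import AutoTokenizer

from openvino_tokenizers import convert_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b", trust_remote_code=True)

# Ask for both models, since this commit adds the tokenizer and the detokenizer.
ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)
```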

README.md

+12 −2

````diff
@@ -183,6 +183,10 @@ By default, all available ICU locales are supported, which significantly increas
 ```bash
 -DBUILD_FAST_TOKENIZERS=ON
 ```
+- Example for a pip installation path:
+```bash
+ICU_DATA_FILTER_FILE=</path/to/filters.json> pip install git+https://github.com/openvinotoolkit/openvino_tokenizers.git --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --config-settings=override=cmake.options.BUILD_FAST_TOKENIZERS=ON
+```

 By following these instructions, you can effectively reduce the size of the ICU libraries in your final package.

````
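
The `filters.json` referenced above is an ICU data build filter; its schema comes from ICU's data build tool, not from this repository. A minimal hypothetical filter that keeps only English locale data might look like:

```json
{
  "localeFilter": {
    "filterType": "locale",
    "includelist": ["en"]
  }
}
```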

```diff
@@ -434,8 +438,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 </tr>
 <tr>
 <td >Tiktoken</td>
-<td >94.12</td>
-<td >272</td>
+<td >87.26</td>
+<td >416</td>
 </tr>
 <tr>
 <td >WordPiece</td>
@@ -745,6 +749,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 <td >95.31</td>
 <td >128</td>
 </tr>
+<tr>
+<td >Tiktoken</td>
+<td >THUDM/glm-4-9b</td>
+<td >74.31</td>
+<td >144</td>
+</tr>
 <tr>
 <td >WordPiece</td>
 <td >ProsusAI/finbert</td>
```

python/openvino_tokenizers/hf_parser.py

+32 −5

```diff
@@ -30,6 +30,7 @@
     TOKENIZER_NAME,
 )
 from .tokenizer_pipeline import (
+    AddToken,
     BasePipelineStep,
     BPETokenizationStep,
     ByteFallbackStep,
@@ -47,6 +48,7 @@
     RegexDecodingStep,
     RegexNormalizationStep,
     RegexSplitStep,
+    Sequence,
     StripStringStep,
     TokenizerPipeline,
     TruncationStep,
```

```diff
@@ -449,10 +451,26 @@ def is_sentencepiece_model(hf_tokenizer: PreTrainedTokenizerBase) -> bool:
         if not hasattr(hf_tokenizer, "vocab_files_names") or "vocab_file" not in hf_tokenizer.vocab_files_names:
             return False
         vocab_file = Path(tmp) / hf_tokenizer.vocab_files_names["vocab_file"]
-        return (
+        vocab_file_exists = (
             getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".model")
             and vocab_file.exists()
         )
+        if vocab_file_exists:
+            try:
+                from google.protobuf.message import DecodeError
+            except (ImportError, ModuleNotFoundError):
+                return False
+
+            model_pb = import_protobuf()
+            model = model_pb.ModelProto()
+            try:
+                with open(vocab_file, "rb") as model_file:
+                    model.ParseFromString(model_file.read())
+                return True
+            except DecodeError:
+                pass  # protobuf file is corrupted
+
+        return False


 def modify_sentencepiece_model(
```
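
The protobuf round-trip above exists because a `.model` file name no longer guarantees a SentencePiece model: GLM-4 ships a tiktoken-style `tokenizer.model` that would pass the extension check alone. A standalone sketch of the same validation idea, using the protobuf definition bundled with the `sentencepiece` package instead of the repository's `import_protobuf` helper:

```python
# Sketch: treat a file as SentencePiece only if it parses as a ModelProto.
# Assumes the sentencepiece package is installed; the path is hypothetical.
from google.protobuf.message import DecodeError
from sentencepiece import sentencepiece_model_pb2


def looks_like_sentencepiece(path: str) -> bool:
    model = sentencepiece_model_pb2.ModelProto()
    try:
        with open(path, "rb") as model_file:
            model.ParseFromString(model_file.read())
    except DecodeError:
        return False  # e.g. a tiktoken ranks file named tokenizer.model
    return True
```
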
```diff
@@ -831,11 +849,13 @@ def get_sp_detokenizer(
 def is_tiktoken_model(hf_tokenizer: PreTrainedTokenizerBase) -> bool:
     try:
         from tiktoken import Encoding
-    except ImportError:
+    except (ImportError, ModuleNotFoundError):
         return False

-    return getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".tiktoken") or isinstance(
-        getattr(hf_tokenizer, "encoder", None), Encoding
+    return (
+        getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".tiktoken")
+        or isinstance(getattr(hf_tokenizer, "encoder", None), Encoding)
+        or isinstance(getattr(hf_tokenizer, "tokenizer", None), Encoding)
     )
```

```diff
@@ -854,13 +874,20 @@ def convert_tiktoken_model_tokenizer(
     if skip_special_tokens:
         skip_tokens = list(parse_special_tokens(hf_tokenizer))

+    add_prefix_steps = []
+    if hasattr(hf_tokenizer, "get_prefix_tokens"):
+        prefix_tokens = [AddToken(_token_id=token_id) for token_id in hf_tokenizer.get_prefix_tokens()]
+        add_prefix_steps.append(CombineSegmentsStep(inputs=prefix_tokens + [Sequence()]))
+
+    reference_vocab = getattr(hf_tokenizer, "get_vocab", lambda: None)()
     pipeline.add_steps(
         [
             NormalizeUnicode("NFC"),
             RegexSplitStep(split_pattern, behaviour="contiguous"),
             BytesToCharsStep(),
-            BPETokenizationStep.from_tiktoken_encoding(encoding),
+            BPETokenizationStep.from_tiktoken_encoding(encoding, reference_vocab=reference_vocab),
             TruncationStep.from_hf_object(hf_tokenizer),
+            *add_prefix_steps,
             PaddingStep(
                 token=getattr(hf_tokenizer, "pad_token"),
                 _token_id=getattr(hf_tokenizer, "pad_token_id"),
```

python/openvino_tokenizers/tiktoken_parser.py

+6 −2

```diff
@@ -37,7 +37,8 @@ def generate_vocab_and_merges(encoding: Encoding) -> Tuple[Dict[str, int], List[
     added_tokens = {}

     for token, rank in mergeable_ranks.items():
-        vocab[token_bytes_to_string(token)] = rank
+        string_token = token_bytes_to_string(token)
+        vocab[string_token] = rank

         if len(token) == 1:
             continue
@@ -50,7 +51,10 @@ def generate_vocab_and_merges(encoding: Encoding) -> Tuple[Dict[str, int], List[
         if len(merged) == 2:
             merges.append(" ".join(map(token_bytes_to_string, merged)))
         else:
-            added_tokens[rank] = token.decode("utf-8")
+            try:
+                added_tokens[rank] = token.decode("utf-8")
+            except UnicodeDecodeError:
+                added_tokens[rank] = string_token

     # Also add special tokens
     vocab.update(encoding._special_tokens)
```
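
The new `try`/`except` in the second hunk matters because tiktoken ranks are raw byte sequences, and a rank can hold bytes that are not valid UTF-8 on their own (for example, one half of a multi-byte character). A quick self-contained demonstration:

```python
# b"\xe6\xb5" is the first two bytes of a three-byte UTF-8 character, so
# decoding it in isolation fails and the byte-to-string fallback is used.
token = b"\xe6\xb5"
try:
    token.decode("utf-8")
except UnicodeDecodeError:
    print("not valid UTF-8 on its own")  # this branch is taken
```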

python/openvino_tokenizers/tokenizer_pipeline.py

+20 −1

```diff
@@ -528,11 +528,30 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep":
     def from_tiktoken_encoding(
         cls,
         encoding: "Encoding",  # noqa
+        reference_vocab: Optional[Dict[Union[str, bytes], int]] = None,
     ) -> "BPETokenizationStep":
-        from .tiktoken_parser import generate_vocab_and_merges
+        from .tiktoken_parser import generate_vocab_and_merges, token_bytes_to_string
+        from .utils import apply_bytes_to_unicode

         vocab, merges, added_tokens = generate_vocab_and_merges(encoding)
         added_tokens.update({idx: token for token, idx in encoding._special_tokens.items()})
+
+        if reference_vocab is not None:
+            existing_indices = set(vocab.values())
+
+            for ref_token, ref_idx in reference_vocab.items():
+                if ref_idx in existing_indices:
+                    continue
+
+                if isinstance(ref_token, bytes):
+                    ref_token = token_bytes_to_string(ref_token)
+
+                # (chat)GLM model adds spaces around <sop> token
+                if ref_token == "<sop>":
+                    ref_token = f" {ref_token} "
+
+                vocab[apply_bytes_to_unicode(ref_token)] = ref_idx
+
         return cls(
             unk_token="",
             fuse_unk=False,
```
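
The `reference_vocab` merge fills index gaps: the tiktoken mergeable ranks may not cover every id reported by the HF tokenizer's `get_vocab()`, and a hole in the vocabulary would break decoding. A toy sketch of the idea with hypothetical tokens and ids:

```python
# Toy illustration of the gap-filling loop (all values are hypothetical).
vocab = {"a": 0, "b": 1}  # derived from tiktoken mergeable ranks
reference_vocab = {"a": 0, "b": 1, "<sop>": 2}  # from hf_tokenizer.get_vocab()

existing_indices = set(vocab.values())
for ref_token, ref_idx in reference_vocab.items():
    if ref_idx not in existing_indices:
        vocab[ref_token] = ref_idx  # fill the hole so id 2 maps to a token

assert vocab["<sop>"] == 2
```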

src/vocab_decoder.cpp

+4 −2

```diff
@@ -46,7 +46,6 @@ bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
     auto new_ends = outputs[3].data<int32_t>();

     std::deque<uint8_t> buffer;
-
     for(size_t batch = 0; batch < batch_size; ++batch) {
         new_ragged_begins[batch] = batch * ((seq_len > 0) ? seq_len : 1);
         new_ragged_ends[batch] = new_ragged_begins[batch] + ((seq_len > 0) ? seq_len : 1);
@@ -60,7 +59,10 @@ bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
         for(size_t seq = new_ragged_begins[batch]; seq < new_ragged_ends[batch]; ++seq) {
             auto token_id = input_data[seq];
             new_begins[seq] = buffer.size();
-            if (std::find(m_skip_tokens.begin(), m_skip_tokens.end(), token_id) == m_skip_tokens.end()) {
+            if (
+                token_id < vocab_size
+                && std::find(m_skip_tokens.begin(), m_skip_tokens.end(), token_id) == m_skip_tokens.end()
+            ) {
                 buffer.insert(
                     buffer.end(),
                     vocab_chars + vocab_begins[token_id],
```
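
The added `token_id < vocab_size` guard prevents reads past the end of the vocabulary buffers when an id (for example, a special token defined outside the base vocab) exceeds the vocab tensor's size; such ids are now skipped. A Python analogue of the guard's semantics, with hypothetical data:

```python
# Out-of-range ids are dropped instead of causing an out-of-bounds read.
vocab = ["a", "b", "c"]
skip_tokens = {1}
token_ids = [0, 1, 2, 7]  # 7 is out of range, e.g. a special-token id

decoded = [vocab[i] for i in token_ids if i < len(vocab) and i not in skip_tokens]
assert decoded == ["a", "c"]
```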

tests/pass_rates.json

+1 −1

```diff
@@ -1,3 +1,3 @@
 {
-    "tests/tokenizers_test.py::test_": 0.8899194532584818
+    "tests/tokenizers_test.py::test_": 0.8882187374346602
 }
```
