Commit a0d8203

Support GLM-4 Tokenizer (#202)

* Support GLM-4 Tokenizer (cherry picked from commit 9046fa9)
* Support GLM-4 Detokenizer (cherry picked from commit 5c93773)
* Add Reduce Size Build Example

1 parent e5b51db commit a0d8203

8 files changed (+223 −13)
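
For orientation, a minimal sketch of how the newly supported tokenizer can be converted after this change, using the library's `convert_tokenizer` entry point. The checkpoint name is taken from the test report below; `trust_remote_code=True` is assumed to be needed because GLM-4 ships a custom tokenizer class:

```python
# Minimal conversion sketch (assumes transformers and openvino_tokenizers
# are installed and the checkpoint's custom tokenizer code is trusted).
from transformers import AutoTokenizer

from openvino_tokenizers import convert_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b", trust_remote_code=True)

# Ask for both models, since this commit adds the tokenizer and the detokenizer.
ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)
```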

README.md

+12 −2

````diff
@@ -183,6 +183,10 @@ By default, all available ICU locales are supported, which significantly increas
 ```bash
 -DBUILD_FAST_TOKENIZERS=ON
 ```
+- Example for a pip installation path:
+```bash
+ICU_DATA_FILTER_FILE=</path/to/filters.json> pip install git+https://github.com/openvinotoolkit/openvino_tokenizers.git --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --config-settings=override=cmake.options.BUILD_FAST_TOKENIZERS=ON
+```

 By following these instructions, you can effectively reduce the size of the ICU libraries in your final package.

````
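
The `filters.json` referenced above is an ICU data build filter; its schema comes from ICU's data build tool, not from this repository. A minimal hypothetical filter that keeps only English locale data might look like:

```json
{
  "localeFilter": {
    "filterType": "locale",
    "includelist": ["en"]
  }
}
```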

```diff
@@ -434,8 +438,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 </tr>
 <tr>
 <td >Tiktoken</td>
-<td >94.12</td>
-<td >272</td>
+<td >87.26</td>
+<td >416</td>
 </tr>
 <tr>
 <td >WordPiece</td>
@@ -745,6 +749,12 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
 <td >95.31</td>
 <td >128</td>
 </tr>
+<tr>
+<td >Tiktoken</td>
+<td >THUDM/glm-4-9b</td>
+<td >74.31</td>
+<td >144</td>
+</tr>
 <tr>
 <td >WordPiece</td>
 <td >ProsusAI/finbert</td>
```

python/openvino_tokenizers/hf_parser.py

+32 −5

```diff
@@ -30,6 +30,7 @@
     TOKENIZER_NAME,
 )
 from .tokenizer_pipeline import (
+    AddToken,
     BasePipelineStep,
     BPETokenizationStep,
     ByteFallbackStep,
@@ -47,6 +48,7 @@
     RegexDecodingStep,
     RegexNormalizationStep,
     RegexSplitStep,
+    Sequence,
     StripStringStep,
     TokenizerPipeline,
     TruncationStep,
```

```diff
@@ -449,10 +451,26 @@ def is_sentencepiece_model(hf_tokenizer: PreTrainedTokenizerBase) -> bool:
         if not hasattr(hf_tokenizer, "vocab_files_names") or "vocab_file" not in hf_tokenizer.vocab_files_names:
             return False
         vocab_file = Path(tmp) / hf_tokenizer.vocab_files_names["vocab_file"]
-        return (
+        vocab_file_exists = (
             getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".model")
             and vocab_file.exists()
         )
+        if vocab_file_exists:
+            try:
+                from google.protobuf.message import DecodeError
+            except (ImportError, ModuleNotFoundError):
+                return False
+
+            model_pb = import_protobuf()
+            model = model_pb.ModelProto()
+            try:
+                with open(vocab_file, "rb") as model_file:
+                    model.ParseFromString(model_file.read())
+                return True
+            except DecodeError:
+                pass  # protobuf file is corrupted
+
+        return False


 def modify_sentencepiece_model(
```
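
The protobuf round-trip above exists because a `.model` file name no longer guarantees a SentencePiece model: GLM-4 ships a tiktoken-style `tokenizer.model` that would pass the extension check alone. A standalone sketch of the same validation idea, using the protobuf definition bundled with the `sentencepiece` package instead of the repository's `import_protobuf` helper:

```python
# Sketch: treat a file as SentencePiece only if it parses as a ModelProto.
# Assumes the sentencepiece package is installed; the path is hypothetical.
from google.protobuf.message import DecodeError
from sentencepiece import sentencepiece_model_pb2


def looks_like_sentencepiece(path: str) -> bool:
    model = sentencepiece_model_pb2.ModelProto()
    try:
        with open(path, "rb") as model_file:
            model.ParseFromString(model_file.read())
    except DecodeError:
        return False  # e.g. a tiktoken ranks file named tokenizer.model
    return True
```
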
```diff
@@ -831,11 +849,13 @@ def get_sp_detokenizer(
 def is_tiktoken_model(hf_tokenizer: PreTrainedTokenizerBase) -> bool:
     try:
         from tiktoken import Encoding
-    except ImportError:
+    except (ImportError, ModuleNotFoundError):
         return False

-    return getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".tiktoken") or isinstance(
-        getattr(hf_tokenizer, "encoder", None), Encoding
+    return (
+        getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "").endswith(".tiktoken")
+        or isinstance(getattr(hf_tokenizer, "encoder", None), Encoding)
+        or isinstance(getattr(hf_tokenizer, "tokenizer", None), Encoding)
     )
```

```diff
@@ -854,13 +874,20 @@ def convert_tiktoken_model_tokenizer(
     if skip_special_tokens:
         skip_tokens = list(parse_special_tokens(hf_tokenizer))

+    add_prefix_steps = []
+    if hasattr(hf_tokenizer, "get_prefix_tokens"):
+        prefix_tokens = [AddToken(_token_id=token_id) for token_id in hf_tokenizer.get_prefix_tokens()]
+        add_prefix_steps.append(CombineSegmentsStep(inputs=prefix_tokens + [Sequence()]))
+
+    reference_vocab = getattr(hf_tokenizer, "get_vocab", lambda: None)()
     pipeline.add_steps(
         [
             NormalizeUnicode("NFC"),
             RegexSplitStep(split_pattern, behaviour="contiguous"),
             BytesToCharsStep(),
-            BPETokenizationStep.from_tiktoken_encoding(encoding),
+            BPETokenizationStep.from_tiktoken_encoding(encoding, reference_vocab=reference_vocab),
             TruncationStep.from_hf_object(hf_tokenizer),
+            *add_prefix_steps,
             PaddingStep(
                 token=getattr(hf_tokenizer, "pad_token"),
                 _token_id=getattr(hf_tokenizer, "pad_token_id"),
```

python/openvino_tokenizers/tiktoken_parser.py

+6 −2

```diff
@@ -37,7 +37,8 @@ def generate_vocab_and_merges(encoding: Encoding) -> Tuple[Dict[str, int], List[
     added_tokens = {}

     for token, rank in mergeable_ranks.items():
-        vocab[token_bytes_to_string(token)] = rank
+        string_token = token_bytes_to_string(token)
+        vocab[string_token] = rank

         if len(token) == 1:
             continue
@@ -50,7 +51,10 @@ def generate_vocab_and_merges(encoding: Encoding) -> Tuple[Dict[str, int], List[
         if len(merged) == 2:
             merges.append(" ".join(map(token_bytes_to_string, merged)))
         else:
-            added_tokens[rank] = token.decode("utf-8")
+            try:
+                added_tokens[rank] = token.decode("utf-8")
+            except UnicodeDecodeError:
+                added_tokens[rank] = string_token

     # Also add special tokens
     vocab.update(encoding._special_tokens)
```
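
The new `try`/`except` in the second hunk matters because tiktoken ranks are raw byte sequences, and a rank can hold bytes that are not valid UTF-8 on their own (for example, one half of a multi-byte character). A quick self-contained demonstration:

```python
# b"\xe6\xb5" is the first two bytes of a three-byte UTF-8 character, so
# decoding it in isolation fails and the byte-to-string fallback is used.
token = b"\xe6\xb5"
try:
    token.decode("utf-8")
except UnicodeDecodeError:
    print("not valid UTF-8 on its own")  # this branch is taken
```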

python/openvino_tokenizers/tokenizer_pipeline.py

+20 −1

```diff
@@ -528,11 +528,30 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "BPETokenizationStep":
     def from_tiktoken_encoding(
         cls,
         encoding: "Encoding",  # noqa
+        reference_vocab: Optional[Dict[Union[str, bytes], int]] = None,
     ) -> "BPETokenizationStep":
-        from .tiktoken_parser import generate_vocab_and_merges
+        from .tiktoken_parser import generate_vocab_and_merges, token_bytes_to_string
+        from .utils import apply_bytes_to_unicode

         vocab, merges, added_tokens = generate_vocab_and_merges(encoding)
         added_tokens.update({idx: token for token, idx in encoding._special_tokens.items()})
+
+        if reference_vocab is not None:
+            existing_indices = set(vocab.values())
+
+            for ref_token, ref_idx in reference_vocab.items():
+                if ref_idx in existing_indices:
+                    continue
+
+                if isinstance(ref_token, bytes):
+                    ref_token = token_bytes_to_string(ref_token)
+
+                # (chat)GLM model adds spaces around <sop> token
+                if ref_token == "<sop>":
+                    ref_token = f" {ref_token} "
+
+                vocab[apply_bytes_to_unicode(ref_token)] = ref_idx
+
         return cls(
             unk_token="",
             fuse_unk=False,
```
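
The `reference_vocab` merge fills index gaps: the tiktoken mergeable ranks may not cover every id reported by the HF tokenizer's `get_vocab()`, and a hole in the vocabulary would break decoding. A toy sketch of the idea with hypothetical tokens and ids:

```python
# Toy illustration of the gap-filling loop (all values are hypothetical).
vocab = {"a": 0, "b": 1}  # derived from tiktoken mergeable ranks
reference_vocab = {"a": 0, "b": 1, "<sop>": 2}  # from hf_tokenizer.get_vocab()

existing_indices = set(vocab.values())
for ref_token, ref_idx in reference_vocab.items():
    if ref_idx not in existing_indices:
        vocab[ref_token] = ref_idx  # fill the hole so id 2 maps to a token

assert vocab["<sop>"] == 2
```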

src/vocab_decoder.cpp

+4 −2

```diff
@@ -46,7 +46,6 @@ bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
     auto new_ends = outputs[3].data<int32_t>();

     std::deque<uint8_t> buffer;
-
     for(size_t batch = 0; batch < batch_size; ++batch) {
         new_ragged_begins[batch] = batch * ((seq_len > 0) ? seq_len : 1);
         new_ragged_ends[batch] = new_ragged_begins[batch] + ((seq_len > 0) ? seq_len : 1);
@@ -60,7 +59,10 @@ bool VocabDecoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
         for(size_t seq = new_ragged_begins[batch]; seq < new_ragged_ends[batch]; ++seq) {
             auto token_id = input_data[seq];
             new_begins[seq] = buffer.size();
-            if (std::find(m_skip_tokens.begin(), m_skip_tokens.end(), token_id) == m_skip_tokens.end()) {
+            if (
+                token_id < vocab_size
+                && std::find(m_skip_tokens.begin(), m_skip_tokens.end(), token_id) == m_skip_tokens.end()
+            ) {
                 buffer.insert(
                     buffer.end(),
                     vocab_chars + vocab_begins[token_id],
```
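
The added `token_id < vocab_size` guard prevents reads past the end of the vocabulary buffers when an id (for example, a special token defined outside the base vocab) exceeds the vocab tensor's size; such ids are now skipped. A Python analogue of the guard's semantics, with hypothetical data:

```python
# Out-of-range ids are dropped instead of causing an out-of-bounds read.
vocab = ["a", "b", "c"]
skip_tokens = {1}
token_ids = [0, 1, 2, 7]  # 7 is out of range, e.g. a special-token id

decoded = [vocab[i] for i in token_ids if i < len(vocab) and i not in skip_tokens]
assert decoded == ["a", "c"]
```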

tests/pass_rates.json

+1 −1

```diff
@@ -1,3 +1,3 @@
 {
-    "tests/tokenizers_test.py::test_": 0.8899194532584818
+    "tests/tokenizers_test.py::test_": 0.8882187374346602
 }
```
