Skip to content

Commit ad37623

Browse files
Cherry Pick For 24.1 Release (#118)
* Add RWKV example to README (cherry picked from commit fb4142d)
* Update example (cherry picked from commit 6441bef)
* Add cache to VocabEncoder (cherry picked from commit fd32aa5)
* Install torch CPU (cherry picked from commit dcc5fcd)
* Parse QWEN EOD_TOKEN_ID (cherry picked from commit 029a341)
* Fix parse (cherry picked from commit 0298fea)
* Fix pass rate
---------
Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
1 parent e9d708d commit ad37623

File tree

8 files changed

+67
-29
lines changed

8 files changed

+67
-29
lines changed

.github/workflows/linux.yml

+2
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,8 @@ jobs:
329329
wheel_name=$(find . -name 'openvino_tokenizers*.whl')
330330
python3 -m pip install $wheel_name[dev]
331331
popd
332+
env:
333+
PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu
332334

333335
- name: Tokenizers regression tests (using openvino python modules)
334336
if: needs.openvino_download.outputs.status == 'success'

.github/workflows/mac.yml

+2
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,8 @@ jobs:
319319
wheel_name=$(find . -name 'openvino_tokenizers*.whl')
320320
python3 -m pip install $wheel_name[dev]
321321
popd
322+
env:
323+
PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu
322324

323325
- name: Tokenizers regression tests (using openvino python modules)
324326
if: needs.openvino_download.outputs.status == 'success'

.github/workflows/windows.yml

+2
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,8 @@ jobs:
323323
# Find and install wheel
324324
$ovCoreWheelPath=Get-ChildItem -Path "${{ env.INSTALL_DIR }}\\ov_tokenizers" -Filter openvino_tokenizers*.whl | % { $_.FullName }
325325
python3 -m pip install "$ovCoreWheelPath[all]"
326+
env:
327+
PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu
326328

327329
- name: Tokenizers regression tests (using openvino python modules)
328330
if: needs.openvino_download.outputs.status == 'success'

README.md

+29
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# OpenVINO Tokenizers
22

3+
[![Downloads](https://static.pepy.tech/badge/openvino-tokenizers)](https://pepy.tech/project/openvino-tokenizers)
4+
35
OpenVINO Tokenizers adds text processing operations to OpenVINO.
46

57
## Features
@@ -59,6 +61,9 @@ This command is the equivalent of minimal installation. Install tokenizers conve
5961
```bash
6062
pip install transformers[sentencepiece] tiktoken
6163
```
64+
:warning: Latest commit of OpenVINO Tokenizers might rely on features that are not present in the release OpenVINO version.
65+
Use [a nightly build](https://docs.openvino.ai/2024/get-started/install-openvino.html?VERSION=NIGHTLY) of OpenVINO or build
66+
OpenVINO Tokenizers from a release branch if you have issues with the build process.
6267

6368
### Build and install for development
6469
```bash
@@ -279,6 +284,29 @@ tf_result = tf_embed(sentences)
279284
assert np.all(np.isclose(ov_result, tf_result, atol=1e-4))
280285
```
281286

287+
### RWKV Tokenizer
288+
289+
```python
290+
from urllib.request import urlopen
291+
292+
from openvino import compile_model
293+
from openvino_tokenizers import build_rwkv_tokenizer
294+
295+
296+
rwkv_vocab_url = (
297+
"https://raw.githubusercontent.com/BlinkDL/ChatRWKV/main/tokenizer/rwkv_vocab_v20230424.txt"
298+
)
299+
300+
with urlopen(rwkv_vocab_url) as vocab_file:
301+
vocab = map(bytes.decode, vocab_file)
302+
tokenizer, detokenizer = build_rwkv_tokenizer(vocab)
303+
304+
tokenizer, detokenizer = compile_model(tokenizer), compile_model(detokenizer)
305+
306+
print(tokenized := tokenizer(["Test string"])["input_ids"]) # [[24235 47429]]
307+
print(detokenizer(tokenized)["string_output"]) # ['Test string']
308+
```
309+
282310
## Supported Tokenizer Types
283311

284312
| Huggingface <br/>Tokenizer Type | Tokenizer Model Type | Tokenizer | Detokenizer |
@@ -288,6 +316,7 @@ assert np.all(np.isclose(ov_result, tf_result, atol=1e-4))
288316
| | Unigram |||
289317
| Legacy | SentencePiece .model |||
290318
| Custom | tiktoken |||
319+
| RWKV | Trie |||
291320

292321
## Test Results
293322

python/openvino_tokenizers/build_tokenizer.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Tuple
1+
from typing import Iterable, Tuple
22

33
from openvino import Model, PartialShape, Type
44
from openvino.runtime import op
@@ -16,7 +16,7 @@
1616

1717

1818
def build_rwkv_tokenizer(
19-
rwkv_vocab: List[str],
19+
rwkv_vocab: Iterable[str],
2020
clean_up_tokenization_spaces: bool = False,
2121
tokenizer_output_type: Type = Type.i64,
2222
detokenizer_input_type: Type = Type.i64,

python/openvino_tokenizers/tokenizer_pipeline.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from dataclasses import dataclass, field
1010
from functools import singledispatchmethod
1111
from itertools import chain, islice
12-
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
12+
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
1313

1414
import numpy as np
1515
from openvino.runtime import Model, Output, PartialShape, Type, op
@@ -382,7 +382,7 @@ def fill_vocab(vocab: List[str], indices: List[int]) -> Tuple[List[str], List[in
382382
return new_vocab, new_indices
383383

384384
@classmethod
385-
def from_rwkv_vocab(cls, vocab_file_strings: Iterator[str]) -> TrieTokenizerStep:
385+
def from_rwkv_vocab(cls, vocab_file_strings: Iterable[str]) -> TrieTokenizerStep:
386386
vocab = []
387387
indices = []
388388
for line in vocab_file_strings:

src/vocab_encoder.cpp

+15-19
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,6 @@
1313
using namespace ov;
1414

1515

16-
VocabEncoder::VocabEncoder (const ov::OutputVector& arguments) :
17-
ov::op::Op(arguments) {
18-
constructor_validate_and_infer_types();
19-
}
20-
21-
2216
void VocabEncoder::validate_and_infer_types() {
2317
// main string input
2418
check_string_input(this, 0);
@@ -44,19 +38,21 @@ bool VocabEncoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
4438
auto ends = inputs[1].data<const int32_t>();
4539
auto chars = inputs[2].data<const uint8_t>();
4640

47-
// vocab string keys
48-
auto vocab_begins = inputs[3].data<const int32_t>();
49-
auto vocab_ends = inputs[4].data<const int32_t>();
50-
auto vocab_chars = inputs[5].data<const uint8_t>();
41+
if (m_vocab == nullptr) {
42+
// vocab string keys
43+
auto vocab_begins = inputs[3].data<const int32_t>();
44+
auto vocab_ends = inputs[4].data<const int32_t>();
45+
auto vocab_chars = inputs[5].data<const uint8_t>();
5146

52-
auto vocab_values = inputs[6].data<const int32_t>();
53-
auto vocab_size = inputs[6].get_size();
47+
auto vocab_values = inputs[6].data<const int32_t>();
48+
auto vocab_size = inputs[6].get_size();
5449

55-
std::map<std::vector<uint8_t>, int32_t> vocab;
56-
for (size_t i = 0; i < vocab_size; ++i) {
57-
std::vector<uint8_t> token = std::vector<uint8_t>(vocab_chars + vocab_begins[i], vocab_chars + vocab_ends[i]);
58-
vocab[token] = vocab_values[i];
59-
};
50+
m_vocab = std::make_shared<std::map<std::vector<unsigned char>, int32_t>>();
51+
for (size_t i = 0; i < vocab_size; ++i) {
52+
std::vector<uint8_t> token = std::vector<uint8_t>(vocab_chars + vocab_begins[i], vocab_chars + vocab_ends[i]);
53+
m_vocab->insert(std::pair{token, vocab_values[i]});
54+
};
55+
}
6056

6157
auto default_value = *inputs[7].data<const int32_t>();
6258
const size_t num_elements = inputs[0].get_size();
@@ -66,8 +62,8 @@ bool VocabEncoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
6662
auto token_ids = outputs[0].data<int32_t>();
6763

6864
for (size_t element_idx = 0; element_idx < num_elements; ++element_idx) {
69-
auto element = vocab.find(std::vector<uint8_t>(chars + begins[element_idx], chars + ends[element_idx]));
70-
if (element == vocab.end()) {
65+
auto element = m_vocab->find(std::vector<uint8_t>(chars + begins[element_idx], chars + ends[element_idx]));
66+
if (element == m_vocab->end()) {
7167
token_ids[element_idx] = default_value;
7268
} else {
7369
token_ids[element_idx] = element->second;

src/vocab_encoder.hpp

+13-6
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,30 @@
55
#pragma once
66
#include <vector>
77
#include <openvino/op/op.hpp>
8-
#include "openvino/opsets/opset13.hpp"
98

109
using namespace ov;
11-
using namespace ov::opset13;
1210

1311

1412
class VocabEncoder : public ov::op::Op {
1513
public:
1614
OPENVINO_OP("VocabEncoder");
1715

1816
VocabEncoder () = default;
19-
VocabEncoder(
20-
const ov::OutputVector& arguments
21-
);
17+
18+
VocabEncoder(const ov::OutputVector& arguments) :
19+
ov::op::Op(arguments) {
20+
constructor_validate_and_infer_types();
21+
}
22+
23+
VocabEncoder(const ov::OutputVector& arguments, std::shared_ptr<std::map<std::vector<unsigned char>, int32_t>> vocab) :
24+
ov::op::Op(arguments), m_vocab(vocab) {
25+
constructor_validate_and_infer_types();
26+
}
2227

2328
void validate_and_infer_types() override;
2429

2530
std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
26-
return std::make_shared<VocabEncoder>(inputs);
31+
return std::make_shared<VocabEncoder>(inputs, m_vocab);
2732
}
2833

2934
bool visit_attributes(ov::AttributeVisitor& visitor) override {
@@ -35,4 +40,6 @@ class VocabEncoder : public ov::op::Op {
3540
bool has_evaluate() const override {
3641
return true;
3742
}
43+
private:
44+
mutable std::shared_ptr<std::map<std::vector<unsigned char>, int32_t>> m_vocab;
3845
};

0 commit comments

Comments (0)