Skip to content

Commit ad37623

Browse files
Cherry Pick For 24.1 Release (#118)
* Add RWKV example to README (cherry picked from commit fb4142d)
* Update example (cherry picked from commit 6441bef)
* Add cache to VocabEncoder (cherry picked from commit fd32aa5)
* Install torch CPU (cherry picked from commit dcc5fcd)
* Parse QWEN EOD_TOKEN_ID (cherry picked from commit 029a341)
* Fix parse (cherry picked from commit 0298fea)
* Fix pass rate
---------
Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
1 parent e9d708d commit ad37623

File tree

8 files changed

+67
-29
lines changed

8 files changed

+67
-29
lines changed

.github/workflows/linux.yml

+2
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,8 @@ jobs:
329329
wheel_name=$(find . -name 'openvino_tokenizers*.whl')
330330
python3 -m pip install $wheel_name[dev]
331331
popd
332+
env:
333+
PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu
332334

333335
- name: Tokenizers regression tests (using openvino python modules)
334336
if: needs.openvino_download.outputs.status == 'success'

.github/workflows/mac.yml

+2
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,8 @@ jobs:
319319
wheel_name=$(find . -name 'openvino_tokenizers*.whl')
320320
python3 -m pip install $wheel_name[dev]
321321
popd
322+
env:
323+
PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu
322324

323325
- name: Tokenizers regression tests (using openvino python modules)
324326
if: needs.openvino_download.outputs.status == 'success'

.github/workflows/windows.yml

+2
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,8 @@ jobs:
323323
# Find and install wheel
324324
$ovCoreWheelPath=Get-ChildItem -Path "${{ env.INSTALL_DIR }}\\ov_tokenizers" -Filter openvino_tokenizers*.whl | % { $_.FullName }
325325
python3 -m pip install "$ovCoreWheelPath[all]"
326+
env:
327+
PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu
326328

327329
- name: Tokenizers regression tests (using openvino python modules)
328330
if: needs.openvino_download.outputs.status == 'success'

README.md

+29
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# OpenVINO Tokenizers
22

3+
[![Downloads](https://static.pepy.tech/badge/openvino-tokenizers)](https://pepy.tech/project/openvino-tokenizers)
4+
35
OpenVINO Tokenizers adds text processing operations to OpenVINO.
46

57
## Features
@@ -59,6 +61,9 @@ This command is the equivalent of minimal installation. Install tokenizers conve
5961
```bash
6062
pip install transformers[sentencepiece] tiktoken
6163
```
64+
:warning: Latest commit of OpenVINO Tokenizers might rely on features that are not present in the release OpenVINO version.
65+
Use [a nightly build](https://docs.openvino.ai/2024/get-started/install-openvino.html?VERSION=NIGHTLY) of OpenVINO or build
66+
OpenVINO Tokenizers from a release branch if you have issues with the build process.
6267

6368
### Build and install for development
6469
```bash
@@ -279,6 +284,29 @@ tf_result = tf_embed(sentences)
279284
assert np.all(np.isclose(ov_result, tf_result, atol=1e-4))
280285
```
281286

287+
### RWKV Tokenizer
288+
289+
```python
290+
from urllib.request import urlopen
291+
292+
from openvino import compile_model
293+
from openvino_tokenizers import build_rwkv_tokenizer
294+
295+
296+
rwkv_vocab_url = (
297+
"https://raw.githubusercontent.com/BlinkDL/ChatRWKV/main/tokenizer/rwkv_vocab_v20230424.txt"
298+
)
299+
300+
with urlopen(rwkv_vocab_url) as vocab_file:
301+
vocab = map(bytes.decode, vocab_file)
302+
tokenizer, detokenizer = build_rwkv_tokenizer(vocab)
303+
304+
tokenizer, detokenizer = compile_model(tokenizer), compile_model(detokenizer)
305+
306+
print(tokenized := tokenizer(["Test string"])["input_ids"]) # [[24235 47429]]
307+
print(detokenizer(tokenized)["string_output"]) # ['Test string']
308+
```
309+
282310
## Supported Tokenizer Types
283311

284312
| Huggingface <br/>Tokenizer Type | Tokenizer Model Type | Tokenizer | Detokenizer |
@@ -288,6 +316,7 @@ assert np.all(np.isclose(ov_result, tf_result, atol=1e-4))
288316
| | Unigram |||
289317
| Legacy | SentencePiece .model |||
290318
| Custom | tiktoken |||
319+
| RWKV | Trie |||
291320

292321
## Test Results
293322

python/openvino_tokenizers/build_tokenizer.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Tuple
1+
from typing import Iterable, Tuple
22

33
from openvino import Model, PartialShape, Type
44
from openvino.runtime import op
@@ -16,7 +16,7 @@
1616

1717

1818
def build_rwkv_tokenizer(
19-
rwkv_vocab: List[str],
19+
rwkv_vocab: Iterable[str],
2020
clean_up_tokenization_spaces: bool = False,
2121
tokenizer_output_type: Type = Type.i64,
2222
detokenizer_input_type: Type = Type.i64,

python/openvino_tokenizers/tokenizer_pipeline.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from dataclasses import dataclass, field
1010
from functools import singledispatchmethod
1111
from itertools import chain, islice
12-
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
12+
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
1313

1414
import numpy as np
1515
from openvino.runtime import Model, Output, PartialShape, Type, op
@@ -382,7 +382,7 @@ def fill_vocab(vocab: List[str], indices: List[int]) -> Tuple[List[str], List[in
382382
return new_vocab, new_indices
383383

384384
@classmethod
385-
def from_rwkv_vocab(cls, vocab_file_strings: Iterator[str]) -> TrieTokenizerStep:
385+
def from_rwkv_vocab(cls, vocab_file_strings: Iterable[str]) -> TrieTokenizerStep:
386386
vocab = []
387387
indices = []
388388
for line in vocab_file_strings:

src/vocab_encoder.cpp

+15-19
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,6 @@
1313
using namespace ov;
1414

1515

16-
VocabEncoder::VocabEncoder (const ov::OutputVector& arguments) :
17-
ov::op::Op(arguments) {
18-
constructor_validate_and_infer_types();
19-
}
20-
21-
2216
void VocabEncoder::validate_and_infer_types() {
2317
// main string input
2418
check_string_input(this, 0);
@@ -44,19 +38,21 @@ bool VocabEncoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
4438
auto ends = inputs[1].data<const int32_t>();
4539
auto chars = inputs[2].data<const uint8_t>();
4640

47-
// vocab string keys
48-
auto vocab_begins = inputs[3].data<const int32_t>();
49-
auto vocab_ends = inputs[4].data<const int32_t>();
50-
auto vocab_chars = inputs[5].data<const uint8_t>();
41+
if (m_vocab == nullptr) {
42+
// vocab string keys
43+
auto vocab_begins = inputs[3].data<const int32_t>();
44+
auto vocab_ends = inputs[4].data<const int32_t>();
45+
auto vocab_chars = inputs[5].data<const uint8_t>();
5146

52-
auto vocab_values = inputs[6].data<const int32_t>();
53-
auto vocab_size = inputs[6].get_size();
47+
auto vocab_values = inputs[6].data<const int32_t>();
48+
auto vocab_size = inputs[6].get_size();
5449

55-
std::map<std::vector<uint8_t>, int32_t> vocab;
56-
for (size_t i = 0; i < vocab_size; ++i) {
57-
std::vector<uint8_t> token = std::vector<uint8_t>(vocab_chars + vocab_begins[i], vocab_chars + vocab_ends[i]);
58-
vocab[token] = vocab_values[i];
59-
};
50+
m_vocab = std::make_shared<std::map<std::vector<unsigned char>, int32_t>>();
51+
for (size_t i = 0; i < vocab_size; ++i) {
52+
std::vector<uint8_t> token = std::vector<uint8_t>(vocab_chars + vocab_begins[i], vocab_chars + vocab_ends[i]);
53+
m_vocab->insert(std::pair{token, vocab_values[i]});
54+
};
55+
}
6056

6157
auto default_value = *inputs[7].data<const int32_t>();
6258
const size_t num_elements = inputs[0].get_size();
@@ -66,8 +62,8 @@ bool VocabEncoder::evaluate(ov::TensorVector& outputs, const ov::TensorVector& i
6662
auto token_ids = outputs[0].data<int32_t>();
6763

6864
for (size_t element_idx = 0; element_idx < num_elements; ++element_idx) {
69-
auto element = vocab.find(std::vector<uint8_t>(chars + begins[element_idx], chars + ends[element_idx]));
70-
if (element == vocab.end()) {
65+
auto element = m_vocab->find(std::vector<uint8_t>(chars + begins[element_idx], chars + ends[element_idx]));
66+
if (element == m_vocab->end()) {
7167
token_ids[element_idx] = default_value;
7268
} else {
7369
token_ids[element_idx] = element->second;

src/vocab_encoder.hpp

+13-6
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,30 @@
55
#pragma once
66
#include <vector>
77
#include <openvino/op/op.hpp>
8-
#include "openvino/opsets/opset13.hpp"
98

109
using namespace ov;
11-
using namespace ov::opset13;
1210

1311

1412
class VocabEncoder : public ov::op::Op {
1513
public:
1614
OPENVINO_OP("VocabEncoder");
1715

1816
VocabEncoder () = default;
19-
VocabEncoder(
20-
const ov::OutputVector& arguments
21-
);
17+
18+
VocabEncoder(const ov::OutputVector& arguments) :
19+
ov::op::Op(arguments) {
20+
constructor_validate_and_infer_types();
21+
}
22+
23+
VocabEncoder(const ov::OutputVector& arguments, std::shared_ptr<std::map<std::vector<unsigned char>, int32_t>> vocab) :
24+
ov::op::Op(arguments), m_vocab(vocab) {
25+
constructor_validate_and_infer_types();
26+
}
2227

2328
void validate_and_infer_types() override;
2429

2530
std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& inputs) const override {
26-
return std::make_shared<VocabEncoder>(inputs);
31+
return std::make_shared<VocabEncoder>(inputs, m_vocab);
2732
}
2833

2934
bool visit_attributes(ov::AttributeVisitor& visitor) override {
@@ -35,4 +40,6 @@ class VocabEncoder : public ov::op::Op {
3540
bool has_evaluate() const override {
3641
return true;
3742
}
43+
private:
44+
mutable std::shared_ptr<std::map<std::vector<unsigned char>, int32_t>> m_vocab;
3845
};

0 commit comments

Comments (0)