
Commit afaa521
Authored Jan 20, 2025

Revert "Revert "Revert "Use opset15 version of Str Pack/Unpack (#351)" (#374)…" (#383)

This reverts commit 2e59c96.
1 parent 2e59c96 · commit afaa521

13 files changed: +8431 −8415 lines

python/openvino_tokenizers/__init__.py (+2 −17)

@@ -66,6 +66,7 @@ def new_fe_init(self, *args, **kwargs):
 
 
 openvino.runtime.Core.__init__ = new_core_init
+openvino.runtime.utils.node_factory.NodeFactory.__init__ = new_factory_init
 openvino.frontend.frontend.FrontEnd.__init__ = new_fe_init
 
 
@@ -75,22 +76,6 @@ def _get_factory_callable() -> Callable[[], NodeFactory]:
     def inner(opset_version: Optional[str] = None) -> NodeFactory:
         nonlocal factory
         if opset_version not in factory:
-            openvino.runtime.utils.node_factory.NodeFactory.__init__ = new_factory_init
-            factory[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)
-
-        return factory[opset_version]
-
-    return inner
-
-
-def _get_opset_factory_callable() -> Callable[[], NodeFactory]:
-    # factory without extensions
-    factory = {}
-
-    def inner(opset_version: Optional[str] = None) -> NodeFactory:
-        nonlocal factory
-        if opset_version not in factory:
-            openvino.runtime.utils.node_factory.NodeFactory.__init__ = old_factory_init
             factory[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)
 
         return factory[opset_version]
@@ -99,10 +84,10 @@ def inner(opset_version: Optional[str] = None) -> NodeFactory:
 
 
 _get_factory = _get_factory_callable()
-_get_opset_factory = _get_opset_factory_callable()
 
 # some files uses _get_factory function
 from .__version__ import __version__  # noqa
 from .build_tokenizer import build_rwkv_tokenizer  # noqa
 from .convert_tokenizer import convert_tokenizer  # noqa
+from .str_pack import pack_strings, unpack_strings  # noqa
 from .utils import add_greedy_decoding, connect_models  # noqa
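For readers skimming the revert: `_get_factory_callable` closes over a small cache so that each requested opset version materializes exactly one `NodeFactory` (with the tokenizer extension attached via the patched `NodeFactory.__init__` above), and the separate extension-free `_get_opset_factory` is dropped. A minimal sketch of that closure pattern, following the lines kept above (the monkey-patching and extension loading are elided):

```python
from typing import Callable, Dict, Optional

from openvino.runtime.utils.node_factory import NodeFactory


def _get_factory_callable() -> Callable[[], NodeFactory]:
    # One cache shared by every call to the returned function.
    factory: Dict[Optional[str], NodeFactory] = {}

    def inner(opset_version: Optional[str] = None) -> NodeFactory:
        nonlocal factory
        if opset_version not in factory:
            # Built lazily, once per requested opset version.
            factory[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)
        return factory[opset_version]

    return inner


_get_factory = _get_factory_callable()
```

Call sites throughout the package then use `_get_factory().create(...)` for every custom string operation, as the remaining files in this commit show.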

python/openvino_tokenizers/build_tokenizer.py (+4 −4)

@@ -21,12 +21,12 @@ def build_rwkv_tokenizer(
     tokenizer_output_type: Type = Type.i64,
     detokenizer_input_type: Type = Type.i64,
 ) -> Tuple[Model, Model]:
-    from openvino_tokenizers import _get_factory, _get_opset_factory
+    from openvino_tokenizers import _get_factory
 
     input_node = op.Parameter(Type.string, PartialShape(["?"]))
     input_node.set_friendly_name("string_input")
 
-    output = _get_opset_factory("opset15").create("StringTensorUnpack", input_node.outputs()).outputs()
+    output = _get_factory().create("StringTensorUnpack", input_node.outputs()).outputs()
     trie_node = TrieTokenizerStep.from_rwkv_vocab(rwkv_vocab)
     output = trie_node.get_ov_subgraph(TokenizerPipeline.add_ragged_dimension(output))
 
@@ -56,7 +56,7 @@ def build_rwkv_tokenizer(
         _get_factory()
         .create(
             "VocabDecoder",
-            [*detokenizer_input.outputs(), *BasePipelineStep.create_string_constant_node(trie_node.vocab)],
+            [*detokenizer_input.outputs(), *BasePipelineStep.create_string_constant_node(trie_node.vocab).outputs()],
         )
         .outputs()
     )
@@ -65,7 +65,7 @@ def build_rwkv_tokenizer(
     if clean_up_tokenization_spaces:
         RegexDecodingStep.clean_up_tokenization_spaces().get_ov_subgraph(detokenizer_output)
 
-    detokenizer_output = _get_opset_factory("opset15").create("StringTensorPack", detokenizer_output).outputs()
+    detokenizer_output = _get_factory().create("StringTensorPack", detokenizer_output).outputs()
     detokenizer_output[0].tensor.add_names({STRING_OUTPUT_NAME})
 
     detokenizer = Model(detokenizer_output, [detokenizer_input], DETOKENIZER_NAME)
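A hedged usage sketch of the builder touched above; the vocabulary file name is hypothetical, and `rwkv_vocab` is assumed to be the lines of an RWKV vocabulary file (it is passed to `TrieTokenizerStep.from_rwkv_vocab`, whose parameter is `vocab_file_strings: Iterable[str]`):

```python
# Hedged sketch: the file name is hypothetical; running it requires openvino_tokenizers
# to be installed so that its extension library is loaded into the NodeFactory.
from openvino_tokenizers import build_rwkv_tokenizer

with open("rwkv_vocab.txt", encoding="utf-8") as vocab_file:
    vocab_lines = vocab_file.readlines()

# Per the Tuple[Model, Model] annotation above, this returns a tokenizer/detokenizer pair.
ov_tokenizer, ov_detokenizer = build_rwkv_tokenizer(rwkv_vocab=vocab_lines)
```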

python/openvino_tokenizers/hf_parser.py (+3 −3)

@@ -20,7 +20,7 @@
 from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
 from transformers.convert_slow_tokenizer import import_protobuf
 
-from . import _get_factory, _get_opset_factory
+from . import _get_factory
 from .constants import (
     ATTENTION_MASK_INPUT_NAME,
     DETOKENIZER_NAME,
@@ -810,7 +810,7 @@ def convert_sentencepiece_model_tokenizer(
     if params.handle_special_tokens_with_re:
         tokens, ids = zip(*sorted(((token, id) for id, token in add_tokens.items()), reverse=True))
         added_inputs = [
-            *BasePipelineStep.create_string_constant_node(tokens),
+            *BasePipelineStep.create_string_constant_node(tokens).outputs(),
             make_constant_node(np.array(ids, dtype=np.int32), Type.i32).output(0),
         ]
     else:
@@ -1013,7 +1013,7 @@ def get_sp_detokenizer(
     if params.utf8_replace_mode is not None and params.utf8_replace_mode != UTF8ReplaceMode.DISABLE:
         last_sinks = UTF8ValidateStep(params.utf8_replace_mode).get_ov_subgraph(detokenizer)
 
-    string_output = _get_opset_factory("opset15").create("StringTensorPack", last_sinks).outputs()
+    string_output = _get_factory().create("StringTensorPack", last_sinks).outputs()
     string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
     tokenizer_detokenizer = Model(string_output, [model_input], DETOKENIZER_NAME)
     tokenizer_detokenizer.validate_nodes_and_infer_types()
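For context, `get_sp_detokenizer` above is part of the path exercised by the package's public `convert_tokenizer` entry point. A hedged usage sketch (the model id is hypothetical; the `with_detokenizer` flag reflects the package's public API rather than anything shown in this diff):

```python
from transformers import AutoTokenizer

from openvino_tokenizers import convert_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("org/model-id")  # hypothetical model id
ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)
```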
python/openvino_tokenizers/str_pack.py (new file, +62)

@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2018-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from io import BytesIO
+from typing import Iterable, List
+
+import numpy as np
+from numpy.typing import NDArray
+
+
+def to_bytes(number: int) -> bytes:
+    return number.to_bytes(4, "little")
+
+
+def pack_string(string: str) -> NDArray:
+    return np.frombuffer(bytes(string, "utf-8"), dtype=np.uint8)
+
+
+def pack_strings(strings: Iterable[str]) -> NDArray:
+    """
+    Convert any list of string to U8/1D numpy array compatible with converted OV model input
+    """
+    strings = list(strings)
+    batch_size = len(strings)
+    if batch_size == 0:
+        return np.frombuffer(to_bytes(0), np.uint8)
+
+    buffer = BytesIO()
+    buffer.write(to_bytes(batch_size))
+    symbols = BytesIO()
+    offset = 0
+    buffer.write(to_bytes(offset))
+    for string in strings:
+        byte_string = string.encode("utf-8") if isinstance(string, str) else string
+        offset += len(byte_string)
+
+        buffer.write(to_bytes(offset))
+        symbols.write(byte_string)
+
+    buffer.write(symbols.getvalue())
+    return np.frombuffer(buffer.getvalue(), np.uint8)
+
+
+# TODO: handle possible sighed values in batch size and offsets
+def unpack_strings(u8_tensor: NDArray, decoding_errors: str = "replace") -> List[str]:
+    """
+    Convert an array of uint8 elements to a list of strings; reverse to pack_strings
+    """
+
+    def from_bytes(offset: int, size: int) -> int:
+        return int.from_bytes(u8_tensor[offset : offset + size], "little")
+
+    batch_size = from_bytes(0, 4)
+    strings = []
+    for i in range(batch_size):
+        begin = from_bytes(4 + i * 4, 4)
+        end = from_bytes(4 + (i + 1) * 4, 4)
+        length = end - begin
+        begin += 4 * (batch_size + 2)
+        strings.append(bytes(u8_tensor[begin : begin + length]).decode("utf-8", errors=decoding_errors))
+    return strings
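The layout written by `pack_strings` is: a little-endian int32 batch size, then `batch_size + 1` little-endian int32 end offsets (starting at 0), then the concatenated UTF-8 bytes of all strings. A small round-trip check of that format, assuming the names re-exported from `openvino_tokenizers` by the `__init__.py` change above:

```python
import numpy as np

from openvino_tokenizers import pack_strings, unpack_strings

packed = pack_strings(["hi", "ok"])

# Header: batch size 2, then end offsets 0, 2, 4 (three int32 words),
# followed by the UTF-8 bytes of "hiok".
expected = np.array(
    [2, 0, 0, 0,    # batch size = 2
     0, 0, 0, 0,    # offset before the first string
     2, 0, 0, 0,    # end of "hi"
     4, 0, 0, 0,    # end of "ok"
     104, 105, 111, 107],  # b"hiok"
    dtype=np.uint8,
)
assert np.array_equal(packed, expected)
assert unpack_strings(packed) == ["hi", "ok"]
```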

python/openvino_tokenizers/tokenizer_pipeline.py (+29 −40)

@@ -19,7 +19,7 @@
 from openvino.runtime.exceptions import OVTypeError, UserInputError
 from openvino.runtime.utils.types import as_node, make_constant_node
 
-from . import _get_factory, _get_opset_factory
+from . import _get_factory
 from .constants import (
     ATTENTION_MASK_INPUT_NAME,
     DETOKENIZER_NAME,
@@ -31,13 +31,8 @@
     VOCAB_SIZE_CACHE_PROPORTION,
     UTF8ReplaceMode,
 )
-from .utils import (
-    apply_unicode_to_bytes,
-    create_unpacked_string,
-    generate_tokens_with_space_symbols,
-    has_incompatible_re2_op,
-    quote_meta,
-)
+from .str_pack import pack_string, pack_strings
+from .utils import apply_unicode_to_bytes, generate_tokens_with_space_symbols, has_incompatible_re2_op, quote_meta
 
 
 logger = logging.getLogger(__name__)
@@ -71,15 +66,15 @@ def get_ov_subgraph(self, *input_nodes: List[Output]) -> List[Output]:
         raise NotImplementedError
 
     @staticmethod
-    def create_string_constant_node(value: Union[str, Iterable[str]]) -> List[Output]:
+    def create_string_constant_node(value: Union[str, Iterable[str]]) -> op.Constant:
         if isinstance(value, str):
             # string scalar
-            return op.Constant(np.frombuffer(bytes(value, "utf-8"), dtype=np.uint8)).outputs()
-        elif isinstance(value, Iterable):
-            # support only 1D strings for now
-            return create_unpacked_string(value)
+            ps = pack_string(value)
+            return op.Constant(ps)
         else:
-            raise ValueError(f"Unsupported value type {type(value)}")
+            # support only 1D strings for now
+            ps = pack_strings(value)
+            return _get_factory().create("StringTensorUnpack", op.Constant(ps).outputs())
 
     def finalize(self) -> None:
         """Called after the entire pipeline has been built"""
@@ -149,7 +144,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
             return list(input_nodes)
 
         split_pattern = "|".join(token.regex_repr() for token in self.special_tokens)
-        input_nodes.extend(self.create_string_constant_node(split_pattern))
+        input_nodes.extend(self.create_string_constant_node(split_pattern).outputs())
 
         return _get_factory().create("SpecialTokensSplit", input_nodes).outputs()
 
@@ -238,10 +233,10 @@ def del_control_chars_regex(cls) -> "RegexNormalizationStep":
 
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         input_nodes.extend(
-            [
-                *self.create_string_constant_node(self.regex_search_pattern),
-                *self.create_string_constant_node(self.replace_term),
-            ]
+            (
+                self.create_string_constant_node(self.regex_search_pattern),
+                self.create_string_constant_node(self.replace_term),
+            )
         )
         return (
             _get_factory().create("RegexNormalization", input_nodes, {"global_replace": self.global_replace}).outputs()
@@ -362,7 +357,7 @@ def punctuation_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
         )
 
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
-        input_nodes.extend(self.create_string_constant_node(self.split_pattern))
+        input_nodes.extend(self.create_string_constant_node(self.split_pattern).outputs())
         return (
             _get_factory()
             .create(
@@ -428,7 +423,7 @@ def get_vocab_node_outputs(self) -> Optional[List[Output]]:
 
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         pipeline = self.get_pipeline()
-        pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab)
+        pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab).outputs()
 
         ragged_dims, other_dims = [], input_nodes
         if len(input_nodes) > 4:
@@ -480,7 +475,7 @@ def from_rwkv_vocab(cls, vocab_file_strings: Iterable[str]) -> TrieTokenizerStep
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         input_nodes.extend(
             (
-                *self.create_string_constant_node(self.vocab),
+                *self.create_string_constant_node(self.vocab).outputs(),
                 make_constant_node(np.array(self.indices, dtype=np.int32), Type.i32),
             )
         )
@@ -516,7 +511,7 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "WordPieceTokenizationS
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         input_nodes.extend(
             (
-                *self.create_string_constant_node(self.vocab),
+                *self.create_string_constant_node(self.vocab).outputs(),
                 *as_node(self.unk_token_id).outputs(),
             )
         )
@@ -648,10 +643,10 @@ def merges_are_pairs(self) -> bool:
 
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         pipeline = self.get_pipeline()
-        pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab)
+        pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab).outputs()
 
         if self.added_tokens:
-            special_tokens_outputs = self.create_string_constant_node(self.added_tokens)
+            special_tokens_outputs = self.create_string_constant_node(self.added_tokens).outputs()
         else:
             special_tokens_outputs = []
 
@@ -664,12 +659,12 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
             left_merges, right_merges = zip(*self.merges)
             input_nodes.extend(
                 (
-                    *self.create_string_constant_node(left_merges),
-                    *self.create_string_constant_node(right_merges),
+                    *self.create_string_constant_node(left_merges).outputs(),
+                    *self.create_string_constant_node(right_merges).outputs(),
                 )
             )
         else:
-            input_nodes.extend(self.create_string_constant_node(self.merges))
+            input_nodes.extend(self.create_string_constant_node(self.merges).outputs())
 
         if special_tokens_outputs:
             input_nodes.extend(
@@ -1040,13 +1035,7 @@ def finalize(self) -> None:
         self.skip_tokens = pipeline.skip_tokens or []
 
     @classmethod
-    def from_hf_json(
-        cls,
-        tokenizer_json: Dict[str, Any],
-        pipeline_vocab: Optional[List[str]],
-        skip_tokens: Optional[List[int]] = None,
-        do_skip_tokens: bool = True,
-    ) -> "VocabDecoderStep":
+    def from_hf_json(cls, tokenizer_json: Dict[str, Any], pipeline_vocab: Optional[List[str]], skip_tokens: Optional[List[int]] = None, do_skip_tokens: bool = True) -> "VocabDecoderStep":
         model_type = tokenizer_json["model"]["type"]
 
         if pipeline_vocab is not None and model_type == "WordLevel":
@@ -1068,7 +1057,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         if self.vocab is None:
             vocab_outputs = self.get_vocab_node_outputs()
         else:
-            vocab_outputs = self.create_string_constant_node(self.vocab)
+            vocab_outputs = self.create_string_constant_node(self.vocab).outputs()
         input_nodes.extend(vocab_outputs)
 
         # Put constant with skip tokens even if do_skip_tokens=False, so that it can be switched on/off at runtime.
@@ -1189,8 +1178,8 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
 
         input_nodes.extend(
             (
-                *self.create_string_constant_node(self.regex_search_pattern),
-                *self.create_string_constant_node(self.replace_term),
+                *self.create_string_constant_node(self.regex_search_pattern).outputs(),
+                *self.create_string_constant_node(self.replace_term).outputs(),
             )
         )
         return ragged_dims + _get_factory().create("RegexNormalization", input_nodes).outputs()
@@ -1245,7 +1234,7 @@ def get_tokenizer_ov_subgraph(self) -> Model:
 
         processing_outputs = []
         for input_node in string_inputs:
-            input_node = _get_opset_factory("opset15").create("StringTensorUnpack", input_node.outputs()).outputs()
+            input_node = _get_factory().create("StringTensorUnpack", input_node.outputs()).outputs()
 
             ragged = []
             if isinstance(self.steps[0], SpecialTokensSplit):
@@ -1318,7 +1307,7 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]:
             pipeline_step = step.get_ov_subgraph(input_nodes)
             input_nodes = pipeline_step
 
-        return _get_opset_factory("opset15").create("StringTensorPack", input_nodes).outputs()
+        return _get_factory().create("StringTensorPack", input_nodes).outputs()
 
     def get_detokenizer_ov_subgraph(self) -> Model:
         self.finalize()