
Commit afaa521
Authored Jan 20, 2025

Revert "Revert "Revert "Use opset15 version of Str Pack/Unpack (#351)" (#374)…" (#383)

This reverts commit 2e59c96.
1 parent 2e59c96 · commit afaa521

13 files changed: +8431 −8415 lines

python/openvino_tokenizers/__init__.py (+2 −17)

@@ -66,6 +66,7 @@ def new_fe_init(self, *args, **kwargs):
 
 
 openvino.runtime.Core.__init__ = new_core_init
+openvino.runtime.utils.node_factory.NodeFactory.__init__ = new_factory_init
 openvino.frontend.frontend.FrontEnd.__init__ = new_fe_init
 
 
@@ -75,22 +76,6 @@ def _get_factory_callable() -> Callable[[], NodeFactory]:
     def inner(opset_version: Optional[str] = None) -> NodeFactory:
         nonlocal factory
         if opset_version not in factory:
-            openvino.runtime.utils.node_factory.NodeFactory.__init__ = new_factory_init
-            factory[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)
-
-        return factory[opset_version]
-
-    return inner
-
-
-def _get_opset_factory_callable() -> Callable[[], NodeFactory]:
-    # factory without extensions
-    factory = {}
-
-    def inner(opset_version: Optional[str] = None) -> NodeFactory:
-        nonlocal factory
-        if opset_version not in factory:
-            openvino.runtime.utils.node_factory.NodeFactory.__init__ = old_factory_init
             factory[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)
 
         return factory[opset_version]
@@ -99,10 +84,10 @@ def inner(opset_version: Optional[str] = None) -> NodeFactory:
 
 
 _get_factory = _get_factory_callable()
-_get_opset_factory = _get_opset_factory_callable()
 
 # some files uses _get_factory function
 from .__version__ import __version__  # noqa
 from .build_tokenizer import build_rwkv_tokenizer  # noqa
 from .convert_tokenizer import convert_tokenizer  # noqa
+from .str_pack import pack_strings, unpack_strings  # noqa
 from .utils import add_greedy_decoding, connect_models  # noqa
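For readers skimming the revert: `_get_factory_callable` closes over a small cache so that each requested opset version materializes exactly one `NodeFactory` (with the tokenizer extension attached via the patched `NodeFactory.__init__` above), and the separate extension-free `_get_opset_factory` is dropped. A minimal sketch of that closure pattern, following the lines kept above (the monkey-patching and extension loading are elided):

```python
from typing import Callable, Dict, Optional

from openvino.runtime.utils.node_factory import NodeFactory


def _get_factory_callable() -> Callable[[], NodeFactory]:
    # One cache shared by every call to the returned function.
    factory: Dict[Optional[str], NodeFactory] = {}

    def inner(opset_version: Optional[str] = None) -> NodeFactory:
        nonlocal factory
        if opset_version not in factory:
            # Built lazily, once per requested opset version.
            factory[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)
        return factory[opset_version]

    return inner


_get_factory = _get_factory_callable()
```

Call sites throughout the package then use `_get_factory().create(...)` for every custom string operation, as the remaining files in this commit show.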

python/openvino_tokenizers/build_tokenizer.py (+4 −4)

@@ -21,12 +21,12 @@ def build_rwkv_tokenizer(
     tokenizer_output_type: Type = Type.i64,
     detokenizer_input_type: Type = Type.i64,
 ) -> Tuple[Model, Model]:
-    from openvino_tokenizers import _get_factory, _get_opset_factory
+    from openvino_tokenizers import _get_factory
 
     input_node = op.Parameter(Type.string, PartialShape(["?"]))
     input_node.set_friendly_name("string_input")
 
-    output = _get_opset_factory("opset15").create("StringTensorUnpack", input_node.outputs()).outputs()
+    output = _get_factory().create("StringTensorUnpack", input_node.outputs()).outputs()
     trie_node = TrieTokenizerStep.from_rwkv_vocab(rwkv_vocab)
     output = trie_node.get_ov_subgraph(TokenizerPipeline.add_ragged_dimension(output))
 
@@ -56,7 +56,7 @@ def build_rwkv_tokenizer(
         _get_factory()
         .create(
             "VocabDecoder",
-            [*detokenizer_input.outputs(), *BasePipelineStep.create_string_constant_node(trie_node.vocab)],
+            [*detokenizer_input.outputs(), *BasePipelineStep.create_string_constant_node(trie_node.vocab).outputs()],
         )
         .outputs()
     )
@@ -65,7 +65,7 @@ def build_rwkv_tokenizer(
     if clean_up_tokenization_spaces:
         RegexDecodingStep.clean_up_tokenization_spaces().get_ov_subgraph(detokenizer_output)
 
-    detokenizer_output = _get_opset_factory("opset15").create("StringTensorPack", detokenizer_output).outputs()
+    detokenizer_output = _get_factory().create("StringTensorPack", detokenizer_output).outputs()
     detokenizer_output[0].tensor.add_names({STRING_OUTPUT_NAME})
 
     detokenizer = Model(detokenizer_output, [detokenizer_input], DETOKENIZER_NAME)
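A hedged usage sketch of the builder touched above; the vocabulary file name is hypothetical, and `rwkv_vocab` is assumed to be the lines of an RWKV vocabulary file (it is passed to `TrieTokenizerStep.from_rwkv_vocab`, whose parameter is `vocab_file_strings: Iterable[str]`):

```python
# Hedged sketch: the file name is hypothetical; running it requires openvino_tokenizers
# to be installed so that its extension library is loaded into the NodeFactory.
from openvino_tokenizers import build_rwkv_tokenizer

with open("rwkv_vocab.txt", encoding="utf-8") as vocab_file:
    vocab_lines = vocab_file.readlines()

# Per the Tuple[Model, Model] annotation above, this returns a tokenizer/detokenizer pair.
ov_tokenizer, ov_detokenizer = build_rwkv_tokenizer(rwkv_vocab=vocab_lines)
```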

python/openvino_tokenizers/hf_parser.py (+3 −3)

@@ -20,7 +20,7 @@
 from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
 from transformers.convert_slow_tokenizer import import_protobuf
 
-from . import _get_factory, _get_opset_factory
+from . import _get_factory
 from .constants import (
     ATTENTION_MASK_INPUT_NAME,
     DETOKENIZER_NAME,
@@ -810,7 +810,7 @@ def convert_sentencepiece_model_tokenizer(
     if params.handle_special_tokens_with_re:
         tokens, ids = zip(*sorted(((token, id) for id, token in add_tokens.items()), reverse=True))
         added_inputs = [
-            *BasePipelineStep.create_string_constant_node(tokens),
+            *BasePipelineStep.create_string_constant_node(tokens).outputs(),
             make_constant_node(np.array(ids, dtype=np.int32), Type.i32).output(0),
         ]
     else:
@@ -1013,7 +1013,7 @@ def get_sp_detokenizer(
     if params.utf8_replace_mode is not None and params.utf8_replace_mode != UTF8ReplaceMode.DISABLE:
         last_sinks = UTF8ValidateStep(params.utf8_replace_mode).get_ov_subgraph(detokenizer)
 
-    string_output = _get_opset_factory("opset15").create("StringTensorPack", last_sinks).outputs()
+    string_output = _get_factory().create("StringTensorPack", last_sinks).outputs()
     string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
     tokenizer_detokenizer = Model(string_output, [model_input], DETOKENIZER_NAME)
     tokenizer_detokenizer.validate_nodes_and_infer_types()
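For context, `get_sp_detokenizer` above is part of the path exercised by the package's public `convert_tokenizer` entry point. A hedged usage sketch (the model id is hypothetical; the `with_detokenizer` flag reflects the package's public API rather than anything shown in this diff):

```python
from transformers import AutoTokenizer

from openvino_tokenizers import convert_tokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("org/model-id")  # hypothetical model id
ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True)
```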
python/openvino_tokenizers/str_pack.py (new file, +62)

@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2018-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from io import BytesIO
+from typing import Iterable, List
+
+import numpy as np
+from numpy.typing import NDArray
+
+
+def to_bytes(number: int) -> bytes:
+    return number.to_bytes(4, "little")
+
+
+def pack_string(string: str) -> NDArray:
+    return np.frombuffer(bytes(string, "utf-8"), dtype=np.uint8)
+
+
+def pack_strings(strings: Iterable[str]) -> NDArray:
+    """
+    Convert any list of string to U8/1D numpy array compatible with converted OV model input
+    """
+    strings = list(strings)
+    batch_size = len(strings)
+    if batch_size == 0:
+        return np.frombuffer(to_bytes(0), np.uint8)
+
+    buffer = BytesIO()
+    buffer.write(to_bytes(batch_size))
+    symbols = BytesIO()
+    offset = 0
+    buffer.write(to_bytes(offset))
+    for string in strings:
+        byte_string = string.encode("utf-8") if isinstance(string, str) else string
+        offset += len(byte_string)
+
+        buffer.write(to_bytes(offset))
+        symbols.write(byte_string)
+
+    buffer.write(symbols.getvalue())
+    return np.frombuffer(buffer.getvalue(), np.uint8)
+
+
+# TODO: handle possible sighed values in batch size and offsets
+def unpack_strings(u8_tensor: NDArray, decoding_errors: str = "replace") -> List[str]:
+    """
+    Convert an array of uint8 elements to a list of strings; reverse to pack_strings
+    """
+
+    def from_bytes(offset: int, size: int) -> int:
+        return int.from_bytes(u8_tensor[offset : offset + size], "little")
+
+    batch_size = from_bytes(0, 4)
+    strings = []
+    for i in range(batch_size):
+        begin = from_bytes(4 + i * 4, 4)
+        end = from_bytes(4 + (i + 1) * 4, 4)
+        length = end - begin
+        begin += 4 * (batch_size + 2)
+        strings.append(bytes(u8_tensor[begin : begin + length]).decode("utf-8", errors=decoding_errors))
+    return strings
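The layout written by `pack_strings` is: a little-endian int32 batch size, then `batch_size + 1` little-endian int32 end offsets (starting at 0), then the concatenated UTF-8 bytes of all strings. A small round-trip check of that format, assuming the names re-exported from `openvino_tokenizers` by the `__init__.py` change above:

```python
import numpy as np

from openvino_tokenizers import pack_strings, unpack_strings

packed = pack_strings(["hi", "ok"])

# Header: batch size 2, then end offsets 0, 2, 4 (three int32 words),
# followed by the UTF-8 bytes of "hiok".
expected = np.array(
    [2, 0, 0, 0,    # batch size = 2
     0, 0, 0, 0,    # offset before the first string
     2, 0, 0, 0,    # end of "hi"
     4, 0, 0, 0,    # end of "ok"
     104, 105, 111, 107],  # b"hiok"
    dtype=np.uint8,
)
assert np.array_equal(packed, expected)
assert unpack_strings(packed) == ["hi", "ok"]
```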

python/openvino_tokenizers/tokenizer_pipeline.py (+29 −40)

@@ -19,7 +19,7 @@
 from openvino.runtime.exceptions import OVTypeError, UserInputError
 from openvino.runtime.utils.types import as_node, make_constant_node
 
-from . import _get_factory, _get_opset_factory
+from . import _get_factory
 from .constants import (
     ATTENTION_MASK_INPUT_NAME,
     DETOKENIZER_NAME,
@@ -31,13 +31,8 @@
     VOCAB_SIZE_CACHE_PROPORTION,
     UTF8ReplaceMode,
 )
-from .utils import (
-    apply_unicode_to_bytes,
-    create_unpacked_string,
-    generate_tokens_with_space_symbols,
-    has_incompatible_re2_op,
-    quote_meta,
-)
+from .str_pack import pack_string, pack_strings
+from .utils import apply_unicode_to_bytes, generate_tokens_with_space_symbols, has_incompatible_re2_op, quote_meta
 
 
 logger = logging.getLogger(__name__)
@@ -71,15 +66,15 @@ def get_ov_subgraph(self, *input_nodes: List[Output]) -> List[Output]:
         raise NotImplementedError
 
     @staticmethod
-    def create_string_constant_node(value: Union[str, Iterable[str]]) -> List[Output]:
+    def create_string_constant_node(value: Union[str, Iterable[str]]) -> op.Constant:
         if isinstance(value, str):
             # string scalar
-            return op.Constant(np.frombuffer(bytes(value, "utf-8"), dtype=np.uint8)).outputs()
-        elif isinstance(value, Iterable):
-            # support only 1D strings for now
-            return create_unpacked_string(value)
+            ps = pack_string(value)
+            return op.Constant(ps)
         else:
-            raise ValueError(f"Unsupported value type {type(value)}")
+            # support only 1D strings for now
+            ps = pack_strings(value)
+            return _get_factory().create("StringTensorUnpack", op.Constant(ps).outputs())
 
     def finalize(self) -> None:
         """Called after the entire pipeline has been built"""
@@ -149,7 +144,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
             return list(input_nodes)
 
         split_pattern = "|".join(token.regex_repr() for token in self.special_tokens)
-        input_nodes.extend(self.create_string_constant_node(split_pattern))
+        input_nodes.extend(self.create_string_constant_node(split_pattern).outputs())
 
         return _get_factory().create("SpecialTokensSplit", input_nodes).outputs()
 
@@ -238,10 +233,10 @@ def del_control_chars_regex(cls) -> "RegexNormalizationStep":
 
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         input_nodes.extend(
-            [
-                *self.create_string_constant_node(self.regex_search_pattern),
-                *self.create_string_constant_node(self.replace_term),
-            ]
+            (
+                self.create_string_constant_node(self.regex_search_pattern),
+                self.create_string_constant_node(self.replace_term),
+            )
         )
         return (
             _get_factory().create("RegexNormalization", input_nodes, {"global_replace": self.global_replace}).outputs()
@@ -362,7 +357,7 @@ def punctuation_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
         )
 
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
-        input_nodes.extend(self.create_string_constant_node(self.split_pattern))
+        input_nodes.extend(self.create_string_constant_node(self.split_pattern).outputs())
         return (
             _get_factory()
             .create(
@@ -428,7 +423,7 @@ def get_vocab_node_outputs(self) -> Optional[List[Output]]:
 
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         pipeline = self.get_pipeline()
-        pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab)
+        pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab).outputs()
 
         ragged_dims, other_dims = [], input_nodes
         if len(input_nodes) > 4:
@@ -480,7 +475,7 @@ def from_rwkv_vocab(cls, vocab_file_strings: Iterable[str]) -> TrieTokenizerStep
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         input_nodes.extend(
             (
-                *self.create_string_constant_node(self.vocab),
+                *self.create_string_constant_node(self.vocab).outputs(),
                 make_constant_node(np.array(self.indices, dtype=np.int32), Type.i32),
             )
         )
@@ -516,7 +511,7 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "WordPieceTokenizationS
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         input_nodes.extend(
             (
-                *self.create_string_constant_node(self.vocab),
+                *self.create_string_constant_node(self.vocab).outputs(),
                 *as_node(self.unk_token_id).outputs(),
             )
         )
@@ -648,10 +643,10 @@ def merges_are_pairs(self) -> bool:
 
     def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         pipeline = self.get_pipeline()
-        pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab)
+        pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab).outputs()
 
         if self.added_tokens:
-            special_tokens_outputs = self.create_string_constant_node(self.added_tokens)
+            special_tokens_outputs = self.create_string_constant_node(self.added_tokens).outputs()
         else:
             special_tokens_outputs = []
 
@@ -664,12 +659,12 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
             left_merges, right_merges = zip(*self.merges)
             input_nodes.extend(
                 (
-                    *self.create_string_constant_node(left_merges),
-                    *self.create_string_constant_node(right_merges),
+                    *self.create_string_constant_node(left_merges).outputs(),
+                    *self.create_string_constant_node(right_merges).outputs(),
                 )
             )
         else:
-            input_nodes.extend(self.create_string_constant_node(self.merges))
+            input_nodes.extend(self.create_string_constant_node(self.merges).outputs())
 
         if special_tokens_outputs:
             input_nodes.extend(
@@ -1040,13 +1035,7 @@ def finalize(self) -> None:
         self.skip_tokens = pipeline.skip_tokens or []
 
     @classmethod
-    def from_hf_json(
-        cls,
-        tokenizer_json: Dict[str, Any],
-        pipeline_vocab: Optional[List[str]],
-        skip_tokens: Optional[List[int]] = None,
-        do_skip_tokens: bool = True,
-    ) -> "VocabDecoderStep":
+    def from_hf_json(cls, tokenizer_json: Dict[str, Any], pipeline_vocab: Optional[List[str]], skip_tokens: Optional[List[int]] = None, do_skip_tokens: bool = True) -> "VocabDecoderStep":
         model_type = tokenizer_json["model"]["type"]
 
         if pipeline_vocab is not None and model_type == "WordLevel":
@@ -1068,7 +1057,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
         if self.vocab is None:
             vocab_outputs = self.get_vocab_node_outputs()
         else:
-            vocab_outputs = self.create_string_constant_node(self.vocab)
+            vocab_outputs = self.create_string_constant_node(self.vocab).outputs()
         input_nodes.extend(vocab_outputs)
 
         # Put constant with skip tokens even if do_skip_tokens=False, so that it can be switched on/off at runtime.
@@ -1189,8 +1178,8 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
 
         input_nodes.extend(
             (
-                *self.create_string_constant_node(self.regex_search_pattern),
-                *self.create_string_constant_node(self.replace_term),
+                *self.create_string_constant_node(self.regex_search_pattern).outputs(),
+                *self.create_string_constant_node(self.replace_term).outputs(),
             )
         )
         return ragged_dims + _get_factory().create("RegexNormalization", input_nodes).outputs()
@@ -1245,7 +1234,7 @@ def get_tokenizer_ov_subgraph(self) -> Model:
 
         processing_outputs = []
         for input_node in string_inputs:
-            input_node = _get_opset_factory("opset15").create("StringTensorUnpack", input_node.outputs()).outputs()
+            input_node = _get_factory().create("StringTensorUnpack", input_node.outputs()).outputs()
 
             ragged = []
             if isinstance(self.steps[0], SpecialTokensSplit):
@@ -1318,7 +1307,7 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]:
             pipeline_step = step.get_ov_subgraph(input_nodes)
             input_nodes = pipeline_step
 
-        return _get_opset_factory("opset15").create("StringTensorPack", input_nodes).outputs()
+        return _get_factory().create("StringTensorPack", input_nodes).outputs()
 
     def get_detokenizer_ov_subgraph(self) -> Model:
         self.finalize()