// SPDX-License-Identifier: Apache-2.0

#include <openvino/openvino.hpp>
+#include <utils.hpp>

namespace {

std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest&& tokenizer, std::string_view prompt) {
    constexpr size_t BATCH_SIZE = 1;
-    constexpr size_t INDEXES_SIZE = (2 + BATCH_SIZE) * sizeof(int32_t);
    ov::Tensor destination = tokenizer.get_input_tensor();
-    destination.set_shape({INDEXES_SIZE + prompt.length()});
-    // N - batch size, E - end offset (and start for the next string). Tensor layout in bytes:
-    // Nnnn0000EeeeEeeeChars1Chars2
-    int32_t* int_ptr = reinterpret_cast<int32_t*>(destination.data<uint8_t>());
-    int_ptr[0] = BATCH_SIZE;
-    int_ptr[1] = 0;
-    int_ptr[2] = int32_t(prompt.length());
-    std::copy(prompt.cbegin(), prompt.cend(), reinterpret_cast<char*>(int_ptr + 3));
+    pack_strings(std::array<std::string_view, BATCH_SIZE>{prompt}, destination);
    tokenizer.infer();
    return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")};
}

void print_token(ov::InferRequest& detokenizer, int32_t out_token) {
    constexpr size_t BATCH_SIZE = 1;
-    constexpr size_t INDEXES_SIZE = (2 + BATCH_SIZE) * sizeof(int32_t);
    ov::Tensor inp = detokenizer.get_input_tensor();
    inp.set_shape({BATCH_SIZE, 1});
    inp.data<int32_t>()[0] = out_token;
    detokenizer.infer();
-    const ov::Tensor& detokenized = detokenizer.get_output_tensor();
-    size_t tensor_size = detokenized.get_size();
-    if (tensor_size <= INDEXES_SIZE) {
-        throw std::runtime_error("The detokenized tensor must contain batch size, first string offset and end indices");
-    }
-    const char* char_ptr = reinterpret_cast<const char*>(detokenized.data<const uint8_t>());
-    if (reinterpret_cast<const int32_t*>(char_ptr)[0] != BATCH_SIZE) {
-        throw std::runtime_error("Expected batch 1 in the detokenized tensor");
-    }
-    std::cout.write(char_ptr + INDEXES_SIZE, std::streamsize(tensor_size - INDEXES_SIZE)).flush();
+    std::cout << unpack_strings(detokenizer.get_output_tensor()).front() << std::flush;
}
}
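
The pack_strings and unpack_strings helpers now come from utils.hpp, which is not part of this diff. Going by the byte layout the removed code documented (Nnnn0000EeeeEeeeChars1Chars2: an i32 batch size, a zero start offset, one i32 end offset per string, then the raw characters), they could look roughly like the sketch below; the real utils.hpp implementations may differ.

#include <algorithm>
#include <cstdint>
#include <string>
#include <string_view>
#include <vector>
#include <openvino/openvino.hpp>

// Sketch only: inferred from the layout comment removed above, not copied from utils.hpp.
template <typename Strings>  // e.g. std::array<std::string_view, BATCH_SIZE>
void pack_strings(const Strings& strings, ov::Tensor& destination) {
    size_t total_chars = 0;
    for (std::string_view s : strings) {
        total_chars += s.size();
    }
    destination.set_shape({(2 + strings.size()) * sizeof(int32_t) + total_chars});
    int32_t* int_ptr = reinterpret_cast<int32_t*>(destination.data<uint8_t>());
    int_ptr[0] = int32_t(strings.size());
    int_ptr[1] = 0;  // start offset of the first string
    char* char_ptr = reinterpret_cast<char*>(int_ptr + 2 + strings.size());
    size_t idx = 2;
    int32_t end = 0;
    for (std::string_view s : strings) {
        end += int32_t(s.size());
        int_ptr[idx++] = end;  // end offset, which is also the next string's start
        char_ptr = std::copy(s.begin(), s.end(), char_ptr);
    }
}

std::vector<std::string> unpack_strings(const ov::Tensor& source) {
    const int32_t* int_ptr = reinterpret_cast<const int32_t*>(source.data<const uint8_t>());
    int32_t batch = int_ptr[0];
    const char* chars = reinterpret_cast<const char*>(int_ptr + 2 + batch);
    std::vector<std::string> result;
    for (int32_t i = 0; i < batch; ++i) {
        // int_ptr[1 + i] is string i's start offset, int_ptr[2 + i] its end offset.
        result.emplace_back(chars + int_ptr[1 + i], chars + int_ptr[2 + i]);
    }
    return result;
}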
@@ -50,37 +33,28 @@ int main(int argc, char* argv[]) try {
    ov::InferRequest detokenizer = core.compile_model(argv[3], "CPU").create_infer_request();
    std::shared_ptr<ov::Model> model = core.read_model(argv[1]);
    constexpr size_t BATCH_SIZE = 1;
-    std::map<std::string, ov::PartialShape> shapes = {
-        {"input_ids", ov::PartialShape{
+    std::map<size_t, ov::PartialShape> shapes = {
+        {0, ov::PartialShape{
            BATCH_SIZE, {1, std::numeric_limits<ov::Dimension::value_type>::max()}
        }},
-        {"attention_mask", ov::PartialShape{
+        {1, ov::PartialShape{
            BATCH_SIZE, {1, std::numeric_limits<ov::Dimension::value_type>::max()}
        }}
    };
-    for (const ov::Output<ov::Node>& input : model->inputs()) {
-        for (const std::string& name : input.get_names()) {
-            if (name.rfind("past_key_values", 0) == 0) {
-                ov::PartialShape shape = input.get_partial_shape();
-                shape[0] = BATCH_SIZE;
-                shapes.emplace(name, shape);
-                break;
-            }
-        }
+    std::vector<ov::Output<ov::Node>> inputs = model->inputs();
+    for (size_t idx = 2; idx < inputs.size(); ++idx) {  // inputs 0 and 1 are input_ids and attention_mask; the rest are past_key_values
+        ov::PartialShape shape = inputs.at(idx).get_partial_shape();
+        shape.at(0) = BATCH_SIZE;
+        shapes.emplace(idx, shape);
    }
    model->reshape(shapes);
    ov::preprocess::PrePostProcessor p3(model);
    p3.input("input_ids").tensor().set_element_type(ov::element::i32);  // cast to the type of the tokenizer's output
    p3.input("attention_mask").tensor().set_element_type(ov::element::i32);
    model = p3.build();
    ov::InferRequest ireq = core.compile_model(model, "CPU", {ov::cache_dir("llm-cache")}).create_infer_request();
-    for (const ov::Output<ov::Node>& input : model->inputs()) {
-        for (const std::string& name : input.get_names()) {
-            if (name.rfind("past_key_values", 0) == 0) {
-                ireq.get_tensor(input).set_shape(input.get_partial_shape().get_min_shape());
-                break;
-            }
-        }
+    for (size_t idx = 2; idx < inputs.size(); ++idx) {
+        ireq.get_input_tensor(idx).set_shape(inputs.at(idx).get_partial_shape().get_min_shape());
    }
    ireq.get_tensor("input_ids").set_shape(input_ids.get_shape());  // TODO: replace with ireq.set_tensor("input_ids", input_ids); after it's fixed
    ireq.get_tensor("attention_mask").set_shape(input_ids.get_shape());
@@ -96,13 +70,8 @@ int main(int argc, char* argv[]) try {
    ireq.get_tensor("attention_mask").data<int32_t>()[0] = 1;
    constexpr int32_t SPECIAL_EOS_TOKEN = 2;  // There's no way to extract the value from the tokenizer for now
    while (out_token != SPECIAL_EOS_TOKEN) {
-        for (const ov::Output<ov::Node>& input : model->inputs()) {
-            for (const std::string& name : input.get_names()) {
-                if (name.rfind("past_key_values", 0) == 0) {
-                    ireq.set_tensor(input, ireq.get_tensor("present" + name.substr(15)));
-                    break;
-                }
-            }
+        for (size_t idx = 2; idx < inputs.size(); ++idx) {
+            ireq.set_input_tensor(idx, ireq.get_output_tensor(idx - 1));
        }
        ireq.get_tensor("input_ids").data<int32_t>()[0] = out_token;
        ireq.start_async();
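
The idx - 1 offset mirrors the mapping the removed name-based code performed: output 0 carries the logits from which out_token is taken, while outputs 1 and up carry the present key/value tensors in the same order as the past_key_values inputs 2 and up, so each present tensor from the previous step is fed back as the matching past tensor.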