@@ -5,59 +5,6 @@
 #include "openvino/genai/llm_pipeline.hpp"
 #include "utils.hpp"
 
-namespace {
-
-void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask);
-void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0);
-ov::Tensor extend_attention(ov::Tensor attention_mask);
-
-void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) {
-    const size_t batch_size = attention_mask.get_shape()[0];
-    const size_t atten_length = attention_mask.get_shape()[1];
-    position_ids.set_shape({batch_size, 1});
-
-    for (size_t batch = 0; batch < batch_size; batch++) {
-        int64_t* start = attention_mask.data<int64_t>() + batch * atten_length;
-        position_ids.data<int64_t>()[batch] = std::accumulate(start, start + atten_length, 0);
-    }
-}
-
-void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos) {
-    const size_t batch_size = attention_mask.get_shape()[0];
-    const size_t seq_length = attention_mask.get_shape()[1];
-
-    const int64_t* attention_mask_data = attention_mask.data<int64_t>();
-    int64_t* position_ids_data = position_ids.data<int64_t>();
-
-    for (size_t batch = 0; batch < batch_size; batch++) {
-        size_t sum = start_pos;
-        for (size_t i = 0; i < seq_length; i++) {
-            const size_t element_offset = batch * seq_length + i;
-            position_ids_data[element_offset] = sum;
-            if (attention_mask_data[element_offset] == 1) {
-                sum += 1;
-            }
-        }
-    }
-}
-
-ov::Tensor extend_attention(ov::Tensor attention_mask) {
-    auto shape = attention_mask.get_shape();
-    auto batch_size = shape[0];
-    auto seq_len = shape[1];
-
-    ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}};
-    auto old_data = attention_mask.data<int64_t>();
-    auto new_data = new_atten_mask.data<int64_t>();
-    for (size_t batch = 0; batch < batch_size; ++batch) {
-        std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t));
-        new_data[batch * (seq_len + 1) + seq_len] = 1;
-    }
-    return new_atten_mask;
-}
-
-}
-
 namespace ov {
 
 ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner,
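The helpers deleted above are not dropped: the call sites in the hunks below reach them through generate_utils, and "utils.hpp" stays included. For reference, utils.hpp is presumably declaring something along these lines; the exact header contents are an assumption here, with the signatures taken from the deleted declarations above.

// Hypothetical sketch of the declarations utils.hpp is assumed to provide.
#pragma once

#include <cstdint>
#include <openvino/runtime/tensor.hpp>

namespace generate_utils {

// Writes, per batch row, the position of the next token: the count of
// attended (non-padding) tokens in that row of attention_mask.
void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask);

// Fills position_ids with a running count of 1-entries in attention_mask,
// starting at start_pos (e.g. the current KV-cache length).
void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0);

// Returns a copy of attention_mask with one extra column of 1s appended.
ov::Tensor extend_attention(ov::Tensor attention_mask);

}  // namespace generate_utils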
@@ -73,7 +20,7 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner,
 
     // todo: make this work even if position_ids are not specified
     auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
-    initialize_position_ids(position_ids, attention_mask, kv_cache_len);
+    generate_utils::initialize_position_ids(position_ids, attention_mask, kv_cache_len);
 
     ov::EncodedResults results;
     results.scores.resize(batch_size);
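A minimal standalone sketch of what this initialize step computes, assuming generate_utils::initialize_position_ids keeps the cumulative-sum behaviour of the deleted helper; the values below are illustrative only.

// Plain C++ illustration of the cumulative-sum logic:
// position_ids[i] = start_pos + number of 1s in the mask before index i.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const std::vector<int64_t> attention_mask = {1, 1, 0, 1};  // one batch row
    const int64_t start_pos = 5;  // e.g. tokens already in the KV cache
    std::vector<int64_t> position_ids(attention_mask.size());

    int64_t sum = start_pos;
    for (size_t i = 0; i < attention_mask.size(); ++i) {
        position_ids[i] = sum;
        if (attention_mask[i] == 1)
            sum += 1;  // padding positions (mask == 0) do not advance the count
    }
    for (int64_t p : position_ids)
        std::cout << p << ' ';  // prints: 5 6 7 7
}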
@@ -139,8 +86,8 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner,
         return results;
 
     for (size_t i = 0; i < max_tokens - 1; ++i) {
-        update_position_ids(position_ids, m_model_runner.get_tensor("attention_mask"));
-        m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask")));
+        generate_utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask"));
+        m_model_runner.set_tensor("attention_mask", generate_utils::extend_attention(m_model_runner.get_tensor("attention_mask")));
 
         // todo: consider replacing with start_async and run callback right after that
         m_model_runner.infer();
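The per-step bookkeeping in the loop above, reduced to plain C++ to show the invariant: each step's position id is the row sum of the current mask, after which the mask gains a trailing 1 for the token just generated. This is an illustration of the two helper calls, not the pipeline's actual code.

// Plain C++ illustration of one decoding step's mask/position bookkeeping.
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    std::vector<int64_t> mask = {1, 1, 0, 1};  // one batch row: prompt mask
    for (int step = 0; step < 3; ++step) {
        // update_position_ids: next position = count of attended tokens so far
        int64_t next_position = std::accumulate(mask.begin(), mask.end(), int64_t{0});
        // extend_attention: the newly generated token is always attended
        mask.push_back(1);
        std::cout << "step " << step << ": position " << next_position << '\n';
    }
    // prints: step 0: position 3, step 1: position 4, step 2: position 5
}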