Skip to content

Commit 72c045e

Browse files
committed
fixed difference between old greedy sample and generate
1 parent dcb4b86 commit 72c045e

11 files changed

+298
-491
lines changed

src/cpp/include/openvino/genai/llm_pipeline.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
109109
* @return DecodedResults a structure with resulting texts & scores
110110
*/
111111
DecodedResults generate(std::vector<std::string> texts, OptionalGenerationConfig generation_config);
112+
DecodedResults generate(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config);
112113

113114
/**
114115
* @brief Low level generate to be called with already encoded input_ids tokens.

src/cpp/src/beam_search_decoding.cpp

-91
This file was deleted.

src/cpp/src/greedy_decoding.cpp

+3-56
Original file line numberDiff line numberDiff line change
@@ -5,59 +5,6 @@
55
#include "openvino/genai/llm_pipeline.hpp"
66
#include "utils.hpp"
77

8-
namespace {
9-
10-
void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask);
11-
void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0);
12-
ov::Tensor extend_attention(ov::Tensor attention_mask);
13-
14-
void update_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask) {
15-
const size_t batch_size = attention_mask.get_shape()[0];
16-
const size_t atten_length = attention_mask.get_shape()[1];
17-
position_ids.set_shape({batch_size, 1});
18-
19-
for (size_t batch = 0; batch < batch_size; batch++) {
20-
int64_t* start = attention_mask.data<int64_t>() + batch * atten_length;
21-
position_ids.data<int64_t>()[batch] = std::accumulate(start, start + atten_length, 0);
22-
}
23-
}
24-
25-
void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos) {
26-
const size_t batch_size = attention_mask.get_shape()[0];
27-
const size_t seq_length = attention_mask.get_shape()[1];
28-
29-
const int64_t* attention_mask_data = attention_mask.data<int64_t>();
30-
int64_t* position_ids_data = position_ids.data<int64_t>();
31-
32-
for (size_t batch = 0; batch < batch_size; batch++) {
33-
size_t sum = start_pos;
34-
for (size_t i = 0; i < seq_length; i++) {
35-
const size_t element_offset = batch * seq_length + i;
36-
position_ids_data[element_offset] = sum;
37-
if (attention_mask_data[element_offset] == 1) {
38-
sum += 1;
39-
}
40-
}
41-
}
42-
}
43-
44-
ov::Tensor extend_attention(ov::Tensor attention_mask) {
45-
auto shape = attention_mask.get_shape();
46-
auto batch_size = shape[0];
47-
auto seq_len = shape[1];
48-
49-
ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}};
50-
auto old_data = attention_mask.data<int64_t>();
51-
auto new_data = new_atten_mask.data<int64_t>();
52-
for (size_t batch = 0; batch < batch_size; ++batch) {
53-
std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t));
54-
new_data[batch * (seq_len + 1) + seq_len] = 1;
55-
}
56-
return new_atten_mask;
57-
}
58-
59-
}
60-
618
namespace ov {
629

6310
ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner,
@@ -73,7 +20,7 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner,
7320

7421
// todo: make this work even if position_ids are not specified
7522
auto position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
76-
initialize_position_ids(position_ids, attention_mask, kv_cache_len);
23+
generate_utils::initialize_position_ids(position_ids, attention_mask, kv_cache_len);
7724

7825
ov::EncodedResults results;
7926
results.scores.resize(batch_size);
@@ -139,8 +86,8 @@ ov::EncodedResults greedy_decoding(ov::InferRequest& m_model_runner,
13986
return results;
14087

14188
for (size_t i = 0; i < max_tokens - 1; ++i) {
142-
update_position_ids(position_ids, m_model_runner.get_tensor("attention_mask"));
143-
m_model_runner.set_tensor("attention_mask", extend_attention(m_model_runner.get_tensor("attention_mask")));
89+
generate_utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask"));
90+
m_model_runner.set_tensor("attention_mask", generate_utils::extend_attention(m_model_runner.get_tensor("attention_mask")));
14491

14592
// todo: consider replacing with start_async and run callback right after that
14693
m_model_runner.infer();

0 commit comments

Comments
 (0)