
Commit 9e37273

Add sampling decoding (#6)
* greedy-sampling
* greedy sampling
* greedy sampling
* Update greedy_sampling.hpp don't transform the logits during top_p
* Update greedy_causal_lm.cpp
* format
* early exit for arg_max
* Update default parameters
* Add multinomial sampling
* Remove unused hpp
* Reuse util functions
* Apply review comments
* Merge config
* Use size_t for iteration
* apply comments
* Move rand gen to class member
* Apply comments
* Add multinomial sampling
* Remove path
* Apply comments
* Fix merge

---------

Co-authored-by: wenyi5608 <93560477+wenyi5608@users.noreply.github.com>
1 parent bbc8c25 commit 9e37273

9 files changed (+337, −11 lines)

.gitignore (+3 −1)

@@ -42,4 +42,6 @@ CMakeUserPresets.json
 # Python-specific
 *.?env*
 *.pyc
-__pycache__
+__pycache__
+
+*.so

src/cpp/include/openvino/genai/generation_config.hpp (+3 −3)

@@ -81,9 +81,9 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     StopCriteria stop_criteria = StopCriteria::heuristic;

     // Multinomial
-    float temperature = 0.0f;
+    float temperature = 1.0f;
     float top_p = 1.0f;
-    int top_k = -1;
+    size_t top_k = 50;
     bool do_sample = false;
     float repetition_penalty = 1.0f;

@@ -99,7 +99,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     size_t get_max_new_tokens(size_t prompt_length = 0) const;
     bool is_greedy_decoding() const;
     bool is_beam_search() const;
-    bool is_multimomial() const;
+    bool is_multinomial() const;
     static GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {});
 };
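
For context, a minimal sketch of how the new defaults might be combined when requesting multinomial sampling. The field names come from the header above; the specific values chosen below are illustrative only:

#include "openvino/genai/generation_config.hpp"

// Illustrative only: a config that will take the multinomial path (do_sample = true).
ov::genai::GenerationConfig make_sampling_config() {
    ov::genai::GenerationConfig config;
    config.do_sample = true;           // is_multinomial() mirrors this flag
    config.temperature = 0.8f;         // < 1.0f sharpens the distribution (new default: 1.0f)
    config.top_k = 30;                 // keep only the 30 highest-scoring tokens (new default: 50)
    config.top_p = 0.9f;               // nucleus sampling over the remaining tokens
    config.repetition_penalty = 1.1f;  // penalize tokens that already occurred
    config.max_new_tokens = 100;       // illustrative limit
    return config;
}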

src/cpp/src/generation_config.cpp (+1 −1)

@@ -100,7 +100,7 @@ bool GenerationConfig::is_beam_search() const {
     return num_beams > 1;
 }

-bool GenerationConfig::is_multimomial() const {
+bool GenerationConfig::is_multinomial() const {
     return do_sample;
 }

src/cpp/src/generation_config_helper.hpp

Whitespace-only changes.

src/cpp/src/llm_pipeline.cpp (+14 −5)

@@ -70,6 +70,14 @@ ov::genai::EncodedResults greedy_decoding(
     const bool is_chat_conversation = false
 );

+ov::genai::EncodedResults multinominal_decoding(
+    ov::InferRequest& model_runner,
+    ov::Tensor prompts,
+    ov::Tensor attentin_mask,
+    GenerationConfig sampling_params,
+    std::shared_ptr<StreamerBase> streamer
+);
+
 EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig config);


@@ -252,8 +260,8 @@ ov::genai::EncodedResults ov::genai::LLMPipeline::LLMPipelineImpl::generate(
         streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
     }
     auto batch_size = input_ids.get_shape().at(0);
-    if ((batch_size != 1 || !config.is_greedy_decoding()) && streamer_ptr) {
-        OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy decoding");
+    if ((batch_size != 1 || !(config.is_greedy_decoding() || config.is_multinomial())) && streamer_ptr) {
+        OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy or multinomial decoding");
     }

     auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::genai::utils::init_attention_mask(input_ids);
@@ -262,10 +270,11 @@ ov::genai::EncodedResults ov::genai::LLMPipeline::LLMPipelineImpl::generate(
         result = ov::genai::greedy_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr, is_chat_conversation);
     } else if (config.is_beam_search()) {
         result = beam_search(m_model_runner, input_ids, attention_mask_data, config);
+    } else if (config.is_multinomial()) {
+        result = multinominal_decoding(m_model_runner, input_ids, attention_mask_data, config, streamer_ptr);
     } else {
-        // todo: implement multinomial sampling
-        // result = multinomial_sampling(input_ids, config);
-    }
+        OPENVINO_THROW("No decoding algorithm found for provided configuration parameters.");
+    }

     if (!is_chat_conversation)
         m_model_runner.reset_state();

src/cpp/src/multinomial_decoding.cpp (+262)

@@ -0,0 +1,262 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <numeric>
+#include <random>
+#include <regex>
+#include <vector>
+
+#include "generation_config_helper.hpp"
+#include "openvino/genai/llm_pipeline.hpp"
+#include "utils.hpp"
+
+
+namespace {
+
+struct TokenIdScore {
+    int64_t id;
+    float score;
+
+    bool operator<(const TokenIdScore& other) const {
+        return score < other.score;
+    }
+
+    bool operator>(const TokenIdScore& other) const {
+        return score > other.score;
+    }
+};
+
+void apply_softmax_inplace(std::vector<TokenIdScore>& tokens) {
+    float max_score = std::max_element(tokens.begin(), tokens.end())->score;
+    float sum = 0.f;
+
+    for (auto& token : tokens) {
+        float s = std::exp(token.score - max_score);
+        token.score = s;
+        sum += s;
+    }
+
+    float inv_sum = 1.f / sum;
+
+    for (auto& token : tokens) {
+        token.score *= inv_sum;
+    }
+}
+
+TokenIdScore* sample_top_p(TokenIdScore* first, TokenIdScore* last, float top_p) {
+    // sort score
+    std::sort(first, last, std::greater<TokenIdScore>());
+
+    int tokens_size = last - first;
+    std::vector<TokenIdScore> token_scores(tokens_size);
+    for (size_t i = 0; i < tokens_size; i++) {
+        token_scores[i] = first[i];
+    }
+
+    // calculate softmax
+    apply_softmax_inplace(token_scores);
+
+    float prefix_sum = 0.0f;
+
+    // top_p
+    for (size_t i = 0; i < tokens_size; i++) {
+        prefix_sum += token_scores[i].score;
+        if (prefix_sum >= top_p) {
+            return first + (i + 1);
+        }
+    }
+
+    return last;
+}
+
+void apply_repetition_penalty(float* first, float* last, const std::vector<int64_t>& input_ids, float penalty) {
+    const float inv_penalty = 1.f / penalty;
+    const int vocab_size = last - first;
+    std::vector<bool> occurrence(vocab_size, false);
+    for (const int64_t id : input_ids) {
+        if (!occurrence[id]) {
+            first[id] *= (first[id] > 0) ? inv_penalty : penalty;
+        }
+        occurrence[id] = true;
+    }
+}
+
+void apply_inv_temperature(float* first, float* last, float inv_temperature) {
+    for (float* it = first; it != last; it++) {
+        *it *= inv_temperature;
+    }
+}
+
+struct RandomSampling {
+    const size_t top_k;
+    const float top_p;
+    const float inv_temperature;
+    const float repetition_penalty;
+
+    std::mt19937 gen{std::random_device{}()};
+
+    RandomSampling(ov::genai::GenerationConfig generation_config)
+        : top_k{generation_config.top_k},
+          top_p{generation_config.top_p},
+          inv_temperature{1.f / generation_config.temperature},
+          repetition_penalty{generation_config.repetition_penalty} {
+        // parameters validation
+        OPENVINO_ASSERT(generation_config.top_k > 0,
+                        "top_k must be a strictly positive, but got ",
+                        generation_config.top_p);
+        OPENVINO_ASSERT(generation_config.top_p > 0 || generation_config.top_p < 1.0f,
+                        "top_p must be a positive float > 0 and < 1, but got ",
+                        generation_config.top_p);
+        OPENVINO_ASSERT(generation_config.temperature > 0,
+                        "Temperature must be a strictly positive float, but got ",
+                        generation_config.temperature);
+        OPENVINO_ASSERT(generation_config.repetition_penalty > 0,
+                        "Repetition penalty must be a strictly positive float, but got ",
+                        generation_config.repetition_penalty);
+    }
+
+    TokenIdScore get_out_token(float* logits, size_t vocab_size, const std::vector<int64_t>& tokens) {
+        // logits pre-process
+        if (repetition_penalty != 1.0f) {
+            apply_repetition_penalty(logits, logits + vocab_size, tokens, repetition_penalty);
+        }
+
+        if (inv_temperature != 1.0f) {
+            apply_inv_temperature(logits, logits + vocab_size, inv_temperature);
+        }
+
+        std::vector<TokenIdScore> token_scores(vocab_size);
+        for (size_t i = 0; i < vocab_size; i++) {
+            token_scores[i] = TokenIdScore{int64_t(i), logits[i]};
+        }
+
+        // top_k sampling
+        if (0 < top_k && top_k < token_scores.size()) {
+            std::nth_element(token_scores.data(),
+                             token_scores.data() + top_k,
+                             token_scores.data() + token_scores.size(),
+                             std::greater<TokenIdScore>());
+            token_scores.resize(top_k);
+        }
+
+        // top_p sampling
+        if (0.f < top_p && top_p < 1.0f) {
+            auto pos = sample_top_p(token_scores.data(), token_scores.data() + token_scores.size(), top_p);
+            token_scores.resize(pos - token_scores.data());
+        }
+
+        // sample next token
+        apply_softmax_inplace(token_scores);
+        for (size_t i = 0; i < token_scores.size(); i++) {
+            logits[i] = token_scores[i].score;
+        }
+
+        std::discrete_distribution<> dist(logits, logits + token_scores.size());
+        return token_scores[dist(gen)];
+    }
+};
+} // namespace
+
+namespace ov {
+namespace genai {
+
+ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner,
+                                                ov::Tensor input_ids,
+                                                ov::Tensor attention_mask,
+                                                ov::genai::GenerationConfig config,
+                                                std::shared_ptr<ov::genai::StreamerBase> streamer) {
+    ov::Shape prompts_shape = input_ids.get_shape();
+    size_t batch_size = prompts_shape[0];
+
+    OPENVINO_ASSERT(batch_size == 1, "Only batch size = 1 supported for multinomial decoding");
+
+    size_t prompt_len = prompts_shape[1];
+
+    ov::genai::EncodedResults results;
+    results.scores.resize(batch_size, 0);
+    results.tokens.resize(batch_size);
+
+    // Initialize inputs
+    m_model_runner.set_tensor("input_ids", input_ids);
+    m_model_runner.set_tensor("attention_mask", attention_mask);
+
+    ov::Tensor position_ids = m_model_runner.get_tensor("position_ids");
+    position_ids.set_shape(input_ids.get_shape());
+    std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);
+
+    // Input values are persistent between inference calls.
+    // That allows to set values, which aren't going to change, only once
+    m_model_runner.get_tensor("beam_idx").set_shape({batch_size});
+    m_model_runner.get_tensor("beam_idx").data<int32_t>()[0] = 0;
+
+    m_model_runner.infer();
+
+    auto logits_tensor = m_model_runner.get_tensor("logits");
+
+    int64_t sequence_offset = logits_tensor.get_shape().at(1) - 1;
+    size_t vocab_size = logits_tensor.get_shape().back();
+
+    float* logits = logits_tensor.data<float>() + sequence_offset * vocab_size;
+
+    const int64_t* input_ids_data = input_ids.data<const int64_t>();
+
+    std::vector<int64_t> tokens{input_ids_data, input_ids_data + input_ids.get_size()};
+
+    RandomSampling sampling{config};
+
+    TokenIdScore out_token = sampling.get_out_token(logits, vocab_size, tokens);
+
+    tokens.push_back(out_token.id);
+    results.tokens[0].push_back(out_token.id);
+    results.scores[0] += out_token.score;
+
+    if (streamer) {
+        streamer->put(out_token.id);
+    }
+
+    if (!config.ignore_eos && out_token.id == config.eos_token_id) {
+        return results;
+    }
+
+    m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1});
+    m_model_runner.get_tensor("position_ids").set_shape({batch_size, 1});
+
+    size_t max_new_tokens = config.get_max_new_tokens(prompt_len);
+
+    for (size_t i = 0; i < max_new_tokens - 1; i++) {
+        ov::genai::utils::update_position_ids(m_model_runner.get_tensor("position_ids"),
+                                              m_model_runner.get_tensor("attention_mask"));
+        m_model_runner.set_tensor("attention_mask",
+                                  ov::genai::utils::extend_attention(m_model_runner.get_tensor("attention_mask")));
+
+        m_model_runner.get_tensor("input_ids").data<int64_t>()[0] = out_token.id;
+
+        m_model_runner.infer();
+
+        logits = m_model_runner.get_tensor("logits").data<float>();
+        out_token = sampling.get_out_token(logits, vocab_size, tokens);
+
+        tokens.push_back(out_token.id);
+        results.tokens[0].push_back(out_token.id);
+        results.scores[0] += out_token.score;
+
+        if (streamer) {
+            streamer->put(out_token.id);
+        }
+
+        if (!config.ignore_eos && out_token.id == config.eos_token_id) {
+            break;
+        }
+    }
+
+    if (streamer) {
+        streamer->end();
+    }
+
+    return results;
+}
+} // namespace genai
+} // namespace ov
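
To make the top_p step concrete: after sorting scores in descending order and softmaxing, sample_top_p keeps the shortest prefix whose cumulative probability reaches top_p. Below is a standalone sketch of that same cutoff rule on an already-normalized toy distribution (independent of the OpenVINO types above; unlike sample_top_p, it assumes the probabilities are pre-softmaxed):

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Standalone illustration of nucleus (top_p) truncation: keep the smallest set of
// highest-probability entries whose cumulative probability reaches top_p.
size_t top_p_cutoff(std::vector<float> probs, float top_p) {
    std::sort(probs.begin(), probs.end(), std::greater<float>());
    float prefix_sum = 0.0f;
    for (size_t i = 0; i < probs.size(); ++i) {
        prefix_sum += probs[i];
        if (prefix_sum >= top_p) {
            return i + 1;  // number of tokens kept
        }
    }
    return probs.size();
}

int main() {
    // With top_p = 0.9, the first three entries are kept (0.5 + 0.3 + 0.15 = 0.95 >= 0.9).
    std::vector<float> probs{0.5f, 0.3f, 0.15f, 0.04f, 0.01f};
    std::cout << top_p_cutoff(probs, 0.9f) << "\n";  // prints 3
    return 0;
}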

src/cpp/src/utils.cpp (+10)

@@ -109,6 +109,11 @@ void set_attention_mask(ov::Tensor&& attention_mask, std::vector<int32_t> next_b
     }
 }

+/**
+ * Set position ids tensor data for next token inference based on provided attention mask
+ * Supports multi batch
+ * Supports sparse attention_mask
+ */
 void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) {
     const size_t batch_size = attention_mask.get_shape().at(0);
     const size_t atten_length = attention_mask.get_shape().at(1);
@@ -121,6 +126,11 @@ void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention
     }
 }

+/**
+ * Get attention mask tensor for next token inference
+ * Supports multi batch
+ * Supports sparse attention_mask
+ */
 ov::Tensor extend_attention(ov::Tensor attention_mask) {
     auto shape = attention_mask.get_shape();
     auto batch_size = shape[0];
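
These comments document the per-step bookkeeping that multinomial_decoding.cpp performs in its generation loop: update the position ids from the current mask, then extend the mask by one attended column. A plain-vector sketch of that bookkeeping for a single batch row; the helper names are hypothetical, and the rule assumed here (the next position id equals the number of attended tokens so far, with 0-based positions) is an interpretation of the doc comments and call order, not a copy of the utils.cpp implementation:

#include <cstdint>
#include <numeric>
#include <vector>

// Hypothetical single-row illustration. Assumption: the next token's position id
// equals the count of attended (non-zero) mask entries accumulated so far.
int64_t next_position_id(const std::vector<int64_t>& attention_row) {
    // e.g. for a mask {1, 1, 1} (3-token prompt at positions 0..2), this returns 3
    return std::accumulate(attention_row.begin(), attention_row.end(), int64_t{0});
}

// Extending the mask for the next step appends one more attended position.
std::vector<int64_t> extend_attention_row(std::vector<int64_t> attention_row) {
    attention_row.push_back(1);
    return attention_row;
}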

text_generation/causal_lm/cpp/CMakeLists.txt (+7 −1)

@@ -52,7 +52,13 @@ target_include_directories(chat_sample PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
 set_target_properties(chat_sample PROPERTIES CXX_STANDARD 17)
 set_target_properties(chat_sample PROPERTIES CXX_STANDARD_REQUIRED ON)

-install(TARGETS greedy_causal_lm beam_search_causal_lm speculative_decoding_lm prompt_lookup_decoding_lm chat_sample
+add_executable(multinomial_causal_lm multinomial_causal_lm.cpp)
+target_link_libraries(multinomial_causal_lm PRIVATE openvino::genai)
+target_include_directories(multinomial_causal_lm PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+set_target_properties(multinomial_causal_lm PROPERTIES CXX_STANDARD 17)
+set_target_properties(multinomial_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON)
+
+install(TARGETS greedy_causal_lm beam_search_causal_lm speculative_decoding_lm prompt_lookup_decoding_lm chat_sample multinomial_causal_lm
         RUNTIME DESTINATION samples_bin/
         COMPONENT samples_bin
         EXCLUDE_FROM_ALL)
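
The new multinomial_causal_lm target builds a sample that exercises the sampling path end to end; its source file is not shown in this diff. Below is a rough, hypothetical sketch of what such a sample might look like, assuming the public LLMPipeline interface used by the repository's other samples (a constructor taking a model directory plus device name, and a generate() overload taking a prompt string and a GenerationConfig and returning printable text) — none of these signatures are confirmed by this commit:

// Hypothetical sketch only; not the multinomial_causal_lm.cpp added by this commit.
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

int main(int argc, char* argv[]) try {
    if (argc != 3) {
        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<PROMPT>\"");
    }

    ov::genai::LLMPipeline pipe(argv[1], "CPU");  // assumed constructor: model dir + device name

    ov::genai::GenerationConfig config;
    config.do_sample = true;  // route generation to the multinomial branch
    config.temperature = 0.8f;
    config.top_k = 30;
    config.top_p = 0.9f;
    config.max_new_tokens = 100;

    std::cout << pipe.generate(argv[2], config) << "\n";  // assumed overload: prompt + config
    return 0;
} catch (const std::exception& error) {
    std::cerr << error.what() << "\n";
    return EXIT_FAILURE;
}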

0 commit comments
