
Commit 72caf05

Update greedy_causal_lm.cpp to read EOS Token (#315)
*Details:* Made changes to accommodate the dynamic EOS token.
*Tickets:* #277, 132861
1 parent c21b149 commit 72caf05
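
The core of the change, shown as a minimal standalone sketch (the "model_dir" path and the printout are illustrative, not part of the commit): read the tokenizer IR once, pull "eos_token_id" out of the model's runtime info instead of hard-coding 2, and fail loudly when it is missing.

    #include <openvino/openvino.hpp>
    #include <cstdint>
    #include <iostream>
    #include <stdexcept>

    int main() {
        ov::Core core;
        // Needed so the custom ops in the tokenizer IR resolve; the macro is
        // defined in the samples' CMakeLists.txt.
        core.add_extension(OPENVINO_TOKENIZERS_PATH);
        // "model_dir" is a placeholder; the samples build this path from argv[1].
        auto tokenizer_model = core.read_model("model_dir/openvino_tokenizer.xml");
        auto rt_info = tokenizer_model->get_rt_info();
        if (rt_info.count("eos_token_id") == 0) {
            throw std::runtime_error("EOS token ID not found in model's runtime information.");
        }
        int64_t eos_token = rt_info["eos_token_id"].as<int64_t>();
        std::cout << "EOS token id: " << eos_token << '\n';
        // Reusing the already-read ov::Model also means the XML is parsed only once:
        ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request();
    }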

4 files changed: +60 -47 lines changed


text_generation/causal_lm/cpp/beam_search_causal_lm.cpp (+20 -8)
@@ -22,7 +22,7 @@ std::string detokenize(ov::InferRequest& detokenizer, const std::vector<int64_t>
     detokenizer.infer();
     return detokenizer.get_output_tensor().data<std::string>()[0];
 }
-}  // namespace
+}
 
 int main(int argc, char* argv[]) try {
     if (argc != 3) {
@@ -31,15 +31,17 @@ int main(int argc, char* argv[]) try {
     // Compile models
     ov::Core core;
     core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+    //Read the tokenizer model information from the file to later get the runtime information
+    auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml");
     // tokenizer and detokenizer work on CPU only
-    ov::InferRequest tokenizer =
-        core.compile_model(std::string{argv[1]} + "/openvino_tokenizer.xml", "CPU").create_infer_request();
+    ov::InferRequest tokenizer = core.compile_model(
+        tokenizer_model, "CPU").create_infer_request();
     auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]);
-    ov::InferRequest detokenizer =
-        core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
+    ov::InferRequest detokenizer = core.compile_model(
+        std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
     // The model can be compiled for GPU as well
-    ov::InferRequest lm =
-        core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request();
+    ov::InferRequest lm = core.compile_model(
+        std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request();
     // Initialize inputs
     lm.set_tensor("input_ids", input_ids);
     lm.set_tensor("attention_mask", attention_mask);
@@ -49,8 +51,18 @@ int main(int argc, char* argv[]) try {
     lm.get_tensor("beam_idx").set_shape({1});
     lm.get_tensor("beam_idx").data<int32_t>()[0] = 0;
 
+    // Get the runtime info from the tokenizer model that we read earlier
+    auto rt_info = tokenizer_model->get_rt_info(); //Get the runtime info for the model
+    int64_t SPECIAL_EOS_TOKEN;
+
+    if (rt_info.count("eos_token_id") > 0) { //check if the runtime information has a valid EOS token ID
+        SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as<int64_t>();
+
+    } else {
+        throw std::runtime_error("EOS token ID not found in model's runtime information.");
+    }
     const int64_t* prompt_data = input_ids.data<const int64_t>();
-    Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()}};
+    Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()}, SPECIAL_EOS_TOKEN};
     GroupBeamSearcher group_beam_searcher{parameters};
     std::vector<int64_t> next_tokens;
     std::vector<int32_t> next_beams;
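
One knock-on effect worth flagging in the hunk above: eos_token is now the second member of Parameters with no default (see group_beam_searcher.hpp below), so the aggregate initialization at the call site passes it positionally right after the prompt. A trimmed-down mirror of that layout, with invented token values:

    #include <cstdint>
    #include <vector>

    // Reduced stand-in for the new Parameters layout; only the fields relevant
    // to this commit are kept.
    struct Parameters {
        std::vector<int64_t> prompt;
        int64_t eos_token;           // default of 2 removed by this commit
        size_t max_new_tokens = 20;  // later members keep their defaults
    };

    int main() {
        // The EOS id (2 here, purely illustrative) now arrives from the
        // tokenizer's runtime info rather than a constant in the header.
        Parameters parameters{std::vector<int64_t>{1, 355, 42}, 2};
        return parameters.eos_token == 2 ? 0 : 1;
    }

Note that an aggregate initializer that omits the field would silently value-initialize eos_token to 0, so call sites do need to pass the value read from rt_info.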

text_generation/causal_lm/cpp/greedy_causal_lm.cpp (+14 -4)
@@ -61,9 +61,11 @@ int main(int argc, char* argv[]) try {
     // Compile models
     ov::Core core;
     core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+    //Read the tokenizer model information from the file to later get the runtime information
+    auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml");
     // tokenizer and detokenizer work on CPU only
     ov::InferRequest tokenizer = core.compile_model(
-        std::string{argv[1]} + "/openvino_tokenizer.xml", "CPU").create_infer_request();
+        tokenizer_model, "CPU").create_infer_request();
     auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]);
     ov::InferRequest detokenizer = core.compile_model(
         std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
@@ -91,9 +93,17 @@ int main(int argc, char* argv[]) try {
     lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1});
     position_ids.set_shape({BATCH_SIZE, 1});
     TextStreamer text_streamer{std::move(detokenizer)};
-    // There's no way to extract special token values from the detokenizer for now
-    constexpr int64_t SPECIAL_EOS_TOKEN = 2;
-
+
+    // Get the runtime info from the tokenizer model that we read earlier
+    auto rt_info = tokenizer_model->get_rt_info(); //Get the runtime info for the model
+    int64_t SPECIAL_EOS_TOKEN;
+
+    if (rt_info.count("eos_token_id") > 0) { //check if the runtime information has a valid EOS token ID
+        SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as<int64_t>();
+    } else {
+        throw std::runtime_error("EOS token ID not found in model's runtime information.");
+    }
+
     int max_sequence_length = 100;
     while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) {
         ++seq_len;
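
The generation loop itself is untouched; only the origin of SPECIAL_EOS_TOKEN changes. A toy, OpenVINO-free re-creation of the termination logic, where a scripted token stream stands in for the model's argmax output:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
        const int64_t SPECIAL_EOS_TOKEN = 2;           // read from rt_info in the real sample
        std::vector<int64_t> scripted{5, 9, 7, 2, 4};  // fake model outputs; 2 plays the EOS role
        size_t step = 0;
        int64_t out_token = scripted[step];
        int seq_len = 0;
        int max_sequence_length = 100;
        while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) {
            ++seq_len;
            std::cout << out_token << ' ';             // TextStreamer analogue
            out_token = scripted[++step];              // lm.infer() + argmax in the real sample
        }
        std::cout << "\nstopped after " << seq_len << " tokens\n";
    }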

text_generation/causal_lm/cpp/group_beam_searcher.hpp (+25 -34)
@@ -44,10 +44,7 @@ std::vector<int64_t> kmp_search(const std::vector<int64_t>& haystack, const std:
     return res;
 }
 
-struct Token {
-    float log_prob;
-    int64_t idx;
-};
+struct Token {float log_prob; int64_t idx;};
 
 std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
     if (logits.get_shape().at(0) <= batch_idx) {
@@ -58,10 +55,10 @@ std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
     size_t sequence_offset = (logits.get_shape().at(1) - 1) * vocab_size;
     const float* beam_logits = logits.data<const float>() + batch_offset + sequence_offset;
     float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size);
-    float log_sum = std::log(
-        std::accumulate(beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
+    float log_sum = std::log(std::accumulate(
+        beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
             return accumulated + std::exp(to_add - max_logit);
-        }));
+    }));
     std::vector<Token> tokens;
     tokens.reserve(vocab_size);
     for (size_t idx = 0; idx < vocab_size; ++idx) {
@@ -80,26 +77,24 @@ bool greater(const Beam& left, const Beam& right) {
     return left.score > right.score;
 }
 
-enum class StopCriteria { early, heuristic, never };
+enum class StopCriteria {early, heuristic, never};
 
 struct Parameters {
     std::vector<int64_t> prompt;
+    int64_t eos_token;
     size_t n_groups = 3;
     size_t group_size = 5;
     float diversity_penalty = 1.0;
     size_t max_new_tokens = 20;
     StopCriteria stop_criteria = StopCriteria::heuristic;
     float length_penalty = 1.0;
     size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
-    // There's no way to extract special token values from the tokenizer for now
-    int64_t eos_token = 2;
-    std::function<bool(const Beam&)> early_finish = [](const Beam&) {
-        return false;
-    };
+
+    std::function<bool(const Beam&)> early_finish = [](const Beam&){return false;};
 };
 
 struct Group {
-    std::vector<Beam> ongoing;  // Best beams in front
+    std::vector<Beam> ongoing; // Best beams in front
     std::vector<Beam> min_heap; // The worst of the best completed beams is the first
     bool done = false;
 
@@ -126,30 +121,26 @@ struct Group {
         float best_sum_logprobs = ongoing.front().score;
         float worst_score = min_heap.front().score;
         switch (parameters.stop_criteria) {
-        case StopCriteria::early:
-            done = true;
-            return;
-        case StopCriteria::heuristic: {
-            float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
-            done = worst_score >= highest_attainable_score;
-            return;
-        }
-        case StopCriteria::never: {
-            size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
-            float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
-            done = worst_score >= highest_attainable_score;
-            return;
-        }
-        default:
-            throw std::runtime_error("Never reached");
+            case StopCriteria::early:
+                done = true;
+                return;
+            case StopCriteria::heuristic: {
+                float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
+                done = worst_score >= highest_attainable_score;
+                return;
+            }
+            case StopCriteria::never: {
+                size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
+                float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
+                done = worst_score >= highest_attainable_score;
+                return;
+            }
+            default: throw std::runtime_error("Never reached");
         }
     }
 };
 
-struct TokenToBeam {
-    int64_t token_idx;
-    int32_t beam_idx;
-};
+struct TokenToBeam {int64_t token_idx; int32_t beam_idx;};
 
 // GroupBeamSearcher processes logits produced by a language model and accumulates beams using group beam search
 // algorithm. select_next_tokens() returns token ids selected by the algorithm and corresponding beam ids. These values
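
For orientation, roughly how the searcher is driven in beam_search_causal_lm.cpp after this commit. The fragment leans on the lm, input_ids, and SPECIAL_EOS_TOKEN set up earlier in that sample, the pair-returning signature of select_next_tokens() is inferred from the comment above, and the feedback of tokens and beams into the model tensors is elided:

    const int64_t* prompt_data = input_ids.data<const int64_t>();
    Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()}, SPECIAL_EOS_TOKEN};
    GroupBeamSearcher group_beam_searcher{parameters};
    std::vector<int64_t> next_tokens;
    std::vector<int32_t> next_beams;
    for (size_t step = 0; step < parameters.max_new_tokens; ++step) {
        lm.infer();
        // std::tie is from <tuple>
        std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits"));
        if (next_tokens.empty()) {
            break;  // every group is done, e.g. all beams emitted eos_token
        }
        // write next_tokens into "input_ids" and next_beams into "beam_idx", then iterate
    }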
