
Commit 34ed84d

Merge branch 'generate_pipeline' into archive
2 parents: 8976cad + e7fa974

4 files changed: +32 -13 lines

.github/workflows/causal_lm_cpp.yml (+11 -11)

The reference-generation snippets no longer append '\n' to each decoded reference, so predictions.find(ref) no longer requires the match to end at a line boundary, and the speculative-decoding and prompt-lookup binaries are invoked from their new ./build/text_generation/causal_lm/cpp/ location.
@@ -74,7 +74,7 @@ jobs:
 tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
 tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt')
 for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -90,7 +90,7 @@ jobs:
 tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
 tokenized = tokenizer('69', return_tensors='pt')
 for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -106,7 +106,7 @@ jobs:
 tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
 tokenized = tokenizer('Hi', return_tensors='pt')
 for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -122,7 +122,7 @@ jobs:
 tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
 tokenized = tokenizer('return 0', return_tensors='pt')
 for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -138,7 +138,7 @@ jobs:
 tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')
 tokenized = tokenizer('你好! 你好嗎?', return_tensors='pt')
 for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -160,7 +160,7 @@ jobs:
 for prompt in prompts:
     tokenized = tokenizer(prompt, return_tensors='pt')
     for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False):
-        ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+        ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True)
         idx = predictions.find(ref)
         if -1 == idx:
             raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -201,7 +201,7 @@ jobs:
 echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') >> ref.py
 echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py
 echo for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): >> ref.py
-echo ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' >> ref.py
+echo ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py
 echo idx = predictions.find(ref) >> ref.py
 echo if -1 == idx: >> ref.py
 echo raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py
@@ -347,7 +347,7 @@ jobs:
 - name: run and compare
   run: |
     source ./ov/setupvars.sh
-    ./build/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt
+    ./build/text_generation/causal_lm/cpp/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt
     ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt
     python -c "
     with open('predictions_greedy.txt', 'r') as f:
@@ -393,7 +393,7 @@ jobs:
 Question: Can you please add 2 and 3
 A:' > ./prompt.txt

-./build/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_prompt_lookup.txt
+./build/text_generation/causal_lm/cpp/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_prompt_lookup.txt
 ./build/text_generation/causal_lm/cpp/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_greedy.txt
 python -c "
 with open('predictions_greedy.txt', 'r') as f:
@@ -441,7 +441,7 @@ jobs:
 tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5')
 tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
 for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False):
-    ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref=}" from predictions')
@@ -486,7 +486,7 @@ jobs:
 tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat')
 tokenized = tokenizer('Alan Turing was a', return_tensors='pt')
 for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False):
-    ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n'
+    ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True)
     idx = predictions.find(ref)
     if -1 == idx:
         raise RuntimeError(f'Missing "{ref}" from predictions')

src/cpp/src/generation_config.cpp (+6 -1)

eos_token_id is now read from the config JSON only when it is a single integer, since models such as Qwen store several eos_token_id values that a plain assignment would mishandle.
@@ -36,7 +36,12 @@ GenerationConfig::GenerationConfig(std::string json_path) {
     if (data.contains("repetition_penalty")) repetition_penalty = data["repetition_penalty"];
     if (data.contains("pad_token_id")) pad_token_id = data["pad_token_id"];
     if (data.contains("bos_token_id")) bos_token_id = data["bos_token_id"];
-    if (data.contains("eos_token_id")) eos_token_id = data["eos_token_id"];
+
+    if (data.contains("eos_token_id") && data["eos_token_id"].type() == nlohmann::json::value_t::number_integer) {
+        // TODO: Qwen's config contains several eos_token_id values
+        eos_token_id = data["eos_token_id"];
+    }
+
     if (data.contains("bos_token")) bos_token = data["bos_token"];
     if (data.contains("eos_token")) eos_token = data["eos_token"];
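The guard above covers only the single-integer case and skips array-valued configs like Qwen's. A minimal sketch of how both shapes could be read with nlohmann::json; parse_eos_token_ids is a hypothetical helper, not part of this commit:

    #include <cstdint>
    #include <vector>
    #include <nlohmann/json.hpp>

    // Sketch only: read eos_token_id whether the config stores one id or several.
    std::vector<int64_t> parse_eos_token_ids(const nlohmann::json& data) {
        std::vector<int64_t> ids;
        if (!data.contains("eos_token_id"))
            return ids;
        const nlohmann::json& eos = data["eos_token_id"];
        if (eos.is_number_integer()) {
            ids.push_back(eos.get<int64_t>());   // the case this commit handles
        } else if (eos.is_array()) {
            for (const auto& id : eos)           // Qwen-style: several eos ids
                ids.push_back(id.get<int64_t>());
        }
        return ids;
    }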

src/cpp/src/tokenizer.cpp (+8 -1)

Fixes a misindented closing brace and, after left padding, rewrites attention-mask values of '2' to '0' to work around openvino_tokenizers filling the mask with '2' instead of '0'.
@@ -80,7 +80,7 @@ class Tokenizer::TokenizerImpl {
         m_bos_token_id = rt_info["bos_token_id"].as<int64_t>();
         if (rt_info.count("pad_token_id") > 0)
             m_pad_token_id = rt_info["pad_token_id"].as<int64_t>();
-        }
+    }

     std::pair<ov::Tensor, ov::Tensor> encode(std::string prompt) {
         size_t batch_size = 1;
@@ -94,6 +94,13 @@ class Tokenizer::TokenizerImpl {
         auto size_ = m_tokenize_request.get_input_tensor().get_shape();
         m_tokenize_request.infer();
         pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id);
+
+        // TODO: fix the mask being filled with '2' instead of '0'
+        // https://github.com/openvinotoolkit/openvino_tokenizers/pull/90 should have fixed this
+        ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask");
+        int64_t* attention_mask_data = attention_mask.data<int64_t>();
+        std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0);
+
         return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")};
     }
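For context, the appended workaround rewrites every '2' in the freshly padded attention mask to '0', in place. The same idea on a bare ov::Tensor, as a self-contained sketch (the shape and initial values are illustrative, not from this commit):

    #include <algorithm>
    #include <cstdint>
    #include <openvino/runtime/tensor.hpp>

    // Sketch: scrub stray '2' values out of a left-padded i64 attention mask.
    ov::Tensor make_clean_mask() {
        ov::Tensor mask{ov::element::i64, ov::Shape{1, 4}};
        int64_t* data = mask.data<int64_t>();
        data[0] = 2; data[1] = 2; data[2] = 1; data[3] = 1;  // '2' marks the padded prefix
        std::replace(data, data + mask.get_size(), int64_t{2}, int64_t{0});
        // mask now reads {0, 0, 1, 1}: padding masked out, real tokens kept
        return mask;
    }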

text_generation/causal_lm/cpp/beam_search_causal_lm.cpp (+7 -0)

Hardcodes pad token id 2 in the sample as a workaround until pad_token_id is written into the tokenizer IR.
@@ -3,6 +3,10 @@

 #include <openvino_genai/llm_pipeline.hpp>

+namespace {
+enum SPECIAL_TOKEN { PAD_TOKEN = 2 };
+}
+
 int main(int argc, char* argv[]) try {
     if (argc < 3) {
         throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT 1>' ['<PROMPT 2>' ...]");
@@ -19,6 +23,9 @@ int main(int argc, char* argv[]) try {
     config.num_beams = 15;
     config.num_return_sequences = config.num_beams * prompts.size();

+    // workaround until pad_token_id is written into the IR
+    pipe.get_tokenizer().set_pad_token_id(PAD_TOKEN);
+
     auto beams = pipe.generate(prompts, config);
     for (int i = 0; i < beams.scores.size(); i++)
         std::cout << beams.scores[i] << ": " << beams.texts[i] << '\n';
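With the sample now hardcoding PAD_TOKEN, a run would look like the following, assuming the build layout the updated workflow uses (model directory and prompts are illustrative):

    ./build/text_generation/causal_lm/cpp/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ 'Why is the Sun yellow?' '69'

Each beam prints as "score: text", which is the ": "-prefixed form the workflow's reference strings are built to match via predictions.find(ref).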
