Commit 11fbaa2

committed: tokenizer minor fixes
1 parent 72c045e commit 11fbaa2

File tree

2 files changed: +7 -9 lines changed


src/cpp/src/llm_pipeline.cpp

+6 -2
@@ -143,7 +143,7 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate(
 
     auto [input_ids, attention_mask] = m_tokenizer.encode(text);
 
-    // todo: W/A If sentence begins with a specfial tokens (<bos>, <s>, etc.) openvino_tokenizer inserts 2 special extra tokens <bos> and "▁",
+    // todo: W/A If a sentence begins with a special token (<bos>, <s>, etc.) openvino_tokenizer inserts 2 extra special tokens, <bos> and "▁",
     // but HF does not do that. Moreover, openvino_tokenizer always inserts <bos>, but in the chat scenario HF does not because skip_special_tokens=True.
     // Need to remove both of those tokens manually to get exact token-by-token alignment with HF.
     auto size = input_ids.get_shape();
@@ -155,7 +155,7 @@ std::string ov::LLMPipeline::LLMPipelineImpl::generate(
     std::vector<float> tmp_attn_mask(attention_mask_data, attention_mask_data + attention_mask.get_size());
     // tmp_attn_mask.erase(tmp_attn_mask.begin());
 
-    std::vector<std::string> prefixes_to_exclude = {"<s>", "</s>"};  // todo: for TinyLlama; need to get these from generation_config
+    std::vector<std::string> prefixes_to_exclude = {config.eos_token, config.bos_token};
    auto prefix_match = [&text](std::string prefix) { return text.substr(0, prefix.length()) == prefix; };
    if (std::any_of(prefixes_to_exclude.begin(), prefixes_to_exclude.end(), prefix_match)) {
        tmp_ids.erase(tmp_ids.begin());
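
The hunk above swaps the hard-coded TinyLlama markers "<s>" and "</s>" for the tokens carried in the generation config. Below is a minimal, self-contained sketch of the same prefix-stripping workaround; the token ids and string values are illustrative stand-ins, not values taken from the commit:

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
        // Illustrative stand-ins for config.bos_token / config.eos_token.
        const std::string bos_token = "<s>";
        const std::string eos_token = "</s>";
        const std::string text = "<s> Hello";

        // Made-up ids a tokenizer might emit: <bos>, "▁", then the prompt tokens.
        std::vector<int64_t> tmp_ids = {1, 29871, 15043, 3186};

        std::vector<std::string> prefixes_to_exclude = {eos_token, bos_token};
        auto prefix_match = [&text](const std::string& prefix) {
            return text.substr(0, prefix.length()) == prefix;
        };

        // The prompt already spells out a special token, so drop the leading
        // extra id the way the visible hunk does (the full workaround also
        // trims the attention mask) to stay aligned with HF token by token.
        if (std::any_of(prefixes_to_exclude.begin(), prefixes_to_exclude.end(), prefix_match))
            tmp_ids.erase(tmp_ids.begin());

        for (int64_t id : tmp_ids)
            std::cout << id << ' ';
        std::cout << '\n';
    }

Compiled as-is, this prints the ids with the leading <bos> removed; in the pipeline the corresponding attention mask entry is erased the same way.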
@@ -221,6 +221,10 @@ ov::EncodedResults ov::LLMPipeline::LLMPipelineImpl::generate(
     } else if (auto callback = std::get_if<std::function<void(std::string)>>(&*streamer)) {
         streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
     }
+    auto batch_size = input_ids.get_shape().at(0);
+    if ((batch_size != 1 || !config_helper.is_greedy_decoding()) && streamer_ptr) {
+        OPENVINO_THROW("Currently streaming is possible only with batch size=1 and greedy decoding");
+    }
 
     auto attention_mask_data = attention_mask.has_value() ? *attention_mask : ov::generate_utils::init_attention_mask(input_ids);
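
The last hunk adds an early check so a streamer can only be attached when exactly one sequence is produced. A stand-alone sketch of that guard, with std::runtime_error standing in for OPENVINO_THROW and the function name chosen here purely for illustration:

    #include <cstddef>
    #include <stdexcept>

    // Streaming emits tokens for a single sequence as they are decoded, so it
    // is only well-defined for batch size 1 with greedy decoding.
    void check_streaming_supported(std::size_t batch_size, bool is_greedy_decoding, bool has_streamer) {
        if (has_streamer && (batch_size != 1 || !is_greedy_decoding))
            throw std::runtime_error("Currently streaming is possible only with batch size=1 and greedy decoding");
    }

Calling it as check_streaming_supported(batch_size, config_helper.is_greedy_decoding(), streamer_ptr != nullptr) mirrors the condition added in the hunk above.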

src/cpp/src/tokenizer.cpp

+1 -7
@@ -93,13 +93,7 @@ class Tokenizer::TokenizerImpl {
     m_tokenize_request.set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()});
     auto size_ = m_tokenize_request.get_input_tensor().get_shape();
     m_tokenize_request.infer();
-
-    ::pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id);
-    // todo: fix mask filled with '2' instead of '0'
-    ov::Tensor attention_mask = m_tokenize_request.get_tensor("attention_mask");
-    int64_t* attention_mask_data = attention_mask.data<int64_t>();
-    std::replace(attention_mask_data, attention_mask_data + attention_mask.get_size(), 2, 0);
-
+    pad_left(m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask"), m_pad_token_id);
     return {m_tokenize_request.get_tensor("input_ids"), m_tokenize_request.get_tensor("attention_mask")};
 }
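
The removed block worked around an attention mask that came back filled with '2' instead of '0'; with that fixed upstream, a single pad_left call is enough. The helper itself is not shown in this commit, so here is a hypothetical plain-vector version of what left-padding does (std::vector replaces ov::Tensor for the sketch):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Hypothetical stand-in for the pad_left helper in tokenizer.cpp: align a
    // batch of token-id rows to the same length by padding on the LEFT, and
    // mark the padded positions with 0 in the attention mask.
    void pad_left(std::vector<std::vector<int64_t>>& input_ids,
                  std::vector<std::vector<int64_t>>& attention_mask,
                  int64_t pad_token_id) {
        std::size_t max_len = 0;
        for (const auto& row : input_ids)
            max_len = std::max(max_len, row.size());

        for (std::size_t i = 0; i < input_ids.size(); ++i) {
            std::size_t pad = max_len - input_ids[i].size();
            input_ids[i].insert(input_ids[i].begin(), pad, pad_token_id);
            attention_mask[i].insert(attention_mask[i].begin(), pad, 0);  // 0 = ignore
        }
    }

Left-padding (rather than right-padding) keeps the most recent tokens at the end of every row, which is what decoder-only generation loops expect.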
