Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use get_max_new_tokens() instead of max_new_tokens field when stopping… #1417

Merged
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -752,7 +752,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
}
currently_processed_tokens += output_seq_len * num_running_sequences;
// For max_new_tokens == 0, we don't reach sampling so need to notify handle separately
if(sequence_group->get_sampling_parameters().max_new_tokens == 0) {
if(sequence_group->get_max_new_tokens() == 0) {
sequence_group->notify_handle_echo_only();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl::generate
const auto sampling_params = request->get_sampling_parameters();
{
const auto generated_len = running_sequence->get_generated_len();
const auto left_generated_len = std::min(sampling_params.max_new_tokens, sampling_params.max_length) - generated_len - 1;
const auto left_generated_len = request->get_max_new_tokens() - generated_len - 1;
min_num_assistant_tokens = std::min(sampling_params.num_assistant_tokens, left_generated_len);
}
TokenIds candidates = generate_candidates(full_input_ids, min_num_assistant_tokens, sampling_params.max_ngram_size);
Expand Down
4 changes: 2 additions & 2 deletions src/cpp/src/sequence_group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ class SequenceGroup : public std::enable_shared_from_this<SequenceGroup> {
}

bool requires_sampling() const {
return get_context_len() >= get_prompt_len() && get_context_len() > m_max_content_len && m_sampling_params.max_new_tokens > 0;
return get_context_len() >= get_prompt_len() && get_context_len() > m_max_content_len && get_max_new_tokens() > 0;
}

void schedule_tokens(size_t num_tokens) {
Expand Down Expand Up @@ -699,7 +699,7 @@ class SequenceGroup : public std::enable_shared_from_this<SequenceGroup> {
m_generation_stream->push(std::move(outputs));
}

size_t get_max_new_tokens() {
size_t get_max_new_tokens() const {
return m_sampling_params.get_max_new_tokens(get_prompt_len());
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::update
const size_t num_processed_tokens = request->get_num_processed_tokens(),
prompt_len = request->get_prompt_len(),
updated_context_len = min_candidate_len + prompt_len,
max_new_tokens = request->get_sampling_parameters().max_new_tokens;
max_new_tokens = request->get_max_new_tokens();
size_t generated_len = request->get_context_len() >= request->get_prompt_len() ? request->get_context_len() - request->get_prompt_len() + 1 : 0;
if (generated_len > 0 && result.removed_tokens_cnt > 0) {
request->update_processed_tokens_num(num_processed_tokens - result.removed_tokens_cnt + 1);
Expand Down Expand Up @@ -323,13 +323,13 @@ void ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::m
// generate only one token in case of non speculative decoding
request->pause_generation(true);
} else if (request->get_num_processed_tokens() >= request->get_prompt_len() &&
(request->get_num_processed_tokens() - request->get_prompt_len() + 1) >= sampling_params.max_new_tokens - 1) {
(request->get_num_processed_tokens() - request->get_prompt_len() + 1) >= request->get_max_new_tokens() - 1) {
request->pause_generation(true);
} else if (request->get_num_processed_tokens() == 0 && sampling_params.num_return_sequences > 1) {
request->pause_generation(true);
} else if (sampling_params.num_assistant_tokens <= generated_tokens_cnt && sampling_params.assistant_confidence_threshold == 0.f) {
request->pause_generation(true);
} else if (sampling_params.max_new_tokens == 0) {
} else if (request->get_max_new_tokens() == 0) {
request->pause_generation(true);
} else if (request->get_num_processed_tokens() == request->get_prompt_len()) {
request->pause_generation(true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ GenerationHandle
ContinuousBatchingPipeline::SpeculativeDecodingImpl::add_request(uint64_t request_id,
const ov::Tensor& input_ids,
ov::genai::GenerationConfig sampling_params) {
m_sd_metrics.set_generated_len(request_id, sampling_params.max_new_tokens);
m_sd_metrics.set_generated_len(request_id, sampling_params.get_max_new_tokens(input_ids.get_size()));
std::lock_guard<std::mutex> lock(m_draft_generations_mutex);
auto draft_sampling_params = sampling_params;
draft_sampling_params.ignore_eos = true;
Expand All @@ -108,7 +108,7 @@ GenerationHandle
ContinuousBatchingPipeline::SpeculativeDecodingImpl::add_request(uint64_t request_id,
const std::string& prompt,
ov::genai::GenerationConfig sampling_params) {
m_sd_metrics.set_generated_len(request_id, sampling_params.max_new_tokens);
m_sd_metrics.set_generated_len(request_id, sampling_params.get_max_new_tokens(input_ids.get_size()));
std::lock_guard<std::mutex> lock(m_draft_generations_mutex);
auto draft_sampling_params = sampling_params;
draft_sampling_params.ignore_eos = true;
Expand Down Expand Up @@ -240,7 +240,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<

std::vector<GenerationHandle> main_generations;
for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) {
m_sd_metrics.set_generated_len(request_id, sampling_params[request_id].max_new_tokens);
m_sd_metrics.set_generated_len(request_id, sampling_params[request_id].get_max_new_tokens(input_ids.get_size()));
OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch.");
main_generations.push_back(m_main_pipeline->add_request(request_id, input_ids[request_id], sampling_params[request_id]));

Expand Down
Loading