Skip to content

Commit 42dd049

Browse files
authored
[Continuous batching] In the event of OOM, return tokens generated so far for the request (openvinotoolkit#661)
1 parent cc5e235 commit 42dd049

File tree

2 files changed

+36
-46
lines changed

2 files changed

+36
-46
lines changed

src/cpp/src/sequence_group.hpp

+29-42
Original file line numberDiff line numberDiff line change
@@ -425,59 +425,46 @@ class SequenceGroup {
425425
return m_generation_stream->get_status() == GenerationStatus::DROPPED_BY_HANDLE;
426426
}
427427

428-
void notify_handle() {
428+
void push_outputs() {
429+
GenerationOutputs outputs;
430+
for (auto& sequence: m_sequences) {
431+
GenerationOutput output;
432+
output.generated_token_ids = sequence->get_generated_ids();
433+
output.score = sequence->get_beam_search_score(m_sampling_params);
434+
outputs.emplace(sequence->get_grouped_id(), output);
435+
}
436+
m_generation_stream->push(outputs);
437+
}
438+
439+
void push_partial_outputs() {
440+
GenerationOutputs outputs;
441+
// TODO: support streaming for n seqs
442+
for (auto& sequence : m_sequences) {
443+
// todo: check seq.is_finished() to generate without several </s>
444+
// or is it ok to use padding?
445+
const auto last_gen_token = sequence->get_last_generation_output();
446+
outputs.emplace(sequence->get_grouped_id(), last_gen_token);
447+
}
448+
m_generation_stream->push(outputs);
449+
}
429450

451+
void notify_handle() {
430452
if (out_of_memory()) {
431453
set_generation_status(GenerationStatus::IGNORED);
432454
} else if (has_finished()) {
433455
set_generation_status(GenerationStatus::FINISHED);
434456
}
435-
436-
GenerationOutputs outputs;
437-
438457
// For beam search streaming is not available, so we notify only upon finishing
439458
if(m_sampling_params.is_beam_search()) {
440-
if (has_finished()) {
441-
std::vector<Sequence::CPtr> finished_sequences = get_finished_sequences();
442-
443-
OPENVINO_ASSERT(finished_sequences.size() == num_total_seqs() && has_finished());
444-
for (auto& sequence: finished_sequences) {
445-
GenerationOutput output;
446-
output.generated_token_ids = sequence->get_generated_ids();
447-
output.score = sequence->get_beam_search_score(m_sampling_params);
448-
outputs.emplace(sequence->get_grouped_id(), output);
449-
}
450-
451-
if (outputs.size()) {
452-
m_generation_stream->push(outputs);
453-
}
459+
if (has_finished() || out_of_memory()) {
460+
push_outputs();
454461
}
455-
// For greedy or multinomial sampling we decide whever to stream partial results depending on the user parameter
456462
} else if (m_sampling_params.is_greedy_decoding() || m_sampling_params.is_multinomial()) {
457463
// TO DO: Now we always stream for greedy search for the sake of benchmarking
458-
if (num_total_seqs() == 1 /* m_sampling_params.stream */) {
459-
// TODO: support streamimg for n seqs
460-
for (auto& sequence : m_sequences) {
461-
// todo: check seq.is_finished() to generate without several </s>
462-
// or is it ok to use padding?
463-
const auto last_gen_token = sequence->get_last_generation_output();
464-
outputs.emplace(sequence->get_grouped_id(), last_gen_token);
465-
}
466-
m_generation_stream->push(outputs);
467-
} else if (has_finished()) {
468-
std::vector<Sequence::CPtr> finished_sequences = get_finished_sequences();
469-
470-
OPENVINO_ASSERT(finished_sequences.size() == num_total_seqs() && has_finished());
471-
for (auto& sequence: finished_sequences) {
472-
GenerationOutput output;
473-
output.generated_token_ids = sequence->get_generated_ids();
474-
output.score = sequence->get_cumulative_log_probs();
475-
outputs.emplace(sequence->get_grouped_id(), output);
476-
}
477-
478-
if (outputs.size()) {
479-
m_generation_stream->push(outputs);
480-
}
464+
if (num_total_seqs() == 1) {
465+
push_partial_outputs();
466+
} else if (has_finished() || out_of_memory()) {
467+
push_outputs();
481468
}
482469
}
483470
}

tests/python_tests/test_sampling.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,9 @@ def test_individual_generation_configs_random(tmp_path, test_struct: RandomSampl
291291

292292

293293
@pytest.mark.precommit
294-
def test_post_oom_health(tmp_path):
295-
generation_config = get_greedy()
294+
@pytest.mark.parametrize("sampling_config", [get_greedy(), get_beam_search(), get_multinomial_all_parameters()])
295+
def test_post_oom_health(tmp_path, sampling_config):
296+
generation_config = sampling_config
296297
generation_config.ignore_eos = True
297298
generation_config.max_new_tokens = 1000000
298299

@@ -309,9 +310,11 @@ def test_post_oom_health(tmp_path):
309310
pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix(), {}), scheduler_config, "CPU", {})
310311
# First run should return incomplete response
311312
output = pipe.generate(["What is OpenVINO?"], generation_configs)
312-
assert(len(output))
313+
assert (len(output))
314+
assert(len(output[0].m_generation_ids))
313315
# Same for the second run, here we want to make sure the cleanup works and we have free blocks after recent OOM
314316
output = pipe.generate(["What is OpenVINO?"], generation_configs)
315-
assert(len(output))
317+
assert (len(output))
318+
assert(len(output[0].m_generation_ids))
316319
del pipe
317320
shutil.rmtree(model_path)

0 commit comments

Comments (0)