@@ -425,59 +425,46 @@ class SequenceGroup {
         return m_generation_stream->get_status() == GenerationStatus::DROPPED_BY_HANDLE;
     }

-    void notify_handle() {
+    void push_outputs() {
+        GenerationOutputs outputs;
+        for (auto& sequence : m_sequences) {
+            GenerationOutput output;
+            output.generated_token_ids = sequence->get_generated_ids();
+            output.score = sequence->get_beam_search_score(m_sampling_params);
+            outputs.emplace(sequence->get_grouped_id(), output);
+        }
+        m_generation_stream->push(outputs);
+    }
+
+    void push_partial_outputs() {
+        GenerationOutputs outputs;
+        // TODO: support streaming for n seqs
+        for (auto& sequence : m_sequences) {
+            // TODO: check seq.is_finished() to generate without several </s>
+            // or is it ok to use padding?
+            const auto last_gen_token = sequence->get_last_generation_output();
+            outputs.emplace(sequence->get_grouped_id(), last_gen_token);
+        }
+        m_generation_stream->push(outputs);
+    }

+    void notify_handle() {
         if (out_of_memory()) {
             set_generation_status(GenerationStatus::IGNORED);
         } else if (has_finished()) {
             set_generation_status(GenerationStatus::FINISHED);
         }
-
-        GenerationOutputs outputs;
-
         // For beam search, streaming is not available, so we notify only upon finishing
         if (m_sampling_params.is_beam_search()) {
-            if (has_finished()) {
-                std::vector<Sequence::CPtr> finished_sequences = get_finished_sequences();
-
-                OPENVINO_ASSERT(finished_sequences.size() == num_total_seqs() && has_finished());
-                for (auto& sequence : finished_sequences) {
-                    GenerationOutput output;
-                    output.generated_token_ids = sequence->get_generated_ids();
-                    output.score = sequence->get_beam_search_score(m_sampling_params);
-                    outputs.emplace(sequence->get_grouped_id(), output);
-                }
-
-                if (outputs.size()) {
-                    m_generation_stream->push(outputs);
-                }
+            if (has_finished() || out_of_memory()) {
+                push_outputs();
             }
-        // For greedy or multinomial sampling we decide whether to stream partial results depending on the user parameter
         } else if (m_sampling_params.is_greedy_decoding() || m_sampling_params.is_multinomial()) {
             // TODO: for now we always stream for greedy search for the sake of benchmarking
-            if (num_total_seqs() == 1 /* m_sampling_params.stream */) {
-                // TODO: support streaming for n seqs
-                for (auto& sequence : m_sequences) {
-                    // TODO: check seq.is_finished() to generate without several </s>
-                    // or is it ok to use padding?
-                    const auto last_gen_token = sequence->get_last_generation_output();
-                    outputs.emplace(sequence->get_grouped_id(), last_gen_token);
-                }
-                m_generation_stream->push(outputs);
-            } else if (has_finished()) {
-                std::vector<Sequence::CPtr> finished_sequences = get_finished_sequences();
-
-                OPENVINO_ASSERT(finished_sequences.size() == num_total_seqs() && has_finished());
-                for (auto& sequence : finished_sequences) {
-                    GenerationOutput output;
-                    output.generated_token_ids = sequence->get_generated_ids();
-                    output.score = sequence->get_cumulative_log_probs();
-                    outputs.emplace(sequence->get_grouped_id(), output);
-                }
-
-                if (outputs.size()) {
-                    m_generation_stream->push(outputs);
-                }
+            if (num_total_seqs() == 1) {
+                push_partial_outputs();
+            } else if (has_finished() || out_of_memory()) {
+                push_outputs();
             }
         }
     }
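
The refactor above extracts the two push paths into `push_outputs()` and `push_partial_outputs()`, leaving `notify_handle()` as pure dispatch. Below is a minimal, self-contained sketch of that dispatch logic; `SamplingMode`, `PushAction`, and `decide_push()` are hypothetical stand-ins introduced purely for illustration, not part of the real OpenVINO GenAI API.

```cpp
#include <cstddef>
#include <iostream>

enum class SamplingMode { BEAM_SEARCH, GREEDY, MULTINOMIAL };
enum class PushAction { NONE, FULL_OUTPUTS, PARTIAL_OUTPUTS };

// Mirrors the refactored notify_handle(): beam search notifies the handle
// only with complete results; greedy and multinomial sampling stream partial
// outputs when there is a single sequence, and otherwise push complete
// results once the group finishes or runs out of memory.
PushAction decide_push(SamplingMode mode, std::size_t num_total_seqs,
                       bool finished, bool out_of_memory) {
    if (mode == SamplingMode::BEAM_SEARCH)
        return (finished || out_of_memory) ? PushAction::FULL_OUTPUTS
                                           : PushAction::NONE;
    if (num_total_seqs == 1)
        return PushAction::PARTIAL_OUTPUTS;  // push_partial_outputs() each step
    return (finished || out_of_memory) ? PushAction::FULL_OUTPUTS  // push_outputs()
                                       : PushAction::NONE;
}

int main() {
    // Greedy decoding with one sequence streams a partial output every step.
    std::cout << (decide_push(SamplingMode::GREEDY, 1, false, false) ==
                  PushAction::PARTIAL_OUTPUTS) << '\n';  // prints 1
    // Beam search stays silent until the whole group has finished.
    std::cout << (decide_push(SamplingMode::BEAM_SEARCH, 4, false, false) ==
                  PushAction::NONE) << '\n';             // prints 1
}
```

Note one behavioral detail of the unification: `push_outputs()` scores every sequence with `get_beam_search_score()`, whereas the old greedy/multinomial final path used `get_cumulative_log_probs()`, and both branches now also push when the group runs out of memory, not only when it finishes.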