@@ -99,7 +99,7 @@ GenerationHandle
 ContinuousBatchingPipeline::SpeculativeDecodingImpl::add_request(uint64_t request_id,
                                                                  const ov::Tensor& input_ids,
                                                                  ov::genai::GenerationConfig sampling_params) {
-    m_sd_metrics.set_generated_len(request_id, sampling_params.max_new_tokens);
+    m_sd_metrics.set_generated_len(request_id, sampling_params.get_max_new_tokens(input_ids.get_size()));
     std::lock_guard<std::mutex> lock(m_draft_generations_mutex);
     auto draft_sampling_params = sampling_params;
     draft_sampling_params.ignore_eos = true;
@@ -112,7 +112,7 @@ GenerationHandle
 ContinuousBatchingPipeline::SpeculativeDecodingImpl::add_request(uint64_t request_id,
                                                                  const std::string& prompt,
                                                                  ov::genai::GenerationConfig sampling_params) {
-    m_sd_metrics.set_generated_len(request_id, sampling_params.max_new_tokens);
+    m_sd_metrics.set_generated_len(request_id, sampling_params.get_max_new_tokens(prompt.length()));
     std::lock_guard<std::mutex> lock(m_draft_generations_mutex);
     auto draft_sampling_params = sampling_params;
     draft_sampling_params.ignore_eos = true;
@@ -245,7 +245,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector<
 
     std::vector<GenerationHandle> main_generations;
     for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) {
-        m_sd_metrics.set_generated_len(request_id, sampling_params[request_id].max_new_tokens);
+        m_sd_metrics.set_generated_len(request_id, sampling_params[request_id].get_max_new_tokens(input_ids[request_id].get_size()));
         OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch.");
         main_generations.push_back(m_main_pipeline->add_request(request_id, input_ids[request_id], sampling_params[request_id]));
 
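Across all three call sites, the fix replaces the raw sampling_params.max_new_tokens field with get_max_new_tokens(prompt_length) when recording the expected generated length in the speculative-decoding metrics. The sketch below illustrates the semantics this relies on; GenerationConfigSketch is a hypothetical stand-in for ov::genai::GenerationConfig, written under the assumption that an unset max_new_tokens defaults to SIZE_MAX and that the effective token budget is then derived from max_length minus the prompt length.

// Minimal sketch of the semantics the fix relies on; GenerationConfigSketch
// is a hypothetical stand-in, not the library type. Assumption: an unset
// max_new_tokens defaults to SIZE_MAX, and the effective budget is then
// derived from max_length and the prompt length.
#include <cstddef>
#include <limits>

struct GenerationConfigSketch {
    size_t max_new_tokens = std::numeric_limits<size_t>::max();  // unset
    size_t max_length     = std::numeric_limits<size_t>::max();

    // Prefer the explicit budget; otherwise derive it from max_length.
    size_t get_max_new_tokens(size_t prompt_length = 0) const {
        if (max_new_tokens != std::numeric_limits<size_t>::max())
            return max_new_tokens;
        return max_length - prompt_length;
    }
};

int main() {
    GenerationConfigSketch config;
    config.max_length = 128;
    // With a 100-token prompt the effective budget is 28 new tokens, whereas
    // the raw max_new_tokens field would still read SIZE_MAX. Reporting the
    // raw field, as the old code did, inflated the metric whenever a caller
    // configured max_length instead of max_new_tokens.
    return config.get_max_new_tokens(100) == 28 ? 0 : 1;
}

One caveat worth noting: the std::string overload passes prompt.length(), a character count rather than a tokenized length, so the derived budget in that path is only an approximation.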