Skip to content

Commit f0645d2

Browse files
committed
Use verbose InferRequest API for inference
1 parent 61a9e13 commit f0645d2

File tree

1 file changed

+11
-5
lines changed

1 file changed

+11
-5
lines changed

modules/llama_cpp_plugin/notebooks/qwen.ipynb

+11 -5
Original file line number | Diff line number | Diff line change
@@ -226,8 +226,11 @@
226226
"sequence_length = len(initial_prompt_tokens[0])\n",
227227
"position_ids = np.arange(0, sequence_length).reshape(initial_prompt_tokens.shape)\n",
228228
"\n",
229-
"output = ov_model({\"input_ids\": initial_prompt_tokens, \"position_ids\": position_ids})\n",
230-
"logits = output[\"logits\"]\n",
229+
"infer_request = ov_model.create_infer_request()\n",
230+
"infer_request.set_tensors({\"input_ids\": ov.Tensor(initial_prompt_tokens), \"position_ids\": ov.Tensor(position_ids)})\n",
231+
"infer_request.infer()\n",
232+
"logits = infer_request.get_tensor(\"logits\").data\n",
233+
"\n",
231234
"curr_token_ids = np.argmax(logits[:, -1, :], axis=1).reshape([1, 1])\n",
232235
"\n",
233236
"MAX_TOKENS_GENERATED = 256\n",
@@ -245,12 +248,15 @@
245248
" curr_position_ids = np.ndarray([1, 1], dtype=np.int64)\n",
246249
" curr_position_ids[0][0] = next_position_id \n",
247250
" next_position_id += 1\n",
248-
" curr_generated_output = ov_model({\"input_ids\": curr_token_ids, \"position_ids\": curr_position_ids})\n",
249-
" curr_logits = curr_generated_output[\"logits\"]\n",
251+
" \n",
252+
" infer_request.set_tensors({\"input_ids\": ov.Tensor(curr_token_ids), \"position_ids\": ov.Tensor(curr_position_ids)})\n",
253+
" infer_request.infer()\n",
254+
" curr_logits = infer_request.get_tensor(\"logits\").data\n",
255+
" \n",
250256
" curr_token_ids = np.argmax(curr_logits[:, -1, :], axis=1).reshape([1, 1])\n",
251257
" last_token_id = curr_token_ids[0][0]\n",
252258
"\n",
253-
"ov_model.create_infer_request().reset_state()"
259+
"infer_request.reset_state()"
254260
]
255261
},
256262
{

0 commit comments

Comments
 (0)