1 | 1 | {
2 | 2 | "cells": [
| 3 | + {
| 4 | + "cell_type": "markdown",
| 5 | + "id": "b2cb321e-2c20-45ea-93e0-a3bd16e5a120",
| 6 | + "metadata": {},
| 7 | + "source": [
| 8 | + "## QWEN model inference w/ OpenVINO's LLAMA_CPP plugin"
| 9 | + ]
| 10 | + },
3 | 11 | {
4 | 12 | "cell_type": "markdown",
5 | 13 | "id": "2bb9b46a-d9c5-42dc-8e50-6700180aad0c",
… | … |
25 | 33 | "source": [
26 | 34 | "!pip install transformers[torch]\n",
27 | 35 | "!pip install tiktoken\n",
28 | | - "!git clone https://github.com/ggerganov/llama.cpp\n",
29 | | - "!pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt\n",
30 | | - "!huggingface-cli download \"Qwen/Qwen-7B-Chat\" --local-dir qwen-7b-chat\n",
31 | | - "!python3 llama.cpp/convert-hf-to-gguf.py qwen-7b-chat --outtype f32 --outfile qwen_7b_chat.gguf"
| 36 | + "!huggingface-cli download Qwen/Qwen1.5-7B-Chat-GGUF qwen1_5-7b-chat-q5_k_m.gguf --local-dir . --local-dir-use-symlinks False"
32 | 37 | ]
33 | 38 | },
34 | 39 | {
… | … |
51 | 56 | "!git clone https://github.com/openvinotoolkit/openvino_contrib\n",
52 | 57 | "!git clone --recurse-submodules https://github.com/openvinotoolkit/openvino\n",
53 | 58 | "\n",
54 | | - "# Add -DLLAMA_CUBLAS=1 to the cmake line below build the plugin with the CUDA backend.\n",
| 59 | + "# Add -DLLAMA_CUBLAS=1 to the cmake line below to build the plugin with the CUDA backend.\n",
55 | 60 | "# The underlying llama.cpp inference code will be executed on CUDA-powered GPUs on your host.\n",
56 | 61 | "!cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=../openvino_contrib/modules/llama_cpp_plugin -DENABLE_PLUGINS_XML=ON -DENABLE_LLAMA_CPP_PLUGIN_REGISTRATION=ON -DENABLE_PYTHON=1 -DENABLE_WHEEL=ON openvino #-DLLAMA_CUBLAS=1\n",
57 | 62 | "\n",
… | … |
95 | 100 | "outputs": [],
96 | 101 | "source": [
97 | 102 | "import openvino as ov\n",
98 | | - "ov_model = ov.Core().compile_model(\"qwen_7b_chat.gguf\", \"LLAMA_CPP\")"
| 103 | + "ov_model = ov.Core().compile_model(\"qwen1_5-7b-chat-q5_k_m.gguf\", \"LLAMA_CPP\")"
99 | 104 | ]
100 | 105 | },
101 | 106 | {
… | … |
184 | 189 | "formatted_input_prompt = convert_history([[user_prompt, \"\"]])\n",
185 | 190 | "\n",
186 | 191 | "from transformers import AutoTokenizer\n",
187 | | - "tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
| 192 | + "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
188 | 193 | "\n",
189 | | - "initial_prompt_tokens = tok(formatted_input_prompt, return_tensors=\"np\", **tokenizer_kwargs).input_ids"
| 194 | + "initial_prompt_tokens = tokenizer(formatted_input_prompt, return_tensors=\"np\", **tokenizer_kwargs).input_ids"
190 | 195 | ]
191 | 196 | },
192 | 197 | {
… | … |
218 | 223 | "\n",
219 | 224 | "output = ov_model({\"input_ids\": initial_prompt_tokens, \"position_ids\": position_ids})\n",
220 | 225 | "logits = output[\"logits\"]\n",
221 | | - "last_token_id = np.argmax(logits[:, -1, :], axis=1).reshape([1, 1])\n",
| 226 | + "curr_token_ids = np.argmax(logits[:, -1, :], axis=1).reshape([1, 1])\n",
222 | 227 | "\n",
223 | 228 | "MAX_TOKENS_GENERATED = 256\n",
224 | | - "STOP_TOKENS = [tok(st, return_tensors=\"np\").input_ids[0][0] for st in model_configuration[\"stop_tokens\"]]\n",
| 229 | + "STOP_TOKENS = [tokenizer(st, return_tensors=\"np\").input_ids[0][0] for st in model_configuration[\"stop_tokens\"]]\n",
225 | 230 | "\n",
226 | 231 | "curr_tokens_generated = 0\n",
227 | | - "curr_token_ids = last_token_id.reshape([1, 1])\n",
| 232 | + "last_token_id = curr_token_ids[0][0]\n",
228 | 233 | "\n",
229 | 234 | "response_tokens = []\n",
230 | 235 | "next_position_id = sequence_length - 1\n",
231 | 236 | "\n",
232 | | - "while (curr_token_ids[0][0] not in STOP_TOKENS) and (curr_tokens_generated < MAX_TOKENS_GENERATED):\n",
| 237 | + "while (last_token_id not in STOP_TOKENS) and (curr_tokens_generated < MAX_TOKENS_GENERATED):\n",
| 238 | + " print(tokenizer.decode(last_token_id), end='')\n",
233 | 239 | " curr_tokens_generated += 1\n",
234 | 240 | " curr_position_ids = np.ndarray([1, 1], dtype=np.int64)\n",
235 | 241 | " curr_position_ids[0][0] = next_position_id\n",
236 | 242 | " next_position_id += 1\n",
237 | 243 | " curr_generated_output = ov_model({\"input_ids\": curr_token_ids, \"position_ids\": curr_position_ids})\n",
238 | 244 | " curr_logits = curr_generated_output[\"logits\"]\n",
239 | 245 | " curr_token_ids = np.argmax(curr_logits[:, -1, :], axis=1).reshape([1, 1])\n",
240 | | - " print(tok.decode(curr_token_ids[0][0]), end='')\n",
241 | | - " response_tokens.append(curr_token_ids)\n",
| 246 | + " last_token_id = curr_token_ids[0][0]\n",
242 | 247 | "\n",
243 | 248 | "ov_model.create_infer_request().reset_state()"
244 | 249 | ]
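Taken together, the updated cells amount to roughly the standalone sketch below: download the pre-quantized GGUF, compile it on the LLAMA_CPP plugin device, then run a greedy prefill-plus-decode loop. The compile/infer/reset calls mirror the diff; the tokenizer repo (Qwen/Qwen1.5-7B-Chat), the chat-template call, the <|im_end|> stop token, the example prompt, and the position-id bookkeeping are assumptions standing in for the notebook's unshown helper cells (convert_history, model_configuration, etc.), not the notebook's exact code.

```python
# Rough end-to-end sketch of the updated notebook flow (assumptions noted above).
import numpy as np
import openvino as ov
from transformers import AutoTokenizer

# GGUF file downloaded in the cell above; "LLAMA_CPP" is the plugin's device name.
ov_model = ov.Core().compile_model("qwen1_5-7b-chat-q5_k_m.gguf", "LLAMA_CPP")

# Assumed tokenizer repo and chat formatting; the notebook builds the prompt
# with its own convert_history() helper instead.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat", trust_remote_code=True)
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is OpenVINO?"}],
    tokenize=False,
    add_generation_prompt=True,
)
input_ids = tokenizer(prompt, return_tensors="np").input_ids
sequence_length = input_ids.shape[1]
position_ids = np.arange(sequence_length, dtype=np.int64).reshape(1, sequence_length)

# Assumed stop tokens for Qwen1.5 chat models.
STOP_TOKENS = {tokenizer.convert_tokens_to_ids("<|im_end|>"), tokenizer.eos_token_id}
MAX_TOKENS_GENERATED = 256

# Prefill: run the whole prompt once and take the greedy next token.
logits = ov_model({"input_ids": input_ids, "position_ids": position_ids})["logits"]
last_token_id = int(np.argmax(logits[:, -1, :]))

# Decode loop: feed one token at a time; the plugin keeps the KV cache internally.
generated = 0
next_position_id = sequence_length
while last_token_id not in STOP_TOKENS and generated < MAX_TOKENS_GENERATED:
    print(tokenizer.decode(last_token_id), end="", flush=True)
    curr_ids = np.array([[last_token_id]], dtype=np.int64)
    curr_pos = np.array([[next_position_id]], dtype=np.int64)
    logits = ov_model({"input_ids": curr_ids, "position_ids": curr_pos})["logits"]
    last_token_id = int(np.argmax(logits[:, -1, :]))
    next_position_id += 1
    generated += 1

# Reset the plugin-side KV cache before starting a new conversation,
# following the same pattern as the notebook's final cell.
ov_model.create_infer_request().reset_state()
```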