|
75 | 75 | "metadata": {},
|
76 | 76 | "outputs": [],
|
77 | 77 | "source": [
|
78 |    | - "model_name = 'microsoft/phi-2'\n",
79 |    | - "save_name = model_name.split(\"/\")[-1] + '_openvino'\n",
80 |    | - "precision = 'f32'\n",
   | 78 | + "model_name = \"microsoft/phi-2\"\n",
   | 79 | + "save_name = model_name.split(\"/\")[-1] + \"_openvino\"\n",
   | 80 | + "precision = \"f32\"\n",
81 | 81 | "quantization_config = OVWeightQuantizationConfig(\n",
|
82 | 82 | " bits=4,\n",
|
83 | 83 | " sym=False,\n",
|
84 | 84 | " group_size=128,\n",
|
85 | 85 | " ratio=0.8,\n",
|
86 | 86 | ")\n",
|
87 |    | - "device = 'gpu'"
   | 87 | + "device = \"gpu\""
88 | 88 | ]
|
89 | 89 | },
|
90 | 90 | {
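Note: the cell changed in this hunk relies on `OVWeightQuantizationConfig` being imported in an earlier cell that is not part of the diff. A minimal sketch of that dependency, with brief comments on what the parameters do (the import path is an assumption; the class ships with `optimum-intel`):

```python
# Assumed import; OVWeightQuantizationConfig comes from optimum-intel's OpenVINO integration.
from optimum.intel import OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(
    bits=4,          # store weights as 4-bit integers
    sym=False,       # asymmetric quantization (per-group zero point)
    group_size=128,  # one scale shared by each group of 128 weights
    ratio=0.8,       # ~80% of weights in 4 bit, the remainder kept in 8 bit
)
```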
|
|
114 | 114 | "source": [
|
115 | 115 | "# Load kwargs\n",
|
116 | 116 | "load_kwargs = {\n",
|
117 |     | - " 'device': device,\n",
118 |     | - " 'ov_config': {\n",
    | 117 | + " \"device\": device,\n",
    | 118 | + " \"ov_config\": {\n",
119 | 119 | " \"PERFORMANCE_HINT\": \"LATENCY\",\n",
|
120 | 120 | " \"INFERENCE_PRECISION_HINT\": precision,\n",
|
121 | 121 | " \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n",
|
122 | 122 | " },\n",
|
123 |     | - " 'compile': False,\n",
124 |     | - " 'quantization_config': quantization_config\n",
    | 123 | + " \"compile\": False,\n",
    | 124 | + " \"quantization_config\": quantization_config\n",
125 | 125 | "}\n",
|
126 | 126 | "\n",
|
127 | 127 | "# Check whether the model was already exported\n",
|
|
143 | 143 | "\n",
|
144 | 144 | "# TODO Optional: export to huggingface/hub\n",
|
145 | 145 | "\n",
|
146 |     | - "model_size = os.stat(os.path.join(save_name, 'openvino_model.bin')).st_size / 1024 ** 3\n",
    | 146 | + "model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024 ** 3\n",
147 | 147 | "print(f'Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB')"
|
148 | 148 | ]
|
149 | 149 | },
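Note: this hunk only shows `load_kwargs` and the size check; the export/load call itself is outside the diff context. A hedged sketch of how these kwargs are typically consumed with `optimum-intel` (the `OVModelForCausalLM.from_pretrained(...)` flow below is an assumption, not the notebook's exact code):

```python
import os
from optimum.intel import OVModelForCausalLM

load_kwargs = {
    "device": device,
    "ov_config": {
        "PERFORMANCE_HINT": "LATENCY",
        "INFERENCE_PRECISION_HINT": precision,
        "CACHE_DIR": os.path.join(save_name, "model_cache"),  # OpenVINO kernel cache location
    },
    "compile": False,
    "quantization_config": quantization_config,
}

# Assumed flow: export from the Transformers checkpoint once, then reload the
# saved OpenVINO IR (openvino_model.xml / openvino_model.bin) on later runs.
if not os.path.exists(os.path.join(save_name, "openvino_model.xml")):
    model = OVModelForCausalLM.from_pretrained(model_name, export=True, **load_kwargs)
    model.save_pretrained(save_name)
else:
    model = OVModelForCausalLM.from_pretrained(save_name, **load_kwargs)
```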
|
|
312 | 312 | " for idx, (user_msg, model_msg) in enumerate(history):\n",
|
313 | 313 | " # skip the last assistant message if its empty, the tokenizer will do the formating\n",
|
314 | 314 | " if idx == len(history) - 1 and not model_msg:\n",
|
315 |     | - " messages.append({'role': 'User', 'content': user_msg})\n",
    | 315 | + " messages.append({\"role\": \"User\", \"content\": user_msg})\n",
316 | 316 | " break\n",
|
317 | 317 | " if user_msg:\n",
|
318 |     | - " messages.append({'role': 'User', 'content': user_msg})\n",
    | 318 | + " messages.append({\"role\": \"User\", \"content\": user_msg})\n",
319 | 319 | " if model_msg:\n",
|
320 |     | - " messages.append({'role': 'Assistant', 'content': model_msg})\n",
    | 320 | + " messages.append({\"role\": \"Assistant\", \"content\": model_msg})\n",
321 | 321 | " input_token = tokenizer.apply_chat_template(\n",
|
322 | 322 | " messages,\n",
|
323 | 323 | " add_generation_prompt=True,\n",
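Note: the loop in this hunk converts Gradio-style `(user, assistant)` history pairs into a chat-template message list. A condensed sketch of the same pattern (the function name and the `return_tensors="pt"` argument are assumptions; the hunk cuts off after `add_generation_prompt=True`):

```python
def history_to_input(history, tokenizer):
    # history is a list of [user_msg, model_msg] pairs kept by the Gradio Chatbot.
    messages = []
    for idx, (user_msg, model_msg) in enumerate(history):
        # Skip the last, still-empty assistant turn; apply_chat_template appends
        # the generation prompt for it.
        if idx == len(history) - 1 and not model_msg:
            messages.append({"role": "User", "content": user_msg})
            break
        if user_msg:
            messages.append({"role": "User", "content": user_msg})
        if model_msg:
            messages.append({"role": "Assistant", "content": model_msg})
    return tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",  # assumed; not visible in this hunk
    )
```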
|
|
356 | 356 | "\n",
|
357 | 357 | " prompt_char = '▌'\n",
|
358 | 358 | " history[-1][1] = prompt_char\n",
|
359 |     | - " yield (history, 'Status: Generating...')\n",
    | 359 | + " yield (history, \"Status: Generating...\")\n",
360 | 360 | " \n",
|
361 | 361 | " streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
|
362 | 362 | "\n",
|
|
394 | 394 | " break\n",
|
395 | 395 | " elif is_partial_stop(partial_text, stop_str):\n",
|
396 | 396 | " continue\n",
|
397 |     | - " yield (history, 'Status: Generating...')\n",
    | 397 | + " yield (history, \"Status: Generating...\")\n",
398 | 398 | " history[-1][1] = partial_text\n",
|
399 | 399 | " generation_time = time.perf_counter() - start\n",
|
400 | 400 | " yield (history, f'Generation time: {generation_time:.2f} sec')"
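Note: this hunk streams tokens through `TextIteratorStreamer`. A minimal sketch of the standard pattern it relies on, with `model.generate` running in a worker thread while the streamer is drained; names and values not visible in the hunk (the helper name, `max_new_tokens`, the omitted stop-string handling via `is_partial_stop`) are assumptions:

```python
import time
from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(model, tokenizer, input_token, history):
    # Hypothetical condensation of the changed function; stop-string handling elided.
    prompt_char = "▌"
    history[-1][1] = prompt_char
    yield history, "Status: Generating..."

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks, so it runs in a background thread while new text is consumed here.
    Thread(
        target=model.generate,
        kwargs={"input_ids": input_token, "streamer": streamer, "max_new_tokens": 256},
    ).start()

    start = time.perf_counter()
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text + prompt_char
        yield history, "Status: Generating..."

    history[-1][1] = partial_text
    generation_time = time.perf_counter() - start
    yield history, f"Generation time: {generation_time:.2f} sec"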
|
|
519 | 519 | " queue=True\n",
|
520 | 520 | " )\n",
|
521 | 521 | " \n",
|
522 |     | - " clear.click(fn=lambda: (None, 'Status: Idle'), inputs=None, outputs=[chatbot, status], queue=False)"
    | 522 | + " clear.click(fn=lambda: (None, \"Status: Idle\"), inputs=None, outputs=[chatbot, status], queue=False)"
523 | 523 | ]
|
524 | 524 | },
|
525 | 525 | {
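Note: the last hunk resets the chatbot and the status label when the Clear button is pressed. A minimal Gradio wiring sketch consistent with that single line (component labels and the `demo` variable are assumptions; only `chatbot`, `status`, and `clear` appear in the diff):

```python
import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    status = gr.Textbox(value="Status: Idle", label="Status")
    clear = gr.Button("Clear")

    # Returning (None, "Status: Idle") empties the chatbot and resets the label;
    # queue=False applies the reset immediately instead of going through the queue.
    clear.click(fn=lambda: (None, "Status: Idle"), inputs=None, outputs=[chatbot, status], queue=False)
```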
|
|