|
75 | 75 | "metadata": {},
|
76 | 76 | "outputs": [],
|
77 | 77 | "source": [
|
78 |    | - "model_name = 'microsoft/phi-2'\n",
79 |    | - "save_name = model_name.split(\"/\")[-1] + '_openvino'\n",
80 |    | - "precision = 'f32'\n",
   | 78 | + "model_name = \"microsoft/phi-2\"\n",
   | 79 | + "save_name = model_name.split(\"/\")[-1] + \"_openvino\"\n",
   | 80 | + "precision = \"f32\"\n",
81 | 81 | "quantization_config = OVWeightQuantizationConfig(\n",
|
82 | 82 | " bits=4,\n",
|
83 | 83 | " sym=False,\n",
|
84 | 84 | " group_size=128,\n",
|
85 | 85 | " ratio=0.8,\n",
|
86 | 86 | ")\n",
|
87 |    | - "device = 'gpu'"
   | 87 | + "device = \"gpu\""
88 | 88 | ]
|
89 | 89 | },
|
90 | 90 | {
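Note: the cell changed in this hunk relies on `OVWeightQuantizationConfig` being imported in an earlier cell that is not part of the diff. A minimal sketch of that dependency, with brief comments on what the parameters do (the import path is an assumption; the class ships with `optimum-intel`):

```python
# Assumed import; OVWeightQuantizationConfig comes from optimum-intel's OpenVINO integration.
from optimum.intel import OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(
    bits=4,          # store weights as 4-bit integers
    sym=False,       # asymmetric quantization (per-group zero point)
    group_size=128,  # one scale shared by each group of 128 weights
    ratio=0.8,       # ~80% of weights in 4 bit, the remainder kept in 8 bit
)
```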
|
|
114 | 114 | "source": [
|
115 | 115 | "# Load kwargs\n",
|
116 | 116 | "load_kwargs = {\n",
|
117 |     | - " 'device': device,\n",
118 |     | - " 'ov_config': {\n",
    | 117 | + " \"device\": device,\n",
    | 118 | + " \"ov_config\": {\n",
119 | 119 | " \"PERFORMANCE_HINT\": \"LATENCY\",\n",
|
120 | 120 | " \"INFERENCE_PRECISION_HINT\": precision,\n",
|
121 | 121 | " \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n",
|
122 | 122 | " },\n",
|
123 |     | - " 'compile': False,\n",
124 |     | - " 'quantization_config': quantization_config\n",
    | 123 | + " \"compile\": False,\n",
    | 124 | + " \"quantization_config\": quantization_config\n",
125 | 125 | "}\n",
|
126 | 126 | "\n",
|
127 | 127 | "# Check whether the model was already exported\n",
|
|
143 | 143 | "\n",
|
144 | 144 | "# TODO Optional: export to huggingface/hub\n",
|
145 | 145 | "\n",
|
146 |     | - "model_size = os.stat(os.path.join(save_name, 'openvino_model.bin')).st_size / 1024 ** 3\n",
    | 146 | + "model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024 ** 3\n",
147 | 147 | "print(f'Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB')"
|
148 | 148 | ]
|
149 | 149 | },
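Note: this hunk only shows `load_kwargs` and the size check; the export/load call itself is outside the diff context. A hedged sketch of how these kwargs are typically consumed with `optimum-intel` (the `OVModelForCausalLM.from_pretrained(...)` flow below is an assumption, not the notebook's exact code):

```python
import os
from optimum.intel import OVModelForCausalLM

load_kwargs = {
    "device": device,
    "ov_config": {
        "PERFORMANCE_HINT": "LATENCY",
        "INFERENCE_PRECISION_HINT": precision,
        "CACHE_DIR": os.path.join(save_name, "model_cache"),  # OpenVINO kernel cache location
    },
    "compile": False,
    "quantization_config": quantization_config,
}

# Assumed flow: export from the Transformers checkpoint once, then reload the
# saved OpenVINO IR (openvino_model.xml / openvino_model.bin) on later runs.
if not os.path.exists(os.path.join(save_name, "openvino_model.xml")):
    model = OVModelForCausalLM.from_pretrained(model_name, export=True, **load_kwargs)
    model.save_pretrained(save_name)
else:
    model = OVModelForCausalLM.from_pretrained(save_name, **load_kwargs)
```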
|
|
312 | 312 | " for idx, (user_msg, model_msg) in enumerate(history):\n",
|
313 | 313 | " # skip the last assistant message if its empty, the tokenizer will do the formating\n",
|
314 | 314 | " if idx == len(history) - 1 and not model_msg:\n",
|
315 |     | - " messages.append({'role': 'User', 'content': user_msg})\n",
    | 315 | + " messages.append({\"role\": \"User\", \"content\": user_msg})\n",
316 | 316 | " break\n",
|
317 | 317 | " if user_msg:\n",
|
318 |     | - " messages.append({'role': 'User', 'content': user_msg})\n",
    | 318 | + " messages.append({\"role\": \"User\", \"content\": user_msg})\n",
319 | 319 | " if model_msg:\n",
|
320 |     | - " messages.append({'role': 'Assistant', 'content': model_msg})\n",
    | 320 | + " messages.append({\"role\": \"Assistant\", \"content\": model_msg})\n",
321 | 321 | " input_token = tokenizer.apply_chat_template(\n",
|
322 | 322 | " messages,\n",
|
323 | 323 | " add_generation_prompt=True,\n",
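Note: the loop in this hunk converts Gradio-style `(user, assistant)` history pairs into a chat-template message list. A condensed sketch of the same pattern (the function name and the `return_tensors="pt"` argument are assumptions; the hunk cuts off after `add_generation_prompt=True`):

```python
def history_to_input(history, tokenizer):
    # history is a list of [user_msg, model_msg] pairs kept by the Gradio Chatbot.
    messages = []
    for idx, (user_msg, model_msg) in enumerate(history):
        # Skip the last, still-empty assistant turn; apply_chat_template appends
        # the generation prompt for it.
        if idx == len(history) - 1 and not model_msg:
            messages.append({"role": "User", "content": user_msg})
            break
        if user_msg:
            messages.append({"role": "User", "content": user_msg})
        if model_msg:
            messages.append({"role": "Assistant", "content": model_msg})
    return tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",  # assumed; not visible in this hunk
    )
```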
|
|
356 | 356 | "\n",
|
357 | 357 | " prompt_char = '▌'\n",
|
358 | 358 | " history[-1][1] = prompt_char\n",
|
359 |     | - " yield (history, 'Status: Generating...')\n",
    | 359 | + " yield (history, \"Status: Generating...\")\n",
360 | 360 | " \n",
|
361 | 361 | " streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
|
362 | 362 | "\n",
|
|
394 | 394 | " break\n",
|
395 | 395 | " elif is_partial_stop(partial_text, stop_str):\n",
|
396 | 396 | " continue\n",
|
397 |     | - " yield (history, 'Status: Generating...')\n",
    | 397 | + " yield (history, \"Status: Generating...\")\n",
398 | 398 | " history[-1][1] = partial_text\n",
|
399 | 399 | " generation_time = time.perf_counter() - start\n",
|
400 | 400 | " yield (history, f'Generation time: {generation_time:.2f} sec')"
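Note: this hunk streams tokens through `TextIteratorStreamer`. A minimal sketch of the standard pattern it relies on, with `model.generate` running in a worker thread while the streamer is drained; names and values not visible in the hunk (the helper name, `max_new_tokens`, the omitted stop-string handling via `is_partial_stop`) are assumptions:

```python
import time
from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(model, tokenizer, input_token, history):
    # Hypothetical condensation of the changed function; stop-string handling elided.
    prompt_char = "▌"
    history[-1][1] = prompt_char
    yield history, "Status: Generating..."

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks, so it runs in a background thread while new text is consumed here.
    Thread(
        target=model.generate,
        kwargs={"input_ids": input_token, "streamer": streamer, "max_new_tokens": 256},
    ).start()

    start = time.perf_counter()
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        history[-1][1] = partial_text + prompt_char
        yield history, "Status: Generating..."

    history[-1][1] = partial_text
    generation_time = time.perf_counter() - start
    yield history, f"Generation time: {generation_time:.2f} sec"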
|
|
519 | 519 | " queue=True\n",
|
520 | 520 | " )\n",
|
521 | 521 | " \n",
|
522 |     | - " clear.click(fn=lambda: (None, 'Status: Idle'), inputs=None, outputs=[chatbot, status], queue=False)"
    | 522 | + " clear.click(fn=lambda: (None, \"Status: Idle\"), inputs=None, outputs=[chatbot, status], queue=False)"
523 | 523 | ]
|
524 | 524 | },
|
525 | 525 | {
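Note: the last hunk resets the chatbot and the status label when the Clear button is pressed. A minimal Gradio wiring sketch consistent with that single line (component labels and the `demo` variable are assumptions; only `chatbot`, `status`, and `clear` appear in the diff):

```python
import gradio as gr

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    status = gr.Textbox(value="Status: Idle", label="Status")
    clear = gr.Button("Clear")

    # Returning (None, "Status: Idle") empties the chatbot and resets the label;
    # queue=False applies the reset immediately instead of going through the queue.
    clear.click(fn=lambda: (None, "Status: Idle"), inputs=None, outputs=[chatbot, status], queue=False)
```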
|
|