
Commit 7a220a3

SmolVLM2 notebook (#2835)
1 parent c9f12ed commit 7a220a3


4 files changed: +735 -0 lines changed


.ci/spellcheck/.pyspelling.wordlist.txt

+1
@@ -876,6 +876,7 @@ slowfast
 slowmo
 SML
 sml
+SmolVLM
 softmax
 softvc
 SoftVC

notebooks/smolvlm2/README.md

+30
# Visual-Language Assistant with SmolVLM2 and OpenVINO

SmolVLM2 represents a fundamental shift in how we think about video understanding - moving from massive models that require substantial computing resources to efficient models that can run anywhere.

![](https://github.com/user-attachments/assets/23eb3d5c-8a3e-4d3e-aeaf-56e166c1ec82)

Its goal is simple: make video understanding accessible across all devices and use cases, from phones to servers.

Compared with the previous SmolVLM family, the SmolVLM2 2.2B model got better at solving math problems with images, reading text in photos, understanding complex diagrams, and tackling scientific visual questions.

You can find more details about the model in the [model card](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) and the [HuggingFace blog post](https://huggingface.co/blog/smolvlm2).

In this tutorial we consider how to convert and optimize the SmolVLM2 model for creating a multimodal chatbot using [Optimum Intel](https://github.com/huggingface/optimum-intel). Additionally, we demonstrate how to apply model optimization techniques like weight compression using [NNCF](https://github.com/openvinotoolkit/nncf).
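As a rough illustration of that flow (not the exact cell from the notebook, whose arguments may differ), the model can be exported to OpenVINO IR and its weights compressed to INT4 through Optimum Intel, which uses NNCF under the hood; the output directory name below is hypothetical:

```python
# Sketch: export SmolVLM2 to OpenVINO IR with INT4 weight compression via Optimum Intel.
# Assumes optimum-intel with OpenVINO support and nncf are installed; the notebook may
# instead use the equivalent `optimum-cli export openvino` command or different settings.
from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig
from transformers import AutoProcessor

model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
output_dir = "smolvlm2-2.2b-instruct-ov"  # hypothetical output directory

# export=True converts the PyTorch checkpoint to OpenVINO IR on the fly;
# the quantization config triggers NNCF 4-bit weight compression.
model = OVModelForVisualCausalLM.from_pretrained(
    model_id,
    export=True,
    quantization_config=OVWeightQuantizationConfig(bits=4),
)
processor = AutoProcessor.from_pretrained(model_id)

model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)
```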
## Notebook contents

The tutorial consists of the following steps:

- Install requirements
- Convert and Optimize model
- Run OpenVINO model inference
- Launch Interactive demo

In this demonstration, you'll create an interactive chatbot that can answer questions about the content of provided images or videos.
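To give a feel for the inference step, a single-turn exchange looks roughly like the sketch below. It reuses the `apply_chat_template` call and message format from the Gradio helper further down; the `model` and `processor` objects are assumed to come from the conversion step, and the image path is one of the demo assets.

```python
# Sketch of single-turn inference with the converted model; mirrors the message
# format and apply_chat_template arguments used in gradio_helper.py below.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "path": "newyork.jpg"},  # example image from the demo assets
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
generated_ids = model.generate(**inputs, max_new_tokens=200)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```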
## Installation instructions

This is a self-contained example that relies solely on its own code.<br/>
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
For details, please refer to [Installation Guide](../../README.md).

<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/smolvlm2/README.md" />

notebooks/smolvlm2/gradio_helper.py

+133
import gradio as gr
from transformers import TextIteratorStreamer
from threading import Thread
import re
import time
import requests
from pathlib import Path
from PIL import Image


def download_examples():
    # Fetch the example images used by the demo if they are not present locally.
    example_images = {
        "weather.png": "https://github.com/user-attachments/assets/85af4410-6e46-484d-b13b-fd9260eb2b7c",
        "newyork.jpg": "https://github.com/user-attachments/assets/c530b689-2ff6-4c4d-91bc-e6ac5331df59",
        "document.jpg": "https://github.com/user-attachments/assets/ac7225b6-bf90-4faf-b05f-bbba41a87142",
        "rococo.jpg": "https://github.com/user-attachments/assets/9e26e36e-f2be-4fa2-affd-891448abcc7d",
        "rococo_1.jpg": "https://github.com/user-attachments/assets/d39bdb95-833c-4ebd-8390-15a8fc2cd0b6",
    }
    for file_name, url in example_images.items():
        if not Path(file_name).exists():
            Image.open(requests.get(url, stream=True).raw).save(file_name)


def make_demo(model, processor):
    download_examples()

    def model_inference(input_dict, history, max_tokens):
        # Rebuild the previous turns in the chat-template format expected by the processor.
        resulting_messages = []
        user_content = []
        media_queue = []
        for hist in history:
            # File attachments appear in history as tuples containing the file path.
            if hist["role"] == "user" and isinstance(hist["content"], tuple):
                file_name = hist["content"][0]
                if file_name.endswith((".png", ".jpg", ".jpeg")):
                    media_queue.append({"type": "image", "path": file_name})
                elif file_name.endswith(".mp4"):
                    media_queue.append({"type": "video", "path": file_name})

        for hist in history:
            if hist["role"] == "user" and isinstance(hist["content"], str):
                text = hist["content"]
                parts = re.split(r"(<image>|<video>)", text)

                for part in parts:
                    if part == "<image>" and media_queue:
                        user_content.append(media_queue.pop(0))
                    elif part == "<video>" and media_queue:
                        user_content.append(media_queue.pop(0))
                    elif part.strip():
                        user_content.append({"type": "text", "text": part.strip()})

            elif hist["role"] == "assistant":
                resulting_messages.append({"role": "user", "content": user_content})
                resulting_messages.append({"role": "assistant", "content": [{"type": "text", "text": hist["content"]}]})
                user_content = []

        # Build the current user message from the new input.
        c_user_content = []
        c_media_queue = []
        text = input_dict["text"].strip()
        for file in input_dict.get("files", []):
            if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
                c_media_queue.append({"type": "image", "path": file})
            elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
                c_media_queue.append({"type": "video", "path": file})

        if "<image>" in text or "<video>" in text:
            # Interleave text and media according to the <image>/<video> placeholders.
            parts = re.split(r"(<image>|<video>)", text)
            for part in parts:
                if part == "<image>" and c_media_queue:
                    c_user_content.append(c_media_queue.pop(0))
                elif part == "<video>" and c_media_queue:
                    c_user_content.append(c_media_queue.pop(0))
                elif part.strip():
                    c_user_content.append({"type": "text", "text": part.strip()})
        else:
            # No placeholders: put the text first, then append all attached media.
            c_user_content.append({"type": "text", "text": text})

            for media in c_media_queue:
                c_user_content.append(media)

        current_message = {"role": "user", "content": c_user_content}

        if text == "":
            raise gr.Error("Please input a query and optionally image(s).")

        resulting_messages.append(current_message)
        print("resulting_messages", resulting_messages)
        inputs = processor.apply_chat_template(
            resulting_messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )

        # Run generation in a background thread and stream partial output to the UI.
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
        generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)

        thread = Thread(target=model.generate, kwargs=generation_args)
        thread.start()

        yield "..."
        buffer = ""

        for new_text in streamer:
            buffer += new_text
            time.sleep(0.01)
            yield buffer

    examples = [
        [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["weather.png"]}],
        [{"text": "What art era this artpiece <image> and this artpiece <image> belong to?", "files": ["rococo.jpg", "rococo_1.jpg"]}],
        [{"text": "Describe this image.", "files": ["newyork.jpg"]}],
        [{"text": "What is the date in this document?", "files": ["document.jpg"]}],
        [{"text": "What is happening in the video?", "files": ["dog.mp4"]}],
    ]
    demo = gr.ChatInterface(
        fn=model_inference,
        title="SmolVLM2: The Smollest Video Model Ever 📺",
        description="Play with SmolVLM2 and OpenVINO in this demo. To get started, upload an image and text or try one of the examples.",
        examples=examples,
        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
        stop_btn="Stop Generation",
        multimodal=True,
        cache_examples=False,
        additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
        type="messages",
    )

    return demo
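For context, the notebook presumably wires this helper up along the following lines once the OpenVINO model and processor are ready; this is a sketch rather than the actual notebook cell, and the launch options may differ.

```python
# Hypothetical usage from the notebook (actual launch options may differ).
from gradio_helper import make_demo

demo = make_demo(model, processor)  # model and processor come from the conversion step
demo.launch(debug=True)
```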
