
Commit 7a220a3

SmolVLM2 notebook (#2835)
1 parent c9f12ed commit 7a220a3


4 files changed: +735 -0 lines changed


.ci/spellcheck/.pyspelling.wordlist.txt

+1
@@ -876,6 +876,7 @@ slowfast
 slowmo
 SML
 sml
+SmolVLM
 softmax
 softvc
 SoftVC

notebooks/smolvlm2/README.md

+30
# Visual-Language Assistant with SmolVLM2 and OpenVINO

SmolVLM2 represents a fundamental shift in how we think about video understanding - moving from massive models that require substantial computing resources to efficient models that can run anywhere.

![](https://github.com/user-attachments/assets/23eb3d5c-8a3e-4d3e-aeaf-56e166c1ec82)

Its goal is simple: make video understanding accessible across all devices and use cases, from phones to servers.

Compared with the previous SmolVLM family, the SmolVLM2 2.2B model got better at solving math problems with images, reading text in photos, understanding complex diagrams, and tackling scientific visual questions.

You can find more details about the model in the [model card](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) and the [HuggingFace blog post](https://huggingface.co/blog/smolvlm2).

In this tutorial we consider how to convert and optimize the SmolVLM2 model for creating a multimodal chatbot using [Optimum Intel](https://github.com/huggingface/optimum-intel). Additionally, we demonstrate how to apply model optimization techniques like weight compression using [NNCF](https://github.com/openvinotoolkit/nncf).
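As a rough illustration of that flow (not the exact cell from the notebook, whose arguments may differ), the model can be exported to OpenVINO IR and its weights compressed to INT4 through Optimum Intel, which uses NNCF under the hood; the output directory name below is hypothetical:

```python
# Sketch: export SmolVLM2 to OpenVINO IR with INT4 weight compression via Optimum Intel.
# Assumes optimum-intel with OpenVINO support and nncf are installed; the notebook may
# instead use the equivalent `optimum-cli export openvino` command or different settings.
from optimum.intel import OVModelForVisualCausalLM, OVWeightQuantizationConfig
from transformers import AutoProcessor

model_id = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
output_dir = "smolvlm2-2.2b-instruct-ov"  # hypothetical output directory

# export=True converts the PyTorch checkpoint to OpenVINO IR on the fly;
# the quantization config triggers NNCF 4-bit weight compression.
model = OVModelForVisualCausalLM.from_pretrained(
    model_id,
    export=True,
    quantization_config=OVWeightQuantizationConfig(bits=4),
)
processor = AutoProcessor.from_pretrained(model_id)

model.save_pretrained(output_dir)
processor.save_pretrained(output_dir)
```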
## Notebook contents

The tutorial consists of the following steps:

- Install requirements
- Convert and Optimize model
- Run OpenVINO model inference
- Launch Interactive demo

In this demonstration, you'll create an interactive chatbot that can answer questions about the content of provided images or videos.
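To give a feel for the inference step, a single-turn exchange looks roughly like the sketch below. It reuses the `apply_chat_template` call and message format from the Gradio helper further down; the `model` and `processor` objects are assumed to come from the conversion step, and the image path is one of the demo assets.

```python
# Sketch of single-turn inference with the converted model; mirrors the message
# format and apply_chat_template arguments used in gradio_helper.py below.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "path": "newyork.jpg"},  # example image from the demo assets
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
generated_ids = model.generate(**inputs, max_new_tokens=200)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```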
## Installation instructions

This is a self-contained example that relies solely on its own code.<br/>
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
For details, please refer to [Installation Guide](../../README.md).

<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/smolvlm2/README.md" />

notebooks/smolvlm2/gradio_helper.py

+133
import gradio as gr
from transformers import TextIteratorStreamer
from threading import Thread
import re
import time
import requests
from pathlib import Path
from PIL import Image


def download_examples():
    # Fetch the example images used by the demo if they are not present locally.
    example_images = {
        "weather.png": "https://github.com/user-attachments/assets/85af4410-6e46-484d-b13b-fd9260eb2b7c",
        "newyork.jpg": "https://github.com/user-attachments/assets/c530b689-2ff6-4c4d-91bc-e6ac5331df59",
        "document.jpg": "https://github.com/user-attachments/assets/ac7225b6-bf90-4faf-b05f-bbba41a87142",
        "rococo.jpg": "https://github.com/user-attachments/assets/9e26e36e-f2be-4fa2-affd-891448abcc7d",
        "rococo_1.jpg": "https://github.com/user-attachments/assets/d39bdb95-833c-4ebd-8390-15a8fc2cd0b6",
    }
    for file_name, url in example_images.items():
        if not Path(file_name).exists():
            Image.open(requests.get(url, stream=True).raw).save(file_name)


def make_demo(model, processor):
    download_examples()

    def model_inference(input_dict, history, max_tokens):
        # Rebuild the previous turns in the chat-template format expected by the processor.
        resulting_messages = []
        user_content = []
        media_queue = []
        for hist in history:
            # File attachments appear in history as tuples containing the file path.
            if hist["role"] == "user" and isinstance(hist["content"], tuple):
                file_name = hist["content"][0]
                if file_name.endswith((".png", ".jpg", ".jpeg")):
                    media_queue.append({"type": "image", "path": file_name})
                elif file_name.endswith(".mp4"):
                    media_queue.append({"type": "video", "path": file_name})

        for hist in history:
            if hist["role"] == "user" and isinstance(hist["content"], str):
                text = hist["content"]
                parts = re.split(r"(<image>|<video>)", text)

                for part in parts:
                    if part == "<image>" and media_queue:
                        user_content.append(media_queue.pop(0))
                    elif part == "<video>" and media_queue:
                        user_content.append(media_queue.pop(0))
                    elif part.strip():
                        user_content.append({"type": "text", "text": part.strip()})

            elif hist["role"] == "assistant":
                resulting_messages.append({"role": "user", "content": user_content})
                resulting_messages.append({"role": "assistant", "content": [{"type": "text", "text": hist["content"]}]})
                user_content = []

        # Build the current user message from the new input.
        c_user_content = []
        c_media_queue = []
        text = input_dict["text"].strip()
        for file in input_dict.get("files", []):
            if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
                c_media_queue.append({"type": "image", "path": file})
            elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
                c_media_queue.append({"type": "video", "path": file})

        if "<image>" in text or "<video>" in text:
            # Interleave text and media according to the <image>/<video> placeholders.
            parts = re.split(r"(<image>|<video>)", text)
            for part in parts:
                if part == "<image>" and c_media_queue:
                    c_user_content.append(c_media_queue.pop(0))
                elif part == "<video>" and c_media_queue:
                    c_user_content.append(c_media_queue.pop(0))
                elif part.strip():
                    c_user_content.append({"type": "text", "text": part.strip()})
        else:
            # No placeholders: put the text first, then append all attached media.
            c_user_content.append({"type": "text", "text": text})

            for media in c_media_queue:
                c_user_content.append(media)

        current_message = {"role": "user", "content": c_user_content}

        if text == "":
            raise gr.Error("Please input a query and optionally image(s).")

        resulting_messages.append(current_message)
        print("resulting_messages", resulting_messages)
        inputs = processor.apply_chat_template(
            resulting_messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )

        # Run generation in a background thread and stream partial output to the UI.
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
        generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)

        thread = Thread(target=model.generate, kwargs=generation_args)
        thread.start()

        yield "..."
        buffer = ""

        for new_text in streamer:
            buffer += new_text
            time.sleep(0.01)
            yield buffer

    examples = [
        [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["weather.png"]}],
        [{"text": "What art era this artpiece <image> and this artpiece <image> belong to?", "files": ["rococo.jpg", "rococo_1.jpg"]}],
        [{"text": "Describe this image.", "files": ["newyork.jpg"]}],
        [{"text": "What is the date in this document?", "files": ["document.jpg"]}],
        [{"text": "What is happening in the video?", "files": ["dog.mp4"]}],
    ]
    demo = gr.ChatInterface(
        fn=model_inference,
        title="SmolVLM2: The Smollest Video Model Ever 📺",
        description="Play with SmolVLM2 and OpenVINO in this demo. To get started, upload an image and text or try one of the examples.",
        examples=examples,
        textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
        stop_btn="Stop Generation",
        multimodal=True,
        cache_examples=False,
        additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
        type="messages",
    )

    return demo
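For context, the notebook presumably wires this helper up along the following lines once the OpenVINO model and processor are ready; this is a sketch rather than the actual notebook cell, and the launch options may differ.

```python
# Hypothetical usage from the notebook (actual launch options may differ).
from gradio_helper import make_demo

demo = make_demo(model, processor)  # model and processor come from the conversion step
demo.launch(debug=True)
```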
