@@ -1,10 +1,8 @@
 from pathlib import Path
-from typing import Callable
 import gradio as gr


 from PIL import Image
-from typing import Callable
 import numpy as np
 import requests
 from threading import Event, Thread
@@ -132,70 +130,98 @@ def generate_and_signal_complete():
     return demo


-def make_demo_videollava(fn: Callable):
-    examples_dir = Path("Video-LLaVA/videollava/serve/examples")
-    gr.close_all()
-    demo = gr.Interface(
-        fn=fn,
-        inputs=[
-            gr.Image(label="Input Image", type="filepath"),
-            gr.Video(label="Input Video"),
-            gr.Textbox(label="Question"),
-        ],
-        outputs=gr.Textbox(lines=10),
+def make_demo_llava_optimum(model, processor):
+    import inspect
+
+    from transformers import TextIteratorStreamer
+
+    has_additional_buttons = "undo_button" in inspect.signature(gr.ChatInterface.__init__).parameters
+
+    def bot_streaming(message, history):
+        print(f"message is - {message}")
+        print(f"history is - {history}")
+        files = message["files"] if isinstance(message, dict) else message.files
+        message_text = message["text"] if isinstance(message, dict) else message.text
+        image = None
+        if files:
+            # files[-1] may be a dict, a plain path, a list/tuple, or an object with a .path attribute
+            if isinstance(files[-1], dict):
+                image = files[-1]["path"]
+            elif isinstance(files[-1], (str, Path, list, tuple)):
+                image = files[-1]
+            else:
+                image = files[-1].path
+        else:
+            # if there is no image uploaded for this turn, look for images in the past
+            # turns kept inside tuples and take the last one
+            for hist in history:
+                if isinstance(hist[0], tuple):
+                    image = hist[0][0]
+        if image is None:
+            raise gr.Error("You need to upload an image for Llama-3.2-Vision to work. Close the error and try again with an Image.")
+
+        conversation = []
+        flag = False
+        for user, assistant in history:
+            if assistant is None:
+                # the image for this turn arrived as its own history entry:
+                # open an empty user message and fill in its text on the next iteration
+                flag = True
+                conversation.append({"role": "user", "content": []})
+                continue
+            if flag:
+                conversation[0]["content"] = [{"type": "text", "text": f"{user}"}]
+                conversation.append({"role": "assistant", "content": [{"type": "text", "text": assistant}]})
+                flag = False
+                continue
+            conversation.extend(
+                [
+                    {"role": "user", "content": [{"type": "text", "text": user}]},
+                    {"role": "assistant", "content": [{"type": "text", "text": assistant}]},
+                ]
+            )
+
+        conversation.append({"role": "user", "content": [{"type": "text", "text": f"{message_text}"}, {"type": "image"}]})
+        prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+        print(f"prompt is -\n{prompt}")
+        image = Image.open(image)
+        inputs = processor(text=prompt, images=image, return_tensors="pt")
+
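+        # Stream the reply: model.generate runs on a background thread while the
+        # TextIteratorStreamer yields decoded text chunks for incremental display.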
+        streamer = TextIteratorStreamer(
+            processor,
+            skip_special_tokens=True,
+            skip_prompt=True,
+            clean_up_tokenization_spaces=False,
+        )
+        generation_kwargs = dict(
+            inputs,
+            streamer=streamer,
+            max_new_tokens=1024,
+            do_sample=False,
+            eos_token_id=processor.tokenizer.eos_token_id,
+        )
+
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            yield buffer
+
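+    # newer Gradio versions add undo/retry buttons to ChatInterface; detect them
+    # via the constructor signature and pass None so they stay hidden when supported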
+    additional_buttons = {}
+    if has_additional_buttons:
+        additional_buttons = {"undo_button": None, "retry_button": None}
+
+    demo = gr.ChatInterface(
+        fn=bot_streaming,
+        title="LLaVA OpenVINO Chatbot",
         examples=[
-            [
-                f"{examples_dir}/extreme_ironing.jpg",
-                None,
-                "What is unusual about this image?",
-            ],
-            [
-                f"{examples_dir}/waterview.jpg",
-                None,
-                "What are the things I should be cautious about when I visit here?",
-            ],
-            [
-                f"{examples_dir}/desert.jpg",
-                None,
-                "If there are factual errors in the questions, point it out; if not, proceed answering the question. What’s happening in the desert?",
-            ],
-            [
-                None,
-                f"{examples_dir}/sample_demo_1.mp4",
-                "Why is this video funny?",
-            ],
-            [
-                None,
-                f"{examples_dir}/sample_demo_3.mp4",
-                "Can you identify any safety hazards in this video?",
-            ],
-            [
-                None,
-                f"{examples_dir}/sample_demo_9.mp4",
-                "Describe the video.",
-            ],
-            [
-                None,
-                f"{examples_dir}/sample_demo_22.mp4",
-                "Describe the activity in the video.",
-            ],
-            [
-                f"{examples_dir}/sample_img_22.png",
-                f"{examples_dir}/sample_demo_22.mp4",
-                "Are the instruments in the pictures used in the video?",
-            ],
-            [
-                f"{examples_dir}/sample_img_13.png",
-                f"{examples_dir}/sample_demo_13.mp4",
-                "Does the flag in the image appear in the video?",
-            ],
-            [
-                f"{examples_dir}/sample_img_8.png",
-                f"{examples_dir}/sample_demo_8.mp4",
-                "Are the image and the video depicting the same place?",
-            ],
+            {"text": "What is on the flower?", "files": ["./bee.jpg"]},
+            {"text": "How to make this pastry?", "files": ["./baklava.png"]},
         ],
-        title="Video-LLaVA🚀",
-        allow_flagging="never",
+        stop_btn=None,
+        multimodal=True,
+        **additional_buttons,
     )
     return demo
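
A minimal usage sketch for the new helper, assuming the model was exported with optimum-intel; `OVModelForVisualCausalLM`, the `model_dir` path, and the device string are assumptions, not part of this commit:

# usage sketch (assumed wiring; not part of this commit)
from optimum.intel import OVModelForVisualCausalLM  # assumed export class
from transformers import AutoProcessor

model_dir = "llava-ov"  # hypothetical OpenVINO export directory
processor = AutoProcessor.from_pretrained(model_dir)
model = OVModelForVisualCausalLM.from_pretrained(model_dir, device="CPU")

demo = make_demo_llava_optimum(model, processor)
demo.launch()  # ./bee.jpg and ./baklava.png must exist for the examples to render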