Commit b0644bc

pixtral notebook (openvinotoolkit#2426)
1 parent bd09273 commit b0644bc

File tree

6 files changed: +592 −1 lines changed


.ci/ignore_treon_docker.txt

+2 −1

```diff
@@ -82,4 +82,5 @@ notebooks/qwen2-vl/qwen2-vl.ipynb
 notebooks/qwen2-audio/qwen2-audio.ipynb
 notebooks/stable-fast-3d/stable-fast-3d.ipynb
 notebooks/mllama-3.2/mllama-3.2.ipynb
-notebooks/segment-anything/segment-anything-2-image.ipynb
+notebooks/segment-anything/segment-anything-2-image.ipynb
+notebooks/pixtral/pixtral.ipynb
```
.ci/skipped_notebooks.yml

+7

```diff
@@ -583,3 +583,10 @@
     - '3.9'
   - os:
     - macos-12
+- notebook: notebooks/pixtral/pixtral.ipynb
+  skips:
+    - os:
+      - macos-12
+      - ubuntu-20.04
+      - ubuntu-22.04
+      - windows-2019
```

.ci/spellcheck/.pyspelling.wordlist.txt

+3

```diff
@@ -527,6 +527,7 @@ nar
 NAS
 natively
 NCE
+Nemo
 NEOX
 NER
 NETP
@@ -633,6 +634,8 @@ PixArt
 PIXART
 PixelShuffleUpsampleNetwork
 pixelwise
+Pixtral
+pixtral
 PIL
 PNDM
 png
```

notebooks/pixtral/README.md

+29

```markdown
# Visual-language assistant with Pixtral and OpenVINO

Pixtral-12B is a multimodal model consisting of a 12B-parameter multimodal decoder based on Mistral Nemo and a 400M-parameter vision encoder trained from scratch. It is trained to understand both natural images and documents. The model shows strong abilities in tasks such as chart and figure understanding, document question answering, multimodal reasoning, and instruction following. Pixtral is able to ingest images at their natural resolution and aspect ratio, giving the user flexibility over the number of tokens used to process an image. Pixtral is also able to process any number of images in its long context window of 128K tokens. Unlike previous open-source models, Pixtral does not compromise on text benchmark performance to excel in multimodal tasks.

![](https://mistral.ai/images/news/pixtral-12b/pixtral-model-architecture.png)

More details about the model are available in the [blog post](https://mistral.ai/news/pixtral-12b/) and the [model card](https://huggingface.co/mistralai/Pixtral-12B-2409).

In this tutorial, we consider how to convert, optimize, and run this model using OpenVINO.

## Notebook contents

The tutorial consists of the following steps:

- Install requirements
- Convert and optimize the model
- Run OpenVINO model inference
- Launch the interactive demo

In this demonstration, you'll create an interactive chatbot that can answer questions about the content of a provided image.

The image below illustrates an example of an input prompt and the model's answer.

![example.png](https://github.com/user-attachments/assets/b61a9e8e-32c7-4b60-aa00-b4b867e823be)

## Installation instructions

This is a self-contained example that relies solely on its own code.<br/>
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
For details, please refer to the [Installation Guide](../../README.md).

<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/pixtral/README.md" />
```
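Since Pixtral ingests images at their natural resolution, the image-token budget grows with the image size. The sketch below is a rough illustration only, assuming a ViT-style encoder with 16×16 patches and one token per patch; Pixtral's exact accounting (e.g. row-break tokens) is not reproduced here, and `approx_image_tokens` is a name introduced for this example:

```python
import math


def approx_image_tokens(width, height, patch_size=16):
    # Illustrative only: one token per patch_size x patch_size patch,
    # with partial patches at the edges rounded up.
    return math.ceil(width / patch_size) * math.ceil(height / patch_size)


print(approx_image_tokens(512, 512))   # 1024 patch tokens
print(approx_image_tokens(1024, 768))  # 3072 patch tokens
```

This is why downscaling an image before inference (as the demo helper below does) directly reduces the number of tokens the model must process.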

notebooks/pixtral/gradio_helper.py

+122

```python
from pathlib import Path
from threading import Thread

import requests
import gradio as gr
from PIL import Image
from transformers import TextIteratorStreamer

# Fallback Jinja chat template, applied when the processor does not ship one.
chat_template = """
{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\n\n\" }}\n {%- else %}\n {{- \"[INST]\" }}\n {%- endif %}\n {%- if message[\"content\"] is not string %}\n {%- for chunk in message[\"content\"] %}\n {%- if chunk[\"type\"] == \"text\" %}\n {{- chunk[\"content\"] }}\n {%- elif chunk[\"type\"] == \"image\" %}\n {{- \"[IMG]\" }}\n {%- else %}\n {{- raise_exception(\"Unrecognized content type!\") }}\n {%- endif %}\n {%- endfor %}\n {%- else %}\n {{- message[\"content\"] }}\n {%- endif %}\n {{- \"[/INST]\" }}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + eos_token}}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}
"""


def resize_with_aspect_ratio(image: Image.Image, dst_height=512, dst_width=512):
    """Downscale the image to fit within dst_width x dst_height, preserving aspect ratio."""
    width, height = image.size
    if width > dst_width or height > dst_height:
        im_scale = min(dst_height / height, dst_width / width)
        resize_size = (int(width * im_scale), int(height * im_scale))
        return image.resize(resize_size)
    return image


def make_demo(model, processor):
    model_name = Path(model.config._name_or_path).parent.name

    example_image_urls = [
        ("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/dd5105d6-6a64-4935-8a34-3058a82c8d5d", "small.png"),
        ("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/1221e2a8-a6da-413a-9af6-f04d56af3754", "chart.png"),
    ]

    # Download the example images once.
    for url, file_name in example_image_urls:
        if not Path(file_name).exists():
            Image.open(requests.get(url, stream=True).raw).save(file_name)
    if processor.chat_template is None:
        processor.set_chat_template(chat_template)

    def bot_streaming(message, history):
        print(f"message is - {message}")
        print(f"history is - {history}")
        files = message["files"] if isinstance(message, dict) else message.files
        message_text = message["text"] if isinstance(message, dict) else message.text

        image = None
        if files:
            # files[-1] is a dict, a path string/tuple, or an object with a .path attribute
            if isinstance(files[-1], dict):
                image = files[-1]["path"]
            else:
                image = files[-1] if isinstance(files[-1], (list, tuple)) else files[-1].path
        else:
            # If there is no image uploaded for this turn, look for images in past
            # turns, kept inside tuples, and take the last one.
            for hist in history:
                if isinstance(hist[0], tuple):
                    image = hist[0][0]
        if image is None:
            raise gr.Error("You need to upload an image for the model to work. Close the error and try again with an image.")

        # Rebuild the conversation in the structured chat format expected by the
        # processor's chat template.
        conversation = []
        flag = False
        for user, assistant in history:
            if assistant is None:
                # First half of an unanswered turn: create a placeholder and fill
                # its content on the next iteration.
                flag = True
                conversation.append({"role": "user", "content": []})
                continue
            if flag:
                conversation[0]["content"] = [{"type": "text", "content": f"{user}"}]
                conversation.append({"role": "assistant", "content": assistant})
                flag = False
                continue
            conversation.extend([{"role": "user", "content": [{"type": "text", "content": user}]}, {"role": "assistant", "content": assistant}])

        conversation.append({"role": "user", "content": [{"type": "text", "content": f"{message_text}"}, {"type": "image"}]})
        prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
        print(f"prompt is -\n{prompt}")
        image = Image.open(image)
        image = resize_with_aspect_ratio(image)
        inputs = processor(prompt, image, return_tensors="pt")

        streamer = TextIteratorStreamer(
            processor,
            skip_special_tokens=True,
            skip_prompt=True,
            clean_up_tokenization_spaces=False,
        )
        generation_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=1024,
            do_sample=False,
            eos_token_id=processor.tokenizer.eos_token_id,
        )

        # Run generation in a background thread so tokens can be streamed to the UI.
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        buffer = ""
        for new_text in streamer:
            buffer += new_text
            yield buffer

    demo = gr.ChatInterface(
        fn=bot_streaming,
        title=f"{model_name} with OpenVINO",
        examples=[
            {"text": "What is the text saying?", "files": ["./small.png"]},
            {"text": "What does the chart display?", "files": ["./chart.png"]},
        ],
        description=f"{model_name} with OpenVINO. Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
        stop_btn=None,
        retry_btn=None,
        undo_btn=None,
        multimodal=True,
    )

    return demo
```
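The sizing arithmetic in `resize_with_aspect_ratio` can be checked without PIL. `fit_within` below is a hypothetical pure-Python mirror of that helper's logic, introduced here only for illustration:

```python
def fit_within(width, height, dst_height=512, dst_width=512):
    # Mirror of the helper's logic: scale down only if the image exceeds the
    # destination box, picking the smaller scale so both sides fit.
    if width > dst_width or height > dst_height:
        im_scale = min(dst_height / height, dst_width / width)
        return (int(width * im_scale), int(height * im_scale))
    return (width, height)


print(fit_within(1024, 768))  # (512, 384): scale 0.5 chosen by the wider side
print(fit_within(300, 200))   # (300, 200): already fits, returned unchanged
```

Note that images are never upscaled; anything already inside the 512×512 box passes through untouched.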

0 commit comments
