
Commit 747c5d2

Wovchena, yatarkan, ilya-lavrenov, TolyaTalamanov, and wgzintel authored
Merge releases/2024/5 into master (openvinotoolkit#1168)
Co-authored-by: yatarkan <yaroslav.tarkan@intel.com>
Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
Co-authored-by: TolyaTalamanov <anatoliy.talamanov@intel.com>
Co-authored-by: wgzintel <guozhong.wang@intel.com>
Co-authored-by: Sergey Lyalin <sergey.lyalin@gmail.com>
1 parent 809d93e commit 747c5d2

File tree

12 files changed: +420 −147 lines changed

.github/actions/build_app/action.yml

+23
@@ -0,0 +1,23 @@
+name: 'Build App'
+inputs:
+  ov_dir:
+    description: 'Directory where OpenVINO is installed'
+    default: './ov'
+    required: false
+  build_dir:
+    description: 'Directory where the app is built'
+    default: './build'
+    required: false
+  build_target:
+    description: 'Target to build'
+    default: ''
+    required: false
+runs:
+  using: "composite"
+  steps:
+    - name: Build app
+      shell: bash
+      run: |
+        source ${{ inputs.ov_dir }}/setupvars.sh
+        cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ${{ inputs.build_dir }}
+        cmake --build ${{ inputs.build_dir }} --config Release ${{ inputs.build_target && format('--target {0}', inputs.build_target) || '' }} -j
.github/actions/install_openvino/action.yml

+18

@@ -0,0 +1,18 @@
+name: 'Install OpenVINO'
+inputs:
+  ov_link:
+    description: 'URL to download OpenVINO'
+    required: true
+  ov_dir:
+    description: 'Directory to install OpenVINO'
+    default: './ov'
+    required: false
+runs:
+  using: "composite"
+  steps:
+    - name: 'Install OpenVINO'
+      shell: bash
+      run: |
+        mkdir ${{ inputs.ov_dir }}
+        curl ${{ inputs.ov_link }} | tar --directory ${{ inputs.ov_dir }} --strip-components 1 -xz
+        sudo ${{ inputs.ov_dir }}/install_dependencies/install_openvino_dependencies.sh
.github/actions/install_python_deps/action.yml

+15

@@ -0,0 +1,15 @@
+name: 'Install Python Dependencies'
+inputs:
+  ov_dir:
+    description: 'Directory where OpenVINO is installed'
+    default: './ov'
+    required: false
+runs:
+  using: "composite"
+  steps:
+    - name: Install Python dependencies
+      shell: bash
+      run: |
+        source ${{ inputs.ov_dir }}/setupvars.sh
+        python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+        python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt
+58

@@ -0,0 +1,58 @@
+import argparse
+from pathlib import Path
+from optimum.intel.openvino import OVModelForVisualCausalLM
+from transformers import AutoProcessor
+from PIL import Image
+
+IMAGE_EXTENSIONS = [".jpg", ".jpeg", ".png", ".bmp"]
+
+
+def main(model_path: str, images_path: str):
+    print(f"Selected model: {model_path}\n")
+
+    if Path(images_path).is_file():
+        image_files = [Path(images_path)]
+    else:
+        image_files = sorted(
+            [f for f in Path(images_path).glob("*") if f.is_file() and f.suffix.lower() in IMAGE_EXTENSIONS],
+            key=lambda x: x.name
+        )
+
+    if not image_files:
+        raise FileNotFoundError(f"No images found in '{images_path}' directory. Supported formats: {IMAGE_EXTENSIONS}")
+
+    images = []
+    for file in image_files:
+        images.append(
+            Image.open(file).convert("RGB")
+        )
+
+    print("Images:", image_files)
+
+    model = OVModelForVisualCausalLM.from_pretrained(model_path, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+    conversation = [{
+        "role": "user",
+        "content": [
+            *[{"type": "image"} for _ in images],
+            {"type": "text", "text": "Describe the images."},
+        ],
+    }]
+
+    prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
+    print(prompt)
+    inputs = processor(text=[prompt], images=images, return_tensors="pt")
+    result = model.generate(**inputs, max_new_tokens=100, do_sample=False)
+    decoded = processor.tokenizer.batch_decode(result[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]
+    print(decoded)
+    with open("ref.txt", "w") as f:
+        f.write(f"question:\n{decoded}\n----------\nquestion:\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-m", "--model_path", type=str, required=True, help="Path to the model.")
+    parser.add_argument("-i", "--images_path", type=str, required=True, help="Path to the directory with images.")
+    args = parser.parse_args()
+    main(args.model_path, args.images_path)

.github/workflows/causal_lm_cpp.yml

+50 −46
@@ -713,7 +713,7 @@ jobs:
         echo "Chat sample python" passed

-  visual_language_chat_sample-ubuntu:
+  visual_language_chat_sample-ubuntu-minicpm_v2_6:
     runs-on: ubuntu-22.04-16-cores
     steps:
       - uses: actions/checkout@v4
@@ -722,21 +722,13 @@
       - uses: actions/setup-python@v4
         with:
           python-version: 3.11
-      - name: Install OpenVINO
-        run: |
-          mkdir ./ov/
-          curl ${{ env.l_u22_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
-          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
-      - name: Build app
-        run: |
-          source ./ov/setupvars.sh
-          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
-          cmake --build ./build/ --config Release --target visual_language_chat py_openvino_genai -j
-      - name: Install dependencies
-        run: |
-          source ./ov/setupvars.sh
-          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt opencv-python
+      - uses: ./.github/actions/install_openvino
+        with:
+          ov_link: ${{ env.l_u22_ov_link }}
+      - uses: ./.github/actions/build_app
+        with:
+          build_target: 'visual_language_chat py_openvino_genai'
+      - uses: ./.github/actions/install_python_deps
       - name: Download and convert tiny-random-minicpmv-2_6 model and an image
         run: |
           python -m pip install git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv
@@ -764,13 +756,6 @@
           && ./build/samples/cpp/visual_language_chat/visual_language_chat ./tiny-random-minicpmv-2_6/ ./images/
           <<< $'Describe the images?' | tee cpp.txt
         timeout-minutes: 2
-      - name: Encode cpp.txt with Python encoding instead of terminal one
-        shell: python
-        run: |
-          with open("cpp.txt", "rb") as f:
-              content = f.read().decode("utf-8", "replace")
-          with open("cpp.txt", "wb") as f:
-              f.write(content.encode("utf-8"))
       - name: Run visual_language_chat Python sample - tiny-random-minicpmv-2_6
         run: >
           set -o pipefail
@@ -779,6 +764,13 @@
           <<< $'Describe the images?' | tee py.txt
         env:
           PYTHONPATH: "./build/"
+      - name: Encode cpp.txt with Python encoding instead of terminal one
+        shell: python
+        run: |
+          with open("cpp.txt", "rb") as f:
+              content = f.read().decode("utf-8", "replace")
+          with open("cpp.txt", "wb") as f:
+              f.write(content.encode("utf-8"))
       - run: diff cpp.txt py.txt
       - name: Run visual_language_chat C++ sample with 2 prompts - tiny-random-minicpmv-2_6
         run: >
@@ -803,39 +795,51 @@
           with open("cpp2.txt", "wb") as f:
               f.write(content.encode("utf-8"))
       - run: diff cpp2.txt py2.txt
-      - name: Download and convert LLaVa 1.5 model and an image
-        run: |
-          source ./ov/setupvars.sh
-          optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/
-          wget https://llava-vl.github.io/static/images/monalisa.jpg
-      - name: Run visual_language_chat C++ sample - LLaVa 1.5
-        run: >
-          source ./ov/setupvars.sh
-          && ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_1_5_7b_ov/ monalisa.jpg
-          <<< $'Who drew this painting?\nWhen did the painter live?'
-        timeout-minutes: 4
-      - name: Download and convert LLaVa-Next model
-        run: |
-          source ./ov/setupvars.sh
-          optimum-cli export openvino --model llava-hf/llava-v1.6-mistral-7b-hf ./llava_v1_6_mistral_7b_ov/
-      - name: Run visual_language_chat C++ sample - LLaVa-Next
-        run: >
-          source ./ov/setupvars.sh
-          && ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_v1_6_mistral_7b_ov/ monalisa.jpg
-          <<< $'Who drew this painting?\nWhen did the painter live?'
-        timeout-minutes: 4
+
+  visual_language_chat_sample-ubuntu-llava_1_5:
+    uses: ./.github/workflows/job_vlm_sample_llava.yml
+    with:
+      model_id: llava-hf/llava-1.5-7b-hf
+      model_dir: llava_1_5_7b_ov
+
+  visual_language_chat_sample-ubuntu-llava_next:
+    uses: ./.github/workflows/job_vlm_sample_llava.yml
+    with:
+      model_id: llava-hf/llava-v1.6-mistral-7b-hf
+      model_dir: llava_v1_6_mistral_7b_ov
+
+  visual_language_chat_sample-ubuntu-internvl2:
+    runs-on: ubuntu-22.04-16-cores
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.11
+      - uses: ./.github/actions/install_openvino
+        with:
+          ov_link: ${{ env.l_u22_ov_link }}
+      - uses: ./.github/actions/build_app
+        with:
+          build_target: 'visual_language_chat py_openvino_genai'
+      - uses: ./.github/actions/install_python_deps
       - name: Download and convert InternVL2 model
         run: |
+          # Lowering transformers version, workaround for https://huggingface.co/OpenGVLab/InternVL2-1B/discussions/7
+          python -m pip install -U "transformers<4.45.0"
           source ./ov/setupvars.sh
           optimum-cli export openvino --model OpenGVLab/InternVL2-4B ./internvl2_4b_ov/ --trust-remote-code
+      - name: Download images
+        run: |
+          wget https://llava-vl.github.io/static/images/monalisa.jpg
       - name: Run visual_language_chat C++ sample - InternVL2
         run: >
           source ./ov/setupvars.sh
           && ./build/samples/cpp/visual_language_chat/visual_language_chat ./internvl2_4b_ov/ monalisa.jpg
           <<< $'Who drew this painting?\nWhen did the painter live?'
         timeout-minutes: 4

-
   cpp-continuous-batching-ubuntu:
     runs-on: ubuntu-20.04-8-cores
     defaults:
@@ -975,7 +979,7 @@ jobs:
         cpp-greedy_causal_lm-Qwen-7B-Chat, cpp-beam_search_causal_lm-Qwen1_5-7B-Chat, cpp-beam_search_causal_lm-Phi-2,
         cpp-beam_search_causal_lm-notus-7b-v1, cpp-speculative_decoding_lm-ubuntu, cpp-prompt_lookup_decoding_lm-ubuntu,
         cpp-Phi-1_5, cpp-greedy_causal_lm-redpajama-3b-chat, cpp-chat_sample-ubuntu, cpp-continuous-batching-ubuntu,
-        visual_language_chat_sample-ubuntu,
+        visual_language_chat_sample-ubuntu-minicpm_v2_6, visual_language_chat_sample-ubuntu-llava_1_5, visual_language_chat_sample-ubuntu-llava_next, visual_language_chat_sample-ubuntu-internvl2,
         cpp-continuous-batching-windows, cpp-continuous-batching-macos]
     if: ${{ always() }}
     runs-on: ubuntu-latest
.github/workflows/job_vlm_sample_llava.yml

+44

@@ -0,0 +1,44 @@
+name: visual_language_chat sample - LLaVA
+
+on:
+  workflow_call:
+    inputs:
+      model_id:
+        required: true
+        type: string
+      model_dir:
+        required: true
+        type: string
+
+env:
+  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17289-7cf2bbb8391/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241105_x86_64.tgz
+jobs:
+  visual_language_chat_sample-ubuntu-llava:
+    runs-on: ubuntu-22.04-16-cores
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.11
+      - uses: ./.github/actions/install_openvino
+        with:
+          ov_link: ${{ env.l_u22_ov_link }}
+      - uses: ./.github/actions/build_app
+        with:
+          build_target: 'visual_language_chat py_openvino_genai'
+      - uses: ./.github/actions/install_python_deps
+      - name: Download and convert model
+        run: |
+          source ./ov/setupvars.sh
+          optimum-cli export openvino --model ${{ inputs.model_id }} ./${{ inputs.model_dir }}
+      - name: Download images
+        run: |
+          wget https://llava-vl.github.io/static/images/monalisa.jpg
+      - name: Run visual_language_chat C++ sample
+        run: >
+          source ./ov/setupvars.sh
+          && ./build/samples/cpp/visual_language_chat/visual_language_chat ./${{ inputs.model_dir }} monalisa.jpg
+          <<< $'Who drew this painting?\nWhen did the painter live?'
+        timeout-minutes: 4

.gitignore

+3
@@ -5,6 +5,9 @@
 !__init__.py
 !__main__.py

+# don't skip GitHub Actions files and directories
+!.github/**
+
 # developer tools
 *.idea
 .vscode

src/cpp/src/llm_pipeline_static.cpp

+30 −4
@@ -20,6 +20,26 @@

 namespace {

+uint32_t align_to(uint32_t value, uint32_t alignment) {
+    return (value + alignment - 1) & ~(alignment - 1);
+}
+
+enum class GenerateHint {
+    FAST_COMPILE,
+    BEST_PERF
+};
+
+GenerateHint str_to_hint(const std::string& str) {
+    if (str == "FAST_COMPILE") {
+        return GenerateHint::FAST_COMPILE;
+    }
+    if (str == "BEST_PERF") {
+        return GenerateHint::BEST_PERF;
+    }
+    OPENVINO_THROW("Unsupported \"GENERATE_HINT\" provided: " +
+                   str + ". Please select either \"FAST_COMPILE\" or \"BEST_PERF\".");
+}
+
 std::shared_ptr<ov::Model> cvt_kvcache_to_fp16(const std::shared_ptr<ov::Model>& model) {
     ov::preprocess::PrePostProcessor ppp(model);

@@ -275,8 +295,12 @@ ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model,
 }

 ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model,
-                                       const std::optional<NPUDesc>& npudesc) {
+                                       const std::optional<NPUDesc>& npudesc,
+                                       const GenerateHint hint) {
     auto config = get_default_common_config(model);
+    if (hint == GenerateHint::BEST_PERF) {
+        config.emplace("NPUW_ONLINE_PIPELINE", "NONE");
+    }
     // NB: Unconditionally set for generation model
     config.emplace("NPUW_DQ", "YES");
     if (npudesc.has_value() && npudesc->arch == "4000") {
@@ -404,8 +428,8 @@ void StaticLLMPipeline::setupAndCompileModels(
     m_prefill_model = m_kvcache_model->clone();
     m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill");
     // (7) Reshape both models to static shape
-    const uint32_t kMaxPromptLen = pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u);
-    const uint32_t kMinResponseLen = pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u);
+    const uint32_t kMaxPromptLen = align_to(pop_int_and_cast(properties, "MAX_PROMPT_LEN").value_or(1024u), 64u);
+    const uint32_t kMinResponseLen = align_to(pop_int_and_cast(properties, "MIN_RESPONSE_LEN").value_or(128u), 64u);
     KVAxesPosition axes = get_kv_axes(get_model_type_from_json(models_path / "config.json"));
     m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len };
     reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
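
The hunk above applies the align_to helper introduced at the top of this file: user-supplied MAX_PROMPT_LEN and MIN_RESPONSE_LEN are now padded up to the next multiple of 64 rather than used verbatim. A minimal standalone sketch of that rounding behavior, with no OpenVINO dependencies (the bit trick requires a power-of-two alignment, which holds for the 64 used here):

#include <cassert>
#include <cstdint>

// Same helper as added in this diff: rounds `value` up to the next
// multiple of `alignment`. Valid only for power-of-two alignments.
static uint32_t align_to(uint32_t value, uint32_t alignment) {
    return (value + alignment - 1) & ~(alignment - 1);
}

int main() {
    assert(align_to(1024u, 64u) == 1024u); // already aligned: unchanged
    assert(align_to(1000u, 64u) == 1024u); // rounded up to the next multiple of 64
    assert(align_to(1u,    64u) == 64u);   // anything in 1..64 becomes 64
    return 0;
}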
@@ -414,8 +438,10 @@
     auto prefill_config = pop_or_default(
         properties, "PREFILL_CONFIG", get_default_prefill_config(m_prefill_model, npudesc)
     );
+    // NB: GENERATE_HINT is only applicable for default generate config!
+    auto generate_hint = str_to_hint(pop_or_default<std::string>(properties, "GENERATE_HINT", "FAST_COMPILE"));
     auto generate_config = pop_or_default(
-        properties, "GENERATE_CONFIG", get_default_generate_config(m_kvcache_model, npudesc)
+        properties, "GENERATE_CONFIG", get_default_generate_config(m_kvcache_model, npudesc, generate_hint)
     );
     merge_config_with(prefill_config, properties);
     merge_config_with(generate_config, properties);
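
With GENERATE_HINT wired in, a caller can request either FAST_COMPILE (the default) or BEST_PERF when the default generate config is used. The snippet below is a hypothetical usage sketch, not part of this commit: it assumes the usual ov::genai::LLMPipeline constructor taking a models path, a device string, and an ov::AnyMap of properties, and "./model_dir" is a placeholder model directory.

#include <iostream>
#include <string>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Properties consumed by StaticLLMPipeline::setupAndCompileModels above.
    ov::AnyMap properties = {
        {"GENERATE_HINT", "BEST_PERF"},  // or "FAST_COMPILE"; anything else throws
        {"MAX_PROMPT_LEN", 1000u},       // align_to rounds this up to 1024
        {"MIN_RESPONSE_LEN", 100u},      // align_to rounds this up to 128
    };
    // "./model_dir" is a placeholder path to an exported model.
    ov::genai::LLMPipeline pipe("./model_dir", "NPU", properties);
    std::string result = pipe.generate("Who drew the Mona Lisa?", ov::genai::max_new_tokens(64));
    std::cout << result << '\n';
    return 0;
}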
