Move speculative decoding from streamer to metrics benchmarking appro… #10

Workflow file for this run
name: causal_lm_cpp
on:
  workflow_dispatch:
  pull_request:
  merge_group:
  push:
    branches:
      - master
      - 'releases/**'
permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions
concurrency:
  # github.ref is not unique in post-commit; group push events by run_id so they are never cancelled
  group: ${{ github.event_name == 'push' && github.run_id || github.ref }}-causal-lm-cpp
  cancel-in-progress: true
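# Pinned OpenVINO nightly archives, one per target OS; bump these links to move every job to a newer build.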
env:
l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-18343-5e16b688156/openvino_toolkit_ubuntu20_2025.1.0.dev20250304_x86_64.tgz
l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-18343-5e16b688156/openvino_toolkit_ubuntu22_2025.1.0.dev20250304_x86_64.tgz
m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-18343-5e16b688156/openvino_toolkit_macos_12_6_2025.1.0.dev20250304_x86_64.tgz
w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-18343-5e16b688156/openvino_toolkit_windows_2025.1.0.dev20250304_x86_64.zip
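# Each job below builds the samples against one of the archives above and runs short smoke tests.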
jobs:
  cpp-chat_sample-ubuntu:
    runs-on: ubuntu-24.04
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.11
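      # Fetch the Linux archive, extract it into ./ov/ (dropping the tarball's top-level directory) and install system deps.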
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
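      # openvino_tokenizers is installed from the in-tree thirdparty checkout; optimum-cli exports the HF checkpoint to OpenVINO IR.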
      - name: Download and convert model
        run: |
          source ./ov/setupvars.sh
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install -r ./samples/requirements.txt
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
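      # Diff the sample output against a transformers reference generated with greedy decoding, so any divergence fails the job.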
      - name: Compare
        env:
          PYTHONPATH: "./build"
        run: |
          source ./ov/setupvars.sh
          printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\n' > ./input.txt
          timeout 30s ./build/samples/cpp/text_generation/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt
          python -c "
          from transformers import AutoTokenizer, AutoModelForCausalLM
          model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
          tokenizer = AutoTokenizer.from_pretrained(model_id)
          model = AutoModelForCausalLM.from_pretrained(model_id)
          prompts = ['What is 2 + 2?', 'What is the previous answer?', 'Add 1 to it.', 'Subtract 5 from it.', 'Why is the sun yellow?', 'What was my first question?']
          def gen_prompt(prompt):
              return {'role': 'user', 'content': prompt}
          def gen_answer(answer):
              return {'role': 'assistant', 'content': answer}
          chat_history = []
          chat_prompt = ''
          output = open('ref.txt', 'w')
          for prompt in prompts:
              output.write('question:\n')
              chat_history.append(gen_prompt(prompt))
              chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
              tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
              # Greedy decoding keeps the reference deterministic so it can be diffed byte for byte
              answer = model.generate(**tokenized, max_length=1000, do_sample=False)
              answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
              chat_history.append(gen_answer(answer_str))
              output.write(answer_str)
              output.write('\n----------\n')
          # The sample prints one final 'question:' prompt before hitting EOF
          output.write('question:\n')
          output.close()
          "
          diff pred.txt ref.txt
          echo "Chat sample cpp passed"
          timeout 30s ./samples/python/text_generation/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt
          diff pred2.txt ref.txt
          echo "Chat sample python passed"
  visual_language_chat_sample-ubuntu-minicpm_v2_6:
    runs-on: ubuntu-22.04-16-cores
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - uses: ./.github/actions/install_openvino
        with:
          ov_link: ${{ env.l_u22_ov_link }}
      - uses: ./.github/actions/build_app
        with:
          build_target: 'visual_language_chat benchmark_vlm py_openvino_genai'
      - uses: ./.github/actions/install_python_deps
      - name: Download and convert tiny-random-minicpmv-2_6 model and an image
        run: |
          source ./ov/setupvars.sh
          optimum-cli export openvino -m katuni4ka/tiny-random-minicpmv-2_6 tiny-random-minicpmv-2_6 --trust-remote-code --task image-text-to-text
          mkdir images
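      # Test inputs: one synthetic 28x28 gradient image plus a real photo downloaded from openvino_notebooks.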
      - name: Generate images - tiny-random-minicpmv-2_6
        shell: python
        run: |
          from PIL import Image
          import numpy as np
          import requests
          res = 28, 28
          lines = np.arange(res[0] * res[1] * 3, dtype=np.uint8) % 255
          lines = lines.reshape([*res, 3])
          lines_image = Image.fromarray(lines)
          lines_image.save("images/lines.png")
          cat = Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw).convert('RGB')
          cat.save("images/cat.png")
      - name: Run visual_language_chat C++ sample - tiny-random-minicpmv-2_6
        run: >
          set -o pipefail
          && source ./ov/setupvars.sh
          && ./build/samples/cpp/visual_language_chat/visual_language_chat ./tiny-random-minicpmv-2_6/ ./images/
          <<< $'Describe the images?' | tee cpp.txt
        timeout-minutes: 2
      - name: Run benchmark_vlm C++ sample - tiny-random-minicpmv-2_6
        run: >
          set -o pipefail
          && source ./ov/setupvars.sh
          && ./build/samples/cpp/visual_language_chat/benchmark_vlm -m ./tiny-random-minicpmv-2_6/ -i ./images/cat.png -n 3
        timeout-minutes: 2
      - name: Run visual_language_chat Python sample - tiny-random-minicpmv-2_6
        run: >
          set -o pipefail
          && source ./ov/setupvars.sh
          && ./samples/python/visual_language_chat/visual_language_chat.py ./tiny-random-minicpmv-2_6/ ./images/
          <<< $'Describe the images?' | tee py.txt
        env:
          PYTHONPATH: "./build/"
      - name: Run benchmark_vlm Python sample - tiny-random-minicpmv-2_6
        run: >
          set -o pipefail
          && source ./ov/setupvars.sh
          && ./samples/python/visual_language_chat/benchmark_vlm.py -m ./tiny-random-minicpmv-2_6/ -i ./images/cat.png -n 3
        env:
          PYTHONPATH: "./build/"
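      # Re-decode cpp.txt as UTF-8 with errors replaced so terminal-encoding artifacts cannot fail the diff below.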
      - name: Encode cpp.txt with Python encoding instead of terminal one
        shell: python
        run: |
          with open("cpp.txt", "rb") as f:
              content = f.read().decode("utf-8", "replace")
          with open("cpp.txt", "wb") as f:
              f.write(content.encode("utf-8"))
      - run: diff cpp.txt py.txt
      - name: Run visual_language_chat C++ sample with 2 prompts - tiny-random-minicpmv-2_6
        run: >
          set -o pipefail
          && source ./ov/setupvars.sh
          && ./build/samples/cpp/visual_language_chat/visual_language_chat ./tiny-random-minicpmv-2_6/ ./images/cat.png
          <<< $'What is unusual on this image?\nGo on.' | tee cpp2.txt
        timeout-minutes: 2
      - name: Run visual_language_chat Python sample with 2 prompts - tiny-random-minicpmv-2_6
        run: >
          set -o pipefail
          && source ./ov/setupvars.sh
          && ./samples/python/visual_language_chat/visual_language_chat.py ./tiny-random-minicpmv-2_6/ ./images/cat.png
          <<< $'What is unusual on this image?\nGo on.' | tee py2.txt
        env:
          PYTHONPATH: "./build/"
      - name: Encode cpp2.txt with Python encoding instead of terminal one
        shell: python
        run: |
          with open("cpp2.txt", "rb") as f:
              content = f.read().decode("utf-8", "replace")
          with open("cpp2.txt", "wb") as f:
              f.write(content.encode("utf-8"))
      - run: diff cpp2.txt py2.txt
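  # The three continuous batching jobs below run the C++ test suite plus accuracy and ShareGPT throughput smoke tests per OS.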
  cpp-continuous-batching-ubuntu:
    runs-on: ubuntu-22.04-8-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.12
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert model
        run: |
          source ./ov/setupvars.sh
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install -r ./samples/requirements.txt
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      - name: Run gtests
        run: |
          source ./ov/setupvars.sh
          ./build/tests/cpp/tests_continuous_batching
      - name: Run accuracy_sample
        run: |
          source ./ov/setupvars.sh
          timeout --verbose 50s ./build/tools/continuous_batching/accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5
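      # ShareGPT prompts drive two runs: default scheduling first, then dynamic split fuse with capped batch size and input length.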
      - name: Run throughput_benchmark
        run: |
          wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
          source ./ov/setupvars.sh
          timeout --verbose 200s ./build/tools/continuous_batching/benchmark/continuous_batching_benchmark -n 10 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1
          timeout --verbose 200s ./build/tools/continuous_batching/benchmark/continuous_batching_benchmark -n 10 --dynamic_split_fuse --max_batch_size 256 --max_input_len 256 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1
  cpp-continuous-batching-windows:
    runs-on: windows-latest
    env:
      PYTHONIOENCODING: "utf8"
    defaults:
      run:
        shell: cmd
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.9
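      # The Windows package is a zip with a versioned top-level folder; unzip it and flatten that folder (this step runs under bash).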
      - name: Install OpenVINO
        run: |
          curl --output ov.zip ${{ env.w_ov_link }}
          unzip -d ov ov.zip
          dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}"
        shell: bash
      - name: Build app
        run: |
          call .\ov\setupvars.bat
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
        env:
          CMAKE_TLS_VERIFY: 0
      - name: Download and convert model
        run: |
          call .\ov\setupvars.bat
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install -r ./samples/requirements.txt
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      - name: Run gtests
        run: |
          set PATH=.\build\openvino_genai\;%PATH%
          call .\ov\setupvars.bat
          .\build\tests\cpp\Release\tests_continuous_batching.exe
      - name: Run accuracy_sample
        run: |
          set PATH=.\build\openvino_genai\;%PATH%
          call .\ov\setupvars.bat
          .\build\tools\continuous_batching\accuracy\Release\continuous_batching_accuracy.exe -m .\TinyLlama-1.1B-Chat-v1.0\ -n 5
      - name: Run throughput_benchmark
        run: |
          curl -o .\ShareGPT_V3_unfiltered_cleaned_split.json -s -L "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
          set PATH=.\build\openvino_genai\;%PATH%
          call .\ov\setupvars.bat
          .\build\tools\continuous_batching\benchmark\Release\continuous_batching_benchmark.exe -n 2 -m .\TinyLlama-1.1B-Chat-v1.0\ --dataset .\ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1
  cpp-continuous-batching-macos:
    runs-on: macos-13
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          python-version: 3.9
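      # brew's coreutils provides the GNU timeout used by the accuracy step below.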
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          brew install coreutils scons
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert model
        run: |
          source ./ov/setupvars.sh
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install -r ./samples/requirements.txt
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      - name: Run gtests
        run: |
          source ./ov/setupvars.sh
          ./build/tests/cpp/tests_continuous_batching
      - name: Run accuracy_sample
        run: |
          source ./ov/setupvars.sh
          timeout --verbose 120s ./build/tools/continuous_batching/accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5
      - name: Run throughput_benchmark
        run: |
          wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
          source ./ov/setupvars.sh
          ./build/tools/continuous_batching/benchmark/continuous_batching_benchmark -n 5 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1
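  # Aggregate status job: branch protection can require this single check; it fails if any needed job failed or was cancelled.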
  Overall_Status:
    name: ci/gha_overall_status_causal_lm
    needs: [cpp-chat_sample-ubuntu, cpp-continuous-batching-ubuntu,
            visual_language_chat_sample-ubuntu-minicpm_v2_6, cpp-continuous-batching-windows, cpp-continuous-batching-macos]
    if: ${{ always() }}
    runs-on: ubuntu-latest
    steps:
      - name: Check status of all jobs
        if: >-
          ${{
            contains(needs.*.result, 'failure') ||
            contains(needs.*.result, 'cancelled')
          }}
        run: exit 1