# Move speculative decoding from streamer to metrics benchmarking appro… #10
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# CI workflow that builds the project against a nightly OpenVINO package and
# runs the chat sample, the visual-language chat/benchmark samples, and the
# continuous-batching tests and tools on Linux, Windows and macOS runners.
name: causal_lm_cpp

on:
  workflow_dispatch:
  pull_request:
  merge_group:
  push:
    branches:
      - master
      - 'releases/**'

permissions: read-all  # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions

concurrency:
  # github.ref is not unique in post-commit
  group: ${{ github.event_name == 'push' && github.run_id || github.ref }}-causal-lm-cpp
  cancel-in-progress: true

# Download links for the OpenVINO nightly packages, one per target platform
# (l_ = Ubuntu 20 archive, l_u22_ = Ubuntu 22 archive, m_ = macOS, w_ = Windows).
env:
  l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-18343-5e16b688156/openvino_toolkit_ubuntu20_2025.1.0.dev20250304_x86_64.tgz
  l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-18343-5e16b688156/openvino_toolkit_ubuntu22_2025.1.0.dev20250304_x86_64.tgz
  m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-18343-5e16b688156/openvino_toolkit_macos_12_6_2025.1.0.dev20250304_x86_64.tgz
  w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-18343-5e16b688156/openvino_toolkit_windows_2025.1.0.dev20250304_x86_64.zip

jobs:
  # Builds the project, exports TinyLlama-1.1B-Chat to OpenVINO format and
  # checks that the C++ and Python chat samples reproduce, byte for byte, a
  # reference transcript generated with Hugging Face transformers.
  cpp-chat_sample-ubuntu:
    # NOTE(review): the runner is ubuntu-24.04 while l_ov_link points at the
    # ubuntu20 archive - confirm this pairing is intentional.
    runs-on: ubuntu-24.04
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          # Quoted so the version is passed as a string rather than a YAML float.
          python-version: '3.11'
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install -r ./samples/requirements.txt
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      # Feeds six prompts to the samples on stdin, builds ref.txt with the
      # transformers model using greedy decoding (do_sample=False) over the
      # accumulated chat history, then diffs each sample's output against it.
      # The trailing 'question:' written after the loop mirrors the final
      # prompt line the samples are expected to emit before stdin is exhausted.
      - name: Compare
        env:
          PYTHONPATH: "./build"
        run: |
          source ./ov/setupvars.sh
          printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\n' > ./input.txt
          timeout 30s ./build/samples/cpp/text_generation/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt
          python -c "
          from transformers import AutoTokenizer, AutoModelForCausalLM
          model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
          tokenizer = AutoTokenizer.from_pretrained(model_id)
          model = AutoModelForCausalLM.from_pretrained(model_id)
          prompts = ['What is 2 + 2?', 'What is the previous answer?', 'Add 1 to it.', 'Subtract 5 from it.', 'Why is the sun yellow?', 'What was my first question?']
          def gen_prompt(prompt):
              return {'role': 'user', 'content': prompt}
          def gen_answer(answer):
              return {'role': 'assistant', 'content': answer}
          chat_history = []
          chat_prompt = ''
          output = open('ref.txt', 'w')
          for prompt in prompts:
              output.write('question:\n')
              chat_history.append(gen_prompt(prompt))
              chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
              tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False)
              answer = model.generate(**tokenized, max_length=1000, do_sample=False)
              answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
              chat_history.append(gen_answer(answer_str))
              output.write(answer_str)
              output.write('\n----------\n')
          output.write('question:\n')
          output.close()
          "
          diff pred.txt ref.txt
          echo "Chat sample cpp" passed
          timeout 30s ./samples/python/text_generation/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt
          diff pred2.txt ref.txt
          echo "Chat sample python" passed

  # Runs the visual_language_chat and benchmark_vlm samples (C++ and Python)
  # on a tiny random MiniCPM-V 2.6 model and requires the C++ and Python chat
  # outputs to be identical.
  visual_language_chat_sample-ubuntu-minicpm_v2_6:
    runs-on: ubuntu-22.04-16-cores
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          # Quoted so the version is passed as a string rather than a YAML float.
          python-version: '3.11'
      - uses: ./.github/actions/install_openvino
        with:
          ov_link: ${{ env.l_u22_ov_link }}
      - uses: ./.github/actions/build_app
        with:
          build_target: 'visual_language_chat benchmark_vlm py_openvino_genai'
      - uses: ./.github/actions/install_python_deps
      - name: Download and convert tiny-random-minicpmv-2_6 model and an image
        run: |
          source ./ov/setupvars.sh
          optimum-cli export openvino -m katuni4ka/tiny-random-minicpmv-2_6 tiny-random-minicpmv-2_6 --trust-remote-code --task image-text-to-text
          mkdir images
      # Produces two inputs under images/: a synthetic 28x28 RGB pattern and a
      # downloaded picture converted to RGB.
      - name: Generate images - tiny-random-minicpmv-2_6
        shell: python
        run: |
          from PIL import Image
          import numpy as np
          import requests
          res = 28, 28
          lines = np.arange(res[0] * res[1] * 3, dtype=np.uint8) % 255
          lines = lines.reshape([*res, 3])
          lines_image = Image.fromarray(lines)
          lines_image.save("images/lines.png")
          cat = Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw).convert('RGB')
          cat.save("images/cat.png")
      # The folded scalars below join into a single command line; pipefail is
      # set explicitly so a failing sample is not masked by `tee`.
      - name: Run visual_language_chat C++ sample - tiny-random-minicpmv-2_6
        run: >
          set -o pipefail
          && source ./ov/setupvars.sh
          && ./build/samples/cpp/visual_language_chat/visual_language_chat ./tiny-random-minicpmv-2_6/ ./images/
          <<< $'Describe the images?' | tee cpp.txt
        timeout-minutes: 2
      - name: Run benchmark_vlm C++ sample - tiny-random-minicpmv-2_6
        run: >
          set -o pipefail
          && source ./ov/setupvars.sh
          && ./build/samples/cpp/visual_language_chat/benchmark_vlm -m ./tiny-random-minicpmv-2_6/ -i ./images/cat.png -n 3
        timeout-minutes: 2
      - name: Run visual_language_chat Python sample - tiny-random-minicpmv-2_6
        run: >
          set -o pipefail
          && source ./ov/setupvars.sh
          && ./samples/python/visual_language_chat/visual_language_chat.py ./tiny-random-minicpmv-2_6/ ./images/
          <<< $'Describe the images?' | tee py.txt
        env:
          PYTHONPATH: "./build/"
      - name: Run benchmark_vlm Python sample - tiny-random-minicpmv-2_6
        run: >
          set -o pipefail
          && source ./ov/setupvars.sh
          && ./samples/python/visual_language_chat/benchmark_vlm.py -m ./tiny-random-minicpmv-2_6/ -i ./images/cat.png -n 3
        env:
          PYTHONPATH: "./build/"
      # Rewrites cpp.txt after decoding it as UTF-8 with errors="replace", so
      # undecodable bytes are substituted before the file is diffed.
      - name: Encode cpp.txt with Python encoding instead of terminal one
        shell: python
        run: |
          with open("cpp.txt", "rb") as f:
              content = f.read().decode("utf-8", "replace")
          with open("cpp.txt", "wb") as f:
              f.write(content.encode("utf-8"))
      - run: diff cpp.txt py.txt
      - name: Run visual_language_chat C++ sample with 2 prompts - tiny-random-minicpmv-2_6
        run: >
          set -o pipefail
          && source ./ov/setupvars.sh
          && ./build/samples/cpp/visual_language_chat/visual_language_chat ./tiny-random-minicpmv-2_6/ ./images/cat.png
          <<< $'What is unusual on this image?\nGo on.' | tee cpp2.txt
        timeout-minutes: 2
      - name: Run visual_language_chat Python sample with 2 prompts - tiny-random-minicpmv-2_6
        run: >
          set -o pipefail
          && source ./ov/setupvars.sh
          && ./samples/python/visual_language_chat/visual_language_chat.py ./tiny-random-minicpmv-2_6/ ./images/cat.png
          <<< $'What is unusual on this image?\nGo on.' | tee py2.txt
        env:
          PYTHONPATH: "./build/"
      # Same UTF-8 normalisation as for cpp.txt, applied to the two-prompt run.
      - name: Encode cpp2.txt with Python encoding instead of terminal one
        shell: python
        run: |
          with open("cpp2.txt", "rb") as f:
              content = f.read().decode("utf-8", "replace")
          with open("cpp2.txt", "wb") as f:
              f.write(content.encode("utf-8"))
      - run: diff cpp2.txt py2.txt

  # Builds the project on Linux and runs the continuous-batching gtests, the
  # accuracy tool and the throughput benchmark against TinyLlama-1.1B-Chat.
  cpp-continuous-batching-ubuntu:
    # NOTE(review): this 22.04 runner installs l_ov_link (ubuntu20 archive)
    # although l_u22_ov_link is defined - confirm this is intentional.
    runs-on: ubuntu-22.04-8-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          # Quoted so the version is passed as a string rather than a YAML float.
          python-version: '3.12'
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install -r ./samples/requirements.txt
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      - name: Run gtests
        run: |
          source ./ov/setupvars.sh
          ./build/tests/cpp/tests_continuous_batching
      - name: Run accuracy_sample
        run: |
          source ./ov/setupvars.sh
          timeout --verbose 50s ./build/tools/continuous_batching/accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5
      # Runs the benchmark twice on the ShareGPT dataset: once with default
      # scheduling and once with --dynamic_split_fuse and explicit limits.
      - name: Run throughput_benchmark
        run: |
          wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
          source ./ov/setupvars.sh
          timeout --verbose 200s ./build/tools/continuous_batching/benchmark/continuous_batching_benchmark -n 10 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1
          timeout --verbose 200s ./build/tools/continuous_batching/benchmark/continuous_batching_benchmark -n 10 --dynamic_split_fuse --max_batch_size 256 --max_input_len 256 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1

  # Windows counterpart of the continuous-batching job; steps run under cmd
  # unless a step overrides the shell.
  cpp-continuous-batching-windows:
    runs-on: windows-latest
    env:
      PYTHONIOENCODING: "utf8"
    defaults:
      run:
        shell: cmd
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          # Quoted so the version is passed as a string rather than a YAML float.
          python-version: '3.9'
      # Uses bash for the array/glob syntax: after unzipping, the contents of
      # the extracted top-level director(ies) are moved up into ov/ and the
      # emptied directories removed, so later steps find .\ov\setupvars.bat.
      - name: Install OpenVINO
        run: |
          curl --output ov.zip ${{ env.w_ov_link }}
          unzip -d ov ov.zip
          dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}"
        shell: bash
      - name: Build app
        run: |
          call .\ov\setupvars.bat
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
        env:
          CMAKE_TLS_VERIFY: 0
      - name: Download and convert and model
        run: |
          call .\ov\setupvars.bat
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install -r ./samples/requirements.txt
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      # Each run step below prepends .\build\openvino_genai\ to PATH before
      # launching the executable (presumably so the built DLLs are found -
      # TODO confirm).
      - name: Run gtests
        run: |
          set PATH=.\build\openvino_genai\;%PATH%
          call .\ov\setupvars.bat
          .\build\tests\cpp\Release\tests_continuous_batching.exe
      - name: Run accuracy_sample
        run: |
          set PATH=.\build\openvino_genai\;%PATH%
          call .\ov\setupvars.bat
          .\build\tools\continuous_batching\accuracy\Release\continuous_batching_accuracy.exe -m .\TinyLlama-1.1B-Chat-v1.0\ -n 5
      - name: Run throughput_benchmark
        run: |
          curl -o .\ShareGPT_V3_unfiltered_cleaned_split.json -s -L "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
          set PATH=.\build\openvino_genai\;%PATH%
          call .\ov\setupvars.bat
          .\build\tools\continuous_batching\benchmark\Release\continuous_batching_benchmark.exe -n 2 -m .\TinyLlama-1.1B-Chat-v1.0\ --dataset .\ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1

  # macOS counterpart of the continuous-batching job.
  cpp-continuous-batching-macos:
    runs-on: macos-13
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - uses: actions/setup-python@v4
        with:
          # Quoted so the version is passed as a string rather than a YAML float.
          python-version: '3.9'
      # coreutils is presumably installed for the `timeout` command used by
      # the accuracy step - TODO confirm; the need for scons is not visible here.
      - name: Install OpenVINO
        run: |
          mkdir ./ov/
          curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
          brew install coreutils scons
      - name: Build app
        run: |
          source ./ov/setupvars.sh
          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
          cmake --build ./build/ --config Release -j
      - name: Download and convert and model
        run: |
          source ./ov/setupvars.sh
          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
          python -m pip install -r ./samples/requirements.txt
          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
      - name: Run gtests
        run: |
          source ./ov/setupvars.sh
          ./build/tests/cpp/tests_continuous_batching
      - name: Run accuracy_sample
        run: |
          source ./ov/setupvars.sh
          timeout --verbose 120s ./build/tools/continuous_batching/accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5
      - name: Run throughput_benchmark
        run: |
          wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
          source ./ov/setupvars.sh
          ./build/tools/continuous_batching/benchmark/continuous_batching_benchmark -n 5 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1

  # Aggregation job: always runs after every job above and fails when any of
  # them ended in 'failure' or 'cancelled', giving a single status to gate on.
  Overall_Status:
    name: ci/gha_overall_status_causal_lm
    needs:
      - cpp-chat_sample-ubuntu
      - cpp-continuous-batching-ubuntu
      - visual_language_chat_sample-ubuntu-minicpm_v2_6
      - cpp-continuous-batching-windows
      - cpp-continuous-batching-macos
    if: ${{ always() }}
    runs-on: ubuntu-latest
    steps:
      - name: Check status of all jobs
        if: >-
          ${{
            contains(needs.*.result, 'failure') ||
            contains(needs.*.result, 'cancelled')
          }}
        run: exit 1