diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000..ebe747b783
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,28 @@
+BasedOnStyle: Google
+IndentWidth: 4
+UseTab: Never
+ColumnLimit: 120
+
+Language: Cpp
+Standard: Cpp11
+
+AccessModifierOffset: -4
+AlignConsecutiveMacros: true
+AllowAllArgumentsOnNextLine: false
+AllowAllConstructorInitializersOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: Empty
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakBeforeMultilineStrings: false
+BinPackArguments: false
+BinPackParameters: false
+CommentPragmas: '^#'
+DerivePointerAlignment: false
+FixNamespaceComments: true
+IndentCaseLabels: false
+IndentPPDirectives: AfterHash
+ForEachMacros:
+  - foreach
+  - FOREACH_CHILD
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000..dfeac125fb
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,67 @@
+###############################################################################
+# Set default behavior to automatically normalize line endings.
+###############################################################################
+* text=auto
+###############################################################################
+# Set default behavior for command prompt diff.
+#
+# This is needed for earlier builds of msysgit that do not have it on by
+# default for csharp files.
+# Note: This is only used by the command line.
+###############################################################################
+#*.cs diff=csharp
+*.py text eol=lf
+###############################################################################
+# Set the merge driver for project and solution files
+#
+# Merging from the command prompt will add diff markers to the files if there
+# are conflicts (Merging from VS is not affected by the settings below; in VS
+# the diff markers are never inserted). Diff markers may cause the following
+# file extensions to fail to load in VS. An alternative would be to treat
+# these files as binary, and thus they will always conflict and require user
+# intervention with every merge. To do so, just uncomment the entries below.
+###############################################################################
+#*.sln merge=binary
+#*.csproj merge=binary
+#*.vbproj merge=binary
+#*.vcxproj merge=binary
+#*.vcproj merge=binary
+#*.dbproj merge=binary
+#*.fsproj merge=binary
+#*.lsproj merge=binary
+#*.wixproj merge=binary
+#*.modelproj merge=binary
+#*.sqlproj merge=binary
+#*.wwaproj merge=binary
+###############################################################################
+# behavior for image files
+#
+# image files are treated as binary by default.
+###############################################################################
+#*.jpg binary
+#*.png binary
+#*.gif binary
+###############################################################################
+# diff behavior for common document formats
+#
+# Convert binary document formats to text before diffing them. This feature
+# is only available from the command line. Turn it on by uncommenting the
+# entries below.
+###############################################################################
+#*.doc diff=astextplain
+#*.DOC diff=astextplain
+#*.docx diff=astextplain
+#*.DOCX diff=astextplain
+#*.dot diff=astextplain
+#*.DOT diff=astextplain
+#*.pdf diff=astextplain
+#*.PDF diff=astextplain
+#*.rtf diff=astextplain
+#*.RTF diff=astextplain
+*.PNG filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.vsdx filter=lfs diff=lfs merge=lfs -text
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.svg filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000000..f908b5aceb
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,22 @@
+version: 2
+updates:
+  - package-ecosystem: "pip"
+    directory: "./"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "image_generation/stable_diffusion_1_5/cpp/scripts/"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "image_generation/lcm_dreamshaper_v7/cpp/scripts/"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "./tests/python_tests/"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "samples/"
+    schedule:
+      interval: "weekly"
diff --git a/.github/label_config.yml b/.github/label_config.yml
new file mode 100644
index 0000000000..a98691db81
--- /dev/null
+++ b/.github/label_config.yml
@@ -0,0 +1,13 @@
+# https://github.com/actions/labeler
+
+# Add label to the PRs changing files under llm_bench/
+llm_bench:
+- changed-files:
+  - any-glob-to-any-file:
+    - 'llm_bench/**'
+    - '.github/workflows/llm_bench-python.yml'
+
+WWB:
+- changed-files:
+  - any-glob-to-any-file:
+    - 'llm_bench/python/who_what_benchmark/**'
diff --git a/.github/workflows/assign_issue.yml b/.github/workflows/assign_issue.yml
new file mode 100644
index 0000000000..4a4579e2c7
--- /dev/null
+++ b/.github/workflows/assign_issue.yml
@@ -0,0 +1,25 @@
+name: Take Issue
+
+on:
+  issue_comment:
+    types:
+      - created
+      - edited
+
+permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions
+
+jobs:
+  take-issue:
+    name: Take issue
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+    timeout-minutes: 10
+    steps:
+      - name: take an issue
+        uses: bdougie/take-action@v1.6.1
+        with:
+          message: Thank you for looking into this issue! Please let us know if you have any questions or require any help.
+          issueCurrentlyAssignedMessage: Thanks for being interested in this issue. It looks like this ticket is already assigned to a contributor. Please communicate with the assigned contributor to confirm the status of the issue.
+          trigger: .take
+          token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml
new file mode 100644
index 0000000000..fce770d101
--- /dev/null
+++ b/.github/workflows/bandit.yml
@@ -0,0 +1,17 @@
+name: python -m bandit --recursive --configfile bandit.yml .
+on: + pull_request: + paths-ignore: + - 'thirdparty' + - '**.md' +permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions +jobs: + bandit: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: 3.11 + - run: python -m pip install bandit + - run: python -m bandit --recursive --configfile bandit.yml . diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 050e397148..2deca3f88e 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -1,203 +1,924 @@ name: causal_lm_cpp on: + workflow_dispatch: pull_request: - paths: - - .github/workflows/causal_lm_cpp.yml - - llm_bench/python/** - - text_generation/causal_lm/cpp/* - - thirdparty/openvino_contrib - - '!**.md' + merge_group: + push: + branches: + - master + - 'releases/**' +permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true + +env: + l_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16993-9c432a3641a/l_openvino_toolkit_ubuntu20_2024.5.0.dev20241014_x86_64.tgz + l_u22_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16993-9c432a3641a/l_openvino_toolkit_ubuntu22_2024.5.0.dev20241014_x86_64.tgz + m_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16993-9c432a3641a/m_openvino_toolkit_macos_12_6_2024.5.0.dev20241014_x86_64.tgz + w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16993-9c432a3641a/w_openvino_toolkit_windows_2024.5.0.dev20241014_x86_64.zip jobs: - cpp-greedy_causal_lm-ubuntu: + cpp-multinomial-greedy_causal_lm-ubuntu: runs-on: ubuntu-20.04-8-cores + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: submodules: recursive - uses: actions/setup-python@v4 with: - python-version: 3.8 - - run: ./text_generation/causal_lm/cpp/set_up_and_run.sh + python-version: 3.9 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model openlm-research/open_llama_3b_v2 open_llama_3b_v2 + - run: > + . ./ov/setupvars.sh + && PYTHONPATH=./build/:$PYTHONPATH timeout 25s + ./build/samples/cpp/multinomial_causal_lm/multinomial_causal_lm ./open_llama_3b_v2/ a + - run: > + . ./ov/setupvars.sh + && PYTHONPATH=./build/:$PYTHONPATH timeout 25s + ./samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./open_llama_3b_v2/ b + - run: > + . 
./ov/setupvars.sh + && export PYTHONPATH=./build/:$PYTHONPATH + && timeout 25s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./open_llama_3b_v2/ "return 0" + | diff <(timeout 25s samples/python/greedy_causal_lm/greedy_causal_lm.py ./open_llama_3b_v2/ "return 0") - + cpp-beam_search_causal_lm-ubuntu: + strategy: + matrix: + executable: + [ + ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm, + python ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py, + ] runs-on: ubuntu-20.04 + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: submodules: recursive - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: '3.10' - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2023.3.0-13739-294cc6668c4/l_openvino_toolkit_ubuntu20_2023.3.0.dev20231219_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager transformers==4.35.2 "optimum[openvino]>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python -m pip uninstall --yes openvino && python ./llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v0.6 --output_dir ./TinyLlama-1.1B-Chat-v0.6/ --precision FP16 --stateful & - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - wait + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Compare run: | source ./ov/setupvars.sh - convert_tokenizer ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/ --output ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/ --with-detokenizer + export PYTHONPATH=./build/:$PYTHONPATH # C++ ignores that - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/ 69 > ./pred.txt + timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Why is the Sun yellow?" 
> ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: predictions = file.read() - tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6') - tokenized = tokenizer('69', return_tensors='pt') - for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') + tokenized = tokenizer('Why is the Sun yellow?', return_tensors='pt') + for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') predictions = predictions[:idx] + predictions[idx + len(ref):] " - echo 69 passed + echo "Why is the Sun yellow?" passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/ Hi > ./pred.txt + timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ 69 > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: predictions = file.read() - tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6') - tokenized = tokenizer('Hi', return_tensors='pt') - for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') + tokenized = tokenizer('69', return_tensors='pt') + for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') predictions = predictions[:idx] + predictions[idx + len(ref):] " - echo Hi passed + echo 69 passed - timeout 25s ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/ "return 0" > ./pred.txt + timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ Hi > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: predictions = file.read() - tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6') - tokenized = tokenizer('return 0', return_tensors='pt') - for beam in 
transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') + tokenized = tokenizer('Hi', return_tensors='pt') + for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') predictions = predictions[:idx] + predictions[idx + len(ref):] " - echo return 0 passed + echo "Hi" passed - ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/ "" > ./pred.txt + timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "return 0" > ./pred.txt python -c " import transformers with open('pred.txt', 'r') as file: predictions = file.read() - tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6') - tokenized = tokenizer('', return_tensors='pt') - for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' + tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') + tokenized = tokenizer('return 0', return_tensors='pt') + for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) idx = predictions.find(ref) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') predictions = predictions[:idx] + predictions[idx + len(ref):] " - echo '""' passed + echo "return 0" passed - ./build/beam_search_causal_lm ./TinyLlama-1.1B-Chat-v0.6/pytorch/dldt/FP16/ "你好! 你好嗎?" > ./pred.txt + timeout 25s ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "你好! 你好嗎?" > ./pred.txt python -c " import transformers - with open('pred.txt', 'r') as file: + with open('pred.txt', 'r', errors='ignore') as file: predictions = file.read() - tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6') + tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') tokenized = tokenizer('你好! 
你好嗎?', return_tensors='pt') - for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): - ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' - idx = predictions.find(ref) + for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + idx = predictions.find(ref.replace('�', '')) if -1 == idx: raise RuntimeError(f'Missing "{ref=}" from predictions') predictions = predictions[:idx] + predictions[idx + len(ref):] " - echo 你好! 你好嗎? passed - cpp-beam_search_causal_lm-windows: + echo "你好! 你好嗎?" passed + + timeout 1m ${{ matrix.executable }} ./TinyLlama-1.1B-Chat-v1.0/ "Alan Turing was a" "return 0" "你好! 你好嗎?" > ./pred.txt + python -c " + import transformers + with open('pred.txt', 'r', errors='ignore') as file: + predictions = file.read() + tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') + prompts = [ + 'Alan Turing was a', + 'return 0', + '你好! 你好嗎?' + ] + for prompt in prompts: + tokenized = tokenizer(prompt, return_tensors='pt') + for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): + ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + idx = predictions.find(ref.replace('�', '')) + if -1 == idx: + raise RuntimeError(f'Missing "{ref=}" from predictions') + predictions = predictions[:idx] + predictions[idx + len(ref):] + " + echo "Multi prompt" passed + + cpp-greedy_causal_lm-windows: runs-on: windows-latest + env: + PYTHONIOENCODING: "utf8" + defaults: + run: + shell: cmd steps: - uses: actions/checkout@v4 with: submodules: recursive - uses: actions/setup-python@v4 with: - python-version: 3.8 - - uses: actions/checkout@v4 - with: - repository: TinyLlama/TinyLlama-1.1B-Chat-v0.6 - ref: bf9ae1c8bf026667e6f810768de259bb4a7f4777 - path: TinyLlama-1.1B-Chat-v0.6 - lfs: true - github-server-url: https://huggingface.co - - name: Install OpenVINO + python-version: 3.9 + - run: curl --output ov.zip ${{ env.w_ov_link }} + - run: unzip -d ov ov.zip + - run: dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}" shell: bash + - name: Build app run: | - curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2023.3.0-13739-294cc6668c4/w_openvino_toolkit_windows_2023.3.0.dev20231219_x86_64.zip - unzip ov.zip - - name: Download, convert and build - shell: cmd - run: | - call w_openvino_toolkit_windows_2023.3.0.dev20231219_x86_64\setupvars.bat - python -m pip install --upgrade-strategy eager transformers==4.35.2 "optimum[openvino]>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu - python -m 
pip uninstall --yes openvino - python ./llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v0.6 --output_dir ./TinyLlama-1.1B-Chat-v0.6/ --precision FP16 --stateful - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + call .\ov\setupvars.bat + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - - name: Compare - shell: cmd + - name: Download and convert model run: | - call w_openvino_toolkit_windows_2023.3.0.dev20231219_x86_64\setupvars.bat - convert_tokenizer .\TinyLlama-1.1B-Chat-v0.6\pytorch\dldt\FP16\ --output .\TinyLlama-1.1B-Chat-v0.6\pytorch\dldt\FP16\ --with-detokenizer - - .\build\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v0.6\pytorch\dldt\FP16\ "69" > .\pred.txt + call .\ov\setupvars.bat + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + - run: > + set PATH=.\build\openvino_genai\;%PATH% + && call .\ov\setupvars.bat + && .\build\samples\cpp\greedy_causal_lm\Release\greedy_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\cpp.txt + - run: | echo import transformers > ref.py - echo predictions = open('pred.txt', 'r').read() >> ref.py - echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6') >> ref.py + echo predictions = open('cpp.txt', 'r').read() >> ref.py + echo tokenizer = transformers.AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True) >> ref.py echo tokenized = tokenizer('69', return_tensors='pt') >> ref.py - echo for beam in transformers.LlamaForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v0.6').generate(**tokenized, num_beam_groups=3, num_beams=15, num_return_sequences=15, diversity_penalty=1.0, max_new_tokens=20, early_stopping=False, length_penalty=1.0, no_repeat_ngram_size=9**9, do_sample=False): >> ref.py - echo ref = ': ' + tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) + '\n' >> ref.py + echo for beam in transformers.AutoModelForCausalLM.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0', trust_remote_code=True).generate(**tokenized, max_new_tokens=100, do_sample=False): >> ref.py + echo ref = tokenizer.decode(beam[tokenized['input_ids'].numel():], skip_special_tokens=True) >> ref.py echo idx = predictions.find(ref) >> ref.py echo if -1 == idx: >> ref.py echo raise RuntimeError(f'Missing "{ref=}" from predictions') >> ref.py echo predictions = predictions[:idx] + predictions[idx + len(ref):] >> ref.py - python ref.py - cpp-beam_search_causal_lm-Qwen-7B-Chat: - if: false # TODO: enable after shape inference failure is fixed + - run: python ref.py + - run: > + set PATH=.\build\openvino_genai\;%PATH% + && set "PYTHONPATH=./build/" + && call .\ov\setupvars.bat + && python samples\python\greedy_causal_lm\greedy_causal_lm.py .\TinyLlama-1.1B-Chat-v1.0\ 69 > .\py.txt + - run: fc .\cpp.txt .\py.txt + + cpp-greedy_causal_lm-Qwen-7B-Chat: + runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: 
actions/setup-python@v4 + with: + python-version: 3.11 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat + - run: > + . ./ov/setupvars.sh + && export PYTHONPATH=./build/:$PYTHONPATH + && timeout 2m ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ 69 | diff <(timeout 2m samples/python/greedy_causal_lm/greedy_causal_lm.py ./Qwen-7B-Chat/ 69) - + + cpp-beam_search_causal_lm-Qwen1_5-7B-Chat: + runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.12 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen1.5-7B-Chat Qwen1.5-7B-Chat + - run: > + . ./ov/setupvars.sh + && export PYTHONPATH=./build/:$PYTHONPATH + && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./Qwen1.5-7B-Chat/ "你好!" 
+ | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./Qwen1.5-7B-Chat/ "你好!") - + + cpp-beam_search_causal_lm-Phi-2: + runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-2 phi-2 + - run: > + . ./ov/setupvars.sh + && export PYTHONPATH=./build/:$PYTHONPATH + && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./phi-2/ 69 + | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./phi-2/ 69) - + + cpp-beam_search_causal_lm-notus-7b-v1: + runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model argilla/notus-7b-v1 notus-7b-v1 + - run: > + . 
./ov/setupvars.sh + && export PYTHONPATH=./build/:$PYTHONPATH + && timeout 50s ./build/samples/cpp/beam_search_causal_lm/beam_search_causal_lm ./notus-7b-v1/ 69 + | diff <(timeout 50s ./samples/python/beam_search_causal_lm/beam_search_causal_lm.py ./notus-7b-v1/ 69) - + + cpp-speculative_decoding_lm-ubuntu: + runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.11 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-3b dolly-v2-3b + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model databricks/dolly-v2-7b dolly-v2-7b + - name: run and compare + run: | + source ./ov/setupvars.sh + ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt + python -c " + with open('predictions_greedy.txt', 'r') as f: + predicted_greedy = f.readline() + with open('predictions_speculative.txt', 'r') as f: + predicted_speculative = f.readline() + assert predicted_greedy == predicted_speculative + " + echo "Alan Turing was a" passed + + cpp-prompt_lookup_decoding_lm-ubuntu: + runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.12 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model Qwen/Qwen-7B-Chat Qwen-7B-Chat --task text-generation-with-past + - name: run and compare + run: | + source ./ov/setupvars.sh + + echo 
'Code:```python + def add(a, b): + return a + b + ``` + Question: Can you please add 2 and 3 + A:' > ./prompt.txt + + ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_prompt_lookup.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$(<prompt.txt)" > predictions_greedy.txt + python -c " + with open('predictions_greedy.txt', 'r') as f: + predicted_greedy = f.readline() + with open('predictions_prompt_lookup.txt', 'r') as f: + predicted_prompt_lookup = f.readline() + assert predicted_greedy == predicted_prompt_lookup + " + echo "Prompt lookup" passed + - name: run and compare (model with seq_length_axis = 1) + run: | + source ./ov/setupvars.sh + + echo 'Code:```python + def add(a, b): + return a + b + ``` + Question: Can you please add 2 and 3 + A:' > ./prompt.txt + + ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./Qwen-7B-Chat/ "$(<prompt.txt)" > predictions_prompt_lookup.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./Qwen-7B-Chat/ "$(<prompt.txt)" > predictions_greedy.txt + + python -c " + with open('predictions_greedy.txt', 'r') as f: + predicted_greedy = f.readline() + with open('predictions_prompt_lookup.txt', 'r') as f: + predicted_prompt_lookup = f.readline() + assert predicted_greedy == predicted_prompt_lookup + " + echo "Prompt lookup" passed + + cpp-Phi-1_5: runs-on: ubuntu-20.04-16-cores + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model microsoft/phi-1_5 phi-1_5 + - name: Run Generation + run: | + source ./ov/setupvars.sh + timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./phi-1_5/ "Alan Turing was a" > ./pred_greedy.txt + - name: Compare + run: | + python -c " + import transformers + with open('pred_greedy.txt', 'r') as file: + predictions = file.read() + tokenizer = transformers.AutoTokenizer.from_pretrained('microsoft/phi-1_5') + tokenized = tokenizer('Alan Turing was a', return_tensors='pt') + for output in transformers.AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5').generate(**tokenized, max_length=100, do_sample=False): + ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + idx = predictions.find(ref) + if -1 == idx: + raise RuntimeError(f'Missing "{ref=}" from predictions') + predictions = predictions[:idx] + predictions[idx + len(ref):] + " + echo Phi-1_5 passed + - run: > + . 
./ov/setupvars.sh + && export PYTHONPATH=./build/:$PYTHONPATH + && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./phi-1_5/ "Alan Turing was a" + | diff ./pred_greedy.txt - + + cpp-greedy_causal_lm-redpajama-3b-chat: + runs-on: ubuntu-20.04-4-cores + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model ikala/redpajama-3b-chat redpajama-3b-chat + - name: Run Generation + run: | + source ./ov/setupvars.sh + timeout 50s ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./redpajama-3b-chat/ "Alan Turing was a" > ./pred_greedy.txt + - name: Compare + run: | + python -c " + import transformers + with open('pred_greedy.txt', 'r') as file: + predictions = file.read() + tokenizer = transformers.AutoTokenizer.from_pretrained('ikala/redpajama-3b-chat') + tokenized = tokenizer('Alan Turing was a', return_tensors='pt') + for output in transformers.AutoModelForCausalLM.from_pretrained('ikala/redpajama-3b-chat').generate(**tokenized, max_length=100, do_sample=False): + ref = tokenizer.decode(output[tokenized['input_ids'].numel():], skip_special_tokens=True) + idx = predictions.find(ref) + if -1 == idx: + raise RuntimeError(f'Missing "{ref}" from predictions') + predictions = predictions[:idx] + predictions[idx + len(ref):] + " + echo "Alan Turing was a" passed + - run: > + . 
./ov/setupvars.sh + && export PYTHONPATH=./build/:$PYTHONPATH + && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a" + | diff ./pred_greedy.txt - + + cpp-chat_sample-ubuntu: + runs-on: ubuntu-24.04 + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 with: submodules: recursive - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: 3.11 - name: Install OpenVINO run: | mkdir ./ov/ - curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2023.3.0-13739-294cc6668c4/l_openvino_toolkit_ubuntu20_2023.3.0.dev20231219_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - name: Download, convert and build + - name: Build app run: | source ./ov/setupvars.sh - python -m pip install --upgrade-strategy eager transformers==4.35.2 "optimum[openvino]>=1.14" -r ./llm_bench/python/requirements.txt ./thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python -m pip uninstall --yes openvino && python ./llm_bench/python/convert.py --model_id Qwen/Qwen-7B-Chat --output_dir ./Qwen-7B-Chat/ --precision FP16 --stateful & - cmake -DCMAKE_BUILD_TYPE=Release -S ./text_generation/causal_lm/cpp/ -B ./build/ + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release -j - wait + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - name: Compare run: | source ./ov/setupvars.sh - convert_tokenizer ./Qwen-7B-Chat/pytorch/dldt/FP16/ --output ./Qwen-7B-Chat/pytorch/dldt/FP16/ --with-detokenizer --trust-remote-code - timeout 25s ./build/beam_search_causal_lm ./Qwen-7B-Chat/pytorch/dldt/FP16/ 69 > ./pred.txt + printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\n' > ./input.txt + timeout 30s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt + python -c " + from transformers import LlamaTokenizer, AutoModelForCausalLM + model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' + tokenizer = LlamaTokenizer.from_pretrained(model_id) + model = AutoModelForCausalLM.from_pretrained(model_id) + prompts = ['What is 2 + 2?', 'What is the previous answer?', 'Add 1 to it.', 'Subtract 5 from it.', 'Why is the sun yellow?', 'What was my first question?'] + def gen_prompt(prompt): + return {'role': 'user', 'content': prompt} + def gen_answer(answer): + return {'role': 'assistant', 'content': answer} + chat_history = [] + chat_prompt = '' + output = open('ref.txt', 'w') + for prompt in prompts: + output.write('question:\n') + chat_history.append(gen_prompt(prompt)) + chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(chat_prompt, 
return_tensors='pt', add_special_tokens=False) + answer = model.generate(**tokenized, max_length=1000, do_sample=False) + answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) + chat_history.append(gen_answer(answer_str)) + output.write(answer_str) + output.write('\n----------\n') + output.write('question:\n') + output.close() + " + diff pred.txt ref.txt + echo "Chat sample cpp" passed + export PYTHONPATH=./build/:$PYTHONPATH + timeout 30s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt + diff pred2.txt ref.txt + echo "Chat sample python" passed + + visual_language_chat_sample-ubuntu: + runs-on: ubuntu-22.04-16-cores + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.11 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_u22_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --target visual_language_chat py_generate_pipeline -j + - name: Download and convert MiniCPM-V-2_6 model and an image + run: | + source ./ov/setupvars.sh + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt opencv-python --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code + wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 --output-document cat.jpg + - name: Generate reference + shell: python + run: | + from optimum.intel.openvino import OVModelForVisualCausalLM + from transformers import AutoProcessor + from PIL import Image + import requests + import cv2 + import numpy as np + res = 448, 448 + im = np.arange(res[0] * res[1] * 3, dtype=np.uint8) % 255 + im = im.reshape([*res, 3]) + cv2.imwrite("lines.png", im) + model_id = "openbmb/MiniCPM-V-2_6" + processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) + prompt = processor.tokenizer.apply_chat_template([{"role": "user", "content": "(<image>./</image>)\nWhat is unusual on this image?"}], tokenize=False, add_generation_prompt=True) + image = Image.open("/home/vzlobin/r/g/g.png").convert('RGB') + # image = Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw).convert('RGB') + model = OVModelForVisualCausalLM.from_pretrained("MiniCPM-V-2_6", trust_remote_code=True) + inputs = processor([prompt], [image], return_tensors="pt") + result = model.generate(**inputs, max_new_tokens=200) + decoded = processor.tokenizer.batch_decode(result[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0] + print(decoded) + with open("ref.txt", "w") as f: + f.write(decoded) + + - name: Run visual_language_chat sample - MiniCPM-V-2_6 + run: > + source ./ov/setupvars.sh + && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./MiniCPM-V-2_6/ cat.jpg + <<< $'What is on the image?\nWhat is special on the image?' 
+ - name: Download and convert LLaVa 1.5 model and an image + run: | + source ./ov/setupvars.sh + optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/ + wget https://llava-vl.github.io/static/images/monalisa.jpg + - name: Run visual_language_chat sample - LLaVa 1.5 + run: > + source ./ov/setupvars.sh + && ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_1_5_7b_ov/ monalisa.jpg + <<< $'Who drew this painting?\nWhen did the painter live?' + timeout-minutes: 4 + + - name: Run python chat sample + run: | + source ./ov/setupvars.sh + export PYTHONPATH=./build/:$PYTHONPATH + printf 'What is on the image?\nWhat is special on the image?\n' > ./input.txt + timeout 120s python ./samples/python/visual_language_chat/visual_language_chat.py ./MiniCPM-V-2_6/ cat.jpg < input.txt > ./pred.txt + + cpp-continuous-batching-ubuntu: + runs-on: ubuntu-20.04-8-cores + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.12 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + sudo ./ov/install_dependencies/install_openvino_dependencies.sh + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + - name: Run gtests + run: | + source ./ov/setupvars.sh + ./build/tests/cpp/tests_continuous_batching + - name: Run accuracy_sample + run: | + source ./ov/setupvars.sh + timeout 50s ./build/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5 + - name: Run throughput_benchmark + run: | + wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + source ./ov/setupvars.sh + timeout 200s ./build/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark -n 10 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1 + timeout 200s ./build/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark -n 10 --dynamic_split_fuse --max_batch_size 256 --max_input_len 256 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1 + + cpp-continuous-batching-windows: + runs-on: windows-latest + env: + PYTHONIOENCODING: "utf8" + defaults: + run: + shell: cmd + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install OpenVINO + run: | + curl --output ov.zip ${{ env.w_ov_link }} + unzip -d ov ov.zip + dirs=(ov/*) && mv ov/*/* ov && rmdir "${dirs[@]}" + shell: bash + - name: Build app + run: | + call .\ov\setupvars.bat + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config 
Release -j + - name: Download and convert and model + run: | + call .\ov\setupvars.bat + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + - name: Run gtests + run: | + set PATH=.\build\openvino_genai\;%PATH% + call .\ov\setupvars.bat + .\build\tests\cpp\Release\tests_continuous_batching.exe + - name: Run accuracy_sample + run: | + set PATH=.\build\openvino_genai\;%PATH% + call .\ov\setupvars.bat + .\build\samples\cpp\continuous_batching_accuracy\Release\continuous_batching_accuracy.exe -m .\TinyLlama-1.1B-Chat-v1.0\ -n 5 + - name: Run throughput_benchmark + run: | + curl -o .\ShareGPT_V3_unfiltered_cleaned_split.json -s -L "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json" + set PATH=.\build\openvino_genai\;%PATH% + call .\ov\setupvars.bat + .\build\samples\cpp\continuous_batching_benchmark\Release\continuous_batching_benchmark.exe -n 2 -m .\TinyLlama-1.1B-Chat-v1.0\ --dataset .\ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1 + + cpp-continuous-batching-macos: + runs-on: macos-12 + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install OpenVINO + run: | + mkdir ./ov/ + curl ${{ env.m_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz + brew install coreutils scons + - name: Build app + run: | + source ./ov/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + - name: Download and convert and model + run: | + source ./ov/setupvars.sh + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + - name: Run gtests + run: | + source ./ov/setupvars.sh + ./build/tests/cpp/tests_continuous_batching + - name: Run accuracy_sample + run: | + source ./ov/setupvars.sh + timeout 120s ./build/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy -m ./TinyLlama-1.1B-Chat-v1.0/ -n 5 + - name: Run throughput_benchmark + run: | + wget -q https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + source ./ov/setupvars.sh + ./build/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark -n 5 -m ./TinyLlama-1.1B-Chat-v1.0/ --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json --cache_size 1 + + Overall_Status: + name: ci/gha_overall_status_causal_lm + needs: [cpp-multinomial-greedy_causal_lm-ubuntu, cpp-beam_search_causal_lm-ubuntu, cpp-greedy_causal_lm-windows, + cpp-greedy_causal_lm-Qwen-7B-Chat, 
cpp-beam_search_causal_lm-Qwen1_5-7B-Chat, cpp-beam_search_causal_lm-Phi-2,
+            cpp-beam_search_causal_lm-notus-7b-v1, cpp-speculative_decoding_lm-ubuntu, cpp-prompt_lookup_decoding_lm-ubuntu,
+            cpp-Phi-1_5, cpp-greedy_causal_lm-redpajama-3b-chat, cpp-chat_sample-ubuntu, cpp-continuous-batching-ubuntu,
+            visual_language_chat_sample-ubuntu,
+            cpp-continuous-batching-windows, cpp-continuous-batching-macos]
+    if: ${{ always() }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check status of all jobs
+        if: >-
+          ${{
+            contains(needs.*.result, 'failure') ||
+            contains(needs.*.result, 'cancelled')
+          }}
+        run: exit 1
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
new file mode 100644
index 0000000000..ddb40f29bc
--- /dev/null
+++ b/.github/workflows/labeler.yml
@@ -0,0 +1,23 @@
+# https://github.com/actions/labeler
+name: label PRs
+
+on:
+  pull_request_target:
+    types: [opened, edited, synchronize]
+    paths:
+      - llm_bench/python/**
+      - .github/workflows/llm_bench-python.yml
+
+permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions
+
+jobs:
+  labeler:
+    permissions:
+      contents: read
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/labeler@v5
+        with:
+          configuration-path: '.github/label_config.yml'
diff --git a/.github/workflows/lcm_dreamshaper_cpp.yml b/.github/workflows/lcm_dreamshaper_cpp.yml
new file mode 100644
index 0000000000..94ebcafe7b
--- /dev/null
+++ b/.github/workflows/lcm_dreamshaper_cpp.yml
@@ -0,0 +1,136 @@
+name: lcm_dreamshaper
+
+on:
+  workflow_dispatch:
+  pull_request:
+  merge_group:
+  push:
+    branches:
+      - master
+      - 'releases/**'
+
+permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions
+
+env:
+  PYTHON_VERSION: '3.9'
+  LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16993-9c432a3641a/l_openvino_toolkit_ubuntu20_2024.5.0.dev20241014_x86_64.tgz
+  WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16993-9c432a3641a/w_openvino_toolkit_windows_2024.5.0.dev20241014_x86_64.zip
+  OV_INSTALL_DIR: ${{ github.workspace }}/ov
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  lcm_dreamshaper_v7_cpp-linux:
+    runs-on: ubuntu-22.04
+    defaults:
+      run:
+        shell: bash -l {0}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Download OpenVINO archive
+        run: |
+          wget ${{ env.LINUX_OV_ARCHIVE_URL}} --progress=bar:force:noscroll -O openvino_package.tar.gz
+          mkdir ${{ env.OV_INSTALL_DIR }}
+          tar -xzf openvino_package.tar.gz -C ${{ env.OV_INSTALL_DIR }} --strip-components=1
+
+      - name: Build app
+        run: |
+          source ${{ env.OV_INSTALL_DIR }}/setupvars.sh
+          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
+          cmake --build ./build/ --config Release --parallel
+
+      - name: Setup Python ${{ env.PYTHON_VERSION }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+          cache: 'pip'
+
+      - name: Create virtual environment
+        run: python3 -m venv openvino_lcm_cpp
+
+      - name: Install python dependencies
+        run: |
+          source openvino_lcm_cpp/bin/activate
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -U "optimum<1.23" --no-dependencies
+
+      - name: Download and convert models and tokenizer
+        run: |
+          source openvino_lcm_cpp/bin/activate
+          optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --task stable-diffusion --weight-format fp16 models/lcm_dreamshaper_v7/FP16
+
+      - name: Run app
+        run: |
+          source ${{ env.OV_INSTALL_DIR }}/setupvars.sh
+          ./build/samples/cpp/text2image/stable_diffusion ./models/lcm_dreamshaper_v7/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting"
+
+  lcm_dreamshaper_v7_cpp-windows:
+    runs-on: windows-latest
+    defaults:
+      run:
+        shell: pwsh
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Download OpenVINO archive
+        run: |
+          mkdir ${{ env.OV_INSTALL_DIR }}
+          pushd ${{ env.OV_INSTALL_DIR }}
+          Invoke-WebRequest "${{ env.WINDOWS_OV_ARCHIVE_URL}}" -OutFile "openvino_package.zip"
+          Expand-Archive openvino_package.zip -DestinationPath ./tmp
+          mv ./tmp/*/* .
+          popd
+
+      - name: Build app
+        run: |
+          . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
+          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
+          cmake --build ./build/ --config Release --parallel
+
+      - name: Setup Python ${{ env.PYTHON_VERSION }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+          cache: 'pip'
+
+      - name: Create virtual environment
+        run: python -m venv openvino_lcm_cpp
+
+      - name: Install python dependencies
+        run: |
+          . "./openvino_lcm_cpp/Scripts/Activate.ps1"
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install -U "optimum<1.23" --no-dependencies
+
+      - name: Download and convert models and tokenizer
+        run: |
+          . "./openvino_lcm_cpp/Scripts/Activate.ps1"
+          optimum-cli export openvino --model SimianLuo/LCM_Dreamshaper_v7 --task stable-diffusion --weight-format fp16 models/lcm_dreamshaper_v7/FP16
+
+      - name: Run app
+        run: >
+          . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
+          & "./build/samples/cpp/text2image/Release/stable_diffusion.exe ./models/lcm_dreamshaper_v7/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'"
+
+  Overall_Status:
+    name: ci/gha_overall_status_lcm
+    needs: [lcm_dreamshaper_v7_cpp-linux, lcm_dreamshaper_v7_cpp-windows]
+    if: ${{ always() }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check status of all jobs
+        if: >-
+          ${{
+            contains(needs.*.result, 'failure') ||
+            contains(needs.*.result, 'cancelled')
+          }}
+        run: exit 1
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
new file mode 100644
index 0000000000..2f327ecf31
--- /dev/null
+++ b/.github/workflows/linux.yml
@@ -0,0 +1,481 @@
+name: Linux (Ubuntu 20.04, Python 3.9)
+on:
+  workflow_dispatch:
+  pull_request:
+  merge_group:
+  push:
+    branches:
+      - master
+      - 'releases/**'
+
+concurrency:
+  # github.ref is not unique in post-commit
+  group: ${{ github.event_name == 'push' && github.run_id || github.ref }}-linux
+  cancel-in-progress: true
+
+env:
+  PYTHON_VERSION: '3.9'
+  OV_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref || github.ref }}
+  OV_TARBALL: ''
+
+permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions
+
+jobs:
+  openvino_download:
+    name: Download OpenVINO package
+    outputs:
+      status: ${{ steps.openvino_download.outcome }}
+    timeout-minutes: 10
+    defaults:
+      run:
+        shell: bash
+    runs-on: ubuntu-20.04
+
+    steps:
+      - name: Download OpenVINO build
+        id: openvino_download
+        run: |
+          wget ${{ env.OV_TARBALL}} --progress=bar:force:noscroll -O openvino_package.tar.gz
+          tar -tvf openvino_package.tar.gz
+        continue-on-error: true
+
+      #
+      # Upload to artifacts
+      #
+
+      - name: Upload openvino package
+        if: steps.openvino_download.outcome == 'success'
+        uses: actions/upload-artifact@v4
+        with:
+          name: openvino_package
+          path: openvino_package.tar.gz
+          if-no-files-found: 'error'
+
+  openvino_build:
+    name: Build OpenVINO package
+    needs: [openvino_download]
+    if: needs.openvino_download.outputs.status != 'success'
+    timeout-minutes: 150
+    defaults:
+      run:
+        shell: bash
+    runs-on: ubuntu-20.04-16-cores
+    env:
+      DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input
+      CMAKE_BUILD_TYPE: 'Release'
+      CMAKE_GENERATOR: 'Ninja Multi-Config'
+      CMAKE_CXX_COMPILER_LAUNCHER: ccache
+      CMAKE_C_COMPILER_LAUNCHER: ccache
+      OPENVINO_REPO: ${{ github.workspace }}/openvino
+      INSTALL_DIR: ${{ github.workspace }}/openvino/install
+      BUILD_DIR: ${{ github.workspace }}/openvino/build
+      CCACHE_DIR: ${{ github.workspace }}/ccache
+      CCACHE_MAXSIZE: 2000Mi
+
+    steps:
+      - name: Set apt
+        run: |
+          echo 'Acquire::Retries "10";' | sudo tee -a /etc/apt/apt.conf.d/80-retries > /dev/null
+          echo 'APT::Get::Assume-Yes "true";' | sudo tee -a /etc/apt/apt.conf.d/81-assume-yes > /dev/null
+          echo 'APT::Get::Fix-Broken "true";' | sudo tee -a /etc/apt/apt.conf.d/82-fix-broken > /dev/null
+          echo 'APT::Get::no-install-recommends "true";' | sudo tee -a /etc/apt/apt.conf.d/83-no-reсommends > /dev/null
+
+      - name: Clone OpenVINO
+        uses: actions/checkout@v4
+        with:
+          repository: 'openvinotoolkit/openvino'
+          path: ${{ env.OPENVINO_REPO }}
+          submodules: 'true'
+          ref: ${{ env.OV_BRANCH}}
+
+      #
+      # Dependencies
+      #
+
+      - name: Install build dependencies
+        run: |
+          sudo -E ${OPENVINO_REPO}/install_build_dependencies.sh
+          sudo apt-get install ccache
+
+      - name: Setup Python ${{ env.PYTHON_VERSION
}} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Install python dependencies + run: | + # For Python API: build and wheel packaging + python3 -m pip install -r ${OPENVINO_REPO}/src/bindings/python/wheel/requirements-dev.txt + + # + # Build + # + + - name: Setup ccache + uses: actions/cache@v4 + with: + # Should save cache only if run in the master branch of the base repo + # github.ref_name is 'ref/PR_#' in case of the PR, and 'branch_name' when executed on push + save-always: ${{ github.ref_name == 'master' && 'true' || 'false' }} + path: ${{ env.CCACHE_DIR }} + key: ${{ runner.os }}-${{ runner.arch }}-ccache-ov-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-${{ runner.arch }}-ccache-ov + + - name: CMake configure - OpenVINO + run: | + cmake \ + -G "${{ env.CMAKE_GENERATOR }}" \ + -DENABLE_CPPLINT=OFF \ + -DENABLE_NCC_STYLE=OFF \ + -DENABLE_TESTS=OFF \ + -DENABLE_STRICT_DEPENDENCIES=OFF \ + -DENABLE_SYSTEM_OPENCL=ON \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCPACK_GENERATOR=TGZ \ + -DENABLE_JS=OFF \ + -DENABLE_SAMPLES=ON \ + -DENABLE_INTEL_NPU=OFF \ + -DENABLE_OV_ONNX_FRONTEND=OFF \ + -DENABLE_OV_PADDLE_FRONTEND=OFF \ + -DENABLE_OV_PYTORCH_FRONTEND=ON \ + -DENABLE_OV_TF_FRONTEND=ON \ + -DENABLE_OV_TF_LITE_FRONTEND=OFF \ + -DENABLE_INTEL_GPU=OFF \ + -DENABLE_INTEL_NPU=OFF \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DENABLE_PYTHON=ON \ + -DENABLE_WHEEL=ON \ + -S ${OPENVINO_REPO} \ + -B ${BUILD_DIR} + + - name: Clean ccache stats + run: ccache --zero-stats --show-config + + - name: Cmake build - OpenVINO + run: cmake --build ${BUILD_DIR} --parallel --config ${{ env.CMAKE_BUILD_TYPE }} + + - name: Show ccache stats + run: ccache --show-stats + + - name: Cmake install - OpenVINO + run: | + cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/openvino_package -P ${BUILD_DIR}/cmake_install.cmake + cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/openvino_package -DCOMPONENT=python_wheels -P ${BUILD_DIR}/cmake_install.cmake + + - name: Pack Artifacts + run: | + pushd ${INSTALL_DIR} + tar -czvf ${BUILD_DIR}/openvino_package.tar.gz * + popd + + # + # Upload build artifacts and logs + # + + - name: Upload openvino package + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: openvino_package + path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz + if-no-files-found: 'error' + + genai_python_lib: + name: OpenVINO genai extension (cmake + wheel) + needs: [ openvino_download, openvino_build ] + if: | + always() && + (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success') + timeout-minutes: 120 + defaults: + run: + shell: bash + runs-on: ubuntu-20.04-16-cores + env: + CMAKE_GENERATOR: Unix Makefiles + CMAKE_BUILD_PARALLEL_LEVEL: null + OV_INSTALL_DIR: ${{ github.workspace }}/ov + CCACHE_DIR: ${{ github.workspace }}/ccache + CCACHE_MAXSIZE: 500Mi + CMAKE_CXX_COMPILER_LAUNCHER: ccache + CMAKE_C_COMPILER_LAUNCHER: ccache + + steps: + - name: Clone openvino.genai + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Download OpenVINO package + uses: actions/download-artifact@v4 + with: + name: openvino_package + path: ${{ env.OV_INSTALL_DIR }} + + - name: Extract OpenVINO packages + run: | + pushd ${OV_INSTALL_DIR} + tar -xzf openvino_package.tar.gz -C ${OV_INSTALL_DIR} --strip-components=1 
+ popd + + - name: Set apt + run: | + echo 'Acquire::Retries "10";' | sudo tee -a /etc/apt/apt.conf.d/80-retries > /dev/null + echo 'APT::Get::Assume-Yes "true";' | sudo tee -a /etc/apt/apt.conf.d/81-assume-yes > /dev/null + echo 'APT::Get::Fix-Broken "true";' | sudo tee -a /etc/apt/apt.conf.d/82-fix-broken > /dev/null + echo 'APT::Get::no-install-recommends "true";' | sudo tee -a /etc/apt/apt.conf.d/83-no-reсommends > /dev/null + + - name: Install build dependencies + run: | + sudo ${OV_INSTALL_DIR}/install_dependencies/install_openvino_dependencies.sh + sudo apt-get install ccache + + - name: Setup ccache + uses: actions/cache@v4 + with: + # Should save cache only if run in the master branch of the base repo + # github.ref_name is 'ref/PR_#' in case of the PR, and 'branch_name' when executed on push + save-always: ${{ github.ref_name == 'master' && 'true' || 'false' }} + path: ${{ env.CCACHE_DIR }} + key: ${{ runner.os }}-${{ runner.arch }}-ccache-genai-release-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-${{ runner.arch }}-ccache-genai-release + + - name: Build genai + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + + - name: Test bindings + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pytest ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py + env: + PYTHONPATH: "./build/:$PYTHONPATH" + + - name: Test bindings (wheel) + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels + python -m pytest ./tests/python_tests --ignore ./tests/python_tests/test_whisper_generate_api.py + + genai_python_lib_whisper: + name: OpenVINO genai extension whisper tests (cmake + wheel) + needs: [ openvino_download, openvino_build ] + if: | + always() && + (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success') + timeout-minutes: 90 + defaults: + run: + shell: bash + runs-on: ubuntu-20.04-16-cores + env: + CMAKE_GENERATOR: Unix Makefiles + CMAKE_BUILD_PARALLEL_LEVEL: null + OV_INSTALL_DIR: ${{ github.workspace }}/ov + CCACHE_DIR: ${{ github.workspace }}/ccache + CCACHE_MAXSIZE: 500Mi + CMAKE_CXX_COMPILER_LAUNCHER: ccache + CMAKE_C_COMPILER_LAUNCHER: ccache + + steps: + - name: Clone openvino.genai + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Download OpenVINO package + uses: actions/download-artifact@v4 + with: + name: openvino_package + path: ${{ env.OV_INSTALL_DIR }} + + - name: Extract OpenVINO packages + run: | + pushd ${OV_INSTALL_DIR} + tar -xzf openvino_package.tar.gz -C ${OV_INSTALL_DIR} --strip-components=1 + popd + + - name: Set apt + run: | + echo 'Acquire::Retries "10";' | sudo tee -a /etc/apt/apt.conf.d/80-retries > /dev/null + echo 'APT::Get::Assume-Yes "true";' | sudo tee -a /etc/apt/apt.conf.d/81-assume-yes > /dev/null + echo 'APT::Get::Fix-Broken "true";' | sudo tee -a /etc/apt/apt.conf.d/82-fix-broken > /dev/null + echo 'APT::Get::no-install-recommends "true";' | sudo tee -a /etc/apt/apt.conf.d/83-no-reсommends > /dev/null + + - name: Install build dependencies + run: | + sudo 
${OV_INSTALL_DIR}/install_dependencies/install_openvino_dependencies.sh + sudo apt-get install ccache + + - name: Setup ccache + uses: actions/cache@v4 + with: + # Should save cache only if run in the master branch of the base repo + # github.ref_name is 'ref/PR_#' in case of the PR, and 'branch_name' when executed on push + save-always: ${{ github.ref_name == 'master' && 'true' || 'false' }} + path: ${{ env.CCACHE_DIR }} + key: ${{ runner.os }}-${{ runner.arch }}-ccache-genai-release-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-${{ runner.arch }}-ccache-genai-release + + - name: Build genai + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + + - name: Test bindings + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke + env: + PYTHONPATH: "./build/:$PYTHONPATH" + + - name: Test bindings (wheel) + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + python -m pip install . --verbose --find-links ${OV_INSTALL_DIR}/wheels + python -m pytest ./tests/python_tests/test_whisper_generate_api.py + + genai_package: + name: OpenVINO genai extension (install to OpenVINO package) + strategy: + matrix: + build-type: [Release, Debug] + needs: [ openvino_download, openvino_build ] + if: | + always() && + (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success') + timeout-minutes: 60 + defaults: + run: + shell: bash + runs-on: ubuntu-20.04 + env: + CMAKE_BUILD_PARALLEL_LEVEL: null + OV_INSTALL_DIR: ${{ github.workspace }}/ov + CCACHE_DIR: ${{ github.workspace }}/ccache + CCACHE_MAXSIZE: 500Mi + CMAKE_CXX_COMPILER_LAUNCHER: ccache + CMAKE_C_COMPILER_LAUNCHER: ccache + + steps: + - name: Clone openvino.genai + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Download OpenVINO package + uses: actions/download-artifact@v4 + with: + name: openvino_package + path: ${{ env.OV_INSTALL_DIR }} + + - name: Extract OpenVINO packages + run: | + pushd ${OV_INSTALL_DIR} + tar -xzf openvino_package.tar.gz -C ${OV_INSTALL_DIR} --strip-components=1 + popd + + - name: Set apt + run: | + echo 'Acquire::Retries "10";' | sudo tee -a /etc/apt/apt.conf.d/80-retries > /dev/null + echo 'APT::Get::Assume-Yes "true";' | sudo tee -a /etc/apt/apt.conf.d/81-assume-yes > /dev/null + echo 'APT::Get::Fix-Broken "true";' | sudo tee -a /etc/apt/apt.conf.d/82-fix-broken > /dev/null + echo 'APT::Get::no-install-recommends "true";' | sudo tee -a /etc/apt/apt.conf.d/83-no-reсommends > /dev/null + + - name: Install build dependencies + run: | + sudo ${OV_INSTALL_DIR}/install_dependencies/install_openvino_dependencies.sh + sudo apt-get install ccache + + - name: Setup ccache + uses: actions/cache@v4 + with: + save-always: true + path: ${{ env.CCACHE_DIR }} + key: ${{ runner.os }}-${{ runner.arch }}-ccache-genai-${{ matrix.build-type }}-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-${{ runner.arch }}-ccache-genai-${{ matrix.build-type }} + + - name: Build genai + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ + 
cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j + + - name: Build and Install dependencies + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --find-links ${OV_INSTALL_DIR}/wheels + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + + - name: Install samples + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ${OV_INSTALL_DIR} + + - name: Build samples (Release) + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build + run: | + ${OV_INSTALL_DIR}/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace + + - name: Build samples (Debug) + if: ${{ 'Release' != matrix.build-type }} + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ${OV_INSTALL_DIR}/samples/cpp/ -B ./samples\ build/ && cmake --build ./samples\ build/ --config ${{ matrix.build-type }} -j + cmake --install ./samples\ build/ --config ${{ matrix.build-type }} --component samples_bin --prefix s\ pace + + - name: Test C++ samples (greedy_causal_lm) + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + timeout 25s ${{ github.workspace }}/s\ pace/samples_bin/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "" + + - name: Test python samples (multinomial_causal_lm) + if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + timeout 25s ${OV_INSTALL_DIR}/samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0 + + Overall_Status: + name: ci/gha_overall_status_linux + needs: [openvino_download, openvino_build, genai_python_lib, genai_package, genai_python_lib_whisper] + if: ${{ always() }} + runs-on: ubuntu-latest + steps: + - name: Check status of all jobs + if: >- + ${{ + contains(needs.*.result, 'failure') || + contains(needs.*.result, 'cancelled') + }} + run: exit 1 diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml index bc318afca9..2554330601 100644 --- a/.github/workflows/llm_bench-python.yml +++ b/.github/workflows/llm_bench-python.yml @@ -5,6 +5,7 @@ name: llm_bench Python Test env: LLM_BENCH_PYPATH: llm_bench/python + WWB_PATH: llm_bench/python/who_what_benchmark on: push: @@ -16,8 +17,7 @@ on: - llm_bench/python/** - .github/workflows/llm_bench-python.yml -permissions: - contents: read +permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions jobs: build: @@ -27,7 +27,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9"] + python-version: ["3.10"] steps: - uses: actions/checkout@v4 @@ -39,8 +39,8 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install flake8 pytest black - pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt - + GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.LLM_BENCH_PYPATH }}/requirements.txt + python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or 
undefined names @@ -55,19 +55,45 @@ jobs: with: name: llm.bench_diff path: llm.bench_diff.diff - - name: Run llm_bench test on linux - run: llm_bench/python/llm_run_on_linux.sh + - name: Test native pytorch model on Linux + run: | + export GIT_LFS_SKIP_SMUDGE=0 + git clone --depth 1 https://huggingface.co/katuni4ka/tiny-random-qwen + python ./llm_bench/python/benchmark.py -m tiny-random-qwen -d cpu -n 1 -f pt + - name: Test tiny-random-baichuan2 on Linux + run: | + optimum-cli export openvino --model katuni4ka/tiny-random-baichuan2 --trust-remote-code --weight-format fp16 ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16 + python ./llm_bench/python/benchmark.py -m ./ov_models/tiny-random-baichuan2/pytorch/dldt/FP16/ -d cpu -n 1 + - name: Test tiny-stable-diffusion on Linux + run: | + optimum-cli export openvino --model segmind/tiny-sd --trust-remote-code --weight-format fp16 ./ov_models/tiny-sd/pytorch/dldt/FP16/ + python ./llm_bench/python/benchmark.py -m ./ov_models/tiny-sd/pytorch/dldt/FP16/ -pf ./llm_bench/python/prompts/stable-diffusion.jsonl -d cpu -n 1 + - name: WWB Tests + run: | + GIT_CLONE_PROTECTION_ACTIVE=false pip install -r ${{ env.WWB_PATH }}/requirements.txt + pip install git+https://github.com/huggingface/optimum.git + GIT_CLONE_PROTECTION_ACTIVE=false pip install ${{ env.WWB_PATH }} + python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --force-reinstall + python -m pytest llm_bench/python/who_what_benchmark/tests stateful: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: "3.10" - name: Test stateful run: | - python -m pip install -r llm_bench/python/requirements.txt optimum + GIT_CLONE_PROTECTION_ACTIVE=false python -m pip install -r llm_bench/python/requirements.txt python -m pip uninstall --yes openvino - python -m pip install openvino-nightly - python llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v0.6 --output_dir . --stateful + python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python llm_bench/python/convert.py --model_id TinyLlama/TinyLlama-1.1B-Chat-v1.0 --output_dir . 
--stateful grep beam_idx pytorch/dldt/FP32/openvino_model.xml + - name: WWB Tests + run: | + GIT_CLONE_PROTECTION_ACTIVE=false pip install -r llm_bench/python/who_what_benchmark/requirements.txt + pip install git+https://github.com/huggingface/optimum.git + GIT_CLONE_PROTECTION_ACTIVE=false pip install llm_bench/python/who_what_benchmark/ + pip install pytest + python -m pip install -U --pre openvino openvino-tokenizers openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly --force-reinstall + python -m pytest llm_bench/python/who_what_benchmark/tests diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml new file mode 100644 index 0000000000..7cc9c5551c --- /dev/null +++ b/.github/workflows/mac.yml @@ -0,0 +1,404 @@ +name: macOS (12, Python 3.9) +on: + workflow_dispatch: + pull_request: + merge_group: + push: + branches: + - master + - 'releases/**' + +concurrency: + # github.ref is not unique in post-commit + group: ${{ github.event_name == 'push' && github.run_id || github.ref }}-mac + cancel-in-progress: true + +env: + PYTHON_VERSION: '3.9' + OV_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref || github.ref }} + OV_TARBALL: '' + +permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions + +jobs: + openvino_download: + name: Download OpenVINO package + outputs: + status: ${{ steps.openvino_download.outcome }} + timeout-minutes: 10 + defaults: + run: + shell: bash + runs-on: ubuntu-20.04 + + steps: + - name: Download OpenVINO build + id: openvino_download + run: | + wget ${{ env.OV_TARBALL}} --progress=bar:force:noscroll -O openvino_package.tar.gz + tar -tvf openvino_package.tar.gz + continue-on-error: true + + # + # Upload to artifacts + # + + - name: Upload openvino package + if: steps.openvino_download.outcome == 'success' + uses: actions/upload-artifact@v4 + with: + name: openvino_package + path: openvino_package.tar.gz + if-no-files-found: 'error' + + openvino_build: + name: Build OpenVINO package + needs: [openvino_download] + if: needs.openvino_download.outputs.status != 'success' + timeout-minutes: 150 + defaults: + run: + shell: bash + runs-on: 'macos-12-large' + env: + MACOSX_DEPLOYMENT_TARGET: '10.15' + CMAKE_BUILD_TYPE: 'Release' + CMAKE_GENERATOR: 'Ninja Multi-Config' + CMAKE_CXX_COMPILER_LAUNCHER: ccache + CMAKE_C_COMPILER_LAUNCHER: ccache + OPENVINO_REPO: ${{ github.workspace }}/openvino + INSTALL_DIR: ${{ github.workspace }}/openvino/install + BUILD_DIR: ${{ github.workspace }}/openvino/build + + steps: + - name: Clone OpenVINO + uses: actions/checkout@v4 + with: + repository: 'openvinotoolkit/openvino' + path: ${{ env.OPENVINO_REPO }} + submodules: 'true' + ref: ${{ env.OV_BRANCH }} + + # + # Dependencies + # + + - name: Install build dependencies + run: brew install coreutils ninja + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Install python dependencies + run: | + # For Python API: build and wheel packaging + python3 -m pip install -r ${OPENVINO_REPO}/src/bindings/python/wheel/requirements-dev.txt + + # + # Build + # + + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + max-size: "2000M" + # Should save cache only if run in the master branch of the base repo + # github.ref_name is 'ref/PR_#' in case of the PR, and 'branch_name' when executed on push + save: ${{ github.ref_name == 
'master' && 'true' || 'false' }} + verbose: 2 + key: ccache-mac + restore-keys: | + ccache-mac + + - name: CMake configure - OpenVINO + run: | + cmake \ + -G "${{ env.CMAKE_GENERATOR }}" \ + -DENABLE_CPPLINT=OFF \ + -DENABLE_NCC_STYLE=OFF \ + -DENABLE_TESTS=OFF \ + -DENABLE_STRICT_DEPENDENCIES=OFF \ + -DENABLE_SYSTEM_OPENCL=ON \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCPACK_GENERATOR=TGZ \ + -DENABLE_JS=OFF \ + -DENABLE_SAMPLES=ON \ + -DENABLE_INTEL_NPU=OFF \ + -DENABLE_OV_ONNX_FRONTEND=OFF \ + -DENABLE_OV_PADDLE_FRONTEND=OFF \ + -DENABLE_OV_PYTORCH_FRONTEND=ON \ + -DENABLE_OV_TF_FRONTEND=ON \ + -DENABLE_OV_TF_LITE_FRONTEND=OFF \ + -DENABLE_INTEL_GPU=OFF \ + -DENABLE_INTEL_NPU=OFF \ + -DCMAKE_COMPILE_WARNING_AS_ERROR=ON \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DENABLE_PYTHON=ON \ + -DENABLE_WHEEL=ON \ + -S ${OPENVINO_REPO} \ + -B ${BUILD_DIR} + + - name: Clean ccache stats + run: ccache --zero-stats --show-config + + - name: Cmake build - OpenVINO + run: cmake --build ${BUILD_DIR} --parallel --config ${{ env.CMAKE_BUILD_TYPE }} + + - name: Show ccache stats + run: ccache --show-stats + + - name: Cmake install - OpenVINO + run: | + cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/openvino_package -P ${BUILD_DIR}/cmake_install.cmake + cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR}/openvino_package -DCOMPONENT=python_wheels -P ${BUILD_DIR}/cmake_install.cmake + + - name: Pack Artifacts + run: | + pushd ${INSTALL_DIR} + tar -czvf ${BUILD_DIR}/openvino_package.tar.gz * + popd + + # + # Upload build artifacts and logs + # + + - name: Upload openvino package + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: openvino_package + path: ${{ env.BUILD_DIR }}/openvino_package.tar.gz + if-no-files-found: 'error' + + genai_python_lib: + name: OpenVINO genai extension (cmake + wheel) + needs: [ openvino_download, openvino_build ] + if: | + always() && + (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success') + timeout-minutes: 90 + defaults: + run: + shell: bash + runs-on: macos-13 + + env: + OV_INSTALL_DIR: ${{ github.workspace }}/ov + MACOSX_DEPLOYMENT_TARGET: '11.0' + + steps: + - name: Clone openvino.genai + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Download OpenVINO package + uses: actions/download-artifact@v4 + with: + name: openvino_package + path: ${{ env.OV_INSTALL_DIR }} + + - name: Extract OpenVINO packages + run: | + pushd ${OV_INSTALL_DIR} + tar -xzf openvino_package.tar.gz -C ${OV_INSTALL_DIR} --strip-components=1 + popd + + - name: Install build dependencies + run: brew install coreutils ninja scons + + - name: Build genai + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + + - name: Test bindings + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pytest ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py + env: + PYTHONPATH: "./build/:$PYTHONPATH" + + - name: Test bindings (wheel) + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + python -m pip install . 
--verbose --find-links ${OV_INSTALL_DIR}/wheels + python -c "from openvino_genai import LLMPipeline" + python -m pytest ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py + + genai_python_lib_whisper: + name: OpenVINO genai extension whisper tests (cmake + wheel) + needs: [ openvino_download, openvino_build ] + if: | + always() && + (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success') + timeout-minutes: 90 + defaults: + run: + shell: bash + runs-on: macos-13 + + env: + OV_INSTALL_DIR: ${{ github.workspace }}/ov + MACOSX_DEPLOYMENT_TARGET: '11.0' + + steps: + - name: Clone openvino.genai + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Download OpenVINO package + uses: actions/download-artifact@v4 + with: + name: openvino_package + path: ${{ env.OV_INSTALL_DIR }} + + - name: Extract OpenVINO packages + run: | + pushd ${OV_INSTALL_DIR} + tar -xzf openvino_package.tar.gz -C ${OV_INSTALL_DIR} --strip-components=1 + popd + + - name: Install build dependencies + run: brew install coreutils ninja scons + + - name: Build genai + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + + - name: Test bindings + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke + env: + PYTHONPATH: "./build/:$PYTHONPATH" + + - name: Test bindings (wheel) + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + python -m pip install . 
--verbose --find-links ${OV_INSTALL_DIR}/wheels + python -c "from openvino_genai import LLMPipeline" + python -m pytest ./tests/python_tests/test_whisper_generate_api.py + + genai_package: + name: OpenVINO genai extension (install to OpenVINO package) + strategy: + matrix: + build-type: [Release, Debug] + needs: [ openvino_download, openvino_build ] + if: | + always() && + (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success') + timeout-minutes: 30 + defaults: + run: + shell: bash + runs-on: macos-13 + env: + OV_INSTALL_DIR: ${{ github.workspace }}/ov + + steps: + - name: Clone openvino.genai + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Download OpenVINO package + uses: actions/download-artifact@v4 + with: + name: openvino_package + path: ${{ env.OV_INSTALL_DIR }} + + - name: Extract OpenVINO packages + run: | + pushd ${OV_INSTALL_DIR} + tar -xzf openvino_package.tar.gz -C ${OV_INSTALL_DIR} --strip-components=1 + popd + + - name: Install build dependencies + run: brew install coreutils scons + + - name: Build genai + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ + cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j + + - name: Build and Install dependencies + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --find-links ${OV_INSTALL_DIR}/wheels + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + + - name: Install samples + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ${OV_INSTALL_DIR} + + - name: Build samples (Release) + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build + run: | + ${OV_INSTALL_DIR}/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace + + - name: Build samples (Debug) + if: ${{ 'Release' != matrix.build-type }} + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ${OV_INSTALL_DIR}/samples/cpp/ -B ./samples\ build/ + cmake --build ./samples\ build/ --config ${{ matrix.build-type }} -j + cmake --install ./samples\ build/ --config ${{ matrix.build-type }} --component samples_bin --prefix s\ pace + + - name: Test C++ samples (greedy_causal_lm) + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + timeout 25s ${{ github.workspace }}/s\ pace/samples_bin/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "" + + - name: Test python samples (multinomial_causal_lm) + if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only + run: | + source ${OV_INSTALL_DIR}/setupvars.sh + timeout --verbose 27s ${OV_INSTALL_DIR}/samples/python/multinomial_causal_lm/multinomial_causal_lm.py ./TinyLlama-1.1B-Chat-v1.0/ 0 + + Overall_Status: + name: ci/gha_overall_status_macos + needs: [openvino_download, openvino_build, genai_python_lib, genai_package, genai_python_lib_whisper] + if: ${{ always() }} + runs-on: ubuntu-latest + steps: + - name: Check status of all jobs + if: >- + ${{ + 
contains(needs.*.result, 'failure') || + contains(needs.*.result, 'cancelled') + }} + run: exit 1 diff --git a/.github/workflows/stable_diffusion_1_5_cpp.yml b/.github/workflows/stable_diffusion_1_5_cpp.yml index a4c29644f4..100de4d1ba 100644 --- a/.github/workflows/stable_diffusion_1_5_cpp.yml +++ b/.github/workflows/stable_diffusion_1_5_cpp.yml @@ -1,22 +1,148 @@ name: stable_diffusion_1_5_cpp + on: + workflow_dispatch: pull_request: - paths: - - image_generation/stable_diffusion_1_5/cpp/** - - image_generation/stable_diffusion_1_5/common/** - - .github/workflows/stable_diffusion_1_5_cpp.yml - - thirdparty/openvino_contrib + merge_group: + push: + branches: + - master + - 'releases/**' + +permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions + +env: + PYTHON_VERSION: '3.10' + LINUX_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16993-9c432a3641a/l_openvino_toolkit_ubuntu20_2024.5.0.dev20241014_x86_64.tgz + WINDOWS_OV_ARCHIVE_URL: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.5.0-16993-9c432a3641a/w_openvino_toolkit_windows_2024.5.0.dev20241014_x86_64.zip + OV_INSTALL_DIR: ${{ github.workspace }}/ov + concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true + jobs: - stable_diffusion_1_5_cpp: - runs-on: ubuntu-20.04 + stable_diffusion_1_5_cpp-linux: + runs-on: ubuntu-20.04-8-cores + defaults: + run: + shell: bash -l {0} steps: - uses: actions/checkout@v4 with: submodules: recursive - - uses: actions/setup-python@v4 + + - name: Download OpenVINO archive + run: | + wget ${{ env.LINUX_OV_ARCHIVE_URL}} --progress=bar:force:noscroll -O openvino_package.tar.gz + mkdir ${{ env.OV_INSTALL_DIR }} + tar -xzf openvino_package.tar.gz -C ${{ env.OV_INSTALL_DIR }} --strip-components=1 + + - name: Build app + run: | + source ${{ env.OV_INSTALL_DIR }}/setupvars.sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 with: - python-version: 3.8 - - run: ./image_generation/stable_diffusion_1_5/cpp/set_up_and_run.sh + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Create virtual environment + run: python3 -m venv openvino_sd_cpp + + - name: Install python dependencies + run: | + source openvino_sd_cpp/bin/activate + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + + - name: Download and convert models and tokenizer + run: | + source openvino_sd_cpp/bin/activate + optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --weight-format fp16 --task stable-diffusion models/dreamlike-art-dreamlike-anime-1.0/FP16 + wget -O ./models/soulcard.safetensors https://civitai.com/api/download/models/72591 + + - name: Run main app + run: | + source ${{ env.OV_INSTALL_DIR }}/setupvars.sh + ./build/samples/cpp/text2image/stable_diffusion ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + + - name: Run LoRA app + run: | + source ${{ env.OV_INSTALL_DIR 
}}/setupvars.sh + ./build/samples/cpp/text2image/lora_stable_diffusion ./models/dreamlike-art-dreamlike-anime-1.0/FP16 "curly-haired unicorn in the forest, anime, line" ./models/soulcard.safetensors 0.7 + + stable_diffusion_1_5_cpp-windows: + runs-on: windows-latest + defaults: + run: + shell: pwsh + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Download OpenVINO archive + run: | + mkdir ${{ env.OV_INSTALL_DIR }} + pushd ${{ env.OV_INSTALL_DIR }} + Invoke-WebRequest "${{ env.WINDOWS_OV_ARCHIVE_URL}}" -OutFile "openvino_package.zip" + Expand-Archive openvino_package.zip -DestinationPath ./tmp + mv ./tmp/*/* . + popd + + - name: Build app + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release --parallel + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Create virtual environment + run: python -m venv openvino_sd_cpp + + - name: Install python dependencies + run: | + . "./openvino_sd_cpp/Scripts/Activate.ps1" + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install -U "optimum<1.23" --no-dependencies + + - name: Download and convert models and tokenizer + run: | + . "./openvino_sd_cpp/Scripts/Activate.ps1" + optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 models/dreamlike-art-dreamlike-anime-1.0/FP16 + Invoke-WebRequest -Uri 'https://civitai.com/api/download/models/72591' -OutFile 'models/soulcard.safetensors' + + - name: Run main app + run: > + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + & "./build/samples/cpp/text2image/Release/stable_diffusion.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'" + + - name: Run LoRA app + run: > + . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + & "./build/samples/cpp/text2image/Release/lora_stable_diffusion.exe ./models/dreamlike-art-dreamlike-anime-1.0/FP16 'curly-haired unicorn in the forest, anime, line' ./models/soulcard.safetensors 0.7" + + Overall_Status: + name: ci/gha_overall_status_stable_diffusion + needs: [stable_diffusion_1_5_cpp-linux, stable_diffusion_1_5_cpp-windows] + if: ${{ always() }} + runs-on: ubuntu-latest + steps: + - name: Check status of all jobs + if: >- + ${{ + contains(needs.*.result, 'failure') || + contains(needs.*.result, 'cancelled') + }} + run: exit 1 diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 0000000000..0b8cece3fb --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,482 @@ +name: Windows (VS 2019, Python 3.11) +on: + workflow_dispatch: + pull_request: + merge_group: + push: + branches: + - master + - 'releases/**' + +concurrency: + # github.ref is not unique in post-commit + group: ${{ github.event_name == 'push' && github.run_id || github.ref }}-windows + cancel-in-progress: true + +env: + PYTHON_VERSION: '3.11' + OV_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref || github.ref }} + OV_TARBALL: '' + +permissions: read-all # Required by https://github.com/ossf/scorecard/blob/e23b8ad91fd6a64a0a971ca4fc0a4d1650725615/docs/checks.md#token-permissions + +jobs: + openvino_download: + name: Download OpenVINO package + outputs: + status: ${{ steps.openvino_download.outcome }} + timeout-minutes: 10 + defaults: + run: + shell: bash + runs-on: ubuntu-20.04 + + steps: + - name: Download OpenVINO build + id: openvino_download + run: | + wget ${{ env.OV_TARBALL}} --progress=bar:force:noscroll -O openvino_package.zip + unzip -l openvino_package.zip + continue-on-error: true + + # + # Upload to artifacts + # + + - name: Upload openvino package + if: steps.openvino_download.outcome == 'success' + uses: actions/upload-artifact@v4 + with: + name: openvino_package + path: openvino_package.zip + if-no-files-found: 'error' + + openvino_build: + name: Build OpenVINO package + needs: [openvino_download] + if: needs.openvino_download.outputs.status != 'success' + timeout-minutes: 150 + defaults: + run: + shell: pwsh + runs-on: windows-2019-16-core + env: + CMAKE_BUILD_TYPE: 'Release' + CMAKE_GENERATOR: 'Ninja Multi-Config' + CMAKE_CXX_COMPILER_LAUNCHER: ccache + CMAKE_C_COMPILER_LAUNCHER: ccache + OPENVINO_REPO: ${{ github.workspace }}\\openvino + INSTALL_DIR: ${{ github.workspace }}\\openvino\\install + BUILD_DIR: ${{ github.workspace }}\\openvino\\build + + steps: + - name: git configuration + run: git config --system core.longpaths true + + - name: Clone OpenVINO + uses: actions/checkout@v4 + with: + repository: 'openvinotoolkit/openvino' + path: ${{ env.OPENVINO_REPO }} + submodules: 'true' + ref: ${{ env.OV_BRANCH }} + + # + # Dependencies + # + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Install python dependencies + run: | + # For Python API: build and wheel packaging + python3 -m pip install -r ${env:OPENVINO_REPO}/src/bindings/python/wheel/requirements-dev.txt + + - name: Install build dependencies + run: | + Invoke-WebRequest https://github.com/ninja-build/ninja/releases/download/v1.11.1/ninja-win.zip -OutFile ninja-win.zip -MaximumRetryCount 10 + Expand-Archive -Force ninja-win.zip + # Add it to the GitHub Path so it would be available in the subsequent steps + 
Add-Content -Path $env:GITHUB_PATH -Value "${{ github.workspace }}/ninja-win" + + # + # Build + # + + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + max-size: "2000M" + # Should save cache only if run in the master branch of the base repo + # github.ref_name is 'ref/PR_#' in case of the PR, and 'branch_name' when executed on push + save: ${{ github.ref_name == 'master' && 'true' || 'false' }} + verbose: 2 + key: ccache-windows + restore-keys: | + ccache-windows + + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + - name: CMake configure - OpenVINO + run: | + cmake -G "${{ env.CMAKE_GENERATOR }}" ` + -DENABLE_CPPLINT=OFF ` + -DBUILD_nvidia_plugin=OFF ` + -DBUILD_SHARED_LIBS=ON ` + -DENABLE_TESTS=OFF ` + -DCMAKE_COMPILE_WARNING_AS_ERROR=OFF ` + -DENABLE_STRICT_DEPENDENCIES=OFF ` + -DENABLE_PYTHON=ON ` + -DENABLE_WHEEL=ON ` + -DENABLE_JS=OFF ` + -DENABLE_SAMPLES=ON ` + -DENABLE_INTEL_NPU=OFF ` + -DENABLE_OV_ONNX_FRONTEND=OFF ` + -DENABLE_OV_PADDLE_FRONTEND=OFF ` + -DENABLE_OV_PYTORCH_FRONTEND=ON ` + -DENABLE_OV_TF_FRONTEND=ON ` + -DENABLE_OV_TF_LITE_FRONTEND=OFF ` + -DENABLE_INTEL_GPU=OFF ` + -DENABLE_INTEL_NPU=OFF ` + -DCMAKE_DISABLE_FIND_PACKAGE_PkgConfig=ON ` + -S ${{ env.OPENVINO_REPO }} ` + -B ${{ env.BUILD_DIR }} + + - name: Clean ccache stats + run: ccache --zero-stats --show-config + + - name: Cmake build - OpenVINO + run: cmake --build ${{ env.BUILD_DIR }} --parallel --config ${{ env.CMAKE_BUILD_TYPE }} --verbose + + - name: Show ccache stats + run: ccache --show-stats + + - name: Cmake install - OpenVINO + run: | + cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }}/ov_package -P ${{ env.BUILD_DIR }}/cmake_install.cmake + cmake -DCMAKE_INSTALL_PREFIX=${{ env.INSTALL_DIR }}/ov_package -DCOMPONENT=python_wheels -P ${{ env.BUILD_DIR }}/cmake_install.cmake + + - name: Pack Artifacts + run: | + $file=Get-ChildItem -Path "${{ env.INSTALL_DIR }}" + $compress = @{ + Path = $file + CompressionLevel = "Optimal" + DestinationPath = "${{ env.BUILD_DIR }}/openvino_package.zip" + } + Compress-Archive @compress + + # + # Upload build artifacts and logs + # + + - name: Upload openvino package + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: openvino_package + path: ${{ env.BUILD_DIR }}/openvino_package.zip + if-no-files-found: 'error' + + genai_python_lib: + name: OpenVINO genai extension (cmake + wheel) + needs: [ openvino_download, openvino_build ] + if: | + always() && + (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success') + timeout-minutes: 90 + defaults: + run: + shell: pwsh + runs-on: windows-2019 + + env: + OV_INSTALL_DIR: ${{ github.workspace }}\\ov + CMAKE_BUILD_PARALLEL_LEVEL: null + + steps: + - name: Clone openvino.genai + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Download OpenVINO package + uses: actions/download-artifact@v4 + with: + name: openvino_package + path: ${{ env.OV_INSTALL_DIR }} + + - name: Extract OpenVINO packages + run: | + pushd ${{ env.OV_INSTALL_DIR }} + Expand-Archive openvino_package.zip -DestinationPath ./tmp + mv ./tmp/*/* . 
+ popd + + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + - name: Build genai libs + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + + - name: Test bindings + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pytest ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py + env: + PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. + + - name: Test bindings (wheel) + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + python -m pip install . --verbose + python -m pytest ./tests/python_tests/ --ignore ./tests/python_tests/test_whisper_generate_api.py --ignore ./tests/python_tests/test_vlm_api.py + + genai_python_lib_whisper: + name: OpenVINO genai extension whisper tests (cmake + wheel) + needs: [ openvino_download, openvino_build ] + if: | + always() && + (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success') + timeout-minutes: 90 + defaults: + run: + shell: pwsh + runs-on: windows-2019 + + env: + OV_INSTALL_DIR: ${{ github.workspace }}\\ov + CMAKE_BUILD_PARALLEL_LEVEL: null + + steps: + - name: Clone openvino.genai + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Download OpenVINO package + uses: actions/download-artifact@v4 + with: + name: openvino_package + path: ${{ env.OV_INSTALL_DIR }} + + - name: Extract OpenVINO packages + run: | + pushd ${{ env.OV_INSTALL_DIR }} + Expand-Archive openvino_package.zip -DestinationPath ./tmp + mv ./tmp/*/* . + popd + + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + - name: Build genai libs + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + + - name: Test bindings + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke + env: + PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. + + - name: Test bindings (wheel) + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + python -m pip install . 
--verbose + python -m pytest ./tests/python_tests/test_whisper_generate_api.py + + + genai_python_lib_vlm: + name: OpenVINO genai VLM tests (cmake + wheel) + needs: [ openvino_download, openvino_build ] + if: | + always() && + (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success') + timeout-minutes: 90 + defaults: + run: + shell: pwsh + runs-on: windows-2019-16-core + + env: + OV_INSTALL_DIR: ${{ github.workspace }}\\ov + CMAKE_BUILD_PARALLEL_LEVEL: null + + steps: + - name: Clone openvino.genai + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Download OpenVINO package + uses: actions/download-artifact@v4 + with: + name: openvino_package + path: ${{ env.OV_INSTALL_DIR }} + + - name: Extract OpenVINO packages + run: | + pushd ${{ env.OV_INSTALL_DIR }} + Expand-Archive openvino_package.zip -DestinationPath ./tmp + mv ./tmp/*/* . + popd + + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + - name: Build genai libs + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + + - name: Test bindings + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager + python -m pytest ./tests/python_tests/test_vlm_api.py + env: + PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. + + - name: Test bindings (wheel) + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + python -m pip install . --verbose + python -m pytest ./tests/python_tests/test_vlm_api.py + + + genai_package: + name: OpenVINO genai extension (install to OpenVINO package) + strategy: + matrix: + build-type: [Release, Debug] + needs: [ openvino_download, openvino_build ] + if: | + always() && + (needs.openvino_download.outputs.status == 'success' || needs.openvino_build.result == 'success') + timeout-minutes: 60 + defaults: + run: + shell: pwsh + runs-on: windows-2019 + + env: + OV_INSTALL_DIR: ${{ github.workspace }}\\ov + CMAKE_BUILD_PARALLEL_LEVEL: null + + steps: + - name: Clone openvino.genai + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Setup Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + + - name: Download OpenVINO package + uses: actions/download-artifact@v4 + with: + name: openvino_package + path: ${{ env.OV_INSTALL_DIR }} + + - name: Extract OpenVINO packages + run: | + pushd ${{ env.OV_INSTALL_DIR }} + Expand-Archive openvino_package.zip -DestinationPath ./tmp + mv ./tmp/*/* . + popd + + - name: Configure Developer Command Prompt for Microsoft Visual C++ + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + - name: Build genai libs + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ + cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j + + - name: Build and Install dependencies + run: | + . 
"${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --find-links ${env:OV_INSTALL_DIR}/wheels + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels + python -m pip install -U "optimum<1.23" --no-dependencies + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 + + - name: Install samples + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ${{ env.OV_INSTALL_DIR }} + + - name: Build samples (Release) + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build + run: | + & ${{ env.OV_INSTALL_DIR }}\samples\cpp\build_samples.ps1 -i ${{ github.workspace }}/samples_install + + - name: Build samples (Debug) + if: ${{ 'Release' != matrix.build-type }} + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + cmake -DCMAKE_BUILD_TYPE= -S ./ov/samples/cpp/ -B "samples build" + cmake --build "samples build" --config ${{ matrix.build-type }} -j + cmake --install "samples build" --config ${{ matrix.build-type }} --component samples_bin --prefix samples_install + + - name: Test C++ samples (greedy_causal_lm) + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + Start-Process -FilePath "${{ github.workspace }}/samples_install/samples_bin/greedy_causal_lm.exe" -ArgumentList "TinyLlama-1.1B-Chat-v1.0 ''" + + - name: Test python samples (multinomial_causal_lm) + if: ${{ 'Release' == matrix.build-type }} # Python bindings can be built in Release only + run: | + . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1" + python ${{ env.OV_INSTALL_DIR }}\samples\python\multinomial_causal_lm\multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 0 + + Overall_Status: + name: ci/gha_overall_status_windows + needs: [openvino_download, openvino_build, genai_python_lib, genai_package, genai_python_lib_whisper] + if: ${{ always() }} + runs-on: ubuntu-latest + steps: + - name: Check status of all jobs + if: >- + ${{ + contains(needs.*.result, 'failure') || + contains(needs.*.result, 'cancelled') + }} + run: exit 1 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..83f354d57a --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +# build/artifact dirs +[Bb]uild*/ + +# but ensure we don't skip __init__.py and __main__.py +!__init__.py +!__main__.py + +# developer tools +*.idea +.vscode +.vs/ +.vsconan/ +.DS_Store +**/tags +compile_commands.json +.local_vimrc +.gdb_history +.vimspector.json +doc/ +temp/ +.repo/ +CMakeLists.txt.user +CMakeUserPresets.json + +*.project +*.cproject +*.pydevproject +*.settings +*/gen/ +*.swp +/config.xml + +# Python-specific +*.?env* +*.pyc +__pycache__ +.py-build-cmake_cache diff --git a/.gitmodules b/.gitmodules index e1478cc156..f72fd83489 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "thirdparty/openvino_contrib"] - path = thirdparty/openvino_contrib - url = https://github.com/openvinotoolkit/openvino_contrib.git +[submodule "thirdparty/openvino_tokenizers"] + path = thirdparty/openvino_tokenizers + url = https://github.com/openvinotoolkit/openvino_tokenizers.git diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..f146b1a02c --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +cmake_minimum_required(VERSION 3.23.0) # The requirement 
comes from Jinja2Cpp + +# Multi config generators such as Visual Studio ignore CMAKE_BUILD_TYPE. Multi config generators are configured with +# CMAKE_CONFIGURATION_TYPES, but limiting options in it completely removes such build options +get_property(GENERATOR_IS_MULTI_CONFIG_VAR GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) +if(CMAKE_GENERATOR STREQUAL "Ninja Multi-Config") + # 'Ninja Multi-Config' specific, see: + # https://cmake.org/cmake/help/latest/variable/CMAKE_DEFAULT_BUILD_TYPE.html + set(CMAKE_DEFAULT_BUILD_TYPE "Release" CACHE STRING "CMake default build type") +elseif(NOT GENERATOR_IS_MULTI_CONFIG_VAR AND NOT DEFINED CMAKE_BUILD_TYPE) + message(STATUS "CMAKE_BUILD_TYPE is not defined, 'Release' will be used") + # Setting CMAKE_BUILD_TYPE as CACHE must go before project(). Otherwise project() sets its value and set() doesn't take an effect + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel ...") +endif() + +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +if(POLICY CMP0169) + cmake_policy(SET CMP0169 OLD) +endif() + +project(OpenVINOGenAI + VERSION 2024.5.0.0 + DESCRIPTION "OpenVINO GenAI" + HOMEPAGE_URL "https://github.com/openvinotoolkit/openvino.genai" + LANGUAGES CXX C) + +# Find OpenVINODeveloperPackage first to compile with SDL flags +find_package(OpenVINODeveloperPackage ${OpenVINOGenAI_VERSION} QUIET + COMPONENTS Runtime + PATHS "${OpenVINO_DIR}") +if(NOT OpenVINODeveloperPackage_FOUND) + find_package(OpenVINO ${OpenVINOGenAI_VERSION} REQUIRED + COMPONENTS Runtime) +endif() + +include(cmake/features.cmake) + +if(ENABLE_PYTHON) + # the following two calls are required for cross-compilation + if(OpenVINODeveloperPackage_DIR) + ov_find_python3(REQUIRED) + ov_detect_python_module_extension() + else() + if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + find_package(Python3 REQUIRED COMPONENTS Interpreter Development.Module) + else() + find_package(Python3 REQUIRED COMPONENTS Interpreter Development) + endif() + endif() +endif() + +if(WIN32 OR APPLE) + set(CMAKE_DEBUG_POSTFIX "d") +endif() + +add_subdirectory(thirdparty) +add_subdirectory(src) +add_subdirectory(samples) +add_subdirectory(tests/cpp) + +install(FILES LICENSE DESTINATION docs/licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) +install(FILES third-party-programs.txt DESTINATION docs/licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) +if(NOT DEFINED CPACK_ARCHIVE_COMPONENT_INSTALL) + set(CPACK_ARCHIVE_COMPONENT_INSTALL ON) +endif() +set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) +# Workaround https://gitlab.kitware.com/cmake/cmake/-/issues/2614 +set(CPACK_COMPONENTS_ALL core_genai core_genai_dev cpp_samples_genai licensing_genai openvino_tokenizers openvino_tokenizers_docs) +if(ENABLE_PYTHON) + list(APPEND CPACK_COMPONENTS_ALL pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) +endif() +if(WIN32 AND NOT DEFINED CPACK_GENERATOR) + set(CPACK_GENERATOR "ZIP") +endif() +include(CPack) diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000000..ff5fd03945 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,22 @@ +#!groovy + +properties([ + parameters([ + booleanParam(defaultValue: false, + description: 'Cancel the rest of parallel stages if one of them fails and return status immediately', + name: 'failFast'), + booleanParam(defaultValue: true, + description: 'Whether to propagate commit status to GitHub', + name: 'propagateStatus'), + booleanParam(defaultValue: false, + description: 'If true, 
forces running pre-commit scope', + name: 'forceRunPrecommitScope'), + string(defaultValue: '', + description: 'Pipeline shared library version (branch/tag/commit). Determined automatically if empty', + name: 'library_version') + ]) +]) + +loadOpenVinoLibrary { + entrypoint(this) +} diff --git a/README.md b/README.md index 4925926b08..163768b18e 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,242 @@ -## GenAI Pipeline Repository +# OpenVINO™ GenAI +OpenVINO™ GenAI is a library of the most popular Generative AI model pipelines, optimized execution methods, and samples that run on top of the highly performant [OpenVINO Runtime](https://github.com/openvinotoolkit/openvino). -The GenAI repository contains pipelines that implement image and text generation tasks. -The implementation uses OpenVINO capabilities to optimize the pipelines. Each sample covers -a family of models and suggests certain modifications to adapt the code to specific needs. -It includes the following pipelines: +The library is friendly to PC and laptop execution, is optimized for resource consumption, requires no external dependencies to run generative models, and includes all required functionality (e.g. tokenization via openvino-tokenizers). -1. [Benchmarking script for large language models](./llm_bench/python/) -2. [Causal LM](./text_generation/causal_lm/cpp/) -3. [OpenVINO Stable Diffuison (with LoRA) C++ pipeline](./image_generation/stable_diffusion_1_5/cpp/) + -> [!NOTE] -> This project is not for production use. +## Supported Generative AI scenarios -### License +The OpenVINO™ GenAI library provides very lightweight C++ and Python APIs to run the following generative scenarios: + - Text generation using Large Language Models, for example chat with a local LLaMa model + - Image generation using Diffuser models, for example generation with Stable Diffusion models + - Speech recognition using Whisper family models + - Text generation using Large Visual Models, for instance image analysis with the LLaVa or miniCPM model families -The GenAI repository is licensed under [Apache License Version 2.0](LICENSE). -By contributing to the project, you agree to the license and copyright terms therein and release -your contribution under these terms. +The library efficiently supports LoRA adapters for text and image generation scenarios: +- Load multiple adapters per model +- Select active adapters for every generation +- Mix multiple adapters with coefficients via alpha blending + +All scenarios run on top of OpenVINO Runtime, which supports inference on CPU, GPU and NPU. See [here](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html) for the platform support matrix. + +## Supported Generative AI optimization methods + +The OpenVINO™ GenAI library provides a transparent way to use state-of-the-art generation optimizations: +- Speculative decoding, which employs two models of different sizes and uses the large model to periodically correct the results of the small model. See [here](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) for a more detailed overview +- A KVCache token eviction algorithm that reduces the size of the KVCache by pruning less impactful tokens. + +Additionally, the OpenVINO™ GenAI library implements a continuous batching approach to using OpenVINO within LLM serving; a minimal usage sketch is shown below.
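+
+The snippet below is a minimal, hypothetical sketch of driving continuous batching directly from Python; the `ContinuousBatchingPipeline`, `SchedulerConfig` and `GenerationConfig` names and fields are illustrative assumptions and should be checked against the current Python API reference:
+
+```python
+import openvino_genai as ov_genai
+
+# Assumed API: the scheduler config controls the KV cache budget and prefix caching
+scheduler_config = ov_genai.SchedulerConfig()
+scheduler_config.cache_size = 2                # KV cache budget in GB (assumed field)
+scheduler_config.enable_prefix_caching = True  # reuse KVCache entries of repeated prompt prefixes (assumed field)
+
+pipe = ov_genai.ContinuousBatchingPipeline("./TinyLlama-1.1B-Chat-v1.0/", scheduler_config, "CPU")
+
+config = ov_genai.GenerationConfig()
+config.max_new_tokens = 100
+
+# Several requests are batched and scheduled together, one generation config per prompt;
+# `results` holds one generation result per input prompt (structure assumed)
+results = pipe.generate(["The Sun is yellow because", "The sky is blue because"], [config, config])
+```
+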
The continuous batching library can be used in LLM serving frameworks and supports the following features: +- Prefix caching, which internally caches fragments of previous generation requests together with the corresponding KVCache entries and reuses them for repeated queries. See [here](https://google.com) for a more detailed overview + +The continuous batching functionality is used within OpenVINO Model Server (OVMS) to serve LLMs; see [here](https://docs.openvino.ai/2024/ovms_docs_llm_reference.html) for more details. + +## Installing OpenVINO GenAI + +```sh + # Installing OpenVINO GenAI via pip + pip install openvino-genai + + # Install optimum-intel to be able to download, convert and optimize LLMs from Hugging Face + # Optimum is not required to run models, only to convert and compress them + pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git + + # (Optional) Install (TBD) to be able to download models from Model Scope +``` + +## Performing text generation +<details> +For more examples, check out our [LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) + +### Converting and compressing text generation model from Hugging Face library + +```sh +#(Basic) download and convert to OpenVINO TinyLlama-Chat-v1.0 model +optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format fp16 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + +#(Recommended) download, convert to OpenVINO and compress to int4 TinyLlama-Chat-v1.0 model +optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format int4 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" +``` + +### Run generation using LLMPipeline API in Python + +```python +import openvino_genai as ov_genai +# Runs the model on CPU; GPU and NPU are also possible options +pipe = ov_genai.LLMPipeline("./TinyLlama-1.1B-Chat-v1.0/", "CPU") +print(pipe.generate("The Sun is yellow because", max_new_tokens=100)) +``` + +### Run generation using LLM Pipeline in C++ + +The code below requires installation of the C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details) + +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(100)); +} +``` + +### Sample notebooks using this API + +(TBD) + +</details> + +## Performing image generation + +<details> +For more examples, check out our [LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) + +### Converting and compressing image generation model from Hugging Face library + +```sh +#Download and convert to OpenVINO dreamlike-anime-1.0 model +optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16 +``` + +### Run generation using Text2Image API in Python + +```python + +#WIP + +``` + +### Run generation using Text2Image API in C++ + +The code below requires installation of the C++ compatible package (see [here](https://docs.openvino.ai/2024/get-started/install-openvino/install-openvino-genai.html#archive-installation) for more details) + +```cpp +#include "openvino/genai/text2image/pipeline.hpp" +#include "imwrite.hpp" +int main(int argc, char* argv[]) { -## Requirements + const std::string models_path = argv[1],
prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well - -Requirements may vary for different samples. See respective readme files for more details, -and make sure to install the OpenVINO version listed there. Refer to documentation to see -[how to install OpenVINO](docs.openvino.ai/install). + ov::genai::Text2ImagePipeline pipe(models_path, device); + ov::Tensor image = pipe.generate(prompt, + ov::genai::width(512), + ov::genai::height(512), + ov::genai::num_inference_steps(20)); + imwrite("image.bmp", image, true); +} +``` +### Sample notebooks using this API + +(TBD) + +</details> + +## Speech to text processing using Whisper Pipeline +<details> +For more examples, check out our [LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) + +NOTE: The Whisper Pipeline requires preprocessing of the audio input (to adjust the sampling rate and normalize it) + +### Converting and compressing speech recognition model from Hugging Face library +```sh +#Download and convert to OpenVINO whisper-base model +optimum-cli export openvino --trust-remote-code --model openai/whisper-base whisper-base +``` + +### Run generation using Whisper Pipeline API in Python + +NOTE: this sample is a simplified version of the full sample that is available [here](./samples/python/whisper_speech_recognition/whisper_speech_recognition.py) + +```python +import argparse +import openvino_genai +import librosa + +def read_wav(filepath): + raw_speech, samplerate = librosa.load(filepath, sr=16000) + return raw_speech.tolist() + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("model_dir") + parser.add_argument("wav_file_path") + args = parser.parse_args() + + raw_speech = read_wav(args.wav_file_path) + + pipe = openvino_genai.WhisperPipeline(args.model_dir) + + def streamer(word: str) -> bool: + print(word, end="") + return False + + pipe.generate( + raw_speech, + max_new_tokens=100, + # 'task' and 'language' parameters are supported for multilingual models only + language="<|en|>", + task="transcribe", + streamer=streamer, + ) + + print() +``` + + +### Run generation using Whisper Pipeline API in C++ + +NOTE: this sample is a simplified version of the full sample that is available [here](./samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp) + +```cpp +#include "audio_utils.hpp" +#include "openvino/genai/whisper_pipeline.hpp" + +int main(int argc, char* argv[]) try { + + std::string model_path = argv[1]; + std::string wav_file_path = argv[2]; + + ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); + + ov::genai::WhisperPipeline pipeline{model_path}; + + ov::genai::WhisperGenerationConfig config{model_path + "/generation_config.json"}; + config.max_new_tokens = 100; + // 'task' and 'language' parameters are supported for multilingual models only + config.language = "<|en|>"; + config.task = "transcribe"; + + auto streamer = [](std::string word) { + std::cout << word; + return false; + }; + + pipeline.generate(raw_speech, config, streamer); + + std::cout << std::endl; +} +``` + +### Sample notebooks using this API + +(TBD) + +</details> + + +## Additional materials + +- [List of supported models](https://github.com/openvinotoolkit/openvino.genai/blob/master/src/docs/SUPPORTED_MODELS.md) (NOTE: models can work but have not all been tried yet) +- [OpenVINO LLM inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html) +- [Optimum-intel and
OpenVINO](https://huggingface.co/docs/optimum/intel/openvino/export) + +## License + +The OpenVINO™ GenAI repository is licensed under [Apache License Version 2.0](LICENSE). +By contributing to the project, you agree to the license and copyright terms therein and release +your contribution under these terms. diff --git a/bandit.yml b/bandit.yml new file mode 100644 index 0000000000..491108a69c --- /dev/null +++ b/bandit.yml @@ -0,0 +1,398 @@ +### This config may optionally select a subset of tests to run or skip by +### filling out the 'tests' and 'skips' lists given below. If no tests are +### specified for inclusion then it is assumed all tests are desired. The skips +### set will remove specific tests from the include set. This can be controlled +### using the -t/-s CLI options. Note that the same test ID should not appear +### in both 'tests' and 'skips', this would be nonsensical and is detected by +### Bandit at runtime. + +# Available tests: +# B101 : assert_used +# B102 : exec_used +# B103 : set_bad_file_permissions +# B104 : hardcoded_bind_all_interfaces +# B105 : hardcoded_password_string +# B106 : hardcoded_password_funcarg +# B107 : hardcoded_password_default +# B108 : hardcoded_tmp_directory +# B110 : try_except_pass +# B112 : try_except_continue +# B201 : flask_debug_true +# B301 : pickle +# B302 : marshal +# B303 : md5 +# B304 : ciphers +# B305 : cipher_modes +# B306 : mktemp_q +# B307 : eval +# B308 : mark_safe +# B310 : urllib_urlopen +# B311 : random +# B312 : telnetlib +# B313 : xml_bad_cElementTree +# B314 : xml_bad_ElementTree +# B315 : xml_bad_expatreader +# B316 : xml_bad_expatbuilder +# B317 : xml_bad_sax +# B318 : xml_bad_minidom +# B319 : xml_bad_pulldom +# B320 : xml_bad_etree +# B321 : ftplib +# B323 : unverified_context +# B324 : hashlib_new_insecure_functions +# B401 : import_telnetlib +# B402 : import_ftplib +# B403 : import_pickle +# B404 : import_subprocess +# B405 : import_xml_etree +# B406 : import_xml_sax +# B407 : import_xml_expat +# B408 : import_xml_minidom +# B409 : import_xml_pulldom +# B410 : import_lxml +# B411 : import_xmlrpclib +# B412 : import_httpoxy +# B413 : import_pycrypto +# B501 : request_with_no_cert_validation +# B502 : ssl_with_bad_version +# B503 : ssl_with_bad_defaults +# B504 : ssl_with_no_version +# B505 : weak_cryptographic_key +# B506 : yaml_load +# B507 : ssh_no_host_key_verification +# B601 : paramiko_calls +# B602 : subprocess_popen_with_shell_equals_true +# B603 : subprocess_without_shell_equals_true +# B604 : any_other_function_with_shell_equals_true +# B605 : start_process_with_a_shell +# B606 : start_process_with_no_shell +# B607 : start_process_with_partial_path +# B608 : hardcoded_sql_expressions +# B609 : linux_commands_wildcard_injection +# B610 : django_extra_used +# B611 : django_rawsql_used +# B701 : jinja2_autoescape_false +# B702 : use_of_mako_templates +# B703 : django_mark_safe + +# (optional) list included test IDs here, eg '[B101, B406]': +# IPAS Required Checkers. 
Do not disable these +# Additional checkers may be added if desired +tests: + [ 'B301', 'B302', 'B303', 'B304', 'B305', 'B306', 'B308', 'B310', 'B311', 'B312', 'B313', 'B314', 'B315', 'B316', 'B317', 'B318', 'B319', 'B320', 'B321', 'B323', 'B324', 'B401', 'B402', 'B403', 'B404', 'B405', 'B406', 'B407', 'B408', 'B409', 'B410', 'B411', 'B412', 'B413'] + +# (optional) list skipped test IDs here, eg '[B101, B406]': +# The following checkers are not required but be added to tests list if desired +skips: + [ 'B101', 'B102', 'B103', 'B104', 'B105', 'B106', 'B107', 'B108', 'B110', 'B112', 'B201', 'B501', 'B502', 'B503', 'B504', 'B505', 'B506', 'B507', 'B601', 'B602', 'B603', 'B604', 'B605', 'B606', 'B607', 'B608', 'B609', 'B610', 'B611', 'B701', 'B702', 'B703'] + +### (optional) plugin settings - some test plugins require configuration data +### that may be given here, per-plugin. All bandit test plugins have a built in +### set of sensible defaults and these will be used if no configuration is +### provided. It is not necessary to provide settings for every (or any) plugin +### if the defaults are acceptable. + +any_other_function_with_shell_equals_true: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +assert_used: + skips: ["llm_bench/python/who_what_benchmark/tests/test_*.py"] +hardcoded_tmp_directory: + tmp_dirs: + - /tmp + - /var/tmp + - /dev/shm +linux_commands_wildcard_injection: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +ssl_with_bad_defaults: + bad_protocol_versions: + - PROTOCOL_SSLv2 + - SSLv2_METHOD + - SSLv23_METHOD + - PROTOCOL_SSLv3 + - PROTOCOL_TLSv1 + - SSLv3_METHOD + - TLSv1_METHOD +ssl_with_bad_version: + bad_protocol_versions: + - PROTOCOL_SSLv2 + - SSLv2_METHOD + - SSLv23_METHOD + - PROTOCOL_SSLv3 + - PROTOCOL_TLSv1 + - SSLv3_METHOD + - TLSv1_METHOD +start_process_with_a_shell: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - 
subprocess.run +start_process_with_no_shell: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +start_process_with_partial_path: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +subprocess_popen_with_shell_equals_true: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +subprocess_without_shell_equals_true: + no_shell: + - os.execl + - os.execle + - os.execlp + - os.execlpe + - os.execv + - os.execve + - os.execvp + - os.execvpe + - os.spawnl + - os.spawnle + - os.spawnlp + - os.spawnlpe + - os.spawnv + - os.spawnve + - os.spawnvp + - os.spawnvpe + - os.startfile + shell: + - os.system + - os.popen + - os.popen2 + - os.popen3 + - os.popen4 + - popen2.popen2 + - popen2.popen3 + - popen2.popen4 + - popen2.Popen3 + - popen2.Popen4 + - commands.getoutput + - commands.getstatusoutput + subprocess: + - subprocess.Popen + - subprocess.call + - subprocess.check_call + - subprocess.check_output + - subprocess.run +try_except_continue: + check_typed_exception: false +try_except_pass: + check_typed_exception: false +weak_cryptographic_key: + weak_key_size_dsa_high: 1024 + weak_key_size_dsa_medium: 2048 + weak_key_size_ec_high: 160 + weak_key_size_ec_medium: 224 + weak_key_size_rsa_high: 1024 + weak_key_size_rsa_medium: 2048 +exclude_dirs: + - thirdparty diff --git a/cmake/features.cmake b/cmake/features.cmake new file mode 100644 index 0000000000..0434b21ee9 --- /dev/null +++ b/cmake/features.cmake @@ -0,0 +1,5 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +option(ENABLE_PYTHON "Enable Python API build" ON) diff --git a/cmake/templates/OpenVINOGenAIConfig.cmake.in b/cmake/templates/OpenVINOGenAIConfig.cmake.in new file mode 100644 index 0000000000..c1f9c86c52 --- /dev/null +++ b/cmake/templates/OpenVINOGenAIConfig.cmake.in @@ -0,0 +1,10 @@ +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) 
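+# find_dependency() (provided by CMakeFindDependencyMacro) forwards the QUIET/REQUIRED arguments of the
+# caller's find_package(OpenVINOGenAI) call to the OpenVINO search below and, if OpenVINO cannot be
+# located, sets OpenVINOGenAI_FOUND to FALSE and returns from this config file.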
+find_dependency(OpenVINO COMPONENTS Runtime) + +if(NOT TARGET openvino_genai) + include("${CMAKE_CURRENT_LIST_DIR}/OpenVINOGenAITargets.cmake") +endif() + +check_required_components(OpenVINOGenAI) diff --git a/cmake/templates/__version__.py.in b/cmake/templates/__version__.py.in new file mode 100644 index 0000000000..ce8e01a246 --- /dev/null +++ b/cmake/templates/__version__.py.in @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Will be overwritten by cmake. +__version__ = "@OpenVINOGenAI_VERSION@" diff --git a/image_generation/common/diffusers/CMakeLists.txt b/image_generation/common/diffusers/CMakeLists.txt deleted file mode 100644 index cb5944d569..0000000000 --- a/image_generation/common/diffusers/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (C) 2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -file(GLOB_RECURSE "diffusers_SRC" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp") - -add_library(diffusers STATIC ${diffusers_SRC}) -add_library(diffusers::diffusers ALIAS diffusers) - -target_include_directories(diffusers - PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" - PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src") - -if(CMAKE_COMPILER_IS_GNUCXX) - target_compile_options(diffusers PUBLIC -march=native -Wall) -endif() - -# dependencies - -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(diffusers PUBLIC openvino::runtime) - -find_package(Eigen3 REQUIRED) -target_link_libraries(diffusers PUBLIC Eigen3::Eigen) diff --git a/image_generation/common/diffusers/include/lora.hpp b/image_generation/common/diffusers/include/lora.hpp deleted file mode 100644 index a47034b68e..0000000000 --- a/image_generation/common/diffusers/include/lora.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include <map> -#include <memory> -#include <string> -#include <vector> - -#include "openvino/op/constant.hpp" -#include "openvino/pass/graph_rewrite.hpp" - -class InsertLoRA : public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("InsertLoRA", "0"); - - using LoRAMap = std::map<std::string, std::shared_ptr<ov::op::v0::Constant>>; - - explicit InsertLoRA(LoRAMap& lora_map); - -private: - LoRAMap* m_lora_map; -}; - -std::map<std::string, InsertLoRA::LoRAMap> -read_lora_adapters(const std::string& filename, const float alpha = 0.75f); diff --git a/image_generation/common/diffusers/include/scheduler.hpp b/image_generation/common/diffusers/include/scheduler.hpp deleted file mode 100644 index 08a17eece5..0000000000 --- a/image_generation/common/diffusers/include/scheduler.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include <cstdint> -#include <vector> - -#include "openvino/runtime/tensor.hpp" - -enum class BetaSchedule { - LINEAR, - SCALED_LINEAR, -}; - -enum class PredictionType { - EPSILON, - SAMPLE, - V_PREDICTION -}; - -class Scheduler { -public: - virtual void set_timesteps(size_t num_inference_steps) = 0; - - virtual std::vector<std::int64_t> get_timesteps() const = 0; - - virtual float get_init_noise_sigma() const = 0; - - virtual void scale_model_input(ov::Tensor sample, size_t inference_step) = 0; - - virtual ov::Tensor step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) = 0; -}; diff --git a/image_generation/common/diffusers/include/scheduler_lms_discrete.hpp b/image_generation/common/diffusers/include/scheduler_lms_discrete.hpp deleted file mode 100644 index 
962f862b75..0000000000 --- a/image_generation/common/diffusers/include/scheduler_lms_discrete.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include <list> - -#include "scheduler.hpp" - -class LMSDiscreteScheduler : public Scheduler { -public: - LMSDiscreteScheduler(int32_t num_train_timesteps = 1000, - float beta_start = 0.00085f, - float beta_end = 0.012f, - BetaSchedule beta_schedule = BetaSchedule::SCALED_LINEAR, - PredictionType prediction_type = PredictionType::EPSILON, - const std::vector<float>& trained_betas = {}); - - void set_timesteps(size_t num_inference_steps) override; - - std::vector<std::int64_t> get_timesteps() const override; - - float get_init_noise_sigma() const override; - - void scale_model_input(ov::Tensor sample, size_t inference_step) override; - - ov::Tensor step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) override; - -private: - std::vector<float> m_log_sigmas; - std::vector<float> m_sigmas; - std::vector<int64_t> m_timesteps; - std::list<std::vector<float>> m_derivative_list; - - int64_t _sigma_to_t(float sigma) const; -}; diff --git a/image_generation/common/diffusers/src/lora.cpp b/image_generation/common/diffusers/src/lora.cpp deleted file mode 100644 index e6f5ba0b6e..0000000000 --- a/image_generation/common/diffusers/src/lora.cpp +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "lora.hpp" - -#include <algorithm> -#include <set> -#include <map> -#include <string> -#include <vector> -#include <fstream> - -#include <Eigen/Dense> - -#include "openvino/op/add.hpp" -#include "openvino/op/convolution.hpp" -#include "openvino/op/matmul.hpp" -#include "openvino/pass/pattern/matcher.hpp" -#include "openvino/pass/pattern/op/wrap_type.hpp" - -#define SAFETENSORS_IMPLEMENTATION -#include "safetensors.h" - -InsertLoRA::InsertLoRA(LoRAMap& lora_map) : - m_lora_map(&lora_map) { - OPENVINO_ASSERT(!m_lora_map->empty(), "Map with LoRA weights is empty"); - - auto pattern = ov::pass::pattern::wrap_type<ov::op::v0::MatMul, ov::op::v1::Convolution>(); - - ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { - auto root = m.get_match_root(); - if (!root) { - return false; - } - std::string root_name = root->get_friendly_name(); - std::replace(root_name.begin(), root_name.end(), '.', '_'); - - auto it = m_lora_map->begin(); - while (it != m_lora_map->end()) { - if (root_name.find(it->first) != std::string::npos) { - ov::Output<ov::Node> weights_port = root->input_value(1); - std::set<ov::Input<ov::Node>> consumers = weights_port.get_target_inputs(); - auto reshaped_const = std::make_shared<ov::op::v0::Constant>(*(it->second), weights_port.get_shape()); - auto lora_add = std::make_shared<ov::op::v1::Add>(weights_port, reshaped_const); - for (auto consumer : consumers) { - consumer.replace_source_output(lora_add->output(0)); - } - register_new_node(lora_add); - it = m_lora_map->erase(it); - break; - } else { - it++; - } - } - return true; - }; - - // Register pattern with Parameter operation as a pattern root node - auto m = std::make_shared<ov::pass::pattern::Matcher>(pattern, "InsertLoRA"); - // Register Matcher - register_matcher(m, callback); -} - -namespace { - -std::vector<std::uint8_t> read_file(const std::string& filename) { - std::ifstream file(filename, std::ios::binary | std::ios::ate); - OPENVINO_ASSERT(file.is_open(), "Cannot open file ", filename, " with LoRA weights"); - - 
size_t filesize = file.tellg(); - std::vector<std::uint8_t> buffer; - buffer.reserve(filesize); - - file.seekg(0, std::ios::beg); - std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), std::back_inserter(buffer)); - - return buffer; -} - -std::vector<float> convert_to_float(const safetensors_TensorDescriptor& tensor) { - std::vector<float> data; - size_t tensor_size = (tensor.end_offset_bytes - tensor.begin_offset_bytes) / sizeof(ov::float16); - - const ov::float16* ptr = static_cast<const ov::float16*>(tensor.ptr); - for (size_t i = 0; i < tensor_size; ++i) { - data.push_back(ptr[i]); - } - - return data; -} - -} // namespace - -std::map<std::string, InsertLoRA::LoRAMap> -read_lora_adapters(const std::string& filename, const float alpha) { - std::vector<std::uint8_t> file_buffer = read_file(filename); - void* buffer_ptr = file_buffer.data(); - - safetensors_File safe_tensors_file = {0}; - OPENVINO_ASSERT(safetensors_file_init(buffer_ptr, file_buffer.size(), &safe_tensors_file) == NULL, "Cannot parse ", filename, " using safetensors"); - - using FloatMatrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>; - using FloatMatrixMap = Eigen::Map<FloatMatrix>; - - // modify the layer name - std::map<std::string, InsertLoRA::LoRAMap> lora_constants; - - std::set<std::string> visited; - const std::string LORA_PREFIX_UNET = "lora_unet"; - const std::string LORA_PREFIX_TEXT_ENCODER = "lora_te"; - - // loading safetensor - for (int i = 0; i < safe_tensors_file.num_tensors; i++) { - std::map<std::string, std::string> lora_map; - - safetensors_TensorDescriptor tensor = safe_tensors_file.tensors[i]; - std::string tensor_name(tensor.name.ptr, tensor.name.ptr + tensor.name.len); - - const bool tensor_visited = std::find(visited.begin(), visited.end(), tensor_name) != visited.end(); - // alpha tensors are overriden by users' alpha - bool alpha_tensor = tensor_name.find(".alpha") != std::string::npos; - if (alpha_tensor || tensor_visited) - continue; - - const bool is_text_lora = tensor_name.find("text") != std::string::npos; - const std::string lora_prefix = is_text_lora ? LORA_PREFIX_TEXT_ENCODER : LORA_PREFIX_UNET; - std::string layer_infos = tensor_name.substr(tensor_name.find(lora_prefix) + lora_prefix.length() + 1); - // drop LoRA name suffixes which comes after '.' - std::string layer_name_str = layer_infos.substr(0, layer_infos.find(".")); - // Create C++ lora_map instead of Python lora_dict - lora_map["name"] = layer_name_str; - lora_map["type"] = is_text_lora ? 
"text_encoder" : "unet"; - - // update value of weights - std::vector<safetensors_TensorDescriptor> pair_tensor; - - // up at first, down at second - if (tensor_name.find("lora_down") != std::string::npos) { - pair_tensor.push_back(safe_tensors_file.tensors[i + 1]); - pair_tensor.push_back(safe_tensors_file.tensors[i]); - } else { - pair_tensor.push_back(safe_tensors_file.tensors[i]); - pair_tensor.push_back(safe_tensors_file.tensors[i + 1]); - } - - for (auto p_t : pair_tensor) { - safetensors_Str key_st = p_t.name; - std::string k_s(key_st.ptr, key_st.ptr + key_st.len); - visited.insert(k_s); - } - - ov::Shape shape_vec_0(pair_tensor[0].shape, pair_tensor[0].shape + pair_tensor[0].n_dimensions); - ov::Shape shape_vec_1(pair_tensor[1].shape, pair_tensor[1].shape + pair_tensor[1].n_dimensions); - - // matmul with floats - std::vector<float> float_data_0 = convert_to_float(pair_tensor[0]); - std::vector<float> float_data_1 = convert_to_float(pair_tensor[1]); - - // RowMajor - FloatMatrixMap mat2d_f_0(float_data_0.data(), shape_vec_0[0], shape_vec_0[1]); - FloatMatrixMap mat2d_f_1(float_data_1.data(), shape_vec_1[0], shape_vec_1[1]); - FloatMatrix matmul_f = alpha * mat2d_f_0 * mat2d_f_1; - - lora_constants[is_text_lora ? "text_encoder" : "unet"][layer_name_str] = - ov::op::v0::Constant::create(ov::element::f32, {static_cast<size_t>(matmul_f.rows() * matmul_f.cols())}, matmul_f.data()); - } - - free(safe_tensors_file.tensors); - free(safe_tensors_file.metadata); - - return lora_constants; -} diff --git a/image_generation/common/diffusers/src/safetensors.h b/image_generation/common/diffusers/src/safetensors.h deleted file mode 100644 index 49c08a9963..0000000000 --- a/image_generation/common/diffusers/src/safetensors.h +++ /dev/null @@ -1,539 +0,0 @@ -/* - Harris M. Snyder, 2023 - This is free and unencumbered software released into the public domain. - - safetensors.h: a library for reading .safetensors files from C. - - Basic usage: read the entire .safetensors file into memory (this is not - handled by safetensors.h) and feed it to safetensors_file_init(). This - will populate a safetensors_File struct, which contains an array of - tensor descriptors. You can then loop over the tensor descriptors and - pull out what you need. See the structs and functions below for details. - - This file is a single-header library (credit to Sean Barrett for the - idea); it includes both the header and the actual definitions in - a single file. To use this library, copy it into your project, and - define SAFETENSORS_IMPLEMENTATION in exactly one .c file, immediately - before you include safetensors.h - - The library depends only on the following headers from the standard - library: - - limits.h - - stdint.h - - stdlib.h - The latter is for realloc. A future update will allow the user to - control the memory allocation, so that stdlib.h is not needed. - -*/ - -#ifndef SAFETENSORS_H -#define SAFETENSORS_H - -#include <stdint.h> - -#ifndef SAFETENSORS_MAX_DIM -# define SAFETENSORS_MAX_DIM 20 -#endif - -typedef struct { - int len; - char* ptr; -} safetensors_Str; - -typedef struct { - safetensors_Str name; - // the pointer inside this struct will point into the - // memory block passed to safetensors_file_init() - - int dtype; - // will be one of the enum values below - - int n_dimensions; - int64_t shape[SAFETENSORS_MAX_DIM]; - // only the first n_dimensions entry of shape are meaningful - - int64_t begin_offset_bytes; - int64_t end_offset_bytes; - // values taken directly from file. 
an offset of 0 means the - // exact start of the portion of the file that follows the - // header (i.e. it is NOT an offset into the entire file). - - void* ptr; - // this will be pre-populated assuming that the memory block - // that was fed to safetensors_file_init() was the entire file, - // i.e. that the actual data immediately follows the header. - // if this is not the case, this pointer will be bogus, use the - // offsets to manually compute the location. -} safetensors_TensorDescriptor; - -typedef struct { - safetensors_Str name; - safetensors_Str value; -} safetensors_MetadataEntry; - -typedef struct { - int c; // internal use - - char* error_context; - // if safetensors_file_init() fails, this pointer will be set to - // where in the file memory block the error occurred. - - void* one_byte_past_end_of_header; - // after calling safetensors_file_init, this will point to the - // next byte after the end of the header - - safetensors_TensorDescriptor* tensors; - safetensors_MetadataEntry* metadata; - // these ^ are allocated automatically by safetensors_file_init() - // and are contiguous arrays. the user should free() them when done. - - int num_tensors; - int num_metadata; - // the lengths of the above arrays -} safetensors_File; - -char* safetensors_file_init(void* file_buffer, int64_t file_buffer_size_bytes, safetensors_File* out); -// Given a file buffer, parses the safetensors header and populates a safetensors_File -// structure so that the client program can find the data it wants. file_buffer should -// point to a buffer that contains at least the entire header, or more preferably the -// whole safetensors file. -// -// Returns 0 on success. On failure, returns a static error message string and sets -// out->error_context such that it points to where in file_buffer the error happened. 
- -static int safetensors_str_equal(safetensors_Str a, const char* b) -// For convenience: easily check if a tensor name matches a given string literal -{ - if (!b) - return 0; - int equal = 1; - for (int i = 0; (i < a.len && equal && b[i]); i++) - equal = equal && a.ptr[i] == b[i]; - return equal; -} - -// Enum values for the 'dtype' field -enum { - SAFETENSORS_F64 = 0, - SAFETENSORS_F32, - SAFETENSORS_F16, - SAFETENSORS_BF16, - SAFETENSORS_I64, - SAFETENSORS_I32, - SAFETENSORS_I16, - SAFETENSORS_I8, - SAFETENSORS_U8, - SAFETENSORS_BOOL, - - SAFETENSORS_NUM_DTYPES -}; - -#endif - -/* - ============================================================================ - END OF HEADER SECTION - Implementation follows - ============================================================================ -*/ - -#ifdef SAFETENSORS_IMPLEMENTATION - -#ifndef assert -# ifdef SAFETENSORS_DISABLE_ASSERTIONS -# define assert(c) -# else -# if defined(_MSC_VER) -# define assert(c) \ - if (!(c)) { \ - __debugbreak(); \ - } -# else -# if defined(__GNUC__) || defined(__clang__) -# define assert(c) \ - if (!(c)) { \ - __builtin_trap(); \ - } -# else -# define assert(c) \ - if (!(c)) { \ - *(volatile int*)0 = 0; \ - } -# endif -# endif -# endif -#endif - -#include <limits.h> -#include <stdlib.h> - -static int64_t parse_positive_int(char** ptr, char* limit) { - /* - - Skips preceeding spaces and tabs - - Won't read past 'limit' - - Doesn't check for integer overflow - - Doesn't parse negative numbers - - Returns -1 on failure - */ - char* str = *ptr; - - while (*str == ' ' || *str == '\t') - str++; - - int64_t v = 0; - int n = 0; - while (str < limit && *str >= 48 && *str <= 57) { - int digit = *str - 48; - v *= 10; - v += digit; - str++; - n++; - } - - if (n > 0) { - *ptr = str; - return v; - } - - return -1; -} - -static int eat(char** ptr, char* limit, char expected) { - // skip whitespace - // return 0 if we hit the limit - // return 0 if data don't match the expected string - // otherwise, return 1 and move the pointer to the end of the match - char* p = *ptr; - while (*p == ' ' || *p == '\t') - ++p; - if (p + 1 > limit) - return 0; - if (*p != expected) - return 0; - *ptr = p + 1; - return 1; -} - -static int peek(char* ptr, char* limit, char expected) { - // same as eat, but doesn't adjust the pointer - char* tmp = ptr; - return eat(&tmp, limit, expected); -} - -typedef struct { - int num_entries; - int64_t entries[SAFETENSORS_MAX_DIM]; -} IntList; - -static int eat_intlist(char** ptr, char* limit, IntList* out) { - // *out = (IntList){0}; - *out = IntList(); - out->num_entries = 0; - char* p = *ptr; - if (!eat(&p, limit, '[')) - return 0; - - while (p < limit) { - char* p_save = p; - if (eat(&p, limit, ']')) - break; - - int64_t val = parse_positive_int(&p, limit); - if (val == -1) { - return 0; - } else { - out->entries[out->num_entries++] = val; - if (out->num_entries == SAFETENSORS_MAX_DIM) { - return 0; // unsupported tensor dimensions (TODO improve handling) - } - } - - if (!eat(&p, limit, ',')) - if (!peek(p, limit, ']')) - return 0; - - assert(p != p_save); - } - - *ptr = p; - return 1; -} - -static int eat_string(char** ptr, char* limit, safetensors_Str* out) { - char delim = 0; - - if (eat(ptr, limit, '\'')) - delim = '\''; - else if (eat(ptr, limit, '"')) - delim = '"'; - else - return 0; // bad delimiter - - int len = 0; - char* p = *ptr; - char* start = p; - - while (p < limit) { - if (*p == delim && p[-1] != '\\') { - ++p; - goto string_ok; - } else { - ++p; - ++len; - } - } - return 0; // 
unterminated - -string_ok: - assert(p <= limit); - *ptr = p; - // *out = (safetensors_Str) { .len=len, .ptr=start}; - safetensors_Str str; - str.len = len; - str.ptr = start; - *out = str; - return 1; -} - -typedef struct { - safetensors_Str key; - int value_is_str; - union { - safetensors_Str svalue; - IntList ivalue; - }; -} KeyValuePair; - -static int eat_kv_pair(char** ptr, char* limit, KeyValuePair* kvp) { - char* p = *ptr; - - // mandatory string (key) - if (!eat_string(&p, limit, &kvp->key)) - return 0; - - if (!eat(&p, limit, ':')) - return 0; - - // value can be string, or list of integers - safetensors_Str str_value = {0}; - IntList intlist_value = {0}; - - if (!eat_string(&p, limit, &str_value)) { - if (!eat_intlist(&p, limit, &intlist_value)) { - return 0; - } else { - kvp->value_is_str = 0; - kvp->ivalue = intlist_value; - } - } else { - kvp->value_is_str = 1; - kvp->svalue = str_value; - } - - *ptr = p; - return 1; -} - -static void mem_copy(void* dest, void* source, unsigned num) { - // unsigned char *d=dest, *s=source; - unsigned char* d = (unsigned char*)dest; - unsigned char* s = (unsigned char*)source; - for (unsigned i = 0; i < num; i++) - d[i] = s[i]; -} - -static char* more_memory(safetensors_File* out) { - if (out->num_tensors == out->c || out->num_metadata == out->c) { - void* new_tensors = realloc(out->tensors, sizeof(out->tensors[0]) * (out->c + 100)); - // if (!new_tensors) return "Out of memory"; - if (!new_tensors) - return const_cast<char*>("Out of memory"); - - // out->tensors = new_tensors; - out->tensors = (safetensors_TensorDescriptor*)new_tensors; - - void* new_metadata = realloc(out->metadata, sizeof(out->metadata[0]) * (out->c + 100)); - // if (!new_metadata) return "Out of memory"; - if (!new_metadata) - return const_cast<char*>("Out of memory"); - - // out->metadata = new_metadata; - out->metadata = (safetensors_MetadataEntry*)new_metadata; - out->c += 100; - } - return 0; -} - -char* apply_key_value_pair(safetensors_File* out, KeyValuePair kvp, char* baseptr) { - // #define KNOWN_DTYPES "F64, F32, F16, BF16, I64, I32, I16, I8, U8, or BOOL" - if (safetensors_str_equal(kvp.key, "dtype")) { - if (!kvp.value_is_str) - return const_cast<char*>("Expected a string value for 'dtype'"); - if (safetensors_str_equal(kvp.svalue, "F64")) - out->tensors[out->num_tensors].dtype = SAFETENSORS_F64; - else if (safetensors_str_equal(kvp.svalue, "F32")) - out->tensors[out->num_tensors].dtype = SAFETENSORS_F32; - else if (safetensors_str_equal(kvp.svalue, "F16")) - out->tensors[out->num_tensors].dtype = SAFETENSORS_F16; - else if (safetensors_str_equal(kvp.svalue, "BF16")) - out->tensors[out->num_tensors].dtype = SAFETENSORS_BF16; - else if (safetensors_str_equal(kvp.svalue, "I64")) - out->tensors[out->num_tensors].dtype = SAFETENSORS_I64; - else if (safetensors_str_equal(kvp.svalue, "I32")) - out->tensors[out->num_tensors].dtype = SAFETENSORS_I32; - else if (safetensors_str_equal(kvp.svalue, "I16")) - out->tensors[out->num_tensors].dtype = SAFETENSORS_I16; - else if (safetensors_str_equal(kvp.svalue, "I8")) - out->tensors[out->num_tensors].dtype = SAFETENSORS_I8; - else if (safetensors_str_equal(kvp.svalue, "U8")) - out->tensors[out->num_tensors].dtype = SAFETENSORS_U8; - else if (safetensors_str_equal(kvp.svalue, "BOOL")) - out->tensors[out->num_tensors].dtype = SAFETENSORS_BOOL; - // else return "Unrecognized datatype (expected " KNOWN_DTYPES ")"; - else - return const_cast<char*>( - "Unrecognized datatype (expected F64, F32, F16, BF16, I64, I32, I16, I8, U8, or 
BOOL)"); - - } else if (safetensors_str_equal(kvp.key, "shape")) { - if (kvp.value_is_str) - return const_cast<char*>("Expected an integer list value for 'shape'"); - out->tensors[out->num_tensors].n_dimensions = kvp.ivalue.num_entries; - for (int i = 0; i < kvp.ivalue.num_entries; i++) - out->tensors[out->num_tensors].shape[i] = kvp.ivalue.entries[i]; - } else if (safetensors_str_equal(kvp.key, "data_offsets")) { - if (kvp.value_is_str) - return const_cast<char*>("Expected an integer list value for 'shape'"); - if (kvp.ivalue.num_entries != 2) - return const_cast<char*>("Expected exactly two entries for the value of 'offsets'"); - out->tensors[out->num_tensors].begin_offset_bytes = kvp.ivalue.entries[0]; - out->tensors[out->num_tensors].end_offset_bytes = kvp.ivalue.entries[1]; - out->tensors[out->num_tensors].ptr = baseptr + kvp.ivalue.entries[0]; - } else { - // error? ignore? - return const_cast<char*>("Unexpected key (expected dtype, shape, or data_offsets)"); - } - return 0; -} - -char* safetensors_file_init(void* file_buffer, int64_t file_buffer_bytes, safetensors_File* out) { - // *out = (safetensors_File){0}; - safetensors_File file; - file.c = 0; - file.error_context = nullptr; - file.one_byte_past_end_of_header = nullptr; - file.tensors = nullptr; - file.metadata = nullptr; - file.num_tensors = 0; - file.num_metadata = 0; - - *out = file; - int header_len = 0; - { - uint64_t header_len_u64 = 0; - mem_copy(&header_len_u64, file_buffer, sizeof(header_len_u64)); - // if (header_len_u64 > (uint64_t)INT_MAX) - if (header_len_u64 > static_cast<uint64_t>(INT_MAX)) { -#define STRINGIFY(x) #x - // return "File header allegedly more than INT_MAX (" STRINGIFY(INT_MAX) ") bytes, file likely corrupt"; - return const_cast<char*>( - "File header allegedly more than INT_MAX (" STRINGIFY(INT_MAX) ") bytes, file likely corrupt"); - } - header_len = header_len_u64; - } - assert(header_len >= 0); - if (header_len == 0) - return const_cast<char*>("File header allegedly zero bytes, file likely corrupt"); - - char* t = ((char*)file_buffer) + 8; - char* e = t + header_len; - out->one_byte_past_end_of_header = e; - - char* tensor_data_baseptr = t + header_len; - -// #define ST_ERR(message) return out->error_context=t, (message); -#define ST_ERR(message) return out->error_context = t, const_cast<char*>(message) - // mandatory open brace starts the header - if (!eat(&t, e, '{')) - ST_ERR("Expected '{'"); - - // loop over header entries - while (t < e) { - char* t_save = t; - - // if we hit a close brace, we're done - if (eat(&t, e, '}')) - goto header_ok; - - // mandatory string (tensor name) - safetensors_Str tensor_name = {0}; - if (!eat_string(&t, e, &tensor_name)) - ST_ERR("Expected tensor name"); - if (!eat(&t, e, ':')) - ST_ERR("Expected colon after tensor name"); - - char* alloc_error = more_memory(out); - if (alloc_error) - ST_ERR(alloc_error); - - out->tensors[out->num_tensors].name = tensor_name; - - // open brace starts a header entry - if (eat(&t, e, '{')) { - // loop over key-value pairs inside the header entry - while (t < e) { - char* t_save = 0; - - // close brace terminates the header entry - if (eat(&t, e, '}')) { - if (!safetensors_str_equal(tensor_name, "__metadata__")) - ++out->num_tensors; - break; - } - - // otherwise it's a key-value pair - KeyValuePair kvp = {0}; - char* error_context = t; - if (!eat_kv_pair(&t, e, &kvp)) - ST_ERR("Expected a key-value pair"); - - // figure out what to do with the key-value pair - if (safetensors_str_equal(tensor_name, "__metadata__")) { - if 
(!kvp.value_is_str) - return out->error_context = error_context, - const_cast<char*>("Expected a string value for a metadata entry"); - // out->metadata[out->num_metadata++] = - // (safetensors_MetadataEntry) { - // .name = kvp.key, - // .value = kvp.svalue - // }; - safetensors_MetadataEntry entry; - entry.name = kvp.key; - entry.value = kvp.svalue; - out->metadata[out->num_metadata++] = entry; - } else { - char* kvp_error = apply_key_value_pair(out, kvp, tensor_data_baseptr); - if (kvp_error) - return out->error_context = error_context, kvp_error; - } - - if (!eat(&t, e, ',')) - if (!peek(t, e, '}')) - ST_ERR("Expected comma"); - - assert(t != t_save); - } - } - - if (!eat(&t, e, ',')) - if (!peek(t, e, '}')) - ST_ERR("Expected comma"); - - assert(t != t_save); - } - ST_ERR("Unterminated header"); -header_ok: - return 0; -#undef ST_ERR -} - -#endif diff --git a/image_generation/common/imwrite/CMakeLists.txt b/image_generation/common/imwrite/CMakeLists.txt deleted file mode 100644 index b6d457063e..0000000000 --- a/image_generation/common/imwrite/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (C) 2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -add_library(imwrite STATIC src/imwrite.cpp) -add_library(imwrite::imwrite ALIAS imwrite) - -target_include_directories(imwrite PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") - -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(imwrite PRIVATE openvino::runtime) diff --git a/image_generation/common/imwrite/include/imwrite.hpp b/image_generation/common/imwrite/include/imwrite.hpp deleted file mode 100644 index 9d8ee04efb..0000000000 --- a/image_generation/common/imwrite/include/imwrite.hpp +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include <string> - -#include "openvino/runtime/tensor.hpp" - -/** - * @brief Writes image to file - * @param name File name - * @param image Image tensor - * @param convert_bgr2rgb Convert BGR to RGB - */ -void imwrite(const std::string& name, ov::Tensor image, bool convert_bgr2rgb); diff --git a/image_generation/stable_diffusion_1_5/cpp/CMakeLists.txt b/image_generation/stable_diffusion_1_5/cpp/CMakeLists.txt deleted file mode 100644 index 0e14623e15..0000000000 --- a/image_generation/stable_diffusion_1_5/cpp/CMakeLists.txt +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (C) 2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -cmake_minimum_required(VERSION 3.15) - -project(stable_diffusion LANGUAGES CXX) - -if(POLICY CMP0135) - cmake_policy(SET CMP0135 NEW) -endif() - -set(CMAKE_CXX_STANDARD 17) - -set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake build type") - -# dependencies - -find_package(OpenVINO REQUIRED COMPONENTS Runtime) - -include(FetchContent) - -FetchContent_Declare(cxxopts - URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz - URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) - -FetchContent_MakeAvailable(cxxopts) - -add_subdirectory(../../common/imwrite _deps/imwrite) -add_subdirectory(../../common/diffusers _deps/diffusers) - -set(CUSTOM_OPERATIONS tokenizer) -add_subdirectory(../../../thirdparty/openvino_contrib/modules/custom_operations/ _deps/tokenizers) - -# create executable - -add_executable(${PROJECT_NAME} ${PROJECT_SOURCE_DIR}/src/main.cpp) - -target_include_directories(${PROJECT_NAME} PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/src - ${EIGEN3_INCLUDE_DIR}) - -target_link_libraries(${PROJECT_NAME} 
PRIVATE - openvino::runtime - cxxopts::cxxopts - diffusers::diffusers - imwrite::imwrite) - -add_dependencies(${PROJECT_NAME} user_ov_extensions) - -target_compile_definitions(${PROJECT_NAME} PRIVATE TOKENIZERS_LIBRARY_PATH=\"$<TARGET_FILE:user_ov_extensions>\") diff --git a/image_generation/stable_diffusion_1_5/cpp/README.md b/image_generation/stable_diffusion_1_5/cpp/README.md deleted file mode 100644 index 6d81d2ff1f..0000000000 --- a/image_generation/stable_diffusion_1_5/cpp/README.md +++ /dev/null @@ -1,117 +0,0 @@ -# OpenVINO Stable Diffusion (with LoRA) C++ pipeline -The pure C++ text-to-image pipeline, driven by the OpenVINO native API for Stable Diffusion v1.5 with LMS Discrete Scheduler, supports both static and dynamic model inference. It includes advanced features like LoRA integration with safetensors and [OpenVINO extension for tokenizers](https://github.com/openvinotoolkit/openvino_contrib/blob/master/modules/custom_operations/user_ie_extensions/tokenizer/python/README.md). This demo has been tested on Windows and Linux platform. - -> [!NOTE] ->This tutorial assumes that the current working directory is `<openvino.genai repo>/image_generation/stable_diffusion_1_5/cpp/` and all paths are relative to this folder. - -## Step 1: Prepare build environment - -C++ Packages: -* [CMake](https://cmake.org/download/): Cross-platform build tool -* [OpenVINO](https://docs.openvino.ai/2023.2/openvino_docs_install_guides_overview.html): Model inference -* Eigen3: Lora enabling - -Prepare a python environment and install dependencies: -```shell -conda create -n openvino_sd_cpp python==3.10 -conda activate openvino_sd_cpp -conda install openvino eigen c-compiler cxx-compiler make -``` - -## Step 2: Convert Stable Diffusion v1.5 and Tokenizer models - -### Stable Diffusion v1.5 model: - -1. Install dependencies to import models from HuggingFace: -```shell -conda activate openvino_sd_cpp -python -m pip install -r scripts/requirements.txt -python -m pip install ../../../thirdparty/openvino_contrib/modules/custom_operations/[transformers] -``` -2. Download a huggingface SD v1.5 model like: -- [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) -- [dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) to run Stable Diffusion with LoRA adapters. - - -Example command: -```shell -huggingface-cli download --resume-download --local-dir-use-symlinks False dreamlike-art/dreamlike-anime-1.0 --local-dir models/dreamlike-anime-1.0 -``` - -Please, refer to the official website for [model downloading](https://huggingface.co/docs/hub/models-downloading) to read more details. - -3. Run model conversion script to convert PyTorch model to OpenVINO IR via [optimum-intel](https://github.com/huggingface/optimum-intel). Please, use the script `scripts/convert_model.py` to convert the model into `FP16_static` or `FP16_dyn`, which will be saved into the `models` folder: -```shell -cd scripts -python convert_model.py -b 1 -t FP16 -sd ../models/dreamlike-anime-1.0 # to convert to models with static shapes -python convert_model.py -b 1 -t FP16 -sd ../models/dreamlike-anime-1.0 -dyn True # to keep models with dynamic shapes -python convert_model.py -b 1 -t INT8 -sd ../models/dreamlike-anime-1.0 -dyn True # to compress the models to INT8 -``` - -> [!NOTE] ->Now the pipeline support batch size = 1 only, i.e. 
static model `(1, 3, 512, 512)` - -### LoRA enabling with safetensors - -Refer to [python pipeline blog](https://blog.openvino.ai/blog-posts/enable-lora-weights-with-stable-diffusion-controlnet-pipeline). -The safetensor model is loaded via [safetensors.h](https://github.com/hsnyder/safetensors.h). The layer name and weight are modified with `Eigen Lib` and inserted into the SD model with `ov::pass::MatcherPass` in the file [common/diffusers/src/lora.cpp](https://github.com/openvinotoolkit/openvino.genai/blob/master/image_generation/common/diffusers/src/lora.cpp). - -SD model [dreamlike-anime-1.0](https://huggingface.co/dreamlike-art/dreamlike-anime-1.0) and Lora [soulcard](https://civitai.com/models/67927?modelVersionId=72591) are tested in this pipeline. - -Download and put safetensors and model IR into the models folder. - -## Step 3: Build the SD application - -```shell -conda activate openvino_sd_cpp -cmake -DCMAKE_BUILD_TYPE=Release -S . -B build -cmake --build build --parallel -``` - -## Step 4: Run Pipeline -```shell -./stable_diffusion [-p <posPrompt>] [-n <negPrompt>] [-s <seed>] [--height <output image>] [--width <output image>] [-d <device>] [-r <readNPLatent>] [-l <lora.safetensors>] [-a <alpha>] [-h <help>] [-m <modelPath>] [-t <modelType>] - -Usage: - stable_diffusion [OPTION...] -``` - -* `-p, --posPrompt arg` Initial positive prompt for SD (default: cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting) -* `-n, --negPrompt arg` Default is empty with space (default: ) -* `-d, --device arg` AUTO, CPU, or GPU (default: CPU) -* `--step arg` Number of diffusion step ( default: 20) -* `-s, --seed arg` Number of random seed to generate latent (default: 42) -* `--num arg` Number of image output(default: 1) -* `--height arg` Height of output image (default: 512) -* `--width arg` Width of output image (default: 512) -* `-c, --useCache` Use model caching -* `-r, --readNPLatent` Read numpy generated latents from file -* `-m, --modelPath arg` Specify path of SD model IR (default: ../models/dreamlike-anime-1.0) -* `-t, --type arg` Specify the type of SD model IR (FP16_static or FP16_dyn) (default: FP16_static) -* `-l, --loraPath arg` Specify path of lora file. (*.safetensors). (default: ) -* `-a, --alpha arg` alpha for lora (default: 0.75) -* `-h, --help` Print usage - -#### Examples - -Positive prompt: cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting - -Negative prompt: (empty, here couldn't use OV tokenizer, check the issues for details) - -Read the numpy latent instead of C++ std lib for the alignment with Python pipeline - -* Generate image without lora `./stable_diffusion -r` - - - -* Generate image with soulcard lora `./stable_diffusion -r` - - - -* Generate different size image with dynamic model (C++ lib generated latent): `./stable_diffusion -m ../models/dreamlike-anime-1.0 -t FP16_dyn --height 448 --width 704` - - - -## Notes: - -For the generation quality, be careful with the negative prompt and random latent generation. C++ random generation with MT19937 results is differ from `numpy.random.randn()`. 
Hence, please use `-r, --readNPLatent` for the alignment with Python (this latent file is for output image 512X512 only) diff --git a/image_generation/stable_diffusion_1_5/cpp/scripts/convert_model.py b/image_generation/stable_diffusion_1_5/cpp/scripts/convert_model.py deleted file mode 100644 index 3dd309296a..0000000000 --- a/image_generation/stable_diffusion_1_5/cpp/scripts/convert_model.py +++ /dev/null @@ -1,49 +0,0 @@ -from pathlib import Path -import argparse -from optimum.intel.openvino import OVStableDiffusionPipeline -from openvino import Type, save_model -from transformers import AutoTokenizer -from openvino_tokenizers import convert_tokenizer -import torch - - -def parse_args() -> argparse.Namespace: - """Parse and return command line arguments.""" - parser = argparse.ArgumentParser(add_help=False) - args = parser.add_argument_group('Options') - # fmt: off - args.add_argument('-h', '--help', action = 'help', - help='Show this help message and exit.') - args.add_argument('-b', '--batch', type = int, default = 1, required = True, - help='Required. batch_size for solving single/multiple prompt->image generation.') - args.add_argument('-t', '--type', type = str, default = "FP32", required = True, - help='Required. data type, FP32, FP16, and compressed type INT8.') - args.add_argument('-dyn', '--dynamic', type = bool, default = False, required = False, - help='Specify the model input shape to use dynamic shape.') - args.add_argument('-sd','--sd_weights', type = str, default="", required = True, - help='Specify the path of stable diffusion model') - # fmt: on - return parser.parse_args() - -args = parse_args() - -load_in_8bit = True if args.type == "INT8" else False -output_path = Path(args.sd_weights) / (args.type + ("_dyn" if args.dynamic else "_static")) - -# convert SD models to IR - -model = OVStableDiffusionPipeline.from_pretrained(args.sd_weights, trust_remote_code=True, export=True, compile=False, load_in_8bit=load_in_8bit) -if args.type == "FP16": - model.half() -if not args.dynamic: - model.reshape(args.batch, 512, 512, 1) - -model.save_pretrained(output_path) - -# convert tokenizer - -tokenizer_path = output_path / "tokenizer" -hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) -ov_tokenizer_encoder = convert_tokenizer(hf_tokenizer, tokenizer_output_type=Type.i32) - -save_model(ov_tokenizer_encoder, tokenizer_path / "openvino_tokenizer.xml", compress_to_fp16=False) diff --git a/image_generation/stable_diffusion_1_5/cpp/scripts/np_latents_512x512.txt b/image_generation/stable_diffusion_1_5/cpp/scripts/np_latents_512x512.txt deleted file mode 100644 index 8d378e474f..0000000000 --- a/image_generation/stable_diffusion_1_5/cpp/scripts/np_latents_512x512.txt +++ /dev/null @@ -1,4096 +0,0 @@ -4.96714145e-01 -1.38264298e-01 6.47688568e-01 1.52302980e+00 - -2.34153375e-01 -2.34136954e-01 1.57921278e+00 7.67434716e-01 - -4.69474375e-01 5.42560041e-01 -4.63417679e-01 -4.65729743e-01 - 2.41962269e-01 -1.91328025e+00 -1.72491789e+00 -5.62287509e-01 - -1.01283109e+00 3.14247340e-01 -9.08024073e-01 -1.41230369e+00 - 1.46564877e+00 -2.25776300e-01 6.75282031e-02 -1.42474818e+00 - -5.44382751e-01 1.10922590e-01 -1.15099359e+00 3.75698030e-01 - -6.00638688e-01 -2.91693747e-01 -6.01706624e-01 1.85227823e+00 - -1.34972250e-02 -1.05771089e+00 8.22544932e-01 -1.22084367e+00 - 2.08863601e-01 -1.95967007e+00 -1.32818604e+00 1.96861237e-01 - 7.38466561e-01 1.71368286e-01 -1.15648285e-01 -3.01103681e-01 - -1.47852194e+00 -7.19844222e-01 -4.60638762e-01 1.05712223e+00 - 3.43618304e-01 
-1.76304018e+00 3.24083984e-01 -3.85082275e-01 - -6.76922023e-01 6.11676276e-01 1.03099954e+00 9.31280136e-01 - -8.39217544e-01 -3.09212387e-01 3.31263423e-01 9.75545108e-01 - -4.79174227e-01 -1.85658976e-01 -1.10633492e+00 -1.19620657e+00 - 8.12525809e-01 1.35624003e+00 -7.20101222e-02 1.00353289e+00 - 3.61636013e-01 -6.45119727e-01 3.61395597e-01 1.53803658e+00 - -3.58260386e-02 1.56464362e+00 -2.61974502e+00 8.21902514e-01 - 8.70470703e-02 -2.99007356e-01 9.17607769e-02 -1.98756886e+00 - -2.19671890e-01 3.57112557e-01 1.47789407e+00 -5.18270195e-01 - -8.08493614e-01 -5.01757026e-01 9.15402114e-01 3.28751117e-01 - -5.29760182e-01 5.13267457e-01 9.70775485e-02 9.68644977e-01 - -7.02053070e-01 -3.27662140e-01 -3.92108142e-01 -1.46351492e+00 - 2.96120286e-01 2.61055261e-01 5.11345686e-03 -2.34587133e-01 - -1.41537070e+00 -4.20645326e-01 -3.42714518e-01 -8.02277267e-01 - -1.61285713e-01 4.04050857e-01 1.88618588e+00 1.74577817e-01 - 2.57550389e-01 -7.44459182e-02 -1.91877127e+00 -2.65138745e-02 - 6.02302104e-02 2.46324205e+00 -1.92360967e-01 3.01547348e-01 - -3.47117707e-02 -1.16867805e+00 1.14282286e+00 7.51933038e-01 - 7.91031957e-01 -9.09387469e-01 1.40279436e+00 -1.40185106e+00 - 5.86857080e-01 2.19045568e+00 -9.90536332e-01 -5.66297710e-01 - 9.96513665e-02 -5.03475666e-01 -1.55066347e+00 6.85629770e-02 - -1.06230366e+00 4.73592430e-01 -9.19424236e-01 1.54993439e+00 - -7.83253312e-01 -3.22061509e-01 8.13517213e-01 -1.23086429e+00 - 2.27459937e-01 1.30714273e+00 -1.60748327e+00 1.84633866e-01 - 2.59882808e-01 7.81822860e-01 -1.23695076e+00 -1.32045662e+00 - 5.21941543e-01 2.96984673e-01 2.50492841e-01 3.46448213e-01 - -6.80024743e-01 2.32253700e-01 2.93072462e-01 -7.14351416e-01 - 1.86577451e+00 4.73832935e-01 -1.19130349e+00 6.56553626e-01 - -9.74681675e-01 7.87084579e-01 1.15859556e+00 -8.20682347e-01 - 9.63376105e-01 4.12780941e-01 8.22060168e-01 1.89679301e+00 - -2.45388120e-01 -7.53736138e-01 -8.89514446e-01 -8.15810263e-01 - -7.71017075e-02 3.41151983e-01 2.76690811e-01 8.27183247e-01 - 1.30018918e-02 1.45353413e+00 -2.64656842e-01 2.72016907e+00 - 6.25667334e-01 -8.57157528e-01 -1.07089245e+00 4.82472420e-01 - -2.23462790e-01 7.14000523e-01 4.73237634e-01 -7.28289112e-02 - -8.46793711e-01 -1.51484728e+00 -4.46514964e-01 8.56398821e-01 - 2.14093745e-01 -1.24573874e+00 1.73180923e-01 3.85317385e-01 - -8.83857429e-01 1.53725103e-01 5.82087189e-02 -1.14297032e+00 - 3.57787371e-01 5.60784519e-01 1.08305120e+00 1.05380201e+00 - -1.37766933e+00 -9.37825024e-01 5.15035272e-01 5.13785958e-01 - 5.15047669e-01 3.85273147e+00 5.70890486e-01 1.13556564e+00 - 9.54001784e-01 6.51391268e-01 -3.15269232e-01 7.58969247e-01 - -7.72825241e-01 -2.36818612e-01 -4.85363543e-01 8.18741396e-02 - 2.31465864e+00 -1.86726522e+00 6.86260164e-01 -1.61271584e+00 - -4.71931875e-01 1.08895063e+00 6.42800182e-02 -1.07774472e+00 - -7.15303719e-01 6.79597735e-01 -7.30366647e-01 2.16458589e-01 - 4.55718413e-02 -6.51600361e-01 2.14394403e+00 6.33919001e-01 - -2.02514267e+00 1.86454311e-01 -6.61786437e-01 8.52433324e-01 - -7.92520761e-01 -1.14736438e-01 5.04987299e-01 8.65755200e-01 - -1.20029640e+00 -3.34501237e-01 -4.74945307e-01 -6.53329253e-01 - 1.76545429e+00 4.04981703e-01 -1.26088393e+00 9.17861938e-01 - 2.12215614e+00 1.03246522e+00 -1.51936996e+00 -4.84234065e-01 - 1.26691115e+00 -7.07669437e-01 4.43819433e-01 7.74634063e-01 - -9.26930487e-01 -5.95253557e-02 -3.24126744e+00 -1.02438760e+00 - -2.52568156e-01 -1.24778318e+00 1.63241136e+00 -1.43014133e+00 - -4.40044492e-01 1.30740583e-01 1.44127333e+00 
-1.43586218e+00 - 1.16316378e+00 1.02330614e-02 -9.81508672e-01 4.62103486e-01 - 1.99059695e-01 -6.00216866e-01 6.98020831e-02 -3.85313600e-01 - 1.13517344e-01 6.62130654e-01 1.58601677e+00 -1.23781550e+00 - 2.13303328e+00 -1.95208776e+00 -1.51785091e-01 5.88317215e-01 - 2.80991882e-01 -6.22699499e-01 -2.08122253e-01 -4.93000925e-01 - -5.89364767e-01 8.49602103e-01 3.57015491e-01 -6.92909598e-01 - 8.99599850e-01 3.07299525e-01 8.12862098e-01 6.29628837e-01 - -8.28994989e-01 -5.60181022e-01 7.47293591e-01 6.10370278e-01 - -2.09015943e-02 1.17327385e-01 1.27766490e+00 -5.91571391e-01 - 5.47097385e-01 -2.02192649e-01 -2.17681199e-01 1.09877682e+00 - 8.25416327e-01 8.13509643e-01 1.30547881e+00 2.10038424e-02 - 6.81952953e-01 -3.10266763e-01 3.24166358e-01 -1.30143061e-01 - 9.69959646e-02 5.95157027e-01 -8.18220675e-01 2.09238720e+00 - -1.00601733e+00 -1.21418858e+00 1.15811086e+00 7.91662693e-01 - 6.24119818e-01 6.28345490e-01 -1.22467726e-02 -8.97254348e-01 - 7.58045614e-02 -6.77161694e-01 9.75119710e-01 -1.47057384e-01 - -8.25497210e-01 -3.21385831e-01 4.12931442e-01 -5.63724577e-01 - -8.22220385e-01 2.43687212e-01 2.44966567e-01 -5.06943166e-01 - -4.71038312e-01 2.32049942e-01 -1.44808435e+00 -1.40746379e+00 - -7.18444228e-01 -2.13447154e-01 3.10907573e-01 1.47535622e+00 - 8.57659638e-01 -1.59938529e-01 -1.90162081e-02 -1.00252938e+00 - -1.85131356e-02 -2.88658649e-01 3.22718561e-01 -8.27230930e-01 - 5.19346535e-01 1.53273892e+00 -1.08760148e-01 4.01711732e-01 - 6.90144002e-01 -4.01220471e-01 2.24092484e-01 1.25924004e-02 - 9.76760983e-02 -7.73009777e-01 2.45101750e-02 4.97998297e-01 - 1.45114362e+00 9.59270835e-01 2.15318251e+00 -7.67347574e-01 - 8.72320652e-01 1.83342010e-01 2.18980289e+00 -8.08298290e-01 - -8.39721859e-01 -5.99392653e-01 -2.12389565e+00 -5.25755048e-01 - -7.59132683e-01 1.50393784e-01 3.41755986e-01 1.87617087e+00 - 9.50423837e-01 -5.76903641e-01 -8.98414671e-01 4.91919160e-01 - -1.32023323e+00 1.83145881e+00 1.17944014e+00 -4.69175667e-01 - -1.71313453e+00 1.35387242e+00 -1.14539847e-01 1.23781633e+00 - -1.59442770e+00 -5.99375010e-01 5.24369953e-03 4.69805934e-02 - -4.50065464e-01 6.22849941e-01 -1.06762040e+00 -1.42379478e-01 - 1.20295629e-01 5.14438808e-01 7.11614907e-01 -1.12464213e+00 - -1.53411412e+00 1.27767682e+00 3.32314014e-01 -7.48486519e-01 - 1.55115199e+00 1.15674637e-01 1.17929721e+00 6.75184801e-02 - 2.06074786e+00 1.75534081e+00 -2.48964146e-01 9.71570969e-01 - 6.45375967e-01 1.36863160e+00 -9.64923441e-01 6.86051488e-01 - 1.05842447e+00 -1.75873947e+00 -1.18325853e+00 -2.03923225e+00 - -2.69406825e-01 7.17542231e-01 1.50235701e+00 7.40947798e-02 - 1.62861550e+00 -1.38010144e+00 -1.70338249e+00 -5.55476993e-02 - 3.84065449e-01 -3.26947495e-02 -2.06744218e+00 -8.91200379e-02 - -1.30446947e+00 6.69672549e-01 3.66598248e-01 -9.39879775e-01 - -5.13866901e-01 -1.05921352e+00 -6.26790971e-02 9.55142319e-01 - -9.85726058e-01 5.04046500e-01 -5.30257642e-01 -7.92872846e-01 - -1.07030362e-01 -1.03524232e+00 -5.53649306e-01 -1.19787788e+00 - 1.96472514e+00 3.52635533e-02 -6.99725509e-01 2.13979915e-01 - -1.12328053e-01 -2.20969602e-01 6.14166677e-01 7.57507682e-01 - -5.30501127e-01 -5.75818241e-01 -2.75051683e-01 -2.30192113e+00 - -1.51519108e+00 1.36687422e+00 1.64496768e+00 -2.49036044e-01 - 5.76556981e-01 3.11250150e-01 3.07888079e+00 1.11957490e+00 - -1.27917588e-01 -9.55540419e-01 -1.60644627e+00 2.03463629e-01 - -7.56350756e-01 -1.42225373e+00 -6.46572888e-01 -1.08154798e+00 - 1.68714166e+00 8.81639779e-01 -7.97264092e-03 1.47994411e+00 - 7.73683041e-02 
-8.61284196e-01 1.52312410e+00 5.38910031e-01 - -1.03724611e+00 -1.90338671e-01 -8.75618279e-01 -1.38279974e+00 - 9.26177561e-01 1.90941668e+00 -1.39856756e+00 5.62969208e-01 - -6.50642574e-01 -4.87125397e-01 -5.92393935e-01 -8.63990784e-01 - 4.85216267e-02 -8.30950141e-01 2.70456821e-01 -5.02381101e-02 - -2.38948047e-01 -9.07563686e-01 -5.76771319e-01 7.55391240e-01 - 5.00917196e-01 -9.77555215e-01 9.93323028e-02 7.51387119e-01 - -1.66940522e+00 5.43360174e-01 -6.62623763e-01 5.70598662e-01 - -7.63259172e-01 -1.80488205e+00 -1.62754250e+00 4.80849482e-02 - 2.59722501e-01 -9.04316604e-01 6.38592482e-01 -1.66152000e+00 - -6.60797954e-02 -1.21101618e+00 -6.51836097e-01 4.73986715e-02 - -8.60413373e-01 -3.84555548e-01 1.00629282e+00 -5.76891899e-01 - 8.35692108e-01 -1.12970686e+00 5.29804170e-01 1.44156861e+00 - -2.47164440e+00 -7.96895266e-01 5.77072144e-01 -2.03045383e-01 - 3.71145874e-01 -6.03985190e-01 8.65897909e-02 -1.55677229e-01 - 1.16778207e+00 2.54420847e-01 3.37602675e-01 -4.11876976e-01 - -4.87606227e-01 -4.32558179e-01 3.94452155e-01 -4.20984477e-01 - 2.89774865e-01 2.07540083e+00 8.71124685e-01 -3.26023519e-01 - 1.20121396e+00 -4.08075362e-01 -2.03812456e+00 -1.00808632e+00 - -1.87079191e+00 -3.51513475e-01 1.84183791e-02 1.67643726e+00 - 3.26927364e-01 -2.19100535e-01 8.29405606e-01 -2.21113539e+00 - 2.35614553e-01 7.70865202e-01 -1.47858620e+00 1.14375401e+00 - 3.38496417e-01 -4.15287912e-01 6.32781863e-01 2.27069283e+00 - 1.81866258e-01 2.48220593e-01 -4.59360898e-01 -8.49844396e-01 - 8.30335796e-01 -8.56083810e-01 7.15662390e-02 -4.77657437e-01 - 4.78979826e-01 3.33662093e-01 1.03753996e+00 -5.10016382e-01 - -2.69874930e-01 -9.78763700e-01 -4.44293261e-01 3.77300501e-01 - 7.56988645e-01 -9.22165334e-01 8.69605899e-01 1.35563791e+00 - 4.13434893e-01 1.87679577e+00 -7.73789227e-01 -1.24465466e+00 - -1.77872026e+00 1.49604428e+00 6.54365659e-01 -5.55846691e-02 - 2.79968619e-01 -1.12548900e+00 2.44575191e+00 1.29221186e-01 - 1.09394796e-01 7.25766599e-01 4.81009245e-01 2.23884031e-01 - -7.90474474e-01 4.71468359e-01 1.88202453e+00 1.34542000e+00 - 1.59318662e+00 -5.11215687e-01 -9.89604831e-01 -1.25786915e-01 - 5.57249114e-02 1.09419155e+00 -1.69246459e+00 1.52955031e+00 - -1.58007905e-01 -4.26881075e-01 -1.01210439e+00 -1.65485668e+00 - 8.23170602e-01 7.33179674e-02 -1.28996086e+00 -1.29507875e+00 - -3.35784703e-01 1.66902149e+00 -2.59591341e-01 -1.50314295e+00 - -2.45743066e-01 -2.72723556e-01 -2.69688654e+00 -5.42948656e-02 - -2.30934530e-01 6.96206391e-01 1.84895611e+00 1.12656498e+00 - -2.68888682e-01 -1.10652590e+00 2.57335973e+00 5.92184328e-02 - 1.39292916e-02 -2.41250880e-02 1.98084757e-01 -1.44360408e-01 - -5.73661983e-01 -5.46858966e-01 -3.27532701e-02 -5.43424785e-01 - -7.12845802e-01 1.06430225e-01 -2.54977226e-01 1.50399303e+00 - -2.65096974e+00 1.09150684e+00 1.24608517e+00 -2.07339025e+00 - -3.42687607e-01 -3.71440858e-01 -1.40751171e+00 -7.77816713e-01 - -1.11057580e+00 1.75227046e+00 9.35678422e-01 1.27155507e+00 - 7.21672058e-01 -1.12905180e+00 -5.24520278e-01 4.89374548e-01 - -1.22212780e+00 7.12998450e-01 -2.40325391e-01 -3.74820799e-01 - 7.10959971e-01 4.44263309e-01 -3.60966176e-01 1.15932977e+00 - -1.08106327e+00 6.15935624e-01 5.93101263e-01 -3.09546441e-01 - 3.26133013e-01 -1.25111353e+00 9.24027026e-01 -1.84902132e-01 - -5.22723019e-01 1.04900920e+00 -7.04343677e-01 -1.40846133e+00 - -1.55662918e+00 6.06009960e-01 -1.28042936e+00 1.75479424e+00 - -2.08192945e+00 1.69645631e+00 2.11017475e-01 -9.67131108e-02 - -5.44919074e-01 3.99136126e-01 
-3.76347043e-02 1.10330188e+00 - 1.14227645e-01 1.50301754e-01 -3.63612205e-01 -5.69456220e-02 - 3.07801783e-01 -1.71016836e+00 -1.34818542e+00 7.43264079e-01 - 1.70865431e-01 -1.83983341e-01 1.84339322e-02 3.47581714e-01 - -5.39759696e-01 -7.78304696e-01 1.95845261e-01 -9.78372753e-01 - 4.08252746e-01 -1.70258355e+00 1.02915561e+00 4.72597480e-01 - 2.56029725e-01 9.82690990e-01 1.66547441e+00 1.01437008e+00 - -1.84087420e+00 -1.27957702e+00 -6.24818563e-01 2.60910504e-02 - 5.17659009e-01 -7.25743830e-01 1.86766759e-01 -7.55382955e-01 - -6.11517787e-01 -1.40666115e+00 -9.23233271e-01 -1.35168457e+00 - -9.75873232e-01 1.05364180e+00 -9.49398875e-01 2.63238215e+00 - 4.93317902e-01 1.84836119e-01 -8.58357787e-01 7.00309873e-01 - -5.75637817e-01 1.22009814e-01 2.56008458e+00 -9.60598961e-02 - 1.14927328e+00 -7.03176439e-01 -3.49884890e-02 1.77080059e+00 - -6.26967072e-01 1.81244850e+00 7.07751930e-01 -5.62466800e-01 - 6.32407725e-01 9.72554445e-01 6.21809959e-01 -1.57022476e+00 - -7.27137148e-01 -2.47518629e-01 -7.44334310e-02 6.20672107e-01 - 1.77700996e-01 -1.33534431e+00 3.80197853e-01 6.10585749e-01 - 5.59790432e-01 1.08078074e+00 8.33922148e-01 4.59180087e-01 - -7.01657087e-02 -1.66096091e+00 4.29618210e-01 2.07687691e-01 - 2.71578848e-01 -1.27674854e+00 -1.08105659e+00 1.05315280e+00 - -3.95551547e-02 6.81500673e-01 2.83183753e-02 2.97561400e-02 - 9.38283801e-01 -5.16044736e-01 9.61207747e-02 -4.62275296e-01 - -4.34496224e-01 -3.09172124e-01 2.22133771e-01 -4.78748620e-01 - 1.25575614e+00 -8.94607306e-01 -1.86871648e-01 -4.39731061e-01 - 1.44697785e+00 1.96554780e-01 1.03184450e+00 -1.48556042e+00 - 2.67050266e-01 8.89630795e-01 8.22839886e-02 1.06548035e+00 - -5.17288446e-01 1.40934741e+00 2.29889822e+00 -3.62838566e-01 - -4.45502520e-01 1.45338452e+00 1.57957220e+00 -5.22860050e-01 - -4.20186818e-01 -2.81784594e-01 -1.34445047e+00 -9.18651938e-01 - -1.00414073e+00 -7.67797589e-01 -3.46848890e-02 2.34214738e-01 - 1.55050051e+00 -9.98354018e-01 9.84322369e-01 -2.13988841e-01 - -4.94637080e-02 6.74819469e-01 -1.12272203e+00 3.82409751e-01 - 1.66452214e-01 4.92451251e-01 2.89168656e-01 2.45530009e+00 - -6.37739956e-01 -5.30996978e-01 -6.23140514e-01 -5.55477142e-01 - -6.37387156e-01 1.18901658e+00 1.42050421e+00 -5.70746303e-01 - -8.32355559e-01 4.71415550e-01 -5.52223027e-01 6.32931828e-01 - 2.02923015e-01 -1.51574409e+00 1.54750526e+00 1.79587770e+00 - -6.12788677e-01 -3.87701571e-01 2.85865396e-01 3.34456801e-01 - 6.58544302e-01 2.01020455e+00 -1.76947221e-01 -7.98297226e-01 - -1.37931919e+00 -7.30930030e-01 -3.31269726e-02 1.79455781e+00 - -5.17611325e-01 2.23787948e-01 -1.64228957e-02 1.18839324e+00 - 2.52693248e+00 -5.30868769e-01 -4.89439428e-01 1.04416084e+00 - 6.81891501e-01 1.84670734e+00 5.83928168e-01 -3.59292090e-01 - 5.90654850e-01 1.10870361e+00 8.20482194e-01 5.07274032e-01 - 1.06667471e+00 1.16929555e+00 1.38215899e+00 6.48709893e-01 - -1.67118087e-01 1.46713689e-01 1.20650899e+00 -8.16935658e-01 - 3.68673295e-01 -3.93338799e-01 2.87448224e-02 1.27845192e+00 - 1.91099063e-01 4.64365482e-02 -1.35985613e+00 7.46253550e-01 - 6.45484209e-01 2.16325474e+00 -3.07778239e-01 2.19150335e-01 - 2.49383688e-01 1.57745326e+00 -9.52955335e-02 2.79021531e-01 - 6.07896507e-01 1.86609119e-01 -4.46433604e-01 1.94089994e-01 - 1.07363176e+00 -1.02651525e+00 1.32969677e-01 -7.00120807e-01 - 1.19504666e+00 -1.52318692e+00 -5.58921874e-01 3.77211869e-01 - 1.56552398e+00 -6.57502636e-02 -5.55199504e-01 1.88115704e+00 - -1.44801390e+00 -2.19880605e+00 4.40014452e-01 -5.02054214e-01 - 
-1.02123284e+00 7.08356440e-01 2.43800715e-01 -5.64078629e-01 - -1.28030443e+00 8.72457325e-01 6.50201201e-01 -9.91758630e-02 - 1.84663701e+00 -1.07008481e+00 -1.52552521e+00 -6.91908062e-01 - -4.55860160e-02 2.43339449e-01 -2.41236061e-01 3.52055401e-01 - -1.25153947e+00 1.44376457e+00 -8.21511820e-02 1.11729586e+00 - 3.42725337e-01 4.56753224e-01 5.69767296e-01 4.47708547e-01 - 6.42722785e-01 1.32915258e+00 1.96521163e-01 7.09003747e-01 - -8.97356942e-02 1.44011724e+00 -6.76392317e-01 1.80094039e+00 - -4.01579514e-02 -1.43077505e+00 1.28104419e-01 -6.81051672e-01 - 8.40643525e-01 -6.52623951e-01 -4.46183443e-01 -1.88954067e+00 - -4.52306330e-01 -2.42387938e+00 -1.58390284e+00 7.60414660e-01 - 7.85800159e-01 4.25457567e-01 -9.66976166e-01 -4.77113575e-02 - -3.60253919e-03 -1.15836465e+00 1.50339830e+00 8.77362311e-01 - -2.20964178e-01 2.68858392e-02 2.08382815e-01 -2.04173493e+00 - -2.47177377e-01 -6.81984246e-01 -1.00162005e+00 -2.81100303e-01 - 1.79768658e+00 6.40842855e-01 -5.71178973e-01 5.72582781e-01 - 1.39935541e+00 9.24633682e-01 5.96303716e-02 -6.46936774e-01 - 6.98223293e-01 3.93485397e-01 8.95193219e-01 6.35171831e-01 - 1.04955268e+00 -5.35235226e-01 1.31739402e+00 1.97599605e-01 - 2.07526088e+00 -6.89187825e-01 1.73596382e+00 1.97910786e-01 - -6.51418030e-01 -4.83885825e-01 -3.20347309e-01 4.24165934e-01 - 5.22835493e-01 -5.73700011e-01 -2.43545920e-02 2.14227033e+00 - 1.72754312e+00 4.36323673e-01 3.80034782e-02 1.20031327e-01 - 6.13518000e-01 -1.02279258e+00 -2.57376552e-01 -1.66858411e+00 - 3.99223119e-01 6.47195935e-01 -4.83186454e-01 1.57398677e+00 - -1.22576571e+00 -1.46437490e+00 2.24451825e-01 1.04709828e+00 - 1.68392766e+00 -4.58884269e-01 1.07868087e+00 -3.85084711e-02 - -1.72627300e-01 8.83659959e-01 6.52322888e-01 -1.57639217e+00 - 1.47654033e+00 1.38009131e+00 -6.25562727e-01 3.95803541e-01 - 4.94030178e-01 2.60673761e-01 -5.50305128e-01 -6.71623349e-01 - -2.55540702e-02 1.17272902e+00 5.43600142e-01 -3.70614320e-01 - 7.71698713e-01 -2.84854269e+00 1.14876568e+00 -1.73971379e+00 - -3.62440944e-01 -1.11966991e+00 -1.29468143e+00 1.16082680e+00 - -4.67701197e-01 3.46503884e-01 -4.69205789e-02 4.77040827e-01 - 7.68218935e-02 -1.28299224e+00 9.96266842e-01 -4.93756592e-01 - -1.55658185e+00 -4.28115159e-01 1.50075984e+00 8.50221753e-01 - -3.48652124e-01 -3.49257708e-01 -3.21635038e-01 2.07674789e+00 - 3.81935447e-01 4.30041641e-01 1.03028345e+00 2.38789156e-01 - -2.59042144e-01 -1.96349844e-01 -7.16012567e-02 -3.72222364e-02 - 7.27629542e-01 5.19458875e-02 7.32640088e-01 -8.07165802e-02 - 7.86351934e-02 -1.99820065e+00 9.16327655e-01 3.46488476e-01 - 9.98010099e-01 -2.89625549e+00 2.08837461e+00 -1.39589623e-01 - 1.10818279e+00 -1.03990591e+00 6.12773895e-01 -1.05341554e+00 - -6.23768985e-01 1.91403139e+00 -1.90682396e-01 2.17432871e-01 - 8.70067716e-01 4.95681882e-01 1.50418907e-01 3.64960998e-01 - 2.40341568e+00 -5.76187968e-02 2.01099053e-01 1.05065441e+00 - 1.10552597e+00 1.18703032e+00 6.38730228e-01 -1.14300489e+00 - 1.63343155e+00 -1.14634538e+00 3.02635461e-01 -7.54275858e-01 - -6.41383454e-02 3.28762412e-01 3.21357220e-01 4.21920747e-01 - 1.61371124e+00 4.53534305e-01 -2.44156629e-01 9.64087188e-01 - 1.18947053e+00 -1.22760785e+00 5.97400069e-01 7.01172769e-01 - -2.97563493e-01 1.37570679e+00 -1.50055587e-01 1.25576451e-01 - -1.73071831e-01 1.55790476e-02 -1.09627509e+00 -1.44005084e+00 - 1.59450507e+00 -8.46961319e-01 -9.91392374e-01 -2.15339017e+00 - -6.38961732e-01 -1.32308984e+00 1.64201522e+00 1.00981712e+00 - -6.88150346e-01 2.25243592e+00 9.81765509e-01 
-3.24831396e-01 - -2.49940562e+00 2.29094267e+00 -1.38957250e+00 -1.64539874e+00 - 1.02257049e+00 2.43975234e+00 1.38427281e+00 5.63909113e-01 - 5.94754338e-01 8.53415549e-01 7.58928597e-01 2.81191438e-01 - 1.04201101e-01 -6.25931248e-02 -7.53964603e-01 -2.80675083e-01 - -1.69295681e+00 -9.83396247e-02 -9.88591135e-01 -1.10358930e+00 - 1.79894149e-01 1.39200234e+00 9.18316603e-01 -1.57050061e+00 - -9.89628136e-01 9.40771163e-01 -9.82487381e-01 -2.24633157e-01 - 5.50052106e-01 -9.68344450e-01 1.05375506e-01 -1.33402550e+00 - -6.01367652e-01 3.19781929e-01 -1.59299374e+00 4.40474749e-01 - -1.96377989e-02 5.52489936e-01 2.23914132e-01 1.36414039e+00 - 1.25224501e-01 -4.29405540e-01 1.22297503e-01 5.43298006e-01 - 4.88600694e-02 4.05916907e-02 -7.01991677e-01 -6.62900925e-01 - -1.40260530e+00 1.74957669e+00 -1.24386322e+00 -6.92905188e-01 - -7.18407273e-01 8.94924402e-01 -2.94949681e-01 1.24774206e+00 - -6.73490644e-01 2.78994173e-01 -8.35347056e-01 2.14514923e+00 - -1.18759847e+00 3.09820712e-01 6.33776903e-01 4.13799107e-01 - -1.85287654e-01 -1.29820704e-01 4.38114703e-02 -1.47001997e-01 - 9.63879108e-01 2.21052289e+00 -5.57491779e-01 -1.36980295e+00 - -8.82820487e-02 2.57970929e+00 -8.03674579e-01 1.63911676e+00 - 1.67770076e+00 -5.53588271e-01 5.68983078e-01 1.62839663e+00 - -3.79127741e-01 -2.03580365e-01 -5.81680894e-01 -1.01475668e+00 - -6.49277568e-01 -1.22394025e+00 3.40834670e-02 -7.69973218e-01 - 2.33785912e-01 -1.55589569e+00 3.30880225e-01 8.33528936e-01 - -1.99373567e+00 3.74056578e-01 1.22766900e+00 -1.20964098e+00 - 1.67257237e+00 4.19019014e-01 -7.05011845e-01 -5.57690784e-02 - 5.58326900e-01 7.60053918e-02 5.38756013e-01 -9.20673609e-01 - 1.69360831e-01 -1.41371453e+00 -1.11226059e-01 -9.03907657e-01 - -7.35529959e-01 1.23609316e+00 1.09131014e+00 6.09138131e-01 - -1.09231281e+00 -3.16408455e-01 1.21309769e+00 1.41716912e-01 - 2.31932950e+00 3.93317848e-01 1.92049116e-01 -3.09116453e-01 - 1.33540899e-01 -1.52469844e-01 7.08108664e-01 9.56702292e-01 - -7.85989463e-01 -1.33123291e+00 -1.83620536e+00 5.07991314e-01 - -1.10336661e+00 -2.15289068e+00 3.88578594e-01 2.49299955e+00 - -6.07091142e-03 8.38490784e-01 8.18293616e-02 -9.88896564e-02 - 9.19076502e-01 -2.90274531e-01 2.67392308e-01 3.21697801e-01 - -6.68090463e-01 9.92042363e-01 -1.74959764e-01 -7.55745173e-01 - 5.36509871e-01 -8.98467958e-01 2.81811580e-02 -9.11899656e-03 - 1.08589554e+00 4.74698246e-01 -2.50269584e-02 8.17766309e-01 - 1.39020753e+00 5.57810307e-01 1.03526199e-02 -1.31183624e+00 - -1.06511366e+00 -3.05224717e-01 -6.09512210e-01 -1.86971307e-01 - 5.66499233e-02 5.29692769e-01 -7.04987794e-02 4.86501634e-01 - 6.44744113e-02 -1.97546661e+00 -9.39335406e-01 -1.44087553e-01 - -1.20969474e+00 5.99928737e-01 1.53075087e+00 1.21876180e+00 - -2.13442877e-01 1.49072611e+00 1.48667455e-01 -3.37085962e-01 - -6.13402665e-01 -3.02469701e-01 -3.88176829e-01 1.70416221e-01 - 1.60573974e-01 3.04602017e-03 4.36938167e-01 1.19064629e+00 - 9.49554145e-01 -1.48489797e+00 -2.55392122e+00 9.34319913e-01 - -1.36687875e+00 -2.24765405e-01 -1.17011297e+00 -1.80198050e+00 - 5.41462719e-01 7.59155154e-01 -5.76510429e-01 -2.59104228e+00 - -5.46244442e-01 3.91804010e-01 -1.47891152e+00 1.83359921e-01 - -1.53098488e-02 5.79291523e-01 1.19580366e-01 -9.73068953e-01 - 1.19657147e+00 -1.58529580e-01 -2.73045395e-02 -9.33267951e-01 - -4.43282247e-01 -8.84802699e-01 -1.72946066e-01 1.71170843e+00 - -1.37190115e+00 -1.61356139e+00 1.47117031e+00 -2.09323674e-01 - -6.69072747e-01 1.03990471e+00 -6.05615556e-01 1.82600975e+00 - 6.77925885e-01 
-4.87911403e-01 2.15730810e+00 -6.05714917e-01 - 7.42095351e-01 2.99292594e-01 1.30174124e+00 1.56151116e+00 - 3.20041478e-02 -7.53417850e-01 4.59972143e-01 -6.77715361e-01 - 2.01338720e+00 1.36535332e-01 -3.65321547e-01 1.84680313e-01 - -1.34712625e+00 -9.71614063e-01 1.20041394e+00 -6.56894267e-01 - -1.04691100e+00 5.36652744e-01 1.18570411e+00 7.18953311e-01 - 9.96047676e-01 -7.56795108e-01 -1.42181063e+00 1.50133359e+00 - -3.22679847e-01 -2.50833005e-01 1.32819414e+00 5.56230009e-01 - 4.55887765e-01 2.16500235e+00 -6.43518209e-01 9.27840114e-01 - 5.70131242e-02 2.68592268e-01 1.52846837e+00 5.07835746e-01 - 5.38296103e-01 1.07250738e+00 -3.64952743e-01 -8.39209676e-01 - -1.04480922e+00 -1.96635664e+00 2.05620718e+00 -1.10320842e+00 - -2.21253619e-01 -2.76813298e-01 3.07406694e-01 8.15737188e-01 - 8.60473514e-01 -5.83077431e-01 -1.67121708e-01 2.82579958e-01 - -2.48691127e-01 1.60734558e+00 4.90974963e-01 7.34877765e-01 - 6.62881255e-01 1.17347383e+00 1.81021556e-01 -1.29683197e+00 - 3.99687946e-01 -6.51356876e-01 -5.28616667e-01 5.86364031e-01 - 1.23828304e+00 2.12715771e-02 3.08833003e-01 1.70221496e+00 - 2.40753174e-01 2.60168314e+00 5.65509617e-01 -1.76076281e+00 - 7.53341615e-01 3.81158382e-01 1.28975272e+00 6.73181355e-01 - -1.38455987e-01 -1.22429824e+00 -2.09023252e-01 -8.50520432e-01 - -5.80523431e-01 5.88578403e-01 1.66990447e+00 3.94671530e-01 - -1.19588304e+00 4.44602668e-01 1.19663143e+00 -6.09782875e-01 - -1.34017169e-01 1.46881947e-02 -7.84898341e-01 6.48280442e-01 - -1.20948151e-01 4.19532448e-01 -8.87492180e-01 -4.37458307e-01 - 7.22381353e-01 -3.72833163e-01 1.72696388e+00 -3.99636179e-01 - 2.24684730e-01 9.32590842e-01 -1.41836572e+00 -1.76080883e+00 - -1.52565634e+00 1.26258409e+00 -5.51858127e-01 2.55819917e+00 - -5.64247608e-01 1.84551299e-01 1.54210997e+00 2.00609279e+00 - 2.06150365e+00 1.20836627e+00 1.02406251e+00 5.92526972e-01 - 7.78361082e-01 -5.51185727e-01 -8.18198860e-01 -3.37445736e-03 - -1.70184627e-01 -4.53228056e-01 6.96387470e-01 9.55305219e-01 - 8.84068832e-02 1.47753012e+00 -1.14168906e+00 -1.93659455e-01 - -7.16822326e-01 -1.86653662e+00 -8.26806873e-02 -1.21747509e-01 - 1.51344979e+00 6.30811691e-01 -1.02418685e+00 1.85409260e+00 - 1.22103369e+00 5.82097709e-01 -2.26484105e-01 -9.59439218e-01 - -3.72206777e-01 1.08874857e+00 1.88458633e+00 1.54324353e+00 - -4.88849431e-01 -1.11961746e+00 1.40886247e-01 -1.76843941e+00 - 3.23167711e-01 -1.47602588e-01 -4.66036499e-01 -1.59470308e+00 - 5.13600111e-01 -5.32700837e-01 -1.16991687e+00 -2.87226224e+00 - -2.75149047e-02 1.77225161e+00 1.66125917e+00 -4.57096249e-01 - -6.02212071e-01 4.68774259e-01 -9.98385429e-01 3.01791906e-01 - 7.66080260e-01 1.22693324e+00 -1.00154079e-01 -2.03673750e-01 - -8.77982616e-01 -8.26880336e-01 -2.26478890e-01 3.67365509e-01 - 9.13584650e-01 -8.03178966e-01 1.49268854e+00 -2.71123588e-01 - -2.13672947e-02 -7.47211695e-01 -2.42424035e+00 8.84045422e-01 - 7.36843884e-01 -2.81327546e-01 6.69907182e-02 5.15939236e-01 - -1.56254590e+00 -5.29052675e-01 7.94264674e-01 -1.25428939e+00 - 2.93557942e-01 -1.35658181e+00 4.66429979e-01 -3.56414840e-02 - -1.61513186e+00 1.16473937e+00 -7.34591603e-01 -8.10252428e-01 - 2.00569198e-01 1.14863729e+00 -1.01582181e+00 6.16798513e-02 - 4.28816497e-01 6.93105578e-01 1.76441550e-01 -3.67027849e-01 - -8.27590227e-01 8.61438811e-02 -1.07213902e+00 -2.92135048e+00 - 4.36559796e-01 9.03934836e-01 -2.36293244e+00 -1.00973070e+00 - 6.19154274e-01 2.05749536e+00 2.07937080e-02 -7.28002965e-01 - -1.82896435e-01 1.37487638e+00 -6.45964205e-01 
-7.99192011e-01 - -4.82743531e-01 -9.53328609e-01 1.22670315e-01 1.62467849e+00 - 3.23079288e-01 -2.52353489e-01 -2.91811258e-01 -1.56319082e+00 - 8.83109748e-01 -7.78371990e-02 -1.80479914e-01 3.19310760e+00 - 2.98752904e-01 -7.51791060e-01 -4.26357597e-01 1.14844573e+00 - 1.13270409e-01 -1.43827796e+00 9.19228911e-01 -6.68144107e-01 - 1.87329781e+00 1.08004808e+00 -4.47321892e-01 1.28101635e+00 - 6.78555071e-02 8.52773666e-01 4.84732807e-01 -8.46356630e-01 - -6.43549860e-01 1.02996087e+00 -3.34775329e-01 -4.03648466e-01 - -9.55122590e-01 4.23599064e-01 2.06252503e+00 -1.06753290e+00 - 2.42194571e-02 1.41222060e+00 -7.96413943e-02 4.52371806e-01 - -1.06239355e+00 4.28307086e-01 -1.87144265e-01 9.85729992e-01 - 1.18738604e+00 2.58956361e+00 5.79633176e-01 3.25796306e-01 - 1.94384307e-01 -3.53166282e-01 3.38483840e-01 -2.95401424e-01 - 1.68460980e-01 1.31759751e+00 -1.00654256e+00 1.13987851e+00 - 1.31711507e+00 -1.18068524e-01 -2.12185478e+00 -6.07822001e-01 - 1.29699457e+00 -2.28680614e-02 -9.99302208e-01 -5.04774928e-01 - 8.40620041e-01 5.46733558e-01 -2.38932103e-01 -3.66824418e-01 - -3.91758144e-01 -9.22410190e-01 1.61537564e+00 -3.22320461e-01 - 1.21715856e+00 1.52131605e+00 9.98310924e-01 -4.31620300e-01 - 4.03730094e-01 -2.41956059e-02 -9.03701842e-01 3.24359268e-01 - -1.17903984e+00 1.18767941e+00 -4.64617312e-01 2.01159656e-01 - 2.83287883e-01 -2.58904964e-01 5.86693823e-01 -4.74903643e-01 - 8.71297300e-01 -1.34597969e+00 1.26379579e-01 1.93892896e+00 - -1.00033128e+00 -6.77744985e-01 5.13907850e-01 1.79581776e-01 - 3.50630105e-01 4.89187121e-01 6.34721458e-01 1.10969985e+00 - 4.09818649e-01 -2.41257653e-01 6.72573686e-01 1.89988196e+00 - -1.32633746e-01 -9.74529326e-01 1.10708070e+00 -1.20381162e-01 - -2.17266965e+00 8.47421706e-01 -5.35328209e-01 -9.05332789e-02 - 3.31980318e-01 1.90499678e-01 7.09451795e-01 -4.35486376e-01 - 5.13105810e-01 -2.59546667e-01 7.38810480e-01 6.15367472e-01 - -9.35438693e-01 1.08598208e+00 -5.35963416e-01 8.08057785e-01 - 3.67287308e-01 1.83818364e+00 -2.23465979e-01 -3.49316806e-01 - -1.94196068e-02 -3.03179771e-01 7.99941897e-01 -1.61631060e+00 - -1.05368245e+00 -1.06780291e+00 9.50307608e-01 1.71061337e+00 - -1.04449220e-01 -1.68821722e-01 7.00521618e-02 1.16187835e+00 - -9.27353144e-01 2.38368988e-01 9.75197613e-01 5.01094162e-01 - 1.89581618e-01 1.00104606e+00 -2.70323229e+00 6.77875340e-01 - -6.54075682e-01 -1.83063293e+00 5.11202574e-01 1.37365854e+00 - -1.37448519e-01 9.52874541e-01 1.61227822e+00 1.31491446e+00 - 1.63996458e+00 7.42127478e-01 7.54336417e-02 -1.60196579e+00 - -2.46062487e-01 -8.43246579e-01 2.17094278e+00 -1.75885811e-01 - 1.23204805e-01 5.51485360e-01 4.36024480e-02 1.69505107e+00 - -6.22649372e-01 1.94607452e-01 -7.42470562e-01 -1.32002246e+00 - -6.11769080e-01 -3.70367989e-02 -4.29302216e-01 -6.92420959e-01 - -1.40631747e+00 -8.31055716e-02 -1.50472033e+00 7.60055959e-01 - 8.24397504e-02 -1.45755148e+00 -3.09209079e-01 -7.52156377e-01 - 3.19174498e-01 1.34045041e+00 -1.87517250e+00 1.15026079e-01 - -1.60132796e-01 6.71340048e-01 2.13196635e-01 -7.51969337e-01 - -3.19053948e-01 -7.96025872e-01 1.07600713e+00 2.13116482e-02 - 1.90119064e+00 -6.06608130e-02 -7.08406746e-01 -1.51371443e+00 - -1.80313969e+00 -1.58413589e+00 2.67126650e-01 5.08725047e-01 - -1.58119071e+00 8.95038307e-01 -4.83061045e-01 1.46793008e-01 - 1.61222064e+00 8.96839321e-01 -2.68530637e-01 -8.91192198e-01 - -2.15181541e+00 -7.19153345e-01 -2.11130232e-01 -9.87179518e-01 - -1.31256968e-01 7.68518820e-02 -2.24855989e-01 -6.50002599e-01 - 1.68654665e-01 
4.41940635e-01 -1.09039903e+00 1.41093242e+00 - -9.85881314e-02 1.88496225e-02 7.08214402e-01 2.33216137e-01 - 9.53136623e-01 2.87124157e-01 -6.12437367e-01 3.61503512e-01 - -1.14372623e+00 1.08559705e-01 -3.32299918e-02 -2.08116859e-01 - -1.28537506e-01 -1.88184905e+00 -5.48724890e-01 9.28449035e-02 - 1.59856394e-01 -1.02767515e+00 1.26570785e+00 -8.66174877e-01 - 9.69457209e-01 4.27194357e-01 -6.46227300e-01 1.77531087e+00 - -1.19363678e+00 9.19154167e-01 1.00058234e+00 -6.70620203e-01 - 1.39246535e+00 -2.50046521e-01 2.88693637e-01 2.60321826e-01 - -1.34308740e-01 8.10808301e-01 7.93488741e-01 -1.74853194e+00 - 1.30434024e+00 -1.66249204e+00 1.03254616e+00 1.12670517e+00 - -1.09096646e+00 -4.10814285e-01 -1.10570467e+00 -2.14921027e-01 - -3.08034271e-01 7.79660523e-01 1.31030869e+00 1.39568377e+00 - -5.62168002e-01 -2.09221870e-01 -1.68343818e+00 -8.05870056e-01 - 9.64851618e-01 1.61558282e+00 -1.23434889e+00 -5.92464268e-01 - -2.64057573e-02 2.80161172e-01 -8.09603512e-01 4.24061030e-01 - -4.73839343e-01 -1.44522712e-02 5.46284258e-01 6.42226962e-03 - -4.36385900e-01 -1.09609805e-01 -8.83629620e-02 -3.70011032e-01 - -2.58796066e-01 1.59864712e+00 5.60919464e-01 -2.95480311e-01 - 6.96954429e-01 -3.33819479e-01 1.17312467e+00 3.69642198e-01 - -1.07302144e-01 4.47716922e-01 -1.57087612e+00 -1.12676752e+00 - -1.19392502e+00 1.42943099e-01 1.73251498e+00 2.23130012e+00 - 6.38051093e-01 5.00844479e-01 -1.80105770e+00 -5.42673826e-01 - -7.87783742e-01 -6.20847583e-01 -1.68143824e-01 -4.72090572e-01 - -1.97929978e+00 7.47910261e-01 -1.07274306e+00 2.39246860e-01 - 2.07408261e+00 -9.19384599e-01 -2.53028750e+00 -2.86004215e-01 - 1.10095966e+00 1.95834744e+00 -1.22954965e+00 4.96699214e-01 - -4.65310454e-01 -1.05948351e-01 2.64434338e+00 -1.50297034e+00 - 2.53739655e-01 4.67693180e-01 1.08578634e+00 9.79831517e-02 - 3.08050543e-01 -3.91648561e-01 2.69126952e-01 -3.43192279e-01 - 6.21571898e-01 -3.69610339e-01 3.77100408e-01 -2.92628128e-02 - 1.12605023e+00 -5.13937250e-02 -1.77303243e+00 1.26192153e+00 - -9.05732274e-01 -6.53766334e-01 -5.95661283e-01 1.37443805e+00 - -2.13567424e+00 3.13774848e+00 1.05605686e+00 2.23238915e-01 - -5.48940003e-02 2.85554081e-01 5.21122456e-01 6.45215571e-01 - 5.55604458e-01 8.95806849e-02 -1.97338417e-01 -1.51273280e-01 - -1.94908276e-01 1.13377011e+00 5.93556702e-01 -2.94038868e+00 - 6.55900776e-01 1.94736198e-01 -1.87087562e-02 -3.88518453e-01 - 1.12411273e+00 9.47526455e-01 -7.72877693e-01 4.07052279e-01 - -9.71656740e-01 -1.37961817e+00 -6.26717210e-01 8.62393498e-01 - 9.53125060e-01 5.13085425e-01 7.25095809e-01 5.16178310e-01 - -6.41481578e-01 4.31922555e-01 8.00409496e-01 7.54291356e-01 - 1.18891335e+00 7.08303869e-01 3.51448208e-01 1.07015026e+00 - -2.65212599e-02 -8.81874681e-01 -1.63066968e-01 -7.44902670e-01 - -6.75178289e-01 -1.44518673e-01 -7.92419910e-01 -3.07961524e-01 - -1.89361465e+00 2.13293701e-01 1.20547542e-03 -8.17088604e-01 - 6.59245670e-01 9.37570155e-01 -1.60755992e+00 -7.62724757e-01 - -7.69142330e-01 -9.39903080e-01 8.29474866e-01 -1.93826139e-01 - -2.64514834e-01 -2.00386238e+00 6.35418057e-01 -1.23925841e+00 - 5.99326305e-02 2.77376950e-01 1.36065865e+00 -1.30882037e+00 - -3.01951218e+00 1.83849543e-01 1.80051124e+00 1.23894632e+00 - 2.09659413e-01 -4.91636187e-01 8.07122588e-01 -9.73546207e-01 - 4.76357758e-01 5.05470157e-01 1.06020987e+00 2.75966001e+00 - 3.92415971e-01 -5.08963645e-01 -2.55743116e-02 -1.76907587e+00 - -6.94713116e-01 -4.09282446e-01 -5.24088383e-01 1.52354866e-01 - -8.22419941e-01 1.12103069e+00 2.07064921e-04 
-9.30032134e-03 - -3.27894717e-01 1.55190676e-01 8.25098276e-01 -8.67130220e-01 - -6.58116460e-01 -3.03726017e-01 -1.34587097e+00 -8.19257855e-01 - -4.76220876e-01 8.74389172e-01 2.62561321e-01 1.93589911e-01 - 8.50897968e-01 -1.37371793e-01 3.90464842e-01 -1.03221752e-01 - 2.65362352e-01 -5.82758665e-01 -2.43881726e+00 -1.34279162e-01 - 1.42274809e+00 9.26215410e-01 9.65397477e-01 1.23613071e+00 - 8.86575878e-02 1.97315708e-01 -6.17651880e-01 -3.16072851e-01 - 6.15771174e-01 1.20388448e+00 -1.39446273e-01 -4.50189263e-01 - 5.27958851e-04 6.01206720e-01 -1.44385481e+00 -2.29618096e+00 - -5.50536871e-01 -1.22071242e+00 -5.08140206e-01 -1.47780314e-01 - -4.53248411e-01 1.45246756e+00 3.26744765e-01 3.00474346e-01 - 6.22207165e-01 -1.13883317e+00 1.03911173e+00 -7.57642835e-02 - 6.70480609e-01 -1.07185721e+00 -1.55375874e+00 8.17889631e-01 - 3.76409531e-01 -9.02051568e-01 -8.69663060e-01 1.12543476e+00 - -1.18941188e+00 1.64267290e+00 -9.00620759e-01 6.38391912e-01 - -3.28618467e-01 6.03187442e-01 -5.44113994e-01 -1.62792906e-01 - 4.09191772e-02 -1.00218749e+00 7.40824342e-01 -5.13213575e-01 - -2.28599921e-01 -9.94349360e-01 -2.56233358e+00 -1.91027761e-01 - 2.41261554e+00 7.84604311e-01 -1.92603804e-02 -2.62890816e-01 - 2.24658921e-02 5.47119141e-01 -1.18081284e+00 1.11432171e+00 - 7.15381324e-01 7.18185723e-01 4.38475728e-01 1.96166579e-02 - 6.72861218e-01 5.91814160e-01 -3.54040712e-01 -5.73601782e-01 - 1.01856299e-01 1.54902017e+00 -1.23910713e+00 -1.46752524e+00 - 1.64760843e-01 5.08877002e-02 1.73341662e-01 2.43953198e-01 - -2.23202258e-01 1.48986340e+00 -1.60090399e+00 -8.28497052e-01 - -1.03254758e-01 -1.64318883e+00 -1.75854400e-01 1.66147017e+00 - 2.08864883e-02 2.30700910e-01 -1.26016474e+00 -6.16361380e-01 - -3.75196368e-01 -3.17715108e-01 1.28164375e+00 5.57691038e-01 - -1.11145794e+00 2.46504784e-01 4.98221755e-01 1.14014900e+00 - 1.58054066e+00 -1.01509416e+00 -8.10857534e-01 -1.25757790e+00 - -2.34019861e-01 4.66358364e-01 9.87334490e-01 -7.59510249e-02 - -3.19847524e-01 1.51757985e-01 -8.35142910e-01 2.08953881e+00 - -1.60766101e+00 1.84740573e-01 2.02360630e+00 6.79984456e-03 - -1.90039039e-01 -3.57445419e-01 -1.80391684e-01 1.37284851e+00 - -2.21186185e+00 1.53343368e+00 -1.42395711e+00 -2.66652316e-01 - -4.29244429e-01 5.88553250e-01 -1.59812438e+00 4.62172657e-01 - 2.02430964e+00 -1.36317396e+00 1.89706162e-01 -6.61982179e-01 - 4.25887197e-01 1.91477779e-02 -6.41486883e-01 4.87872273e-01 - 1.80434811e+00 -1.90903798e-01 7.19757915e-01 -1.29327297e+00 - -9.56436396e-01 4.72406268e-01 1.48411608e+00 3.55613351e-01 - -3.13058019e-01 -7.08594569e-04 -1.25040770e+00 6.04515314e-01 - 8.82333040e-01 -4.52090323e-01 -4.70042139e-01 2.65878230e-01 - -4.36719745e-01 -6.61326125e-02 2.09972167e+00 -2.47025698e-01 - -3.58340144e-01 -6.47541821e-01 7.44191945e-01 -1.81224167e-01 - -6.49373055e-01 1.32130408e+00 1.41960299e+00 -6.00423574e-01 - -1.86653996e+00 1.00751364e+00 -6.84629798e-01 7.90625572e-01 - -1.97010386e+00 8.92597318e-01 -1.21117198e+00 7.30764091e-01 - 1.42730679e-02 -9.53939438e-01 -4.07036304e-01 6.86318457e-01 - 1.05894454e-01 5.84412992e-01 1.97644055e+00 -1.56424165e+00 - 1.61721253e+00 1.04355939e-01 -8.98784041e-01 -1.33031368e+00 - -1.89120397e-01 9.21650112e-01 -1.27549127e-01 1.51115489e+00 - -1.45117569e+00 -1.20889479e-02 -1.25239348e+00 3.63631874e-01 - 8.86887372e-01 -4.20761794e-01 -2.60421395e+00 1.98948145e-01 - 4.36738551e-01 4.04294759e-01 1.23578215e+00 -1.07105362e+00 - 6.80216134e-01 1.19250751e+00 -1.77858758e+00 3.19651812e-01 - -5.04179835e-01 
-8.15226734e-02 3.47676486e-01 -4.87228572e-01 - -6.75708055e-01 3.41524184e-02 -1.08724594e+00 -1.08582509e+00 - 6.79373264e-01 -1.14879405e+00 6.66312516e-01 4.62591290e-01 - -1.72580671e+00 -6.77564681e-01 1.19410944e+00 -9.81165528e-01 - -4.64404255e-01 4.62060899e-01 7.83390522e-01 -2.51538873e-01 - -5.97510219e-01 1.42236984e+00 1.73889971e+00 9.78858054e-01 - 8.53181407e-02 -8.08265746e-01 -8.30444396e-01 5.22514105e-01 - 4.18398231e-01 1.40159857e+00 6.50450110e-01 -1.50308025e+00 - 1.05194759e+00 -9.98060644e-01 -3.83971274e-01 2.50200212e-01 - 1.99566746e+00 3.10991859e+00 6.06723070e-01 -1.83196619e-01 - 5.34505665e-01 8.87655437e-01 -3.20667624e-01 1.79521132e+00 - 2.30070755e-01 4.97743100e-01 6.65924251e-01 4.21586603e-01 - 8.38940501e-01 -6.17253184e-01 -5.58301747e-01 -1.10015428e+00 - 4.39501226e-01 7.78936505e-01 4.57772881e-01 1.67449248e+00 - -5.59601886e-03 6.68741643e-01 -1.09170091e+00 -3.87099713e-01 - 6.95537746e-01 8.49102139e-01 -2.93966949e-01 -7.15992525e-02 - -1.51787376e+00 -3.57029200e-01 8.90383184e-01 5.75205028e-01 - 5.00665724e-01 4.97736856e-02 7.06315879e-03 -6.60321116e-01 - 6.98828399e-01 4.20972943e-01 4.92018849e-01 -5.26034832e-01 - -2.15334296e+00 1.09715271e+00 -4.78837460e-01 -8.62775743e-01 - 6.93478882e-01 -3.92012626e-01 1.05993640e+00 6.17005944e-01 - 6.83569312e-01 -1.36595571e+00 1.21194398e+00 2.61250526e-01 - -3.69277149e-01 1.43388465e-01 -1.77623522e+00 4.08652812e-01 - -1.02937150e+00 -1.35267031e+00 -1.52235913e+00 1.11268842e+00 - -6.29263401e-01 1.53372777e+00 -5.35801470e-01 -1.70735800e+00 - -1.11652446e+00 1.23581159e+00 -1.55898184e-01 -5.48287272e-01 - 1.60018221e-01 5.01782715e-01 1.11739886e+00 1.44849920e+00 - -3.59768659e-01 -1.32604766e+00 -4.13465172e-01 2.60280520e-01 - -9.63758945e-01 -9.57150936e-01 3.43787879e-01 -4.86523509e-02 - 3.27969939e-02 -7.58495331e-01 -2.30400711e-01 -9.24233139e-01 - 8.90198410e-01 1.03524935e+00 -1.84618783e+00 -9.29510832e-01 - -1.49652922e+00 -6.50023937e-01 -8.34379718e-02 -1.44964516e+00 - -9.21859801e-01 -1.00395739e+00 2.07267329e-01 6.93443343e-02 - -7.21737564e-01 1.76820874e-01 -5.46680093e-01 -2.71655113e-01 - 1.67345214e+00 1.34046102e+00 -1.29958129e+00 8.29732418e-01 - 8.11396539e-01 -1.14826334e+00 8.18777621e-01 1.53793192e+00 - -1.12254477e+00 -9.17502820e-01 1.01766145e+00 2.71495312e-01 - 5.51476300e-01 3.40589225e-01 3.90695930e-01 -1.32647216e+00 - 1.04731774e+00 1.16959000e+00 -2.29390740e-01 -4.34769504e-02 - -1.53110754e+00 5.14254749e-01 5.72057426e-01 -6.21910170e-02 - 1.12435341e+00 -3.34077239e-01 5.64605951e-01 -1.01966429e+00 - -2.35521514e-02 -1.74354255e-01 2.25307927e-01 -3.69526923e-01 - -1.31472602e-01 8.26047182e-01 -4.36764240e-01 -1.60657716e+00 - 1.74958396e+00 1.38145411e+00 -1.29226267e+00 6.89707816e-01 - -5.02974749e-01 2.63487220e-01 2.94224143e-01 -2.34407797e-01 - -7.83765674e-01 -6.90541029e-01 -9.16191936e-01 -8.31822217e-01 - -6.71782866e-02 -7.15759993e-01 6.82052076e-01 1.48724616e+00 - -5.80053270e-01 2.39404500e-01 4.99685109e-01 4.72002268e-01 - 7.59552643e-02 7.42682397e-01 4.82248962e-01 -1.23766243e+00 - 8.69156063e-01 8.87290835e-01 -7.63286114e-01 3.79378349e-02 - 6.83328986e-01 -2.09313825e-01 1.07297802e+00 2.36387253e+00 - -7.85986006e-01 -1.38104463e+00 3.03818583e-01 7.21562505e-01 - -2.30866373e-01 1.45326066e+00 -1.33860576e+00 6.92918599e-01 - -6.05860829e-01 1.71937788e+00 1.99251533e+00 -7.66656578e-01 - -5.49557149e-01 8.59587908e-01 -3.87130648e-01 -4.54112180e-02 - 2.53884196e-02 -1.91967320e+00 -1.38377463e-02 
-6.89727783e-01 - -4.93122816e-01 1.44363201e+00 -1.25650692e+00 8.13205183e-01 - -2.78962940e-01 -2.79760182e-01 7.90371895e-01 3.40051293e-01 - 5.70613027e-01 9.68184650e-01 -3.31308395e-01 -6.12236738e-01 - -1.08515048e+00 -8.25410545e-01 2.94909453e+00 1.24467957e+00 - -1.35107434e+00 -1.32245791e+00 4.81866062e-01 5.47264814e-01 - 5.48883855e-01 -2.54955798e-01 -1.25454262e-01 3.27879518e-01 - 8.58930051e-02 -2.21930027e+00 -2.29799628e-01 -8.51406455e-01 - 1.75211415e-01 2.98525906e+00 3.67481679e-01 -3.13529700e-01 - 9.21801507e-01 4.82687891e-01 4.20094490e-01 6.06850564e-01 - 2.05654359e+00 -1.13088846e+00 4.73632067e-01 -9.26381111e-01 - 5.55512667e-01 -9.18686509e-01 -4.17366803e-01 -2.95089900e-01 - 9.75816727e-01 9.18454468e-01 -1.24571741e+00 5.49341105e-02 - -7.09579825e-01 -1.25850487e+00 -2.17017055e-01 -3.08482796e-01 - 2.42671657e+00 4.32959825e-01 -1.37761748e+00 -5.64773977e-01 - 1.02138329e+00 2.08844280e+00 -1.58598328e+00 -1.87992442e+00 - 1.87096512e+00 3.89614165e-01 -8.68292749e-01 5.34629107e-01 - -2.63574767e+00 3.37550649e-03 3.27821165e-01 9.24270153e-01 - -1.01389599e+00 8.56871605e-02 -9.25424635e-01 2.55383760e-01 - -8.95346344e-01 -4.08100843e-01 -9.95815396e-01 6.51136220e-01 - 8.58451128e-01 -2.34620571e-01 3.82379964e-02 -1.44859147e+00 - -2.99676657e-01 -5.02046160e-02 2.62079310e+00 -1.11196697e+00 - 1.46417725e+00 9.96858776e-01 2.80636400e-01 1.75862050e+00 - 3.72105479e-01 3.89159203e-01 -5.31206541e-02 1.22356892e+00 - 1.09646928e+00 -6.25677288e-01 -1.31924725e+00 -2.02980638e-01 - -8.00038576e-01 -6.35245740e-02 1.23743808e+00 -4.57301646e-01 - -4.28233817e-02 5.80227338e-02 8.48430872e-01 -2.24688935e+00 - -6.06700301e-01 2.11283699e-01 1.20007896e+00 -4.91902441e-01 - -1.87655270e+00 6.19711459e-01 -6.35362387e-01 -1.18966663e+00 - -6.24345362e-01 -1.84525356e-01 -6.04744673e-01 -2.20556617e+00 - 8.97066176e-01 1.27487504e+00 6.51521981e-01 -1.13768566e+00 - -2.01491535e-01 -6.52131531e-03 5.98794401e-01 6.68340504e-01 - -7.34173775e-01 8.19955543e-02 4.57280397e-01 1.45582235e+00 - 7.04506516e-01 7.89056659e-01 8.38266313e-02 1.41045916e+00 - 4.09762800e-01 -8.61087918e-01 1.40337002e+00 6.97903275e-01 - -4.43044007e-01 -4.79492962e-01 2.96575546e-01 4.63184804e-01 - 1.97292268e-01 3.11308682e-01 1.69995737e+00 1.07154310e+00 - 1.90635592e-01 9.41310585e-01 -1.03252387e+00 3.97927046e-01 - 1.80930638e+00 -2.18045890e-01 -8.47633958e-01 -6.52088881e-01 - -1.08963299e+00 -7.84761906e-01 -3.70508105e-01 -1.40556705e+00 - 1.56183336e-02 9.02277172e-01 -9.07756209e-01 1.51928389e+00 - 5.10792077e-01 1.03058064e+00 -6.57834530e-01 8.54525208e-01 - -1.09193385e+00 8.90247524e-01 1.71839342e-01 5.53251207e-01 - -1.17559457e+00 -8.95069838e-01 5.97228229e-01 -9.48347986e-01 - 4.63206291e-01 -1.36713779e+00 8.48174214e-01 -1.23252332e+00 - 5.52299917e-01 6.25630915e-01 -6.96771801e-01 5.82026541e-01 - 2.60442257e-01 -5.38841844e-01 -1.00855470e+00 -1.96262574e+00 - 3.49650055e-01 -1.56480277e+00 9.53442007e-02 -2.63448209e-01 - 6.79142773e-01 -3.02045494e-01 -3.29447776e-01 7.31870711e-01 - 3.35058033e-01 3.16155970e-01 4.69267577e-01 -1.53557217e+00 - 7.56788671e-01 6.12468779e-01 -1.01668310e+00 -2.44080469e-01 - -3.93072888e-02 -1.34496778e-01 3.33859980e-01 1.43136680e+00 - 1.08176672e+00 -1.31221914e+00 6.22070014e-01 1.32893336e+00 - 3.86808515e-01 1.09098041e+00 2.01227045e+00 1.02370954e+00 - 2.49308601e-01 1.04508853e+00 1.44887835e-01 2.39030439e-02 - -3.51414233e-01 1.56341338e+00 -8.18428695e-01 1.53236771e+00 - 4.99864131e-01 
-1.39930618e+00 3.67649287e-01 -2.10002661e+00 - 6.25508010e-01 8.85231495e-01 -5.92355967e-01 1.23547696e-01 - 1.95415747e+00 -5.05746722e-01 -1.05890763e+00 1.48166335e+00 - 1.96258724e+00 3.69567191e-03 1.01146305e+00 1.34147596e+00 - -7.42484212e-01 -4.85305846e-01 1.23087454e+00 1.68501425e+00 - 5.62880874e-01 -8.79814327e-01 1.98706114e+00 -5.30970871e-01 - -3.35137784e-01 3.42337847e-01 1.55416048e+00 8.53975952e-01 - 4.14865494e-01 4.63288516e-01 4.35146689e-02 5.58139503e-01 - -2.52956009e+00 -2.96607465e-01 2.41321936e-01 -1.15101409e+00 - 3.86323303e-01 -2.04471216e-01 1.75547564e+00 1.57301974e+00 - -4.65805560e-01 -1.50837705e-01 -7.39475638e-02 -4.51659471e-01 - 1.95024982e-01 -7.58282065e-01 -1.13061535e+00 6.22711241e-01 - 6.29529595e-01 -8.04315746e-01 8.95355403e-01 -6.31577730e-01 - 2.53165245e-01 8.20344031e-01 -3.36125642e-02 4.54740912e-01 - -5.16429663e-01 -1.96466580e-01 -2.05859438e-01 -7.46578336e-01 - -1.76774159e-01 -1.54861832e+00 -4.91153747e-01 -2.84875482e-01 - -2.56179750e-01 -2.41497442e-01 -6.17641993e-02 4.79441524e-01 - 8.74517083e-01 -6.49765253e-01 -1.20320082e+00 -1.04204440e+00 - -4.87202913e-01 -3.51921320e-01 -7.69996047e-01 -1.29611671e+00 - -4.56120819e-01 1.81426615e-01 5.95029473e-01 -5.57423055e-01 - -4.12997812e-01 -9.26698327e-01 -3.14391702e-02 -8.48428607e-01 - 5.73127806e-01 -1.78586650e+00 -3.59629661e-01 3.01107347e-01 - 1.83834523e-01 2.69303370e+00 3.49800020e-01 -1.00405455e+00 - -9.54642445e-02 -1.77624631e+00 -8.05997476e-02 -8.33056033e-01 - 9.15390253e-01 -5.49540281e-01 -1.16765916e-01 -6.35557830e-01 - 1.73885131e+00 -3.21242779e-01 1.83255708e+00 8.14151764e-01 - 4.82066721e-01 3.68733138e-01 3.93797308e-01 -1.92767262e+00 - -2.78883398e-01 8.45157981e-01 -4.88971844e-02 -1.40518987e+00 - -5.24044521e-02 1.70410550e+00 1.24689174e+00 -6.20040111e-02 - 9.51449275e-01 -3.66507024e-01 -1.70387161e+00 -9.24466014e-01 - 1.56288874e+00 -2.73993075e-01 -2.43383974e-01 -2.99838185e-01 - 1.90413654e+00 1.62595892e+00 2.21878028e+00 -1.59045205e-01 - 2.95624495e-01 -1.51664257e+00 1.47693443e+00 -1.16777956e+00 - 2.16775566e-01 -1.09730256e+00 -5.88866830e-01 -8.37262452e-01 - -6.07875288e-01 -5.39122701e-01 -5.48268795e-01 8.33333910e-01 - -1.10486281e+00 2.20541432e-01 1.21795917e+00 -5.13196349e-01 - -7.17918873e-01 -2.30524629e-01 1.17972517e+00 1.94107637e-01 - -5.31214297e-01 4.83876646e-01 -1.10229218e+00 6.81359529e-01 - 4.08607304e-01 -3.07808459e-01 -8.38585794e-01 -8.86680901e-01 - 5.34817338e-01 1.22898054e+00 -6.37307763e-01 4.58386689e-01 - -2.08702707e+00 -5.84617734e-01 -3.10589336e-02 -9.09683406e-01 - -9.36506391e-01 -6.67779565e-01 2.92192727e-01 -1.87329024e-01 - -2.23823118e+00 -2.12070012e+00 -6.06865168e-01 4.57686573e-01 - -2.74750495e+00 -4.99730170e-01 -5.26247859e-01 1.38833773e+00 - -3.85021806e-01 3.82988989e-01 1.41257137e-01 -2.13056660e+00 - 7.68207192e-01 2.15396509e-01 5.08268654e-01 3.92623782e+00 - -2.08411288e+00 1.72469664e+00 -2.87447512e-01 2.87328899e-01 - -4.55122441e-02 -4.24236327e-01 -5.69832921e-01 3.29508722e-01 - -1.51717365e+00 7.50579178e-01 -4.16194409e-01 -1.13006938e+00 - -4.50012863e-01 1.25714922e+00 -5.35335064e-01 3.58453631e-01 - -7.30956256e-01 7.57921875e-01 6.85507715e-01 1.84860885e+00 - -1.75563946e-01 6.68654799e-01 9.80675370e-02 1.29587173e+00 - -7.18832374e-01 7.45005310e-01 -1.94697291e-01 -6.41881749e-02 - 9.17524844e-02 2.52240390e-01 -1.16229042e-01 2.14713186e-01 - 1.57811809e+00 9.85449672e-01 8.69154572e-01 -4.55539525e-01 - -8.89240801e-01 9.55300570e-01 
8.76046777e-01 1.47267067e+00 - -6.03980601e-01 -2.29548648e-01 -1.63948417e+00 -3.92471105e-01 - 9.96570528e-01 -4.66495365e-01 6.40479803e-01 -1.90503106e-01 - 3.24057043e-01 -1.14769137e+00 8.52094516e-02 -2.99113607e+00 - -1.86896160e-01 -1.63047326e+00 1.20599449e+00 7.77407348e-01 - 4.66670990e-01 1.52100623e+00 -9.48916495e-01 1.74736321e+00 - 9.32191551e-01 -2.36555293e-01 1.13562012e+00 -1.10629356e+00 - -8.24514151e-01 -6.08589709e-01 -5.28977692e-01 -1.05655766e+00 - 1.22308302e+00 -2.58854389e-01 3.52504969e-01 -5.70351362e-01 - -1.82060325e+00 2.70057350e-01 -1.91225493e+00 -6.86340481e-02 - -1.36831498e+00 1.98727596e+00 9.11362708e-01 1.05753794e-01 - 1.26370668e+00 -8.46315980e-01 5.43479383e-01 1.99810430e-01 - 2.64020085e-01 1.27235353e+00 7.32492089e-01 2.88723677e-01 - -1.65488744e+00 -9.60046291e-01 -1.22708932e-01 9.33723748e-02 - -1.13020372e+00 2.41167665e+00 1.51639402e+00 6.02118313e-01 - 7.20368624e-02 -2.12208971e-01 -9.51918483e-01 7.74805173e-02 - 2.57752538e-01 -1.24176061e+00 3.34176421e-01 -1.55259043e-01 - -1.90780759e+00 -8.60385001e-01 -4.13605541e-01 1.88768768e+00 - 5.56553125e-01 -1.33548152e+00 4.86036301e-01 -1.54730403e+00 - 1.08269107e+00 -4.71124649e-01 -9.36361924e-02 1.32579660e+00 - -1.28716362e+00 -1.39711821e+00 -5.83599329e-01 1.03837883e+00 - -1.51934612e+00 -2.83215570e+00 -4.51158851e-01 5.51740825e-01 - 1.20026171e+00 -4.63161349e-01 -4.11426604e-01 1.15390074e+00 - -1.86974168e+00 -3.88520777e-01 1.90423891e-01 4.49218720e-01 - -5.09806693e-01 3.44410129e-02 -2.48832726e+00 -6.58248425e-01 - 4.53508705e-01 -9.82294023e-01 5.89842200e-02 4.46872503e-01 - -3.42549205e-01 1.70463771e-01 -9.62732553e-01 -2.06611276e-01 - 6.10216141e-01 1.56972960e-01 -5.86531281e-01 2.24216402e-01 - 7.14609802e-01 -2.04994130e+00 1.15907407e+00 -3.36254746e-01 - 4.25489932e-01 1.19724691e+00 -1.37167370e+00 -7.09441006e-01 - -2.88548708e-01 -7.83753514e-01 1.73493659e+00 -8.56772065e-01 - -5.55546761e-01 2.04389036e-01 -1.20214951e+00 -3.95680726e-01 - 3.17453265e-01 -3.32860410e-01 -9.33868587e-02 -5.29332101e-01 - -1.51447034e+00 3.21593225e-01 1.75493312e+00 1.84016302e-02 - 2.25264117e-01 6.92722738e-01 -1.26933050e+00 1.70251465e+00 - 2.02328801e-01 1.63185692e+00 -7.33033001e-01 1.81806195e+00 - 7.75155485e-01 5.53040087e-01 2.34024450e-01 -2.48528615e-01 - 1.20053160e+00 1.40359864e-01 -1.96706975e+00 -1.11718643e+00 - -1.85845748e-01 3.09998989e-01 -5.65884896e-02 1.21897078e+00 - -1.95110190e+00 1.43587932e-01 -1.81797922e+00 7.59712279e-01 - -9.44587141e-02 4.19609964e-01 -8.63882959e-01 1.27946496e+00 - 1.04172468e+00 5.83544314e-01 -1.29517242e-01 5.79857290e-01 - -7.06892550e-01 8.55555654e-01 1.64948094e+00 1.07061052e+00 - -7.29602456e-01 3.61417323e-01 -1.29314148e+00 5.72390318e-01 - 4.50603426e-01 -1.86981630e+00 -1.16178381e+00 -2.83138901e-01 - -3.00859749e-01 -1.20947695e+00 3.88978720e-01 2.51473606e-01 - -1.94269136e-01 -7.55791485e-01 1.04870713e+00 1.65540707e+00 - -4.83561486e-01 -6.12166941e-01 4.91207659e-01 -3.58029366e-01 - -1.39397204e-01 7.39653170e-01 -1.90935612e+00 1.31830227e+00 - 7.26805255e-02 -4.11493152e-01 -8.92341509e-02 -3.75706442e-02 - -1.73120129e+00 1.49493825e+00 4.13256325e-02 4.43002135e-01 - 9.51549768e-01 -1.02116251e+00 4.73471910e-01 -2.67641097e-01 - 8.46770763e-01 -2.12722731e+00 -9.90943015e-02 -6.02820635e-01 - 4.32262957e-01 4.70044196e-01 -7.07625866e-01 -7.12173879e-01 - -1.10665366e-01 -8.96642208e-01 8.41983676e-01 -3.69207203e-01 - -2.90698814e+00 -3.74822050e-01 -1.03854382e+00 
- [diff hunk condensed: several hundred `- `-prefixed lines containing only whitespace-separated floating-point values in scientific notation (raw numeric reference data); no prose, code, or configuration appears in this span]
-1.92277181e+00 5.77072203e-01 -1.78023294e-01 - 1.55155346e-01 -3.98283035e-01 1.17588632e-01 1.51407391e-01 - -5.78951776e-01 -8.96367550e-01 -4.48695689e-01 2.34259263e-01 - 5.98996341e-01 8.53348970e-01 -4.94190037e-01 2.83844560e-01 - 9.62502599e-01 1.16559494e+00 -5.88328540e-02 -2.20931724e-01 - 1.91124231e-01 -1.14970744e+00 -1.93609506e-01 7.44586408e-01 - 6.41947150e-01 -2.70267606e-01 2.17718199e-01 -8.24650347e-01 - 7.03492165e-01 -7.78214335e-02 -2.75951445e-01 -3.61723840e-01 - -6.28938675e-02 -4.47877645e-01 6.58143044e-01 -3.95283550e-01 - 1.59296405e+00 1.46521056e+00 -8.91543508e-01 6.97042719e-02 - -6.50656641e-01 -3.26647550e-01 2.00242758e+00 -3.07571411e-01 - 3.60226035e-01 5.41321278e-01 -1.00062943e+00 -8.55095446e-01 - -1.20602238e+00 -8.85970712e-01 -2.85003018e+00 5.44582427e-01 - 2.05419374e+00 1.47820741e-01 -1.41626883e+00 -6.42776370e-01 - -4.58876848e-01 -1.01758480e+00 8.60182643e-01 8.10260057e-01 - -1.15545177e+00 7.25081027e-01 1.52681267e+00 1.90255821e+00 - 2.86521196e+00 -9.01992679e-01 2.82422042e+00 -3.54206383e-01 - -5.92465460e-01 3.27072382e-01 -5.82061350e-01 3.99044126e-01 - -1.32007718e+00 -2.38752604e-01 1.28558218e+00 -5.70612922e-02 - 1.41595781e-01 4.75258619e-01 1.17066169e+00 -1.65522468e+00 - 5.84524035e-01 -2.68139154e-01 1.66637897e-01 1.05995011e+00 - -1.19418669e+00 -1.01264441e+00 -1.93542993e+00 -5.49158931e-01 - 1.33043230e+00 -1.20587185e-01 -1.34012485e+00 -4.85678732e-01 - -1.48786175e+00 -1.12518609e+00 3.88818920e-01 -1.17387331e+00 - 1.11263430e+00 -7.11202919e-02 8.55947062e-02 -2.78372973e-01 - 7.72847831e-01 7.83159912e-01 3.35054338e-01 5.64789116e-01 - -2.12089673e-01 5.42203009e-01 -3.37877780e-01 2.23162211e-03 - -2.74643749e-01 -4.57218528e-01 -6.89111769e-01 4.67690110e-01 - 1.31060588e+00 1.17110193e+00 -1.25656533e+00 1.28920305e+00 - -3.18158507e-01 6.00733042e-01 -9.06215310e-01 1.41979918e-01 - 3.13723177e-01 -1.48460484e+00 6.08267188e-01 1.34593499e+00 - -1.74647942e-01 1.36016810e+00 -1.89693496e-01 8.49584520e-01 - -2.21856999e+00 2.41877770e+00 1.49545825e+00 -2.84625500e-01 - -1.97208905e+00 2.88028359e-01 -1.44392550e+00 4.44853544e-01 - 4.11565721e-01 -9.41403151e-01 1.12096703e+00 8.05574715e-01 - 9.75823343e-01 8.85770377e-03 -1.41552413e+00 9.77872252e-01 - -2.70371407e-01 -1.38978016e+00 1.35203934e+00 2.09940743e+00 - -4.12481278e-02 2.15912104e+00 9.13147256e-02 -3.80455911e-01 - -6.30458474e-01 1.63708246e+00 -7.80143857e-01 7.35269308e-01 - -5.59163809e-01 -4.60306764e-01 1.03024971e+00 -3.80036861e-01 - -4.04795378e-01 1.13280475e+00 1.59739733e+00 6.44689798e-01 - -6.54769778e-01 -5.10608494e-01 7.99152628e-02 3.20930064e-01 - -9.87865925e-01 7.23037958e-01 8.55133310e-02 -1.71570837e+00 - -9.28926468e-01 -5.04128695e-01 -7.28576854e-02 1.88927877e+00 - 2.38171309e-01 -8.40771914e-01 2.20630810e-01 -3.52087736e-01 - 3.24758232e-01 5.80567658e-01 1.20923066e+00 -2.42864955e-02 - 1.63896048e+00 9.08022642e-01 -7.07439542e-01 2.20985556e+00 - 3.26994509e-01 -8.35124493e-01 1.65291309e+00 2.07777762e+00 - -3.30252647e-02 -5.03650248e-01 -1.72374964e-01 7.14731753e-01 - 1.27785671e+00 5.70487320e-01 1.01723209e-01 1.49801183e+00 - -3.12836468e-01 1.03091931e+00 -2.88062811e-01 4.31022972e-01 - -1.13562562e-01 -5.58521822e-02 -3.77386510e-01 1.34485400e+00 - -7.32926011e-01 -7.40232229e-01 2.12099361e+00 9.77847576e-01 - -2.38511372e+00 -5.65258741e-01 -1.81198931e+00 -9.50690687e-01 - -3.13851476e+00 -1.36989579e-01 -1.14194179e+00 5.99052571e-02 - -1.62512124e+00 8.29118565e-02 -4.14916277e-01 
1.50195301e+00 - 1.41135681e+00 7.85345614e-01 -3.26613784e-01 1.98538041e+00 - -1.55303526e+00 -1.40355861e+00 -8.37830976e-02 2.25403810e+00 - -2.02338904e-01 -1.83245242e-01 -1.44191039e+00 9.45929587e-01 - 9.53176260e-01 7.89075196e-01 -6.29220665e-01 -7.06706762e-01 - -1.29312229e+00 8.38725045e-02 -1.08040738e+00 -1.17741656e+00 - -4.23690706e-01 -4.89156187e-01 -5.37978530e-01 4.66444612e-01 - 7.90477991e-02 -2.88156033e-01 1.83012933e-01 -9.91527021e-01 - 1.05278626e-01 1.50602490e-01 -1.50304687e+00 9.95693743e-01 - -8.81433725e-01 9.03097987e-02 1.16441178e+00 9.25676763e-01 - 3.43333893e-02 -3.62498999e-01 9.47511911e-01 6.74694955e-01 - -6.18365049e-01 -7.94893622e-01 -2.07235527e+00 1.54031843e-01 - -8.26977417e-02 1.13619137e+00 3.59055102e-01 1.33422923e+00 - -6.97364211e-01 1.13815033e+00 -5.28916359e-01 7.77596056e-01 - -1.45875835e+00 2.35135302e-01 -1.98468792e+00 -2.67137498e-01 - 4.40353990e-01 -5.20968556e-01 -1.05043757e+00 8.46118450e-01 - -1.03462793e-01 -4.89570647e-01 4.18959223e-02 2.63666916e+00 - 1.15350282e+00 1.16257370e+00 4.35546309e-01 1.66485280e-01 - -2.39608240e+00 -6.78980350e-01 -4.80574399e-01 -1.87489212e+00 - -1.15741575e+00 -1.98083651e+00 -3.83238345e-02 8.18765998e-01 - 1.10677457e+00 4.14784461e-01 -9.63396370e-01 -1.93206513e+00 - -1.75686872e+00 -1.26898253e+00 -8.88859689e-01 -1.59748495e-01 - -1.01416421e+00 3.38975906e-01 -2.32273504e-01 1.05457120e-02 - 1.14025056e+00 -1.18023169e+00 4.18704569e-01 -1.39732286e-02 - -3.33486289e-01 8.85256350e-01 8.79014134e-01 -9.46490824e-01 - -1.82160586e-01 2.81211674e-01 -2.73006737e-01 5.85021317e-01 - 2.81192005e-01 1.83256125e+00 -1.53027916e+00 -1.22435987e-01 - -9.08502519e-01 6.04107440e-01 -1.61717188e+00 3.46312702e-01 - -6.30048454e-01 -7.34161854e-01 -1.44649816e+00 -4.65075940e-01 - 1.08450055e+00 -2.51886826e-02 6.27680421e-01 1.66464186e+00 - 2.63755590e-01 -5.58799565e-01 4.67252225e-01 -5.44296145e-01 - -5.04422665e-01 -1.51989281e+00 9.26257491e-01 -3.86002570e-01 - 1.82098240e-01 -1.03596199e+00 4.35947627e-01 -7.61941254e-01 - -1.41708717e-01 8.77826035e-01 7.90759325e-01 -2.20300723e-02 - -1.76813102e+00 2.99281049e+00 1.74585903e+00 7.22674847e-01 - -7.53203809e-01 -1.13592163e-01 -5.43420434e-01 1.26385593e+00 - 2.62502879e-01 -7.84163833e-01 -4.70755100e-01 1.12118816e+00 - 1.60009789e+00 1.52833152e+00 7.57032037e-01 -2.52624869e-01 - -3.24151373e+00 2.97160029e-01 -1.82875886e-03 1.39982089e-01 - 5.38973473e-02 -1.32626212e+00 5.57408571e-01 6.32216334e-01 - 6.82338774e-01 9.46577013e-01 6.04051650e-01 -4.10029411e-01 - -7.08368495e-02 9.11248699e-02 1.07699446e-01 9.98217642e-01 - 6.43668771e-01 2.07734346e+00 2.20646906e+00 7.96495825e-02 - -3.83665562e+00 1.23683250e+00 -5.40376365e-01 -8.01483929e-01 - -3.98445278e-01 -4.46643054e-01 6.12269819e-01 -1.55943573e+00 - -1.07772076e+00 3.43158543e-01 6.22944057e-01 6.41541660e-01 - -7.87036300e-01 -9.78265643e-01 -5.06634831e-01 2.39872754e-01 - 1.48547733e+00 -2.04743719e+00 -2.20728755e-01 -1.33267844e+00 - -1.65224266e+00 9.66956675e-01 5.44624567e-01 4.49433506e-01 - 9.21025813e-01 -4.79718894e-01 -4.32637513e-01 2.18573070e+00 - -2.46571049e-01 7.35132098e-01 6.38154102e-03 -1.84825850e+00 - -2.04462910e+00 -5.78984201e-01 1.05610120e+00 7.88291633e-01 - 9.14077163e-01 -9.87046182e-01 1.61587250e+00 -9.64893281e-01 - 2.12824404e-01 -1.03522902e-02 -1.73873827e-01 1.33859217e+00 - -7.05110550e-01 -1.34960520e+00 1.99954864e-02 -3.99826735e-01 - 3.37738299e+00 6.76507473e-01 -4.09340560e-01 -1.05611157e+00 - 
-1.88028836e+00 -1.02521908e+00 1.14941768e-01 1.42732871e+00 - 1.12168920e+00 2.29094833e-01 -3.23167652e-01 2.68812358e-01 - -5.18638015e-01 1.71836400e+00 -1.89190179e-01 -8.63378569e-02 - -2.21582246e+00 -1.95434853e-01 8.36484134e-01 -1.06772256e+00 - -7.50720322e-01 7.92711020e-01 -1.47665727e+00 1.05780101e+00 - -5.32143891e-01 -3.88568223e-01 -1.12387836e+00 3.78046274e-01 - 2.45987201e+00 7.81497240e-01 -8.64285290e-01 1.13562059e+00 - -3.16679537e-01 1.37448502e+00 8.06676388e-01 -2.94314146e+00 - -1.00407958e+00 9.44385231e-01 1.51657268e-01 -1.02375591e+00 - -6.33589506e-01 -1.90623665e+00 -8.83732885e-02 2.21068978e+00 - 8.04933369e-01 2.19041705e+00 8.82397115e-01 1.02747929e+00 - 1.97407866e+00 3.77263248e-01 -1.87227324e-01 1.03265750e+00 - 1.45384026e+00 -4.79964644e-01 -7.21569061e-01 1.38856038e-01 - 8.31498861e-01 7.98091054e-01 -1.42416763e+00 1.91160607e+00 - -6.41164958e-01 -2.65064985e-01 -9.02654350e-01 2.69832873e+00 - 2.42070675e-01 -8.50975096e-01 8.63923550e-01 7.69135773e-01 - -1.87727019e-01 8.40606689e-01 -8.25291932e-01 -4.92918283e-01 - -2.01021671e+00 -1.32395113e+00 1.80763078e+00 1.64142859e+00 - 6.57886490e-02 1.66293144e+00 -1.60074487e-01 -2.43410304e-01 - 5.65106496e-02 -1.24921598e-01 -2.15196326e-01 1.14339817e+00 - 1.50135651e-01 7.12481737e-01 1.03138053e+00 3.88185799e-01 - -9.70271349e-01 -1.35286593e+00 1.48780629e-01 -1.71256304e-01 - 8.73914286e-02 -1.60540398e-02 2.58357358e+00 -1.33022630e+00 - -1.98994625e+00 3.32376838e-01 -3.76422554e-01 3.16067874e-01 - 5.83883047e-01 1.28410470e+00 -1.50722587e+00 -5.56717873e-01 - 4.28761274e-01 7.70463586e-01 2.19344950e+00 9.55703974e-01 - -5.67885973e-02 -8.90435755e-01 -5.64289868e-01 -5.92702806e-01 - 6.55233622e-01 2.56997436e-01 -1.32364953e+00 -4.35901463e-01 - 8.77547562e-01 3.51662368e-01 5.68644643e-01 3.82921696e-01 - 1.62643075e+00 2.36043066e-01 -3.86022806e-01 -9.15755033e-01 - 2.49609455e-01 -2.44070411e+00 -1.84289008e-01 -2.05839857e-01 - -1.36447084e+00 -9.28265810e-01 8.09657514e-01 2.30711743e-01 - 1.01788335e-01 3.06347519e-01 9.26275849e-02 1.24286258e+00 - 1.23429179e-01 -1.45821309e+00 1.63012993e+00 3.70340258e-01 - -5.89253724e-01 -1.25340176e+00 3.95914227e-01 -1.17172003e-01 - 1.57414675e+00 -3.45907271e-01 -8.37380767e-01 8.95796239e-01 - 9.25281882e-01 1.68651372e-01 6.76357031e-01 -3.76871191e-02 - 2.33865097e-01 1.16499090e+00 6.25318348e-01 -1.69274434e-01 - 1.35236251e+00 -2.36485529e+00 2.05207324e+00 -1.70595908e+00 - -6.27092838e-01 -4.09175344e-02 -1.46480665e-01 1.00641108e+00 - -1.05783665e+00 2.60766804e-01 -5.78212082e-01 2.95140982e-01 - -6.47722632e-02 9.29356515e-02 -5.49924850e-01 -5.10063946e-01 - 2.87272513e-01 9.43715513e-01 -5.98279357e-01 -8.68767560e-01 - 6.13460183e-01 -3.25038917e-02 8.30498159e-01 -9.05797243e-01 - -1.77943897e+00 -5.33004105e-01 -7.66352594e-01 -7.87883997e-02 - -6.36845231e-01 7.41016924e-01 -2.88150251e-01 1.50474504e-01 - 1.57634556e-01 1.94955096e-01 -2.41291113e-02 5.10600686e-01 - -7.95655131e-01 1.39941788e+00 -2.36537382e-01 -1.31154919e+00 - 1.88073352e-01 -8.51180494e-01 6.65334523e-01 2.20960569e+00 - -7.86572099e-01 3.32372338e-01 -1.75550556e+00 4.53905225e-01 - -8.85713816e-01 -6.24834597e-01 -1.10128653e+00 7.32151210e-01 - 1.00730431e+00 2.33852156e-02 -8.64426732e-01 1.58952308e+00 - -7.50410140e-01 -2.16203237e+00 4.58389014e-01 -1.10477471e+00 - 9.88608778e-01 -3.81137699e-01 6.03713214e-01 1.40150034e+00 - -1.20885885e+00 -5.65937102e-01 -7.56710693e-02 1.53755128e+00 - -6.58382714e-01 -5.94503701e-01 
4.78140563e-01 -5.77228725e-01 - 1.52525887e-01 -1.55153739e+00 2.21432996e+00 -1.34770918e+00 - -3.21752697e-01 -3.24810371e-02 1.91268814e+00 -1.32484579e+00 - 1.98370409e+00 7.57176816e-01 4.17095333e-01 -9.84633684e-01 - 5.71141243e-01 -7.23486602e-01 1.26666129e+00 -9.71062005e-01 - -1.13158494e-01 -5.55273116e-01 -9.87522960e-01 7.41319835e-01 - 5.35458744e-01 1.20901024e+00 -1.77987516e+00 1.06417072e+00 - 2.50544882e+00 -4.51641083e-01 1.89514863e+00 -2.63161272e-01 - -1.37240970e+00 -3.05632442e-01 1.04820395e+00 -8.38176489e-01 - -6.00338638e-01 5.93539417e-01 3.06011667e-03 5.84622979e-01 - 2.37829536e-01 4.79625314e-01 -4.32301193e-01 2.31730938e-01 - -9.27372992e-01 1.28843367e+00 -6.34274721e-01 1.12884748e+00 - -2.33606964e-01 2.01999009e-01 -4.94964540e-01 -1.01599538e+00 - -1.74476039e+00 -1.05532420e+00 -3.23908865e-01 2.11879179e-01 - 1.01037931e+00 -4.45757449e-01 -8.81960750e-01 -1.44271481e+00 - 4.80063617e-01 1.38319457e+00 1.26633024e+00 7.09932745e-01 - 6.38543308e-01 1.69762373e+00 3.32081467e-01 6.46129668e-01 - -3.20783854e-01 -1.36204338e+00 -6.83848917e-01 -9.15903877e-03 - 6.07853718e-02 1.92628586e+00 8.12771693e-02 1.17884350e+00 - -4.35077250e-01 -3.11675876e-01 -1.32900763e+00 -7.42327988e-01 - 5.26981413e-01 7.56018050e-03 9.42938626e-01 1.81253955e-01 - -3.52453738e-01 2.14829683e+00 -9.39035714e-02 1.16300094e+00 - -4.01209205e-01 7.09599495e-01 9.28015649e-01 2.08491966e-01 - -2.35448289e+00 1.82524603e-02 1.23046005e+00 5.27480841e-01 - 4.45838869e-01 -1.72225690e+00 -7.20455050e-01 4.11357641e-01 - 4.10383284e-01 1.21466672e+00 -1.44866750e-01 4.05824095e-01 - 1.63742416e-02 -9.86942708e-01 -2.25579143e+00 4.31009680e-01 - 1.73947215e-01 -6.18175030e-01 -1.61917305e+00 6.29739523e-01 - -2.65920073e-01 5.56874752e-01 1.86404848e+00 -2.14143085e+00 - -6.99600816e-01 -2.09375620e-01 1.45482612e+00 -1.63640296e+00 - 1.93586099e+00 1.35855898e-01 -9.60006639e-02 -1.98059767e-01 - -4.36263025e-01 1.04811680e+00 -1.06852078e+00 3.45414191e-01 - -3.84181678e-01 -7.62133420e-01 2.58597517e+00 4.19050723e-01 - 6.13265574e-01 1.13777673e+00 -1.42135000e+00 1.75229192e+00 - -2.50895715e+00 -7.68766403e-01 -4.59031671e-01 5.81723392e-01 - 2.18718261e-01 -1.66814744e-01 -1.19973449e-02 -9.78023052e-01 - -1.49723244e+00 7.26511598e-01 6.93706095e-01 1.58449435e+00 - -7.90017068e-01 6.92815185e-01 -2.83706725e-01 1.19047499e+00 - 1.02196276e+00 2.11209401e-01 -3.39530170e-01 9.60932195e-01 - 6.46904528e-01 -6.84440374e-01 -1.32779872e+00 -2.88532913e-01 - 1.51541376e+00 1.14648712e+00 -1.11232746e+00 -1.16775739e+00 - 1.27468383e+00 -1.13563585e+00 5.09295344e-01 -6.16071224e-01 - 1.61704493e+00 -7.70424783e-01 1.14336157e+00 1.84858650e-01 - 2.52194852e-01 -1.97030854e+00 -2.20291328e+00 -2.20565945e-01 - 2.46906057e-01 -1.12519777e+00 -1.08869338e+00 5.61472654e-01 - 5.62854469e-01 -1.37741059e-01 7.80201733e-01 -1.17420232e+00 - -3.55934918e-01 1.64787757e+00 6.83221295e-02 1.26312509e-01 - 2.30594659e+00 -1.39934218e+00 7.96430349e-01 1.66398633e+00 - -3.24857920e-01 8.59630764e-01 -9.82019484e-01 -6.86732411e-01 - 3.84966403e-01 7.98566699e-01 1.12093651e+00 1.56962860e+00 - -6.74465597e-02 1.19361174e+00 -3.42304856e-01 9.02091324e-01 - 2.07488805e-01 4.45677906e-01 4.23507869e-01 -1.60470498e+00 - -1.12412190e+00 1.02541439e-01 -1.68727911e+00 8.31211090e-01 - 8.71555984e-01 3.27170551e-01 -1.12113404e+00 1.38013899e+00 - 2.03721070e+00 1.47076830e-01 5.10535359e-01 2.01738656e-01 - 5.30369163e-01 -1.19558759e-01 -3.45642380e-02 9.05717611e-01 - 
2.34562421e+00 -1.40007126e+00 -1.03812486e-01 -1.18046045e+00 - 1.59202492e+00 -5.87737978e-01 -1.44320083e+00 6.38186991e-01 - 1.74431109e+00 6.63597882e-01 2.04797879e-01 4.09140915e-01 - 1.41484070e+00 -8.74199271e-01 -1.11847103e-01 -4.52690154e-01 - -1.16786528e+00 -3.28375250e-01 1.10772109e+00 5.66602170e-01 - 6.44310474e-01 1.46475613e-01 5.23323774e-01 -8.00590336e-01 - -3.09509873e-01 1.92004234e-01 2.77515858e-01 3.20187867e-01 - -1.38335288e-01 -3.77423406e-01 3.15777135e+00 9.06787276e-01 - 4.13147479e-01 -2.33208492e-01 -4.51218814e-01 -1.30800590e-01 - 2.92828858e-01 -1.88043559e+00 -1.56938958e+00 -6.36882365e-01 - 2.07139432e-01 -2.39273146e-01 1.47812128e+00 -1.48239684e+00 - -7.51345456e-01 1.52009177e+00 1.31119326e-01 8.84842575e-01 - 2.05124688e+00 -6.93804443e-01 -9.93143395e-02 -2.72426903e-01 - -1.95459306e+00 -3.10396791e-01 8.81062210e-01 5.40589571e-01 - 6.19083166e-01 -3.75508994e-01 -6.56097233e-01 1.26925564e+00 - -4.97317135e-01 2.94082046e-01 1.63334683e-01 1.46171701e+00 - 9.71412361e-01 -1.29525864e+00 -1.18755317e+00 5.78909814e-01 - -6.43376485e-02 -4.55171794e-01 1.01562786e+00 1.87459379e-01 - 5.24573885e-02 7.79828668e-01 -3.75108987e-01 1.56851900e+00 - -7.88047612e-01 6.56929076e-01 1.43383288e+00 -1.37729549e+00 - -3.43335181e-01 1.14220738e+00 1.15724456e+00 -1.71495602e-01 - -7.34671116e-01 9.55935836e-01 1.18354905e+00 -1.16466331e+00 - -1.20391369e+00 2.25256681e-01 6.29360616e-01 -2.99069077e-01 - 6.79293811e-01 -7.10136592e-01 -2.11469364e+00 -4.23901081e-01 - 6.32321462e-02 -2.75839955e-01 -1.15860331e+00 -1.81564903e+00 - -5.14088809e-01 8.97655785e-01 1.25619745e+00 1.18771338e+00 - -8.15909564e-01 -1.15370631e+00 4.96634603e-01 6.35100007e-01 - 4.00169611e-01 -2.01406336e+00 -8.66884589e-01 -2.96887904e-02 - 4.87218201e-01 7.67307341e-01 1.44253522e-01 3.30783814e-01 - 8.48654583e-02 6.85619354e-01 3.12865645e-01 6.57376572e-02 - -2.68368810e-01 8.12914222e-02 4.84591365e-01 2.67475396e-01 - 3.80826414e-01 1.34272182e+00 2.65163928e-01 -6.48998499e-01 - 8.83606791e-01 -1.36619830e+00 -1.08141339e+00 3.35847259e-01 - 5.16396642e-01 -1.24505126e+00 -3.82817477e-01 -6.06083810e-01 - -4.86676335e-01 9.43597794e-01 7.06505477e-01 -1.91221796e-02 - -5.45961916e-01 -1.34964967e+00 1.42018497e+00 5.92113912e-01 - -7.06669092e-01 7.19599485e-01 -7.02236220e-02 -2.87109494e-01 - 4.35211539e-01 -7.41498232e-01 -3.43756527e-01 1.21461064e-01 - -4.99451190e-01 6.69085205e-01 1.86906648e+00 -7.54298151e-01 - -1.66512525e+00 -2.00174022e+00 -1.50468141e-01 -6.63164616e-01 - 2.07860500e-01 1.32597670e-01 -1.60321689e+00 -1.81488383e+00 - 2.95933247e-01 1.35428831e-01 -4.25817996e-01 -1.65714359e+00 - 4.71391261e-01 1.27189958e+00 2.77982444e-01 -8.47473919e-01 - 4.45521660e-02 -1.36611414e+00 1.05796075e+00 8.18443373e-02 - -1.76815689e+00 -3.00674349e-01 4.63301510e-01 -3.33047628e-01 - -2.75895715e-01 -1.22440219e+00 5.66541016e-01 -6.48469210e-01 - -2.15644526e+00 -3.71257365e-01 9.81149733e-01 -3.86929452e-01 - 1.73798501e+00 4.28967953e-01 3.38758171e-01 -1.28398269e-01 - -6.82862103e-02 -2.63423622e-01 -3.02269399e-01 5.30867159e-01 - -9.35831249e-01 -1.07589388e+00 -1.12319505e+00 2.04565454e+00 - 5.30342877e-01 3.04492205e-01 -1.05751467e+00 6.23342276e-01 - -1.16571344e-01 2.14168936e-01 -9.76620317e-01 1.60268858e-01 - -4.23833549e-01 -7.65611172e-01 7.34106362e-01 -1.53770781e+00 - 5.64776242e-01 6.84486151e-01 -8.79643679e-01 -8.20328474e-01 - -2.23025680e-01 2.82036215e-01 -1.52854994e-01 4.33562458e-01 - 1.76297963e+00 -5.07716775e-01 
-1.71517992e+00 8.25923562e-01 - 1.68331456e+00 3.92768353e-01 -1.08774865e+00 -1.23892367e-01 - -1.11852348e+00 6.42855465e-01 -1.35129035e+00 -3.00128788e-01 - -1.00433779e+00 1.26824692e-01 -3.66341680e-01 1.76762104e-01 - 2.23505553e-02 -9.86659050e-01 6.26877069e-01 -4.87656519e-02 - -9.88529027e-01 2.24092650e+00 4.76430178e-01 -1.53260559e-01 - 1.41520452e+00 1.84181559e+00 -1.45349428e-01 3.47397476e-01 - -9.88002837e-01 -1.47192746e-01 -1.99062556e-01 1.84827030e-01 - 3.32760483e-01 5.59828460e-01 2.37642661e-01 -1.21074259e+00 - -9.38467085e-01 -5.46407640e-01 -2.82240599e-01 1.05077434e+00 - -1.16293895e+00 -3.57415050e-01 9.53009486e-01 -2.99877465e-01 - 4.60415661e-01 -1.39542043e-01 2.56783247e-01 -2.28472546e-01 - 6.67065501e-01 -1.54505646e+00 -3.63312006e-01 2.96572685e-01 - 5.33695221e-01 1.49295259e+00 -3.91024679e-01 1.47948182e+00 - -6.59023821e-01 1.32864642e+00 8.17467153e-01 -9.13458586e-01 - 3.07039231e-01 -5.12358360e-02 6.25584602e-01 -2.23271593e-01 - -3.07246238e-01 1.36909461e+00 -9.87208933e-02 8.26422334e-01 - -2.25351483e-01 1.66666329e+00 -6.78334653e-01 -6.14143550e-01 - 1.52451718e+00 -4.44946378e-01 1.60107434e+00 8.91840577e-01 - -1.54649705e-01 -1.81329346e+00 -1.93183050e-02 1.01630054e-01 - 1.16721773e+00 1.58844721e+00 -6.84987187e-01 8.01181793e-01 - 7.65402138e-01 1.07341266e+00 4.98689920e-01 -1.94249773e+00 - -1.55422434e-01 -1.15570760e+00 -2.02605769e-01 9.80004251e-01 - 3.05150837e-01 1.14809084e+00 3.41753930e-01 -1.29832590e+00 - -2.52084708e+00 1.48808762e-01 -2.13926464e-01 -9.76945758e-01 - -1.70062721e+00 -7.34743237e-01 -1.94491491e-01 6.65877819e-01 - 1.88631088e-01 1.60806727e+00 -1.03560400e+00 8.81058872e-01 - 4.72987443e-02 6.94669127e-01 -4.36514854e-01 1.07168043e+00 - 2.95224756e-01 -6.79444969e-01 -1.20240292e-02 -2.39555866e-01 - 4.89178360e-01 -1.17977679e+00 -2.93517020e-02 3.95307243e-01 - 3.30233090e-02 1.34694123e+00 7.74022758e-01 -7.48079689e-03 - 2.16689095e-01 2.95665562e-01 4.03299600e-01 -1.82565796e+00 - -3.01526308e-01 -1.48443902e+00 1.27828097e+00 -6.08495213e-02 - -1.93340942e-01 8.96797359e-01 4.22324806e-01 -1.12863457e+00 - -7.09053650e-02 -3.35166961e-01 -1.74453294e+00 -6.90276206e-01 - -1.83540329e-01 -1.14950933e-01 -1.48223138e+00 1.15166557e+00 - -1.02103062e-01 3.75530757e-02 4.00641650e-01 -7.88076282e-01 - 4.56355959e-01 -3.67074639e-01 -8.96585524e-01 5.73447466e-01 - 1.49167383e+00 7.24829376e-01 9.22977567e-01 -5.20630598e-01 - -1.59761977e+00 1.49013758e+00 2.05451488e+00 1.54280674e+00 - -3.35785925e-01 -1.15463436e+00 -1.06568265e+00 -1.68373251e+00 - -1.07378617e-01 1.60567796e+00 -1.41662627e-01 3.02322894e-01 - 1.37559700e+00 -4.04095322e-01 2.03553152e+00 1.99209440e+00 - -1.31111339e-01 -5.77534258e-01 6.97238028e-01 -3.47524941e-01 - -1.15655518e+00 1.28123868e+00 1.21644735e+00 1.55511603e-01 - -1.26286471e+00 -1.22952628e+00 1.07229285e-01 -4.99871045e-01 - -3.41092944e-01 1.02196312e+00 7.33179331e-01 1.37814271e+00 - -9.90623236e-01 -3.43191564e-01 7.58981705e-01 4.48078334e-01 - 1.53238034e+00 4.20536697e-01 1.84810913e+00 5.40017784e-01 - 1.06216741e+00 -3.23514938e-01 1.42872119e-02 9.10944402e-01 - 1.61297822e+00 7.29236960e-01 -8.31717700e-02 1.37733507e+00 - -2.19197869e+00 -1.42223454e+00 6.81634247e-01 1.41083561e-02 - 6.05513632e-01 6.44075200e-02 5.70945680e-01 -8.14402819e-01 - 5.70486069e-01 9.94414508e-01 1.22055066e+00 2.01355433e+00 - 2.44098440e-01 2.73911089e-01 -1.52317250e+00 -5.31411409e-01 - -1.93082079e-01 1.05031955e+00 -6.11001134e-01 7.96340048e-01 - 
-1.62171701e-03 -1.77168116e-01 2.48893976e-01 -3.91156375e-01 - -4.29037243e-01 -1.70478594e+00 -2.47954205e-01 -4.79968160e-01 - -5.15981019e-01 4.16873068e-01 6.58631980e-01 2.32194498e-01 - 9.95415211e-01 5.75561941e-01 -2.95056194e-01 -4.62856293e-01 - 3.12822223e-01 5.73767126e-01 1.39784431e+00 1.19117308e+00 - -5.41473031e-01 -1.02006279e-01 6.71017826e-01 1.06799781e+00 - 9.38969493e-01 -6.38778359e-02 -1.25402260e+00 -9.98936355e-01 - 2.72633523e-01 3.42225581e-01 -1.09867942e+00 4.45696115e-02 - 6.31224334e-01 -1.15122664e+00 1.10499716e+00 -3.77531141e-01 - 1.25761306e+00 -5.33452451e-01 -1.02662551e+00 -4.58136976e-01 - -9.51481238e-02 -3.88479918e-01 -2.16001421e-01 3.33965153e-01 - -1.71179369e-01 -6.61123917e-02 -1.70922637e+00 3.91698003e-01 - -8.92032981e-02 1.61029303e+00 -9.39565182e-01 -1.80829537e+00 - 6.18068516e-01 3.46527010e-01 -1.00843298e+00 -1.52974892e+00 - 1.53739679e+00 1.41411042e+00 -2.48745680e+00 -4.58223015e-01 - 1.66419208e+00 -4.76062566e-01 -4.90612239e-01 -2.82759339e-01 - -1.12089694e+00 -1.70477569e+00 1.05254734e+00 -1.84293687e+00 - 2.47271448e-01 8.86579514e-01 -1.45122635e+00 -9.24268425e-01 - 1.21317923e+00 5.32891691e-01 -6.17285728e-01 1.20224905e+00 - 1.20653570e+00 2.69052684e-01 -9.00160253e-01 -4.13448691e-01 - 2.86386758e-01 -1.88594833e-01 9.38957155e-01 -1.52977407e+00 - -1.49426353e+00 1.80871114e-01 -4.91544425e-01 -6.27048612e-01 - -1.76805675e+00 1.95458457e-02 -5.27551062e-02 6.53478980e-01 - 6.89061359e-02 -1.32604077e-01 1.17700064e+00 -1.31246734e+00 - 5.36389232e-01 -1.67114711e+00 -8.38361859e-01 -1.21265709e+00 - 7.81942546e-01 -6.54102340e-02 -1.35032441e-02 -1.10673296e+00 - -2.99693495e-01 -7.63465285e-01 3.83455008e-01 -9.91091132e-01 - 1.32979441e+00 -1.67558050e+00 -8.06519806e-01 1.67720056e+00 - -4.35138762e-01 -1.31487870e+00 2.19071567e-01 -4.75392252e-01 - -8.35870266e-01 -1.09587085e+00 2.76837349e+00 -2.51551896e-01 - -1.92556739e+00 1.49370980e+00 6.11070931e-01 -7.66466439e-01 - -9.33119118e-01 6.73472822e-01 -2.46918321e-01 -1.92597783e+00 - 1.78723991e+00 -4.33990449e-01 1.49557173e+00 1.27843499e+00 - -1.00100958e+00 1.03318989e+00 -6.51442528e-01 -1.86670053e+00 - -4.24598157e-01 3.97637516e-01 -4.62736636e-01 -3.07048351e-01 - 1.14356601e+00 1.06493980e-01 -8.43298316e-01 7.02030301e-01 - 1.97675571e-01 -2.52160478e+00 4.28352922e-01 3.91828716e-01 - 2.01211292e-02 8.91700566e-01 -6.72393560e-01 3.36499780e-01 - -1.56695938e+00 4.07090396e-01 1.95305967e+00 -9.94856358e-01 - -7.59802103e-01 -7.10015416e-01 1.29998222e-01 -1.06303632e+00 - 4.21247393e-01 8.46839964e-01 5.34347236e-01 -1.76841545e+00 - 9.95167673e-01 9.37367439e-01 8.30161393e-01 3.72842252e-01 - 2.20189318e-01 1.03238940e+00 1.92351353e+00 -9.53040481e-01 - 1.07812536e+00 -8.62170979e-02 -1.29673049e-01 -1.53653073e+00 - -2.84789413e-01 -9.97264981e-01 -4.63854432e-01 -1.09818637e+00 - -4.48908389e-01 -4.06110346e-01 9.67549160e-02 -1.61474991e+00 - -1.44551814e+00 3.61914277e-01 -7.97438145e-01 1.07452977e+00 - 2.25269720e-01 6.06762171e-01 1.41069233e+00 -3.09710801e-01 - -5.74486732e-01 8.71702611e-01 5.09513557e-01 -1.87330306e-01 - 1.39145792e+00 -2.12244317e-01 3.27736363e-02 2.12226510e-02 - -9.34593916e-01 7.75853917e-02 1.39676023e+00 4.44968313e-01 - 3.10634263e-02 -1.61767274e-01 3.30867022e-01 7.49766648e-01 - 6.09238803e-01 -3.31864864e-01 3.24781597e-01 4.79939729e-01 - -1.00719500e+00 1.68230549e-01 -2.16130400e+00 4.82533723e-01 - 6.60343051e-01 7.47546136e-01 -3.95046860e-01 3.09046835e-01 - 1.42395389e+00 1.41376960e+00 
-2.91726999e-02 1.39319628e-01 - 1.11091113e+00 -1.50080204e+00 -3.23075742e-01 1.33557045e+00 - 9.75200534e-01 3.69263768e-01 -1.50583243e+00 -4.99102324e-02 - -5.89915395e-01 3.31038892e-01 -8.09346676e-01 -1.02572334e+00 - -2.06486535e+00 1.22180474e+00 -3.99770498e-01 1.57145783e-01 - -2.23811646e-03 -1.06743395e+00 -1.25906491e+00 -1.00531518e+00 - -7.23168433e-01 3.66606236e-01 -7.95717657e-01 1.74649060e+00 - 1.02715099e+00 8.08359563e-01 -1.19658902e-01 1.51428068e+00 - -2.18005627e-01 -3.12614143e-02 1.69974536e-01 7.01823056e-01 - 6.36908054e-01 -4.75125283e-01 -5.32993734e-01 6.39902949e-01 - 7.15236604e-01 -3.93351138e-01 -1.58718812e+00 -2.07316101e-01 - 6.23525798e-01 1.80541682e+00 -1.19795799e+00 -4.26571250e-01 - -3.89403552e-01 1.35580540e+00 -3.49415019e-02 -9.07326579e-01 - 5.58695257e-01 -4.04638529e-01 -1.22438729e+00 -8.92773330e-01 - 1.73404193e+00 3.84209216e-01 1.07760057e-01 -2.49893740e-01 - 2.92148829e-01 2.80235946e-01 2.92168826e-01 1.14845090e-01 - 8.68798316e-01 2.31043454e-02 -9.77938831e-01 -6.12422884e-01 - -1.97901869e+00 2.02681708e+00 8.62754583e-01 -8.30070496e-01 - -2.81844854e-01 -6.36876464e-01 1.54254723e+00 -1.41987717e+00 - 1.14476062e-01 -1.08527839e-01 8.26838613e-01 -1.04009226e-01 - -1.75820887e+00 -2.51117826e-01 -9.72162247e-01 7.95795470e-02 - 1.43417704e+00 -1.73942530e+00 -8.00222754e-01 -5.52692950e-01 - -3.81631106e-01 7.40125656e-01 -1.73544422e-01 4.15802717e-01 - -6.25773072e-01 2.29924351e-01 7.77797639e-01 -7.32299566e-01 - 7.81358123e-01 5.23727685e-02 1.04546022e+00 -2.01419905e-01 - 3.81244659e-01 -4.06062305e-02 -9.62122604e-02 2.58930415e-01 - 5.36807477e-01 -1.24956489e+00 1.43363357e+00 -1.85966301e+00 - -6.84812129e-01 -9.42690670e-02 4.12355959e-01 1.57617247e+00 - -1.85829151e+00 -5.07694125e-01 -2.12683344e+00 3.16219740e-02 - 2.71683574e-01 1.59079576e+00 1.20634764e-01 1.04402518e+00 - 5.83837092e-01 -4.35248196e-01 -1.40896070e+00 2.75002301e-01 - -1.49210203e+00 1.29520702e+00 -1.62062109e-01 1.19517744e+00 - -4.80214447e-01 2.68850774e-01 2.09348857e-01 -4.50441167e-02 - -4.32659507e-01 3.92456025e-01 -9.26163256e-01 -1.20943534e+00 - 1.24282694e+00 7.27167964e-01 6.58916086e-02 1.66514242e+00 - -2.67765522e-01 1.32270324e+00 1.75574079e-01 -1.02475071e+00 - 1.84804022e+00 9.90710855e-01 -1.21021032e-01 7.27171242e-01 - -4.03703421e-01 8.29149008e-01 -2.39107847e+00 -1.01980138e+00 - 9.79259729e-01 -2.49655545e-01 2.07542300e-01 -1.34678566e+00 - 1.84786701e+00 1.19052958e+00 -2.00685218e-01 -4.04695868e-01 - 1.02960432e+00 1.98247039e+00 9.53760266e-01 2.17180276e+00 - -9.22351301e-01 -8.19386840e-01 -1.18894112e+00 4.41760659e-01 - 1.75603554e-01 -1.51951119e-01 8.09258640e-01 1.23232865e+00 - -1.52566242e+00 -5.96972048e-01 8.20143580e-01 -4.67738688e-01 - -7.98206091e-01 1.01853740e+00 -1.28950012e+00 -3.01579803e-01 - 3.28644037e-01 -5.73819935e-01 7.83155680e-01 -5.68876982e-01 - 1.40861952e+00 -1.25746295e-01 6.71270132e-01 7.46023953e-01 - -6.27539992e-01 -2.05348659e+00 3.60755742e-01 7.22883493e-02 - 5.55826008e-01 -2.26527467e-01 -4.30427700e-01 2.00847292e+00 - -1.47235405e+00 7.33950496e-01 -5.74601829e-01 -3.35778624e-01 - -3.17102820e-01 -8.03434253e-01 5.71224213e-01 1.39064360e+00 - 4.70013648e-01 1.03381999e-01 -9.86440480e-01 5.16825259e-01 - 2.15634966e+00 -3.80694628e-01 -3.75983566e-01 -5.61089158e-01 - -4.89414245e-01 -2.74529958e+00 4.05035377e-01 -8.45552206e-01 - -1.61026311e+00 7.53189921e-01 -1.08201778e+00 -3.38669062e-01 - 1.46644235e+00 -1.40921581e+00 -1.53259826e+00 2.69306183e-01 
- 1.14948738e+00 -1.57434177e+00 4.07689750e-01 5.80999553e-01 - -2.05916047e+00 7.98501313e-01 8.76605868e-01 -3.11785579e-01 - 1.25234640e+00 1.13645554e+00 1.03117928e-01 -4.51445401e-01 - -9.72827911e-01 -3.75635102e-02 5.69758892e-01 -3.24176848e-01 - -2.19750094e+00 1.26871395e+00 1.90223217e-01 -3.79624486e-01 - 8.57158363e-01 1.52463865e+00 -2.80191159e+00 4.34263647e-01 - -1.42664921e+00 4.99110520e-01 1.27927199e-01 6.35726511e-01 - -4.99247611e-01 -1.87324011e+00 -4.50368196e-01 4.06847626e-01 - 1.01539230e+00 -1.17183125e+00 1.02706039e+00 -6.47872269e-01 - 3.62449884e-01 -5.09893954e-01 1.82170630e-01 8.91877651e-01 - -1.22701406e+00 -6.79905891e-01 1.84962586e-01 2.40158367e+00 - -8.63189697e-01 2.04013395e+00 2.00442338e+00 -9.73111540e-02 - -2.98152536e-01 -6.92065179e-01 -6.82025328e-02 -1.40671945e+00 - -7.62390196e-02 1.49688029e+00 -6.11085057e-01 -2.40724310e-01 - -1.70114052e+00 3.74062091e-01 2.64481694e-01 6.37023598e-02 - -2.16827050e-01 -2.92757928e-01 5.01900256e-01 -2.88166013e-02 - 3.14971864e-01 2.14982823e-01 1.12342727e+00 6.76994085e-01 - 1.92861840e-01 1.51842201e+00 4.20268536e-01 1.30496264e+00 - 6.06942594e-01 -1.84191906e+00 1.30483472e+00 1.35176241e-01 - -5.19547723e-02 2.81348348e-01 -3.44983160e-01 2.27028862e-01 - -5.91876090e-01 1.27427220e+00 -1.13588117e-01 -3.82583857e-01 - -9.24925804e-01 2.68775791e-01 1.87107101e-01 3.88424605e-01 - -1.02725184e+00 -1.60118115e+00 6.25591278e-01 1.48464191e+00 - -1.02635360e+00 1.58944130e+00 2.83911049e-01 -7.06290454e-03 - -3.97548705e-01 -1.38451803e+00 9.87728536e-01 -7.75606334e-01 - -7.79806674e-01 9.91926312e-01 -1.88298345e+00 -1.52359784e+00 - -1.81342438e-02 2.93284416e-01 4.84734654e-01 -3.01798493e-01 - 2.53036976e+00 -2.27892563e-01 -2.30786145e-01 -7.96986639e-01 - 1.23906814e-01 6.14961207e-01 1.20357800e+00 -1.48928428e+00 - -2.45995343e-01 4.46703613e-01 5.87057531e-01 6.54865801e-03 - 5.42423964e-01 -8.08963895e-01 1.55790544e+00 9.92628098e-01 - -1.34492779e+00 -1.07510304e+00 -1.04692173e+00 6.77969992e-01 - -1.28472781e+00 -3.31024349e-01 4.54704404e-01 -6.61659315e-02 - 2.58887529e-01 -1.06580269e+00 5.94854355e-01 -6.13211870e-01 - -3.24619681e-01 2.37731725e-01 1.03078224e-01 2.39800196e-02 - -7.50626802e-01 -2.99549401e-01 2.18359078e-03 -1.37508523e+00 - -4.62532967e-01 3.87679875e-01 -1.27743661e+00 1.15658164e+00 - 1.26961327e+00 -1.24515057e+00 1.35761964e+00 -1.43581584e-01 - 4.18514550e-01 -4.96651053e-01 9.23044801e-01 4.06094015e-01 - -8.14153254e-01 -1.16520202e+00 -1.46030247e+00 1.87941775e-01 - 5.72471440e-01 6.50965691e-01 -1.17981091e-01 -1.15699649e+00 - -6.82074785e-01 1.15234864e+00 -4.84628558e-01 1.43417627e-01 - -1.07565045e+00 3.36583048e-01 -4.57684577e-01 1.04671276e+00 - 7.73557723e-01 -4.95705932e-01 -1.26664028e-01 -5.51116228e-01 - -1.41943324e+00 -8.57442439e-01 1.98217332e-01 3.25259507e-01 - 1.48547903e-01 -3.01733047e-01 9.33066607e-01 6.35419041e-02 - 7.03253806e-01 -1.77377510e+00 -3.92240024e+00 2.83788025e-01 - -1.42476571e+00 -2.12451696e-01 -3.02097052e-01 1.58740056e+00 - 3.99996549e-01 1.65859029e-01 8.36991131e-01 3.68497580e-01 - -4.57677007e-01 1.07147598e+00 -1.10975015e+00 3.23247194e-01 - -1.54756975e+00 2.03943864e-01 1.92989647e-01 1.67645502e+00 - -1.56497395e+00 -6.00887179e-01 8.22682440e-01 3.35317254e-01 - 1.80305049e-01 -9.56135690e-01 -5.90818346e-01 1.24504101e+00 - -5.15091941e-02 3.75359684e-01 -1.15188330e-01 -1.18308556e+00 - 9.33810413e-01 5.03232360e-01 2.30563688e+00 -1.01558137e+00 - 8.22572947e-01 -1.31154394e+00 
1.57346562e-01 9.76506591e-01 - -2.07213521e+00 -2.04932481e-01 -5.63209713e-01 -7.01957285e-01 - 3.85212660e-01 1.12068570e+00 -1.33278713e-01 2.05486432e-01 - 1.14308752e-01 1.80350640e-03 -1.08934259e+00 1.10844457e+00 - 2.95195371e-01 3.94881487e-01 -8.06509614e-01 -4.59220409e-01 - -1.76134199e-01 -1.53412473e+00 -1.68803430e+00 -6.00747943e-01 - 5.22520781e-01 -6.53137386e-01 -1.01880515e+00 -8.14685643e-01 - -4.55342412e-01 -3.53858978e-01 2.16552138e+00 1.05462062e+00 - 1.02477407e+00 1.16858041e+00 -9.98820841e-01 -1.22236466e+00 - -1.10687029e+00 1.33450615e+00 -5.47191799e-01 -1.25135612e+00 - -4.30996209e-01 -2.38605529e-01 2.68413275e-01 2.50951022e-01 - -8.28180552e-01 6.85656369e-01 -6.39193535e-01 -1.56276330e-01 - 6.86774075e-01 -6.41821384e-01 -5.69351435e-01 5.91668069e-01 - -8.57733190e-01 -3.84792030e-01 5.55742204e-01 2.05409241e+00 - 6.05404496e-01 4.72639769e-01 5.30801594e-01 -1.07319033e+00 - 1.15596741e-01 -2.97664888e-02 3.21584553e-01 1.16508052e-01 - 3.79699677e-01 -3.11467201e-01 5.86089157e-02 -1.22782862e+00 - -3.19907278e-01 1.39083350e+00 3.18808347e-01 -2.09210366e-02 - 1.89612329e+00 -1.36083496e+00 -1.10945559e+00 1.71271145e+00 - 4.59609367e-02 -2.42583036e-01 -1.19920433e+00 4.47696269e-01 - 5.44459879e-01 -1.09449434e+00 -1.40275583e-01 -3.35515827e-01 - 2.55977154e-01 -1.11925030e+00 1.25194740e+00 9.15823758e-01 - -3.63175571e-01 -2.16418713e-01 -1.36422157e-01 -7.78308570e-01 - -4.43429410e-01 -2.70270377e-01 1.61286843e+00 9.46618319e-01 - 9.34654713e-01 3.69925618e-01 -1.11732697e+00 -4.14911062e-02 - 9.70488489e-01 -1.25826466e+00 6.36553228e-01 2.32260919e+00 - 1.06181669e+00 1.41402912e+00 2.18338758e-01 9.67931926e-01 - -7.17853189e-01 7.95863807e-01 6.54388130e-01 2.89911181e-01 - 1.31600738e+00 -5.38784862e-01 -1.87057510e-01 9.54620391e-02 - -1.02961099e+00 2.26931944e-01 -1.71893036e+00 1.04695451e+00 - 4.66373675e-02 -5.20892918e-01 -8.34373593e-01 -2.34579027e-01 - 6.88259721e-01 5.47738969e-01 -7.05537975e-01 -8.46039653e-01 - 6.56898677e-01 -7.22347379e-01 5.99097759e-02 1.23754643e-01 - 3.12062144e-01 8.58270407e-01 -1.26296294e+00 -1.91115928e+00 - -5.67447305e-01 1.80152845e+00 2.02217412e+00 1.83117712e+00 - -1.92981362e-01 -1.78211510e+00 -2.34303904e+00 -6.14038169e-01 - -1.98073626e-01 1.30062544e+00 8.67805481e-01 2.27405086e-01 - -8.89844775e-01 -9.60779548e-01 2.54127681e-01 6.97050989e-01 - 3.91881257e-01 -1.03459799e+00 6.50667608e-01 4.25910980e-01 - -1.07066643e+00 -7.84320951e-01 6.88495994e-01 -2.34507531e-01 - 1.58914733e+00 5.01129270e-01 -4.86631185e-01 -1.02056004e-02 - 6.33833930e-02 -7.28390336e-01 -9.12588179e-01 7.01389909e-01 - 8.45273316e-01 6.03781462e-01 1.51531804e+00 -5.41773260e-01 - 1.67427075e+00 -9.00921106e-01 -1.01268554e+00 -1.75995886e+00 - -4.45795327e-01 -5.03722310e-01 5.25937259e-01 2.43890941e-01 - -1.19297338e+00 -3.92726243e-01 -3.71462017e-01 -1.77598226e+00 - -9.80946720e-01 -7.70813644e-01 1.43362498e+00 1.91450715e-01 - 6.62168741e-01 -1.49863553e+00 1.19421446e+00 1.30110204e+00 - -1.99834502e+00 -7.05316722e-01 4.95765567e-01 6.44388437e-01 - -6.78494751e-01 -3.05499464e-01 -5.97381055e-01 1.10418044e-01 - 1.19717848e+00 -7.71042168e-01 1.00082052e+00 -7.81672060e-01 - -8.47627223e-01 8.18594635e-01 9.21936393e-01 8.51409912e-01 - -1.31579745e+00 -4.65950966e-01 8.22988570e-01 4.15418409e-02 - -1.07369280e+00 4.58317935e-01 -7.14807272e-01 1.79452479e+00 - 1.54484141e+00 6.04097426e-01 1.36100674e+00 6.47913516e-02 - 7.65437484e-01 1.47772026e+00 2.45498687e-01 -2.55187184e-01 - 
-1.70462191e+00 -8.31335261e-02 8.23423207e-01 9.45633411e-01 - 5.04365087e-01 -5.41299105e-01 -1.97682154e+00 -4.95215088e-01 - -3.04344863e-01 -3.12928975e-01 6.18556380e-01 1.98567641e+00 - 1.23529203e-01 -2.16731787e-01 -2.59221822e-01 1.24384768e-01 - -8.28293741e-01 1.20198555e-01 4.50812340e-01 2.09686071e-01 - 4.57541943e-01 4.33747619e-01 -1.77189672e+00 6.36870146e-01 - -6.71014845e-01 -1.09583914e+00 -1.10445178e+00 4.33541596e-01 - -2.22574934e-01 -1.68196881e+00 4.78037804e-01 -1.43960810e+00 - 1.39821365e-01 2.38498926e-01 8.85577083e-01 1.78168476e+00 - -1.36467636e+00 -5.24738543e-02 -2.76335299e-01 4.33749676e-01 - 2.15668768e-01 -3.52546245e-01 -4.40902203e-01 -1.11645091e+00 - 9.87861574e-01 4.59204167e-01 9.17768180e-01 -3.10396135e-01 - -6.56835496e-01 -1.07831502e+00 3.58917594e-01 -1.03979325e+00 - 1.84866309e+00 -1.30764258e+00 2.73660123e-01 1.56685472e+00 - 5.99089742e-01 1.75673708e-01 -9.68483627e-01 -1.38176918e+00 - 7.70050108e-01 1.50614727e+00 -4.38849986e-01 -1.13906145e+00 - -2.45816946e+00 -9.80764866e-01 -1.13656841e-01 1.59912324e+00 - 6.73133671e-01 3.05085391e-01 -4.00119662e-01 -5.83165705e-01 - 1.40564764e+00 2.00738534e-01 -2.10260057e+00 -6.73539639e-01 - -8.58216047e-01 -5.46263993e-01 5.62910676e-01 -1.07444882e-01 - -7.68654048e-02 1.31347311e+00 -6.96461082e-01 1.86419952e+00 - -6.71307564e-01 8.91981944e-02 -2.81203598e-01 1.23419136e-01 - 4.44591999e-01 4.33835238e-01 -7.53117323e-01 -1.12291467e+00 - 2.50317168e+00 -4.83984351e-01 6.99212432e-01 -1.60764134e+00 - -2.79612720e-01 -1.27199382e-01 4.72815067e-01 -2.72245407e+00 - 1.66140616e-01 -1.14974141e+00 3.24345708e-01 -1.03632915e+00 - -8.55482697e-01 -2.73627609e-01 -1.05232942e+00 -4.75534439e-01 - -1.19337583e+00 -1.83317041e+00 1.01795709e+00 -1.29604864e+00 - -1.78680882e-01 7.52891064e-01 -6.14944100e-01 -8.25165927e-01 - 2.26238087e-01 1.11793387e+00 -1.16931522e+00 -1.50147825e-01 - -1.73249459e+00 -3.93200994e-01 3.42046842e-02 -8.54624629e-01 - -4.56991822e-01 -1.40822673e+00 7.70786107e-02 1.05569422e+00 - -1.09714389e+00 -6.22671783e-01 5.02546251e-01 1.21209073e+00 - -1.49318981e+00 -1.68784738e-01 2.83130825e-01 -1.02056392e-01 - -2.47679210e+00 5.01089990e-01 1.51933634e+00 1.16661750e-01 - -1.03110600e+00 -5.13127267e-01 1.76991308e+00 9.09962535e-01 - 6.40026629e-01 3.56412590e-01 -6.61137104e-01 -2.00571179e-01 - -7.79227391e-02 5.49574316e-01 1.50891888e+00 1.11833811e+00 - -1.41938436e+00 2.23994970e-01 7.36763358e-01 6.18860304e-01 - -6.29233778e-01 2.91410923e+00 -1.27708459e+00 -1.08318841e+00 - 2.33595088e-01 2.11680317e+00 -8.36194873e-01 -2.65828371e-01 - 1.07729363e+00 -1.78621542e-02 -3.34603846e-01 5.65632880e-01 - 2.72517979e-01 1.03043878e+00 1.29146886e+00 3.07003051e-01 - 2.91732728e-01 2.14110985e-01 4.89130557e-01 5.51706366e-02 - -6.08482340e-04 1.19662714e+00 1.51611614e+00 -1.50755298e+00 - 1.00211520e-02 -6.88694417e-01 -3.10971469e-01 4.02516216e-01 - 6.69438362e-01 -1.12126447e-01 -1.91334033e+00 -9.65045571e-01 - 8.96499693e-01 -2.74042654e+00 1.18742716e+00 5.21229982e-01 - 1.31617606e+00 -1.98569193e-01 -2.06140542e+00 4.30362642e-01 - 1.27465308e+00 9.24721777e-01 -6.62691474e-01 8.67610723e-02 - -5.32836258e-01 -8.19266438e-01 9.55388546e-01 -1.19753230e+00 - -8.93816411e-01 -1.10402560e+00 -1.49024749e+00 1.46414295e-01 - 1.16150308e+00 -1.84876755e-01 -1.50574780e+00 8.39944541e-01 - 1.68570173e+00 -6.46242321e-01 -4.78554279e-01 5.15660644e-01 - 1.62187374e+00 -5.19834161e-01 -1.33244979e+00 6.31418586e-01 - 7.99068093e-01 6.13145065e-03 
-5.58133245e-01 9.94378746e-01 - 5.28035998e-01 -7.68435538e-01 4.90947329e-02 -1.35170758e+00 - 6.85890436e-01 -2.62233555e-01 -4.35710818e-01 -3.28744836e-02 - 4.03077006e-02 8.94930065e-01 5.91090560e-01 -1.56409279e-01 - 7.63668180e-01 4.00737256e-01 -4.26599622e-01 -1.24718159e-01 - -1.48222893e-01 5.05261362e-01 -3.67022604e-01 -3.90409172e-01 - 1.96875727e+00 9.39735472e-01 -7.60600686e-01 5.70940971e-01 - 1.18350066e-01 3.91551167e-01 1.37341321e-01 1.18964267e+00 - 3.45620334e-01 -6.20886981e-02 1.04219055e+00 2.33285040e-01 - 1.21478580e-01 3.14022601e-01 -9.36090201e-02 9.22699273e-03 - -5.86694598e-01 -7.51167238e-01 -9.98570383e-01 -1.39198327e+00 - -8.79717350e-01 -1.74267039e-01 1.03117585e-01 -2.71296561e-01 - -2.35570359e+00 1.57458639e+00 1.08648014e+00 4.45759863e-01 - 1.90749073e+00 -9.77364123e-01 1.88762510e+00 -9.65214074e-01 - 2.38737926e-01 -1.27383733e+00 1.46940956e-02 3.86166483e-01 - -1.54690325e-01 7.13877439e-01 9.90144461e-02 -5.22414744e-01 - -5.66813290e-01 8.10913801e-01 -6.53742015e-01 9.15402949e-01 - 1.22271180e+00 -1.66871381e+00 1.13292634e+00 2.06042266e+00 - 2.52003759e-01 5.99084198e-01 1.03700531e+00 -2.75482327e-01 - 1.82280228e-01 -1.31419957e+00 1.38635024e-01 4.98111993e-01 - 5.16163051e-01 1.06542468e+00 -7.74802148e-01 -5.32606781e-01 - 1.89550734e+00 -1.50328368e-01 2.00033784e-01 -6.70100451e-02 - -8.21680844e-01 1.62352920e+00 -5.27701437e-01 -1.12728548e+00 - -4.50740278e-01 -2.00170588e+00 -6.51287436e-01 -9.68128622e-01 - 6.17428660e-01 -6.73414230e-01 8.87803912e-01 1.39848399e+00 - 7.81175315e-01 -2.15388283e-01 1.41114503e-01 -1.52498496e+00 - -1.33779377e-01 -8.60438466e-01 6.94326401e-01 2.57275248e+00 - 4.99821633e-01 -1.99114013e+00 -3.61230195e-01 1.22490928e-01 - 8.73574242e-02 -7.30813146e-01 2.45291740e-01 -1.24250078e+00 - -9.44908738e-01 6.45797670e-01 -1.01566327e+00 5.38211584e-01 - -1.36946380e+00 1.76482594e+00 -7.74269938e-01 -2.23666579e-01 - 3.30613256e-01 -5.65349936e-01 1.43405959e-01 5.75356722e-01 - 9.60613251e-01 7.91497409e-01 -1.71810463e-01 -1.46631777e-01 - 2.34910846e+00 9.49889064e-01 -9.21823680e-01 9.99881625e-02 - -3.35143149e-01 -1.35334238e-01 -1.22949517e+00 -4.11923230e-01 - 1.81155574e+00 5.79076290e-01 -2.49116135e+00 1.21672344e+00 - 1.08962953e+00 -3.59285295e-01 7.17263520e-01 -8.59399199e-01 - -4.70255703e-01 1.49854636e+00 -2.71730036e-01 -1.86973166e+00 - -6.33032247e-02 -7.85883963e-02 1.19836855e+00 1.43364221e-02 - -4.21582311e-01 1.04505610e+00 1.20790236e-01 -7.50022709e-01 - 1.15499759e+00 1.93661764e-01 2.92596663e-03 1.77908391e-01 - -1.07066834e+00 -5.27653754e-01 3.58260190e-03 -1.94663689e-01 - 1.27242839e+00 2.17405856e-01 8.54903936e-01 9.39535443e-03 - 1.05628848e+00 3.97527754e-01 1.34204850e-02 -1.32830888e-02 - -9.54438627e-01 5.06915748e-01 -4.02047336e-02 -5.01991987e-01 - 1.10646176e+00 -1.25021601e+00 -4.22495902e-01 1.02771878e+00 - 3.15522939e-01 -4.78589274e-02 2.51003534e-01 1.79414168e-01 - -1.51666260e+00 -3.49261880e-01 1.61652160e+00 -6.59324110e-01 - -3.18301022e-01 -5.41620791e-01 -8.70152265e-02 -5.83122849e-01 - 4.86741513e-01 1.97313413e-01 7.73547590e-01 -1.05094111e+00 - -3.40340495e-01 -2.25722626e-01 4.55646902e-01 -7.93221176e-01 - 9.15653825e-01 1.18641555e+00 -9.88872349e-01 -4.90872890e-01 - -5.31931520e-01 2.63238621e+00 -4.29900110e-01 1.67762542e+00 - -1.26663196e+00 -9.89650965e-01 -6.86604559e-01 -8.45689297e-01 - -2.67898738e-01 -7.08778203e-01 -8.19341600e-01 1.15847194e+00 - 1.22379363e+00 9.49296474e-01 5.23998976e-01 9.27430689e-02 - 
-4.97030705e-01 -9.53157485e-01 3.62995714e-01 -1.17363036e+00 - -9.93920922e-01 1.15291321e+00 1.90064907e+00 3.93295139e-01 - 1.10645711e+00 -2.45848313e-01 -1.33767414e+00 6.12171590e-01 - 5.69289327e-01 7.16685504e-02 -2.42375463e-01 9.62210596e-01 - -9.85733032e-01 -7.73780584e-01 -1.00810647e+00 8.49729627e-02 - -1.08789301e+00 -6.84242189e-01 -7.05755591e-01 -2.60608345e-01 - -1.48737442e+00 8.72049928e-02 1.24730361e+00 1.35491371e-01 - 1.54132116e+00 1.33399820e+00 7.77735054e-01 7.46858418e-02 - 2.25003883e-02 7.63799787e-01 -3.91670614e-01 -1.88252079e+00 - 4.92004067e-01 -1.13106859e+00 -9.62141216e-01 2.23947995e-04 - -9.51248780e-02 -1.41207564e+00 1.65491566e-01 9.66139674e-01 - -1.12834156e+00 1.27248335e+00 -1.56017077e+00 2.11732411e+00 - -4.11260426e-01 1.45616993e-01 -1.89203358e+00 -6.21360652e-02 - 2.28482172e-01 -8.61621052e-02 -5.63081086e-01 -9.46564496e-01 - 1.13052666e+00 3.08071356e-02 1.54075241e+00 -5.26482344e-01 - -4.34794694e-01 8.15594256e-01 -1.62297499e+00 -1.21346331e+00 - 1.09748495e+00 -1.17193389e+00 -1.81905103e+00 1.86852717e+00 - -1.39381766e+00 1.84119403e-01 6.90851957e-02 -2.81062841e+00 - -2.67317027e-01 4.48413253e-01 3.54309380e-01 9.03520703e-01 - -3.87491316e-01 7.12504447e-01 -6.89329088e-01 -1.41046739e+00 - -1.29772842e+00 -1.46211040e+00 1.00958598e+00 -1.51457345e+00 - 2.74038851e-01 -2.65924186e-01 -1.02648571e-01 1.36360729e+00 - -4.38427359e-01 6.11601591e-01 3.75633501e-02 -1.52018905e-01 - 2.29996920e+00 -1.44134176e+00 1.28913188e+00 6.56406581e-01 - -4.79525000e-01 1.62931848e+00 -1.84811008e+00 -5.21766663e-01 - -1.20848215e+00 4.12083477e-01 -1.11306405e+00 4.33063954e-01 - -1.86436415e-01 -4.79353935e-01 3.88647258e-01 -2.68621624e-01 - -1.08863485e+00 2.80137300e+00 -4.87167180e-01 -2.21760917e+00 - -8.42098713e-01 -1.26727664e+00 8.26145589e-01 3.21652651e-01 - 3.98712814e-01 -3.70022476e-01 -2.79350251e-01 -3.70931089e-01 - -8.11931863e-02 8.35333407e-01 3.36075425e-01 5.00112832e-01 - 4.99558896e-01 8.84442449e-01 2.88150042e-01 3.58848065e-01 - 1.18880630e+00 -6.65045917e-01 -8.55742574e-01 -1.18592155e+00 - 1.82914570e-01 -1.85373223e+00 4.62154090e-01 -1.09193094e-01 - 1.13463974e+00 -6.71917319e-01 -2.03318283e-01 1.12220335e+00 - 1.38313568e+00 -6.80152357e-01 1.81781232e-01 2.94418991e-01 - 3.73653412e-01 -2.62575656e-01 1.28138340e+00 -1.88970351e+00 - -4.21130240e-01 4.47930723e-01 4.57224816e-01 1.35275900e+00 - 1.19822538e+00 -2.85541624e-01 1.29987121e+00 -7.14041770e-01 - 9.30887401e-01 1.81052244e+00 1.21494412e+00 1.05770387e-01 - -3.31778765e-01 -1.52859128e+00 3.10539633e-01 -1.11521721e+00 - -7.93134689e-01 -1.19074583e+00 -4.60392326e-01 -2.79694144e-02 - 2.73614585e-01 -6.00506663e-01 1.06928372e+00 -2.08205253e-01 - -7.87722409e-01 2.26292267e-01 1.96355283e+00 3.35160553e-01 - -3.62855613e-01 -1.52746141e+00 -2.19412327e-01 2.34928131e-01 - -3.87580901e-01 -1.66538465e+00 -8.50565374e-01 1.45453846e+00 - 1.43400097e+00 -8.86639178e-01 6.72756791e-01 -1.17857277e+00 - 1.40431631e+00 2.45235845e-01 9.49641526e-01 -1.71509337e+00 - 6.77637100e-01 -7.67406356e-03 8.51929069e-01 9.14867818e-01 - 6.57846928e-02 -6.00829661e-01 7.79696167e-01 -6.01544976e-02 - -8.72784972e-01 -9.20152247e-01 -1.47716439e+00 -5.10291159e-01 - 1.10269201e+00 1.77829802e-01 -5.29767692e-01 8.04855943e-01 - -8.32476616e-01 9.53502730e-02 -1.79630369e-01 7.79130638e-01 - -1.57722843e+00 -8.50197732e-01 -1.63824570e+00 -2.59596735e-01 - 1.21142745e+00 -5.05447209e-01 -5.10683954e-01 8.71724427e-01 - 1.06854951e+00 -8.85710642e-02 
[Large block of raw floating-point array values (data embedded in the diff) elided; the span contains no prose or code.]
-5.16709089e-01 -2.19526589e-01 7.20013559e-01 -4.07660574e-01 - -8.23861301e-01 -3.51513386e-01 -7.69145250e-01 -9.34005499e-01 - -1.61301684e+00 6.94781169e-02 -5.68705678e-01 9.66278732e-01 - -1.44739604e+00 -5.42083561e-01 1.81290722e+00 -9.39609528e-01 - -2.36533254e-01 -1.21584404e+00 -1.34842753e+00 1.54731095e+00 - 1.71039492e-01 1.33678839e-01 -1.69631624e+00 3.58741403e-01 - 1.15742195e+00 1.14947701e+00 6.68333232e-01 -1.12948561e+00 - 3.13123405e-01 6.67584956e-01 -1.79444361e+00 -1.12127960e+00 - 8.88157308e-01 8.76045763e-01 -1.54858303e+00 7.95021474e-01 - 9.53034014e-02 7.81008959e-01 -6.34521306e-01 6.45183504e-01 - 8.43828797e-01 -3.51334929e-01 1.77525468e-02 -6.48521602e-01 - 1.65163898e+00 -1.20271936e-01 -9.22024310e-01 -7.15825915e-01 - 1.26770601e-01 7.53331184e-01 -1.33582687e+00 1.84765995e+00 - 3.08675289e-01 4.66455966e-01 -9.71895456e-02 1.92589667e-02 - -1.48283139e-01 1.01779592e+00 -7.92858750e-03 7.67050207e-01 - -4.33847517e-01 1.95640534e-01 -8.03920329e-01 1.21652818e+00 - -8.33747268e-01 -3.24979454e-01 -3.14960420e-01 -3.53951395e-01 - 7.22495794e-01 -3.16657752e-01 1.39392447e+00 -8.18475068e-01 - 1.69377476e-01 -7.75785983e-01 4.20972794e-01 -1.59497261e-02 - -1.55062050e-01 1.18847501e+00 -6.54341877e-01 5.86515129e-01 - -7.68959939e-01 1.12685204e-01 -1.19557536e+00 -6.62517130e-01 - -8.21430460e-02 1.81464922e+00 1.00803867e-01 1.65466833e+00 - 2.20323294e-01 -3.87084246e-01 1.53935647e+00 4.37017620e-01 - -6.42721593e-01 4.70571458e-01 -2.92594761e-01 -1.98903418e+00 - 9.35973167e-01 1.09493411e+00 1.83794200e+00 4.22143698e-01 - -2.48723418e-01 8.34989429e-01 8.49820375e-01 1.63828123e+00 - -4.46276218e-02 -1.65578976e-01 -7.15544403e-01 5.37462115e-01 - -1.54826164e+00 7.76087344e-01 5.87835133e-01 5.85070193e-01 - 1.38225242e-01 1.53297830e+00 8.71993482e-01 -6.07077181e-01 - -5.89179039e-01 -8.79673839e-01 5.51283360e-01 -1.00013888e+00 - 1.08339214e+00 1.32297325e+00 -8.79016280e-01 -1.85548529e-01 - 1.66209806e-02 8.83983135e-01 1.60934341e+00 -1.68549240e+00 - -3.18180710e-01 -2.66016692e-01 -1.40066981e+00 -4.85110015e-01 - 4.27768946e-01 -2.78678894e-01 -1.54462111e+00 2.87408280e+00 - 2.66177863e-01 1.11694127e-01 -1.09792638e+00 2.60706156e-01 - -1.64213791e-01 -1.88852346e+00 -1.07025540e+00 7.94926047e-01 - 1.94401765e+00 3.67165416e-01 1.10308325e+00 -1.55755311e-01 - -8.65461603e-02 1.09097898e+00 1.66277730e+00 1.30109668e+00 - -1.06636310e+00 -6.69588894e-02 2.77524680e-01 -2.59677976e-01 - 3.87857556e-01 8.20576787e-01 1.00280976e+00 1.68587017e+00 - -3.29441100e-01 -1.36403322e+00 3.58234584e-01 1.88381100e+00 - -4.76975858e-01 6.86846972e-01 6.95745766e-01 1.91442043e-01 - 6.92472095e-03 -9.23429191e-01 -2.28166199e+00 -1.11185074e+00 - -3.90304148e-01 1.09742272e+00 -1.03128061e-01 -5.19025743e-01 - -8.90985429e-01 6.56805813e-01 1.35835719e+00 1.06343567e+00 - 6.06149673e-01 -1.47561276e+00 -3.51674229e-01 1.94730425e+00 - 1.12195051e+00 -1.53750324e+00 -5.55905215e-02 3.63098443e-01 - -7.62158453e-01 -2.30920410e+00 9.88240466e-02 -7.87758231e-01 - -7.83841729e-01 -3.54680359e-01 -1.33675918e-01 -9.08279419e-03 - -8.83989871e-01 -2.20209524e-01 -1.75742340e+00 3.11039060e-01 - -2.85627532e+00 -2.89490074e-01 9.19186175e-01 -1.05934072e+00 - 7.16430187e-01 1.15573275e+00 -3.30789995e+00 -1.51362911e-01 - -2.87577343e+00 -1.18208446e-01 -7.17121083e-03 -2.48254395e+00 - 1.92483830e+00 2.86015809e-01 -8.49023461e-01 1.46537948e+00 - -1.79561317e+00 -1.22917104e+00 -2.29945564e+00 1.80104518e+00 - 3.32346380e-01 6.59401774e-01 
3.01001221e-01 5.73767781e-01 - 7.80777112e-02 2.70921350e-01 1.07603438e-01 -6.08591795e-01 - -4.66247648e-02 2.73594588e-01 1.10549122e-01 -1.47777212e+00 - -1.34498668e+00 1.08284140e+00 -3.06994051e-01 5.84725320e-01 - -1.09211993e+00 -1.47314584e+00 -8.43596637e-01 -2.22565070e-01 - -6.65140271e-01 1.29393351e+00 1.79523304e-01 1.84893858e+00 - 4.04408157e-01 -2.41255093e+00 1.05604160e+00 -1.50955164e+00 - 2.30533674e-01 -7.74684072e-01 2.14271092e+00 1.43959796e+00 - -7.41820633e-01 -1.27357662e-01 2.44598076e-01 8.63414049e-01 - 2.91519128e-02 -1.02489495e+00 -7.17558622e-01 -4.48179618e-03 - 1.23445702e+00 4.27837849e-01 -7.73414910e-01 6.48372769e-01 - -3.86984825e-01 8.90407979e-01 2.52701044e+00 -3.58826369e-01 - 8.01605165e-01 -7.45393336e-01 -1.12263769e-01 2.95622870e-02 - -1.93683654e-01 2.46301365e+00 -3.06593060e-01 5.86851537e-01 - -1.93008685e+00 4.31996554e-01 1.14205456e+00 -5.27597368e-01 - -2.51022220e+00 -3.86517681e-02 1.71838496e-02 7.24075079e-01 - 2.01090312e+00 -3.80053639e-01 -8.56288135e-01 -1.16691780e+00 - 1.98078617e-01 4.79354262e-01 -5.45655370e-01 7.78682768e-01 - 4.65384066e-01 4.32316884e-02 -9.02414739e-01 -4.81864303e-01 - -2.42975521e+00 2.35787064e-01 2.63412893e-02 1.88478991e-01 - 5.04319012e-01 3.85821491e-01 -1.10383183e-01 4.86680418e-02 - -1.40431643e+00 -5.83720624e-01 5.04710525e-02 -7.35853553e-01 - 8.60150814e-01 -6.58632100e-01 2.16305405e-01 1.16565660e-01 - 7.46277571e-01 9.00791168e-01 6.94032609e-01 -2.10600615e+00 - 1.19414818e+00 1.52730823e+00 -4.82124299e-01 2.36646906e-01 - -8.90831709e-01 -1.94837856e+00 -7.38715291e-01 -5.40139019e-01 - -3.54850739e-01 -9.58914101e-01 -7.33425140e-01 -1.10063218e-02 - -7.70226955e-01 5.43524325e-01 6.81660652e-01 2.38038778e+00 - 1.12567997e+00 2.43103057e-01 -6.20264292e-01 2.93747365e-01 - 3.26595962e-01 -5.13313115e-01 7.83731401e-01 -1.39809799e+00 - -2.44399831e-01 5.55858433e-01 -2.87584156e-01 -2.06754401e-01 - -1.37785867e-01 -8.35601270e-01 1.17390096e+00 -1.01803768e+00 - 3.18159342e-01 4.13647890e-01 -9.48621929e-01 7.31042549e-02 - -9.09632146e-01 1.39104247e+00 -1.13878298e+00 -6.53897107e-01 - -1.22422504e+00 -1.35769868e+00 1.59121525e+00 5.42774677e-01 - 1.40293419e-01 -6.56576991e-01 9.07611787e-01 -8.24222445e-01 - 7.93639302e-01 -8.72511268e-01 -6.44507706e-01 7.58282363e-01 - 8.63827348e-01 5.97472906e-01 -1.04967415e+00 -4.46726859e-01 - 1.56898737e-01 -8.49280953e-01 1.37096274e+00 -2.40194702e+00 - -7.78739512e-01 -5.09566486e-01 -2.84394360e+00 -1.39622915e+00 - 6.25289500e-01 3.02836373e-02 1.57313597e+00 1.54522645e+00 - 4.33046371e-01 3.13238263e-01 6.59311116e-02 -2.65508199e+00 - -1.39455950e+00 1.00675501e-01 6.16025686e-01 2.89669812e-01 - -8.92464995e-01 9.37406301e-01 1.48244405e+00 9.37989593e-01 - 9.89599347e-01 -5.59225798e-01 1.07593417e+00 1.45758644e-01 - 5.11555552e-01 3.65431845e-01 -1.21487103e-01 -8.60002458e-01 - -8.21366608e-02 9.86761630e-01 -4.41475809e-01 8.93347263e-01 - 7.37070680e-01 -2.98937410e-01 1.38880014e+00 -3.55546683e-01 - 1.50954688e+00 4.34126973e-01 -1.58659182e-02 -8.87801409e-01 - -8.82602334e-02 -2.03759506e-01 -1.25829637e+00 -2.17615426e-01 - -8.03803205e-01 5.85991740e-01 1.30628049e+00 -1.22691572e+00 - 1.71230769e+00 -1.28551036e-01 -9.75021362e-01 7.57754803e-01 - -4.46454465e-01 -8.17366764e-02 -4.04278249e-01 -8.16737950e-01 - 7.23997056e-01 -2.86301404e-01 -1.43665183e+00 5.51060140e-01 - -1.20558238e+00 -1.15326023e+00 1.19260621e+00 -1.19633985e+00 - -2.31644487e+00 6.22861505e-01 -3.03863436e-01 -2.17655122e-01 - 
-3.14500391e-01 2.31669933e-01 -7.52719223e-01 1.47611487e+00 - -1.47623050e+00 4.44874913e-01 6.42121017e-01 -7.59651780e-01 - -1.02373445e+00 1.01717424e+00 -9.90860760e-01 -6.06663942e-01 - 8.87308955e-01 3.47412340e-02 -1.79714370e+00 1.74705911e+00 - -5.30010760e-01 -4.02757287e-01 2.55784243e-01 6.25076771e-01 - 1.96110427e+00 -1.66006815e+00 8.25952232e-01 7.43009686e-01 - -1.37164906e-01 7.87273705e-01 9.34274018e-01 -1.38196278e+00 - 1.74976647e+00 -1.55876231e+00 -1.13531625e+00 -2.61787683e-01 - 1.92494047e+00 -3.08256626e-01 -3.19338888e-02 1.43207860e+00 - -3.00318152e-01 -1.17843676e+00 -4.17234838e-01 1.37079179e+00 - 7.08649978e-02 1.93065822e+00 7.37934947e-01 -6.44808948e-01 - 5.10763042e-02 -8.53218138e-01 -6.68433011e-01 4.47908401e+00 - 1.01862454e+00 8.61097991e-01 -6.23180330e-01 1.56958401e+00 - 1.09941566e+00 1.88162124e+00 1.83311415e+00 -1.47908926e+00 - -1.10983944e+00 -1.00378776e+00 -9.43940639e-01 3.23452353e-01 - -5.66803455e-01 2.07259846e+00 1.90467989e+00 -6.46159425e-03 - 1.95532620e+00 1.48813462e+00 -4.50706571e-01 -2.00921535e+00 - -2.17968389e-01 -6.44622147e-01 2.06505626e-01 8.92282873e-02 - 6.30176589e-02 5.23684740e-01 -1.21252191e+00 4.11360264e-01 - -4.39616144e-01 1.99982196e-01 9.57409859e-01 -1.22439981e+00 - -2.81293660e-01 -1.18534237e-01 -5.14268994e-01 8.99829507e-01 - -7.20388353e-01 -5.55400372e-01 -7.09919155e-01 1.09205544e+00 - -4.92444098e-01 4.60045397e-01 1.97570515e+00 -7.48184174e-02 - 1.11206067e+00 1.67135406e+00 1.02719152e+00 -1.68013163e-02 - 6.42107785e-01 -1.28737855e+00 -7.80043185e-01 -6.04675174e-01 - 2.24061203e+00 -8.12584519e-01 -8.27476740e-01 -1.04392099e+00 - 5.59811771e-01 1.42621398e-01 1.08675897e+00 6.71259642e-01 - -1.38038981e+00 -8.07277799e-01 1.36532879e+00 -2.03298241e-01 - 1.18759179e+00 -1.49680942e-01 -9.67656434e-01 -3.03260773e-01 - -9.68049824e-01 -5.65552771e-01 2.00925708e-01 5.11605084e-01 - -1.59019136e+00 -1.73680075e-02 -5.86615741e-01 -6.48414791e-01 - 1.32684618e-01 -5.50451875e-01 1.38213801e+00 1.18806183e+00 - 1.18182826e+00 -4.94910210e-01 -9.62498844e-01 8.20299312e-02 - -1.50348365e-01 -6.08823180e-01 -1.17573440e+00 -4.15179692e-02 - -4.04069453e-01 6.88347042e-01 6.65106058e-01 1.10079980e+00 - -1.25929558e+00 4.01058316e-01 -7.27465689e-01 -1.22041416e+00 - -4.28602248e-01 1.06870496e+00 -3.72318357e-01 1.56545460e+00 - 5.48878491e-01 -3.94090474e-01 1.16004848e+00 4.45396781e-01 - -9.32559371e-01 1.85880268e+00 2.59478837e-01 1.87961925e-02 - -8.55532646e-01 1.17953825e+00 5.17275453e-01 6.29238188e-01 - -5.07531345e-01 8.34100544e-01 9.11813617e-01 -6.68778241e-01 - -1.41699016e+00 5.81373036e-01 -1.57334697e+00 1.52015775e-01 - -4.47245091e-01 1.21866524e+00 -3.92843902e-01 1.10727298e+00 - -5.14209449e-01 9.99991000e-01 2.06034160e+00 -1.06804717e+00 - 2.92035729e-01 2.74878240e+00 6.65888265e-02 6.17882684e-02 - 7.88498938e-01 4.10808623e-02 -1.70722604e+00 1.13228738e+00 - -2.00294852e+00 -4.16791648e-01 1.20742571e+00 -3.56542438e-01 - -6.93696499e-01 8.79263878e-01 -8.51114810e-01 -1.22222567e+00 - 2.74513990e-01 -1.82923302e-01 -1.24367309e+00 6.36952519e-01 - 4.64487612e-01 -3.59755009e-01 -2.91332006e-01 4.70392734e-01 - 1.19036138e+00 -2.82793224e-01 6.54688478e-01 5.65200210e-01 - 8.79307270e-01 3.93091999e-02 -9.12325501e-01 -4.62499142e-01 - 3.42853457e-01 1.81846440e+00 -4.84019637e-01 8.04787457e-01 - -3.28056455e-01 1.10468753e-01 1.51539671e+00 7.64292598e-01 - -1.24129927e+00 -5.72475016e-01 1.25622129e+00 1.41878039e-01 - -1.14884007e+00 -8.18952441e-01 
7.03493536e-01 9.94410276e-01 - 1.78553015e-02 2.47531390e+00 8.24152350e-01 -2.43481070e-01 - -8.15706074e-01 -9.74884331e-01 3.35421324e-01 -4.88777041e-01 - 1.67374372e+00 1.46631181e+00 6.33066535e-01 -1.17899621e+00 - -6.72218859e-01 1.25567758e+00 -5.63066185e-01 1.12214172e+00 - 4.74603921e-01 5.17178848e-02 -7.02832818e-01 -9.42438468e-02 - 1.89647958e-01 1.03253889e+00 -8.85735571e-01 -2.62816846e-01 - 4.48231325e-02 6.62678361e-01 -9.08464313e-01 -1.31656981e+00 - 6.73965156e-01 -1.51794910e+00 1.28497970e+00 -1.55926093e-01 - -1.54753113e+00 -4.80420232e-01 3.79123628e-01 -5.32701075e-01 - 2.00297497e-02 7.40078390e-01 -2.02190340e-01 -9.37057674e-01 - -2.98983991e-01 1.41484010e+00 1.68506432e+00 -2.01992607e+00 - 1.09102786e+00 2.50147730e-01 9.23982680e-01 -5.50488234e-01 - -1.76826346e+00 5.84488809e-01 -9.40977275e-01 -1.38829243e+00 - -1.31840134e+00 6.00913763e-01 1.39452472e-01 7.82465994e-01 - 6.69075251e-01 -4.20933574e-01 -7.04735741e-02 -5.11437535e-01 - -2.10327774e-01 -9.67254758e-01 3.90484303e-01 1.07512081e+00 - -8.16613019e-01 1.28126967e+00 1.13521494e-01 2.41078973e-01 - -1.69942951e+00 -1.80726230e+00 1.44116357e-01 7.89732099e-01 - -6.71380460e-01 1.67009389e+00 7.34935999e-01 -9.08341229e-01 - 3.08486670e-02 2.47692919e+00 -1.24256396e+00 5.28269947e-01 - -3.22744548e-01 7.06896305e-01 1.26727366e+00 -3.86237085e-01 - 1.11891222e+00 2.22120333e+00 6.62747398e-02 7.84888685e-01 - -5.90440750e-01 1.92593094e-02 -1.76675096e-01 7.96935916e-01 - 1.20467591e+00 -1.09654851e-01 1.53362286e+00 1.53004742e+00 - 8.46853733e-01 -5.16026556e-01 -6.11828804e-01 -8.12381327e-01 - 9.49166059e-01 -1.21176815e+00 6.76837683e-01 -1.08110940e+00 - 8.23546112e-01 1.72195826e-02 1.33363855e+00 1.14145458e+00 - 1.43506539e+00 1.46081877e+00 1.43630832e-01 -1.15879309e+00 - -1.05252302e+00 -2.08684850e+00 -1.08366585e+00 6.88236579e-02 - -5.59654176e-01 -1.80135608e-01 4.92715418e-01 -3.04065257e-01 - -1.12182879e+00 -5.32336682e-02 -8.98490489e-01 -2.70786494e-01 - 1.34277701e+00 -2.70505309e-01 1.77095759e+00 1.43163598e+00 - -5.79305291e-01 -1.11112380e+00 -5.25541604e-02 1.15332782e+00 - -4.90190893e-01 9.08090293e-01 1.20843995e+00 -7.74025679e-01 - -6.76051080e-01 7.12662041e-01 1.51847556e-01 -6.86061502e-01 - -3.52180451e-01 -7.05195516e-02 -1.56485438e+00 7.65643120e-01 - -4.78692204e-01 -2.78371620e+00 -2.38709692e-02 -7.41571248e-01 - 1.68882215e+00 -5.85445046e-01 -1.09764338e+00 5.07932663e-01 - -6.60447359e-01 -7.79456198e-01 5.09501398e-01 -1.04046726e+00 - -1.45727360e+00 1.72599804e+00 5.79498708e-01 -2.64209270e-01 - -3.47720971e-03 5.05864382e-01 9.73337471e-01 -7.55975306e-01 - 3.66146535e-01 -5.06704152e-01 -1.88981250e-01 1.60692871e-01 - -6.67857286e-03 7.35566914e-01 4.87086564e-01 -1.50816917e+00 - -1.35782325e+00 -3.69716674e-01 1.82862532e+00 -8.51591825e-01 - -2.08222717e-01 -5.36346018e-01 -5.54044366e-01 1.78239748e-01 - 1.40077722e+00 4.55968410e-01 -8.03530991e-01 2.75322962e+00 - 7.85732195e-02 -5.77220976e-01 1.41209638e+00 -4.69070613e-01 - 1.66291368e+00 -8.70642781e-01 1.43003035e+00 2.01047659e-01 - 1.29105222e+00 1.90176570e+00 -1.13260734e+00 1.27577186e+00 - -1.40890396e+00 -1.04718435e+00 -8.03409457e-01 5.03636658e-01 - 3.30158770e-02 -1.67136088e-01 1.46235272e-01 5.91289580e-01 - 6.68248117e-01 1.09416687e+00 1.82003486e+00 -2.13623717e-02 - -5.21088958e-01 6.13536119e-01 -1.14425480e+00 -6.78704143e-01 - 9.99380946e-01 -1.24128371e-01 -1.20819247e+00 -1.48980308e+00 - 4.87166792e-02 1.14189649e+00 1.12854576e+00 1.10880502e-01 - 
2.00890946e+00 -1.04820363e-01 -1.38567626e-01 -2.56743598e+00 - -4.47362959e-01 8.52015018e-01 1.56396139e+00 -8.30234647e-01 - 4.62252319e-01 1.04687905e+00 1.12081533e-02 -2.28328037e+00 - 2.47290000e-01 7.61430621e-01 -9.47422922e-01 -2.54154533e-01 - -1.15549600e+00 -2.57014751e+00 1.02700436e+00 -3.74853378e-03 - 1.59767497e+00 8.57322216e-01 -3.54209751e-01 -8.31806585e-02 - 5.66177368e-01 9.66883421e-01 3.22670639e-01 1.10559320e+00 - -2.96668738e-01 -9.97143567e-01 -6.48092553e-02 2.05184892e-01 - -1.20153534e+00 -1.33672214e+00 4.47386295e-01 -4.06356841e-01 - -1.65840909e-01 -3.59692663e-01 1.52060091e+00 1.26176119e+00 - -1.23735368e+00 -5.32793224e-01 8.50004256e-01 6.01936221e-01 - -4.14579622e-02 -1.13760018e+00 -6.82540059e-01 -5.16767561e-01 - 2.76994318e-01 1.70372713e+00 -6.86880231e-01 -1.02603519e+00 - -1.08586475e-01 1.03173900e+00 -1.44688296e+00 1.45363414e+00 - -1.72031358e-01 -7.34319985e-01 -1.60018933e+00 7.06520796e-01 - -1.71171188e-01 1.70705482e-01 -9.00811732e-01 -3.19096029e-01 - -4.86581236e-01 -1.00593105e-01 -6.69552326e-01 -1.53418100e+00 - 2.72849965e+00 -1.36340427e+00 1.81459010e-01 -1.77202785e+00 - -1.99704742e+00 9.78264883e-02 1.74367592e-01 1.26149905e+00 - 3.99546832e-01 -1.49810717e-01 -9.49600220e-01 1.49901211e+00 - 9.28851247e-01 -1.24632943e+00 3.79637294e-02 1.44052237e-01 - -3.45653653e-01 7.71850467e-01 8.31733167e-01 -1.90485701e-01 - -2.80869246e-01 1.49895847e+00 9.32838917e-01 7.23603889e-02 - -1.62920904e+00 3.44000101e-01 6.98083460e-01 -2.75452161e+00 - 2.51869440e-01 -4.16691989e-01 -2.20854282e-01 -9.07380939e-01 - -1.48189291e-01 1.39199162e+00 -4.46398497e-01 9.65573549e-01 - -8.06331217e-01 7.58756220e-01 1.65585557e-03 3.52024823e-01 - 7.15325586e-03 6.41625226e-01 -2.56758523e+00 -1.41982532e+00 - 4.17080492e-01 -2.04353833e+00 -5.30580342e-01 -5.43408394e-02 - -8.28140140e-01 -6.91561401e-01 2.71582186e-01 1.00929499e+00 - -9.58449364e-01 -5.75058222e-01 -7.31260955e-01 -2.42137980e+00 - 1.41711786e-01 9.34717774e-01 1.01664317e+00 -8.70766401e-01 - -1.25169802e+00 2.16055475e-02 8.29273701e-01 1.54850453e-01 - 2.89304644e-01 1.14183092e+00 -7.39136577e-01 -2.19970092e-01 - 1.63460553e+00 -1.96811125e-01 -1.26733696e+00 -6.69356108e-01 - -9.38179851e-01 3.57162595e-01 -8.95946980e-01 2.71034241e-01 \ No newline at end of file diff --git a/image_generation/stable_diffusion_1_5/cpp/scripts/requirements.txt b/image_generation/stable_diffusion_1_5/cpp/scripts/requirements.txt deleted file mode 100644 index b880ee0ef8..0000000000 --- a/image_generation/stable_diffusion_1_5/cpp/scripts/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cpu -torch -diffusers -optimum-intel[nncf,openvino] -huggingface_hub[cli] diff --git a/image_generation/stable_diffusion_1_5/cpp/set_up_and_run.sh b/image_generation/stable_diffusion_1_5/cpp/set_up_and_run.sh deleted file mode 100755 index 0f59af083a..0000000000 --- a/image_generation/stable_diffusion_1_5/cpp/set_up_and_run.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# Copyright (C) 2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -e # Exit immediately if a command exits with a non-zero status - -abs_path() { - script_path=$(eval echo "${BASH_SOURCE[0]}") - directory=$(dirname "$script_path") - builtin cd "$directory" || exit - pwd -P -} -cd "`abs_path`" - -# initialize OpenVINO -rm -fr ./openvino && mkdir ./openvino -curl 
https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2023.3.0-13739-294cc6668c4/l_openvino_toolkit_ubuntu20_2023.3.0.dev20231219_x86_64.tgz | tar --directory ./openvino/ --strip-components 1 -xz -sudo -E ./openvino/install_dependencies/install_openvino_dependencies.sh -source ./openvino/setupvars.sh - -# download extra dependencies -sudo -E apt install libeigen3-dev -y - -# download / convert models -cd scripts -python -m pip install -U pip -python -m pip install -r ./requirements.txt -python -m pip install ../../../../thirdparty/openvino_contrib/modules/custom_operations/ -python convert_model.py -sd runwayml/stable-diffusion-v1-5 -b 1 -t FP16 -dyn True -cd .. - -# build app -cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ -cmake --build ./build/ --config Release --parallel - -# run app -cd build -./stable_diffusion -m ../scripts/runwayml/stable-diffusion-v1-5 -t FP16_dyn diff --git a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp b/image_generation/stable_diffusion_1_5/cpp/src/main.cpp deleted file mode 100644 index 0d572f1750..0000000000 --- a/image_generation/stable_diffusion_1_5/cpp/src/main.cpp +++ /dev/null @@ -1,309 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include <algorithm> -#include <iostream> -#include <string> -#include <random> -#include <fstream> -#include <filesystem> - -#include "openvino/runtime/core.hpp" -#include "openvino/pass/manager.hpp" -#include "openvino/core/preprocess/pre_post_process.hpp" - -#include "cxxopts.hpp" -#include "scheduler_lms_discrete.hpp" -#include "lora.hpp" -#include "imwrite.hpp" - -class Timer { - const decltype(std::chrono::steady_clock::now()) m_start; -public: - Timer(const std::string& scope) : - m_start(std::chrono::steady_clock::now()) { - (std::cout << scope << ": ").flush(); - } - - ~Timer() { - auto m_end = std::chrono::steady_clock::now(); - std::cout << std::chrono::duration<double, std::milli>(m_end - m_start).count() << " ms" << std::endl; - } -}; - -ov::Tensor randn_tensor(uint32_t height, uint32_t width, bool use_np_latents, uint32_t seed = 42) { - ov::Tensor noise(ov::element::f32, {1, 4, height / 8, width / 8}); - if (use_np_latents) { - // read np generated latents with defaut seed 42 - const char * latent_file_name = "../scripts/np_latents_512x512.txt"; - std::ifstream latent_copy_file(latent_file_name, std::ios::ate); - OPENVINO_ASSERT(latent_copy_file.is_open(), "Cannot open ", latent_file_name); - - size_t file_size = latent_copy_file.tellg() / sizeof(float); - OPENVINO_ASSERT(file_size >= noise.get_size(), "Cannot generate ", noise.get_shape(), " with ", latent_file_name, ". 
File size is small"); - - latent_copy_file.seekg(0, std::ios::beg); - for (size_t i = 0; i < noise.get_size(); ++i) - latent_copy_file >> noise.data<float>()[i]; - } else { - std::mt19937 gen{seed}; - std::normal_distribution<float> normal{0.0f, 1.0f}; - std::generate_n(noise.data<float>(), noise.get_size(), [&]() { - return normal(gen); - }); - } - return noise; -} - -struct StableDiffusionModels { - ov::CompiledModel text_encoder; - ov::CompiledModel unet; - ov::CompiledModel vae_decoder; - ov::CompiledModel tokenizer; -}; - -void apply_lora(std::shared_ptr<ov::Model> model, InsertLoRA::LoRAMap& lora_map) { - if (!lora_map.empty()) { - ov::pass::Manager manager; - manager.register_pass<InsertLoRA>(lora_map); - manager.run_passes(model); - } -} - -StableDiffusionModels compile_models(const std::string& model_path, const std::string& device, - const std::string& lora_path, const float alpha, const bool use_cache) { - StableDiffusionModels models; - - ov::Core core; - if (use_cache) - core.set_property(ov::cache_dir("./cache_dir")); - core.add_extension(TOKENIZERS_LIBRARY_PATH); - - // read LoRA weights - std::map<std::string, InsertLoRA::LoRAMap> lora_weights; - if (!lora_path.empty()) { - Timer t("Loading and multiplying LoRA weights"); - lora_weights = read_lora_adapters(lora_path, alpha); - } - - // Text encoder - { - Timer t("Loading and compiling text encoder"); - auto text_encoder_model = core.read_model(model_path + "/text_encoder/openvino_model.xml"); - apply_lora(text_encoder_model, lora_weights["text_encoder"]); - models.text_encoder = core.compile_model(text_encoder_model, device); - } - - // UNet - { - Timer t("Loading and compiling UNet"); - auto unet_model = core.read_model(model_path + "/unet/openvino_model.xml"); - apply_lora(unet_model, lora_weights["unet"]); - models.unet = core.compile_model(unet_model, device); - } - - // VAE decoder - { - Timer t("Loading and compiling VAE decoder"); - auto vae_decoder_model = core.read_model(model_path + "/vae_decoder/openvino_model.xml"); - ov::preprocess::PrePostProcessor ppp(vae_decoder_model); - ppp.output().model().set_layout("NCHW"); - ppp.output().tensor().set_layout("NHWC"); - models.vae_decoder = core.compile_model(vae_decoder_model = ppp.build(), device); - } - - // Tokenizer - { - Timer t("Loading and compiling tokenizer"); - models.tokenizer = core.compile_model(model_path + "/tokenizer/openvino_tokenizer.xml", device); - } - - return models; -} - -ov::Tensor text_encoder(StableDiffusionModels models, std::string& pos_prompt, std::string& neg_prompt) { - const size_t MAX_LENGTH = 77; // 'model_max_length' from 'tokenizer_config.json' - const size_t HIDDEN_SIZE = static_cast<size_t>(models.text_encoder.output(0).get_partial_shape()[2].get_length()); - const int32_t EOS_TOKEN_ID = 49407, PAD_TOKEN_ID = EOS_TOKEN_ID; - const ov::Shape input_ids_shape({1, MAX_LENGTH}); - - ov::InferRequest tokenizer_req = models.tokenizer.create_infer_request(); - ov::InferRequest text_encoder_req = models.text_encoder.create_infer_request(); - - auto compute_text_embeddings = [&] (std::string& prompt, ov::Tensor encoder_output_tensor) { - ov::Tensor input_ids(ov::element::i32, input_ids_shape); - std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), PAD_TOKEN_ID); - - // tokenization - tokenizer_req.set_input_tensor(ov::Tensor{ov::element::string, {1}, &prompt}); - tokenizer_req.infer(); - ov::Tensor input_ids_token = tokenizer_req.get_tensor("input_ids"); - std::copy_n(input_ids_token.data<std::int32_t>(), input_ids_token.get_size(), 
input_ids.data<int32_t>()); - - // text embeddings - text_encoder_req.set_tensor("input_ids", input_ids); - text_encoder_req.set_output_tensor(0, encoder_output_tensor); - text_encoder_req.infer(); - }; - - ov::Tensor text_embeddings(ov::element::f32, {2, MAX_LENGTH, HIDDEN_SIZE}); - - compute_text_embeddings(neg_prompt, ov::Tensor(text_embeddings, {0, 0, 0}, {1, MAX_LENGTH, HIDDEN_SIZE})); - compute_text_embeddings(pos_prompt, ov::Tensor(text_embeddings, {1, 0, 0}, {2, MAX_LENGTH, HIDDEN_SIZE})); - - return text_embeddings; -} - -ov::Tensor unet(ov::InferRequest req, ov::Tensor sample, ov::Tensor timestep, ov::Tensor text_embedding_1d) { - req.set_tensor("sample", sample); - req.set_tensor("timestep", timestep); - req.set_tensor("encoder_hidden_states", text_embedding_1d); - - req.infer(); - - ov::Tensor noise_pred_tensor = req.get_output_tensor(); - ov::Shape noise_pred_shape = noise_pred_tensor.get_shape(); - noise_pred_shape[0] = 1; - - // perform guidance - const float guidance_scale = 7.5f; - const float* noise_pred_uncond = noise_pred_tensor.data<const float>(); - const float* noise_pred_text = noise_pred_uncond + ov::shape_size(noise_pred_shape); - - ov::Tensor noisy_residual(noise_pred_tensor.get_element_type(), noise_pred_shape); - for (size_t i = 0; i < ov::shape_size(noise_pred_shape); ++i) - noisy_residual.data<float>()[i] = noise_pred_uncond[i] + guidance_scale * (noise_pred_text[i] - noise_pred_uncond[i]); - - return noisy_residual; -} - -ov::Tensor vae_decoder(ov::CompiledModel& decoder_compiled_model, ov::Tensor sample) { - const float coeffs_const{1 / 0.18215}; - for (size_t i = 0; i < sample.get_size(); ++i) - sample.data<float>()[i] *= coeffs_const; - - ov::InferRequest req = decoder_compiled_model.create_infer_request(); - req.set_input_tensor(sample); - req.infer(); - - return req.get_output_tensor(); -} - -ov::Tensor postprocess_image(ov::Tensor decoded_image) { - ov::Tensor generated_image(ov::element::u8, decoded_image.get_shape()); - - // convert to u8 image - const float* decoded_data = decoded_image.data<const float>(); - std::uint8_t* generated_data = generated_image.data<std::uint8_t>(); - for (size_t i = 0; i < decoded_image.get_size(); ++i) { - generated_data[i] = static_cast<std::uint8_t>(std::clamp(decoded_data[i] * 0.5f + 0.5f, 0.0f, 1.0f) * 255); - } - - return generated_image; -} - -int32_t main(int32_t argc, char* argv[]) { - cxxopts::Options options("stable_diffusion", "Stable Diffusion implementation in C++ using OpenVINO\n"); - - options.add_options() - ("p,posPrompt", "Initial positive prompt for SD ", cxxopts::value<std::string>()->default_value("cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting")) - ("n,negPrompt","Defaut is empty with space", cxxopts::value<std::string>()->default_value(" ")) - ("d,device", "AUTO, CPU, or GPU", cxxopts::value<std::string>()->default_value("CPU")) - ("step", "Number of diffusion steps", cxxopts::value<size_t>()->default_value("20")) - ("s,seed", "Number of random seed to generate latent for one image output", cxxopts::value<size_t>()->default_value("42")) - ("num", "Number of image output", cxxopts::value<size_t>()->default_value("1")) - ("height", "destination image height", cxxopts::value<size_t>()->default_value("512")) - ("width", "destination image width", cxxopts::value<size_t>()->default_value("512")) - ("c,useCache", "use model caching", cxxopts::value<bool>()->default_value("false")) - ("r,readNPLatent", "read numpy generated latents from file", 
cxxopts::value<bool>()->default_value("false")) - ("m,modelPath", "Specify path of SD model IRs", cxxopts::value<std::string>()->default_value("../models/dreamlike-anime-1.0")) - ("t,type", "Specify the type of SD model IRs (e.g., FP16_static or FP16_dyn)", cxxopts::value<std::string>()->default_value("FP16_static")) - ("l,loraPath", "Specify path of LoRA file. (*.safetensors).", cxxopts::value<std::string>()->default_value("")) - ("a,alpha", "alpha for LoRA", cxxopts::value<float>()->default_value("0.75"))("h,help", "Print usage"); - cxxopts::ParseResult result; - - try { - result = options.parse(argc, argv); - } catch (const cxxopts::exceptions::exception& e) { - std::cout << e.what() << "\n\n"; - std::cout << options.help() << std::endl; - return EXIT_FAILURE; - } - - if (result.count("help")) { - std::cout << options.help() << std::endl; - return EXIT_SUCCESS; - } - - std::string positive_prompt = result["posPrompt"].as<std::string>(); - std::string negative_prompt = result["negPrompt"].as<std::string>(); - const std::string device = result["device"].as<std::string>(); - const uint32_t num_inference_steps = result["step"].as<size_t>(); - const uint32_t user_seed = result["seed"].as<size_t>(); - const uint32_t num_images = result["num"].as<size_t>(); - const uint32_t height = result["height"].as<size_t>(); - const uint32_t width = result["width"].as<size_t>(); - const bool use_cache = result["useCache"].as<bool>(); - const bool read_np_latent = result["readNPLatent"].as<bool>(); - const std::string model_base_path = result["modelPath"].as<std::string>(); - const std::string model_type = result["type"].as<std::string>(); - const std::string lora_path = result["loraPath"].as<std::string>(); - const float alpha = result["alpha"].as<float>(); - - const std::string folder_name = "images"; - try { - std::filesystem::create_directory(folder_name); - } catch (const std::exception& e) { - std::cerr << "Failed to create dir" << e.what() << std::endl; - } - - std::cout << "OpenVINO version: " << ov::get_openvino_version() << std::endl; - - // Stable Diffusion pipeline - - StableDiffusionModels models = compile_models(model_base_path + "/" + model_type, device, lora_path, alpha, use_cache); - ov::InferRequest unet_infer_request = models.unet.create_infer_request(); - - ov::PartialShape sample_shape = models.unet.input("sample").get_partial_shape(); - OPENVINO_ASSERT(sample_shape.is_dynamic() || (sample_shape[2] * 8 == width && sample_shape[3] * 8 == height), - "UNet model has static shapes [1, 4, H/8, W/8] or dynamic shapes [?, 4, ?, ?]"); - - Timer t("Running Stable Diffusion pipeline"); - - ov::Tensor text_embeddings = text_encoder(models, positive_prompt, negative_prompt); - - std::shared_ptr<Scheduler> scheduler = std::make_shared<LMSDiscreteScheduler>(); - scheduler->set_timesteps(num_inference_steps); - std::vector<std::int64_t> timesteps = scheduler->get_timesteps(); - - for (uint32_t n = 0; n < num_images; n++) { - std::uint32_t seed = num_images == 1 ? 
user_seed: n; - ov::Tensor noise = randn_tensor(height, width, read_np_latent, seed); - - // latents are multiplied by 'init_noise_sigma' - ov::Shape latent_shape = noise.get_shape(), latent_model_input_shape = latent_shape; - latent_model_input_shape[0] = 2; // Unet accepts batch 2 - ov::Tensor latent(ov::element::f32, latent_shape), latent_model_input(ov::element::f32, latent_model_input_shape); - for (size_t i = 0; i < noise.get_size(); ++i) { - latent.data<float>()[i] = noise.data<float>()[i] * scheduler->get_init_noise_sigma(); - } - - for (size_t inference_step = 0; inference_step < num_inference_steps; inference_step++) { - // concat the same latent twice along a batch dimension - latent.copy_to(ov::Tensor(latent_model_input, {0, 0, 0, 0}, {1, latent_shape[1], latent_shape[2], latent_shape[3]})); - latent.copy_to(ov::Tensor(latent_model_input, {1, 0, 0, 0}, {2, latent_shape[1], latent_shape[2], latent_shape[3]})); - - scheduler->scale_model_input(latent_model_input, inference_step); - - ov::Tensor timestep(ov::element::i64, {1}, ×teps[inference_step]); - ov::Tensor noisy_residual = unet(unet_infer_request, latent_model_input, timestep, text_embeddings); - - latent = scheduler->step(noisy_residual, latent, inference_step); - } - - ov::Tensor decoded_image = vae_decoder(models.vae_decoder, latent); - imwrite(std::string("./images/seed_") + std::to_string(seed) + ".bmp", postprocess_image(decoded_image), true); - } - - return EXIT_SUCCESS; -} diff --git a/llm_bench/python/README.md b/llm_bench/python/README.md index b7f4c2d0f2..6d2d373654 100755 --- a/llm_bench/python/README.md +++ b/llm_bench/python/README.md @@ -1,123 +1,173 @@ -# Benchmarking script for large language models +# Benchmarking Script for Large Language Models -This script provides a unified approach to estimate performance for Large Language Models. -It is based on pipelines provided by Optimum-Intel and allows to estimate performance for -pytorch and openvino models, using almost the same code and precollected models. +This script provides a unified approach to estimate performance for Large Language Models (LLMs). It leverages pipelines provided by Optimum-Intel and allows performance estimation for PyTorch and OpenVINO models using nearly identical code and pre-collected models. -## Usage -### 1. Start a Python virtual environment +### 1. Prepare Python Virtual Environment for LLM Benchmarking ``` bash -python3 -m venv python-env -source python-env/bin/activate -pip install update --upgrade -pip install -r requirements.txt +python3 -m venv ov-llm-bench-env +source ov-llm-bench-env/bin/activate +pip install --upgrade pip + +git clone https://github.com/openvinotoolkit/openvino.genai.git +cd openvino.genai/llm_bench/python/ +pip install -r requirements.txt ``` -### 2. Convert a model to OpenVINO IR - -The conversion script for preparing benchmarking models, -`convert.py` allows to reproduce IRs stored on shared drive. 
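As a point of reference for the conversion step discussed in this section, the export to OpenVINO IR can also be driven programmatically through Optimum-Intel's Python API rather than the command line. The sketch below is illustrative only: it assumes the `optimum-intel[nncf,openvino]` dependency from the requirements files is installed, and the model id and output directory are the same placeholder values used elsewhere in this README.

```python
# Illustrative sketch only: export a Hugging Face checkpoint to OpenVINO IR
# through Optimum-Intel's Python API. Model id and output path are placeholders.
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-7b-chat-hf"
output_dir = "models/llama-2-7b-chat"

# export=True converts the original PyTorch weights to OpenVINO IR on the fly.
ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Save openvino_model.xml / openvino_model.bin plus tokenizer files for benchmarking.
ov_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
```

The directory produced this way has a comparable layout to the `optimum-cli export openvino` output shown further down in this section, although the CLI flow additionally exports the OpenVINO tokenizer and detokenizer models.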
-Prerequisites: -install conversion dependencies using `requirements.txt` +> Note: +> For existing Python environments, run the following command to ensure that all dependencies are installed with the latest versions: +> `pip install -U --upgrade-strategy eager -r requirements.txt` -Usage: +#### (Optional) Hugging Face Login : -```bash -python convert.py --model_id <model_id_or_path> --output_dir <out_dir> -``` +Login to Hugging Face if you want to use non-public models: -Paramters: -* `--model_id` - model_id for downloading from huggngface_hub (https://huggingface.co/models) or path with directory where pytorch model located. -* `--output_dir` - output directory for saving OpenVINO model -* `--precision` - (optional, default FP32), precision for model conversion FP32 or FP16 -* `--save_orig` - flag for saving original pytorch model, model will be located in `<output_dir>/pytorch` subdirectory. -* `--compress_weights` - The weight compression option, INT8 - INT8 weights, 4BIT_DEFAULT - for 4-bit weights compression with predefined configuration, INT4_SYM - for INT4 compressed weights with symmetric quantization, INT4_ASYM - for INT4 compressed weights with assymetric quantization. You can specify multiple backends separated by a space. -* `--compress_weights_backends` - (optional, default openvino) backends for weights compression, this option has an effect only with `--compress_weights`. You can specify multiple backends separated by a space. -* `--ratio` - Compression ratio between primary and backup precision, e.g. INT4/INT8. -* `--group_size` - Size of the group of weights that share the same quantization parameters - -Usage example: ```bash -python convert.py --model_id meta-llama/Llama-2-7b-chat-hf --output_dir models/llama-2-7b-chat +huggingface-cli login ``` -the result of running the command will have the following file structure: +### 2. Convert Model to OpenVINO IR Format + +The `optimum-cli` tool simplifies converting Hugging Face models to OpenVINO IR format. +- Detailed documentation can be found in the [Optimum-Intel documentation](https://huggingface.co/docs/optimum/main/en/intel/openvino/export). +- To learn more about weight compression, see the [NNCF Weight Compression Guide](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/weight-compression.html). +- For additional guidance on running inference with OpenVINO for LLMs, see the [OpenVINO LLM Inference Guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide.html). - |-llama-2-7b-chat - |-pytorch - |-dldt - |-FP32 - |-openvino_model.xml - |-openvino_model.bin - |-config.json - |-added_tokens.json - |-tokenizer_config.json - |-tokenizer.json - |-tokenizer.model - |-special_tokens_map.json +**Usage:** -### 3. Bechmarking +```bash +optimum-cli export openvino --model <MODEL_ID> --weight-format <PRECISION> <OUTPUT_DIR> -Prerequisites: -install benchmarking dependencies using `requirements.txt` +optimum-cli export openvino -h # For detailed information +``` -``` bash -pip install -r requirements.txt +* `--model <MODEL_ID>` : model_id for downloading from [huggngface_hub](https://huggingface.co/models) or path with directory where pytorch model located. +* `--weight-format <PRECISION>` : precision for model conversion. Available options: `fp32, fp16, int8, int4, mxfp4` +* `<OUTPUT_DIR>`: output directory for saving generated OpenVINO model. + +**NOTE:** +- Models larger than 1 billion parameters are exported to the OpenVINO format with 8-bit weights by default. 
You can disable it with `--weight-format fp32`. + +**Example:** +```bash +optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --weight-format fp16 models/llama-2-7b-chat ``` -note: **You can specify the installed openvino version through pip install** -``` bash -# e.g. -pip install openvino==2023.2.0 +**Resulting file structure:** + +```console + models + └── llama-2-7b-chat + ├── config.json + ├── generation_config.json + ├── openvino_detokenizer.bin + ├── openvino_detokenizer.xml + ├── openvino_model.bin + ├── openvino_model.xml + ├── openvino_tokenizer.bin + ├── openvino_tokenizer.xml + ├── special_tokens_map.json + ├── tokenizer_config.json + ├── tokenizer.json + └── tokenizer.model ``` -### 4. Run the following command to test the performance of one LLM model +### 3. Benchmark LLM Model + +To benchmark the performance of the LLM, use the following command: + ``` bash python benchmark.py -m <model> -d <device> -r <report_csv> -f <framework> -p <prompt text> -n <num_iters> # e.g. -python benchmark.py -m models/llama-2-7b-chat/pytorch/dldt/FP32 -n 2 -python benchmark.py -m models/llama-2-7b-chat/pytorch/dldt/FP32 -p "What is openvino?" -n 2 -python benchmark.py -m models/llama-2-7b-chat/pytorch/dldt/FP32 -pf prompts/llama-2-7b-chat_l.jsonl -n 2 +python benchmark.py -m models/llama-2-7b-chat/ -n 2 +python benchmark.py -m models/llama-2-7b-chat/ -p "What is openvino?" -n 2 +python benchmark.py -m models/llama-2-7b-chat/ -pf prompts/llama-2-7b-chat_l.jsonl -n 2 ``` -Parameters: -* `-m` - model path -* `-d` - inference device (default=cpu) -* `-r` - report csv -* `-f` - framework (default=ov) -* `-p` - interactive prompt text -* `-pf` - path of JSONL file including interactive prompts -* `-n` - number of benchmarking iterations, if the value greater 0, will exclude the first iteration. (default=0) +**Parameters:** +- `-m`: Path to the model. +- `-d`: Inference device (default: CPU). +- `-r`: Path to the CSV report. +- `-f`: Framework (default: ov). +- `-p`: Interactive prompt text. +- `-pf`: Path to a JSONL file containing prompts. +- `-n`: Number of iterations (default: 0, the first iteration is excluded). +- `-ic`: Limit the output token size (default: 512) for text generation and code generation models. + +**Additional options:** ``` bash python ./benchmark.py -h # for more information ``` -## Running `torch.compile()` +#### Benchmarking the Original PyTorch Model: +To benchmark the original PyTorch model, first download the model locally and then run benchmark by specifying PyTorch as the framework with parameter `-f pt` -The option `--torch_compile_backend` uses `torch.compile()` to speed up -the PyTorch code by compiling it into optimized kernels using a selected backend. +```bash +# Download PyTorch Model +huggingface-cli download meta-llama/Llama-2-7b-chat-hf --local-dir models/llama-2-7b-chat/pytorch +# Benchmark with PyTorch Framework +python benchmark.py -m models/llama-2-7b-chat/pytorch -n 2 -f pt +``` -Prerequisites: install benchmarking dependencies using requirements.txt +> **Note:** If needed, You can install a specific OpenVINO version using pip: +> ``` bash +> # e.g. +> pip install openvino==2024.4.0 +> # Optional, install the openvino nightly package if needed. +> # OpenVINO nightly is pre-release software and has not undergone full release validation or qualification. 
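> # The two commands below replace the installed stable wheel with the nightly build.
> # After reinstalling, the active build can be confirmed with the same API benchmark.py uses:
> #   python -c "from openvino.runtime import get_version; print(get_version())"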
+> pip uninstall openvino +> pip install --upgrade --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly +> ``` -``` bash -pip install -r requirements/requirements.txt -``` +## 4. Benchmark LLM with `torch.compile()` + +The `--torch_compile_backend` option enables you to use `torch.compile()` to accelerate PyTorch models by compiling them into optimized kernels using a specified backend. -In order to run the `torch.compile()` on CUDA GPU, install additionally the nightly PyTorch version: +Before benchmarking, you need to download the original PyTorch model. Use the following command to download the model locally: ```bash -pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 +huggingface-cli download meta-llama/Llama-2-7b-chat-hf --local-dir models/llama-2-7b-chat/pytorch ``` -Add the option `--torch_compile_backend` with the desired backend: `pytorch` or `openvino` (default) while running the benchmarking script: +To run the benchmarking script with `torch.compile()`, use the `--torch_compile_backend` option to specify the backend. You can choose between `pytorch` or `openvino` (default). Example: ```bash python ./benchmark.py -m models/llama-2-7b-chat/pytorch -d CPU --torch_compile_backend openvino ``` -## Additional Resources -### 1. NOTE -> If you encounter any errors, please check **[NOTES.md](./doc/NOTES.md)** which provides solutions to the known errors. -### 2. Image generation -> If you need to set parameters for image generation, you need to prepare a prompt file, please check **[IMAGE_GEN.md](./doc/IMAGE_GEN.md)** \ No newline at end of file +> **Note:** To use `torch.compile()` with CUDA GPUs, you need to install the nightly version of PyTorch: +> +> ```bash +> pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 +> ``` + + +## 5. Running on 2-Socket Platforms + +The benchmarking script sets `openvino.properties.streams.num(1)` by default. For multi-socket platforms, use `numactl` on Linux or the `--load_config` option to modify behavior. + +| OpenVINO Version | Behaviors | +|:--------------------|:------------------------------------------------| +| Before 2024.0.0 | streams.num(1) <br>execute on 2 sockets. | +| 2024.0.0 | streams.num(1) <br>execute on the same socket as the APP is running on. | + +For example, `--load_config config.json` as following will result in streams.num(1) and execute on 2 sockets. +```json +{ + "INFERENCE_NUM_THREADS": <NUMBER> +} +``` +`<NUMBER>` is the number of total physical cores in 2 sockets. + +## 6. Execution on CPU device + +OpenVINO is by default bult with [oneTBB](https://github.com/oneapi-src/oneTBB/) threading library, while Torch uses [OpenMP](https://www.openmp.org/). Both threading libraries have ['busy-wait spin'](https://gcc.gnu.org/onlinedocs/libgomp/GOMP_005fSPINCOUNT.html) by default. When running LLM pipeline on CPU device, there is threading overhead in the switching between inference on CPU with OpenVINO (oneTBB) and postprocessing (For example: greedy search or beam search) with Torch (OpenMP). + +**Alternative solutions** +1. Use --genai option which uses OpenVINO genai API instead of optimum-intel API. In this case postprocessing is executed with OpenVINO genai API. +2. 
Without --genai option which uses optimum-intel API, set environment variable [OMP_WAIT_POLICY](https://gcc.gnu.org/onlinedocs/libgomp/OMP_005fWAIT_005fPOLICY.html) to PASSIVE which will disable OpenMP 'busy-wait', and benchmark.py will also limit the Torch thread number to avoid using CPU cores which is in 'busy-wait' by OpenVINO inference. + +## 7. Additional Resources + +- **Error Troubleshooting:** Check the [NOTES.md](./doc/NOTES.md) for solutions to known issues. +- **Image Generation Configuration:** Refer to [IMAGE_GEN.md](./doc/IMAGE_GEN.md) for setting parameters for image generation models. diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index b3f5eedabf..690ae4bc8a 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 import os import sys @@ -7,29 +7,25 @@ import time from pathlib import Path import logging as log -import utils.ov_utils -import utils.pt_utils -import utils.model_utils +import llm_bench_utils.ov_utils +import llm_bench_utils.pt_utils +import llm_bench_utils.model_utils import torch import numpy as np from openvino.runtime import get_version -from utils.config_class import DEFAULT_MODEL_CLASSES import PIL import hashlib -import utils.metrics_print -import utils.output_csv -import utils.hook_greedy_search -import utils.hook_beam_search +import llm_bench_utils.metrics_print +import llm_bench_utils.output_csv import traceback from transformers import set_seed from PIL import Image -from utils.memory_profile import MemConsumption -from utils.hook_forward import StableDiffusionHook -import utils.output_json +from llm_bench_utils.memory_profile import MemConsumption +from llm_bench_utils.hook_forward import StableDiffusionHook +import llm_bench_utils.output_json +import llm_bench_utils.output_file -HOOK_BEAM_SEARCH_UTILS = {'pt': utils.hook_beam_search, 'ov': utils.hook_beam_search} -HOOK_GREEDY_SEARCH_UTILS = {'pt': utils.hook_greedy_search, 'ov': utils.hook_greedy_search} -FW_UTILS = {'pt': utils.pt_utils, 'ov': utils.ov_utils} +FW_UTILS = {'pt': llm_bench_utils.pt_utils, 'ov': llm_bench_utils.ov_utils} DEFAULT_INFERENCE_STEPS = 20 LCM_DEFAULT_INFERENCE_STEPS = 4 @@ -39,7 +35,6 @@ DEFAULT_SUPER_RESOLUTION_WIDTH = 128 DEFAULT_SUPER_RESOLUTION_HEIGHT = 128 DEFAULT_OUTPUT_TOKEN_SIZE = 512 -MAX_OUTPUT_TOKEN_SIZE = 64 * 1024 mem_consumption = MemConsumption() stable_diffusion_hook = StableDiffusionHook() @@ -55,6 +50,7 @@ def gen_iterate_data( res_md5='', max_rss_mem='', max_shared_mem='', + max_uss_mem='', prompt_idx='', tokenization_time=[], ): @@ -72,16 +68,19 @@ def gen_iterate_data( iter_data['other_tokens_infer_avg_latency'] = '' iter_data['max_rss_mem_consumption'] = max_rss_mem iter_data['max_shared_mem_consumption'] = max_shared_mem + iter_data['max_uss_mem_consumption'] = max_uss_mem iter_data['prompt_idx'] = prompt_idx iter_data['tokenization_time'] = tokenization_time[0] if len(tokenization_time) > 0 else '' iter_data['detokenization_time'] = tokenization_time[1] if len(tokenization_time) > 1 else '' return iter_data -def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, prompt_index, bench_hook): +def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, bench_hook, model_precision, proc_id): set_seed(args['seed']) input_text_list = [input_text] * args['batch_size'] - 
log.info(f'input_text={input_text}') + if args["output_dir"] is not None and num == 0: + for bs_index, in_text in enumerate(input_text_list): + llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) tok_encode_start = time.perf_counter() input_data = tokenizer(input_text_list, return_tensors='pt') tok_encode_end = time.perf_counter() @@ -90,26 +89,44 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, # Remove `token_type_ids` from inputs input_tokens = input_data['input_ids'] if 'input_ids' in input_data else input_data input_token_size = input_tokens[0].numel() - - max_output_token_size = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] - max_output_token_size = MAX_OUTPUT_TOKEN_SIZE if max_output_token_size > MAX_OUTPUT_TOKEN_SIZE else max_output_token_size if args['batch_size'] > 1: - log.info(f"batch_size={args['batch_size']}") - log.info(f"All input token size after padding:{input_token_size} * {args['batch_size']}") - log.info(f"All max_output_token_size:{max_output_token_size} * {args['batch_size']}") - else: - log.info(f'Input token size:{input_token_size}, max_output_token_size:{max_output_token_size}') + out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) + out_str += " Batch_size={}, ".format(args['batch_size']) + out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size']) + if args['infer_count'] is not None: + out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size']) + log.info(out_str) max_rss_mem_consumption = '' + max_uss_mem_consumption = '' max_shared_mem_consumption = '' if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.start_collect_memory_consumption() + max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] start = time.perf_counter() - result = model.generate(**input_data, max_new_tokens=int(max_output_token_size), num_beams=args['num_beams'], use_cache=True) + if args['infer_count'] is not None and args['end_token_stopping'] is False: + model.generation_config.eos_token_id = None + model.config.eos_token_id = None + result = model.generate( + **input_data, + max_new_tokens=int(max_gen_tokens), + num_beams=args['num_beams'], + use_cache=True, + eos_token_id=None, + do_sample=False + ) + else: + result = model.generate( + **input_data, + max_new_tokens=int(max_gen_tokens), + num_beams=args['num_beams'], + use_cache=True, + do_sample=False + ) end = time.perf_counter() if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.end_collect_momory_consumption() - max_rss_mem_consumption, max_shared_mem_consumption = mem_consumption.get_max_memory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() mem_consumption.clear_max_memory_consumption() generation_time = end - start @@ -120,89 +137,355 @@ def run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, # Only text_gen need to minus length of input_data, because generated_text may include input_text num_tokens = 0 result_md5_list = [] - for i in range(args['batch_size']): - if 'sum' not in args['model_name'] and result[i][:input_token_size].equal(input_tokens[i]): - generated_text_len = len(result[i]) - input_tokens[i].numel() + for bs_idx in range(args['batch_size']): + if 'sum' 
not in args['model_name'] and result[bs_idx][:input_token_size].equal(input_tokens[bs_idx]): + generated_token_size = len(result[bs_idx]) - input_tokens[bs_idx].numel() else: - generated_text_len = len(result[i]) - num_tokens += generated_text_len - if generated_text_len > max_output_token_size: + generated_token_size = len(result[bs_idx]) + # Encoder-decoder models expect the `decoder_input_ids` to start with a special token + # When counting the output length, subtract 1. The last token does not participate in inference. + if model.config.is_encoder_decoder and result[bs_idx][0] == model.config.decoder_start_token_id: + generated_token_size = generated_token_size - 1 + num_tokens += generated_token_size + if generated_token_size > max_gen_tokens: log.error('Output token size is over max output token size!') - result_text = generated_text[i] - result_md5_list.append(hashlib.md5(result_text.encode()).hexdigest()) - per_token_time = generation_time * 1000 / num_tokens + result_text = generated_text[bs_idx] + if args["output_dir"] is not None: + llm_bench_utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id) + result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest()) + if len(md5_list[num]) == 0: + md5_list[num] = {prompt_index : result_md5_list} + else: + md5_list[num][prompt_index] = result_md5_list + per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) + tm_list = [] + tm_infer_list = [] + if bench_hook is not None: + tm_list = bench_hook.get_time_list() + log.debug('latency of all tokens:') + [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + tm_infer_list = bench_hook.get_time_infer_list() + log.debug('latency of all infers:') + [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_infer_list)] + if args['num_beams'] == 1 and generated_token_size != len(tm_infer_list): + log.warning(f'Output token size({generated_token_size}) is not equal to infer count({len(tm_infer_list)})') iter_data = gen_iterate_data( num, input_token_size * args['batch_size'], - max_output_token_size * args['batch_size'], + len(tm_infer_list), num_tokens, generation_time, per_token_time, result_md5_list, max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption, prompt_idx=prompt_index, tokenization_time=(tok_encode_time, tok_decode_time) ) iter_data_list.append(iter_data) - tm_list = bench_hook.get_time_list() - tm_infer_list = bench_hook.get_time_infer_list() - utils.metrics_print.print_metrics( + llm_bench_utils.metrics_print.print_metrics( num, iter_data, tm_list, tm_infer_list, - generated=generated_text[0], warm_up=(num == 0), max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, - tokenization_time=(tok_encode_time, tok_decode_time) + max_uss_mem=max_uss_mem_consumption, + tokenization_time=(tok_encode_time, tok_decode_time), + batch_size=args['batch_size'] ) - bench_hook.clear_time_list() - bench_hook.clear_time_infer_list() + if num > 0: + prev_md5 = md5_list[num - 1][prompt_index] + if result_md5_list != prev_md5: + log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " + f"is different from md5 of the {num - 1} iteration {prev_md5}") + llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + if not args.get("use_cb", False): + if num == 1: + # if the device is CPU, throw exception + if 
args['devices'].lower().startswith('cpu') is True: + assert (result_md5_list == prev_md5) + else: + # throw exception + assert (result_md5_list == prev_md5) + else: + llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + if bench_hook is not None: + bench_hook.clear_time_list() + bench_hook.clear_time_infer_list() -def run_text_generation_benchmark(model_path, framework, device, args, num_iters): - model, tokenizer, pretrain_time = FW_UTILS[framework].create_text_gen_model(model_path, device, **args) - # Override forward for statistic each forward time. - default_model_type = DEFAULT_MODEL_CLASSES[args['use_case']] - model_type = args.get('model_type', default_model_type) +def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, streamer, model_precision, proc_id): + set_seed(args['seed']) + input_text_list = [input_text] * args['batch_size'] + if args["output_dir"] is not None and num == 0: + for bs_index, in_text in enumerate(input_text_list): + llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) + pt_inputs = tokenizer(input_text_list, return_tensors="pt") + input_token_size = pt_inputs.input_ids.shape[1] + if args['batch_size'] > 1: + out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) + out_str += " Batch_size={}, ".format(args['batch_size']) + out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size']) + if args['infer_count'] is not None: + out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size']) + log.info(out_str) + + max_rss_mem_consumption = '' + max_uss_mem_consumption = '' + max_shared_mem_consumption = '' + if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + mem_consumption.start_collect_memory_consumption() + max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] + start = time.perf_counter() + generation_result = model.generate(input_text_list, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"]) + end = time.perf_counter() + generated_text = generation_result.texts + perf_metrics = generation_result.perf_metrics + + if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + mem_consumption.end_collect_momory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() + mem_consumption.clear_max_memory_consumption() - if args['num_beams'] > 1: - bench_hook = HOOK_BEAM_SEARCH_UTILS[framework].BeamSearchHook() + generation_time = end - start + generated_tokens = [tokenizer(text).input_ids for text in generated_text] + # Only text_gen need to minus length of input_data, because generated_text may include input_text + num_tokens = 0 + result_md5_list = [] + for bs_idx in range(args['batch_size']): + generated_text_len = len(generated_tokens[bs_idx]) + num_tokens += generated_text_len + if generated_text_len > max_gen_tokens: + log.error('Output token size is over max output token size!') + result_text = generated_text[bs_idx] + if args["output_dir"] is not None: + llm_bench_utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id) + result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest()) + if len(md5_list[num]) == 0: + md5_list[num] = {prompt_index : 
result_md5_list} + else: + md5_list[num][prompt_index] = result_md5_list + per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) + tm_list = np.array(perf_metrics.raw_metrics.m_durations) / 1000 / 1000 + log.debug('latency of all tokens:') + [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + tokenization_time = ( + np.mean(perf_metrics.raw_metrics.tokenization_durations) / 1000, + np.mean(perf_metrics.raw_metrics.detokenization_durations) / 1000 + ) + iter_data = gen_iterate_data( + num, + input_token_size * args['batch_size'], + len(tm_list), + num_tokens, + generation_time, + per_token_time, + result_md5_list, + max_rss_mem=max_rss_mem_consumption, + max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption, + prompt_idx=prompt_index, + tokenization_time=tokenization_time + ) + iter_data_list.append(iter_data) + llm_bench_utils.metrics_print.print_metrics( + num, + iter_data, + tm_list.tolist(), + [], + warm_up=(num == 0), + max_rss_mem=max_rss_mem_consumption, + max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption, + tokenization_time=tokenization_time, + batch_size=args['batch_size'] + ) + if num > 0: + prev_md5 = md5_list[num - 1][prompt_index] + if result_md5_list != prev_md5: + log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " + f"is different from md5 of the {num - 1} iteration {prev_md5}") + llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + if not args.get("use_cb", False): + if num == 1: + # if the device is CPU, throw exception + if args['devices'].lower().startswith('cpu') is True: + assert (result_md5_list == prev_md5) + else: + # throw exception + assert (result_md5_list == prev_md5) + else: + llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + + +def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_index, streamer, model_precision, proc_id): + set_seed(args['seed']) + input_text_list = [input_text] * args['batch_size'] + if args["output_dir"] is not None and num == 0: + for bs_index, in_text in enumerate(input_text_list): + llm_bench_utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) + pt_inputs = tokenizer(input_text_list, return_tensors="pt") + input_token_size = pt_inputs.input_ids.shape[1] + pipe_tokenizer = model.get_tokenizer() + tok_encode_start = time.perf_counter() + input_data = pipe_tokenizer.encode(input_text_list) + tok_encode_end = time.perf_counter() + tok_encode_time = (tok_encode_end - tok_encode_start) * 1000 + if args['batch_size'] > 1: + out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) + out_str += " Batch_size={}, ".format(args['batch_size']) + out_str += 'all input token size after padding: {} * {}, '.format(input_token_size, args['batch_size']) + if args['infer_count'] is not None: + out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size']) + log.info(out_str) + max_rss_mem_consumption = '' + max_uss_mem_consumption = '' + max_shared_mem_consumption = '' + if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + mem_consumption.start_collect_memory_consumption() + max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] + streamer.reset() + start = time.perf_counter() + generated_tokens = 
model.generate(input_data, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer).tokens + end = time.perf_counter() + if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: + mem_consumption.end_collect_momory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() + mem_consumption.clear_max_memory_consumption() + generation_time = end - start + tok_decode_start = time.perf_counter() + generated_text = pipe_tokenizer.decode(generated_tokens) + tok_decode_end = time.perf_counter() + tok_decode_time = (tok_decode_end - tok_decode_start) * 1000 + # Only text_gen need to minus length of input_data, because generated_text may include input_text + num_tokens = 0 + result_md5_list = [] + for bs_idx in range(args['batch_size']): + generated_text_len = len(generated_tokens[bs_idx]) + num_tokens += generated_text_len + if generated_text_len > max_gen_tokens: + log.error('Output token size is over max output token size!') + result_text = generated_text[bs_idx] + if args["output_dir"] is not None: + llm_bench_utils.output_file.output_gen_text(result_text, args, model_precision, prompt_index, num, bs_idx, proc_id) + result_md5_list.append(hashlib.new("md5", result_text.encode(), usedforsecurity=False).hexdigest()) + if len(md5_list[num]) == 0: + md5_list[num] = {prompt_index : result_md5_list} + else: + md5_list[num][prompt_index] = result_md5_list + per_token_time = generation_time * 1000 / (num_tokens / args['batch_size']) + tm_list = streamer.get_time_list() + log.debug('latency of all tokens:') + [log.debug('[{}]{:.4f}'.format(idx, tm)) for idx, tm in enumerate(tm_list)] + iter_data = gen_iterate_data( + num, + input_token_size * args['batch_size'], + len(tm_list), + num_tokens, + generation_time, + per_token_time, + result_md5_list, + max_rss_mem=max_rss_mem_consumption, + max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption, + prompt_idx=prompt_index, + tokenization_time=(tok_encode_time, tok_decode_time) + ) + iter_data_list.append(iter_data) + llm_bench_utils.metrics_print.print_metrics( + num, + iter_data, + tm_list, + [], + warm_up=(num == 0), + max_rss_mem=max_rss_mem_consumption, + max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption, + tokenization_time=(tok_encode_time, tok_decode_time), + batch_size=args['batch_size'] + ) + if num > 0: + prev_md5 = md5_list[num - 1][prompt_index] + if result_md5_list != prev_md5: + log.warning(f"[{num}] Prompt[{prompt_index}]'s md5 {result_md5_list} " + f"is different from md5 of the {num - 1} iteration {prev_md5}") + llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + if not args.get("use_cb", False): + if num == 1: + # if the device is CPU, throw exception + if args['devices'].lower().startswith('cpu') is True: + assert (result_md5_list == prev_md5) + else: + # throw exception + assert (result_md5_list == prev_md5) else: - bench_hook = HOOK_GREEDY_SEARCH_UTILS[framework].GreedySearchHook() - bench_hook.new_forward(model, model_type) + llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=generated_text[0]) + streamer.reset() + +def run_text_generation_benchmark(model_path, framework, device, args, num_iters): + model, tokenizer, pretrain_time, bench_hook, use_genai = FW_UTILS[framework].create_text_gen_model(model_path, device, **args) + model_precision = 
llm_bench_utils.model_utils.get_model_precision(model_path.parts) iter_data_list = [] - input_text_list = utils.model_utils.get_prompts(args) + md5_list = {num : {} for num in range(num_iters + 1)} + input_text_list = llm_bench_utils.model_utils.get_prompts(args) + if args['prompt_index'] is None: + prompt_idx_list = [prompt_idx for prompt_idx, input_text in enumerate(input_text_list)] + text_list = input_text_list + else: + prompt_idx_list = [] + text_list = [] + for i in args['prompt_index']: + if 0 <= i < len(input_text_list): + text_list.append(input_text_list[i]) + prompt_idx_list.append(i) if len(input_text_list) == 0: raise RuntimeError('==Failure prompts is empty ==') + log.info(f'Benchmarking iter nums(exclude warm-up): {num_iters}, prompt nums: {len(text_list)}, ' + f"prompt idx: {prompt_idx_list}, num_beams: {args['num_beams']}") - log.info(f'num_iters={num_iters}, num_text_list={len(input_text_list)}') # if num_iters == 0, just output warm-up data - for num in range(num_iters + 1): - prompt_idx = 0 - for input_text in input_text_list: - run_text_generation(input_text, num, model, tokenizer, args, iter_data_list, prompt_idx, bench_hook) - prompt_idx = prompt_idx + 1 - - utils.metrics_print.print_average(iter_data_list) + if not use_genai: + text_gen_fn = run_text_generation + elif bench_hook is not None: + text_gen_fn = run_text_generation_genai_with_stream + else: + text_gen_fn = run_text_generation_genai + proc_id = os.getpid() + if args['subsequent'] is False: + for num in range(num_iters + 1): + for idx, input_text in enumerate(text_list): + if num == 0: + log.info(f'[warm-up] Input text: {input_text}') + text_gen_fn(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_idx_list[idx], bench_hook, model_precision, proc_id) + else: + for idx, input_text in enumerate(text_list): + for num in range(num_iters + 1): + if num == 0: + log.info(f'[warm-up] Input text: {input_text}') + text_gen_fn(input_text, num, model, tokenizer, args, iter_data_list, md5_list, prompt_idx_list[idx], bench_hook, model_precision, proc_id) + llm_bench_utils.metrics_print.print_average(iter_data_list, prompt_idx_list, args['batch_size'], True) return iter_data_list, pretrain_time -def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list): +def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list, proc_id): set_seed(args['seed']) input_text = image_param['prompt'] image_width = image_param.get('width', DEFAULT_IMAGE_WIDTH) image_height = image_param.get('height', DEFAULT_IMAGE_HEIGHT) nsteps = image_param.get('steps', DEFAULT_INFERENCE_STEPS if 'lcm' not in args["model_name"] else LCM_DEFAULT_INFERENCE_STEPS) - nsteps = 1 if num == 0 else nsteps guidance_scale = image_param.get('guidance_scale', None) - log.info(f'batch_size={args["batch_size"]}, steps={nsteps}, width={image_width}, height={image_height}, guidance_scale={guidance_scale}') + log.info( + f"[{'warm-up' if num == 0 else num}] Input params: Batch_size={args['batch_size']}, " + f'steps={nsteps}, width={image_width}, height={image_height}, guidance_scale={guidance_scale}' + ) result_md5_list = [] max_rss_mem_consumption = '' + max_uss_mem_consumption = '' max_shared_mem_consumption = '' if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.start_collect_memory_consumption() @@ -214,20 +497,20 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list) additional_args["guidance_scale"] = 1.0 if 'turbo' in 
args['model_name']: additional_args["guidance_scale"] = 0.0 + input_text_list = [input_text] * args['batch_size'] + if num == 0 and args["output_dir"] is not None: + for bs_idx, in_text in enumerate(input_text_list): + llm_bench_utils.output_file.output_image_input_text(in_text, args, image_id, bs_idx, proc_id) start = time.perf_counter() - res = pipe([input_text] * args['batch_size'], num_inference_steps=nsteps, height=image_height, width=image_width, **additional_args).images + res = pipe(input_text_list, num_inference_steps=nsteps, height=image_height, width=image_width, **additional_args).images end = time.perf_counter() if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.end_collect_momory_consumption() - max_rss_mem_consumption, max_shared_mem_consumption = mem_consumption.get_max_memory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() mem_consumption.clear_max_memory_consumption() - for i in range(args['batch_size']): - if num == 0: - rslt_img_fn = args['model_name'] + '_img' + str(image_id) + '_bs' + str(args['batch_size']) + '-' + str(i + 1) + '_img_warm-up.png' - else: - rslt_img_fn = args['model_name'] + '_iter' + str(num) + '_img' + str(image_id) + '_bs' + str(args['batch_size']) + '-' + str(i + 1) + '.png' - res[i].save(rslt_img_fn) - result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes()).hexdigest()) + for bs_idx in range(args['batch_size']): + rslt_img_fn = llm_bench_utils.output_file.output_gen_image(res[bs_idx], args, image_id, num, bs_idx, proc_id, '.png') + result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes(), usedforsecurity=False).hexdigest()) generation_time = end - start iter_data = gen_iterate_data( iter_idx=num, @@ -236,48 +519,66 @@ def run_image_generation(image_param, num, image_id, pipe, args, iter_data_list) res_md5=result_md5_list, max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption, prompt_idx=image_id, ) iter_data_list.append(iter_data) - utils.metrics_print.print_metrics( + llm_bench_utils.metrics_print.print_metrics( num, iter_data, - generated=rslt_img_fn, warm_up=(num == 0), max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption, stable_diffusion=stable_diffusion_hook ) + llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=rslt_img_fn) stable_diffusion_hook.clear_statistics() def run_image_generation_benchmark(model_path, framework, device, args, num_iters): + if args['genai']: + log.warning("GenAI pipeline is not supported for this task. 
Switched on default benchmarking") pipe, pretrain_time = FW_UTILS[framework].create_image_gen_model(model_path, device, **args) iter_data_list = [] - input_image_list = utils.model_utils.get_image_param_from_prompt_file(args) - if len(input_image_list) == 0: - raise RuntimeError('==Failure prompts is empty ==') - + input_image_list = llm_bench_utils.model_utils.get_image_param_from_prompt_file(args) if framework == "ov": stable_diffusion_hook.new_text_encoder(pipe) stable_diffusion_hook.new_unet(pipe) stable_diffusion_hook.new_vae_decoder(pipe) - log.info(f"num_iters={num_iters}, num_text_list={len(input_image_list)}") + if args['prompt_index'] is None: + prompt_idx_list = [image_id for image_id, input_text in enumerate(input_image_list)] + image_list = input_image_list + else: + prompt_idx_list = [] + image_list = [] + for i in args['prompt_index']: + if 0 <= i < len(input_image_list): + image_list.append(input_image_list[i]) + prompt_idx_list.append(i) + if len(image_list) == 0: + raise RuntimeError('==Failure prompts is empty ==') + log.info(f'Benchmarking iter nums(exclude warm-up): {num_iters}, prompt nums: {len(image_list)}, prompt idx: {prompt_idx_list}') # if num_iters == 0, just output warm-up data - for num in range(num_iters + 1): - image_id = 0 - for image_param in input_image_list: - run_image_generation(image_param, num, image_id, pipe, args, iter_data_list) - image_id += 1 - - utils.metrics_print.print_average(iter_data_list) + proc_id = os.getpid() + if args['subsequent'] is False: + for num in range(num_iters + 1): + for image_id, image_param in enumerate(image_list): + run_image_generation(image_param, num, prompt_idx_list[image_id], pipe, args, iter_data_list, proc_id) + else: + for image_id, image_param in enumerate(image_list): + for num in range(num_iters + 1): + run_image_generation(image_param, num, prompt_idx_list[image_id], pipe, args, iter_data_list, proc_id) + llm_bench_utils.metrics_print.print_average(iter_data_list, prompt_idx_list, args['batch_size'], False) return iter_data_list, pretrain_time def run_image_classification(model_path, framework, device, args, num_iters=10): + if args['genai']: + log.warning("GenAI pipeline is not supported for this task. 
Switched on default benchmarking") model, input_size = FW_UTILS[framework].create_image_classification_model(model_path, device, **args) data = torch.rand(input_size) @@ -298,16 +599,19 @@ def run_image_classification(model_path, framework, device, args, num_iters=10): return iter_data_list -def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, image_id, tm_list): +def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, image_id, tm_list, proc_id): set_seed(args['seed']) nsteps = img.get('steps', DEFAULT_SUPER_RESOLUTION_STEPS) - nsteps = 1 if num == 0 else nsteps resize_image_width = img.get('width', DEFAULT_SUPER_RESOLUTION_WIDTH) resize_image_height = img.get('height', DEFAULT_SUPER_RESOLUTION_HEIGHT) - log.info(f'Test {num} input image={img["prompt"]}, steps={nsteps}, resize_width={resize_image_width}, resize_height={resize_image_height}') + log.info( + f"[{'warm-up' if num == 0 else num}] Input params: steps={nsteps}, " + f'resize_width={resize_image_width}, resize_height={resize_image_height}' + ) low_res_img = PIL.Image.open(img['prompt']).convert('RGB') low_res_img = low_res_img.resize((resize_image_width, resize_image_height)) max_rss_mem_consumption = '' + max_uss_mem_consumption = '' max_shared_mem_consumption = '' if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.start_collect_memory_consumption() @@ -316,17 +620,12 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im end = time.perf_counter() if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.end_collect_momory_consumption() - max_rss_mem_consumption, max_shared_mem_consumption = mem_consumption.get_max_memory_consumption() + max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption() mem_consumption.clear_max_memory_consumption() - if num == 0: - rslt_img_fn = args['model_name'] + '_warmup_' + img['prompt'].name - else: - rslt_img_fn = args['model_name'] + '_iter' + str(num) + '_' + img['prompt'].name - log.info(f'Result will be saved to {rslt_img_fn}') result_md5_list = [] if framework == 'ov': - res[0].save(rslt_img_fn) - result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes()).hexdigest()) + rslt_img_fn = llm_bench_utils.output_file.output_gen_image(res[0], args, image_id, num, None, proc_id, '.png') + result_md5_list.append(hashlib.md5(Image.open(rslt_img_fn).tobytes(), usedforsecurity=False).hexdigest()) generation_time = end - start iter_data = gen_iterate_data( @@ -336,30 +635,36 @@ def run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, im res_md5=result_md5_list, max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption, prompt_idx=image_id, ) iter_data_list.append(iter_data) - utils.metrics_print.print_metrics( + llm_bench_utils.metrics_print.print_metrics( num, iter_data, - generated=rslt_img_fn, warm_up=(num == 0), max_rss_mem=max_rss_mem_consumption, max_shared_mem=max_shared_mem_consumption, + max_uss_mem=max_uss_mem_consumption ) - utils.metrics_print.print_ldm_unet_vqvae_infer_latency(num, iter_data, tm_list, warm_up=(num == 0),) + llm_bench_utils.metrics_print.print_generated(num, warm_up=(num == 0), generated=rslt_img_fn) + llm_bench_utils.metrics_print.print_ldm_unet_vqvae_infer_latency(num, iter_data, tm_list, warm_up=(num == 0)) def run_ldm_super_resolution_benchmark(model_path, 
framework, device, args, num_iters): + if args["genai"]: + log.warning("GenAI pipeline is not supported for this task. Switched on default benchmarking") pipe, pretrain_time = FW_UTILS[framework].create_ldm_super_resolution_model(model_path, device, **args) iter_data_list = [] tm_list = [] - input_image_list = utils.model_utils.get_image_param_from_prompt_file(args) + input_image_list = llm_bench_utils.model_utils.get_image_param_from_prompt_file(args) if len(input_image_list) > 0: images = [] for image in input_image_list: - image['prompt'] = os.path.join(os.path.dirname(args['prompt'] if args['prompt'] is not None else args['prompt_file']), - image['prompt'].replace('./', '')) + if args['prompt'] is None and args['prompt_file'] is None: + raise RuntimeError('==Failure image is empty ==') + elif args['prompt_file'] is not None and len(args['prompt_file']) > 0: + image['prompt'] = os.path.join(os.path.dirname(args['prompt_file'][0]), image['prompt'].replace('./', '')) image['prompt'] = Path(image['prompt']) images.append(image) else: @@ -371,16 +676,33 @@ def run_ldm_super_resolution_benchmark(model_path, framework, device, args, num_ images = [images] else: raise RuntimeError('==Failure image is empty ==') - log.info(f'Number benchmarking images {len(images)}') + + prompt_idx_list = [image_id for image_id, image_param in enumerate(images)] + if args['prompt_index'] is None: + prompt_idx_list = [image_id for image_id, input_text in enumerate(images)] + image_list = images + else: + prompt_idx_list = [] + image_list = [] + for i in args['prompt_index']: + if 0 <= i < len(images): + image_list.append(images[i]) + prompt_idx_list.append(i) + if len(image_list) == 0: + raise RuntimeError('==Failure prompts is empty ==') + log.info(f'Benchmarking iter nums(exclude warm-up): {num_iters}, prompt nums: {len(image_list)}, prompt idx: {prompt_idx_list}') # if num_iters == 0, just output warm-up data + proc_id = os.getpid() for num in range(num_iters + 1): - image_id = 0 - for img in images: - run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, image_id, tm_list) + for image_id, img in enumerate(image_list): + if num == 0: + if args["output_dir"] is not None: + llm_bench_utils.output_file.output_image_input_text(str(img['prompt']), args, prompt_idx_list[image_id], None, proc_id) + log.info(f"[{'warm-up' if num == 0 else num}] Input image={img['prompt']}") + run_ldm_super_resolution(img, num, pipe, args, framework, iter_data_list, prompt_idx_list[image_id], tm_list, proc_id) tm_list.clear() - image_id = image_id + 1 - utils.metrics_print.print_average(iter_data_list) + llm_bench_utils.metrics_print.print_average(iter_data_list, prompt_idx_list, 1, False) return iter_data_list, pretrain_time @@ -392,6 +714,13 @@ def num_iters_type(x): return x +def num_infer_count_type(x): + x = int(x) + if x < 1: + raise argparse.ArgumentTypeError('Minimum input value is 1') + return x + + def get_argprser(): parser = argparse.ArgumentParser('LLM benchmarking tool', add_help=True, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument('-m', '--model', help='model folder including IR files or Pytorch files', required=TabError) @@ -400,14 +729,16 @@ def get_argprser(): parser.add_argument('-rj', '--report_json', help='report json') parser.add_argument('-f', '--framework', default='ov', help='framework') parser.add_argument('-p', '--prompt', default=None, help='one prompt') - parser.add_argument('-pf', '--prompt_file', default=None, help='prompt file in jsonl format') + 
parser.add_argument('-pf', '--prompt_file', nargs='+', default=None, + help='Prompt file(s) in jsonl format. Multiple prompt files should be separated with space(s).') + parser.add_argument('-pi', '--prompt_index', nargs='+', type=num_iters_type, default=None, + help='Run the specified prompt index. You can specify multiple prompt indexes, separated by spaces.') parser.add_argument( '-ic', '--infer_count', default=None, - type=int, - help='limit the output token size ' - f'(default {DEFAULT_OUTPUT_TOKEN_SIZE}) of text_gen and code_gen models.', + type=num_infer_count_type, + help='set the output token size, the value must be greater than 0.' ) parser.add_argument( '-n', @@ -456,10 +787,42 @@ def get_argprser(): required=False, help='Enables running the torch.compile() with specified backend: pytorch or openvino (default)', ) + parser.add_argument( + '--torch_compile_dynamic', + action='store_true', + help='Enables dynamic shape tracking for torch.compile()', + ) + parser.add_argument( + '--torch_compile_options', + default=None, + required=False, + help='Options for torch.compile() in JSON format', + ) + parser.add_argument( + '--torch_compile_input_module', + default=None, + required=False, + help='Specifies the module to decorate with torch.compile(). By default, parent module will be decorated.', + ) parser.add_argument( '--convert_tokenizer', action='store_true', help='Convert tokenizer to OpenVINO format' ) - utils.model_utils.add_stateful_model_arguments(parser) + parser.add_argument( + '--subsequent', + action='store_true', + help='if the value is True, input prompts are processed in subsequent manner' + 'if the value is False (default), input prompts are processed in interleave manner' + ) + parser.add_argument('-od', '--output_dir', help='Save the input text and generated text, images to files') + llm_bench_utils.model_utils.add_stateful_model_arguments(parser) + parser.add_argument("--genai", action="store_true", help="Use OpenVINO GenAI optimized pipelines for benchmarking") + parser.add_argument("--use_cb", action="store_true", help="Use Continuous Batching inference mode") + parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continuous Batching Scheduler settings or dict") + parser.add_argument( + '--end_token_stopping', + action='store_true', + help='Stop the generation even output token size does not achieve infer_count or max token size ({DEFAULT_OUTPUT_TOKEN_SIZE}}).' 
+ ) return parser.parse_args() @@ -474,20 +837,35 @@ def get_argprser(): def main(): - log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout) + logging_kwargs = {"encoding": "utf-8"} if sys.version_info[1] > 8 else {} + log.basicConfig(format='[ %(levelname)s ] %(message)s', level=os.environ.get("LOGLEVEL", log.INFO), stream=sys.stdout, **logging_kwargs) args = get_argprser() - model_path, framework, model_args, model_name = utils.model_utils.analyze_args(args) + model_path, framework, model_args, model_name = llm_bench_utils.model_utils.analyze_args(args) # Set the device for running OpenVINO backend for torch.compile() if model_args['torch_compile_backend']: ov_torch_backend_device = str(args.device) os.putenv('OPENVINO_TORCH_BACKEND_DEVICE', ov_torch_backend_device.upper()) - os.system('echo OPENVINO_TORCH_BACKEND_DEVICE=$OPENVINO_TORCH_BACKEND_DEVICE') + os.system('echo [ INFO ] OPENVINO_TORCH_BACKEND_DEVICE=$OPENVINO_TORCH_BACKEND_DEVICE') + out_str = 'Model path={}'.format(model_path) if framework == 'ov': - log.info(f'model_path={model_path}, openvino runtime version: {get_version()}') + out_str += ', openvino runtime version: {}'.format(get_version()) if model_args['config'].get('PREC_BF16') and model_args['config']['PREC_BF16'] is True: log.warning('[Warning] Param bf16/prec_bf16 only work for framework pt. It will be disabled.') + if 'cpu' in args.device.lower(): + env_omp = os.getenv('OMP_WAIT_POLICY') + if env_omp is None or env_omp != 'PASSIVE': + log.warning("It is recommended to set the environment variable OMP_WAIT_POLICY to PASSIVE, " + "so that OpenVINO inference can use all CPU resources without waiting.") + original_torch_thread_nums = torch.get_num_threads() + if model_args['num_beams'] > 1: + torch.set_num_threads(int(original_torch_thread_nums / 2)) + else: + torch.set_num_threads(1) + log.info(f"The num_beams is {model_args['num_beams']}, update Torch thread num from " + f'{original_torch_thread_nums} to {torch.get_num_threads()}, avoid to use the CPU cores for OpenVINO inference.') + log.info(out_str) if args.memory_consumption: mem_consumption.start_collect_mem_consumption_thread() try: @@ -495,12 +873,12 @@ def main(): if args.report is not None or args.report_json is not None: model_precision = '' if framework == 'ov': - ir_conversion_frontend = utils.model_utils.get_ir_conversion_frontend(model_name, model_path.parents._parts) + ir_conversion_frontend = llm_bench_utils.model_utils.get_ir_conversion_frontend(model_name, model_path.parts) if ir_conversion_frontend != '': framework = framework + '(' + ir_conversion_frontend + ')' - model_precision = utils.model_utils.get_model_precision(model_path.parents._parts) + model_precision = llm_bench_utils.model_utils.get_model_precision(model_path.parts) if args.report is not None: - utils.output_csv.write_result( + llm_bench_utils.output_csv.write_result( args.report, model_name, framework, @@ -511,7 +889,7 @@ def main(): model_precision, ) if args.report_json is not None: - utils.output_json.write_result( + llm_bench_utils.output_json.write_result( args.report_json, model_name, framework, @@ -524,6 +902,7 @@ def main(): except Exception: log.error('An exception occurred') log.info(traceback.format_exc()) + exit(1) finally: if args.memory_consumption: mem_consumption.end_collect_mem_consumption_thread() diff --git a/llm_bench/python/convert.py b/llm_bench/python/convert.py index 0e4d7c6231..49cea02c11 100644 --- a/llm_bench/python/convert.py +++ b/llm_bench/python/convert.py @@ -1,30 
+1,32 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 import sys import gc import time +import copy import logging as log from argparse import ArgumentParser from functools import wraps from pathlib import Path -from typing import Tuple +from typing import Tuple, Union, Dict, Optional, TYPE_CHECKING +import nncf import torch from diffusers import ( StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, LDMSuperResolutionPipeline, - DiffusionPipeline, ) from diffusers import UNet2DConditionModel, AutoencoderTiny, LCMScheduler -from nncf import compress_weights -from openvino import Type, PartialShape, save_model, convert_model +from nncf.torch.model_creation import is_wrapped_model +from openvino import Type as OVType, PartialShape, save_model, convert_model from openvino.runtime import Core, get_version from optimum.exporters import TasksManager from optimum.utils import DEFAULT_DUMMY_SHAPES -from optimum.exporters.onnx import get_encoder_decoder_models_for_export +from optimum.intel.openvino.configuration import OVConfig +from optimum.exporters.utils import get_encoder_decoder_models_for_export from optimum.exporters.openvino import export_models -from optimum.utils.save_utils import maybe_load_preprocessors +from optimum.exporters.openvino.model_patcher import patch_model_with_bettertransformer from optimum.intel.openvino import ( OVModelForSeq2SeqLM, OVStableDiffusionPipeline, @@ -35,14 +37,9 @@ OV_DECODER_WITH_PAST_NAME, OV_ENCODER_NAME, ) -from optimum.exporters.onnx import __main__ as optimum_main +from optimum.utils.import_utils import is_torch_available, is_diffusers_available -try: - from optimum.exporters.openvino.__main__ import _get_submodels_and_export_configs -except ImportError: - from optimum.exporters.onnx.__main__ import ( - _get_submodels_and_onnx_configs as _get_submodels_and_export_configs, - ) +from optimum.exporters.utils import _get_submodels_and_export_configs from transformers import ( AutoTokenizer, @@ -51,16 +48,15 @@ AutoModelForSeq2SeqLM, AutoModel, ) -from utils.nncf_utils import get_compressed_path -from utils.model_utils import add_stateful_model_arguments +from llm_bench_utils.nncf_utils import get_compressed_path +from llm_bench_utils.model_utils import add_stateful_model_arguments from optimum.exporters.openvino.utils import flattenize_inputs -from utils.conversion_utils.convert_patch import patch_model_for_optimum_export -from utils.conversion_utils.better_transformer_patch import ( +from llm_bench_utils.conversion_utils.convert_patch import patch_model_for_optimum_export +from llm_bench_utils.conversion_utils.better_transformer_patch import ( register_bettertransformer_config, ) -from utils.conversion_utils.export_configs import * # noqa: F401,F403 -from utils.ov_model_classes import register_normalized_configs -from utils.conversion_utils.helpers import ( +import llm_bench_utils.conversion_utils.export_configs # noqa: F401,F403 +from llm_bench_utils.conversion_utils.helpers import ( PYTORCH_DIR, OV_DIR, GPTQ_DIR, @@ -76,25 +72,64 @@ save_ov_model_helper, get_fp_path, is_ov_model_provided, + is_int8_compression, BackendType, ) +from llm_bench_utils.nncf_utils import COMPRESSION_OPTIONS + +if TYPE_CHECKING: + from optimum.onnx.configuration import OnnxConfig + + if is_torch_available(): + from transformers.modeling_utils import PreTrainedModel + + if is_diffusers_available(): + from diffusers import ModelMixin -register_normalized_configs() 
register_bettertransformer_config() +def compress_torchmodels( + models_and_export_configs, + stateful: bool = True, + dummy_shapes: Optional[Dict] = None, + compression_options: Optional[Dict] = None, +): + if dummy_shapes is None: + dummy_shapes = {} + + if compression_options is None: + compression_options = {} + + for model_name in models_and_export_configs.keys(): + submodel, sub_export_config = models_and_export_configs[model_name] + if stateful: + submodel = patch_model_with_bettertransformer(submodel) + if is_wrapped_model(submodel): + dataset = None + else: + dummy_inputs = sub_export_config.generate_dummy_inputs(framework="pt", **dummy_shapes) + dataset = nncf.Dataset([dummy_inputs]) + compressed_submodel = nncf.compress_weights(submodel, dataset=dataset, **compression_options) + models_and_export_configs[model_name] = (compressed_submodel, sub_export_config) + return models_and_export_configs + + def convert_optimum_causallm_base(model, args, model_config=None, compress_only=False): tokenizer_id = args.tokenizer_id or args.model_id tok = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True) precision = args.precision gptq_applied = is_gptq(model_config) pt_compress_weights = is_torch_compression(args) + if args.stateful: + log.warning( + "usage --stateful flag is deprecated and will be removed in future, default behaviour is export stateful model" + " please use --disable_stateful if you need model without state" + ) if not compress_only: model_config = model.config model = patch_model_for_optimum_export(model) - precision = ( - precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) - ) + precision = precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) ov_out_dir = Path(args.output_dir) / PYTORCH_DIR / OV_DIR / precision if gptq_applied and args.compress_weights: log.info("Weights compression will be skipped for GPTQ models") @@ -103,34 +138,31 @@ def convert_optimum_causallm_base(model, args, model_config=None, compress_only= model.save_pretrained(pt_out_dir) save_tokenizer(tok, pt_out_dir) dummy_shapes = DEFAULT_DUMMY_SHAPES - onnx_config, models_and_onnx_configs = _get_submodels_and_export_configs( + export_config, models_and_export_configs = _get_submodels_and_export_configs( model=model, task="text-generation-with-past", - custom_onnx_configs={}, + exporter="openvino", + custom_export_configs={}, custom_architecture=None, fn_get_submodels=None, preprocessors=None, _variant="default", monolith=False, + library_name="transformers" ) - if "decoder_with_past_model" in models_and_onnx_configs: - models_and_onnx_configs = { - "model": models_and_onnx_configs["decoder_with_past_model"] - } + if "decoder_with_past_model" in models_and_export_configs: + models_and_export_configs = {"model": models_and_export_configs["decoder_with_past_model"]} model.config.save_pretrained(ov_out_dir) - files_subpaths = [ - "openvino_" + model_name + ".xml" - for model_name in models_and_onnx_configs.keys() - ] + files_subpaths = ["openvino_" + model_name + ".xml" for model_name in models_and_export_configs.keys()] export_models( - models_and_onnx_configs=models_and_onnx_configs, + models_and_export_configs=models_and_export_configs, output_dir=ov_out_dir, output_names=files_subpaths, input_shapes=dummy_shapes, device="cpu", - compression_option="fp16" if args.precision == "FP16" else None, + ov_config=OVConfig(dtype="fp16") if args.precision == "FP16" else None, model_kwargs={}, - stateful=args.stateful, + stateful=not 
args.disable_stateful, ) save_tokenizer(tok, ov_out_dir) @@ -143,9 +175,7 @@ def convert_optimum_causallm_base(model, args, model_config=None, compress_only= ) for compress_option in args.compress_weights: log.info(f"Compress model weights to {compress_option}") - optimized_dir = get_compressed_path( - args.output_dir, args.precision, compress_option - ) + optimized_dir = get_compressed_path(args.output_dir, args.precision, compress_option) model_config.save_pretrained(optimized_dir) fp_path = get_fp_path(args, "openvino_model.xml") ir_model = Core().read_model(fp_path) @@ -161,35 +191,58 @@ def convert_optimum_causallm_base(model, args, model_config=None, compress_only= ) if pt_compress_weights and not gptq_applied: - compressed_model = compress_weights(model) - onnx_config, models_and_onnx_configs = _get_submodels_and_export_configs( - model=compressed_model, - task="text-generation-with-past", - custom_onnx_configs={}, - custom_architecture=None, - fn_get_submodels=None, - preprocessors=None, - _variant="default", - monolith=False, - ) - pt_out_dir = ( - Path(args.output_dir) - / PYTORCH_DIR - / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision) - ) - model.config.save_pretrained(pt_out_dir) - export_models( - models_and_onnx_configs=models_and_onnx_configs, - output_dir=pt_out_dir, - output_names=files_subpaths, - input_shapes=dummy_shapes, - device="cpu", - compression_option="fp16" if args.precision == "FP16" else None, - model_kwargs={}, - stateful=args.stateful, - ) - save_tokenizer(tok, pt_out_dir) + compression_modes = [] + for cw in args.compress_weights: + if is_int8_compression(cw): + compression_modes.append(cw) + assert compression_modes, "Only INT8 compression supported for PyTorch backend" + number_compression_modes = len(compression_modes) + original_model = model + for idx, compress_mode in enumerate(compression_modes): + if number_compression_modes - idx > 1: + model = copy.deepcopy(original_model) + else: + model = original_model + + _, models_and_export_configs = _get_submodels_and_export_configs( + model=model, + exporter="openvino", + task="text-generation-with-past", + custom_export_configs={}, + custom_architecture=None, + fn_get_submodels=None, + preprocessors=None, + _variant="default", + monolith=False, + library_name="transformers" + ) + + compression_options = COMPRESSION_OPTIONS[compress_mode] + models_and_export_configs = compress_torchmodels( + models_and_export_configs, + stateful=not args.disable_stateful, + dummy_shapes=dummy_shapes, + compression_options=compression_options, + ) + + pt_out_dir = ( + Path(args.output_dir) + / PYTORCH_DIR + / OV_DIR + / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=precision, compression=compress_mode) + ) + model.config.save_pretrained(pt_out_dir) + export_models( + models_and_export_configs=models_and_export_configs, + output_dir=pt_out_dir, + output_names=files_subpaths, + input_shapes=dummy_shapes, + device="cpu", + ov_config=OVConfig(dtype="fp16") if args.precision == "FP16" else None, + model_kwargs={}, + stateful=not args.disable_stateful, + ) + save_tokenizer(tok, pt_out_dir) return @@ -222,11 +275,7 @@ def convert_causal_lm(args): def convert_seq2seq(args): config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) - tokenizer_id = ( - args.model_id - if "blenderbot-9B" not in args.model_id - else "facebook/blenderbot-3B" - ) + tokenizer_id = args.model_id if "blenderbot-9B" not in args.model_id else "facebook/blenderbot-3B" tok = AutoTokenizer.from_pretrained(tokenizer_id, 
trust_remote_code=True) pt_compress_weights = is_torch_compression(args) if args.save_orig or pt_compress_weights: @@ -240,58 +289,69 @@ def convert_seq2seq(args): pt_model.save_pretrained(pt_out_dir) save_tokenizer(tok, pt_out_dir) if pt_compress_weights: - compressed_pt_model = compress_weights(pt_model) - onnx_config_constructor = TasksManager.get_exporter_config_constructor( - model=pt_model, exporter="onnx", task="text2text-generation" - ) - onnx_config = onnx_config_constructor(pt_model.config, use_past=True) - models_and_onnx_configs = get_encoder_decoder_models_for_export( - compressed_pt_model, onnx_config - ) - encoder_file_name = Path("encoder") / OV_ENCODER_NAME - decoder_file_name = Path("decoder") / OV_DECODER_NAME - decoder_with_past_file_name = ( - Path("decoder_with_past") / OV_DECODER_WITH_PAST_NAME - ) + compression_modes = [] + for cw in args.compress_weights: + if is_int8_compression(cw): + compression_modes.append(cw) + assert compression_modes, "Only INT8 compression supported for PyTorch backend" + for idx, compress_mode in enumerate(compression_modes): + if idx > 0: + pt_model = AutoModelForSeq2SeqLM.from_pretrained( + args.model_id, + trust_remote_code=True, + config=config, + ) - output_names = [ - encoder_file_name, - decoder_file_name, - decoder_with_past_file_name, - ] - save_dir_path = ( - Path(args.output_dir) - / PYTORCH_DIR - / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(args.precision) - ) - try: - export_models( - models_and_onnx_configs=models_and_onnx_configs, - opset=onnx_config.DEFAULT_ONNX_OPSET, - output_dir=save_dir_path, - output_names=output_names, - compress_option="FP16" if args.precision == "FP16" else None, + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=pt_model, exporter="openvino", task="text2text-generation" ) - save_tokenizer(tok, save_dir_path) - except Exception as ex: - log.warning( - f"PT weights compression failed with {ex}, please use OpenVINO backend instead" + export_config = export_config_constructor(pt_model.config, use_past=True) + models_and_export_configs = get_encoder_decoder_models_for_export(pt_model, export_config) + + compression_options = COMPRESSION_OPTIONS[compress_mode] + models_and_export_configs = compress_torchmodels( + models_and_export_configs, compression_options=compression_options ) + encoder_file_name = Path("encoder") / OV_ENCODER_NAME + decoder_file_name = Path("decoder") / OV_DECODER_NAME + decoder_with_past_file_name = Path("decoder_with_past") / OV_DECODER_WITH_PAST_NAME + + output_names = [ + encoder_file_name, + decoder_file_name, + decoder_with_past_file_name, + ] + save_dir_path = ( + Path(args.output_dir) + / PYTORCH_DIR + / OV_DIR + / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=args.precision, compression=compress_mode) + ) + try: + export_models( + models_and_export_configs=models_and_export_configs, + opset=export_config.DEFAULT_ONNX_OPSET, + output_dir=save_dir_path, + output_names=output_names, + ov_config=OVConfig(dtype="fp16") if args.precision == "FP16" else None, + stateful=False + ) + save_tokenizer(tok, save_dir_path) + except Exception as ex: + log.warning(f"PT weights compression failed with {ex}, please use OpenVINO backend instead") + del pt_model gc.collect() + # skip openvino compression pipeline if pytorch compression pipeline was used + if pt_compress_weights: + return + ov_compression = is_ov_compression(args) - ov_encoder = is_ov_model_provided( - args.model_id, args.output_dir, args.precision, "openvino_encoder_model.xml" - ) - 
ov_decoder = is_ov_model_provided( - args.model_id, args.output_dir, args.precision, "openvino_decoder_model.xml" - ) - compress_only = ( - ov_compression and not args.force_convert and ov_encoder and ov_decoder - ) + ov_encoder = is_ov_model_provided(args.model_id, args.output_dir, args.precision, "openvino_encoder_model.xml") + ov_decoder = is_ov_model_provided(args.model_id, args.output_dir, args.precision, "openvino_decoder_model.xml") + compress_only = ov_compression and not args.force_convert and ov_encoder and ov_decoder if not compress_only: start = time.perf_counter() model = OVModelForSeq2SeqLM.from_pretrained( @@ -300,7 +360,10 @@ def convert_seq2seq(args): compile=False, trust_remote_code=True, config=AutoConfig.from_pretrained(args.model_id, trust_remote_code=True), + load_in_8bit=False ) + if is_fp16(args): + model.half() end = time.perf_counter() log.info(f"Conversion total time {end - start}s") @@ -321,9 +384,7 @@ def convert_seq2seq(args): ) for compress_option in args.compress_weights: log.info(f"Compress model weights to {compress_option}") - optimized_dir = get_compressed_path( - args.output_dir, args.precision, compress_option - ) + optimized_dir = get_compressed_path(args.output_dir, args.precision, compress_option) fp_enc_path = get_fp_path(args, "openvino_encoder_model.xml") enc_model = Core().read_model(fp_enc_path) compress_ov_model_weights_helper( @@ -363,242 +424,296 @@ def convert_seq2seq(args): ) -def convert_sd(args): - start = time.perf_counter() - pt_compress_weights = is_torch_compression(args) - if args.save_orig or pt_compress_weights: - pt_model = StableDiffusionPipeline.from_pretrained(args.model_id) - if args.save_orig: - pt_model.save_pretrained(Path(args.output_dir) / PYTORCH_DIR) - if pt_compress_weights: - wc_text_encoder = compress_weights(pt_model.text_encoder) - wc_unet = compress_weights(pt_model.unet) - wc_vae = compress_weights(pt_model.vae) - pt_model.text_encoder = wc_text_encoder - pt_model.unet = wc_unet - pt_model.vae = wc_vae - _, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs( - model=pt_model, - task="stable-diffusion", - monolith=False, - custom_onnx_configs={}, - custom_architecture=False, - _variant="default", - ) - output = ( - Path(args.output_dir) - / PYTORCH_DIR - / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(args.precision) - ) - for model_name in models_and_onnx_configs: - subcomponent = models_and_onnx_configs[model_name][0] - if hasattr(subcomponent, "save_config"): - subcomponent.save_config(output / model_name) - elif hasattr(subcomponent, "config") and hasattr( - subcomponent.config, "save_pretrained" - ): - subcomponent.config.save_pretrained(output / model_name) - - files_subpaths = [ - Path(name_dir) / OV_XML_FILE_NAME - for name_dir in models_and_onnx_configs - ] - - # Saving the additional components needed to perform inference. - pt_model.scheduler.save_pretrained(output.joinpath("scheduler")) - - feature_extractor = getattr(pt_model, "feature_extractor", None) - if feature_extractor is not None: - feature_extractor.save_pretrained(output.joinpath("feature_extractor")) +def _get_submodels_for_export_stable_diffusion( + pipeline: "StableDiffusionPipeline", +) -> Dict[str, Union["PreTrainedModel", "ModelMixin"]]: + """ + Returns the components of a Stable Diffusion model. 
+ """ + from diffusers import StableDiffusionXLImg2ImgPipeline + + models_for_export = {} + if isinstance(pipeline, StableDiffusionXLImg2ImgPipeline): + projection_dim = pipeline.text_encoder_2.config.projection_dim + else: + projection_dim = pipeline.text_encoder.config.projection_dim + + # Text encoder + if pipeline.text_encoder is not None: + if isinstance(pipeline, StableDiffusionXLImg2ImgPipeline): + pipeline.text_encoder.config.output_hidden_states = True + models_for_export["text_encoder"] = pipeline.text_encoder + + # U-NET + pipeline.unet.config.text_encoder_projection_dim = projection_dim + # The U-NET time_ids inputs shapes depends on the value of `requires_aesthetics_score` + # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 + pipeline.unet.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + models_for_export["unet"] = pipeline.unet + + # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + vae_encoder = copy.deepcopy(pipeline.vae) + if isinstance(vae_encoder, AutoencoderTiny): + vae_encoder.forward = lambda sample: {"latent_sample": vae_encoder.encode(x=sample)["latents"]} + else: + vae_encoder.forward = lambda sample: {"latent_sample": vae_encoder.encode(x=sample)["latent_dist"].sample()} + models_for_export["vae_encoder"] = vae_encoder + + # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + vae_decoder = copy.deepcopy(pipeline.vae) + if isinstance(vae_encoder, AutoencoderTiny): + vae_decoder.forward = lambda latent_sample: vae_decoder.decode(latent_sample) + else: + vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) + models_for_export["vae_decoder"] = vae_decoder + + text_encoder_2 = getattr(pipeline, "text_encoder_2", None) + if text_encoder_2 is not None: + text_encoder_2.config.output_hidden_states = True + models_for_export["text_encoder_2"] = text_encoder_2 + + return models_for_export + + +def get_stable_diffusion_models_for_export( + pipeline: "StableDiffusionPipeline", + int_dtype: str = "int64", + float_dtype: str = "fp32", +) -> Dict[str, Tuple[Union["PreTrainedModel", "ModelMixin"], "OnnxConfig"]]: + """ + Returns the components of a Stable Diffusion model and their subsequent onnx configs. + + Args: + pipeline ([`StableDiffusionPipeline`]): + The model to export. + int_dtype (`str`, defaults to `"int64"`): + The data type of integer tensors, could be ["int64", "int32", "int8"], default to "int64". + float_dtype (`str`, defaults to `"fp32"`): + The data type of float tensors, could be ["fp32", "fp16", "bf16"], default to "fp32". + + Returns: + `Dict[str, Tuple[Union[`PreTrainedModel`, `TFPreTrainedModel`], `OnnxConfig`]: A Dict containing the model and + onnx configs for the different components of the model. 
+ """ + models_for_export = _get_submodels_for_export_stable_diffusion(pipeline) + + # Text encoder + if "text_encoder" in models_for_export: + text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=pipeline.text_encoder, + exporter="openvino", + task="feature-extraction", + library_name="diffusers", + ) + text_encoder_export_config = text_encoder_config_constructor( + pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["text_encoder"] = (models_for_export["text_encoder"], text_encoder_export_config) + + # U-NET + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=pipeline.unet, + exporter="openvino", + task="semantic-segmentation", + model_type="unet", + library_name="diffusers", + ) + unet_export_config = export_config_constructor(pipeline.unet.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["unet"] = (models_for_export["unet"], unet_export_config) + + # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + vae_encoder = models_for_export["vae_encoder"] + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_encoder, + exporter="openvino", + task="semantic-segmentation", + model_type="vae-encoder", + library_name="diffusers", + ) + vae_export_config = vae_config_constructor(vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["vae_encoder"] = (vae_encoder, vae_export_config) + + # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + vae_decoder = models_for_export["vae_decoder"] + vae_config_constructor = TasksManager.get_exporter_config_constructor( + model=vae_decoder, + exporter="openvino", + task="semantic-segmentation", + model_type="vae-decoder", + library_name="diffusers", + ) + vae_export_config = vae_config_constructor(vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["vae_decoder"] = (vae_decoder, vae_export_config) + + if "text_encoder_2" in models_for_export: + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=pipeline.text_encoder_2, + exporter="openvino", + task="feature-extraction", + model_type="clip-text-with-projection", + library_name="diffusers", + ) + export_config = export_config_constructor( + pipeline.text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["text_encoder_2"] = (models_for_export["text_encoder_2"], export_config) - tokenizer = getattr(pt_model, "tokenizer", None) - if tokenizer is not None: - tokenizer.save_pretrained(output.joinpath("tokenizer")) + return models_for_export - tokenizer_2 = getattr(pt_model, "tokenizer_2", None) - if tokenizer_2 is not None: - tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) - pt_model.save_config(output) +def convert_sd_prepared_for_export_common(pipeline, models_and_export_configs, output_dir, args): + for model_name in models_and_export_configs: + subcomponent = models_and_export_configs[model_name][0] + if hasattr(subcomponent, "save_config"): + subcomponent.save_config(output_dir / model_name) + elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"): + subcomponent.config.save_pretrained(output_dir / model_name) - export_models( - models_and_onnx_configs=models_and_onnx_configs, - output_dir=output, - output_names=files_subpaths, - ) + files_subpaths = [Path(name_dir) / 
OV_XML_FILE_NAME for name_dir in models_and_export_configs] - del pt_model - gc.collect() + # Saving the additional components needed to perform inference. + pipeline.scheduler.save_pretrained(output_dir.joinpath("scheduler")) - model = OVStableDiffusionPipeline.from_pretrained( - args.model_id, export=True, compile=False - ) - end = time.perf_counter() - log.info(f"Conversion total time {end - start}s") + feature_extractor = getattr(pipeline, "feature_extractor", None) + if feature_extractor is not None: + feature_extractor.save_pretrained(output_dir.joinpath("feature_extractor")) - if is_fp16(args): - model.half() - start1 = time.perf_counter() - model.save_pretrained(Path(args.output_dir) / PYTORCH_DIR / OV_DIR / args.precision) - end1 = time.perf_counter() - log.info(f"Serialization total time {end1 - start1}s") + tokenizer = getattr(pipeline, "tokenizer", None) + if tokenizer is not None: + tokenizer.save_pretrained(output_dir.joinpath("tokenizer")) - if is_ov_compression(args): - for weigths_compression_option in args.compress_weights: - if weigths_compression_option != "INT8": - log.warning( - "Weights compression {weigths_compression_option} does not supported for SD, will be ignored" - ) - continue - ov_int8_dir = get_compressed_path( - args.output_dir, args.precision, weigths_compression_option - ) - model.text_encoder.model = compress_weights(model.text_encoder.model) - model.unet.model = compress_weights(model.unet.model) - model.vae_decoder.model = compress_weights(model.vae_decoder.model) - model.save_pretrained(ov_int8_dir) + tokenizer_2 = getattr(pipeline, "tokenizer_2", None) + if tokenizer_2 is not None: + tokenizer_2.save_pretrained(output_dir.joinpath("tokenizer_2")) - # Saving the additional components needed to perform inference. 
- model.scheduler.save_pretrained(ov_int8_dir.joinpath("scheduler")) + pipeline.save_config(output_dir) - feature_extractor = getattr(model, "feature_extractor", None) - if feature_extractor is not None: - feature_extractor.save_pretrained( - ov_int8_dir.joinpath("feature_extractor") - ) + export_models( + models_and_export_configs=models_and_export_configs, + output_dir=output_dir, + output_names=files_subpaths, + ov_config=OVConfig(dtype="fp16") if args.precision == "FP16" else None, + stateful=False + ) - tokenizer = getattr(model, "tokenizer", None) - if tokenizer is not None: - tokenizer.save_pretrained(ov_int8_dir.joinpath("tokenizer")) - tokenizer_2 = getattr(model, "tokenizer_2", None) - if tokenizer_2 is not None: - tokenizer_2.save_pretrained(ov_int8_dir.joinpath("tokenizer_2")) +def convert_sd_common(pipeline, output_dir, args): + models_and_export_configs = get_stable_diffusion_models_for_export(pipeline) + convert_sd_prepared_for_export_common(pipeline, models_and_export_configs, output_dir, args) - model.save_config(ov_int8_dir) - del model - gc.collect() +def convert_sd(args): + pt_compress_weights = is_torch_compression(args) + pt_model = StableDiffusionPipeline.from_pretrained(args.model_id) + if args.save_orig: + pt_model.save_pretrained(Path(args.output_dir) / PYTORCH_DIR) + output_dir = Path(args.output_dir) / PYTORCH_DIR / OV_DIR / args.precision + models_and_export_configs = get_stable_diffusion_models_for_export(pt_model) + convert_sd_prepared_for_export_common(pt_model, models_and_export_configs, output_dir, args) -def convert_lcm(args): - start = time.perf_counter() - pt_compress_weights = is_torch_compression(args) - if args.save_orig or pt_compress_weights: - pt_model = DiffusionPipeline.from_pretrained(args.model_id) - if args.save_orig: - pt_model.save_pretrained(Path(args.output_dir) / PYTORCH_DIR) - if pt_compress_weights: - wc_text_encoder = compress_weights(pt_model.text_encoder) - wc_unet = compress_weights(pt_model.unet) - wc_vae = compress_weights(pt_model.vae) - pt_model.text_encoder = wc_text_encoder - pt_model.unet = wc_unet - pt_model.vae = wc_vae - _, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs( - model=pt_model, - task="stable-diffusion", - monolith=False, - custom_onnx_configs={}, - custom_architecture=False, - _variant="default", + if pt_compress_weights: + compression_modes = [] + for cw in args.compress_weights: + if is_int8_compression(cw): + compression_modes.append(cw) + assert compression_modes, "Only INT8 compression supported for PyTorch backend" + for idx, compress_mode in enumerate(compression_modes): + if idx > 0: + pt_model = StableDiffusionPipeline.from_pretrained(args.model_id) + models_and_export_configs = get_stable_diffusion_models_for_export(pt_model) + + target_models_and_export_configs = { + k: models_and_export_configs[k] for k in ("text_encoder", "unet", "vae_decoder") + } + compression_options = COMPRESSION_OPTIONS[compress_mode] + models_and_export_configs.update( + compress_torchmodels(target_models_and_export_configs, compression_options=compression_options) ) + output = ( Path(args.output_dir) / PYTORCH_DIR / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(args.precision) + / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=args.precision, compression=compress_mode) ) - for model_name in models_and_onnx_configs: - subcomponent = models_and_onnx_configs[model_name][0] - if hasattr(subcomponent, "save_config"): - subcomponent.save_config(output / model_name) - elif hasattr(subcomponent, "config") 
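
Condensed, the OpenVINO-side INT8 branch above amounts to the recipe below: reload the exported FP IR pipeline, compress each sub-model's weights with NNCF, and re-save. This is a sketch only; the directory names are placeholders, not the layout produced by the CLI flow:

```python
import nncf
from optimum.intel import OVStableDiffusionPipeline

model = OVStableDiffusionPipeline.from_pretrained("path/to/exported_fp16_model", compile=False)
model.text_encoder.model = nncf.compress_weights(model.text_encoder.model)
model.unet.model = nncf.compress_weights(model.unet.model)
model.vae_decoder.model = nncf.compress_weights(model.vae_decoder.model)
model.save_pretrained("path/to/exported_int8_model")
```
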
and hasattr( - subcomponent.config, "save_pretrained" - ): - subcomponent.config.save_pretrained(output / model_name) - - files_subpaths = [ - Path(name_dir) / OV_XML_FILE_NAME - for name_dir in models_and_onnx_configs - ] + convert_sd_prepared_for_export_common(pt_model, models_and_export_configs, output, args) + del pt_model + gc.collect() - # Saving the additional components needed to perform inference. - pt_model.scheduler.save_pretrained(output.joinpath("scheduler")) + if is_ov_compression(args): + for weigths_compression_option in args.compress_weights: + if not is_int8_compression(weigths_compression_option): + log.warning( + f"Weights compression {weigths_compression_option} is not supported for SD, will be ignored" + ) + continue + model = OVStableDiffusionPipeline.from_pretrained(output_dir, compile=False) + ov_int8_dir = get_compressed_path(args.output_dir, args.precision, weigths_compression_option) + model.text_encoder.model = nncf.compress_weights(model.text_encoder.model) + model.unet.model = nncf.compress_weights(model.unet.model) + model.vae_decoder.model = nncf.compress_weights(model.vae_decoder.model) + model.save_pretrained(ov_int8_dir) - feature_extractor = getattr(pt_model, "feature_extractor", None) - if feature_extractor is not None: - feature_extractor.save_pretrained(output.joinpath("feature_extractor")) + del model + gc.collect() - tokenizer = getattr(pt_model, "tokenizer", None) - if tokenizer is not None: - tokenizer.save_pretrained(output.joinpath("tokenizer")) - tokenizer_2 = getattr(pt_model, "tokenizer_2", None) - if tokenizer_2 is not None: - tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) +def convert_lcm(args): + pt_compress_weights = is_torch_compression(args) + pt_model = StableDiffusionPipeline.from_pretrained(args.model_id) + if args.save_orig: + pt_model.save_pretrained(Path(args.output_dir) / PYTORCH_DIR) - pt_model.save_config(output) + output_dir = Path(args.output_dir) / PYTORCH_DIR / OV_DIR / args.precision + models_and_export_configs = get_stable_diffusion_models_for_export(pt_model) + convert_sd_prepared_for_export_common(pt_model, models_and_export_configs, output_dir, args) - export_models( - models_and_onnx_configs=models_and_onnx_configs, - output_dir=output, - output_names=files_subpaths, + if pt_compress_weights: + compression_modes = [] + for cw in args.compress_weights: + if is_int8_compression(cw): + compression_modes.append(cw) + assert compression_modes, "Only INT8 compression supported for PyTorch backend" + for idx, compress_mode in enumerate(compression_modes): + if idx > 0: + pt_model = StableDiffusionPipeline.from_pretrained(args.model_id) + models_and_export_configs = get_stable_diffusion_models_for_export(pt_model) + + target_models_and_export_configs = { + k: models_and_export_configs[k] for k in ("text_encoder", "unet", "vae_decoder") + } + compression_options = COMPRESSION_OPTIONS[compress_mode] + models_and_export_configs.update( + compress_torchmodels(target_models_and_export_configs, compression_options=compression_options) ) - del pt_model - gc.collect() - - model = OVLatentConsistencyModelPipeline.from_pretrained( - args.model_id, export=True, compile=False - ) - end = time.perf_counter() - log.info(f"Conversion total time {end - start}s") - - if is_fp16(args): - model.half() - start1 = time.perf_counter() - model.save_pretrained(Path(args.output_dir) / PYTORCH_DIR / OV_DIR / args.precision) - end1 = time.perf_counter() - log.info(f"Serialization total time {end1 - start1}s") + output = ( + 
Path(args.output_dir) + / PYTORCH_DIR + / OV_DIR + / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=args.precision, compression=compress_mode) + ) + convert_sd_prepared_for_export_common(pt_model, models_and_export_configs, output, args) + del pt_model + gc.collect() if is_ov_compression(args): for weigths_compression_option in args.compress_weights: - if weigths_compression_option != "INT8": + if not is_int8_compression(weigths_compression_option): log.warning( - "Weights compression {weigths_compression_option} does not supported for LCM, will be ignored" + f"Weights compression {weigths_compression_option} is not supported for LCM, will be ignored" ) continue - ov_int8_dir = get_compressed_path( - args.output_dir, args.precision, weigths_compression_option - ) - model.text_encoder.model = compress_weights(model.text_encoder.model) - model.unet.model = compress_weights(model.unet.model) - model.vae_decoder.model = compress_weights(model.vae_decoder.model) + model = OVLatentConsistencyModelPipeline.from_pretrained(output_dir, compile=False) + ov_int8_dir = get_compressed_path(args.output_dir, args.precision, weigths_compression_option) + model.text_encoder.model = nncf.compress_weights(model.text_encoder.model) + model.unet.model = nncf.compress_weights(model.unet.model) + model.vae_decoder.model = nncf.compress_weights(model.vae_decoder.model) model.save_pretrained(ov_int8_dir) - # Saving the additional components needed to perform inference. - model.scheduler.save_pretrained(ov_int8_dir.joinpath("scheduler")) - - feature_extractor = getattr(model, "feature_extractor", None) - if feature_extractor is not None: - feature_extractor.save_pretrained( - ov_int8_dir.joinpath("feature_extractor") - ) - - tokenizer = getattr(model, "tokenizer", None) - if tokenizer is not None: - tokenizer.save_pretrained(ov_int8_dir.joinpath("tokenizer")) - - tokenizer_2 = getattr(model, "tokenizer_2", None) - if tokenizer_2 is not None: - tokenizer_2.save_pretrained(ov_int8_dir.joinpath("tokenizer_2")) - - model.save_config(ov_int8_dir) - - del model - gc.collect() + del model + gc.collect() def convert_sdxl(args): @@ -607,142 +722,85 @@ def convert_sdxl(args): def build_pt_model(model_id): model_ids = [idx.replace(" ", "") for idx in model_id.split(",")] pt_model = StableDiffusionXLImg2ImgPipeline.from_pretrained(model_ids[0]) - tiny_vae = False if len(model_ids) > 1: for additional_model in model_ids[1:]: if "lora" in additional_model: pt_model.load_lora_weights(additional_model) pt_model.fuse_lora() if "lcm" in additional_model: - pt_model.scheduler = LCMScheduler.from_config( - pt_model.scheduler.config - ) + pt_model.scheduler = LCMScheduler.from_config(pt_model.scheduler.config) continue if "lcm" in additional_model and "lora" not in additional_model: unet = UNet2DConditionModel.from_pretrained(additional_model) pt_model.unet = unet - pt_model.scheduler = LCMScheduler.from_config( - pt_model.scheduler.config - ) + pt_model.scheduler = LCMScheduler.from_config(pt_model.scheduler.config) continue if "tae" in additional_model: - tiny_vae = True vae = AutoencoderTiny.from_pretrained(additional_model) pt_model.vae = vae continue - preprocessors = maybe_load_preprocessors(model_ids[0]) - return pt_model, preprocessors, tiny_vae - - def convert_pt_to_ov(pt_model, preprocessors, output_dir, fp16, tiny_vae): - _, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs( - model=pt_model, - task="stable-diffusion-xl", - monolith=False, - custom_onnx_configs={}, - custom_architecture=False, - 
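
`build_pt_model` accepts a comma-separated `model_id` and layers optional add-ons onto the base SDXL pipeline. A hedged illustration of each branch with example Hub ids (the exact ids are not prescribed by the patch):

```python
from diffusers import AutoencoderTiny, LCMScheduler, StableDiffusionXLImg2ImgPipeline

pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")

# "lora" (with "lcm" in the name): fuse the adapter and switch to the LCM scheduler.
pipe.load_lora_weights("latent-consistency/lcm-lora-sdxl")
pipe.fuse_lora()
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# "tae": replace the full VAE with a tiny autoencoder.
pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl")
```
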
_variant="default", - preprocessors=preprocessors, - legacy=False, - ) - if tiny_vae: - models_and_onnx_configs["vae_encoder"][0].forward = lambda sample: { - "latent_sample": models_and_onnx_configs["vae_encoder"][0].encode( - x=sample - )["latents"] - } - models_and_onnx_configs["vae_decoder"][ - 0 - ].forward = lambda latent_sample: models_and_onnx_configs["vae_decoder"][ - 0 - ].decode( - latent_sample - ) - for model_name in models_and_onnx_configs: - subcomponent = models_and_onnx_configs[model_name][0] - - if hasattr(subcomponent, "save_config"): - subcomponent.save_config(output_dir / model_name) - elif hasattr(subcomponent, "config") and hasattr( - subcomponent.config, "save_pretrained" - ): - subcomponent.config.save_pretrained(output_dir / model_name) - - files_subpaths = [ - Path(name_dir) / OV_XML_FILE_NAME for name_dir in models_and_onnx_configs - ] - - # Saving the additional components needed to perform inference. - pt_model.scheduler.save_pretrained(output_dir.joinpath("scheduler")) - - feature_extractor = getattr(pt_model, "feature_extractor", None) - if feature_extractor is not None: - feature_extractor.save_pretrained(output_dir.joinpath("feature_extractor")) - - tokenizer = getattr(pt_model, "tokenizer", None) - if tokenizer is not None: - tokenizer.save_pretrained(output_dir.joinpath("tokenizer")) - - tokenizer_2 = getattr(pt_model, "tokenizer_2", None) - if tokenizer_2 is not None: - tokenizer_2.save_pretrained(output_dir.joinpath("tokenizer_2")) - - pt_model.save_config(output_dir) + return pt_model - export_models( - models_and_onnx_configs=models_and_onnx_configs, - output_dir=output_dir, - output_names=files_subpaths, - compression_option="fp16" if fp16 else None, - ) - - pt_model, preprocessors, tiny_vae = build_pt_model(args.model_id) + pt_model = build_pt_model(args.model_id) if args.save_orig: pt_model.save_pretrained(Path(args.output_dir) / PYTORCH_DIR) - if pt_compress_weights: - output = ( - Path(args.output_dir) - / PYTORCH_DIR - / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(args.precision) - ) - pt_model.text_encoder = compress_weights(pt_model.text_encoder) - pt_model.unet = compress_weights(pt_model.unet) - pt_model.vae = compress_weights(pt_model.vae) - if getattr(pt_model, "text_encoder_2", None) is not None: - pt_model.text_encoder_2 = compress_weights(pt_model.text_encoder_2) - convert_pt_to_ov(pt_model, output, is_fp16(args), tiny_vae) + del pt_model gc.collect() - pt_model, preprocessors, tiny_vae = build_pt_model(args.model_id) + pt_model = build_pt_model(args.model_id) fp_out_dir = Path(args.output_dir) / PYTORCH_DIR / OV_DIR / args.precision - convert_pt_to_ov(pt_model, preprocessors, fp_out_dir, is_fp16(args), tiny_vae) + models_and_export_configs = get_stable_diffusion_models_for_export(pt_model) + convert_sd_prepared_for_export_common(pt_model, models_and_export_configs, fp_out_dir, args) + + if pt_compress_weights: + compression_modes = [] + for cw in args.compress_weights: + if is_int8_compression(cw): + compression_modes.append(cw) + assert compression_modes, "Only INT8 compression supported for PyTorch backend" + for idx, compress_mode in enumerate(compression_modes): + if idx > 0: + pt_model = build_pt_model(args.model_id) + models_and_export_configs = get_stable_diffusion_models_for_export(pt_model) + + compression_options = COMPRESSION_OPTIONS[compress_mode] + models_and_export_configs = compress_torchmodels( + models_and_export_configs, compression_options=compression_options + ) + + output = ( + Path(args.output_dir) + / 
PYTORCH_DIR + / OV_DIR + / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=args.precision, compression=compress_mode) + ) + + convert_sd_prepared_for_export_common(pt_model, models_and_export_configs, output, args) + + del pt_model + gc.collect() if is_ov_compression(args): for weigths_compression_option in args.compress_weights: - if weigths_compression_option != "INT8": + if not is_int8_compression(weigths_compression_option): log.warning( - "Weights compression {weigths_compression_option} does not supported for SDXL, will be ignored" + f"Weights compression {weigths_compression_option} is not supported for SDXL, will be ignored" ) continue - ov_int8_dir = get_compressed_path( - args.output_dir, args.precision, weigths_compression_option - ) - model = OVStableDiffusionXLPipeline.from_pretrained( - fp_out_dir, compile=False - ) - model.text_encoder.model = compress_weights(model.text_encoder.model) + ov_int8_dir = get_compressed_path(args.output_dir, args.precision, weigths_compression_option) + compression_options = COMPRESSION_OPTIONS[weigths_compression_option] + model = OVStableDiffusionXLPipeline.from_pretrained(fp_out_dir, compile=False) + model.text_encoder.model = nncf.compress_weights(model.text_encoder.model, **compression_options) if getattr(model, "text_encoder_2", None) is not None: - model.text_encoder_2.model = compress_weights( - model.text_encoder_2.model - ) - model.unet.model = compress_weights(model.unet.model) - model.vae_decoder.model = compress_weights(model.vae_decoder.model) + model.text_encoder_2.model = nncf.compress_weights(model.text_encoder_2.model, **compression_options) + model.unet.model = nncf.compress_weights(model.unet.model) + model.vae_decoder.model = nncf.compress_weights(model.vae_decoder.model, **compression_options) if getattr(model, "vae_encoder", None) is not None: - model.vae_encoder.model = compress_weights(model.vae_encoder.model) + model.vae_encoder.model = nncf.compress_weights(model.vae_encoder.model, **compression_options) model.save_pretrained(ov_int8_dir) del model @@ -753,10 +811,10 @@ def convert_ldm_super_res(args): pipeline = LDMSuperResolutionPipeline.from_pretrained(args.model_id) if args.save_orig: pipeline.save_pretrained(Path(args.output_dir) / PYTORCH_DIR) - unet_example_input = [ + unet_example_input = ( torch.zeros((1, 6, 128, 128)), torch.tensor(1, dtype=torch.int32), - ] + ) class Decoder(torch.nn.Module): def __init__(self, model): @@ -768,37 +826,10 @@ def forward(self, latents): decoder = Decoder(pipeline.vqvae) - pt_compress_weights = is_torch_compression(args) compress_to_fp16 = is_fp16(args) - if pt_compress_weights: - compressed_unet = compress_weights(pipeline.unet) - ov_compressed_unet = convert_model( - compressed_unet, example_input=unet_example_input - ) - ov_compressed_unet.inputs[1].get_node().set_element_type(Type.i32) - ov_compressed_unet.inputs[1].get_node().set_partial_shape(PartialShape([])) - ov_compressed_unet.validate_nodes_and_infer_types() - pt_out_dir = ( - Path(args.output_dir) - / PYTORCH_DIR - / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(args.precision) - ) - save_model( - ov_compressed_unet, - pt_out_dir / "unet.xml", - compress_to_fp16=compress_to_fp16, - ) - pipeline.scheduler.save_config(pt_out_dir) - # Couldn't compress decoder weights (RuntimeError: cdist only supports floating-point dtypes, X2 got: Byte) - ov_decoder = convert_model(decoder, example_input=torch.zeros((1, 3, 128, 128))) - save_model( - ov_decoder, pt_out_dir / "vqvae.xml", compress_to_fp16=compress_to_fp16 - ) - # 
convert model to OpenVINO IR ov_unet = convert_model(pipeline.unet, example_input=unet_example_input) - ov_unet.inputs[1].get_node().set_element_type(Type.i32) + ov_unet.inputs[1].get_node().set_element_type(OVType.i32) ov_unet.inputs[1].get_node().set_partial_shape(PartialShape([])) ov_unet.validate_nodes_and_infer_types() save_dir = Path(args.output_dir) / PYTORCH_DIR / OV_DIR / args.precision @@ -806,24 +837,65 @@ def forward(self, latents): ov_decoder = convert_model(decoder, example_input=torch.zeros((1, 3, 128, 128))) save_model(ov_decoder, save_dir / "vqvae.xml", compress_to_fp16=compress_to_fp16) pipeline.scheduler.save_config(save_dir) + del ov_unet, ov_decoder + gc.collect() + + pt_compress_weights = is_torch_compression(args) + if pt_compress_weights: + compression_modes = [] + for cw in args.compress_weights: + if is_int8_compression(cw): + compression_modes.append(cw) + assert compression_modes, "Only INT8 compression supported for PyTorch backend" + for idx, compress_mode in enumerate(compression_modes): + if idx > 0: + pipeline = LDMSuperResolutionPipeline.from_pretrained(args.model_id) + decoder = Decoder(pipeline.vqvae) + + compression_options = COMPRESSION_OPTIONS[compress_mode] + compressed_unet = nncf.compress_weights( + pipeline.unet, dataset=nncf.Dataset([unet_example_input]), **compression_options + ) + ov_compressed_unet = convert_model(compressed_unet, example_input=unet_example_input) + ov_compressed_unet.inputs[1].get_node().set_element_type(OVType.i32) + ov_compressed_unet.inputs[1].get_node().set_partial_shape(PartialShape([])) + ov_compressed_unet.validate_nodes_and_infer_types() + pt_out_dir = ( + Path(args.output_dir) + / PYTORCH_DIR + / OV_DIR + / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=args.precision, compression=compress_mode) + ) + save_model( + ov_compressed_unet, + pt_out_dir / "unet.xml", + compress_to_fp16=compress_to_fp16, + ) + pipeline.scheduler.save_config(pt_out_dir) + decoder_example_input = torch.zeros(1, 3, 128, 128) + compressed_decoder = nncf.compress_weights( + decoder, dataset=nncf.Dataset([decoder_example_input]), **compression_options + ) + ov_compressed_decoder = convert_model(compressed_decoder, example_input=decoder_example_input) + save_model(ov_compressed_decoder, pt_out_dir / "vqvae.xml", compress_to_fp16=compress_to_fp16) if is_ov_compression(args): for weigths_compression_option in args.compress_weights: - if weigths_compression_option != "INT8": + if not is_int8_compression(weigths_compression_option): log.warning( - "Weights compression {weigths_compression_option} does not supported for LDM, will be ignored" + f"Weights compression {weigths_compression_option} is not supported for LDM, will be ignored" ) continue - ov_int8_dir = get_compressed_path( - args.output_dir, args.precision, weigths_compression_option - ) - compressed_ov_unet = compress_weights(ov_unet) + ov_int8_dir = get_compressed_path(args.output_dir, args.precision, weigths_compression_option) + ov_unet = Core().read_model(save_dir / "unet.xml") + compressed_ov_unet = nncf.compress_weights(ov_unet) save_model( compressed_ov_unet, ov_int8_dir / "unet.xml", compress_to_fp16=compress_to_fp16, ) - compressed_ov_decoder = compress_weights(ov_decoder) + ov_decoder = Core().read_model(save_dir / "vqvae.xml") + compressed_ov_decoder = nncf.compress_weights(ov_decoder) save_model( compressed_ov_decoder, ov_int8_dir / "vqvae.xml", @@ -833,6 +905,7 @@ def forward(self, latents): def convert_mpt(args): + @torch.no_grad def convert_to_ov(pt_model, tok, out_path, 
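
The UNet conversion above follows a pattern worth calling out: trace with `openvino.convert_model(..., example_input=...)`, then pin the timestep input to a scalar `i32` before saving. A self-contained sketch with a stand-in module (the real super-resolution UNet is far larger, of course):

```python
import torch
import openvino as ov
from openvino.runtime import PartialShape, Type as OVType


class TinyUNet(torch.nn.Module):
    """Stand-in with the same (sample, timestep) calling convention."""

    def forward(self, sample, timestep):
        return sample[:, :3] * timestep.to(sample.dtype)


example_input = (torch.zeros((1, 6, 128, 128)), torch.tensor(1, dtype=torch.int32))
ov_unet = ov.convert_model(TinyUNet(), example_input=example_input)
ov_unet.inputs[1].get_node().set_element_type(OVType.i32)          # timestep as int32 ...
ov_unet.inputs[1].get_node().set_partial_shape(PartialShape([]))   # ... and a scalar
ov_unet.validate_nodes_and_infer_types()
ov.save_model(ov_unet, "unet.xml", compress_to_fp16=True)
```
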
compress_to_fp16=False): pt_model.config.use_cache = True outs = pt_model( @@ -846,13 +919,9 @@ def convert_to_ov(pt_model, tok, out_path, compress_to_fp16=False): dynamic_shapes = {"input_ids": {1: "seq_len"}, "attention_mask": {1: "seq_len"}} for idx in range(len(outs.past_key_values)): - inputs.extend( - [f"past_key_values.{idx}.key", f"past_key_values.{idx}.value"] - ) + inputs.extend([f"past_key_values.{idx}.key", f"past_key_values.{idx}.value"]) dynamic_shapes[inputs[-1]] = {2: "past_sequence + sequence"} - dynamic_shapes[inputs[-2]] = { - 3 if not old else 2: "past_sequence + sequence" - } + dynamic_shapes[inputs[-2]] = {3 if not old else 2: "past_sequence + sequence"} outputs.extend([f"present.{idx}.key", f"present.{idx}.value"]) inputs.append("attention_mask") @@ -882,12 +951,10 @@ def ts_patched_forward( ov_model = convert_model(pt_model, example_input=dummy_inputs) pt_model.forward = orig_forward - for inp_name, m_input, input_data in zip( - inputs, ov_model.inputs, flattenize_inputs(dummy_inputs.values()) - ): + for inp_name, m_input, input_data in zip(inputs, ov_model.inputs, flattenize_inputs(dummy_inputs.values())): input_node = m_input.get_node() - if input_node.element_type == Type.dynamic: - m_input.get_node().set_element_type(Type.f32) + if input_node.element_type == OVType.dynamic: + m_input.get_node().set_element_type(OVType.f32) shape = list(input_data.shape) if inp_name in dynamic_shapes: for k in dynamic_shapes[inp_name]: @@ -898,11 +965,15 @@ def ts_patched_forward( for out, out_name in zip(ov_model.outputs, outputs): out.get_tensor().set_names({out_name}) - save_ov_model_helper( - ov_model, out_path, fp16=compress_to_fp16, tok=tok, config=pt_model.config - ) + save_ov_model_helper(ov_model, out_path, fp16=compress_to_fp16, tok=tok, config=pt_model.config) - config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) + remote_code = False + pt_model = None + try: + config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=False) + except Exception: + config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) + remote_code = True cuda, post_init = patch_gptq(config) model_kwargs = {} precision = args.precision @@ -913,9 +984,7 @@ def ts_patched_forward( and is_ov_model_provided(args.model_id, args.output_dir, args.precision) ) gptq_applied = is_gptq(config) - precision = ( - precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) - ) + precision = precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) if post_init is not None: model_kwargs = {"torch_dtype": torch.float32} pt_model = None @@ -923,11 +992,19 @@ def ts_patched_forward( tok = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True) compress_to_fp16 = is_fp16(args) if not compression_only: - pt_model = AutoModelForCausalLM.from_pretrained( - args.model_id, trust_remote_code=True, config=config, **model_kwargs - ) - pt_model.config.use_cache = True - pt_model.eval() + + def create_model(model_id, config, model_kwargs): + pt_model = AutoModelForCausalLM.from_pretrained( + model_id, trust_remote_code=remote_code, config=config, **model_kwargs + ) + pt_model.config.use_cache = True + pt_model.eval() + return pt_model + + pt_model = create_model(args.model_id, config, model_kwargs) + + if not remote_code: + return convert_optimum_causallm_base(pt_model, args, config, compression_only) if args.save_orig: pt_out_dir = Path(args.output_dir) / PYTORCH_DIR @@ -939,16 +1016,37 @@ def ts_patched_forward( 
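
The post-conversion loop above does three things to the freshly converted MPT graph: resolves any still-dynamic element types, marks the cache-length dimension as dynamic, and gives every port a stable `past_key_values.*` / `present.*` name. For a single port the same steps look like this (shapes are illustrative placeholders; `ov_model` would come from `openvino.convert_model`):

```python
from openvino.runtime import PartialShape, Type as OVType

kv_input = ov_model.inputs[1]                      # e.g. the first cached key tensor
if kv_input.get_node().element_type == OVType.dynamic:
    kv_input.get_node().set_element_type(OVType.f32)

shape = [1, 8, -1, 128]                            # -1 marks "past_sequence + sequence" (illustrative)
kv_input.get_node().set_partial_shape(PartialShape(shape))
kv_input.get_tensor().set_names({"past_key_values.0.key"})

ov_model.outputs[1].get_tensor().set_names({"present.0.key"})
ov_model.validate_nodes_and_infer_types()
```
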
convert_to_ov(pt_model, tok, ov_dir, compress_to_fp16) if is_torch_compression(args): - compressed_pt_model = compress_weights(pt_model) - pt_path = ( - Path(args.output_dir) - / PYTORCH_DIR - / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision) - ) - convert_to_ov(compressed_pt_model, tok, pt_path, compress_to_fp16) + compression_modes = [] + for cw in args.compress_weights: + if is_int8_compression(cw): + compression_modes.append(cw) + assert compression_modes, "Only INT8 compression supported for PyTorch backend" + + dummy_inputs = { + "input_ids": torch.ones((1, 10), dtype=torch.long), + "attention_mask": torch.ones((1, 10), dtype=torch.long), + } + + for idx, compress_mode in enumerate(compression_modes): + if idx > 0: + pt_model = create_model(args.model_id, config, model_kwargs) + + compression_options = COMPRESSION_OPTIONS[compress_mode] + compressed_pt_model = nncf.compress_weights( + pt_model, dataset=nncf.Dataset([dummy_inputs]), **compression_options + ) + + pt_path = ( + Path(args.output_dir) + / PYTORCH_DIR + / OV_DIR + / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=precision, compression=compress_mode) + ) + convert_to_ov(compressed_pt_model, tok, pt_path, compress_to_fp16) if is_ov_compression(args): + if not remote_code: + return convert_optimum_causallm_base(pt_model, args, config, compression_only) ov_path = get_fp_path(args, "openvino_model.xml") if compression_only: log.info( @@ -958,9 +1056,7 @@ def ts_patched_forward( ov_model = Core().read_model(ov_path) for compress_option in args.compress_weights: log.info(f"Compress model weights to {compress_option}") - ov_compressed_path = get_compressed_path( - args.output_dir, args.precision, compress_option - ) + ov_compressed_path = get_compressed_path(args.output_dir, args.precision, compress_option) compress_ov_model_weights_helper( ov_model, tok, @@ -976,27 +1072,26 @@ def ts_patched_forward( def convert_chatglm(args): - def convert_to_ov(pt_model, tok, out_path, compress_to_fp16=False): - pt_model.config.torchscript = True + def make_dummy_input(): last_token = torch.tensor([[130328]]) past = torch.zeros(28, 2, 5, 1, 32, 128) position_ids = torch.tensor([[[2], [4]]]) - dummy_input = { + return { "input_ids": last_token, "past_key_values": past, "position_ids": position_ids, } + + def convert_to_ov(pt_model, tok, out_path, compress_to_fp16=False): + pt_model.config.torchscript = True + dummy_input = make_dummy_input() ov_model = convert_model(pt_model, example_input=dummy_input) ov_model.outputs[0].get_tensor().set_names({"logits"}) for i in range(1, len(ov_model.outputs), 2): idx = (i - 1) // 2 ov_model.outputs[i].get_tensor().set_names({f"present.{int(idx)}.key"}) - ov_model.outputs[i + 1].get_tensor().set_names( - {f"present.{int(idx)}.value"} - ) - save_ov_model_helper( - ov_model, out_path, fp16=compress_to_fp16, tok=tok, config=pt_model.config - ) + ov_model.outputs[i + 1].get_tensor().set_names({f"present.{int(idx)}.value"}) + save_ov_model_helper(ov_model, out_path, fp16=compress_to_fp16, tok=tok, config=pt_model.config) config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) cuda, post_init = patch_gptq(config) @@ -1010,21 +1105,22 @@ def convert_to_ov(pt_model, tok, out_path, compress_to_fp16=False): ) compress_to_fp16 = is_fp16(args) gptq_applied = is_gptq(config) - precision = ( - precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) - ) + precision = precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) tokenizer_id = args.tokenizer_id 
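
Note that, unlike the OpenVINO branch, the PyTorch-backend compression above passes example inputs through `nncf.Dataset`, which NNCF uses to trace the torch module. Reduced to its core, with `pt_model` standing for any causal LM already in eval mode and the default INT8 options in place of `COMPRESSION_OPTIONS`:

```python
import torch
import nncf

dummy_inputs = {
    "input_ids": torch.ones((1, 10), dtype=torch.long),
    "attention_mask": torch.ones((1, 10), dtype=torch.long),
}
# Data-aware weight compression of the PyTorch module itself.
compressed_pt_model = nncf.compress_weights(pt_model, dataset=nncf.Dataset([dummy_inputs]))
```
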
or args.model_id tok = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True) ov_out_path = Path(args.output_dir) / PYTORCH_DIR / OV_DIR / precision if post_init is not None: model_kwargs = {"torch_dtype": torch.float32} if not compression_only: - pt_model = AutoModel.from_pretrained( - args.model_id, trust_remote_code=True, config=config, **model_kwargs - ) - pt_model.config.use_cache = True - pt_model.to(torch.float32) - pt_model.eval() + + def create_model(model_id, config, model_kwargs): + pt_model = AutoModel.from_pretrained(model_id, trust_remote_code=True, config=config, **model_kwargs) + pt_model.config.use_cache = True + pt_model.to(torch.float32) + pt_model.eval() + return pt_model + + pt_model = create_model(args.model_id, config, model_kwargs) if args.save_orig: pt_out_dir = Path(args.output_dir) / PYTORCH_DIR @@ -1034,14 +1130,29 @@ def convert_to_ov(pt_model, tok, out_path, compress_to_fp16=False): pt_compress_weights = is_torch_compression(args) if pt_compress_weights: - compressed_pt_model = compress_weights(pt_model) - pt_out_path = ( - Path(args.output_dir) - / PYTORCH_DIR - / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision) - ) - convert_to_ov(compressed_pt_model, tok, pt_out_path) + compression_modes = [] + for cw in args.compress_weights: + if is_int8_compression(cw): + compression_modes.append(cw) + assert compression_modes, "Only INT8 compression supported for PyTorch backend" + + dummy_input = make_dummy_input() + for idx, compress_mode in enumerate(compression_modes): + if idx > 0: + pt_model = create_model(args.model_id, config, model_kwargs) + + compression_options = COMPRESSION_OPTIONS[compress_mode] + compressed_pt_model = nncf.compress_weights( + pt_model, dataset=nncf.Dataset([dummy_input]), **compression_options + ) + + pt_out_path = ( + Path(args.output_dir) + / PYTORCH_DIR + / OV_DIR + / PYTORCH_COMPRESS_WEIGHTS_DIR.format(precision=precision, compression=compress_mode) + ) + convert_to_ov(compressed_pt_model, tok, pt_out_path) if is_ov_compression(args): ov_model_path = get_fp_path(args, "openvino_model.xml") @@ -1053,9 +1164,7 @@ def convert_to_ov(pt_model, tok, out_path, compress_to_fp16=False): ov_model = Core().read_model(ov_model_path) for compress_option in args.compress_weights: log.info(f"Compress model weights to {compress_option}") - ov_compressed_path = get_compressed_path( - args.output_dir, args.precision, args.compress_weights - ) + ov_compressed_path = get_compressed_path(args.output_dir, args.precision, args.compress_weights) compress_ov_model_weights_helper( ov_model, tok, @@ -1071,120 +1180,68 @@ def convert_to_ov(pt_model, tok, out_path, compress_to_fp16=False): def convert_falcon(args): - def convert_to_ov(pt_model, tok, out_path, compress_to_fp16=False): - outs = pt_model(input_ids=torch.ones((1, 10), dtype=torch.long)) - inputs = ["input_ids"] - outputs = ["logits"] - - dynamic_shapes = {"input_ids": {1: "seq_len"}} - - for idx in range(len(outs.past_key_values)): - inputs.extend( - [f"past_key_values.{idx}.key", f"past_key_values.{idx}.value"] - ) - dynamic_shapes[inputs[-1]] = {1: "past_sequence + sequence"} - dynamic_shapes[inputs[-2]] = {1: "past_sequence + sequence"} - outputs.extend([f"present.{idx}.key", f"present.{idx}.value"]) - - dummy_inputs = { - "input_ids": torch.ones((1, 2), dtype=torch.long), - "past_key_values": outs.past_key_values, - } - flatten_inputs = flattenize_inputs(dummy_inputs.values()) - pt_model.config.torchscript = True - ov_model = convert_model(pt_model, 
example_input=dummy_inputs) - for port, input_data, input_name in zip( - ov_model.inputs[1:], flatten_inputs[1:], inputs[1:] - ): - port.get_node().set_element_type(Type.f32) - shape = list(input_data.shape) - shape[2] = -1 - port.get_node().set_partial_shape(PartialShape(shape)) - port.get_tensor().set_names({input_name}) - for idx, out_name in enumerate(outputs): - ov_model.outputs[idx].get_tensor().set_names({out_name}) - ov_model.validate_nodes_and_infer_types() - save_ov_model_helper( - ov_model, out_path, fp16=compress_to_fp16, tok=tok, config=pt_model.config - ) - - config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) + config = AutoConfig.from_pretrained(args.model_id) cuda, post_init = patch_gptq(config) model_kwargs = {} precision = args.precision - config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) - cuda, post_init = patch_gptq(config) - model_kwargs = {} compression_only = ( args.compress_weights and not args.force_convert and not is_torch_compression(args) and is_ov_model_provided(args.model_id, args.output_dir, args.precision) ) - gptq_applied = is_gptq(config) if post_init is not None: model_kwargs = {"torch_dtype": torch.float32} pt_model = None - tokenizer_id = args.tokenizer_id or args.model_id - tok = AutoTokenizer.from_pretrained(tokenizer_id, trust_remote_code=True) gptq_applied = is_gptq(config) - precision = ( - precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) - ) - if post_init is not None: - model_kwargs = {"torch_dtype": torch.float32} - pt_model = None - compress_to_fp16 = is_fp16(args) + precision = precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) if not compression_only: pt_model = AutoModelForCausalLM.from_pretrained( args.model_id, - config=AutoConfig.from_pretrained(args.model_id, trust_remote_code=True), - trust_remote_code=True, + config=AutoConfig.from_pretrained(args.model_id), **model_kwargs, ) pt_model.config.use_cache = True pt_model.eval() - if args.save_orig: - pt_out_dir = Path(args.output_dir) / PYTORCH_DIR - pt_model.save_pretrained(pt_out_dir) - save_tokenizer(tok, pt_out_dir) + convert_optimum_causallm_base(pt_model, args, config, compression_only) - ov_out_path = Path(args.output_dir) / PYTORCH_DIR / OV_DIR / args.precision - convert_to_ov(pt_model, tok, ov_out_path, compress_to_fp16) + if post_init is not None: + unpatch_gptq(cuda, post_init) - if is_torch_compression(args): - pt_compressed_model = compress_weights(pt_model) - pt_comp_path = ( - Path(args.output_dir) - / PYTORCH_DIR - / OV_DIR - / PYTORCH_COMPRESS_WEIGHTS_DIR.format(args.precision) - ) - convert_to_ov(pt_compressed_model, tok, pt_comp_path, compress_to_fp16) - if is_ov_compression(args): - fp_path = get_fp_path(args, "openvino_model.xml") - if compression_only: - log.info( - f"Model conversion to {args.precision} will be skipped as found converted model {fp_path}. 
" - "If it is not expected behaviour, please remove previously converted model or use --force_convert option" - ) - ov_model = Core().read_model(fp_path) - for compress_option in args.compress_weights: - log.info(f"Compress model weights to {compress_option}") - ov_compressed_path = get_compressed_path( - args.output_dir, args.precision, compress_option - ) - compress_ov_model_weights_helper( - ov_model, - tok, - pt_model.config, - ov_compressed_path, - compress_to_fp16, - compress_option, - args, - ) +def convert_phi(args): + trust_remote_code = False + try: + config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=False) + except Exception: + config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) + trust_remote_code = True + cuda, post_init = patch_gptq(config) + model_kwargs = {} + model_kwargs["trust_remote_code"] = trust_remote_code + precision = args.precision + compression_only = ( + args.compress_weights + and not args.force_convert + and not is_torch_compression(args) + and is_ov_model_provided(args.model_id, args.output_dir, args.precision) + ) + if post_init is not None: + model_kwargs["torch_dtype"] = torch.float32 + pt_model = None + gptq_applied = is_gptq(config) + precision = precision if not gptq_applied else GPTQ_DIR.format(precision=args.precision) + if not compression_only: + pt_model = AutoModelForCausalLM.from_pretrained( + args.model_id, + config=config, + **model_kwargs, + ) + pt_model.config.use_cache = True + pt_model.eval() + + convert_optimum_causallm_base(pt_model, args, config, compression_only) if post_init is not None: unpatch_gptq(cuda, post_init) @@ -1204,9 +1261,7 @@ def convert_baichaun(args): model_kwargs = {"torch_dtype": torch.float32} model = None if not compression_only: - model = AutoModelForCausalLM.from_pretrained( - args.model_id, trust_remote_code=True, **model_kwargs - ) + model = AutoModelForCausalLM.from_pretrained(args.model_id, trust_remote_code=True, **model_kwargs) try: model.to(torch.float32) if post_init is None: @@ -1225,7 +1280,7 @@ def convert_baichaun(args): def convert_qwen(args): config = AutoConfig.from_pretrained(args.model_id, trust_remote_code=True) cuda, post_init = patch_gptq(config) - model_kwargs = {"revision": "2abd8e5777bb4ce9c8ab4be7dbbd0fe4526db78d"} + model_kwargs = {} precision = args.precision compression_only = ( args.compress_weights @@ -1236,7 +1291,6 @@ def convert_qwen(args): if post_init is not None: model_kwargs = { "torch_dtype": torch.float32, - "revision": "c02ede58c0ab0045f5e4788c35842bec6a7baa0a", } model = None if not compression_only: @@ -1256,13 +1310,22 @@ def convert_codegen2(args): if config.model_type == "codegen": config.model_type = "codegen2" cuda, post_init = patch_gptq(config) - pt_model = AutoModelForCausalLM.from_pretrained( - args.model_id, - trust_remote_code=True, - config=AutoConfig.from_pretrained(args.model_id, trust_remote_code=True), + precision = args.precision + compression_only = ( + args.compress_weights + and not args.force_convert + and not is_torch_compression(args) + and is_ov_model_provided(args.model_id, args.output_dir, precision) ) - pt_model.config = config - convert_optimum_causallm_base(pt_model, args, model_config=config) + pt_model = None + if not compression_only: + pt_model = AutoModelForCausalLM.from_pretrained( + args.model_id, + trust_remote_code=True, + config=AutoConfig.from_pretrained(args.model_id, trust_remote_code=True), + ) + pt_model.config = config + convert_optimum_causallm_base(pt_model, args, config, 
compression_only) if post_init is not None: unpatch_gptq(cuda, post_init) @@ -1296,6 +1359,7 @@ def convert_aquilachat(args): "lcm": convert_lcm, "ldm": convert_ldm_super_res, "mpt": convert_mpt, + "phi-": convert_phi, "replit": convert_mpt, "chatglm2": convert_causal_lm, "chatglm3": convert_causal_lm, @@ -1320,24 +1384,16 @@ def get_convert_model_type(model_id): def main(): - log.basicConfig( - format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout - ) + log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout) parser = ArgumentParser() - parser.add_argument( - "-m", "--model_id", required=True, help="model_id or directory for loading" - ) + parser.add_argument("-m", "--model_id", required=True, help="model_id or directory for loading") parser.add_argument( "--tokenizer_id", required=False, help="tokenizer id or directory for loading. If not provided, model_id will be used by default", ) - parser.add_argument( - "-o", "--output_dir", required=True, help="output directory for saving model" - ) - parser.add_argument( - "--save_orig", action="store_true", help="save pytorch model on disk" - ) + parser.add_argument("-o", "--output_dir", required=True, help="output directory for saving model") + parser.add_argument("--save_orig", action="store_true", help="save pytorch model on disk") parser.add_argument( "-p", "--precision", @@ -1345,21 +1401,21 @@ def main(): default="FP32", help="base conversion precision", ) - parser.add_argument( - "--force_convert", action="store_true", help="Force model conversion" - ) + parser.add_argument("--force_convert", action="store_true", help="Force model conversion") compression_group = parser.add_argument_group("Weights compression parameters") compression_group.add_argument( "-c", "--compress_weights", type=str, - choices=["INT8", "4BIT_DEFAULT", "INT4_SYM", "INT4_ASYM"], + choices=["INT8", "INT8_ASYM", "INT8_SYM", "4BIT_DEFAULT", "4BIT_MAXIMUM", "INT4_SYM", "INT4_ASYM", "E2M1"], nargs="+", help=( - "The weight compression option, e.g. INT8 - INT8 weights, " - "4BIT_DEFAULT - for 4-bit compression with predefined configs, " - "INT4_* - for INT4 compressed weights." + "The weight compression option, e.g. INT8 - INT8 weights (deprecated, please use INT8_ASYM instead), " + "4BIT_DEFAULT - for 4-bit compression with predefined configs with performance-accuracy trade-off, " + "4BIT_MAXIMUM - for 4-bit compression with predefined configs for the best performance, " + "INT4_* - for INT4 compressed weights, " + "E2M1 - for fp4 compression with fp8 (e8m0) scales." ), ) compression_group.add_argument( @@ -1387,9 +1443,29 @@ def main(): action="store_true", help="Compress all layers including embeddings and prediction head", ) + compression_group.add_argument( + "--dataset", + help=( + "Dataset name for data-aware compression. Must be one of ['wikitext2', 'c4', 'c4-new']." + ), + default=None, + type=str, + ) + compression_group.add_argument( + "--awq", + action="store_true", + help="Apply AWQ algorithm during compression", + ) + compression_group.add_argument( + "--scale_estimation", + action="store_true", + help="Apply scale estimation algorithm during compression", + ) add_stateful_model_arguments(parser) args = parser.parse_args() + log.warning("[DEPRECATED] Not for production use! Please use the 'optimum-intel' to generate the IRs. 
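
The `converters` table maps a model-type key (here extended with `"phi-"`) to its converter; the body of `get_convert_model_type` is outside this hunk, but the registration implies a substring dispatch roughly like the sketch below. This is an assumption about its behaviour, not the patch's actual code:

```python
def get_convert_model_type(model_id):
    # Assumed behaviour: the first registered key contained in the lowered model id wins.
    for key in converters:
        if key in model_id:
            return key
    return "decoder"  # assumed generic fallback for plain causal LMs
```
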
For details, please check:" + " https://github.com/openvinotoolkit/openvino.genai/blob/master/llm_bench/python/README.md#2-convert-model-to-openvino-ir-format") log.info(f"openvino runtime version: {get_version()}") model_type = get_convert_model_type(args.model_id.lower()) converter = converters[model_type] diff --git a/llm_bench/python/doc/NOTES.md b/llm_bench/python/doc/NOTES.md index 90968abf74..8d84b4e8c8 100644 --- a/llm_bench/python/doc/NOTES.md +++ b/llm_bench/python/doc/NOTES.md @@ -61,4 +61,14 @@ Solution: update `tokenization_baichuan.py` as following: <br /> - self.add_eos_token = add_eos_token - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) -``` \ No newline at end of file +``` + +## CompressWeights Mode INT4 - ConnectionError: Couldn't reach 'wikitext' on the Hub (SSLError) +Download LLM from hugginface, convert to OpenVINO IR files and run with convert.py and CompressWeights Mode to INT4, the following error may occur: +```bash +raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e)._name_})") +ConnectionError: Couldn't reach 'wikitext' on the Hub (SSLError) +``` +root cause: The wikitext data set was not downloaded correctly, or the Hugging Face Hub network could not be connected normally. <br /> +Solution: <br /> +Refer to https://huggingface.co/docs/datasets/loading#arrow , copy wikitext data set to ~/.cache/huggingface/datasets/ folder, set the environment variable HF_DATASETS_OFFLINE to 1. \ No newline at end of file diff --git a/llm_bench/python/utils/config_class.py b/llm_bench/python/llm_bench_utils/config_class.py similarity index 83% rename from llm_bench/python/utils/config_class.py rename to llm_bench/python/llm_bench_utils/config_class.py index 506cdf3b26..bba4d9a640 100644 --- a/llm_bench/python/utils/config_class.py +++ b/llm_bench/python/llm_bench_utils/config_class.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from transformers import AutoTokenizer from transformers import AutoModelForCausalLM, T5ForConditionalGeneration, BlenderbotForConditionalGeneration, AutoModel @@ -11,7 +11,7 @@ OVLatentConsistencyModelPipeline, OVStableDiffusionXLPipeline ) -from utils.ov_model_classes import OVMPTModel, OVFalconModel, OVLDMSuperResolutionPipeline, OVChatGLMModel, OVChatGLM2Model, OVQwenModel +from llm_bench_utils.ov_model_classes import OVMPTModel, OVLDMSuperResolutionPipeline, OVChatGLMModel TOKENIZE_CLASSES_MAPPING = { 'decoder': AutoTokenizer, @@ -25,8 +25,8 @@ 'decoder': OVModelForCausalLM, 't5': OVModelForSeq2SeqLM, 'blenderbot': OVModelForSeq2SeqLM, + 'falcon': OVModelForCausalLM, 'mpt': OVMPTModel, - 'falcon': OVFalconModel, 'stable-diffusion-xl': OVStableDiffusionXLPipeline, 'sdxl': OVStableDiffusionXLPipeline, 'lcm-sdxl': OVStableDiffusionXLPipeline, @@ -38,10 +38,9 @@ 'codet5': OVModelForSeq2SeqLM, 'codegen2': OVModelForCausalLM, 'ldm_super_resolution': OVLDMSuperResolutionPipeline, - 'chatglm2': OVChatGLM2Model, - 'chatglm3': OVChatGLM2Model, + 'chatglm2': OVModelForCausalLM, + 'chatglm3': OVModelForCausalLM, 'chatglm': OVChatGLMModel, - 'qwen': OVQwenModel, } PT_MODEL_CLASSES_MAPPING = { @@ -59,11 +58,12 @@ 'image_gen': ['stable-diffusion-', 'ssd-', 'deepfloyd-if', 'tiny-sd', 'small-sd', 'lcm-', 'sdxl'], 'text2speech': ['whisper'], 'image_cls': ['vit'], - 'code_gen': ['replit', 'codegen2', 'codegen', 'codet5'], + 'code_gen': ['replit', 'codegen2', 'codegen', 
'codet5', "stable-code"], 'text_gen': [ 'decoder', 't5', 'falcon', + "glm", 'gpt-', 'gpt2', 'aquila', @@ -78,7 +78,7 @@ 'pythia-', 'stablelm-', 'stable-zephyr-', - 'rocket-' + 'rocket-', 'blenderbot', 'vicuna', 'dolly', @@ -93,8 +93,18 @@ 'qwen', 'zephyr', 'mistral', + 'mixtral', 'yi-', - 'phi-' + 'phi-', + 'phi2-', + 'minicpm', + 'gemma', + "deci", + "internlm", + "olmo", + "phi3", + "starcoder", + "instruct-gpt" ], 'ldm_super_resolution': ['ldm-super-resolution'], } diff --git a/llm_bench/python/utils/conversion_utils/better_transformer_patch.py b/llm_bench/python/llm_bench_utils/conversion_utils/better_transformer_patch.py similarity index 83% rename from llm_bench/python/utils/conversion_utils/better_transformer_patch.py rename to llm_bench/python/llm_bench_utils/conversion_utils/better_transformer_patch.py index c3962bc56a..4def9cfa0a 100644 --- a/llm_bench/python/utils/conversion_utils/better_transformer_patch.py +++ b/llm_bench/python/llm_bench_utils/conversion_utils/better_transformer_patch.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 import math @@ -865,9 +865,156 @@ def forward( return outputs +def gptj_apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor: + sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3) + cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3) + return (tensor * cos) + (rotate_every_two(tensor) * sin) + + +def gptj_forward( + self, + hidden_states: torch.FloatTensor, + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, +) -> Union[ + Tuple[torch.Tensor, Tuple[torch.Tensor]], + Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], +]: + query = self.q_proj(hidden_states) + key = self.k_proj(hidden_states) + value = self.v_proj(hidden_states) + + query = self._split_heads(query, self.num_attention_heads, self.head_dim, True) + key = self._split_heads(key, self.num_attention_heads, self.head_dim, True) + value = self._split_heads(value, self.num_attention_heads, self.head_dim, False) + + sincos = self.embed_positions[position_ids] + sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1) + print("patched gptj model with simplied position_ids") + if self.rotary_dim is not None: + k_rot = key[:, :, :, : self.rotary_dim] + k_pass = key[:, :, :, self.rotary_dim :] + + q_rot = query[:, :, :, : self.rotary_dim] + q_pass = query[:, :, :, self.rotary_dim :] + + k_rot = gptj_apply_rotary_pos_emb(k_rot, sin, cos) + q_rot = gptj_apply_rotary_pos_emb(q_rot, sin, cos) + + key = torch.cat([k_rot, k_pass], dim=-1) + query = torch.cat([q_rot, q_pass], dim=-1) + else: + key = gptj_apply_rotary_pos_emb(key, sin, cos) + query = gptj_apply_rotary_pos_emb(query, sin, cos) + + key = key.permute(0, 2, 1, 3) + query = query.permute(0, 2, 1, 3) + + if layer_past is not None: + past_key = layer_past[0] + past_value = layer_past[1] + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + + if use_cache is True: + # Note that this cast is quite ugly, but is not implemented before ROPE as the original codebase keeps the key in float32 all along the computation. 
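
`gptj_apply_rotary_pos_emb` above relies on `rotate_every_two`, which is not shown in this hunk; in `transformers.models.gptj.modeling_gptj` it is essentially the interleaved rotation below, reproduced here only for context:

```python
import torch


def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
    # Pair up even/odd channels and rotate each pair by 90 degrees: (x1, x2) -> (-x2, x1).
    x1 = x[:, :, :, ::2]
    x2 = x[:, :, :, 1::2]
    x = torch.stack((-x2, x1), dim=-1)
    return x.flatten(-2)
```
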
+ # Reference: https://github.com/kingoflolz/mesh-transformer-jax/blob/f8315e3003033b23f21d78361b288953064e0e76/mesh_transformer/layers.py#L128 + present = (key.to(hidden_states.dtype), value) + else: + present = None + + # compute self-attention: V x Softmax(QK^T) + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim) + attn_output = self.out_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs # a, present, (attentions) + + +def raise_on_head_mask(head_mask: Optional[torch.Tensor]): + if head_mask is not None: + raise ValueError( + "layer_head_mask different than None is unsupported for now with BetterTransformer, please" + "open a PR or an issue at https://github.com/huggingface/optimum." + ) + + +def gptj_wrapped_scaled_dot_product( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, +): + raise_on_head_mask(head_mask) + batch_size = query.shape[0] + + mask_value = torch.finfo(value.dtype).min + mask_value = torch.full([], mask_value, dtype=value.dtype) + + # in gpt-neo-x and gpt-j the query and keys are always in fp32 + # thus we need to cast them to the value dtype + if self.downcast_qk: + query = query.to(value.dtype) + key = key.to(value.dtype) + + if batch_size == 1 and attention_mask is not None and attention_mask[0, 0, -1, -1] < -1: + raise ValueError("BetterTransformer does not support padding='max_length' with a batch size of 1.") + + dropout_p = self.dropout_prob_attn if self.training else 0.0 + if batch_size == 1 or self.training: + if query.shape[2] > 1: + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True + ) + else: + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=False + ) + else: + query_length, key_length = query.size(-2), key.size(-2) + + # causal_mask is always [True, ..., True] otherwise, so executing this + # is unnecessary + if query_length > 1: + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + + causal_mask = torch.where(causal_mask, 0, mask_value) + + # torch.Tensor.expand does no memory copy + + if attention_mask is not None: + attention_mask = causal_mask + attention_mask + else: + causal_mask = causal_mask.expand(batch_size, -1, -1, -1) + + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False + ) + + # in gpt-neo-x and gpt-j the query and keys are always in fp32 + # thus we need to cast them to the value dtype + if self.downcast_qk: + sdpa_result = sdpa_result.to(value.dtype) + + return sdpa_result, None + + def register_bettertransformer_config(): from optimum.bettertransformer.models import BetterTransformerManager from optimum.bettertransformer.models.base import BetterTransformerBaseLayer + from transformers.models.gptj.modeling_gptj import GPTJAttention class StableLMAttentionLayerBetterTransformer( BetterTransformerBaseLayer, StableLMAttention, nn.Module @@ -944,6 +1091,41 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): def forward(self, *args, **kwargs): return 
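
The wrapped attention above routes everything through `torch.nn.functional.scaled_dot_product_attention`, using `is_causal=True` only when the query covers more than one position (prefill) and an explicit additive mask otherwise. In isolation, the two fast paths look like this:

```python
import torch
import torch.nn.functional as F

q = torch.randn(1, 16, 8, 64)   # (batch, heads, seq_len, head_dim)
k = torch.randn(1, 16, 8, 64)
v = torch.randn(1, 16, 8, 64)

# Prefill: more than one query position, let SDPA build the causal mask itself.
prefill = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)

# Decode: a single new query token attends to the whole cache, no causal mask needed.
decode = F.scaled_dot_product_attention(q[:, :, -1:, :], k, v, attn_mask=None, dropout_p=0.0, is_causal=False)
```
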
bt_aquila_forward(self, *args, **kwargs) + class GPTJAttentionLayerBetterTransformer(BetterTransformerBaseLayer, GPTJAttention, nn.Module): + _attn = gptj_wrapped_scaled_dot_product + + def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): + super().__init__(config) + with torch.device("meta"): + super(BetterTransformerBaseLayer, self).__init__(config) + + submodules = [ + "k_proj", + "v_proj", + "q_proj", + "out_proj", + "attn_dropout", + "resid_dropout", + "bias", + "scale_attn", + "masked_bias", + ] + # Attribute only for transformers>=4.28 + if hasattr(layer, "embed_positions"): + submodules.append("embed_positions") + + for attr in submodules: + setattr(self, attr, getattr(layer, attr)) + + self.module_mapping = None + self.original_layers_mapping = {submodule: submodule for submodule in submodules} + + self.downcast_qk = True + self.dropout_prob_attn = config.attn_pdrop + + def forward(self, *args, **kwargs): + return gptj_forward(self, *args, **kwargs) + BetterTransformerManager.MODEL_MAPPING["stablelm_epoch"] = { "Attention": StableLMAttentionLayerBetterTransformer } @@ -963,6 +1145,10 @@ def forward(self, *args, **kwargs): ), } + BetterTransformerManager.MODEL_MAPPING["gptj"] = { + "GPTJAttention": GPTJAttentionLayerBetterTransformer + } + BetterTransformerManager.NOT_REQUIRES_NESTED_TENSOR.add("stablelm_epoch") BetterTransformerManager.NOT_REQUIRES_STRICT_VALIDATION.add("stablelm_epoch") BetterTransformerManager.MODEL_MAPPING["codegen2"] = { diff --git a/llm_bench/python/llm_bench_utils/conversion_utils/convert_patch.py b/llm_bench/python/llm_bench_utils/conversion_utils/convert_patch.py new file mode 100644 index 0000000000..8edf785ad6 --- /dev/null +++ b/llm_bench/python/llm_bench_utils/conversion_utils/convert_patch.py @@ -0,0 +1,257 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import torch +from typing import Tuple, Optional, Union +import types +from transformers.modeling_outputs import BaseModelOutputWithPast + + +def _yi_prepare_decoder_attention_mask( + attention_mask, input_ids, inputs_embeds, past_key_values_length +): + input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape[:-1] + return _prepare_decoder_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill( + inverted_mask.to(torch.bool), torch.finfo(dtype).min + ) + + +# Modified from transformers.models.bloom.modeling_bloom._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, + device: torch.device, + past_key_values_length: int, + dtype: torch.dtype = torch.bool, +) -> torch.BoolTensor: + """ + Make causal mask used for bi-directional self-attention. 
+ """ + batch_size, target_length = input_ids_shape + mask = torch.zeros( + (target_length, target_length + past_key_values_length), + dtype=dtype, + device=device, + ) + seq_ids = torch.arange(target_length, device=device) + + mask[:, past_key_values_length:] = ( + (seq_ids[:, None] < seq_ids[None, :]) * torch.finfo(dtype).min + if torch.is_floating_point(mask) + else seq_ids[:, None] < seq_ids[None, :] + ) + + return mask[None, None, :, :].expand( + batch_size, 1, target_length, target_length + past_key_values_length + ) + + +# Modified from transformers.models.llama.modeling_llama._prepare_decoder_attention_mask +def _prepare_decoder_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length +): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + + combined_attention_mask = _make_causal_mask( + input_shape, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + dtype=inputs_embeds.dtype, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ).to(inputs_embeds.device) + combined_attention_mask = ( + expanded_attn_mask + if combined_attention_mask is None + else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + +def stablelm_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # Retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError( + "You have to specify either decoder_input_ids or decoder_inputs_embeds" + ) + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # Embed positions + if 
getattr(self, "_use_flash_attention_2", False): + # 2d mask is passed through the layers + attention_mask = ( + attention_mask + if (attention_mask is not None and 0 in attention_mask) + else None + ) + else: + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), + dtype=torch.bool, + device=inputs_embeds.device, + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + use_cache = False + + # Decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # Add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +def patch_model_for_optimum_export(model): + if model.config.model_type in ["stablelm_epoch"]: + model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask + model.model.forward = types.MethodType(stablelm_forward, model.model) + elif model.config.model_type == "Yi": + model.model._prepare_decoder_attention_mask = _yi_prepare_decoder_attention_mask + return model diff --git a/llm_bench/python/llm_bench_utils/conversion_utils/export_configs.py b/llm_bench/python/llm_bench_utils/conversion_utils/export_configs.py new file mode 100644 index 0000000000..cf465b8f53 --- /dev/null +++ b/llm_bench/python/llm_bench_utils/conversion_utils/export_configs.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +from optimum.exporters.onnx.config import TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig +from optimum.exporters.tasks import TasksManager +from optimum.utils import ( + NormalizedTextConfig, + DummyPastKeyValuesGenerator, + DummyTextInputGenerator, +) +from optimum.exporters.openvino.model_configs import register_in_tasks_manager + + +class 
YIDummyTextInputGenerator(DummyTextInputGenerator): + SUPPORTED_INPUT_NAMES = { + "input_ids", + "attention_mask", + "token_type_ids", + "position_ids", + } + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + input = super().generate(input_name, framework, int_dtype, float_dtype) + if input_name == "position_ids": + input = input[:, -1:] + return input + + +@register_in_tasks_manager('yi', *["text-generation", "text-generation-with-past"]) +class YIOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): + # The ONNX export of this architecture needs the Trilu operator support, available since opset 14 + DEFAULT_ONNX_OPSET = 14 + DUMMY_INPUT_GENERATOR_CLASSES = ( + YIDummyTextInputGenerator, + DummyPastKeyValuesGenerator, + ) + DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + no_position_ids = False + + +@register_in_tasks_manager("jais", *["text-generation", "text-generation-with-past"]) +class JaisOpenVINOConfig(TextDecoderOnnxConfig): + DEFAULT_ONNX_OPSET = 13 + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_layers='n_layer', num_attention_heads='n_head', hidden_size='n_embd') + + +TasksManager._SUPPORTED_MODEL_TYPE['stablelm_epoch'] = TasksManager._SUPPORTED_MODEL_TYPE['stablelm'] +TasksManager._SUPPORTED_MODEL_TYPE['stablelm-epoch'] = TasksManager._SUPPORTED_MODEL_TYPE['stablelm'] +TasksManager._SUPPORTED_MODEL_TYPE['stablelm2'] = TasksManager._SUPPORTED_MODEL_TYPE['stablelm'] +TasksManager._SUPPORTED_MODEL_TYPE["aquila"] = TasksManager._SUPPORTED_MODEL_TYPE["stablelm"] +TasksManager._SUPPORTED_MODEL_TYPE["codegen2"] = TasksManager._SUPPORTED_MODEL_TYPE["codegen"] diff --git a/llm_bench/python/utils/conversion_utils/helpers.py b/llm_bench/python/llm_bench_utils/conversion_utils/helpers.py similarity index 51% rename from llm_bench/python/utils/conversion_utils/helpers.py rename to llm_bench/python/llm_bench_utils/conversion_utils/helpers.py index 79557c18cf..5c6e05588e 100644 --- a/llm_bench/python/utils/conversion_utils/helpers.py +++ b/llm_bench/python/llm_bench_utils/conversion_utils/helpers.py @@ -1,14 +1,23 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - +import copy +import json from enum import Enum -from pathlib import Path import logging as log +from pathlib import Path +from typing import Optional, List, Dict + import torch +import numpy as np from nncf import compress_weights +from nncf import Dataset from openvino import save_model -from ..nncf_utils import COMPRESSION_OPTIONS, INT4_MODEL_CONFIGURATION +import nncf +from ..nncf_utils import COMPRESSION_OPTIONS +from optimum.gptq.data import get_dataset, prepare_dataset +from optimum.intel.openvino.configuration import _check_default_4bit_configs, OVQuantizationMethod, _DEFAULT_4BIT_CONFIG +import warnings class BackendType(Enum): @@ -17,7 +26,7 @@ class BackendType(Enum): PYTORCH_DIR = 'pytorch' -PYTORCH_COMPRESS_WEIGHTS_DIR = 'compressed_weights/PT_{precision}-INT8' +PYTORCH_COMPRESS_WEIGHTS_DIR = 'compressed_weights/PT_{precision}-{compression}' OV_DIR = 'dldt' GPTQ_DIR = "GPTQ_INT4-{precision}" @@ -34,6 +43,10 @@ def is_fp16(args): return args.precision == "FP16" +def is_int8_compression(compress_weights_mode): + return compress_weights_mode in ["INT8", "INT8_ASYM", "INT8_SYM"] + + def is_ov_model_provided(model_id, model_dir, precision, model_name="openvino_model.xml"): model_dirs = 
[] if Path(model_id).is_dir(): @@ -83,25 +96,114 @@ def save_tokenizer(tokenizer, out_dir): log.error(f'tokenizer loading failed with {e}') +def transform_fn( + config, + input_shapes: Dict[str, List], + input_ids: torch.LongTensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + **kwargs +): + inputs = {"input_ids": np.array(input_ids)} + + if "attention_mask" in input_shapes: + inputs["attention_mask"] = attention_mask + + if "position_ids" in input_shapes: + if position_ids is None: + position_ids = np.cumsum(attention_mask, axis=1) - 1 + position_ids[attention_mask == 0] = 1 + else: + position_ids = np.array(position_ids) + inputs["position_ids"] = position_ids + + if "beam_idx" in input_shapes: + batch_size = input_ids.shape[0] + if config.model_type == "bloom": + batch_size *= config.num_attention_heads + inputs["beam_idx"] = np.arange(batch_size, dtype=int) + + for name, shape in input_shapes.items(): + if name in inputs: + continue + inputs[name] = np.zeros(shape) + + return inputs + + +def get_ov_input_shapes(model, batch_size=1): + inputs = {} + for val in model.inputs: + name = val.any_name + shape = list(val.partial_shape.get_min_shape()) + shape[0] = batch_size + inputs[name] = shape + + return inputs + + +def get_nncf_dataset(ov_model, tokenizer, config, dataset_name, subset_size): + """initializes dict with data-aware compression parameters if defined dataset and tokenizer + + Args: + ov_model : OpenVINO model for compression + tokenizer : tokenizer for ov_model + config : ov_model configuration + dataset_name: name of the dataset to load; must be one of ['wikitext2', 'c4', 'c4-new'] + subset_size: the number of sample the dataset should contain + + Returns: + nncf_dataset: NNCF dataset + """ + subset_size = subset_size or 128 + dataset = get_dataset(dataset_name, tokenizer, seqlen=32, nsamples=subset_size) + dataset = prepare_dataset(dataset) + input_shapes = get_ov_input_shapes(ov_model) + nncf_dataset = Dataset(dataset, lambda x: transform_fn(config=config, input_shapes=input_shapes, **x)) + return nncf_dataset + + def compress_ov_model_weights_helper(ov_model, tok, config, out_path, compress_weights_format="INT8", fp16=False, args={}, model_name="openvino_model"): - compression_args = None + if "INT8" in compress_weights_format and "INT8_ASYM" in COMPRESSION_OPTIONS: + warnings.warn("Usage INT8 mode is deprecated and will be removed soon. 
Please use INT8_ASYM instead", DeprecationWarning) if "4BIT_DEFAULT" in compress_weights_format: - model_id = out_path.parents[3].name - if model_id in INT4_MODEL_CONFIGURATION: - compression_args = INT4_MODEL_CONFIGURATION[model_id] - else: - compression_args = COMPRESSION_OPTIONS["INT4_SYM"] - - if compression_args is None: - compression_args = COMPRESSION_OPTIONS[compress_weights_format] - if args.ratio is not None: - compression_args["ratio"] = args.ratio - if args.group_size is not None: - compression_args["group_size"] = args.group_size - if args.all_layers: - compression_args["all_layers"] = True + compression_args = _check_default_4bit_configs(config.name_or_path) + if compression_args is None: + config_path = Path(config.name_or_path) / "config.json" + if config_path.exists(): + with config_path.open("r") as f: + json_config = json.load(f) + name_or_path = json_config.get("_name_or_path", None) + if name_or_path is not None: + # Do additional check in case the input model is a full precision IR exported from PT model by path + compression_args = _check_default_4bit_configs(name_or_path) + compression_args = compression_args or _DEFAULT_4BIT_CONFIG + compression_args = copy.deepcopy(compression_args) + compression_args.pop("bits") + + sym = compression_args.pop("sym", False) + compression_args["mode"] = nncf.CompressWeightsMode.INT4_SYM if sym else nncf.CompressWeightsMode.INT4_ASYM + if compression_args.pop("quant_method", None) == OVQuantizationMethod.AWQ: + compression_args["awq"] = True + if "num_samples" in compression_args: + compression_args["subset_size"] = compression_args.pop("num_samples") + if not compression_args.get("all_layers", None): + compression_args.pop("all_layers", None) + else: + compression_args = copy.deepcopy(COMPRESSION_OPTIONS[compress_weights_format]) + for arg_name in ["ratio", "group_size", "all_layers", "dataset", "awq", "scale_estimation"]: + arg_value = getattr(args, arg_name, None) + if arg_value: + compression_args[arg_name] = arg_value + log.info("Compression options:") log.info(compression_args) + + dataset_name = compression_args.pop("dataset", None) + if dataset_name is not None and tok is not None: + nncf_dataset = get_nncf_dataset(ov_model, tok, config, dataset_name, compression_args.get("subset_size", None)) + compression_args["dataset"] = nncf_dataset + compressed_ov_model = compress_weights(ov_model, **compression_args) save_ov_model_helper(compressed_ov_model, out_path, model_name, fp16=fp16, tok=tok, config=config) diff --git a/llm_bench/python/utils/hook_beam_search.py b/llm_bench/python/llm_bench_utils/hook_beam_search.py similarity index 69% rename from llm_bench/python/utils/hook_beam_search.py rename to llm_bench/python/llm_bench_utils/hook_beam_search.py index 8bcac71d49..99b0a9e5c3 100644 --- a/llm_bench/python/utils/hook_beam_search.py +++ b/llm_bench/python/llm_bench_utils/hook_beam_search.py @@ -1,54 +1,63 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # flake8: noqa import time import torch import warnings -import transformers -import torch.distributed as dist import logging as log from torch import nn -from packaging import version from typing import Optional, Tuple, Union, List from transformers.generation.stopping_criteria import ( + EosTokenCriteria, StoppingCriteriaList, validate_stopping_criteria, ) from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.beam_search 
import BeamScorer +from transformers.generation.utils import ( + _split_model_inputs, + stack_model_outputs, +) from transformers.utils import ModelOutput -class BeamSearchEncoderDecoderOutput(ModelOutput): +logger = log.getLogger(__name__) + + +class GenerateBeamDecoderOnlyOutput(ModelOutput): sequences: torch.LongTensor = None sequences_scores: Optional[torch.FloatTensor] = None scores: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None beam_indices: Optional[torch.LongTensor] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None -class BeamSearchDecoderOnlyOutput(ModelOutput): +class GenerateBeamEncoderDecoderOutput(ModelOutput): sequences: torch.LongTensor = None sequences_scores: Optional[torch.FloatTensor] = None scores: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None beam_indices: Optional[torch.LongTensor] = None - attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None -BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput] +GenerateBeamOutput = Union[GenerateBeamDecoderOnlyOutput, GenerateBeamEncoderDecoderOutput] tm_list = [] tm_infer_list = [] -# Transformers version: Release/v4.35.2 514de24abfd4416aeba6a6455ad5920f57f3567d -# Copied from https://github.com/huggingface/transformers/blob/514de24abfd4416aeba6a6455ad5920f57f3567d/src/transformers/generation/utils.py#L2894 +# Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99 +# Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2911 # Add the function of collecting latency def new_beam_search( self, @@ -62,17 +71,19 @@ def new_beam_search( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, synced_gpus: bool = False, + sequential: Optional[bool] = None, **model_kwargs, - ) -> Union[BeamSearchOutput, torch.LongTensor]: + ) -> Union[GenerateBeamOutput, torch.LongTensor]: r""" Generates sequences of token ids for models with a language modeling head using **beam search decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. <Tip warning={true}> - In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. 
Use generate() + In most cases, you do not need to call [`~generation.GenerationMixin._beam_search`] directly. Use generate() instead. For an overview of generation strategies and code examples, check the [following guide](../generation_strategies). @@ -103,21 +114,28 @@ def new_beam_search( output_hidden_states (`bool`, *optional*, defaults to `False`): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors for + more details. output_scores (`bool`, *optional*, defaults to `False`): Whether or not to return the prediction scores. See `scores` under returned tensors for more details. return_dict_in_generate (`bool`, *optional*, defaults to `False`): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. synced_gpus (`bool`, *optional*, defaults to `False`): Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + sequential (`bool`, defaults to `False`): + By default, beam search has `batch_size * num_beams` as effective batch size (see `beam_search()` for + more details). This flag will avoid parallelizing the beam search and will instead run beam search + sequentially. model_kwargs: Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or + [`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if + [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. @@ -133,8 +151,8 @@ def new_beam_search( ... ) >>> import torch - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base") >>> encoder_input_str = "translate English to German: How old are you?" >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids @@ -167,7 +185,7 @@ def new_beam_search( ... ] ... 
) - >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) + >>> outputs = model._beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Wie alt bist du?'] @@ -175,20 +193,40 @@ def new_beam_search( # init values logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + sequential = sequential if sequential is not None else self.generation_config.low_memory if max_length is not None: warnings.warn( "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", UserWarning, ) stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) if len(stopping_criteria) == 0: warnings.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if eos_token_id is not None: + logger.warning_once( + "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" + " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." + " Otherwise make sure to set `model.generation_config.eos_token_id`", + FutureWarning, + ) + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + else: + # TODO remove when the method is totally private and beam scorer refactored + # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever + eos_token_id = [ + criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id") + ] + eos_token_id = eos_token_id[0] if eos_token_id else None + if eos_token_id is None and self.generation_config.eos_token_id is not None: + eos_token_id = self.generation_config.eos_token_id + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] output_scores = output_scores if output_scores is not None else self.generation_config.output_scores + output_logits = output_logits if output_logits is not None else self.generation_config.output_logits output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions ) @@ -205,6 +243,9 @@ def new_beam_search( num_beams = beam_scorer.num_beams batch_beam_size, cur_len = input_ids.shape + if "inputs_embeds" in model_kwargs: + cur_len = model_kwargs["inputs_embeds"].shape[1] + model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) if num_beams * batch_size != batch_beam_size: raise ValueError( @@ -213,6 +254,7 @@ def new_beam_search( # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None beam_indices = ( tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None ) @@ -233,29 +275,59 @@ def new_beam_search( beam_scores[:, 1:] = -1e9 beam_scores = beam_scores.view((batch_size * 
num_beams,)) - this_peer_finished = False # used by synced_gpus only - while True: - tic = time.perf_counter() - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break + this_peer_finished = False + decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder + + while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): + tic = time.perf_counter() model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # if sequential is True, split the input to batches of batch_size and run sequentially tic_infer = time.perf_counter() - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - tm_infer_list.append(time.perf_counter() - tic_infer) + if sequential: + if any( + model_name in self.__class__.__name__.lower() + for model_name in [ + "fsmt", + "reformer", + "bloom", + "ctrl", + "gpt_bigcode", + "transo_xl", + "xlnet", + "cpm", + "jamba", + ] + ): + raise RuntimeError( + f"Currently generation for {self.__class__.__name__} is not supported " + f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature." + ) + inputs_per_sub_batches = _split_model_inputs( + model_inputs, split_size=batch_size, full_batch_size=batch_beam_size + ) + outputs_per_sub_batch = [ + self( + **inputs_per_sub_batch, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + for inputs_per_sub_batch in inputs_per_sub_batches + ] + + outputs = stack_model_outputs(outputs_per_sub_batch) + + else: # Unchanged original behavior + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + tm_infer_list.append(time.perf_counter() - tic_infer) if synced_gpus and this_peer_finished: cur_len = cur_len + 1 continue # don't waste resources running the code we don't need @@ -274,13 +346,14 @@ def new_beam_search( if return_dict_in_generate: if output_scores: scores += (next_token_scores_processed,) + if output_logits: + raw_logits += (next_token_logits,) if output_attentions: decoder_attentions += ( (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) ) if self.config.is_encoder_decoder: cross_attentions += (outputs.cross_attentions,) - if output_hidden_states: decoder_hidden_states += ( (outputs.decoder_hidden_states,) @@ -310,6 +383,7 @@ def new_beam_search( pad_token_id=pad_token_id, eos_token_id=eos_token_id, beam_indices=beam_indices, + decoder_prompt_len=decoder_prompt_len, ) beam_scores = beam_outputs["next_beam_scores"] @@ -319,10 +393,14 @@ def new_beam_search( input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, ) - if model_kwargs["past_key_values"] is not None: 
- model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx) + if model_kwargs.get("past_key_values", None) is not None: + model_kwargs["past_key_values"] = self._temporary_reorder_cache( + model_kwargs["past_key_values"], beam_idx + ) if return_dict_in_generate and output_scores: beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) @@ -330,11 +408,8 @@ def new_beam_search( # increase cur_len cur_len = cur_len + 1 tm_list.append(time.perf_counter() - tic) - if beam_scorer.is_done or stopping_criteria(input_ids, scores): - if not synced_gpus: - break - else: - this_peer_finished = True + if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)): + this_peer_finished = True sequence_outputs = beam_scorer.finalize( input_ids, @@ -345,6 +420,7 @@ def new_beam_search( eos_token_id=eos_token_id, max_length=stopping_criteria.max_length, beam_indices=beam_indices, + decoder_prompt_len=decoder_prompt_len, ) if return_dict_in_generate: @@ -352,25 +428,29 @@ def new_beam_search( sequence_outputs["sequence_scores"] = None if self.config.is_encoder_decoder: - return BeamSearchEncoderDecoderOutput( + return GenerateBeamEncoderDecoderOutput( sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, + logits=raw_logits, beam_indices=sequence_outputs["beam_indices"], encoder_attentions=encoder_attentions, encoder_hidden_states=encoder_hidden_states, decoder_attentions=decoder_attentions, cross_attentions=cross_attentions, decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), ) else: - return BeamSearchDecoderOnlyOutput( + return GenerateBeamDecoderOnlyOutput( sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, + logits=raw_logits, beam_indices=sequence_outputs["beam_indices"], attentions=decoder_attentions, hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), ) else: return sequence_outputs["sequences"] @@ -403,12 +483,6 @@ def get_time_infer_list(self): global tm_infer_list return tm_infer_list - def new_forward(self, model, model_type=None): + def new_forward(self, model): """Define a new beam search function.""" - min_version = version.parse('4.34.0') - trans_version = version.parse(transformers.__version__) - if trans_version < min_version: - log.warning(f'The function of getting latency of beam search will not be available with current transformers version:{trans_version}') - else: - bound_method = new_beam_search.__get__(model, model.__class__) - model.beam_search = bound_method \ No newline at end of file + model._beam_search = new_beam_search.__get__(model, model.__class__) \ No newline at end of file diff --git a/llm_bench/python/llm_bench_utils/hook_common.py b/llm_bench/python/llm_bench_utils/hook_common.py new file mode 100644 index 0000000000..4751ed7d4d --- /dev/null +++ b/llm_bench/python/llm_bench_utils/hook_common.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# flake8: noqa +import logging as log +import transformers +from packaging import version + +TRANS_MIN_VERSION = '4.40.0' + + +def get_bench_hook(num_beams, ov_model): + min_version = version.parse(TRANS_MIN_VERSION) + trans_version = version.parse(transformers.__version__) + search_type = 'beam search' if num_beams > 1 else 'greedy search' + if trans_version >= 
min_version: + import llm_bench_utils.hook_greedy_search + import llm_bench_utils.hook_beam_search + if num_beams > 1: + bench_hook = llm_bench_utils.hook_beam_search.BeamSearchHook() + else: + bench_hook = llm_bench_utils.hook_greedy_search.GreedySearchHook() + bench_hook.new_forward(ov_model) + else: + log.warning(f'The minimum version of transformers to get 1st and 2nd tokens latency of {search_type} is: {min_version}') + bench_hook = None + return bench_hook \ No newline at end of file diff --git a/llm_bench/python/utils/hook_forward.py b/llm_bench/python/llm_bench_utils/hook_forward.py similarity index 85% rename from llm_bench/python/utils/hook_forward.py rename to llm_bench/python/llm_bench_utils/hook_forward.py index 1a019f6520..702bd947e1 100644 --- a/llm_bench/python/utils/hook_forward.py +++ b/llm_bench/python/llm_bench_utils/hook_forward.py @@ -45,9 +45,9 @@ def clear_statistics(self): def new_text_encoder(self, pipe): old_text_encoder = pipe.text_encoder.request - def my_text_encoder(inputs, shared_memory=True, **kwargs): + def my_text_encoder(inputs, share_inputs=True, **kwargs): t1 = time.time() - r = old_text_encoder(inputs, shared_memory=shared_memory, **kwargs) + r = old_text_encoder(inputs, share_inputs=share_inputs, **kwargs) t2 = time.time() text_encoder_time = t2 - t1 self.text_encoder_time += text_encoder_time @@ -58,9 +58,9 @@ def my_text_encoder(inputs, shared_memory=True, **kwargs): def new_unet(self, pipe): old_unet = pipe.unet.request - def my_unet(inputs, shared_memory=True, **kwargs): + def my_unet(inputs, share_inputs=True, **kwargs): t1 = time.time() - r = old_unet(inputs, shared_memory=shared_memory, **kwargs) + r = old_unet(inputs, share_inputs=share_inputs, **kwargs) t2 = time.time() unet_time = t2 - t1 self.unet_time_list.append(unet_time) @@ -71,9 +71,9 @@ def my_unet(inputs, shared_memory=True, **kwargs): def new_vae_decoder(self, pipe): old_vae_decoder = pipe.vae_decoder.request - def my_vae_decoder(inputs, shared_memory=True, **kwargs): + def my_vae_decoder(inputs, share_inputs=True, **kwargs): t1 = time.time() - r = old_vae_decoder(inputs, shared_memory=shared_memory, **kwargs) + r = old_vae_decoder(inputs, share_inputs=share_inputs, **kwargs) t2 = time.time() vae_decoder_time = t2 - t1 self.vae_decoder_time += vae_decoder_time diff --git a/llm_bench/python/utils/hook_greedy_search.py b/llm_bench/python/llm_bench_utils/hook_greedy_search.py similarity index 73% rename from llm_bench/python/utils/hook_greedy_search.py rename to llm_bench/python/llm_bench_utils/hook_greedy_search.py index ff1fffd130..03bbd55ea4 100644 --- a/llm_bench/python/utils/hook_greedy_search.py +++ b/llm_bench/python/llm_bench_utils/hook_greedy_search.py @@ -1,48 +1,58 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # flake8: noqa import time import torch import warnings -import transformers -import torch.distributed as dist import logging as log -from packaging import version +import transformers from typing import Optional, Tuple, Union, List from transformers.generation.stopping_criteria import ( + EosTokenCriteria, StoppingCriteriaList, validate_stopping_criteria, ) from transformers.generation.logits_process import LogitsProcessorList from transformers.generation.streamers import BaseStreamer from transformers.utils import ModelOutput +import llm_bench_utils.hook_sample as hook_sample +import llm_bench_utils.hook_sample_v43 as hook_sample_v43 +import 
llm_bench_utils.hook_sample_v45 as hook_sample_v45 +from packaging import version -class GreedySearchDecoderOnlyOutput(ModelOutput): +logger = log.getLogger(__name__) + + +class GenerateDecoderOnlyOutput(ModelOutput): sequences: torch.LongTensor = None scores: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None -class GreedySearchEncoderDecoderOutput(ModelOutput): +class GenerateEncoderDecoderOutput(ModelOutput): sequences: torch.LongTensor = None scores: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None -GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput] +GenerateNonBeamOutput = Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput] tm_list = [] tm_infer_list = [] -# Transformers version: Release/v4.35.2 514de24abfd4416aeba6a6455ad5920f57f3567d -# Copied from https://github.com/huggingface/transformers/blob/514de24abfd4416aeba6a6455ad5920f57f3567d/src/transformers/generation/utils.py#L2353 +# Transformers version: v4.40-release 4fdf58afb72b0754da30037fc800b6044e7d9c99 +# Copied from https://github.com/huggingface/transformers/blob/4fdf58afb72b0754da30037fc800b6044e7d9c99/src/transformers/generation/utils.py#L2310 # Add the function of collecting latency def new_greedy_search( self, @@ -55,18 +65,19 @@ def new_greedy_search( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_scores: Optional[bool] = None, + output_logits: Optional[bool] = None, return_dict_in_generate: Optional[bool] = None, synced_gpus: bool = False, streamer: Optional["BaseStreamer"] = None, **model_kwargs, - ) -> Union[GreedySearchOutput, torch.LongTensor]: + ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: r""" Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. <Tip warning={true}> - In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate() + In most cases, you do not need to call [`~generation.GenerationMixin._greedy_search`] directly. Use generate() instead. For an overview of generation strategies and code examples, check the [following guide](../generation_strategies). @@ -98,6 +109,9 @@ def new_greedy_search( for more details. output_scores (`bool`, *optional*, defaults to `False`): Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + output_logits (`bool`, *optional*, defaults to `False`): + Whether or not to return the raw prediction logit scores. See `logits` under returned tensors + for more details. return_dict_in_generate (`bool`, *optional*, defaults to `False`): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
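new_greedy_search is a copy of the upstream transformers decoding loop, adapted so that time.perf_counter() measurements are appended to the module-level tm_list and tm_infer_list on every step. The rebinding pattern that installs such a copy onto a model instance, reduced to a hedged standalone sketch (TinyModel and timed_greedy_search are hypothetical names; the real hook times each loop iteration rather than the whole call):

    import time

    tm_list = []  # collected latencies, analogous to tm_list in this module

    class TinyModel:
        def _greedy_search(self, n_steps):
            return list(range(n_steps))  # stand-in for the real decoding loop

    def timed_greedy_search(self, n_steps):
        tic = time.perf_counter()
        out = TinyModel._greedy_search(self, n_steps)  # delegate to the original implementation
        tm_list.append(time.perf_counter() - tic)
        return out

    model = TinyModel()
    # Bind the replacement onto the instance, as new_forward below does with new_greedy_search:
    model._greedy_search = timed_greedy_search.__get__(model, TinyModel)
    model._greedy_search(4)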
synced_gpus (`bool`, *optional*, defaults to `False`): @@ -110,10 +124,10 @@ def new_greedy_search( If model is an encoder-decoder model the kwargs should include `encoder_outputs`. Return: - [`~generation.GreedySearchDecoderOnlyOutput`], [`~generation.GreedySearchEncoderDecoderOutput`] or + [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation.GreedySearchEncoderDecoderOutput`] if + [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. Examples: @@ -128,8 +142,8 @@ def new_greedy_search( ... MaxLengthCriteria, ... ) - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id @@ -145,7 +159,7 @@ def new_greedy_search( ... ) >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) - >>> outputs = model.greedy_search( + >>> outputs = model._greedy_search( ... input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria ... ) @@ -163,10 +177,27 @@ def new_greedy_search( ) stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id + if eos_token_id is not None: + logger.warning_once( + "`eos_token_id` is deprecated in this function and will be removed in v4.41, use" + " `stopping_criteria=StoppingCriteriaList([EosTokenCriteria(eos_token_id=eos_token_id)])` instead." 
+ " Otherwise make sure to set `model.generation_config.eos_token_id`", + FutureWarning, + ) + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + else: + # TODO remove when the method is totally private + # need to get `eos_token_id` and add stopping criteria, so that generation does not go forever + eos_token_id = [ + criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id") + ] + eos_token_id = eos_token_id[0] if eos_token_id else None + if eos_token_id is None and self.generation_config.eos_token_id is not None: + eos_token_id = self.generation_config.eos_token_id + stopping_criteria.append(EosTokenCriteria(eos_token_id=eos_token_id)) + if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None output_scores = output_scores if output_scores is not None else self.generation_config.output_scores output_attentions = ( output_attentions if output_attentions is not None else self.generation_config.output_attentions @@ -181,6 +212,7 @@ def new_greedy_search( ) # init attention / hidden states / scores tuples + raw_logits = () if (return_dict_in_generate and output_logits) else None scores = () if (return_dict_in_generate and output_scores) else None decoder_attentions = () if (return_dict_in_generate and output_attentions) else None cross_attentions = () if (return_dict_in_generate and output_attentions) else None @@ -194,21 +226,15 @@ def new_greedy_search( ) # keep track of which sequences are already finished - unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device) - - this_peer_finished = False # used by synced_gpus only - while True: + batch_size, cur_len = input_ids.shape + if "inputs_embeds" in model_kwargs: + cur_len = model_kwargs["inputs_embeds"].shape[1] + this_peer_finished = False + unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) + model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) + + while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): tic = time.perf_counter() - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? 
the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - # prepare model inputs model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) @@ -234,6 +260,8 @@ def new_greedy_search( if return_dict_in_generate: if output_scores: scores += (next_tokens_scores,) + if output_logits: + raw_logits += (next_token_logits,) if output_attentions: decoder_attentions += ( (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) @@ -262,46 +290,39 @@ def new_greedy_search( if streamer is not None: streamer.put(next_tokens.cpu()) model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, ) - # if eos_token was found in one sentence, set sentence to finished - if eos_token_id_tensor is not None: - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) - - # stop when each sentence is finished - if unfinished_sequences.max() == 0: - this_peer_finished = True - - # stop if we exceed the maximum length - if stopping_criteria(input_ids, scores): - this_peer_finished = True + unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) + this_peer_finished = unfinished_sequences.max() == 0 tm_list.append(time.perf_counter() - tic) - if this_peer_finished and not synced_gpus: - break if streamer is not None: streamer.end() if return_dict_in_generate: if self.config.is_encoder_decoder: - return GreedySearchEncoderDecoderOutput( + return GenerateEncoderDecoderOutput( sequences=input_ids, scores=scores, + logits=raw_logits, encoder_attentions=encoder_attentions, encoder_hidden_states=encoder_hidden_states, decoder_attentions=decoder_attentions, cross_attentions=cross_attentions, decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), ) else: - return GreedySearchDecoderOnlyOutput( + return GenerateDecoderOnlyOutput( sequences=input_ids, scores=scores, + logits=raw_logits, attentions=decoder_attentions, hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), ) else: return input_ids @@ -334,12 +355,12 @@ def get_time_infer_list(self): global tm_infer_list return tm_infer_list - def new_forward(self, model, model_type=None): + def new_forward(self, model): """Define a new greedy search function.""" - min_version = version.parse('4.34.0') + model._greedy_search = new_greedy_search.__get__(model, model.__class__) + model._sample = hook_sample.new_sample.__get__(model, model.__class__) trans_version = version.parse(transformers.__version__) - if trans_version < min_version: - log.warning(f'The function of getting latency of greedy search will not be available with current transformers version:{trans_version}') - else: - bound_method = new_greedy_search.__get__(model, model.__class__) - model.greedy_search = bound_method + if trans_version >= version.parse('4.45.0'): + model._sample = hook_sample_v45.new_sample.__get__(model, model.__class__) + elif trans_version >= version.parse('4.43.0'): + model._sample = hook_sample_v43.new_sample.__get__(model, model.__class__) diff --git a/llm_bench/python/llm_bench_utils/hook_sample.py b/llm_bench/python/llm_bench_utils/hook_sample.py new file mode 100644 index 0000000000..22111c1a3f --- /dev/null +++ b/llm_bench/python/llm_bench_utils/hook_sample.py 
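Because the upstream _sample signature changed across transformers releases, new_forward above keeps three copies of the sampling loop and picks one by version: hook_sample.py (the v4.41-based copy that starts below), hook_sample_v43.py, and hook_sample_v45.py. A standalone restatement of that selection, assuming transformers and packaging are importable (the print is illustrative only):

    import transformers
    from packaging import version

    trans_version = version.parse(transformers.__version__)
    if trans_version >= version.parse('4.45.0'):
        selected = 'hook_sample_v45.new_sample'
    elif trans_version >= version.parse('4.43.0'):
        selected = 'hook_sample_v43.new_sample'
    else:
        selected = 'hook_sample.new_sample'  # the v4.41-based copy defined below
    print(f'transformers {trans_version}: _sample will be patched with {selected}')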
@@ -0,0 +1,229 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# flake8: noqa +import time +import torch +import logging as log +from torch import nn +from typing import Optional, Tuple, Union +from transformers.generation.stopping_criteria import ( + StoppingCriteriaList, + validate_stopping_criteria, +) +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.streamers import BaseStreamer +from transformers.utils import ModelOutput +from transformers.generation.configuration_utils import GenerationConfig +import llm_bench_utils.hook_greedy_search as hook_greedy + + +logger = log.getLogger(__name__) + + +class GenerateDecoderOnlyOutput(ModelOutput): + sequences: torch.LongTensor = None + scores: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +class GenerateEncoderDecoderOutput(ModelOutput): + sequences: torch.LongTensor = None + scores: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +GenerateNonBeamOutput = Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput] + + +# Transformers version: v4.41-release ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2 +# Copied from https://github.com/huggingface/transformers/blob/ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2/src/transformers/generation/utils.py#L2310 +# Add the function of collecting latency +def new_sample( + self, + input_ids: torch.LongTensor, + logits_processor: LogitsProcessorList, + stopping_criteria: StoppingCriteriaList, + generation_config: GenerationConfig, + synced_gpus: bool, + streamer: Optional["BaseStreamer"], + logits_warper: Optional[LogitsProcessorList] = None, + **model_kwargs, + ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + generation_config ([`~generation.GenerationConfig`]): + The generation configuration to be used as parametrization of the decoding method. 
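The loop below differs from the greedy hook mainly in token selection: when do_sample is set, the warped scores are converted to probabilities and sampled with torch.multinomial, otherwise argmax is used. That single step, sketched in isolation with hypothetical scores:

    import torch
    from torch import nn

    next_token_scores = torch.tensor([[2.0, 0.5, -1.0]])  # hypothetical post-processor scores for one sequence
    probs = nn.functional.softmax(next_token_scores, dim=-1)
    sampled_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)  # do_sample=True path
    greedy_tokens = torch.argmax(next_token_scores, dim=-1)              # do_sample=False path

Finished sequences are then forced to pad_token_id, as the has_eos_stopping_criteria branch further down shows.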
+ synced_gpus (`bool`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + logits_warper (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used + to warp the prediction score distribution of the language modeling head applied before multinomial + sampling at each generation step. Only required with sampling strategies (i.e. `do_sample` is set in + `generation_config`) + model_kwargs: + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: + A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + """ + # init values + pad_token_id = generation_config.pad_token_id + output_attentions = generation_config.output_attentions + output_hidden_states = generation_config.output_hidden_states + output_scores = generation_config.output_scores + output_logits = generation_config.output_logits + return_dict_in_generate = generation_config.return_dict_in_generate + has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) + do_sample = generation_config.do_sample + if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): + raise ValueError( + "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " + f"{logits_warper})." 
+ ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + batch_size = input_ids.shape[0] + this_peer_finished = False + unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) + model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) + + while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): + tic = time.perf_counter() + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + tic_infer = time.perf_counter() + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + hook_greedy.tm_infer_list.append(time.perf_counter() - tic_infer) + + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + if do_sample: + next_token_scores = logits_warper(input_ids, next_token_scores) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_logits: + raw_logits += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # token selection + if do_sample: + probs = nn.functional.softmax(next_token_scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(next_token_scores, dim=-1) + + # finished sentences should have their next token be a padding token + if has_eos_stopping_criteria: + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if streamer is not None: + streamer.put(next_tokens.cpu()) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) + + unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) + this_peer_finished = unfinished_sequences.max() == 0 + hook_greedy.tm_list.append(time.perf_counter() - 
tic) + + if streamer is not None: + streamer.end() + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return GenerateEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return GenerateDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return input_ids \ No newline at end of file diff --git a/llm_bench/python/llm_bench_utils/hook_sample_v43.py b/llm_bench/python/llm_bench_utils/hook_sample_v43.py new file mode 100644 index 0000000000..7dce578dac --- /dev/null +++ b/llm_bench/python/llm_bench_utils/hook_sample_v43.py @@ -0,0 +1,233 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# flake8: noqa +import time +import torch +import logging as log +from torch import nn +from typing import Optional, Tuple, Union +from transformers.generation.stopping_criteria import ( + StoppingCriteriaList, + validate_stopping_criteria, +) +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.streamers import BaseStreamer +from transformers.utils import ModelOutput +from transformers.generation.configuration_utils import GenerationConfig +import llm_bench_utils.hook_greedy_search as hook_greedy + + +logger = log.getLogger(__name__) + + +class GenerateDecoderOnlyOutput(ModelOutput): + sequences: torch.LongTensor = None + scores: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +class GenerateEncoderDecoderOutput(ModelOutput): + sequences: torch.LongTensor = None + scores: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +GenerateNonBeamOutput = Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput] + + +# Transformers version: v4.43-release 868d36d29ec132deeaaf8571b25b6a1b911d0145 +# Copied from https://github.com/huggingface/transformers/blob/868d36d29ec132deeaaf8571b25b6a1b911d0145/src/transformers/generation/utils.py#L2841 +# Add the function of collecting latency +def new_sample( + self, + input_ids: torch.LongTensor, + logits_processor: LogitsProcessorList, + stopping_criteria: StoppingCriteriaList, + generation_config: GenerationConfig, + synced_gpus: bool, + streamer: Optional["BaseStreamer"], + logits_warper: Optional[LogitsProcessorList], + **model_kwargs, + ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language 
modeling head using **multinomial sampling** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + generation_config ([`~generation.GenerationConfig`]): + The generation configuration to be used as parametrization of the decoding method. + synced_gpus (`bool`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + logits_warper (`LogitsProcessorList`, *optional*): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used + to warp the prediction score distribution of the language modeling head applied before multinomial + sampling at each generation step. Only required with sampling strategies (i.e. `do_sample` is set in + `generation_config`) + model_kwargs: + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: + A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. + """ + # init values + pad_token_id = generation_config._pad_token_tensor + output_attentions = generation_config.output_attentions + output_hidden_states = generation_config.output_hidden_states + output_scores = generation_config.output_scores + output_logits = generation_config.output_logits + return_dict_in_generate = generation_config.return_dict_in_generate + has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) + do_sample = generation_config.do_sample + if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): + raise ValueError( + "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " + f"{logits_warper})." 
+ ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + batch_size = input_ids.shape[0] + this_peer_finished = False + unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) + model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) + + while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): + tic = time.perf_counter() + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # prepare variable output controls (note: some models won't accept all output controls) + model_inputs.update({"output_attentions": output_attentions} if output_attentions else {}) + model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {}) + + # forward pass to get next token + tic_infer = time.perf_counter() + outputs = self(**model_inputs, return_dict=True) + hook_greedy.tm_infer_list.append(time.perf_counter() - tic_infer) + + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + + # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration + # (the clone itself is always small) + next_token_logits = outputs.logits[:, -1, :].clone() + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + if do_sample: + next_token_scores = logits_warper(input_ids, next_token_scores) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_logits: + raw_logits += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # token selection + if do_sample: + probs = nn.functional.softmax(next_token_scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(next_token_scores, dim=-1) + + # finished sentences should have their next token be a padding token + if has_eos_stopping_criteria: + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if streamer is not None: + 
streamer.put(next_tokens.cpu()) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) + + unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) + this_peer_finished = unfinished_sequences.max() == 0 + hook_greedy.tm_list.append(time.perf_counter() - tic) + # This is needed to properly delete outputs.logits which may be very large for first iteration + # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration + del outputs + + if streamer is not None: + streamer.end() + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return GenerateEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return GenerateDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return input_ids \ No newline at end of file diff --git a/llm_bench/python/llm_bench_utils/hook_sample_v45.py b/llm_bench/python/llm_bench_utils/hook_sample_v45.py new file mode 100644 index 0000000000..1644c63a4f --- /dev/null +++ b/llm_bench/python/llm_bench_utils/hook_sample_v45.py @@ -0,0 +1,225 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# flake8: noqa +import time +import torch +import logging as log +from torch import nn +from typing import Optional, Tuple, Union +from transformers.generation.stopping_criteria import ( + StoppingCriteriaList, + validate_stopping_criteria, +) +from transformers.generation.logits_process import LogitsProcessorList +from transformers.generation.streamers import BaseStreamer +from transformers.utils import ModelOutput +from transformers.generation.configuration_utils import GenerationConfig +import llm_bench_utils.hook_greedy_search as hook_greedy + + +logger = log.getLogger(__name__) + + +class GenerateDecoderOnlyOutput(ModelOutput): + sequences: torch.LongTensor = None + scores: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +class GenerateEncoderDecoderOutput(ModelOutput): + sequences: torch.LongTensor = None + scores: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + past_key_values: Optional[Tuple[Tuple[Tuple[torch.FloatTensor]]]] = None + + +GenerateNonBeamOutput = Union[GenerateDecoderOnlyOutput, GenerateEncoderDecoderOutput] + + +# Transformers version: v4.45.0 +# Copied from 
https://github.com/huggingface/transformers/blob/v4.45.0/src/transformers/generation/utils.py#L2925 +# Add the function of collecting latency +def new_sample( + self, + input_ids: torch.LongTensor, + logits_processor: LogitsProcessorList, + stopping_criteria: StoppingCriteriaList, + generation_config: GenerationConfig, + synced_gpus: bool, + streamer: Optional["BaseStreamer"], + **model_kwargs, + ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: + r""" + Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and + can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. + + Parameters: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (`LogitsProcessorList`): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + stopping_criteria (`StoppingCriteriaList`): + An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] + used to tell if the generation loop should stop. + generation_config ([`~generation.GenerationConfig`]): + The generation configuration to be used as parametrization of the decoding method. + synced_gpus (`bool`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + streamer (`BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + model_kwargs: + Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is + an encoder-decoder model the kwargs should include `encoder_outputs`. + + Return: + [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: + A `torch.LongTensor` containing the generated tokens (default behaviour) or a + [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and + `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if + `model.config.is_encoder_decoder=True`. 
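The code that installs this hook is not part of this hunk; as an illustration only, a replacement sampler like `new_sample` can be bound onto a loaded Hugging Face model roughly as below. The attribute name `_sample`, the `model`/`inputs` placeholders, and the assumption that the shared timing containers are plain Python lists are assumptions, not taken from this patch:

import types
import llm_bench_utils.hook_greedy_search as hook_greedy

# Replace the stock sampling loop on this model instance with the
# latency-collecting variant defined above.
model._sample = types.MethodType(new_sample, model)

hook_greedy.tm_list.clear()        # per-token wall-clock time, appended once per loop iteration
hook_greedy.tm_infer_list.clear()  # per-token forward-pass time only
_ = model.generate(**inputs, do_sample=True, max_new_tokens=32)
first_token_ms = hook_greedy.tm_list[0] * 1000 if hook_greedy.tm_list else None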
+ """ + # init values + pad_token_id = generation_config._pad_token_tensor + output_attentions = generation_config.output_attentions + output_hidden_states = generation_config.output_hidden_states + output_scores = generation_config.output_scores + output_logits = generation_config.output_logits + return_dict_in_generate = generation_config.return_dict_in_generate + max_length = generation_config.max_length + has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) + do_sample = generation_config.do_sample + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + raw_logits = () if (return_dict_in_generate and output_logits) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + batch_size, cur_len = input_ids.shape + this_peer_finished = False + unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) + model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) + + while self._has_unfinished_sequences( + this_peer_finished, synced_gpus, device=input_ids.device, cur_len=cur_len, max_length=max_length + ): + tic = time.perf_counter() + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # prepare variable output controls (note: some models won't accept all output controls) + model_inputs.update({"output_attentions": output_attentions} if output_attentions else {}) + model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {}) + + # forward pass to get next token + tic_infer = time.perf_counter() + outputs = self(**model_inputs, return_dict=True) + hook_greedy.tm_infer_list.append(time.perf_counter() - tic_infer) + + if synced_gpus and this_peer_finished: + continue # don't waste resources running the code we don't need + + # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration + # (the clone itself is always small) + next_token_logits = outputs.logits.clone()[:, -1, :].float() + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_logits: + raw_logits += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # token selection + if do_sample: + probs = 
nn.functional.softmax(next_token_scores, dim=-1) + # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + next_tokens = torch.argmax(next_token_scores, dim=-1) + + # finished sentences should have their next token be a padding token + if has_eos_stopping_criteria: + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + if streamer is not None: + streamer.put(next_tokens.cpu()) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, + model_kwargs, + is_encoder_decoder=self.config.is_encoder_decoder, + ) + + unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) + this_peer_finished = unfinished_sequences.max() == 0 + cur_len += 1 + hook_greedy.tm_list.append(time.perf_counter() - tic) + # This is needed to properly delete outputs.logits which may be very large for first iteration + # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration + del outputs + + if streamer is not None: + streamer.end() + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return GenerateEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return GenerateDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + logits=raw_logits, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + past_key_values=model_kwargs.get("past_key_values"), + ) + else: + return input_ids \ No newline at end of file diff --git a/llm_bench/python/utils/memory_profile.py b/llm_bench/python/llm_bench_utils/memory_profile.py similarity index 63% rename from llm_bench/python/utils/memory_profile.py rename to llm_bench/python/llm_bench_utils/memory_profile.py index a90256a0a6..25bf33c938 100644 --- a/llm_bench/python/utils/memory_profile.py +++ b/llm_bench/python/llm_bench_utils/memory_profile.py @@ -1,10 +1,11 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from threading import Event, Thread import psutil import time import os +import sys class MemConsumption: @@ -13,6 +14,7 @@ def __init__(self): self.g_exit_get_mem_thread = False self.g_end_collect_mem = False self.g_max_rss_mem_consumption = -1 + self.g_max_uss_mem_consumption = -1 self.g_max_shared_mem_consumption = -1 self.g_event = Event() self.g_data_event = Event() @@ -23,15 +25,29 @@ def collect_memory_consumption(self): self.g_event.wait() while True: process = psutil.Process(os.getpid()) - rss_mem_data = process.memory_info().rss / float(2**20) try: - shared_mem_data = process.memory_info().shared / float(2**20) + memory_full_info = process.memory_full_info() + rss_mem_data = memory_full_info.rss + if sys.platform.startswith('linux'): + shared_mem_data = memory_full_info.shared + uss_mem_data = rss_mem_data - shared_mem_data + elif sys.platform.startswith('win'): + uss_mem_data = memory_full_info.uss + shared_mem_data = rss_mem_data - uss_mem_data + else: + 
uss_mem_data = -1 + shared_mem_data = -1 except Exception: + rss_mem_data = -1 + uss_mem_data = -1 shared_mem_data = -1 + if rss_mem_data > self.g_max_rss_mem_consumption: self.g_max_rss_mem_consumption = rss_mem_data if shared_mem_data > self.g_max_shared_mem_consumption: self.g_max_shared_mem_consumption = shared_mem_data + if uss_mem_data > self.g_max_uss_mem_consumption: + self.g_max_uss_mem_consumption = uss_mem_data self.g_data_event.set() if self.g_end_collect_mem is True: self.g_event.set() @@ -54,11 +70,15 @@ def get_max_memory_consumption(self): """Return the data.""" self.g_data_event.wait() self.g_data_event.clear() - return self.g_max_rss_mem_consumption, self.g_max_shared_mem_consumption + max_rss_mem = self.g_max_rss_mem_consumption / float(2**20) if self.g_max_rss_mem_consumption > -1 else -1 + max_shared_mem = self.g_max_shared_mem_consumption / float(2**20) if self.g_max_shared_mem_consumption > -1 else -1 + max_uss_mem = self.g_max_uss_mem_consumption / float(2**20) if self.g_max_uss_mem_consumption > -1 else -1 + return max_rss_mem, max_shared_mem, max_uss_mem def clear_max_memory_consumption(self): """Clear MemConsumption.""" self.g_max_rss_mem_consumption = -1 + self.g_max_uss_mem_consumption = -1 self.g_max_shared_mem_consumption = -1 def start_collect_mem_consumption_thread(self): diff --git a/llm_bench/python/llm_bench_utils/metrics_print.py b/llm_bench/python/llm_bench_utils/metrics_print.py new file mode 100644 index 0000000000..f1e8b3ddc3 --- /dev/null +++ b/llm_bench/python/llm_bench_utils/metrics_print.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +import logging as log + + +def print_metrics( + iter_num, iter_data, tms=None, tms_infer=None, warm_up=False, max_rss_mem=-1, max_shared_mem=-1, + max_uss_mem=-1, stable_diffusion=None, tokenization_time=None, batch_size=1 +): + iter_str = str(iter_num) + if warm_up: + iter_str = 'warm-up' + output_str = '' + latency_unit = 'token' + if batch_size > 1: + latency_unit = '{}tokens'.format(batch_size) + if iter_data['input_size'] != '': + output_str += 'Input token size: {}, '.format(iter_data['input_size']) + if iter_data['output_size'] != '': + output_str += 'Output size: {}, '.format(iter_data['output_size']) + if iter_data['infer_count'] != '': + output_str += 'Infer count: {}, '.format(iter_data['infer_count']) + if tokenization_time: + output_str += 'Tokenization Time: {:.2f}ms, '.format(tokenization_time[0]) + if len(tokenization_time) > 1: + output_str += 'Detokenization Time: {:.2f}ms, '.format(tokenization_time[1]) + if iter_data['generation_time'] != '': + output_str += 'Generation Time: {:.2f}s, '.format(iter_data['generation_time']) + if iter_data['latency'] != '': + output_str += 'Latency: {:.2f} ms/{}'.format(iter_data['latency'], latency_unit) + if output_str != '': + output_str = ' '.join(['[{}]'.format(iter_str), output_str]) + log.info(output_str) + if tms is not None: + iter_data['first_token_latency'] = tms[0] * 1000 if len(tms) > 0 else -1 + iter_data['other_tokens_avg_latency'] = sum(tms[1:]) / (len(tms) - 1) * 1000 if len(tms) > 1 else -1 + first_token_latency = 'NA' if iter_data['first_token_latency'] == -1 else f"{iter_data['first_token_latency']:.2f} ms/{latency_unit}" + other_token_latency = 'NA' if iter_data['other_tokens_avg_latency'] == -1 else f"{iter_data['other_tokens_avg_latency']:.2f} ms/{latency_unit}" + log.info( + f"[{iter_str}] First token latency: {first_token_latency}, " + f"other tokens 
latency: {other_token_latency}, len of tokens: {len(tms)} * {batch_size}", + ) + if len(tms) == 0: + log.warning(f'[{iter_str}] No hook data output for first token latency and other tokens latency') + if tms_infer is not None: + iter_data['first_token_infer_latency'] = tms_infer[0] * 1000 if len(tms_infer) > 0 else -1 + iter_data['other_tokens_infer_avg_latency'] = sum(tms_infer[1:]) / (len(tms_infer) - 1) * 1000 if len(tms_infer) > 1 else -1 + first_infer_latency = 'NA' if iter_data['first_token_infer_latency'] == -1 else f"{iter_data['first_token_infer_latency']:.2f} ms/infer" + other_infer_latency = 'NA' if iter_data['other_tokens_infer_avg_latency'] == -1 else f"{iter_data['other_tokens_infer_avg_latency']:.2f} ms/infer" + log.info( + f"[{iter_str}] First infer latency: {first_infer_latency}, " + f"other infers latency: {other_infer_latency}, inference count: {len(tms_infer)}", + ) + if len(tms_infer) == 0: + log.warning(f'[{iter_str}] No hook data output for first infer latency and other infers latency') + if stable_diffusion is not None: + print_stable_diffusion_infer_latency(iter_str, iter_data, stable_diffusion) + output_str = '' + if max_rss_mem != '' and max_rss_mem > -1: + output_str += 'Max rss memory cost: {:.2f}MBytes, '.format(max_rss_mem) + if max_uss_mem != '' and max_uss_mem > -1: + output_str += 'max uss memory cost: {:.2f}MBytes, '.format(max_uss_mem) + if max_shared_mem != '' and max_shared_mem > -1: + output_str += 'max shared memory cost: {:.2f}MBytes'.format(max_shared_mem) + if output_str != '': + output_str = ' '.join(['[{}]'.format(iter_str), output_str]) + log.info(output_str) + if iter_data['result_md5'] != '': + log.info(f"[{iter_str}] Result MD5:{iter_data['result_md5']}") + + +def print_generated(iter_num, warm_up=False, generated=None): + iter_str = str(iter_num) + if warm_up: + iter_str = 'warm-up' + if generated is not None: + try: + log.info(f'[{iter_str}] Generated: {generated}') + except UnicodeError: + try: + utf8_generated = generated.encode(encoding="utf-8", errors="replace").decode() + log.info(f'[{iter_str}] Generated: {utf8_generated}') + except Exception: + log.warning(f"[{iter_str}] Unable print generated") + + +def print_stable_diffusion_infer_latency(iter_str, iter_data, stable_diffusion): + iter_data['first_token_latency'] = stable_diffusion.get_1st_unet_latency() + iter_data['other_tokens_avg_latency'] = stable_diffusion.get_2nd_unet_latency() + iter_data['first_token_infer_latency'] = iter_data['first_token_latency'] + iter_data['other_tokens_infer_avg_latency'] = iter_data['other_tokens_avg_latency'] + log.info(f"[{iter_str}] First step of unet latency: {iter_data['first_token_latency']:.2f} ms/step, " + f"other steps of unet latency: {iter_data['other_tokens_avg_latency']:.2f} ms/step",) + log.info(f"[{iter_str}] Text encoder latency: {stable_diffusion.get_text_encoder_latency():.2f} ms/step, " + f"unet latency: {stable_diffusion.get_unet_latency():.2f} ms/step, " + f"vae decoder latency: {stable_diffusion.get_vae_decoder_latency():.2f} ms/step, " + f"text encoder step count: {stable_diffusion.get_text_encoder_step_count()}, " + f"unet step count: {stable_diffusion.get_unet_step_count()}, " + f"vae decoder step count: {stable_diffusion.get_vae_decoder_step_count()}",) + + +def print_ldm_unet_vqvae_infer_latency(iter_num, iter_data, tms=None, warm_up=False): + iter_str = str(iter_num) + if warm_up: + iter_str = 'warm-up' + len_tms = len(tms) + iter_data['first_token_latency'] = tms[0] * 1000 if len_tms > 0 else -1 + 
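As a quick worked example of the latency split computed above (the timings are invented): if the hook collected these per-token wall times,

tms = [0.512, 0.048, 0.052, 0.050]                        # seconds; tms[0] covers the first token
first_token_latency = tms[0] * 1000                       # 512.0 ms
other_tokens_avg = sum(tms[1:]) / (len(tms) - 1) * 1000   # 0.150 / 3 * 1000 = 50.0 ms per token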
iter_data['other_tokens_avg_latency'] = sum(tms[1:(len_tms - 1)]) / (len_tms - 2) * 1000 if len_tms > 2 else 0 + iter_data['first_token_infer_latency'] = iter_data['first_token_latency'] + iter_data['other_tokens_infer_avg_latency'] = iter_data['other_tokens_avg_latency'] + + first_token_latency = 'NA' if iter_data['first_token_latency'] == -1 else f"{iter_data['first_token_latency']:.2f} ms/step" + other_token_latency = 'NA' if iter_data['other_tokens_avg_latency'] == -1 else f"{iter_data['other_tokens_avg_latency']:.2f} ms/step" + log.info(f"[{iter_str}] First step of unet latency: {first_token_latency}, " + f"other steps of unet latency: {other_token_latency}",) + if len_tms > 1: + log.info(f"[{iter_str}] Unet latency: {(sum(tms[0:(len_tms - 1)]) / (len_tms - 1)) * 1000:.2f} ms/step, " + f"vqvae decoder latency: {tms[len_tms - 1] * 1000:.2f} ms/step, " + f"unet step count: {len_tms - 1}, " + f"vqvae decoder step count: 1",) + + +def output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch_size, is_text_gen): + for p_idx in prompt_idx_list: + avg_1st_token_latency = 0 + avg_2nd_tokens_latency = 0 + avg_2nd_token_tput = 0 + avg_input_size = 0 + index_num = 0 + for iter_data in iter_data_list: + # Exclude the warm-up iteration + if iter_data['iteration'] == 0: + continue + if iter_data['prompt_idx'] == p_idx: + avg_1st_token_latency += iter_data['first_token_latency'] if iter_data['first_token_latency'] != '' else 0 + avg_2nd_tokens_latency += iter_data['other_tokens_avg_latency'] if iter_data['other_tokens_avg_latency'] != '' else 0 + avg_input_size += iter_data['input_size'] if iter_data['input_size'] != '' else 0 + index_num = index_num + 1 + if index_num > 0: + avg_1st_token_latency = avg_1st_token_latency / index_num + avg_2nd_tokens_latency = avg_2nd_tokens_latency / index_num + avg_input_size = int(avg_input_size / index_num) + if avg_2nd_tokens_latency > 0: + avg_2nd_token_tput = (1 / avg_2nd_tokens_latency) * batch_size * 1000 + latency_unit = 'token' if is_text_gen is True else 'step' + if batch_size > 1: + if is_text_gen is True: + latency_unit = '{}tokens'.format(batch_size) + else: + latency_unit = '{}steps'.format(batch_size) + avg_1st_token_latency = 'NA' if avg_1st_token_latency < 0 else f'{avg_1st_token_latency:.2f} ms/{latency_unit}' + avg_2nd_tokens_latency = 'NA' if avg_2nd_tokens_latency < 0 else f'{avg_2nd_tokens_latency:.2f} ms/{latency_unit}' + avg_2nd_token_tput = 'NA' if avg_2nd_tokens_latency == 'NA' else f'{avg_2nd_token_tput:.2f} {latency_unit}s/s' + if is_text_gen is True: + prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] Input token size: {}, 1st token latency: {}, ' \ + '2nd token latency: {}, 2nd tokens throughput: {}' \ + .format(p_idx, avg_input_size, avg_1st_token_latency, avg_2nd_tokens_latency, avg_2nd_token_tput) + else: + prompt_dict[p_idx] = '\n[ INFO ] [Average] Prompt[{}] 1st step of unet latency: {}, ' \ + '2nd steps of unet latency: {}, 2nd steps throughput: {}' \ + .format(p_idx, avg_1st_token_latency, avg_2nd_tokens_latency, avg_2nd_token_tput) + + +def print_average(iter_data_list, prompt_idx_list, batch_size, is_text_gen=False): + if len(iter_data_list) <= 1: + # 1st iteration is the warm-up iteration + return + total_generation_time = 0 + total_num_tokens = 0 + warm_up_iters = 0 + for iter_data in iter_data_list: + if iter_data['iteration'] == 0: + # Exclude the warm-up iteration + warm_up_iters = warm_up_iters + 1 + continue + if iter_data['generation_time'] != '': + total_generation_time += 
iter_data['generation_time'] + if iter_data['output_size'] != '': + total_num_tokens += iter_data['output_size'] + + total_iters = len(iter_data_list) - warm_up_iters + + if total_iters > 0: + prompt_dict = {} + output_avg_statis_tokens(prompt_dict, prompt_idx_list, iter_data_list, batch_size, is_text_gen) + log.info('<<< Warm-up iteration is excluded. >>>') + out_str = '[Total] Iterations: {}'.format(total_iters) + for prompt_key in prompt_dict: + out_str += prompt_dict[prompt_key] + log.info(out_str) diff --git a/llm_bench/python/utils/model_utils.py b/llm_bench/python/llm_bench_utils/model_utils.py similarity index 55% rename from llm_bench/python/utils/model_utils.py rename to llm_bench/python/llm_bench_utils/model_utils.py index 6a253f1df5..3d5359e26c 100644 --- a/llm_bench/python/utils/model_utils.py +++ b/llm_bench/python/llm_bench_utils/model_utils.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 import argparse import os import json import logging as log from pathlib import Path -from utils.config_class import DEFAULT_MODEL_CLASSES, USE_CASES, OV_MODEL_CLASSES_MAPPING, PT_MODEL_CLASSES_MAPPING +from llm_bench_utils.config_class import DEFAULT_MODEL_CLASSES, USE_CASES, OV_MODEL_CLASSES_MAPPING, PT_MODEL_CLASSES_MAPPING def get_prompts(args): @@ -25,24 +25,25 @@ def get_prompts(args): else: raise RuntimeError('== prompt should not be empty string ==') else: - input_prompt = args['prompt_file'] - if input_prompt.endswith('.jsonl'): - if os.path.exists(input_prompt): - log.info(f'Read prompts from {input_prompt}') - with open(input_prompt, 'r', encoding='utf-8') as f: - for line in f: - data = json.loads(line) - if 'prompt' in data: - if data['prompt'] != '': - prompts_list.append(data['prompt']) + input_prompt_list = args['prompt_file'] + for input_prompt in input_prompt_list: + if input_prompt.endswith('.jsonl'): + if os.path.exists(input_prompt): + log.info(f'Read prompts from {input_prompt}') + with open(input_prompt, 'r', encoding='utf-8') as f: + for line in f: + data = json.loads(line) + if 'prompt' in data: + if data['prompt'] != '': + prompts_list.append(data['prompt']) + else: + raise RuntimeError(f'== prompt in prompt file:{input_prompt} should not be empty string ==') else: - raise RuntimeError('== prompt should not be empty string ==') - else: - raise RuntimeError('== key word "prompt" does not exist in prompt file ==') + raise RuntimeError(f'== key word "prompt" does not exist in prompt file:{input_prompt} ==') + else: + raise RuntimeError(f'== The prompt file:{input_prompt} does not exist ==') else: - raise RuntimeError('== The prompt file does not exist ==') - else: - raise RuntimeError('== The prompt file should be ended with .jsonl ==') + raise RuntimeError(f'== The prompt file:{input_prompt} should be ended with .jsonl ==') return prompts_list @@ -59,46 +60,42 @@ def get_image_param_from_prompt_file(args): else: raise RuntimeError('== prompt should not be empty string ==') else: - input_prompt = args['prompt_file'] - if input_prompt.endswith('.jsonl'): - if os.path.exists(input_prompt): - log.info(f'Read prompts from {input_prompt}') - with open(input_prompt, 'r', encoding='utf-8') as f: - for line in f: - image_param = {} - data = json.loads(line) - if 'prompt' in data: - if data['prompt'] != '': - image_param['prompt'] = data['prompt'] + input_prompt_list = args['prompt_file'] + for input_prompt in input_prompt_list: + if 
input_prompt.endswith('.jsonl'): + if os.path.exists(input_prompt): + log.info(f'Read prompts from {input_prompt}') + with open(input_prompt, 'r', encoding='utf-8') as f: + for line in f: + image_param = {} + data = json.loads(line) + if 'prompt' in data: + if data['prompt'] != '': + image_param['prompt'] = data['prompt'] + else: + raise RuntimeError(f'== prompt in prompt file:{input_prompt} should not be empty string ==') + else: - raise RuntimeError('== prompt should not be empty string ==') - else: - raise RuntimeError('== key word "prompt" does not exist in prompt file ==') - if 'width' in data: - image_param['width'] = int(data['width']) - if 'height' in data: - image_param['height'] = int(data['height']) - if 'steps' in data: - image_param['steps'] = int(data['steps']) - if 'guidance_scale' in data: - image_param['guidance_scale'] = float(data['guidance_scale']) - image_param_list.append(image_param) + raise RuntimeError(f'== key word "prompt" does not exist in prompt file:{input_prompt} ==') + if 'width' in data: + image_param['width'] = int(data['width']) + if 'height' in data: + image_param['height'] = int(data['height']) + if 'steps' in data: + image_param['steps'] = int(data['steps']) + if 'guidance_scale' in data: + image_param['guidance_scale'] = float(data['guidance_scale']) + image_param_list.append(image_param) + else: + raise RuntimeError(f'== The prompt file:{input_prompt} does not exist ==') else: - raise RuntimeError('== The prompt file does not exist ==') - else: - raise RuntimeError('== The prompt file should be ended with .jsonl ==') + raise RuntimeError(f'== The prompt file:{input_prompt} should be ended with .jsonl ==') return image_param_list def set_default_param_for_ov_config(ov_config): - if 'PERFORMANCE_HINT' not in ov_config: - ov_config['PERFORMANCE_HINT'] = 'LATENCY' # With this PR https://github.com/huggingface/optimum-intel/pull/362, we are able to disable model cache if 'CACHE_DIR' not in ov_config: ov_config['CACHE_DIR'] = '' - # OpenVINO self have default value 2 for nstreams on machine with 2 nodes. Reducing memory consumed via changing nstreams to 1.
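Since only CACHE_DIR keeps a default after this change, performance hints or stream counts now have to be supplied explicitly. A hypothetical invocation (the helper names come from this file; the JSON payload is just an example, and how the benchmark CLI actually passes the string in is not shown here):

# get_config() below accepts either a path to a JSON file or an inline JSON string.
ov_config = get_config('{"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1"}')
set_default_param_for_ov_config(ov_config)  # only fills in CACHE_DIR: '' if it is absent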
- if 'NUM_STREAMS' not in ov_config: - ov_config['NUM_STREAMS'] = '1' def add_stateful_model_arguments(parser: argparse.ArgumentParser): @@ -110,6 +107,13 @@ def add_stateful_model_arguments(parser: argparse.ArgumentParser): 'Additional operations are inserted into the model to handle cache state (Gathers, ShapeOf, etc.)', ) + parser.add_argument( + '--disable-stateful', + action="store_true", + default=None, + help="Disable stateful transformation for model conversion" + ) + def analyze_args(args): model_args = {} @@ -125,7 +129,20 @@ def analyze_args(args): model_args['save_prepared_model'] = args.save_prepared_model model_args['num_beams'] = args.num_beams model_args['torch_compile_backend'] = args.torch_compile_backend + model_args['torch_compile_dynamic'] = args.torch_compile_dynamic + model_args['torch_compile_options'] = args.torch_compile_options + model_args['torch_compile_input_module'] = args.torch_compile_input_module model_args['convert_tokenizer'] = args.convert_tokenizer + model_args['subsequent'] = args.subsequent + model_args['output_dir'] = args.output_dir + model_args['genai'] = args.genai + model_args["use_cb"] = args.use_cb + model_args['devices'] = args.device + model_args['prompt_index'] = [] if args.prompt_index is not None else None + if model_args['prompt_index'] is not None: + # Deduplication + [model_args['prompt_index'].append(i) for i in args.prompt_index if i not in model_args['prompt_index']] + model_args['end_token_stopping'] = args.end_token_stopping model_framework = args.framework model_path = Path(args.model) @@ -143,11 +160,18 @@ def analyze_args(args): model_args['config'] = config if model_framework == 'ov': set_default_param_for_ov_config(model_args['config']) - log.info(f"ov_config={model_args['config']}") + log.info(f"OV Config={model_args['config']}") elif model_framework == 'pt': - log.info(f"pt_config={model_args['config']}") + log.info(f"PT Config={model_args['config']}") model_args['model_type'] = get_model_type(model_name, use_case, model_framework) model_args['model_name'] = model_name + + if args.use_cb and not args.genai: + raise RuntimeError("Continuous batching mode is supported only via OpenVINO GenAI") + cb_config = None + if args.cb_config: + cb_config = get_config(args.cb_config) + model_args["cb_config"] = cb_config return model_path, model_framework, model_args, model_name @@ -180,11 +204,17 @@ def get_use_case(model_name_or_path): def get_config(config): - with open(config, 'r') as f: + if Path(config).is_file(): + with open(config, 'r') as f: + try: + ov_config = json.load(f) + except Exception: + raise RuntimeError(f'==Parse file:{config} failure, json format is incorrect ==') + else: try: - ov_config = json.load(f) + ov_config = json.loads(config) except Exception: - raise RuntimeError(f'==Parse file:{config} failiure, json format is incorrect ==') + raise RuntimeError(f'==Parse config:{config} failure, json format is incorrect ==') return ov_config @@ -202,7 +232,7 @@ def get_model_type(model_name, use_case, model_framework): def normalize_model_ids(model_ids_list): - return [m_id[:-1] if m_id.ends_with('_') else m_id for m_id in model_ids_list] + return [m_id[:-1] if m_id.endswith('_') else m_id for m_id in model_ids_list] def get_ir_conversion_frontend(cur_model_name, model_name_list): @@ -219,9 +249,14 @@ def get_ir_conversion_frontend(cur_model_name, model_name_list): def get_model_precision(model_name_list): precision_list = [ - 'FP32', 'FP16', 'FP16-INT8', 'INT8', 'INT8_compressed_weights', 'INT8_quantized',
'PT_compressed_weights', - 'OV_FP32-INT8', 'OV_FP16-INT8', 'PT_FP32-INT8', 'PT_FP16-INT8', 'GPTQ_INT4-FP32', 'GPTQ_INT4-FP16', 'INT4', - 'OV_FP16-INT4_SYM', 'OV_FP16-INT4_ASYM', 'OV_FP32-INT4_SYM', 'OV_FP32-INT4_ASYM', 'OV_FP32-4BIT_DEFAULT', 'OV_FP16-4BIT_DEFAULT'] + 'FP32', 'FP16', + 'FP16-INT8', 'INT8', 'INT8_compressed_weights', 'INT8_quantized', 'PT_compressed_weights', + 'OV_FP32-INT8', 'OV_FP16-INT8', + 'OV_FP32-INT8_ASYM', 'OV_FP32-INT8_SYM', 'OV_FP16-INT8_ASYM', 'OV_FP16-INT8_SYM', + 'PT_FP32-INT8', 'PT_FP16-INT8', 'PT_FP32-INT8_ASYM', 'PT_FP32-INT8_SYM', 'PT_FP16-INT8_ASYM', 'PT_FP16-INT8_SYM', + 'GPTQ_INT4-FP32', 'GPTQ_INT4-FP16', 'INT4', + 'OV_FP16-INT4_SYM', 'OV_FP16-INT4_ASYM', 'OV_FP32-INT4_SYM', 'OV_FP32-INT4_ASYM', + 'OV_FP32-4BIT_DEFAULT', 'OV_FP16-4BIT_DEFAULT', 'OV_FP32-4BIT_MAXIMUM', 'OV_FP16-4BIT_MAXIMUM'] model_precision = 'unknown' # Search from right to left of model path for i in range(len(model_name_list) - 1, -1, -1): diff --git a/llm_bench/python/llm_bench_utils/nncf_utils.py b/llm_bench/python/llm_bench_utils/nncf_utils.py new file mode 100644 index 0000000000..b65e90a3a9 --- /dev/null +++ b/llm_bench/python/llm_bench_utils/nncf_utils.py @@ -0,0 +1,37 @@ +from pathlib import Path + +import nncf + +COMPRESSION_OPTIONS = { + "INT8": { + "mode": nncf.CompressWeightsMode.INT8 if "INT8_ASYM" not in nncf.CompressWeightsMode.__members__ else nncf.CompressWeightsMode.INT8_ASYM}, + "INT4_SYM": { + "mode": nncf.CompressWeightsMode.INT4_SYM, + "group_size": 128, + }, + "INT4_ASYM": { + "mode": nncf.CompressWeightsMode.INT4_ASYM, + "group_size": 128, + }, + "4BIT_MAXIMUM": { + "mode": nncf.CompressWeightsMode.INT4_SYM, + "group_size": 128, + "ratio": 1, + "all_layers": True, + }, + "E2M1": { + "mode": nncf.CompressWeightsMode.E2M1, + "group_size": 32, + "all_layers": True, + }, +} + +if "INT8_ASYM" in nncf.CompressWeightsMode.__members__: + COMPRESSION_OPTIONS["INT8_ASYM"] = {"mode": nncf.CompressWeightsMode.INT8_ASYM} + +if "INT8_SYM" in nncf.CompressWeightsMode.__members__: + COMPRESSION_OPTIONS["INT8_SYM"] = {"mode": nncf.CompressWeightsMode.INT8_SYM} + + +def get_compressed_path(output_dir: str, base_precision, option: str): + return Path(output_dir) / "pytorch/dldt/compressed_weights" / f"OV_{base_precision}-{option}" diff --git a/llm_bench/python/llm_bench_utils/output_csv.py b/llm_bench/python/llm_bench_utils/output_csv.py new file mode 100644 index 0000000000..82bfb6bb7f --- /dev/null +++ b/llm_bench/python/llm_bench_utils/output_csv.py @@ -0,0 +1,186 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +import csv +import numpy as np +import copy +from pathlib import Path + + +def output_comments(result, use_case, writer): + for key in result.keys(): + result[key] = '' + writer.writerow(result) + + comment_list = [] + if use_case == 'text_gen' or use_case == 'code_gen': + comment_list.append('input_size: Input token size') + comment_list.append('output_size: Text/Code generation models: generated text token size') + comment_list.append("infer_count: Limit the Text/Code generation models' output token size") + comment_list.append('latency: Text/Code generation models: ms/token. 
Output token size / generation time') + comment_list.append('1st_latency: Text/Code generation models: First token latency') + comment_list.append('2nd_avg_latency: Text/Code generation models: Other tokens (exclude first token) latency') + comment_list.append('1st_infer_latency: Text/Code generation models: First inference latency') + comment_list.append('2nd_infer_avg_latency: Text/Code generation models: Other inferences (exclude first inference) latency') + comment_list.append('result_md5: MD5 of generated text') + comment_list.append('prompt_idx: Index of prompts') + elif use_case == 'image_gen': + comment_list.append("infer_count: Text2Image models' Inference(or Sampling) step size") + comment_list.append('1st_latency: First step latency of unet') + comment_list.append('2nd_avg_latency: Other steps latency of unet(exclude first step)') + comment_list.append('1st_infer_latency: Same as 1st_latency') + comment_list.append('2nd_infer_avg_latency: Same as 2nd_avg_latency') + comment_list.append('prompt_idx: Index of prompts') + elif use_case == 'ldm_super_resolution': + comment_list.append("infer_count: Text2Image models' Inference(or Sampling) step size") + comment_list.append('1st_latency: First step latency of unet') + comment_list.append('2nd_avg_latency: Other steps latency of unet(exclude first step)') + comment_list.append('1st_infer_latency: Same as 1st_latency') + comment_list.append('2nd_infer_avg_latency: Same as 2nd_avg_latency') + comment_list.append('prompt_idx: Image Index') + comment_list.append('tokenization_time: Tokenizer encode time') + comment_list.append('detokenization_time: Tokenizer decode time') + comment_list.append('pretrain_time: Total time of load model and compile model') + comment_list.append('generation_time: Time for one interaction. (e.g.
The duration of answering one question or generating one picture)') + comment_list.append('iteration=0: warm-up; iteration=avg: average (exclude warm-up);iteration=mini: minimum value (exclude warm-up);' + 'iteration=median: median value (exclude warm-up);') + comment_list.append( + 'max_rss_mem: max rss memory consumption;' + ) + comment_list.append( + 'max_shared_mem: max shared memory consumption;' + ) + + for comments in comment_list: + result['iteration'] = comments + writer.writerow(result) + + +def output_avg_min_median(iter_data_list): + prompt_idxs = [] + for iter_data in iter_data_list: + prompt_idxs.append(iter_data['prompt_idx']) + prompt_idxs = list(set(prompt_idxs)) + result = {} + for prompt_idx in prompt_idxs: + same_prompt_datas = [] + for iter_data in iter_data_list: + if iter_data['prompt_idx'] == prompt_idx and iter_data['iteration'] > 0: + same_prompt_datas.append(iter_data) + key_word = ['input_size', 'infer_count', 'generation_time', 'output_size', 'latency', 'first_token_latency', 'other_tokens_avg_latency', + 'first_token_infer_latency', 'other_tokens_infer_avg_latency', 'tokenization_time', 'detokenization_time'] + if len(same_prompt_datas) > 0: + iters_idx = ['avg', 'mini', 'median'] + result[prompt_idx] = [copy.deepcopy(same_prompt_datas[0]) for i in range(3)] + for i in range(len(iters_idx)): + result[prompt_idx][i]['iteration'] = iters_idx[i] + for key in key_word: + values = [] + for prompt in same_prompt_datas: + if prompt[key] != '': + values.append(prompt[key]) + if len(values) > 0: + result[prompt_idx][0][key] = np.mean(values) + result[prompt_idx][1][key] = np.min(values) + result[prompt_idx][2][key] = np.median(values) + return result + + +def gen_data_to_csv(result, iter_data, pretrain_time): + generation_time = iter_data['generation_time'] + latency = iter_data['latency'] + first_latency = iter_data['first_token_latency'] + other_latency = iter_data['other_tokens_avg_latency'] + first_token_infer_latency = iter_data['first_token_infer_latency'] + other_token_infer_latency = iter_data['other_tokens_infer_avg_latency'] + rss_mem = iter_data['max_rss_mem_consumption'] + uss_mem = iter_data['max_uss_mem_consumption'] + shared_mem = iter_data['max_shared_mem_consumption'] + token_time = iter_data['tokenization_time'] + detoken_time = iter_data['detokenization_time'] + result['iteration'] = str(iter_data['iteration']) + result['pretrain_time(s)'] = pretrain_time + result['input_size'] = iter_data['input_size'] + result['infer_count'] = iter_data['infer_count'] + result['generation_time(s)'] = round(generation_time, 5) if generation_time != '' else generation_time + result['output_size'] = iter_data['output_size'] + result['latency(ms)'] = round(latency, 5) if latency != '' else latency + result['result_md5'] = iter_data['result_md5'] + if first_latency < 0: + result['1st_latency(ms)'] = 'NA' + else: + result['1st_latency(ms)'] = round(first_latency, 5) if first_latency != '' else first_latency + if other_latency < 0: + result['2nd_avg_latency(ms)'] = 'NA' + else: + result['2nd_avg_latency(ms)'] = round(other_latency, 5) if other_latency != '' else other_latency + if first_token_infer_latency < 0: + result['1st_infer_latency(ms)'] = 'NA' + else: + result['1st_infer_latency(ms)'] = round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency + if other_token_infer_latency < 0: + result['2nd_infer_avg_latency(ms)'] = 'NA' + else: + result['2nd_infer_avg_latency(ms)'] = round(other_token_infer_latency, 5) if 
other_token_infer_latency != '' else other_token_infer_latency + result['max_rss_mem(MB)'] = round(rss_mem, 5) if rss_mem != '' else rss_mem + result['max_uss_mem(MB)'] = round(uss_mem, 5) if uss_mem != '' else uss_mem + result['max_shared_mem(MB)'] = round(shared_mem, 5) if shared_mem != '' else shared_mem + result['prompt_idx'] = iter_data['prompt_idx'] + result['tokenization_time'] = round(token_time, 5) if token_time != '' else token_time + result['detokenization_time'] = round(detoken_time, 5) if detoken_time != '' else detoken_time + + +def write_result(report_file, model, framework, device, model_args, iter_data_list, pretrain_time, model_precision): + header = [ + 'iteration', + 'model', + 'framework', + 'device', + 'pretrain_time(s)', + 'input_size', + 'infer_count', + 'generation_time(s)', + 'output_size', + 'latency(ms)', + '1st_latency(ms)', + '2nd_avg_latency(ms)', + 'precision', + 'max_rss_mem(MB)', + 'max_uss_mem(MB)', + 'max_shared_mem(MB)', + 'prompt_idx', + '1st_infer_latency(ms)', + '2nd_infer_avg_latency(ms)', + 'num_beams', + 'batch_size', + 'tokenization_time', + 'detokenization_time', + 'result_md5', + ] + out_file = Path(report_file) + + if len(iter_data_list) > 0: + with open(out_file, 'w+', newline='') as f: + writer = csv.DictWriter(f, header) + writer.writeheader() + result = {} + result['model'] = model + result['framework'] = framework + result['device'] = device + result['pretrain_time(s)'] = round(pretrain_time, 5) + result['precision'] = model_precision + result['num_beams'] = model_args['num_beams'] + result['batch_size'] = model_args['batch_size'] + for i in range(len(iter_data_list)): + iter_data = iter_data_list[i] + pre_time = '' if i > 0 else result['pretrain_time(s)'] + gen_data_to_csv(result, iter_data, pre_time) + writer.writerow(result) + + res_data = output_avg_min_median(iter_data_list) + + for key in res_data.keys(): + for data in res_data[key]: + gen_data_to_csv(result, data, '') + writer.writerow(result) + output_comments(result, model_args['use_case'], writer) diff --git a/llm_bench/python/llm_bench_utils/output_file.py b/llm_bench/python/llm_bench_utils/output_file.py new file mode 100644 index 0000000000..8efbb430a7 --- /dev/null +++ b/llm_bench/python/llm_bench_utils/output_file.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +import os + + +def save_text_to_file(input_text, text_file_name, args): + if args['output_dir'] is not None: + if os.path.exists(args['output_dir']) is False: + os.mkdir(args['output_dir']) + out_path = args['output_dir'] + else: + out_path = '.' + save_path = out_path + os.sep + text_file_name + input_text_file = open(save_path, 'w') + input_text_file.write(input_text) + input_text_file.close() + + +def save_image_file(img, img_file_name, args): + if args['output_dir'] is not None: + if os.path.exists(args['output_dir']) is False: + os.mkdir(args['output_dir']) + out_path = args['output_dir'] + else: + out_path = '.' 
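For orientation, a hypothetical call to output_gen_text (defined just below; the model name, precision and ids are invented) would write the text to results/llama-2-7b_OV_FP16-INT4_SYM_p0_iter1_pid42_output.txt:

args = {'output_dir': 'results', 'model_name': 'llama-2-7b', 'batch_size': 1}
output_gen_text(
    generated_text='...',
    args=args,
    model_precision='OV_FP16-INT4_SYM',
    prompt_idx=0,
    iteration=1,
    batchsize_idx=None,
    proc_id=42,
)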
+ save_path = out_path + os.sep + img_file_name + img.save(save_path) + return save_path + + +def output_input_text(input_text, args, model_precision, prompt_idx, batchsize_idx, proc_id): + if args['batch_size'] > 1: + text_file_name = args['model_name'] + '_' + model_precision + '_p' + str(prompt_idx) + '_bs' + str(batchsize_idx) + else: + text_file_name = args['model_name'] + '_' + model_precision + '_p' + str(prompt_idx) + text_file_name = text_file_name + '_pid' + str(proc_id) + '_input.txt' + save_text_to_file(input_text, text_file_name, args) + + +def output_image_input_text(input_text, args, prompt_idx, batchsize_idx, proc_id): + if args['batch_size'] > 1 and batchsize_idx is not None: + text_file_name = args['model_name'] + '_p' + str(prompt_idx) + '_bs' + str(batchsize_idx) + else: + text_file_name = args['model_name'] + '_p' + str(prompt_idx) + text_file_name = text_file_name + '_pid' + str(proc_id) + '_input.txt' + save_text_to_file(input_text, text_file_name, args) + + +def output_gen_text(generated_text, args, model_precision, prompt_idx, iteration, batchsize_idx, proc_id): + if args['batch_size'] > 1: + text_file_name = args['model_name'] + '_' + model_precision + '_p' + str(prompt_idx) + '_bs' + str(batchsize_idx) + else: + text_file_name = args['model_name'] + '_' + model_precision + '_p' + str(prompt_idx) + text_file_name = text_file_name + '_iter' + str(iteration) + '_pid' + str(proc_id) + '_output.txt' + save_text_to_file(generated_text, text_file_name, args) + + +def output_gen_image(img, args, prompt_idx, iteration, batchsize_idx, proc_id, suffix): + if args['batch_size'] > 1 and batchsize_idx is not None: + img_save_name = args['model_name'] + '_p' + str(prompt_idx) + '_bs' + str(batchsize_idx) + else: + img_save_name = args['model_name'] + '_p' + str(prompt_idx) + img_save_name = img_save_name + '_iter' + str(iteration) + '_pid' + str(proc_id) + '_output' + suffix + img_save_path = save_image_file(img, img_save_name, args) + return img_save_path diff --git a/llm_bench/python/utils/output_json.py b/llm_bench/python/llm_bench_utils/output_json.py similarity index 95% rename from llm_bench/python/utils/output_json.py rename to llm_bench/python/llm_bench_utils/output_json.py index 716f59548f..b50a17f974 100644 --- a/llm_bench/python/utils/output_json.py +++ b/llm_bench/python/llm_bench_utils/output_json.py @@ -15,6 +15,7 @@ def write_result(report_file, model, framework, device, model_args, iter_data_li first_token_infer_latency = iter_data['first_token_infer_latency'] other_token_infer_latency = iter_data['other_tokens_infer_avg_latency'] rss_mem = iter_data['max_rss_mem_consumption'] + uss_mem = iter_data['max_uss_mem_consumption'] shared_mem = iter_data['max_shared_mem_consumption'] tokenization_time = iter_data['tokenization_time'] detokenization_time = iter_data['detokenization_time'] @@ -36,6 +37,7 @@ def write_result(report_file, model, framework, device, model_args, iter_data_li 'first_infer_latency': round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency, 'second_infer_avg_latency': round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency, 'max_rss_mem': round(rss_mem, 5) if rss_mem != '' else -1, + 'max_uss_mem': round(uss_mem, 5) if uss_mem != '' else -1, 'max_shared_mem': round(shared_mem, 5) if shared_mem != '' else -1, 'prompt_idx': iter_data['prompt_idx'], 'tokenization_time': round(tokenization_time, 5) if tokenization_time != '' else tokenization_time, diff --git 
a/llm_bench/python/utils/ov_model_classes.py b/llm_bench/python/llm_bench_utils/ov_model_classes.py similarity index 54% rename from llm_bench/python/utils/ov_model_classes.py rename to llm_bench/python/llm_bench_utils/ov_model_classes.py index 46b5dd9345..0ade0f1299 100644 --- a/llm_bench/python/utils/ov_model_classes.py +++ b/llm_bench/python/llm_bench_utils/ov_model_classes.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # flake8: noqa import time @@ -17,32 +17,12 @@ from optimum.intel.openvino import OVModelForCausalLM from optimum.intel.openvino.utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME from openvino.runtime import Model, Core, Tensor, Type -from optimum.utils import NormalizedTextConfig, NormalizedConfigManager from transformers import PretrainedConfig from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput from transformers import GenerationConfig, StoppingCriteriaList from transformers.generation.logits_process import LogitsProcessorList, LogitsProcessor from transformers.generation.utils import GenerateOutput -def register_normalized_configs(): - NormalizedConfigManager._conf['mpt'] = NormalizedTextConfig.with_args(num_layers='n_layers', num_attention_heads='n_heads') - NormalizedConfigManager._conf['RefinedWebModel'] = NormalizedTextConfig.with_args(num_layers='n_layer', num_attention_heads='n_head') - NormalizedConfigManager._conf['falcon'] = NormalizedTextConfig.with_args(num_layers='num_hidden_layers', num_attention_heads='num_attention_heads') - NormalizedConfigManager._conf['RefinedWeb'] = NormalizedTextConfig.with_args(num_layers='n_layer', num_attention_heads='n_head') - NormalizedConfigManager._conf['chatglm'] = NormalizedTextConfig.with_args(num_layers='num_layers', num_attention_heads='num_attention_heads') - NormalizedConfigManager._conf['stablelm_epoch'] = NormalizedTextConfig.with_args(num_layers='num_hidden_layers', num_attention_heads='num_attention_heads') - NormalizedConfigManager._conf['stablelm-epoch'] = NormalizedTextConfig.with_args(num_layers='num_hidden_layers', num_attention_heads='num_attention_heads') - NormalizedConfigManager._conf['jais'] = NormalizedTextConfig.with_args(num_layers='n_layer', num_attention_heads='n_head', hidden_size='n_embd') - NormalizedConfigManager._conf['baichuan'] = NormalizedTextConfig.with_args( - num_layers='num_hidden_layers', num_attention_heads='num_attention_heads', hidden_size='hidden_size') - NormalizedConfigManager._conf['qwen'] = NormalizedTextConfig.with_args( - num_layers='num_hidden_layers', num_attention_heads='num_attention_heads', hidden_size='hidden_size') - NormalizedConfigManager._conf['mistral'] = NormalizedTextConfig.with_args(num_key_value_heads='num_key_value_heads', allow_new=True) - NormalizedConfigManager._conf['Yi'] = NormalizedTextConfig - NormalizedConfigManager._conf['phi'] = NormalizedTextConfig - NormalizedConfigManager._conf["codegen2"] = NormalizedConfigManager._conf["codegen"] - NormalizedConfigManager._conf["aquila"] = NormalizedConfigManager._conf["llama"] - class OVMPTModel(OVModelForCausalLM): def _reshape( @@ -60,7 +40,7 @@ def _reshape( else: if '.key' in inputs.get_any_name(): shapes[inputs][3] = -1 - else: + elif inputs.get_any_name() != "beam_idx": shapes[inputs][2] = -1 model.reshape(shapes) return model @@ -158,7 +138,7 @@ def forward( inputs["position_ids"] = position_ids - if hasattr(self, 'next_beam_idx'): + if hasattr(self, 
'next_beam_idx') and "beam_idx" in self.input_names: inputs['beam_idx'] = self.next_beam_idx # Run inference @@ -180,26 +160,6 @@ def forward( return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) -class OVFalconModel(OVModelForCausalLM): - def _reshape( - self, - model: Model, - *args, - **kwargs, - ): - shapes = {} - for inputs in model.inputs: - shapes[inputs] = inputs.get_partial_shape() - if shapes[inputs].rank.get_length() in [1, 2, 4]: - shapes[inputs][0] = -1 - if shapes[inputs].rank.get_length() in [2, 3]: - shapes[inputs][1] = -1 - if shapes[inputs].rank.get_length() == 4: - shapes[inputs][2] = -1 - model.reshape(shapes) - return model - - class OVLDMSuperResolutionPipeline(DiffusionPipeline): def __init__(self, model_path: Path, core: Core, device: str): super().__init__() @@ -308,126 +268,6 @@ def preprocess(image): return 2.0 * image - 1.0 -class OVChatGLM2Model(OVModelForCausalLM): - def _reshape( - self, - model: Model, - batch_size: int, - sequence_length: int, - height: int = None, - width: int = None, - ): - shapes = {} - for inputs in model.inputs: - shapes[inputs] = inputs.get_partial_shape() - shapes[inputs][0] = -1 - input_name = inputs.get_any_name() - if input_name.startswith('beam_idx'): - continue - if input_name.startswith('past_key_values'): - shapes[inputs][1] = -1 - shapes[inputs][2] = 2 - elif shapes[inputs].rank.get_length() > 1: - shapes[inputs][1] = -1 - model.reshape(shapes) - return model - - def get_position_ids(self, input_ids, device): - batch_size, seq_length = input_ids.shape - position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) - return position_ids - - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - past_key_values = past_key_values or kwargs.get('past', None) - - # `past_key_values` may be in the stardard format (e.g. 
in contrastive search), converts to bloom's format if needed - if past_key_values is not None and self.config.model_type == 'bloom': - if past_key_values[0][0].shape[0] == input_ids.shape[0]: - past_key_values = self._convert_to_bloom_cache(past_key_values) - - attention_mask = kwargs.get('attention_mask', None) - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - return { - 'input_ids': input_ids, - 'past_key_values': past_key_values, - 'use_cache': self.use_cache, - 'position_ids': position_ids, - 'attention_mask': attention_mask, - 'token_type_ids': None, - } - - def _update_model_kwargs_for_generation( - self, - outputs: ModelOutput, - model_kwargs: Dict[str, Any], - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - ) -> Dict[str, Any]: - # update past_key_values - model_kwargs['past_key_values'] = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) - - # update attention mask - if 'attention_mask' in model_kwargs: - attention_mask = model_kwargs['attention_mask'] - model_kwargs['attention_mask'] = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # update position ids - if 'position_ids' in model_kwargs: - position_ids = model_kwargs['position_ids'] - new_position_id = position_ids[..., -1:].clone() - new_position_id += 1 - model_kwargs['position_ids'] = torch.cat([position_ids, new_position_id], dim=-1) - - model_kwargs['is_first_forward'] = False - return model_kwargs - - @classmethod - def _from_pretrained( - cls, - model_id: Union[str, Path], - config: PretrainedConfig, - use_auth_token: Optional[Union[bool, str, None]] = None, - revision: Optional[Union[str, None]] = None, - force_download: bool = False, - cache_dir: Optional[str] = None, - file_name: Optional[str] = None, - subfolder: str = '', - from_onnx: bool = False, - local_files_only: bool = False, - load_in_8bit: bool = False, - **kwargs, - ): - model_path = Path(model_id) - default_file_name = ONNX_WEIGHTS_NAME if from_onnx else OV_XML_FILE_NAME - file_name = file_name or default_file_name - - model_cache_path = cls._cached_file( - model_path=model_path, - use_auth_token=use_auth_token, - revision=revision, - force_download=force_download, - cache_dir=cache_dir, - file_name=file_name, - subfolder=subfolder, - local_files_only=local_files_only, - ) - - model = cls.load_model(model_cache_path, load_in_8bit=load_in_8bit) - init_cls = OVChatGLM2Model - - return init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs) - - class OVChatGLMModel(OVModelForCausalLM): position_encoding_2d = True num_layers = 28 @@ -448,8 +288,11 @@ def __init__( **kwargs, ): super().__init__(model, config, device, dynamic_shapes, ov_config, model_save_dir, **kwargs) - self.key_value_input_names = ['past_key_values'] - self.key_value_output_names = [o.any_name for o in self.model.outputs[1:]] + self.is_v1 = False + if not self.stateful and not self.key_value_input_names: + self.is_v1 = True + self.key_value_input_names = ['past_key_values'] + self.key_value_output_names = [o.any_name for o in self.model.outputs[1:]] def prepare_inputs_for_generation( self, @@ -460,6 +303,13 @@ def 
prepare_inputs_for_generation( past: Optional[torch.Tensor] = None, **kwargs, ) -> dict: + if not self.is_v1: + return super().prepare_inputs_for_generation( + input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, + position_ids=position_ids, + past=past, + **kwargs + ) batch_size, seq_length = input_ids.shape mask = self.mask_token_id g_mask = self.gmask_token_id @@ -590,6 +440,9 @@ def forward( past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, **kwargs, ) -> CausalLMOutputWithPast: + + if not self.is_v1: + return super().forward(input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, **kwargs) self.compile() inputs = {} @@ -605,7 +458,7 @@ def forward( inputs['position_ids'] = np.array(kwargs['position_ids']) # Run inference - self.request.start_async(inputs, shared_memory=True) + self.request.start_async(inputs, share_inputs=True) self.request.wait() logits = torch.from_numpy(self.request.get_tensor('logits').data).to(self.device) @@ -629,241 +482,3 @@ def _reshape( width: int = None, ): return model - - -class StopWordsLogitsProcessor(LogitsProcessor): - ''' - :class:`transformers.LogitsProcessor` that enforces that when specified sequences appear, stop geration. - - Args: - stop_words_ids (:obj:`List[List[int]]`): - List of list of token ids of stop ids. In order to get the tokens of the words - that should not appear in the generated text, use :obj:`tokenizer(bad_word, - add_prefix_space=True).input_ids`. - eos_token_id (:obj:`int`): - The id of the `end-of-sequence` token. - ''' - - def __init__(self, stop_words_ids: Iterable[Iterable[int]], eos_token_id: int): - - if not isinstance(stop_words_ids, List) or len(stop_words_ids) == 0: - raise ValueError( - f'`stop_words_ids` has to be a non-emtpy list, but is {stop_words_ids}.' - ) - if any(not isinstance(bad_word_ids, list) for bad_word_ids in stop_words_ids): - raise ValueError( - f'`stop_words_ids` has to be a list of lists, but is {stop_words_ids}.' - ) - if any( - any( - (not isinstance(token_id, (int, np.integer)) or token_id < 0) - for token_id in stop_word_ids - ) - for stop_word_ids in stop_words_ids - ): - raise ValueError( - f'Each list in `stop_words_ids` has to be a list of positive integers, but is {stop_words_ids}.' 
- ) - - self.stop_words_ids = list( - filter( - lambda bad_token_seq: bad_token_seq != [eos_token_id], stop_words_ids - ) - ) - self.eos_token_id = eos_token_id - for stop_token_seq in self.stop_words_ids: - assert ( - len(stop_token_seq) > 0 - ), 'Stop words token sequences {} cannot have an empty list'.format( - stop_words_ids - ) - - def __call__( - self, input_ids: torch.LongTensor, scores: torch.FloatTensor - ) -> torch.FloatTensor: - stopped_samples = self._calc_stopped_samples(input_ids) - for i, should_stop in enumerate(stopped_samples): - if should_stop: - scores[i, self.eos_token_id] = float(2**15) - return scores - - def _tokens_match(self, prev_tokens: torch.LongTensor, tokens: List[int]) -> bool: - if len(tokens) == 0: - # if bad word tokens is just one token always ban it - return True - elif len(tokens) > len(prev_tokens): - # if bad word tokens are longer then prev input_ids they can't be equal - return False - elif prev_tokens[-len(tokens) :].tolist() == tokens: - # if tokens match - return True - else: - return False - - def _calc_stopped_samples(self, prev_input_ids: Iterable[int]) -> Iterable[int]: - stopped_samples = [] - for prev_input_ids_slice in prev_input_ids: - match = False - for stop_token_seq in self.stop_words_ids: - if self._tokens_match(prev_input_ids_slice, stop_token_seq): - # if tokens do not match continue - match = True - break - stopped_samples.append(match) - - return stopped_samples - - -class OVQwenModel(OVModelForCausalLM): - def _reshape( - self, - model: Model, - batch_size: int, - sequence_length: int, - height: int = None, - width: int = None, - ): - shapes = {} - for inputs in model.inputs: - if inputs.get_any_name().startswith('beam_idx'): - continue - shapes[inputs] = inputs.get_partial_shape() - shapes[inputs][0] = -1 - if shapes[inputs].rank.get_length() > 1: - shapes[inputs][1] = -1 - model.reshape(shapes) - return model - - @classmethod - def _from_pretrained( - cls, - model_id: Union[str, Path], - config: PretrainedConfig, - use_auth_token: Optional[Union[bool, str, None]] = None, - revision: Optional[Union[str, None]] = None, - force_download: bool = False, - cache_dir: Optional[str] = None, - file_name: Optional[str] = None, - subfolder: str = '', - from_onnx: bool = False, - local_files_only: bool = False, - load_in_8bit: bool = False, - **kwargs, - ): - model_path = Path(model_id) - default_file_name = ONNX_WEIGHTS_NAME if from_onnx else OV_XML_FILE_NAME - file_name = file_name or default_file_name - - model_cache_path = cls._cached_file( - model_path=model_path, - use_auth_token=use_auth_token, - revision=revision, - force_download=force_download, - cache_dir=cache_dir, - file_name=file_name, - subfolder=subfolder, - local_files_only=local_files_only, - ) - - model = cls.load_model(model_cache_path, load_in_8bit=load_in_8bit) - init_cls = OVQwenModel - - return init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs) - - def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - past_key_values = past_key_values or kwargs.get('past', None) - - # `past_key_values` may be in the stardard format (e.g. 
in contrastive search), converts to bloom's format if needed - if past_key_values is not None and self.config.model_type == 'bloom': - if past_key_values[0][0].shape[0] == input_ids.shape[0]: - past_key_values = self._convert_to_bloom_cache(past_key_values) - - attention_mask = kwargs.get('attention_mask', None) - position_ids = kwargs.get('position_ids', None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - return { - 'input_ids': input_ids, - 'past_key_values': past_key_values, - 'use_cache': self.use_cache, - 'position_ids': position_ids, - 'attention_mask': attention_mask, - 'token_type_ids': None, - } - - def _update_model_kwargs_for_generation( - self, - outputs: 'ModelOutput', - model_kwargs: Dict[str, 'Any'], - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - ) -> Dict[str, 'Any']: - # update past_key_values - model_kwargs['past_key_values'] = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) - - # update attention mask - if 'attention_mask' in model_kwargs: - attention_mask = model_kwargs['attention_mask'] - model_kwargs['attention_mask'] = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # update position ids - if 'position_ids' in model_kwargs: - position_ids = model_kwargs['position_ids'] - new_position_id = position_ids[..., -1:].clone() - new_position_id += 1 - model_kwargs['position_ids'] = torch.cat([position_ids, new_position_id], dim=-1) - - model_kwargs['is_first_forward'] = False - return model_kwargs - - - def generate( - self, - inputs: Optional[torch.Tensor] = None, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[ - Callable[[int, torch.Tensor], List[int]] - ] = None, - synced_gpus: Optional[bool] = None, - #assistant_model: Optional['PreTrainedModel'] = None, - #streamer: Optional['BaseStreamer'] = None, - **kwargs, - ) -> Union[GenerateOutput, torch.LongTensor]: - generation_config = generation_config if generation_config is not None else self.generation_config - - # Process stop_words_ids. 
- stop_words_ids = kwargs.pop('stop_words_ids', [[151643]]) - if stop_words_ids is None and generation_config is not None: - stop_words_ids = getattr(generation_config, 'stop_words_ids', None) - if stop_words_ids is None: - stop_words_ids = getattr(generation_config, 'stop_words_ids', None) - - if stop_words_ids is not None: - stop_words_logits_processor = StopWordsLogitsProcessor( - stop_words_ids=stop_words_ids, - eos_token_id=generation_config.eos_token_id, - ) - if logits_processor is None: - logits_processor = LogitsProcessorList([stop_words_logits_processor]) - else: - logits_processor.append(stop_words_logits_processor) - - return super().generate( - inputs, - generation_config=generation_config, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - synced_gpus=synced_gpus, - **kwargs, - ) \ No newline at end of file diff --git a/llm_bench/python/utils/ov_utils.py b/llm_bench/python/llm_bench_utils/ov_utils.py similarity index 59% rename from llm_bench/python/utils/ov_utils.py rename to llm_bench/python/llm_bench_utils/ov_utils.py index 614b15b235..da77f5da22 100644 --- a/llm_bench/python/utils/ov_utils.py +++ b/llm_bench/python/llm_bench_utils/ov_utils.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from pathlib import Path from transformers import AutoConfig @@ -9,9 +9,8 @@ import torch import time import types - -from utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES -from .ov_model_classes import register_normalized_configs +from llm_bench_utils.hook_common import get_bench_hook +from llm_bench_utils.config_class import OV_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES import openvino.runtime.opset13 as opset @@ -91,8 +90,12 @@ def build_ov_tokenizer(hf_tokenizer): return hf_tokenizer ov_tokenizer, ov_detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True) - ov_compiled_tokenizer = ov.compile_model(ov_tokenizer) - ov_compiled_detokenizer = ov.compile_model(ov_detokenizer) + return build_ov_tokenizer_wrapper(hf_tokenizer, ov_tokenizer, ov_detokenizer) + + +def build_ov_tokenizer_wrapper(hf_tokenizer, tokenizer_model, detokenizer_model): + ov_compiled_tokenizer = ov.compile_model(tokenizer_model, "CPU") + ov_compiled_detokenizer = ov.compile_model(detokenizer_model, "CPU") def encode_ov_tokenizer_full(self, text, *args, **kwargs): if isinstance(text, str): @@ -134,57 +137,124 @@ def create_text_gen_model(model_path, device, **kwargs): model_path = model_path.parents[2] ov_config = kwargs['config'] - register_normalized_configs() model_path_existed = Path(model_path).exists() # load model if not model_path_existed: raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist') else: - if model_type in ['mpt', 'falcon', 'replit', 'codegen2', 'chatglm']: - start = time.perf_counter() - ov_model = model_class.from_pretrained( - model_path, - device=device, - ov_config=ov_config, - config=AutoConfig.from_pretrained(model_path, trust_remote_code=True), - stateful=kwargs.get("stateful", None) - ) - end = time.perf_counter() - else: - start = time.perf_counter() - config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) - ov_model = model_class.from_pretrained( - model_path, - device=device, - ov_config=ov_config, - config=config, - compile=False, - 
stateful=kwargs.get("stateful", None) - ) - if not isinstance(ov_model, OV_MODEL_CLASSES_MAPPING['t5']): - patch_inter_processing_and_compile(ov_model, **kwargs) - end = time.perf_counter() + if kwargs.get("genai", False) and is_genai_available(log_msg=True): + if model_class not in [OV_MODEL_CLASSES_MAPPING[default_model_type], OV_MODEL_CLASSES_MAPPING["mpt"], OV_MODEL_CLASSES_MAPPING["chatglm"]]: + log.warning(f"OpenVINO GenAI based benchmarking is not available for {model_type}. Will be switched to default benchmarking") + else: + return create_genai_text_gen_model(model_path, device, ov_config, **kwargs) + remote_code = False + try: + model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=False) + except Exception: + model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True) + remote_code = True + start = time.perf_counter() + ov_model = model_class.from_pretrained( + model_path, + device=device, + ov_config=ov_config, + config=model_config, + stateful=kwargs.get("stateful", None), + trust_remote_code=remote_code + ) + if not isinstance(ov_model, OV_MODEL_CLASSES_MAPPING['t5']): + patch_inter_processing_and_compile(ov_model, **kwargs) + end = time.perf_counter() + bench_hook = get_bench_hook(kwargs['num_beams'], ov_model) from_pretrained_time = end - start log.info(f'From pretrained time: {from_pretrained_time:.2f}s') # load token tokenizer = token_class.from_pretrained(model_path, trust_remote_code=True) if kwargs.get("convert_tokenizer", False): tokenizer = build_ov_tokenizer(tokenizer) - return ov_model, tokenizer, from_pretrained_time + return ov_model, tokenizer, from_pretrained_time, bench_hook, False + + +def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): + import openvino_tokenizers # noqa: F401 + import openvino_genai + from transformers import AutoTokenizer + + if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists(): + convert_ov_tokenizer(model_path) + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + cb = kwargs.get("use_cb", False) + if cb: + log.info("Continuous Batching mode activated") + default_cb_config = {"cache_size": 1} + if "GPU" in device: + default_cb_config["block_size"] = 16 + scheduler_config = openvino_genai.SchedulerConfig() + scheduler_params = kwargs.get("cb_config") or default_cb_config + if scheduler_params: + log.info(f"Scheduler parameters:\n{scheduler_params}") + + for param, value in scheduler_params.items(): + setattr(scheduler_config, param, value) + ov_config["scheduler_config"] = scheduler_config + start = time.perf_counter() + llm_pipe = openvino_genai.LLMPipeline(str(model_path), device.upper(), ov_config) + end = time.perf_counter() + log.info(f'Pipeline initialization time: {end - start:.2f}s') + + class TokenStreamer(openvino_genai.StreamerBase): + def __init__(self, tokenizer): + openvino_genai.StreamerBase.__init__(self) + self.tokenizer = tokenizer + self.token_generation_time = [] + self.generated_tokens = [] + self.start_time = time.perf_counter() + + def put(self, token_id): + self.token_generation_time.append(time.perf_counter() - self.start_time) + self.generated_tokens.append(token_id) + self.start_time = time.perf_counter() + return False + + def reset(self): + self.token_generation_time = [] + self.generated_tokens = [] + self.start_time = time.perf_counter() + + def end(self): + pass + + def get_tokens(self): + return self.generated_tokens + + def get_time_list(self): + return 
self.token_generation_time + streamer = TokenStreamer(llm_pipe.get_tokenizer()) if cb else None + + return llm_pipe, tokenizer, end - start, streamer, True + + +def convert_ov_tokenizer(tokenizer_path): + from optimum.exporters.openvino.convert import export_tokenizer + from transformers import AutoTokenizer + + hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True) + + export_tokenizer(hf_tokenizer, tokenizer_path) def create_image_gen_model(model_path, device, **kwargs): default_model_type = DEFAULT_MODEL_CLASSES[kwargs['use_case']] model_type = kwargs.get('model_type', default_model_type) - print(model_type) model_class = OV_MODEL_CLASSES_MAPPING[model_type] model_path = Path(model_path) ov_config = kwargs['config'] if not Path(model_path).exists(): raise RuntimeError(f'==Failure ==: model path:{model_path} does not exist') else: - log.info(f'model_path={model_path}') start = time.perf_counter() ov_model = model_class.from_pretrained(model_path, device=device, ov_config=ov_config) end = time.perf_counter() @@ -207,3 +277,15 @@ def create_ldm_super_resolution_model(model_path, device, **kwargs): from_pretrained_time = end - start log.info(f'From pretrained time: {from_pretrained_time:.2f}s') return ov_model, from_pretrained_time + + +def is_genai_available(log_msg=False): + import importlib + try: + importlib.import_module('openvino_genai') + except ImportError as ex: + if log_msg: + log.warning("Attempt to load OpenVINO GenAI package failed. Please install openvino_genai package. Full error message available in debug mode") + log.warning(ex) + return False + return True diff --git a/llm_bench/python/utils/pt_utils.py b/llm_bench/python/llm_bench_utils/pt_utils.py similarity index 80% rename from llm_bench/python/utils/pt_utils.py rename to llm_bench/python/llm_bench_utils/pt_utils.py index a03d1d0b6b..d9f530a179 100644 --- a/llm_bench/python/utils/pt_utils.py +++ b/llm_bench/python/llm_bench_utils/pt_utils.py @@ -1,15 +1,14 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from pathlib import Path import torch -from utils.config_class import PT_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES +from llm_bench_utils.config_class import PT_MODEL_CLASSES_MAPPING, TOKENIZE_CLASSES_MAPPING, DEFAULT_MODEL_CLASSES import os import time import logging as log -import openvino.torch # noqa: F401 - -MAX_CONNECT_TIME = 50 +import llm_bench_utils.hook_common as hook_common +import json def set_bf16(model, device, **kwargs): @@ -23,7 +22,15 @@ def set_bf16(model, device, **kwargs): return model -def run_torch_compile(model, backend='openvino'): +def torch_compile_child_module(model, child_modules, backend='openvino', dynamic=None, options=None): + if len(child_modules) == 1: + setattr(model, child_modules[0], torch.compile(getattr(model, child_modules[0]), backend=backend, dynamic=dynamic, fullgraph=True, options=options)) + return model + setattr(model, child_modules[0], torch_compile_child_module(getattr(model, child_modules[0]), child_modules[1:], backend, dynamic, options)) + return model + + +def run_torch_compile(model, backend='openvino', dynamic=None, options=None, child_modules=None): if backend == 'pytorch': log.info(f'Running torch.compile() with {backend} backend') start = time.perf_counter() @@ -34,7 +41,10 @@ def run_torch_compile(model, backend='openvino'): else: log.info(f'Running torch.compile() with {backend} backend') start = 
time.perf_counter() - compiled_model = torch.compile(model, backend=backend) + if child_modules and len(child_modules) > 0: + compiled_model = torch_compile_child_module(model, child_modules, backend, dynamic, options) + else: + compiled_model = torch.compile(model, backend=backend, dynamic=dynamic, options=options) end = time.perf_counter() compile_time = end - start log.info(f'Compiling model via torch.compile() took: {compile_time}') @@ -74,7 +84,7 @@ def create_text_gen_model(model_path, device, **kwargs): gptneoxclm = 'transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXForCausalLM' chatglmfcg = 'transformers_modules.pytorch_original.modeling_chatglm.ChatGLMForConditionalGeneration' real_base_model_name = str(type(model)).lower() - log.info('Real base model=', real_base_model_name) + log.info(f'Real base model={real_base_model_name}') # bfclm will trigger generate crash. # If the device is set to GPU there's a need to substitute it with 'cuda' so it will be accepted by PyTorch @@ -93,11 +103,22 @@ def create_text_gen_model(model_path, device, **kwargs): else: raise RuntimeError('==Failure ==: no device to load') + bench_hook = hook_common.get_bench_hook(kwargs['num_beams'], model) + if kwargs['torch_compile_backend']: backend = kwargs['torch_compile_backend'] - compiled_model = run_torch_compile(model, backend) + dynamic = None + options = None + child_modules = None + if kwargs['torch_compile_dynamic']: + dynamic = kwargs['torch_compile_dynamic'] + if kwargs['torch_compile_options']: + options = json.loads(kwargs['torch_compile_options']) + if kwargs['torch_compile_input_module']: + child_modules = kwargs['torch_compile_input_module'].split(".") + compiled_model = run_torch_compile(model, backend, dynamic, options, child_modules) model = compiled_model - return model, tokenizer, from_pretrain_time + return model, tokenizer, from_pretrain_time, bench_hook, False def create_image_gen_model(model_path, device, **kwargs): diff --git a/llm_bench/python/llm_run_on_linux.sh b/llm_bench/python/llm_run_on_linux.sh deleted file mode 100755 index 209f0faf20..0000000000 --- a/llm_bench/python/llm_run_on_linux.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -# Copyright (C) 2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -git lfs install -export GIT_LFS_SKIP_SMUDGE=0 -git_clone_bloomz_560m="git clone https://huggingface.co/bigscience/bloomz-560m" -echo ${git_clone_bloomz_560m} -eval ${git_clone_bloomz_560m} -wait - -convert_model="python ./llm_bench/python/convert.py --model_id bloomz-560m/ --output_dir ./ov_models/bloomz-560m --precision FP16" -echo ${convert_model} -eval ${convert_model} -wait - -bemchmarking="python ./llm_bench/python/benchmark.py -m ./ov_models/bloomz-560m/pytorch/dldt/FP16/ -d cpu -n 1" -echo ${bemchmarking} -eval ${bemchmarking} \ No newline at end of file diff --git a/llm_bench/python/prompts/stable-diffusion.jsonl b/llm_bench/python/prompts/stable-diffusion.jsonl new file mode 100644 index 0000000000..59c23064a5 --- /dev/null +++ b/llm_bench/python/prompts/stable-diffusion.jsonl @@ -0,0 +1 @@ +{"steps":"30", "width":"256", "height":"256", "guidance_scale":"1.0", "prompt": "side profile centered painted portrait, Gandhi rolling a blunt, Gloomhaven, matte painting concept art, art nouveau, 8K HD Resolution, beautifully background"} \ No newline at end of file diff --git a/llm_bench/python/requirements-dev.txt b/llm_bench/python/requirements-dev.txt deleted file mode 100644 index f073e766b9..0000000000 --- a/llm_bench/python/requirements-dev.txt +++ 
/dev/null @@ -1,16 +0,0 @@ ---extra-index-url https://download.pytorch.org/whl/cpu -numpy -openvino>=2023.1.0 -pillow -torch -transformers>=4.28.0 -tokenizers -diffusers -optimum -git+https://github.com/slyalin/optimum-intel.git@stateful -git+https://github.com/openvinotoolkit/nncf.git -packaging -psutil -timm -tiktoken -onnx \ No newline at end of file diff --git a/llm_bench/python/requirements.txt b/llm_bench/python/requirements.txt index a52b83acb4..6139bf843c 100644 --- a/llm_bench/python/requirements.txt +++ b/llm_bench/python/requirements.txt @@ -1,18 +1,22 @@ --extra-index-url https://download.pytorch.org/whl/cpu numpy -openvino>=2023.1.0 +--extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly +openvino +openvino-tokenizers +openvino_genai auto-gptq>=0.5.1 # for gptq pillow torch -transformers>=4.33.0 +transformers>=4.40.0 diffusers>=0.22.0 -optimum>=1.14.0,<1.15.0 -git+https://github.com/eaidova/optimum-intel.git@a57c86a6e561be838cbefb0ac430008c05e9aaa9 -git+https://github.com/openvinotoolkit/nncf.git@6d17662fec917e1293189e2c3a2b94139a433f16 +#optimum is in dependency list of optimum-intel +git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel +git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf packaging psutil timm tiktoken -onnx +onnx<=1.16.1 einops transformers_stream_generator +bitsandbytes diff --git a/llm_bench/python/utils/conversion_utils/convert_patch.py b/llm_bench/python/utils/conversion_utils/convert_patch.py deleted file mode 100644 index 0ff122a578..0000000000 --- a/llm_bench/python/utils/conversion_utils/convert_patch.py +++ /dev/null @@ -1,244 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import torch -from typing import Tuple, Optional -import types -from transformers.modeling_outputs import BaseModelOutputWithPast -from transformers import PreTrainedModel - - -@torch.jit.script_if_tracing -def _chatglm2_get_context_layer(query_layer: torch.Tensor, key_layer: torch.Tensor, value_layer: torch.Tensor): - mask = torch.zeros((query_layer.shape[-2], key_layer.shape[-2]), dtype=query_layer.dtype) - if query_layer.shape[2] == key_layer.shape[2]: - tmp_mask = torch.ones((query_layer.shape[-2], key_layer.shape[-2]), dtype=torch.bool).triu(diagonal=1) - mask.masked_fill_(tmp_mask, float("-inf")) - - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, attn_mask=mask) - return context_layer - - -def _core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): - query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] - if attention_mask is None: - context_layer = _chatglm2_get_context_layer(query_layer, key_layer, value_layer) - else: - context_layer = torch.nn.functional.scaled_dot_product_attention( - query_layer, key_layer, value_layer, attention_mask - ) - context_layer = context_layer.permute(2, 0, 1, 3) - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.reshape(*new_context_layer_shape) - - return context_layer - - -@torch.jit.script_if_tracing -def _get_chatglm_attention_mask(input_ids, past_key): - mask = torch.zeros((input_ids.shape[1], past_key.shape[0] + input_ids.shape[1]), dtype=past_key.dtype) - if past_key.shape[0] == 0: - tmp_mask = torch.ones((input_ids.shape[1], past_key.shape[0] + input_ids.shape[1]), dtype=torch.bool).triu(diagonal=1) - 
mask.masked_fill_(tmp_mask, float("-inf")) - return mask - - -def _chatglm_transformer_forward( - self, - input_ids, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.BoolTensor] = None, - full_attention_mask: Optional[torch.BoolTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None -): - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - batch_size, seq_length = input_ids.shape - - if inputs_embeds is None: - inputs_embeds = self.embedding(input_ids) - - if self.pre_seq_len is not None: - if past_key_values is None: - past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, - dtype=inputs_embeds.dtype) - if attention_mask is not None: - attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), attention_mask], dim=-1) - - if full_attention_mask is None: - if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): - full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) - elif past_key_values is not None: - full_attention_mask = torch.ones(batch_size, seq_length, seq_length, - device=input_ids.device, - dtype=torch.float) * float("-inf") - full_attention_mask.triu_(diagonal=1) - past_length = 0 - if past_key_values: - past_length = past_key_values[0][0].shape[0] - if past_length: - full_attention_mask = torch.cat((torch.zeros(batch_size, seq_length, past_length, - device=input_ids.device), full_attention_mask), dim=-1) - full_attention_mask.unsqueeze_(1) - - # Rotary positional embeddings - rotary_pos_emb = self.rotary_pos_emb(self.seq_length) - if position_ids is not None: - rotary_pos_emb = rotary_pos_emb[position_ids] - else: - rotary_pos_emb = rotary_pos_emb[None, :seq_length] - rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() - - # Run encoder. 
- hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( - inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, - kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states - ) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - -def _patch_chatglm_core_attention_forward(model: "PreTrainedModel"): - model.transformer.forward = types.MethodType(_chatglm_transformer_forward, model.transformer) - for block in model.transformer.encoder.layers: - block.self_attention.core_attention.forward = types.MethodType( - _core_attention_forward, block.self_attention.core_attention - ) - - -def _update_qwen_rotary_embedding_cache(model): - model.transformer.rotary_emb(2048) - - -def _yi_prepare_decoder_attention_mask(attention_mask, input_ids, inputs_embeds, past_key_values_length): - input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape[:-1] - return _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, past_key_values_length) - - -# Modified from transformers.models.mistral.modeling_mistral._prepare_decoder_sliding_window_attention_mask -def _prepare_decoder_sliding_window_attention_mask( - attention_mask: torch.Tensor, - input_shape: Tuple[int, int], - inputs_embeds: torch.Tensor, - past_key_values_length: int, - sliding_window: int, -): - from transformers.models.mistral.modeling_mistral import _expand_mask, _make_sliding_window_causal_mask - - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - - combined_attention_mask = _make_sliding_window_causal_mask( - input_shape, - device=inputs_embeds.device, - dtype=inputs_embeds.dtype, - past_key_values_length=past_key_values_length, - sliding_window=sliding_window, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - -# Copied from transformers.models.bart.modeling_bart._expand_mask -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - """ - Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. - """ - bsz, src_len = mask.size() - tgt_len = tgt_len if tgt_len is not None else src_len - - expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) - - inverted_mask = 1.0 - expanded_mask - - return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) - - -# Modified from transformers.models.bloom.modeling_bloom._make_causal_mask -def _make_causal_mask( - input_ids_shape: torch.Size, - device: torch.device, - past_key_values_length: int, - dtype: torch.dtype = torch.bool, -) -> torch.BoolTensor: - """ - Make causal mask used for bi-directional self-attention. 
- """ - batch_size, target_length = input_ids_shape - mask = torch.zeros((target_length, target_length + past_key_values_length), dtype=dtype, device=device) - seq_ids = torch.arange(target_length, device=device) - - mask[:, past_key_values_length:] = ( - (seq_ids[:, None] < seq_ids[None, :]) * torch.finfo(dtype).min - if torch.is_floating_point(mask) - else seq_ids[:, None] < seq_ids[None, :] - ) - - return mask[None, None, :, :].expand(batch_size, 1, target_length, target_length + past_key_values_length) - - -# Modified from transformers.models.llama.modeling_llama._prepare_decoder_attention_mask -def _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, past_key_values_length): - - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - - combined_attention_mask = _make_causal_mask( - input_shape, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - dtype=inputs_embeds.dtype, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - -def patch_model_for_optimum_export(model): - if model.config.model_type in ["stablelm_epoch", "baichuan"]: - model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask - elif model.config.model_type == "chatglm": - _patch_chatglm_core_attention_forward(model) - elif model.config.model_type == "qwen": - _update_qwen_rotary_embedding_cache(model) - elif model.config.model_type == "mistral": - model.model._prepare_decoder_attention_mask = _prepare_decoder_sliding_window_attention_mask - elif model.config.model_type == "Yi": - model.model._prepare_decoder_attention_mask = _yi_prepare_decoder_attention_mask - return model diff --git a/llm_bench/python/utils/conversion_utils/export_configs.py b/llm_bench/python/utils/conversion_utils/export_configs.py deleted file mode 100644 index 8b78c5d829..0000000000 --- a/llm_bench/python/utils/conversion_utils/export_configs.py +++ /dev/null @@ -1,454 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import torch -from typing import Callable, Dict, Type, Optional, Tuple - -from optimum.exporters.onnx import TextDecoderOnnxConfig -from optimum.exporters.tasks import TasksManager, make_backend_config_constructor_for_task -from optimum.utils import ( - NormalizedTextConfig, DEFAULT_DUMMY_SHAPES, - DummyPastKeyValuesGenerator, - DummyTextInputGenerator, - DummyInputGenerator -) - - -class TextDecoderWithPositionIdsOnnxConfig(TextDecoderOnnxConfig): - no_position_ids = False - - @property - def inputs(self) -> Dict[str, Dict[int, str]]: - common_inputs = super().inputs - - # Decoders based on GPT2 require a position_ids input to avoid - # generating wrong position_ids in the model itself: - # https://github.com/huggingface/transformers/blob/v4.33.1/src/transformers/models/gpt2/modeling_gpt2.py#L802 - if not self.no_position_ids and "text-generation" in self.task: - common_inputs["position_ids"] = {0: "batch_size", 1: "sequence_length"} - - return common_inputs - - -def create_register(overwrite_existing: bool = False): - def wrapper(model_type: str, *supported_tasks: str) -> Callable[[Type], 
Type]: - def decorator(config_cls: Type) -> Type: - mapping = TasksManager._SUPPORTED_MODEL_TYPE.get(model_type, {}) - mapping_backend = mapping.get("onnx", {}) - for task in supported_tasks: - normalized_task = task - if "-with-past" in task: - normalized_task = task.split("-with-past")[0] - if normalized_task not in TasksManager.get_all_tasks(): - known_tasks = ", ".join(TasksManager.get_all_tasks()) - raise ValueError( - f'The TasksManager does not know the task called "{task}", known tasks: {known_tasks}.' - ) - if not overwrite_existing and task in mapping_backend: - continue - mapping_backend[task] = make_backend_config_constructor_for_task(config_cls, task) - mapping["onnx"] = mapping_backend - TasksManager._SUPPORTED_MODEL_TYPE[model_type] = mapping - return config_cls - - return decorator - - return wrapper - - -register_in_tasks_manager = create_register() -register_in_tasks_manager_with_override = create_register(True) - - -class YIDummyTextInputGenerator(DummyTextInputGenerator): - SUPPORTED_INPUT_NAMES = { - "input_ids", - "attention_mask", - "token_type_ids", - "position_ids", - } - - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - input = super().generate(input_name, framework, int_dtype, float_dtype) - if input_name == "position_ids": - input = input[:, -1:] - return input - - -@register_in_tasks_manager('yi', *["text-generation", "text-generation-with-past"]) -class YIOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - # The ONNX export of this architecture needs the Trilu operator support, available since opset 14 - DEFAULT_ONNX_OPSET = 14 - DUMMY_INPUT_GENERATOR_CLASSES = ( - YIDummyTextInputGenerator, - DummyPastKeyValuesGenerator, - ) - DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig - no_position_ids = False - - -class MistralDummyTextInputGenerator(DummyTextInputGenerator): - SUPPORTED_INPUT_NAMES = { - "input_ids", - "attention_mask", - "token_type_ids", - "position_ids", - } - - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - input = super().generate(input_name, framework, int_dtype, float_dtype) - if input_name == "position_ids": - input = input[:, -1:] - return input - - -class MistralDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): - def __init__( - self, - task: str, - normalized_config: NormalizedTextConfig, - batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], - sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], - random_batch_size_range: Optional[Tuple[int, int]] = None, - random_sequence_length_range: Optional[Tuple[int, int]] = None, - **kwargs, - ): - super().__init__( - task=task, - normalized_config=normalized_config, - batch_size=batch_size, - sequence_length=sequence_length, - random_batch_size_range=random_batch_size_range, - random_sequence_length_range=random_sequence_length_range, - ) - self.num_key_value_heads = normalized_config.num_key_value_heads - - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - shape = ( - self.batch_size, - self.num_key_value_heads, - self.sequence_length, - self.hidden_size // self.num_attention_heads, - ) - return [ - ( - self.random_float_tensor(shape, framework=framework, dtype=float_dtype), - self.random_float_tensor(shape, framework=framework, dtype=float_dtype), - ) - for _ in range(self.num_layers) - ] - - 
-@register_in_tasks_manager('mistral', *["text-generation", "text-generation-with-past"]) -class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - # The ONNX export of this architecture needs the Trilu operator support, available since opset 14 - DEFAULT_ONNX_OPSET = 14 - DUMMY_INPUT_GENERATOR_CLASSES = ( - MistralDummyTextInputGenerator, - MistralDummyPastKeyValuesGenerator, - ) - DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) - no_position_ids = False - - -class QwenDummyInputsGenerator(DummyTextInputGenerator): - SUPPORTED_INPUT_NAMES = { - "input_ids", - "attention_mask", - "token_type_ids", - "position_ids", - } - - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - input = super().generate(input_name, framework, int_dtype, float_dtype) - if input_name == "input_ids": - input = torch.tensor([[1583]]) - if input_name == "attention_mask": - input = torch.ones((1, 7), dtype=input.dtype) - if input_name == "position_ids": - input = torch.tensor([[6]]) - return input - - -class QwenDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - shape = ( - 1, - 6, - self.num_attention_heads, - self.hidden_size // self.num_attention_heads, - ) - return [ - ( - torch.zeros(shape, dtype=torch.float32), - torch.zeros(shape, dtype=torch.float32), - ) - for _ in range(self.num_layers) - ] - - -@register_in_tasks_manager("qwen", *["text-generation", "text-generation-with-past"]) -class QwenOpenVINOConfig(TextDecoderOnnxConfig): - DEFAULT_ONNX_OPSET = 13 - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( - num_layers='num_hidden_layers', num_attention_heads='num_attention_heads', hidden_size='hidden_size' - ) - DUMMY_INPUT_GENERATOR_CLASSES = (QwenDummyInputsGenerator, QwenDummyPastKeyValuesGenerator) - DUMMY_PKV_GENERATOR_CLASS = QwenDummyPastKeyValuesGenerator - no_position_ids = False - - def generate_dummy_inputs(self, framework: str = "pt", **kwargs): - dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs) - - dummy_inputs = {} - input_names = [key for key in self.inputs.keys() if not key.startswith("past_key_values")] - if self.use_past_in_inputs and self.use_cache_branch is not False: - input_names.append("past_key_values") - - for input_name in input_names: - input_was_inserted = False - for dummy_input_gen in dummy_inputs_generators: - if dummy_input_gen.supports_input(input_name): - dummy_inputs[input_name] = self.overwrite_shape_and_generate_input( - dummy_input_gen, - input_name, - framework, - input_shapes=kwargs, - ) - input_was_inserted = True - break - if not input_was_inserted: - raise RuntimeError( - f'Could not generate dummy input for "{input_name}". Try adding a proper dummy input generator to the model ONNX config.' - ) - - # refer to https://github.com/huggingface/optimum/pull/764 - cond1 = self.use_past_in_inputs - cond2 = self.PAD_ATTENTION_MASK_TO_PAST - cond3 = self.use_cache_branch is not False - cond4 = "attention_mask" in dummy_inputs - if (cond1 and cond2 and cond3 and cond4): - # Obtain the past sequence length from the value instead of the key (Bloom). 
- past_length = dummy_inputs["past_key_values"][0][1].shape[1] - - dummy_inputs["attention_mask"] = DummyInputGenerator.pad_input_on_dim( - dummy_inputs["attention_mask"], - desired_length=past_length + 1, - dim=1, - dtype=dummy_inputs["attention_mask"].dtype, - ) - - return dummy_inputs - - @property - def inputs(self) -> Dict[str, Dict[int, str]]: - common_inputs = super().inputs - if not self.no_position_ids and self.task == "text-generation": - common_inputs["position_ids"] = {0: "batch_size", 1: "sequence_length"} - - return common_inputs - - def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): - """ - Fills `input_or_outputs` mapping with past_key_values dynamic axes considering the direction. - - Args: - inputs_or_outputs (`Dict[str, Dict[int, str]]`): The mapping to fill. - direction (`str`): - either "inputs" or "outputs", it specifies whether `input_or_outputs` is the input mapping or the - output mapping, this is important for axes naming. - """ - if direction not in ["inputs", "outputs"]: - raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') - - if direction == "inputs": - decoder_sequence_name = "past_sequence_length" - name = "past_key_values" - else: - decoder_sequence_name = "past_sequence_length + 1" - name = "present" - - for i in range(self._normalized_config.num_layers): - inputs_or_outputs[f"{name}.{i}.key"] = {0: "batch_size", 1: decoder_sequence_name} - inputs_or_outputs[f"{name}.{i}.value"] = {0: "batch_size", 1: decoder_sequence_name} - - -@register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"]) -class Baichaun2OpenVINOConfig(TextDecoderOnnxConfig): - DEFAULT_ONNX_OPSET = 13 - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( - num_layers='num_hidden_layers', num_attention_heads='num_attention_heads', hidden_size='hidden_size' - ) - - -@register_in_tasks_manager("jais", *["text-generation", "text-generation-with-past"]) -class JaisOpenVINOConfig(TextDecoderOnnxConfig): - DEFAULT_ONNX_OPSET = 13 - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_layers='n_layer', num_attention_heads='n_head', hidden_size='n_embd') - - -class ChatGLM2NormalizedConfig(NormalizedTextConfig): - NUM_LAYERS = "num_layers" - VOCAB_SIZE = "padded_vocab_size" - - -class ChatGLM2DummyTextInputGenerator(DummyTextInputGenerator): - SUPPORTED_INPUT_NAMES = { - "input_ids", - "attention_mask", - "token_type_ids", - "position_ids", - } - - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - input = super().generate(input_name, framework, int_dtype, float_dtype) - if input_name == "attention_mask": - input = torch.ones(input.shape, dtype=input.dtype) - if input_name == "position_ids": - bs = input.shape[0] - input = torch.range(0, input.shape[1], dtype=input.dtype).repeat(bs, 1) - return input - - -class ChatGLM2DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): - def __init__( - self, - task: str, - normalized_config: NormalizedTextConfig, - batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], - sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], - random_batch_size_range: Optional[Tuple[int, int]] = None, - random_sequence_length_range: Optional[Tuple[int, int]] = None, - **kwargs, - ): - super().__init__( - task=task, - normalized_config=normalized_config, - batch_size=batch_size, - sequence_length=sequence_length, - random_batch_size_range=random_batch_size_range, - 
random_sequence_length_range=random_sequence_length_range, - ) - self.multi_query_group_num = normalized_config.multi_query_group_num - self.head_dim = self.hidden_size // self.num_attention_heads - - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - past_key_shape = ( - self.sequence_length, - self.batch_size, - self.multi_query_group_num, - self.head_dim, - ) - past_value_shape = ( - self.sequence_length, - self.batch_size, - self.multi_query_group_num, - self.head_dim, - ) - return [ - ( - self.random_float_tensor(past_key_shape, framework=framework, dtype=float_dtype), - self.random_float_tensor(past_value_shape, framework=framework, dtype=float_dtype), - ) - for _ in range(self.num_layers) - ] - - -@register_in_tasks_manager("chatglm", *["text-generation", "text-generation-with-past"]) -class ChatGLM2OpenVINOConfig(TextDecoderOnnxConfig): - NORMALIZED_CONFIG_CLASS = ChatGLM2NormalizedConfig - DUMMY_INPUT_GENERATOR_CLASSES = (ChatGLM2DummyTextInputGenerator, ChatGLM2DummyPastKeyValuesGenerator) - DUMMY_PKV_GENERATOR_CLASS = ChatGLM2DummyPastKeyValuesGenerator - no_position_ids = False - - def generate_dummy_inputs(self, framework: str = "pt", **kwargs): - dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs) - - dummy_inputs = {} - input_names = [key for key in self.inputs.keys() if not key.startswith("past_key_values")] - if self.use_past_in_inputs and self.use_cache_branch is not False: - input_names.append("past_key_values") - - for input_name in input_names: - input_was_inserted = False - for dummy_input_gen in dummy_inputs_generators: - if dummy_input_gen.supports_input(input_name): - dummy_inputs[input_name] = self.overwrite_shape_and_generate_input( - dummy_input_gen, - input_name, - framework, - input_shapes=kwargs, - ) - input_was_inserted = True - break - if not input_was_inserted: - raise RuntimeError( - f'Could not generate dummy input for "{input_name}". Try adding a proper dummy input generator to the model ONNX config.' - ) - - # refer to https://github.com/huggingface/optimum/pull/764 - cond1 = self.use_past_in_inputs - cond2 = self.PAD_ATTENTION_MASK_TO_PAST - cond3 = self.use_cache_branch is not False - cond4 = "attention_mask" in dummy_inputs - if (cond1 and cond2 and cond3 and cond4): - # Obtain the past sequence length from the value instead of the key (Bloom). - past_length = dummy_inputs["past_key_values"][0][1].shape[0] - for k, v in dummy_inputs.items(): - if k not in ["attention_mask", "past_key_values"]: - dummy_inputs[k] = v[:, -1:] - - dummy_inputs["attention_mask"] = DummyInputGenerator.pad_input_on_dim( - dummy_inputs["attention_mask"], - desired_length=past_length + 1, - dim=1, - dtype=dummy_inputs["attention_mask"].dtype, - ) - - return dummy_inputs - - @property - def inputs(self) -> Dict[str, Dict[int, str]]: - common_inputs = super().inputs - if not self.no_position_ids and self.task == "text-generation": - common_inputs["position_ids"] = {0: "batch_size", 1: "sequence_length"} - - return common_inputs - - def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): - """ - Fills `input_or_outputs` mapping with past_key_values dynamic axes considering the direction. - - Args: - inputs_or_outputs (`Dict[str, Dict[int, str]]`): The mapping to fill. - direction (`str`): - either "inputs" or "outputs", it specifies whether `input_or_outputs` is the input mapping or the - output mapping, this is important for axes naming. 
- """ - if direction not in ["inputs", "outputs"]: - raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') - - if direction == "inputs": - decoder_sequence_name = "past_sequence_length" - name = "past_key_values" - else: - decoder_sequence_name = "past_sequence_length + 1" - name = "present" - - for i in range(self._normalized_config.num_layers): - inputs_or_outputs[f"{name}.{i}.key"] = {1: "batch_size", 0: decoder_sequence_name} - inputs_or_outputs[f"{name}.{i}.value"] = {1: "batch_size", 0: decoder_sequence_name} - - -TasksManager._SUPPORTED_MODEL_TYPE['stablelm_epoch'] = TasksManager._SUPPORTED_MODEL_TYPE['llama'] -TasksManager._SUPPORTED_MODEL_TYPE['stablelm-epoch'] = TasksManager._SUPPORTED_MODEL_TYPE['llama'] -TasksManager._SUPPORTED_MODEL_TYPE["aquila"] = TasksManager._SUPPORTED_MODEL_TYPE["llama"] -TasksManager._SUPPORTED_MODEL_TYPE["codegen2"] = TasksManager._SUPPORTED_MODEL_TYPE["codegen"] - - -@register_in_tasks_manager('phi', *["text-generation", "text-generation-with-past"]) -class PhiOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/llm_bench/python/utils/metrics_print.py b/llm_bench/python/utils/metrics_print.py deleted file mode 100644 index e83d009a77..0000000000 --- a/llm_bench/python/utils/metrics_print.py +++ /dev/null @@ -1,123 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -import logging as log - - -def print_metrics( - iter_num, iter_data, tms=None, tms_infer=None, generated=None, warm_up=False, max_rss_mem=-1, max_shared_mem=-1, - stable_diffusion=None, tokenization_time=None -): - if tms is None: - tms = [] - if tms_infer is None: - tms_infer = [] - iter_str = str(iter_num) - if warm_up: - iter_str = 'warm-up' - if iter_data['input_size'] != '': - log.info(f"[{iter_str}] Input token size: {iter_data['input_size']}") - if iter_data['output_size'] != '': - log.info(f"[{iter_str}] Output size: {iter_data['output_size']}") - if iter_data['infer_count'] != '': - log.info(f"[{iter_str}] Infer count: {iter_data['infer_count']}") - if tokenization_time: - encode_time = tokenization_time[0] - log.info(f"[{iter_str}] Tokenization Time: {encode_time:.2f}ms") - if len(tokenization_time) > 1: - decode_time = tokenization_time[1] - log.info(f"[{iter_str}] Detokenization Time: {decode_time:.2f}ms") - if iter_data['generation_time'] != '': - log.info(f"[{iter_str}] Generation Time: {iter_data['generation_time']:.2f}s") - if iter_data['latency'] != '': - log.info(f"[{iter_str}] Latency: {iter_data['latency']:.2f} ms/token") - if generated is not None: - log.info(f'[{iter_str}] Generated:\n{generated}') - if iter_data['result_md5'] != '': - log.info(f"[{iter_str}] Result MD5:{iter_data['result_md5']}") - if len(tms) > 0: - iter_data['first_token_latency'] = tms[0] * 1000 if len(tms) > 0 else -1 - iter_data['other_tokens_avg_latency'] = sum(tms[1:]) / (len(tms) - 1) * 1000 if len(tms) > 1 else -1 - log.info( - f"[{iter_str}] First token latency: {iter_data['first_token_latency']:.2f} ms/token, " - f"other tokens latency: {iter_data['other_tokens_avg_latency']:.2f} ms/token, len of tokens: {len(tms)}", - ) - if len(tms_infer) > 0: - iter_data['first_token_infer_latency'] = tms_infer[0] * 1000 if len(tms_infer) > 0 else -1 - iter_data['other_tokens_infer_avg_latency'] = sum(tms_infer[1:]) / (len(tms_infer) - 1) * 1000 if len(tms_infer) > 1 else -1 - log.info( - f"[{iter_str}] First token infer latency: 
{iter_data['first_token_infer_latency']:.2f} ms/token, " - f"other tokens infer latency: {iter_data['other_tokens_infer_avg_latency']:.2f} ms/token, len of tokens: {len(tms_infer)}", - ) - if stable_diffusion is not None: - print_stable_diffusion_infer_latency(iter_str, iter_data, stable_diffusion) - if max_rss_mem != '' and max_rss_mem > -1: - log.info(f'[{iter_str}] max rss memory cost:\n{max_rss_mem}') - if max_shared_mem != '' and max_shared_mem > -1: - log.info(f'[{iter_str}] max shared memory cost:\n{max_shared_mem}') - - -def print_stable_diffusion_infer_latency(iter_str, iter_data, stable_diffusion): - iter_data['first_token_latency'] = stable_diffusion.get_1st_unet_latency() - iter_data['other_tokens_avg_latency'] = stable_diffusion.get_2nd_unet_latency() - iter_data['first_token_infer_latency'] = iter_data['first_token_latency'] - iter_data['other_tokens_infer_avg_latency'] = iter_data['other_tokens_avg_latency'] - log.info(f"[{iter_str}] First step of unet latency: {iter_data['first_token_latency']:.2f} ms/step, " - f"other steps of unet latency: {iter_data['other_tokens_avg_latency']:.2f} ms/step",) - log.info(f"[{iter_str}] text encoder latency: {stable_diffusion.get_text_encoder_latency():.2f} ms/step, " - f"unet latency: {stable_diffusion.get_unet_latency():.2f} ms/step, " - f"vae decoder latency: {stable_diffusion.get_vae_decoder_latency():.2f} ms/step, " - f"text encoder step count: {stable_diffusion.get_text_encoder_step_count()}, " - f"unet step count: {stable_diffusion.get_unet_step_count()}, " - f"vae decoder step count: {stable_diffusion.get_vae_decoder_step_count()}",) - - -def print_ldm_unet_vqvae_infer_latency(iter_num, iter_data, tms=None, warm_up=False): - iter_str = str(iter_num) - if warm_up: - iter_str = 'warm-up' - len_tms = len(tms) - iter_data['first_token_latency'] = tms[0] * 1000 if len_tms > 0 else -1 - iter_data['other_tokens_avg_latency'] = sum(tms[1:(len_tms - 1)]) / (len_tms - 2) * 1000 if len_tms > 2 else 0 - iter_data['first_token_infer_latency'] = iter_data['first_token_latency'] - iter_data['other_tokens_infer_avg_latency'] = iter_data['other_tokens_avg_latency'] - - log.info(f"[{iter_str}] First step of unet latency: {iter_data['first_token_latency']:.2f} ms/step, " - f"other steps of unet latency: {iter_data['other_tokens_avg_latency']:.2f} ms/step",) - if len_tms > 1: - log.info(f"[{iter_str}] unet latency: {(sum(tms[0:(len_tms - 1)]) / (len_tms - 1)) * 1000:.2f} ms/step, " - f"vqvae decoder latency: {tms[len_tms - 1] * 1000:.2f} ms/step, " - f"unet step count: {len_tms - 1}, " - f"vqvae decoder step count: 1",) - - -def print_average(iter_data_list): - if len(iter_data_list) <= 1: - # 1st iteration is the warm-up iteration - return - total_generation_time = 0 - total_num_tokens = 0 - warm_up_iters = 0 - for iter_data in iter_data_list: - if iter_data['iteration'] == 0: - # Exclude the warm-up iteration - warm_up_iters = warm_up_iters + 1 - continue - if iter_data['generation_time'] != '': - total_generation_time += iter_data['generation_time'] - if iter_data['output_size'] != '': - total_num_tokens += iter_data['output_size'] - - total_iters = len(iter_data_list) - warm_up_iters - - if total_iters > 0: - log.info('<<< Warm-up iteration is excluded. 
>>>') - log.info(f'[Total] Iterations: {total_iters}') - if total_num_tokens > 0: - log.info(f'[Total] Output size: {total_num_tokens} tokens') - if total_generation_time > 0: - avg_per_iter_time = total_generation_time / total_iters - log.info(f'[Average] Iteration time: {avg_per_iter_time:.2f}s') - if total_num_tokens > 0: - avg_per_token_time = total_generation_time * 1000 / total_num_tokens - log.info(f'[Average] Latency: {avg_per_token_time:.2f} ms/token') diff --git a/llm_bench/python/utils/nncf_utils.py b/llm_bench/python/utils/nncf_utils.py deleted file mode 100644 index ce137b91fc..0000000000 --- a/llm_bench/python/utils/nncf_utils.py +++ /dev/null @@ -1,39 +0,0 @@ -from pathlib import Path - -import nncf - - -COMPRESSION_OPTIONS = { - "INT8": {"mode": nncf.CompressWeightsMode.INT8}, - "INT4_SYM": { - "mode": nncf.CompressWeightsMode.INT4_SYM, - "group_size": 128, - }, - "INT4_ASYM": { - "mode": nncf.CompressWeightsMode.INT4_ASYM, - "group_size": 128, - }, -} - - -def get_compressed_path(output_dir: str, base_precision, option: str): - return Path(output_dir) / "pytorch/dldt/compressed_weights" / f"OV_{base_precision}-{option}" - - -INT4_MODEL_CONFIGURATION = { - "dolly-v2-3b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.5}, - "gpt-j-6b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64}, - "opt-6.7b": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8}, - "bloomz-7b1": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 32, "ratio": 0.6}, - "red-pajama-incite-7b-instruct": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128}, - "zephyr-7b-beta": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.6}, - "llama-2-7b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6}, - "llama-2-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8}, - "llama-2-13b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8}, - "stablelm-3b-4e1t": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8}, - "stablelm-epoch-3b-preview": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64, "ratio": 0.8}, - "stable-zephyr-3b-dpo": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64, "ratio": 0.8}, - "rocket-3b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.8}, - "chatglm2-6b": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.72}, - "qwen-7b-chat": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128, "ratio": 0.6}, -} diff --git a/llm_bench/python/utils/output_csv.py b/llm_bench/python/utils/output_csv.py deleted file mode 100644 index 085f9f1b29..0000000000 --- a/llm_bench/python/utils/output_csv.py +++ /dev/null @@ -1,199 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -import csv -from pathlib import Path - - -def output_comments(result, use_case, writer): - for key in result.keys(): - result[key] = '' - writer.writerow(result) - - comment_list = [] - if use_case == 'text_gen' or use_case == 'code_gen': - comment_list.append('input_size: Input token size') - comment_list.append('output_size: Text/Code generation models: generated text token size') - comment_list.append("infer_count: Limit the Text/Code generation models' output token size") - comment_list.append('latency: Text/Code generation models: ms/token. 
Output token size / generation time') - comment_list.append('1st_latency: Text/Code generation models: Fisrt token latency') - comment_list.append('2nd_avg_latency: Text/Code generation models: Other tokens (exclude first token) latency') - comment_list.append('1st_infer_latency: Text/Code generation models: Fisrt inference latency') - comment_list.append('2nd_infer_avg_latency: Text/Code generation models: Other inferences (exclude first inference) latency') - comment_list.append('result_md5: MD5 of generated text') - comment_list.append('prompt_idx: Index of prompts') - elif use_case == 'image_gen': - comment_list.append("infer_count: Tex2Image models' Inference(or Sampling) step size") - comment_list.append('1st_latency: First step lantency of unet') - comment_list.append('2nd_avg_latency: Other steps latency of unet(exclude first step)') - comment_list.append('1st_infer_latency: Same as 1st_latency') - comment_list.append('2nd_infer_avg_latency: Same as 2nd_avg_latency') - comment_list.append('prompt_idx: Index of prompts') - elif use_case == 'ldm_super_resolution': - comment_list.append("infer_count: Tex2Image models' Inference(or Sampling) step size") - comment_list.append('1st_latency: First step lantency of unet') - comment_list.append('2nd_avg_latency: Other steps lantency of unet(exclude first step)') - comment_list.append('1st_infer_latency: Same as 1st_latency') - comment_list.append('2nd_infer_avg_latency: Same as 2nd_avg_latency') - comment_list.append('prompt_idx: Image Index') - comment_list.append('tokenization_time: Tokenizer encode time') - comment_list.append('detokenization_time: Tokenizer decode time') - comment_list.append('pretrain_time: Total time of load model and compile model') - comment_list.append('generation_time: Time for one interaction. (e.g. 
The duration of answering one question or generating one picture)') - comment_list.append('iteration=0: warm-up; iteration=-1: average (exclude warm-up)') - comment_list.append( - 'max_rss_mem: max rss memory consumption;' 'the value in -1 iteration row is the maximum value of all available RSS memory numbers in iterations', - ) - comment_list.append( - 'max_shared_mem: max shared memory consumption;' - 'the value in -1 iteration row is the maximum value of all available shared memory numbers in iterations', - ) - - for comments in comment_list: - result['iteration'] = comments - writer.writerow(result) - - -def write_result(report_file, model, framework, device, model_args, iter_data_list, pretrain_time, model_precision): - header = [ - 'iteration', - 'model', - 'framework', - 'device', - 'pretrain_time(s)', - 'input_size', - 'infer_count', - 'generation_time(s)', - 'output_size', - 'latency(ms)', - '1st_latency(ms)', - '2nd_avg_latency(ms)', - 'precision', - 'max_rss_mem(MB)', - 'max_shared_mem(MB)', - 'prompt_idx', - '1st_infer_latency(ms)', - '2nd_infer_avg_latency(ms)', - 'num_beams', - 'batch_size', - 'tokenization_time', - 'detokenization_time', - 'result_md5', - ] - out_file = Path(report_file) - - if len(iter_data_list) > 0: - with open(out_file, 'w+', newline='') as f: - writer = csv.DictWriter(f, header) - writer.writeheader() - - total_generation_time = 0 - total_num_tokens = 0 - total_input_size = 0 - total_infer_count = 0 - total_first_token_latency = 0 - total_other_tokens_avg_latency = 0 - total_first_token_infer_latency = 0 - total_other_tokens_infer_avg_latency = 0 - total_max_rss_mem_consumption = 0 - total_max_shared_mem_consumption = 0 - result = {} - result['model'] = model - result['framework'] = framework - result['device'] = device - result['pretrain_time(s)'] = round(pretrain_time, 5) - result['precision'] = model_precision - result['num_beams'] = model_args['num_beams'] - result['batch_size'] = model_args['batch_size'] - total_iters = len(iter_data_list) - - skip_iter_nums = 0 - for i in range(total_iters): - iter_data = iter_data_list[i] - generation_time = iter_data['generation_time'] - latency = iter_data['latency'] - first_latency = iter_data['first_token_latency'] - other_latency = iter_data['other_tokens_avg_latency'] - first_token_infer_latency = iter_data['first_token_infer_latency'] - other_token_infer_latency = iter_data['other_tokens_infer_avg_latency'] - rss_mem = iter_data['max_rss_mem_consumption'] - shared_mem = iter_data['max_shared_mem_consumption'] - result['iteration'] = str(iter_data['iteration']) - if i > 0: - result['pretrain_time(s)'] = '' - - result['input_size'] = iter_data['input_size'] - result['infer_count'] = iter_data['infer_count'] - result['generation_time(s)'] = round(generation_time, 5) if generation_time != '' else generation_time - result['output_size'] = iter_data['output_size'] - result['latency(ms)'] = round(latency, 5) if latency != '' else latency - result['result_md5'] = iter_data['result_md5'] - result['1st_latency(ms)'] = round(first_latency, 5) if first_latency != '' else first_latency - result['2nd_avg_latency(ms)'] = round(other_latency, 5) if other_latency != '' else other_latency - result['1st_infer_latency(ms)'] = round(first_token_infer_latency, 5) if first_token_infer_latency != '' else first_token_infer_latency - result['2nd_infer_avg_latency(ms)'] = round(other_token_infer_latency, 5) if other_token_infer_latency != '' else other_token_infer_latency - result['max_rss_mem(MB)'] = round(rss_mem, 5) if rss_mem != 
'' else rss_mem - result['max_shared_mem(MB)'] = round(shared_mem, 5) if shared_mem != '' else shared_mem - result['prompt_idx'] = iter_data['prompt_idx'] - result['tokenization_time'] = iter_data['tokenization_time'] - result['detokenization_time'] = iter_data['detokenization_time'] - writer.writerow(result) - - # Skip the warm-up iteration - if iter_data['iteration'] > 0: - if iter_data['generation_time'] != '': - total_generation_time += iter_data['generation_time'] - if iter_data['output_size'] != '': - total_num_tokens += iter_data['output_size'] - if iter_data['input_size'] != '': - total_input_size += iter_data['input_size'] - if iter_data['first_token_latency'] != '': - total_first_token_latency += iter_data['first_token_latency'] - if iter_data['other_tokens_avg_latency'] != '': - total_other_tokens_avg_latency += iter_data['other_tokens_avg_latency'] - if iter_data['first_token_infer_latency'] != '': - total_first_token_infer_latency += iter_data['first_token_infer_latency'] - if iter_data['other_tokens_infer_avg_latency'] != '': - total_other_tokens_infer_avg_latency += iter_data['other_tokens_infer_avg_latency'] - if iter_data['infer_count'] != '': - total_infer_count += iter_data['infer_count'] - else: - skip_iter_nums = skip_iter_nums + 1 - if iter_data['max_rss_mem_consumption'] != '': - if iter_data['max_rss_mem_consumption'] > total_max_rss_mem_consumption: - total_max_rss_mem_consumption = iter_data['max_rss_mem_consumption'] - if iter_data['max_shared_mem_consumption'] != '': - if iter_data['max_shared_mem_consumption'] > total_max_shared_mem_consumption: - total_max_shared_mem_consumption = iter_data['max_shared_mem_consumption'] - total_iters -= skip_iter_nums - if total_iters > 0: - result['iteration'] = str('-1') - result['pretrain_time(s)'] = '' - if total_input_size > 0: - result['input_size'] = round(total_input_size / total_iters, 5) - if total_infer_count > 0: - result['infer_count'] = round(total_infer_count / total_iters, 5) - if total_generation_time > 0: - result['generation_time(s)'] = round(total_generation_time / total_iters, 5) - if total_num_tokens > 0: - avg_per_token_time = total_generation_time * 1000 / total_num_tokens - result['output_size'] = round(total_num_tokens / total_iters, 5) - result['latency(ms)'] = round(avg_per_token_time, 5) - else: - result['output_size'] = '' - result['latency(ms)'] = '' - if total_first_token_latency > 0: - result['1st_latency(ms)'] = round(total_first_token_latency / total_iters, 5) - if total_other_tokens_avg_latency > 0: - result['2nd_avg_latency(ms)'] = round(total_other_tokens_avg_latency / total_iters, 5) - if total_first_token_infer_latency > 0: - result['1st_infer_latency(ms)'] = round(total_first_token_infer_latency / total_iters, 5) - if total_other_tokens_infer_avg_latency > 0: - result['2nd_infer_avg_latency(ms)'] = round(total_other_tokens_infer_avg_latency / total_iters, 5) - if total_max_rss_mem_consumption > 0: - result['max_rss_mem(MB)'] = total_max_rss_mem_consumption - if total_max_shared_mem_consumption > 0: - result['max_shared_mem(MB)'] = total_max_shared_mem_consumption - writer.writerow(result) - - output_comments(result, model_args['use_case'], writer) diff --git a/llm_bench/python/who_what_benchmark/README.md b/llm_bench/python/who_what_benchmark/README.md new file mode 100644 index 0000000000..ab72bc7a89 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/README.md @@ -0,0 +1,118 @@ +# Simple Accuracy Benchmark for Generative AI models + +## Features + +* Simple and quick accuracy test 
for compressed, quantized, pruned, distilled LLMs. It works with any model that supports the HuggingFace Transformers text generation API, including: + * HuggingFace Transformers compressed models via [Bitsandbytes](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.BitsAndBytesConfig) + * [GPTQ](https://huggingface.co/docs/transformers/main_classes/quantization#transformers.GPTQConfig) via HuggingFace API + * Llama.cpp via [BigDL-LLM](https://github.com/intel-analytics/BigDL/tree/main/python/llm) + * [OpenVINO](https://github.com/openvinotoolkit/openvino) and [NNCF](https://github.com/openvinotoolkit/nncf) via [Optimum-Intel](https://github.com/huggingface/optimum-intel) + * Support for custom datasets of the user's choice +* Validation of text-to-image pipelines. Computes a similarity score between generated images: + * Supports the Diffusers library and Optimum-Intel via the `Text2ImageEvaluator` class. + +The main idea is to compare the similarity of text generated by the baseline and the optimized LLMs. + +The API provides a way to investigate the worst generated text examples. + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer +import whowhatbench + +model_id = "facebook/opt-1.3b" +base_small = AutoModelForCausalLM.from_pretrained(model_id) +optimized_model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) + +evaluator = whowhatbench.TextEvaluator(base_model=base_small, tokenizer=tokenizer) +metrics_per_prompt, metrics = evaluator.score(optimized_model) + +metric_of_interest = "similarity" +print(metric_of_interest, ": ", metrics["similarity"][0]) + +worst_examples = evaluator.worst_examples(top_k=5, metric=metric_of_interest) +print("Metric: ", metric_of_interest) +for e in worst_examples: + print("\t=========================") + print("\tPrompt: ", e["prompt"]) + print("\tBaseline Model:\n ", "\t" + e["source_model"]) + print("\tOptimized Model:\n ", "\t" + e["optimized_model"]) + +``` + +Use your own list of prompts to compare (e.g. from a dataset): +```python +from datasets import load_dataset +val = load_dataset("lambada", split="validation[20:40]") +prompts = val["text"] +...
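+# evaluator and optimized_model are the objects created in the previous example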
+metrics_per_prompt, metrics = evaluator.score(optimized_model, test_data=prompts) +``` + +### Installing + +* `python -m venv eval_env` +* `source eval_env/bin/activate` +* `pip install -r requirements.txt` + +### CLI example for text-generation models + +```sh +wwb --help + +# Run ground truth generation for the uncompressed model on the first 32 samples from the squad dataset +# Ground truth will be saved in the llama_2_7b_squad_gt.csv file +wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_squad_gt.csv --dataset squad --split validation[:32] --dataset-field question + +# Run comparison with the compressed model on the first 32 samples from the squad dataset +wwb --target-model /home/user/models/Llama_2_7b_chat_hf_int8 --gt-data llama_2_7b_squad_gt.csv --dataset squad --split validation[:32] --dataset-field question + +# Output will be like this +# similarity FDT SDT FDT norm SDT norm +# 0 0.972823 67.296296 20.592593 0.735127 0.151505 + +# Run ground truth generation for the uncompressed model on the internal set of questions +# Ground truth will be saved in the llama_2_7b_wwb_gt.csv file +wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv + +# Run comparison with the compressed model on the internal set of questions +wwb --target-model /home/user/models/Llama_2_7b_chat_hf_int8 --gt-data llama_2_7b_wwb_gt.csv + +# Use --num-samples to control the number of samples +wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv --num-samples 10 + +# Use -v for verbose mode to see the difference in the results +wwb --target-model /home/user/models/Llama_2_7b_chat_hf_int8 --gt-data llama_2_7b_wwb_gt.csv --num-samples 10 -v + +# Use --hf AutoModelForCausalLM to instantiate the model from model_id/folder +wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv --hf + +# Use the --language parameter to control the language of prompts +# Autodetection works for basic Chinese models +wwb --base-model meta-llama/Llama-2-7b-chat-hf --gt-data llama_2_7b_wwb_gt.csv --hf +``` + +### Example of Stable Diffusion comparison +```sh +# Export FP16 model +optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format fp16 sd-lcm-fp16 +# Export INT8 WOQ model +optimum-cli export openvino -m SimianLuo/LCM_Dreamshaper_v7 --weight-format int8 sd-lcm-int8 +# Collect the references +wwb --base-model sd-lcm-fp16 --gt-data lcm_test/sd_xl.json --model-type text-to-image +# Compute the metric +wwb --target-model sd-lcm-int8 --gt-data lcm_test/sd_xl.json --model-type text-to-image +``` + +### Supported metrics + +* `similarity` - averaged similarity measured by a neural network trained for sentence embeddings. The best value is 1.0, the minimum is 0.0; higher is better. +* `FDT` - average position of the first divergent token between sentences generated by different LLMs. The worst value is 0; higher is better. [Paper.](https://arxiv.org/abs/2311.01544) +* `FDT norm` - average share of matched tokens until the first divergent one between sentences generated by different LLMs. The best value is 1; higher is better. [Paper.](https://arxiv.org/abs/2311.01544) +* `SDT` - average number of divergent tokens in the evaluated outputs between sentences generated by different LLMs. The best value is 0; lower is better. [Paper.](https://arxiv.org/abs/2311.01544) +* `SDT norm` - average share of divergent tokens in the evaluated outputs between sentences generated by different LLMs. The best value is 0, the maximum is 1; lower is better. 
[Paper.](https://arxiv.org/abs/2311.01544) + +### Notes + +* The generation of ground truth on uncompressed model must be run before comparison with compressed model. +* WWB uses [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) for similarity measurement but you can use other similar network. diff --git a/llm_bench/python/who_what_benchmark/examples/gptq_eval.py b/llm_bench/python/who_what_benchmark/examples/gptq_eval.py new file mode 100644 index 0000000000..4f5ecf8c94 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/examples/gptq_eval.py @@ -0,0 +1,29 @@ +import whowhatbench +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_id = "meta-llama/Llama-2-7b-chat-hf" +model_gptq_id = "TheBloke/Llama-2-7B-Chat-GPTQ" + +model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) + + +evaluator = whowhatbench.Evaluator(base_model=model, tokenizer=tokenizer) + +model_int4 = AutoModelForCausalLM.from_pretrained(model_gptq_id, device_map="auto") +all_metrics_per_question, all_metrics = evaluator.score(model_int4) + +print(all_metrics_per_question) +print(all_metrics) + +metrics = ["similarity", "SDT norm"] + +for metric in metrics: + worst_examples = evaluator.worst_examples(top_k=5, metric=metric) + print("Metric: ", metric) + for e in worst_examples: + print("\t=========================") + print(f"\t{metric}: ", e[metric]) + print("\tPrompt: ", e["prompt"]) + print("\tSource Model:\n ", "\t" + e["source_model"]) + print("\tOptimized Model:\n ", "\t" + e["optimized_model"]) diff --git a/llm_bench/python/who_what_benchmark/examples/huggingface_eval.py b/llm_bench/python/who_what_benchmark/examples/huggingface_eval.py new file mode 100644 index 0000000000..9d25d90500 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/examples/huggingface_eval.py @@ -0,0 +1,30 @@ +import whowhatbench +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_id = "meta-llama/Llama-2-7b-chat-hf" + +model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained(model_id) + + +evaluator = whowhatbench.Evaluator(base_model=model, tokenizer=tokenizer) + +model_int4 = AutoModelForCausalLM.from_pretrained( + model_id, load_in_4bit=True, device_map="auto" +) +all_metrics_per_question, all_metrics = evaluator.score(model_int4) + +print(all_metrics_per_question) +print(all_metrics) + +metrics = ["similarity", "SDT norm"] + +for metric in metrics: + worst_examples = evaluator.worst_examples(top_k=5, metric=metric) + print("Metric: ", metric) + for e in worst_examples: + print("\t=========================") + print(f"\t{metric}: ", e[metric]) + print("\tPrompt: ", e["prompt"]) + print("\tSource Model:\n ", "\t" + e["source_model"]) + print("\tOptimized Model:\n ", "\t" + e["optimized_model"]) diff --git a/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py b/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py new file mode 100644 index 0000000000..5781ddf229 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/examples/openvino_batched_eval.py @@ -0,0 +1,131 @@ +from pathlib import PosixPath +import os +import tempfile + +import whowhatbench +from whowhatbench.wwb import load_dataset +from optimum.intel.openvino import OVModelForCausalLM + +from openvino_genai import ( + ContinuousBatchingPipeline, + SchedulerConfig, + GenerationConfig, + CacheEvictionConfig, + 
AggregationMode, +) + +from openvino_tokenizers import convert_tokenizer +from openvino import serialize +from transformers import AutoTokenizer + +model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +MAX_NEW_TOKENS = 128 +SEQS_PER_REQUEST = 5 +MAX_SEQUENCES = 100 + + +model = OVModelForCausalLM.from_pretrained( + model_id, export=True, trust_remote_code=True +) +tokenizer = AutoTokenizer.from_pretrained(model_id) +model_path = PosixPath(tempfile.gettempdir()) / model_id +model.save_pretrained(model_path) + +ov_tokenizer, ov_detokenizer = convert_tokenizer( + tokenizer, with_detokenizer=True, skip_special_tokens=True +) +serialize(ov_tokenizer, model_path / "openvino_tokenizer.xml") +serialize(ov_detokenizer, model_path / "openvino_detokenizer.xml") + +scheduler_config_noopt = SchedulerConfig() +scheduler_config_noopt.num_kv_blocks = 300 +scheduler_config_noopt.dynamic_split_fuse = True +scheduler_config_noopt.max_num_batched_tokens = 256 +scheduler_config_noopt.max_num_seqs = 256 +scheduler_config_noopt.enable_prefix_caching = False + +scheduler_config_opt = SchedulerConfig() +scheduler_config_opt.num_kv_blocks = 300 +scheduler_config_opt.dynamic_split_fuse = True +scheduler_config_opt.max_num_batched_tokens = 256 +scheduler_config_opt.max_num_seqs = 256 +scheduler_config_opt.use_cache_eviction = True +scheduler_config_opt.enable_prefix_caching = False +eviction_config = CacheEvictionConfig(32, 32, 128, AggregationMode.NORM_SUM) +scheduler_config_opt.cache_eviction_config = eviction_config + +generation_config = GenerationConfig() +generation_config.num_return_sequences = 1 +generation_config.max_new_tokens = MAX_NEW_TOKENS + +data = load_dataset(path="squad", name=None, split="validation")["context"] +data_dict = {"prompts": list(dict({k: None for k in data}).keys())[:MAX_SEQUENCES]} + +model_cb_noopt = ContinuousBatchingPipeline( + model_path.absolute().as_posix(), scheduler_config_noopt, "CPU", {} +) +model_cb_opt = ContinuousBatchingPipeline( + model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {} +) + + +GT_DATA_FILE = "gt_data.csv" + +if os.path.exists(GT_DATA_FILE): + evaluator = whowhatbench.TextEvaluator( + base_model=model_cb_noopt, + gt_data=GT_DATA_FILE, + tokenizer=tokenizer, + test_data=data_dict, + generation_config=generation_config, + max_new_tokens=MAX_NEW_TOKENS, + seqs_per_request=3, + ) +else: + evaluator = whowhatbench.TextEvaluator( + base_model=model_cb_noopt, + tokenizer=tokenizer, + test_data=data_dict, + generation_config=generation_config, + max_new_tokens=MAX_NEW_TOKENS, + seqs_per_request=3, + ) + evaluator.dump_gt("gt_data.csv") + + +all_metrics_per_question, all_metrics = evaluator.score(model_cb_opt) + + +print(all_metrics_per_question) +print(all_metrics) + +metrics = ["similarity", "SDT norm"] + +for metric in metrics: + worst_examples = evaluator.worst_examples(top_k=5, metric=metric) + print("Metric: ", metric) + for e in worst_examples: + print("\t=========================") + print(f"\t{metric}: ", e[metric]) + print("\tPrompt: ", e["prompt"]) + print("\tSource Model:\n ", "\t" + e["source_model"]) + print("\tOptimized Model:\n ", "\t" + e["optimized_model"]) + +pipeline_opt_metrics = model_cb_opt.get_metrics() +pipeline_noopt_metrics = model_cb_noopt.get_metrics() + +print( + f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}" +) +print( + f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}" +) 
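+# The ratios below compare KV-cache usage of the default scheduler against the cache-eviction one;
+# values above 1.0 mean the optimized pipeline needed less cache for the same workload.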
+max_optimization_ratio = ( + pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage +) +avg_optimization_ratio = ( + pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage +) +print( + f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x" +) diff --git a/llm_bench/python/who_what_benchmark/examples/openvino_eval.py b/llm_bench/python/who_what_benchmark/examples/openvino_eval.py new file mode 100644 index 0000000000..5c3691897c --- /dev/null +++ b/llm_bench/python/who_what_benchmark/examples/openvino_eval.py @@ -0,0 +1,31 @@ +import whowhatbench +from optimum.intel.openvino import OVModelForCausalLM +from transformers import AutoTokenizer + +model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + +model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=False, export=True) +tokenizer = AutoTokenizer.from_pretrained(model_id) + + +evaluator = whowhatbench.Evaluator(base_model=model, tokenizer=tokenizer) + +model_int8 = OVModelForCausalLM.from_pretrained( + model_id, load_in_8bit=True, export=True +) +all_metrics_per_question, all_metrics = evaluator.score(model_int8) + +print(all_metrics_per_question) +print(all_metrics) + +metrics = ["similarity", "SDT norm"] + +for metric in metrics: + worst_examples = evaluator.worst_examples(top_k=5, metric=metric) + print("Metric: ", metric) + for e in worst_examples: + print("\t=========================") + print(f"\t{metric}: ", e[metric]) + print("\tPrompt: ", e["prompt"]) + print("\tSource Model:\n ", "\t" + e["source_model"]) + print("\tOptimized Model:\n ", "\t" + e["optimized_model"]) diff --git a/llm_bench/python/who_what_benchmark/requirements.txt b/llm_bench/python/who_what_benchmark/requirements.txt new file mode 100644 index 0000000000..caae595e69 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/requirements.txt @@ -0,0 +1,9 @@ +transformers>=4.35.2 +sentence-transformers>=2.2.2 +openvino>=2024.3.0 +openvino-telemetry +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +openvino-tokenizers +pandas>=2.0.3 +numpy>=1.23.5 +tqdm>=4.66.1 diff --git a/llm_bench/python/who_what_benchmark/setup.py b/llm_bench/python/who_what_benchmark/setup.py new file mode 100644 index 0000000000..e59d9d2630 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/setup.py @@ -0,0 +1,16 @@ +from setuptools import find_packages, setup + +with open("requirements.txt") as f: + required = f.read().splitlines() + +setup( + name="whowhatbench", + version="1.0.0", + url="https://github.com/openvinotoolkit/openvino.genai.git", + author="Intel", + author_email="andrey.anufriev@intel.com", + description="Short test for LLMs", + packages=find_packages(), + install_requires=required, + entry_points={"console_scripts": ["wwb=whowhatbench.wwb:main"]}, +) diff --git a/llm_bench/python/who_what_benchmark/tests/test_cli_image.py b/llm_bench/python/who_what_benchmark/tests/test_cli_image.py new file mode 100644 index 0000000000..a1e1b3934b --- /dev/null +++ b/llm_bench/python/who_what_benchmark/tests/test_cli_image.py @@ -0,0 +1,98 @@ +import subprocess # nosec B404 +import os +import shutil +import pytest +import logging + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def run_wwb(args): + logger.info(" ".join(["wwb"] + args)) + result = subprocess.run(["wwb"] + args, capture_output=True, text=True) + logger.info(result) + return result + + +@pytest.mark.parametrize( + ("model_id", "model_type", "backend"), + [ + 
("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"), + ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "openvino"), + ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "text-to-image", "hf"), + ], +) +def test_image_model_types(model_id, model_type, backend): + GT_FILE = "test_sd.json" + wwb_args = [ + "--base-model", + model_id, + "--target-model", + model_id, + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + ] + if backend == "hf": + wwb_args.append("--hf") + + result = run_wwb(wwb_args) + print(f"WWB result: {result}, {result.stderr}") + + try: + os.remove(GT_FILE) + except OSError: + pass + shutil.rmtree("reference", ignore_errors=True) + shutil.rmtree("target", ignore_errors=True) + + assert result.returncode == 0 + assert "Metrics for model" in result.stderr + assert "## Reference text" not in result.stderr + + +@pytest.mark.parametrize( + ("model_id", "model_type", "backend"), + [ + ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"), + ], +) +def test_image_custom_dataset(model_id, model_type, backend): + GT_FILE = "test_sd.json" + wwb_args = [ + "--base-model", + model_id, + "--num-samples", + "1", + "--gt-data", + GT_FILE, + "--device", + "CPU", + "--model-type", + model_type, + "--dataset", + "google-research-datasets/conceptual_captions", + "--dataset-field", + "caption", + ] + if backend == "hf": + wwb_args.append("--hf") + + result = run_wwb(wwb_args) + + assert os.path.exists(GT_FILE) + + try: + os.remove(GT_FILE) + except OSError: + pass + shutil.rmtree("reference", ignore_errors=True) + + assert result.returncode == 0 diff --git a/llm_bench/python/who_what_benchmark/tests/test_cli_text.py b/llm_bench/python/who_what_benchmark/tests/test_cli_text.py new file mode 100644 index 0000000000..161a9afb72 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/tests/test_cli_text.py @@ -0,0 +1,207 @@ +import subprocess # nosec B404 +import os +import shutil +import tempfile +import pandas as pd +import pytest +import logging + +from transformers import AutoTokenizer +from optimum.intel.openvino import OVModelForCausalLM, OVWeightQuantizationConfig + + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def run_wwb(args): + logger.info(" ".join(["wwb"] + args)) + result = subprocess.run(["wwb"] + args, capture_output=True, text=True) + logger.info(result) + return result + + +model_id = "facebook/opt-125m" +tmp_dir = tempfile.mkdtemp() +base_model_path = os.path.join(tmp_dir, "opt125m") +target_model_path = os.path.join(tmp_dir, "opt125m_int8") + + +def setup_module(): + from optimum.exporters.openvino.convert import export_tokenizer + + logger.info("Create models") + tokenizer = AutoTokenizer.from_pretrained(model_id) + base_model = OVModelForCausalLM.from_pretrained(model_id) + base_model.save_pretrained(base_model_path) + tokenizer.save_pretrained(base_model_path) + export_tokenizer(tokenizer, base_model_path) + + target_model = OVModelForCausalLM.from_pretrained( + model_id, quantization_config=OVWeightQuantizationConfig(bits=8) + ) + target_model.save_pretrained(target_model_path) + tokenizer.save_pretrained(target_model_path) + export_tokenizer(tokenizer, target_model_path) + + +def teardown_module(): + logger.info("Remove models") + shutil.rmtree(tmp_dir) + + +def test_text_target_model(): + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", 
+ "--device", + "CPU", + "--model-type", + "text", + ] + ) + + assert result.returncode == 0 + assert "Metrics for model" in result.stderr + assert "## Reference text" not in result.stderr + + +@pytest.fixture +def test_text_gt_data(): + with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: + temp_file_name = tmpfile.name + + result = run_wwb( + [ + "--base-model", + base_model_path, + "--gt-data", + temp_file_name, + "--dataset", + "EleutherAI/lambada_openai,en", + "--dataset-field", + "text", + "--split", + "test", + "--num-samples", + "2", + "--device", + "CPU", + ] + ) + data = pd.read_csv(temp_file_name) + os.remove(temp_file_name) + + assert result.returncode == 0 + assert len(data["questions"].values) == 2 + + +def test_text_output_directory(): + with tempfile.TemporaryDirectory() as temp_dir: + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--output", + temp_dir, + ] + ) + assert result.returncode == 0 + assert "Metrics for model" in result.stderr + assert os.path.exists(os.path.join(temp_dir, "metrics_per_qustion.csv")) + assert os.path.exists(os.path.join(temp_dir, "metrics.csv")) + + +def test_text_verbose(): + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--verbose", + ] + ) + assert result.returncode == 0 + assert "## Diff " in result.stderr + + +def test_text_language_autodetect(): + with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: + temp_file_name = tmpfile.name + + result = run_wwb( + [ + "--base-model", + "Qwen/Qwen2-0.5B", + "--gt-data", + temp_file_name, + "--num-samples", + "2", + "--device", + "CPU", + ] + ) + data = pd.read_csv(temp_file_name) + os.remove(temp_file_name) + + assert result.returncode == 0 + assert "马克" in data["prompts"].values[0] + + +def test_text_hf_model(): + with tempfile.NamedTemporaryFile(suffix=".csv") as tmpfile: + temp_file_name = tmpfile.name + + result = run_wwb( + [ + "--base-model", + model_id, + "--gt-data", + temp_file_name, + "--num-samples", + "2", + "--device", + "CPU", + "--hf", + ] + ) + data = pd.read_csv(temp_file_name) + os.remove(temp_file_name) + + assert result.returncode == 0 + assert len(data["prompts"].values) == 2 + + +def test_text_genai_model(): + result = run_wwb( + [ + "--base-model", + base_model_path, + "--target-model", + target_model_path, + "--num-samples", + "2", + "--device", + "CPU", + "--genai", + ] + ) + assert result.returncode == 0 + assert "Metrics for model" in result.stderr + assert "## Reference text" not in result.stderr diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py b/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py new file mode 100644 index 0000000000..4d61b0d086 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/whowhatbench/__init__.py @@ -0,0 +1,13 @@ +from .registry import register_evaluator, MODELTYPE2TASK, EVALUATOR_REGISTRY +from .text_evaluator import TextEvaluator +from .text_evaluator import TextEvaluator as Evaluator +from .text2image_evaluator import Text2ImageEvaluator + +__all__ = [ + "Evaluator", + "register_evaluator", + "TextEvaluator", + "Text2ImageEvaluator", + "MODELTYPE2TASK", + "EVALUATOR_REGISTRY", +] diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/registry.py b/llm_bench/python/who_what_benchmark/whowhatbench/registry.py new file mode 100644 index 0000000000..867b53e27a --- /dev/null +++ 
b/llm_bench/python/who_what_benchmark/whowhatbench/registry.py @@ -0,0 +1,37 @@ + +from abc import ABC, abstractmethod + + +# Registry for evaluators +EVALUATOR_REGISTRY = {} +MODELTYPE2TASK = { + "text": "text-generation", + "text-to-image": "text-to-image", +} + + +def register_evaluator(*names): + def decorate(cls): + for name in names: + assert ( + name not in EVALUATOR_REGISTRY + ), f"Evaluator named '{name}' conflicts with existing evaluators! Please register with a non-conflicting alias instead." + + EVALUATOR_REGISTRY[name] = cls + return cls + + return decorate + + +class BaseEvaluator(ABC): + @abstractmethod + def dump_gt(self, csv_name: str): + pass + + @abstractmethod + def score(self, model, **kwargs): + pass + + @abstractmethod + def worst_examples(self, top_k: int = 5, metric="similarity"): + pass diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py b/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py new file mode 100644 index 0000000000..79dda2dcc9 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/whowhatbench/text2image_evaluator.py @@ -0,0 +1,162 @@ +import os +from typing import Any, Union + +import pandas as pd +from tqdm import tqdm +from transformers import set_seed +import torch + +from .registry import register_evaluator, BaseEvaluator + +from .whowhat_metrics import ImageSimilarity + +default_data = { + "prompts": [ + "Cinematic, a vibrant Mid-century modern dining area, colorful chairs and a sideboard, ultra realistic, many detail", + "colibri flying near a flower, side view, forest background, natural light, photorealistic, 4k", + "Illustration of an astronaut sitting in outer space, moon behind him", + "A vintage illustration of a retro computer, vaporwave aesthetic, light pink and light blue", + "A view from beautiful alien planet, very beautiful, surealism, retro astronaut on the first plane, 8k photo", + "red car in snowy forest, epic vista, beautiful landscape, 4k, 8k", + "A raccoon trapped inside a glass jar full of colorful candies, the background is steamy with vivid colors", + "cute cat 4k, high-res, masterpiece, best quality, soft lighting, dynamic angle", + "A cat holding a sign that says hello OpenVINO", + "A small cactus with a happy face in the Sahara desert.", + ], +} + + +@register_evaluator("text-to-image") +class Text2ImageEvaluator(BaseEvaluator): + def __init__( + self, + base_model: Any = None, + gt_data: str = None, + test_data: Union[str, list] = None, + metrics="similarity", + similarity_model_id: str = "openai/clip-vit-large-patch14", + resolution=(512, 512), + num_inference_steps=4, + crop_prompts=True, + num_samples=None, + gen_image_fn=None, + seed=42, + ) -> None: + assert ( + base_model is not None or gt_data is not None + ), "Text generation pipeline for evaluation or ground trush data must be defined" + + self.test_data = test_data + self.metrics = metrics + self.resolution = resolution + self.crop_prompt = crop_prompts + self.num_samples = num_samples + self.num_inference_steps = num_inference_steps + self.seed = seed + self.similarity = None + self.similarity = ImageSimilarity(similarity_model_id) + self.last_cmp = None + self.gt_dir = os.path.dirname(gt_data) + if base_model: + self.gt_data = self._generate_data( + base_model, gen_image_fn, os.path.join(self.gt_dir, "reference") + ) + else: + self.gt_data = pd.read_csv(gt_data, keep_default_na=False) + + def dump_gt(self, csv_name: str): + self.gt_data.to_csv(csv_name) + + def score(self, model, gen_image_fn=None): + 
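+        # Generate images with the evaluated model and score them against the stored reference images.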
predictions = self._generate_data( + model, gen_image_fn, os.path.join(self.gt_dir, "target") + ) + + all_metrics_per_prompt = {} + all_metrics = {} + + if self.similarity: + metric_dict, metric_per_question = self.similarity.evaluate( + self.gt_data, predictions + ) + all_metrics.update(metric_dict) + all_metrics_per_prompt.update(metric_per_question) + + self.last_cmp = all_metrics_per_prompt + self.last_cmp["prompts"] = predictions["prompts"].values + self.last_cmp["source_model"] = self.gt_data["images"].values + self.last_cmp["optimized_model"] = predictions["images"].values + self.last_cmp = pd.DataFrame(self.last_cmp) + + return pd.DataFrame(all_metrics_per_prompt), pd.DataFrame([all_metrics]) + + def worst_examples(self, top_k: int = 5, metric="similarity"): + assert self.last_cmp is not None + + res = self.last_cmp.nsmallest(top_k, metric) + res = list(row for idx, row in res.iterrows()) + + return res + + def _generate_data(self, model, gen_image_fn=None, image_dir="reference"): + if hasattr(model, "reshape") and self.resolution is not None: + model.reshape( + batch_size=1, + height=self.resolution[0], + width=self.resolution[1], + num_images_per_prompt=1, + ) + + def default_gen_image_fn(model, prompt, num_inference_steps, generator=None): + output = model( + prompt, + num_inference_steps=num_inference_steps, + output_type="pil", + width=self.resolution[0], + height=self.resolution[0], + generator=generator, + ) + return output.images[0] + + gen_image_fn = gen_image_fn or default_gen_image_fn + + if self.test_data: + if isinstance(self.test_data, str): + data = pd.read_csv(self.test_data) + else: + if isinstance(self.test_data, dict): + assert "prompts" in self.test_data + data = dict(self.test_data) + else: + data = {"prompts": list(self.test_data)} + data = pd.DataFrame.from_dict(data) + else: + data = pd.DataFrame.from_dict(default_data) + + prompts = data["prompts"] + prompts = ( + prompts.values + if self.num_samples is None + else prompts.values[: self.num_samples] + ) + images = [] + rng = torch.Generator(device="cpu") + + if not os.path.exists(image_dir): + os.makedirs(image_dir) + for i, prompt in tqdm(enumerate(prompts), desc="Evaluate pipeline"): + set_seed(self.seed) + image = gen_image_fn( + model, + prompt, + self.num_inference_steps, + generator=rng.manual_seed(self.seed), + ) + image_path = os.path.join(image_dir, f"{i}.png") + image.save(image_path) + images.append(image_path) + + res_data = {"prompts": list(prompts), "images": images} + df = pd.DataFrame(res_data) + + return df diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/text_evaluator.py b/llm_bench/python/who_what_benchmark/whowhatbench/text_evaluator.py new file mode 100644 index 0000000000..436d2be034 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/whowhatbench/text_evaluator.py @@ -0,0 +1,268 @@ +from typing import Any, Union + +import pandas as pd +from tqdm import tqdm + +from .registry import register_evaluator, BaseEvaluator +from .whowhat_metrics import TextDivergency, TextSimilarity + +default_data = { + "en": { + "prompts": [ + "Who is Mark Twain?", + "Who is William Shakespeare?", + "Who is Agatha Christie?", + "Who is Barbara Cartland?", + "Who is Danielle Steel?", + "Who is Harold Robbins?", + "Who is Georges Simenon?", + "Who is Enid Blyton?", + "Who is Sidney Sheldon?", + "Who is Akira Toriyama?", + "Who is Leo Tolstoy?", + "Who is Alexander Pushkin?", + "Who is Stephen King?", + "What is C++?", + "What is Python?", + "What is Java?", + "What is JavaScript?", + "What 
is Perl?", + "What is OpenCV?", + "Who is the most famous writer?", + "Who is the most famous inventor?", + "Who is the most famous mathematician?", + "Who is the most famous composer?", + "Who is the most famous programmer?", + "Who is the most famous athlete?", + "Who is the most famous ancient Greek scientist?", + "What color will you get when you mix blue and yellow?", + ], + }, + "cn": { + "prompts": [ + "马克吐温是谁?", + "谁是威廉-莎士比亚?", + "阿加莎-克里斯蒂是谁?", + "芭芭拉-卡特兰是谁?", + "丹妮尔-斯蒂尔是谁?", + "谁是哈罗德-罗宾斯?", + "乔治-西默农是谁?", + "伊妮德-布莱顿是谁?", + "西德尼-谢尔顿是谁?", + "鸟山明是谁?", + "谁是列夫-托尔斯泰?", + "亚历山大-普希金是谁?", + "斯蒂芬-金是谁?", + "C++是什么?", + "Python是什么?", + "什么是 Java?", + "JavaScript是什么?", + "什么是 Perl?", + "什么是 OpenCV?", + "谁是最著名的作家?", + "谁是最有名的发明家?", + "谁是最著名的数学家?", + "最著名的作曲家是谁?", + "谁是最有名的程序员?", + "谁是最著名的运动员?", + "谁是最著名的古希腊科学家?", + "蓝色和黄色混合会得到什么颜色?", + ], + }, +} + + +def autodetect_language(model): + model2language = { + "chatglm": "cn", + "qwen2": "cn", + "qwen": "cn", + "baichuan": "cn", + "minicpmv": "cn", + "internlm": "cn", + } + + if not hasattr(model, "config"): + return "en" + return model2language.get(model.config.model_type, "en") + + +@register_evaluator( + "text-generation", "text-generation-with-past", "text2text-generation" +) +class TextEvaluator(BaseEvaluator): + def __init__( + self, + base_model: Any = None, + tokenizer: Any = None, + gt_data: str = None, + test_data: Union[str, list] = None, + metrics=("similarity", "divergency"), + similarity_model_id: str = "sentence-transformers/all-mpnet-base-v2", + max_new_tokens=128, + crop_question=True, + num_samples=None, + language=None, + gen_answer_fn=None, + generation_config=None, + generation_config_base=None, + seqs_per_request=None, + ) -> None: + assert ( + base_model is not None or gt_data is not None + ), "Text generation pipeline for evaluation or ground trush data must be defined" + + self.test_data = test_data + self.metrics = metrics + self.max_new_tokens = max_new_tokens + self.tokenizer = tokenizer + self._crop_question = crop_question + self.num_samples = num_samples + self.generation_config = generation_config + self.generation_config_base = generation_config + self.seqs_per_request = seqs_per_request + if self.generation_config is not None: + assert self.seqs_per_request is not None + + # Take language from the base model if provided + self.language = language + if self.language is None: + if base_model is not None: + self.language = autodetect_language(base_model) + + if base_model: + self.gt_data = self._generate_data( + base_model, gen_answer_fn, generation_config=generation_config + ) + else: + self.gt_data = pd.read_csv(gt_data, keep_default_na=False) + + # Take language ground truth if no base model provided + if self.language is None and "language" in self.gt_data.columns: + self.language = self.gt_data["language"].values[0] + + self.similarity = None + self.divergency = None + if "similarity" in self.metrics: + self.similarity = TextSimilarity(similarity_model_id) + if "divergency" in self.metrics: + assert tokenizer is not None + self.divergency = TextDivergency(tokenizer) + + self.last_cmp = None + + def dump_gt(self, csv_name: str): + self.gt_data.to_csv(csv_name) + + def score(self, model, gen_answer_fn=None): + predictions = self._generate_data(model, gen_answer_fn, self.generation_config) + + all_metrics_per_prompt = {} + all_metrics = {} + + if self.similarity: + metric_dict, metric_per_question = self.similarity.evaluate( + self.gt_data, predictions + ) + all_metrics.update(metric_dict) + 
all_metrics_per_prompt.update(metric_per_question) + + if self.divergency: + metric_dict, metric_per_question = self.divergency.evaluate( + self.gt_data, predictions + ) + all_metrics.update(metric_dict) + all_metrics_per_prompt.update(metric_per_question) + + self.last_cmp = all_metrics_per_prompt + self.last_cmp["prompts"] = predictions["prompts"].values + self.last_cmp["source_model"] = self.gt_data["answers"].values + self.last_cmp["optimized_model"] = predictions["answers"].values + self.last_cmp = pd.DataFrame(self.last_cmp) + self.last_cmp.rename(columns={"prompts": "prompt"}, inplace=True) + + return pd.DataFrame(all_metrics_per_prompt), pd.DataFrame([all_metrics]) + + def worst_examples(self, top_k: int = 5, metric="similarity"): + assert self.last_cmp is not None + + if metric in ["SDT", "SDT norm"]: + res = self.last_cmp.nlargest(top_k, metric) + else: + res = self.last_cmp.nsmallest(top_k, metric) + + res = list(row for idx, row in res.iterrows()) + + return res + + def _generate_data(self, model, gen_answer_fn=None, generation_config=None): + def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question): + inputs = self.tokenizer(prompt, return_tensors="pt") + + tokens = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens) + out = self.tokenizer.batch_decode(tokens, skip_special_tokens=True)[0] + return out[len(prompt) :] if crop_question else out + + gen_answer_fn = gen_answer_fn or default_gen_answer + + if self.test_data: + if isinstance(self.test_data, str): + data = pd.read_csv(self.test_data) + else: + if isinstance(self.test_data, dict): + assert "prompts" in self.test_data + data = dict(self.test_data) + else: + data = {"prompts": list(self.test_data)} + data = pd.DataFrame.from_dict(data) + else: + if self.language is None: + print( + "No language detecting in the base model or ground truth data. Taking language from target model." 
+ ) + self.language = autodetect_language(model) + data = pd.DataFrame.from_dict(default_data[self.language]) + + prompt_data = data["prompts"] + + answers = [] + prompts = ( + prompt_data.values + if self.num_samples is None + else prompt_data.values[: self.num_samples] + ) + + if generation_config is None: + for p in tqdm(prompts, desc="Evaluate pipeline"): + answers.append( + gen_answer_fn( + model, + self.tokenizer, + p, + self.max_new_tokens, + self._crop_question, + ) + ) + else: + with tqdm(total=len(prompt_data.values)) as progress_bar: + batch = [] + for p_idx, p in enumerate(prompt_data.values): + progress_bar.update(1) + batch.append(p) + if ( + len(batch) == self.seqs_per_request + or p_idx == len(prompt_data.values) - 1 + ): + ans_batch = model.generate( + batch, [generation_config] * len(batch) + ) + for ans in ans_batch: + answers.append(ans.m_generation_ids[0]) + + batch.clear() + + res_data = {"prompts": list(prompts), "answers": answers} + df = pd.DataFrame(res_data) + df["language"] = self.language + + return df diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py b/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py new file mode 100644 index 0000000000..bbf96a3312 --- /dev/null +++ b/llm_bench/python/who_what_benchmark/whowhatbench/whowhat_metrics.py @@ -0,0 +1,161 @@ +""" +Metrics for text similarity +""" + +from difflib import SequenceMatcher +from PIL import Image +import torch +import torch.nn.functional as F + +import numpy as np +from sentence_transformers import SentenceTransformer, util +from transformers import CLIPImageProcessor, CLIPModel +from tqdm import tqdm + + +def evaluate_similarity(model, data_gold, data_prediction): + answers_gold = data_gold["answers"].values + answers_prediction = data_prediction["answers"].values + + metric_per_question = [] + for gold, prediction in tqdm( + zip(answers_gold, answers_prediction), desc="Similarity evaluation" + ): + embeddings = model.encode([gold, prediction]) + cos_sim = util.cos_sim(embeddings, embeddings) + metric_per_question.append(cos_sim[0, 1].item()) + + metric_dict = {"similarity": np.mean(metric_per_question)} + return metric_dict, {"similarity": metric_per_question} + + +def evaluate_divergency(tokenizer, data_gold, data_prediction): + answers_gold = data_gold["answers"].values + answers_prediction = data_prediction["answers"].values + + DEBUG = False + # NOTE: a - reference answers, b - answers to evaluate + fdt_list = [] # each value = the position of first divergent (different) token. + sdt_list = [] # each value = number of tokens to correct in the prediction. 
+ sdtn_list = [] # each value = share of tokens to correct in the prediction + fdt_max = [] # each value = total number of tokens in the reference + for a_answer, b_answer in zip(answers_gold, answers_prediction): + a_indexes = tokenizer.encode(a_answer, return_tensors="pt").squeeze().tolist() + b_indexes = tokenizer.encode(b_answer, return_tensors="pt").squeeze().tolist() + if not a_indexes and not b_indexes: + sdt_list.append(0) + fdt_list.append(0) + sdtn_list.append(0) + fdt_max.append(0) + elif a_indexes and not b_indexes: + sdt_list.append(len(a_indexes)) + fdt_list.append(0) + sdtn_list.append(1) + fdt_max.append(len(a_indexes)) + elif not a_indexes and b_indexes: + sdt_list.append(len(b_indexes)) + fdt_list.append(0) + sdtn_list.append(1) + fdt_max.append(0) + else: + if isinstance(a_indexes, int): + a_indexes = list([a_indexes]) + if isinstance(b_indexes, int): + b_indexes = list([b_indexes]) + fdt_max.append(len(a_indexes)) + + matcher = SequenceMatcher(None, a_indexes, b_indexes) + blocks = matcher.get_matching_blocks() + a, b, size = blocks[0] + fdt = 0 + if a == 0 and b == 0: + fdt = blocks[0].size + fdt_list.append(fdt) + + num_matched = sum(block.size for block in blocks) + sdt = len(b_indexes) - num_matched + sdt_list.append(sdt) + sdt_norm = sdt / len(b_indexes) + sdtn_list.append(sdt_norm) + + if DEBUG: + print(blocks) + for block in blocks: + a, b, size = block + matched = a_indexes[a : a + size + 1] + print(matched) + print(tokenizer.decode(matched)) + matched = b_indexes[b : b + size + 1] + print(matched) + print(tokenizer.decode(matched)) + fdt_max = np.average(fdt_max) + metric_per_question = { + "FDT": fdt_list, + "SDT": sdt_list, + "FDT norm": np.array(fdt_list) / fdt_max, + "SDT norm": sdtn_list, + } + + fdt_avg = np.average(fdt_list) + metric_dict = { + "FDT": fdt_avg, + "SDT": np.average(sdt_list), + "FDT norm": fdt_avg / fdt_max, + "SDT norm": np.average(sdtn_list), + } + + return metric_dict, metric_per_question + + +class TextSimilarity: + def __init__(self, model_id) -> None: + self.model = SentenceTransformer(model_id) + + def evaluate(self, gt, prediction): + return evaluate_similarity(self.model, gt, prediction) + + +class TextDivergency: + def __init__(self, tokenizer) -> None: + self.tokenizer = tokenizer + + def evaluate(self, gt, prediction): + return evaluate_divergency(self.tokenizer, gt, prediction) + + +# Image metrics +def evaluate_image_similarity(processor, model, data_gold, data_prediction): + images_gold = data_gold["images"].values + images_prediction = data_prediction["images"].values + + metric_per_image = [] + for gold, prediction in tqdm( + zip(images_gold, images_prediction), desc="Image Similarity evaluation" + ): + gold_image = Image.open(gold) + prediction_image = Image.open(prediction) + + gold_inputs = processor(images=gold_image, return_tensors="pt")["pixel_values"] + prediction_inputs = processor(images=prediction_image, return_tensors="pt")[ + "pixel_values" + ] + + with torch.no_grad(): + gold_outputs = model.get_image_features(gold_inputs) + prediction_outputs = model.get_image_features(prediction_inputs) + + cos_sim = F.cosine_similarity(gold_outputs, prediction_outputs) + print("cos_sim: ", cos_sim.item()) + metric_per_image.append(cos_sim.item()) + + metric_dict = {"similarity": np.mean(metric_per_image)} + return metric_dict, {"similarity": metric_per_image} + + +class ImageSimilarity: + def __init__(self, model_id) -> None: + self.processor = CLIPImageProcessor.from_pretrained(model_id) + self.model = 
CLIPModel.from_pretrained(model_id).eval() + + def evaluate(self, gt, prediction): + return evaluate_image_similarity(self.processor, self.model, gt, prediction) diff --git a/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py b/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py new file mode 100644 index 0000000000..19c6aed2cd --- /dev/null +++ b/llm_bench/python/who_what_benchmark/whowhatbench/wwb.py @@ -0,0 +1,469 @@ +import argparse +import difflib +import os +import json +import pandas as pd +import logging +from datasets import load_dataset +from diffusers import DiffusionPipeline +from optimum.intel.openvino import OVModelForCausalLM +from optimum.utils import NormalizedConfigManager, NormalizedTextConfig +from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM + +from optimum.exporters.tasks import TasksManager +from optimum.intel import OVPipelineForText2Image + +import openvino_genai +from whowhatbench import EVALUATOR_REGISTRY, MODELTYPE2TASK + + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +TasksManager._SUPPORTED_MODEL_TYPE["stablelm-epoch"] = ( + TasksManager._SUPPORTED_MODEL_TYPE["llama"] +) +NormalizedConfigManager._conf["stablelm-epoch"] = NormalizedTextConfig.with_args( + num_layers="num_hidden_layers", + num_attention_heads="num_attention_heads", +) + + +class GenAIModelWrapper: + """ + A helper class to store additional attributes for GenAI models + """ + + def __init__(self, model, model_dir): + self.model = model + self.config = AutoConfig.from_pretrained(model_dir) + + def __getattr__(self, attr): + if attr in self.__dict__: + return getattr(self, attr) + else: + return getattr(self.model, attr) + + +def load_text_genai_pipeline(model_dir, device="CPU"): + try: + import openvino_genai + except ImportError: + logger.error("Failed to import openvino_genai package. 
Please install it.") + exit(-1) + logger.info("Using OpenVINO GenAI API") + return GenAIModelWrapper(openvino_genai.LLMPipeline(model_dir, device), model_dir) + + +def load_text_model( + model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False +): + if use_hf: + logger.info("Using HF Transformers API") + return AutoModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True, device_map=device.lower() + ) + + if use_genai: + return load_text_genai_pipeline(model_id, device) + + if ov_config: + with open(ov_config) as f: + ov_options = json.load(f) + else: + ov_options = None + try: + model = OVModelForCausalLM.from_pretrained( + model_id, trust_remote_code=True, device=device, ov_config=ov_options + ) + except ValueError: + config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) + model = OVModelForCausalLM.from_pretrained( + model_id, + config=config, + trust_remote_code=True, + use_cache=True, + device=device, + ov_config=ov_options, + ) + return model + + +TEXT2IMAGE_TASK2CLASS = { + "text-to-image": OVPipelineForText2Image, +} + + +def load_text2image_model( + model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False +): + if ov_config: + with open(ov_config) as f: + ov_options = json.load(f) + else: + ov_options = None + + if use_hf: + return DiffusionPipeline.from_pretrained(model_id, trust_remote_code=True) + + TEXT2IMAGEPipeline = TEXT2IMAGE_TASK2CLASS[model_type] + + try: + model = TEXT2IMAGEPipeline.from_pretrained( + model_id, trust_remote_code=True, device=device, ov_config=ov_options + ) + except ValueError: + config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) + model = TEXT2IMAGEPipeline.from_pretrained( + model_id, + config=config, + trust_remote_code=True, + use_cache=True, + device=device, + ov_config=ov_options, + ) + return model + + +def load_model( + model_type, model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False +): + from .registry import MODELTYPE2TASK + + if model_id is None: + return None + + if model_type == "text": + return load_text_model(model_id, device, ov_config, use_hf, use_genai) + elif MODELTYPE2TASK[model_type] == "text-to-image": + return load_text2image_model( + model_type, model_id, device, ov_config, use_hf, use_genai + ) + else: + raise ValueError(f"Unsupported model type: {model_type}") + + +def load_prompts(args): + if args.dataset is None: + return None + split = "validation" + if args.split is not None: + split = args.split + if "," in args.dataset: + path_name = args.dataset.split(",") + path = path_name[0] + name = path_name[1] + else: + path = args.dataset + name = None + data = load_dataset(path=path, name=name, split=split) + + res = data[args.dataset_field] + + res = {"prompts": list(res)} + + return res + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="WWB CLI", + description="This sript generates answers for questions from csv file", + ) + + parser.add_argument( + "--base-model", + default=None, + help="Model to ground truth generation.", + ) + parser.add_argument( + "--target-model", + default=None, + help="Model to comparison with base_model. Usually it is compressed, quantized version of base_model.", + ) + parser.add_argument( + "--tokenizer", + default=None, + help="Tokenizer for divergency metric. If not defined then will be load from base_model or target_model.", + ) + + parser.add_argument( + "--gt-data", + default=None, + help="CSV file with base_model generation. If defined and exists then base_model will not used." 
+        " If defined and does not exist, it will be generated by base_model evaluation.",
+    )
+    parser.add_argument(
+        "--model-type",
+        type=str,
+        choices=["text", "text-to-image"],
+        default="text",
+        help="Indicates the model type: 'text' for LLMs, 'text-to-image' for text-to-image pipelines.",
+    )
+    parser.add_argument(
+        "--data-encoder",
+        type=str,
+        default="sentence-transformers/all-mpnet-base-v2",
+        help="Model used to measure similarity between base_model and target_model."
+        " By default it is sentence-transformers/all-mpnet-base-v2,"
+        " but for Chinese LLMs sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 is a better choice.",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help="Name of the dataset with prompts. The dataset is loaded with load_dataset from the datasets library."
+        " Please provide this argument in the format path,name (for example wikitext,wikitext-2-v1)."
+        " If None, the internal list of prompts is used.",
+    )
+    parser.add_argument(
+        "--dataset-field",
+        type=str,
+        default="text",
+        help="The name of the dataset field to take prompts from, for example question or context in squad."
+        " Used only if dataset is defined.",
+    )
+    parser.add_argument(
+        "--split",
+        type=str,
+        default=None,
+        help="Split of prompts from the dataset (for example train, validation, train[:32])."
+        " Used only if dataset is defined.",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default=None,
+        help="Directory name for saving the per-sample comparison and metrics in CSV files.",
+    )
+    parser.add_argument(
+        "--num-samples",
+        type=int,
+        default=None,
+        help="Maximum number of prompts to use from the dataset.",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Print results and their differences.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="CPU",
+        help="Device to run the model on, e.g. 'CPU', 'GPU'.",
+    )
+    parser.add_argument(
+        "--ov-config",
+        type=str,
+        default=None,
+        help="Path to the JSON file that contains OpenVINO Runtime configuration.",
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        choices=["en", "cn"],
+        default=None,
+        help="Used to select default prompts based on the primary model language, e.g. 
'en', 'ch'.", + ) + parser.add_argument( + "--hf", + action="store_true", + help="Use AutoModelForCausalLM from transformers library to instantiate the model.", + ) + parser.add_argument( + "--genai", + action="store_true", + help="Use LLMPipeline from transformers library to instantiate the model.", + ) + + return parser.parse_args() + + +def check_args(args): + assert not (args.base_model is None and args.target_model is None) + assert not (args.base_model is None and args.gt_data is None) + + +def load_tokenizer(args): + tokenizer = None + if args.tokenizer is not None: + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=True + ) + elif args.base_model is not None: + tokenizer = AutoTokenizer.from_pretrained( + args.base_model, trust_remote_code=True + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + args.target_model, trust_remote_code=True + ) + + return tokenizer + + +def diff_strings(a: str, b: str, *, use_loguru_colors: bool = False) -> str: + output = [] + matcher = difflib.SequenceMatcher(None, a, b) + if use_loguru_colors: + green = "<GREEN><black>" + red = "<RED><black>" + endgreen = "</black></GREEN>" + endred = "</black></RED>" + else: + green = "\x1b[38;5;16;48;5;2m" + red = "\x1b[38;5;16;48;5;1m" + endgreen = "\x1b[0m" + endred = "\x1b[0m" + + for opcode, a0, a1, b0, b1 in matcher.get_opcodes(): + if opcode == "equal": + output.append(a[a0:a1]) + elif opcode == "insert": + output.append(f"{green}{b[b0:b1]}{endgreen}") + elif opcode == "delete": + output.append(f"{red}{a[a0:a1]}{endred}") + elif opcode == "replace": + output.append(f"{green}{b[b0:b1]}{endgreen}") + output.append(f"{red}{a[a0:a1]}{endred}") + return "".join(output) + + +def genai_gen_answer(model, tokenizer, question, max_new_tokens, skip_question): + config = openvino_genai.GenerationConfig() + config.max_new_tokens = max_new_tokens + out = model.generate(question, config) + return out + + +def get_evaluator(base_model, args): + # config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) + # task = TasksManager.infer_task_from_model(config._name_or_path) + # TODO: Add logic to auto detect task based on model_id (TaskManager does not work for locally saved models) + task = MODELTYPE2TASK[args.model_type] + + try: + EvaluatorCLS = EVALUATOR_REGISTRY[task] + prompts = load_prompts(args) + + if task == "text-generation": + tokenizer = load_tokenizer(args) + return EvaluatorCLS( + base_model=base_model, + gt_data=args.gt_data, + test_data=prompts, + tokenizer=tokenizer, + similarity_model_id=args.data_encoder, + num_samples=args.num_samples, + language=args.language, + gen_answer_fn=genai_gen_answer if args.genai else None, + ) + elif task == "text-to-image": + return EvaluatorCLS( + base_model=base_model, + gt_data=args.gt_data, + test_data=prompts, + num_samples=args.num_samples, + ) + else: + raise ValueError(f"Unsupported task: {task}") + + except KeyError: + raise ValueError( + f"Attempted to load evaluator for '{task}', but no evaluator for this model type found!" 
+ "Supported model types: {', '.join(EVALUATOR_REGISTRY.keys())}" + ) + + +def print_text_results(evaluator): + metric_of_interest = "similarity" + worst_examples = evaluator.worst_examples(top_k=5, metric=metric_of_interest) + for i, e in enumerate(worst_examples): + ref_text = "" + actual_text = "" + diff = "" + for l1, l2 in zip( + e["source_model"].splitlines(), e["optimized_model"].splitlines() + ): + if l1 == "" and l2 == "": + continue + ref_text += l1 + "\n" + actual_text += l2 + "\n" + diff += diff_strings(l1, l2) + "\n" + + logger.info( + "--------------------------------------------------------------------------------------" + ) + logger.info("## Reference text %d:\n%s", i + 1, ref_text) + logger.info("## Actual text %d:\n%s", i + 1, actual_text) + logger.info("## Diff %d: ", i + 1) + logger.info(diff) + + +def print_image_results(evaluator): + metric_of_interest = "similarity" + worst_examples = evaluator.worst_examples(top_k=1, metric=metric_of_interest) + for i, e in enumerate(worst_examples): + logger.info( + "--------------------------------------------------------------------------------------" + ) + logger.info(f"Top-{i+1} example:") + logger.info(e) + + +def main(): + args = parse_args() + check_args(args) + + if args.gt_data and os.path.exists(args.gt_data): + evaluator = get_evaluator(None, args) + else: + base_model = load_model( + args.model_type, + args.base_model, + args.device, + args.ov_config, + args.hf, + args.genai, + ) + evaluator = get_evaluator(base_model, args) + + if args.gt_data: + evaluator.dump_gt(args.gt_data) + del base_model + + if args.target_model: + target_model = load_model( + args.model_type, + args.target_model, + args.device, + args.ov_config, + args.hf, + args.genai, + ) + all_metrics_per_question, all_metrics = evaluator.score( + target_model, genai_gen_answer if args.genai else None + ) + logger.info("Metrics for model: %s", args.target_model) + logger.info(all_metrics) + + if args.output: + if not os.path.exists(args.output): + os.mkdir(args.output) + df = pd.DataFrame(all_metrics_per_question) + df.to_csv(os.path.join(args.output, "metrics_per_qustion.csv")) + df = pd.DataFrame(all_metrics) + df.to_csv(os.path.join(args.output, "metrics.csv")) + + if args.verbose and args.target_model is not None: + if args.model_type == "text": + print_text_results(evaluator) + elif "text-to-image" in args.model_type: + print_image_results(evaluator) + + +if __name__ == "__main__": + main() diff --git a/miniCPM-V-2_6.py b/miniCPM-V-2_6.py new file mode 100644 index 0000000000..205f12628c --- /dev/null +++ b/miniCPM-V-2_6.py @@ -0,0 +1,22 @@ +from optimum.intel.openvino import OVModelForVisualCausalLM +from transformers import AutoProcessor +from PIL import Image +import requests +import cv2 +import numpy as np +res = 448, 448 +im = np.arange(res[0] * res[1] * 3, dtype=np.uint8) % 255 +im = im.reshape([*res, 3]) +cv2.imwrite("lines.png", im) +model_id = "openbmb/MiniCPM-V-2_6" +processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) +prompt = processor.tokenizer.apply_chat_template([{"role": "user", "content": "(<image>./</image>)\nWhat is unusual on this image?"}], tokenize=False, add_generation_prompt=True) +image = Image.open("/home/vzlobin/r/g/g.png").convert('RGB') +# image = Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw).convert('RGB') +model = OVModelForVisualCausalLM.from_pretrained("MiniCPM-V-2_6", trust_remote_code=True) 
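+# The processor packs the chat-formatted prompt and the image into model inputs;
+# only the tokens generated after the input sequence are decoded and saved to ref.txt.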
+inputs = processor([prompt], [image], return_tensors="pt") +result = model.generate(**inputs, max_new_tokens=200) +decoded = processor.tokenizer.batch_decode(result[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0] +print(decoded) +with open("ref.txt", "w") as f: + f.write(decoded) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..7be4478108 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,49 @@ +[project] +name = "openvino-genai" +version = "2024.5.0.0" +description = "Python bindings for https://github.com/openvinotoolkit/openvino.genai" +requires-python = ">=3.8" +readme = {file = "src/README.md", content-type="text/markdown"} +license = {text = "OSI Approved :: Apache Software License"} +authors = [ + { name = "OpenVINO Developers", email = "openvino@intel.com" }, +] +classifiers = [ + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "openvino_tokenizers~=2024.5.0.0.dev" +] + +[tool.py-build-cmake.module] +directory = "src/python" + +[tool.py-build-cmake.sdist] +exclude = ["llm_bench", "samples", "tests", "thirdparty"] + +[tool.py-build-cmake.cmake] +minimum_version = "3.23" +build_type = "Release" +config = ["Release"] +find_python3 = true +build_args = ["--parallel", "--target", "py_generate_pipeline"] +install_args = ["--strip"] +install_components = ["wheel_genai"] +options = {"BUILD_TOKENIZERS" = "OFF"} + +[build-system] +requires = [ + "py-build-cmake@git+https://github.com/tttapa/py-build-cmake@7ab73da351c7140f06d727a8705bece4cf544cd9", + "cmake~=3.23" +] +build-backend = "py_build_cmake.build" + +[tool.pytest.ini_options] +markers = [ + "nightly", + "precommit: (deselect with '-m \"precommit\"')", +] diff --git a/requirements-build.txt b/requirements-build.txt new file mode 100644 index 0000000000..2611a89b08 --- /dev/null +++ b/requirements-build.txt @@ -0,0 +1 @@ +cmake~=3.30 \ No newline at end of file diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt new file mode 100644 index 0000000000..2a8f26ff4d --- /dev/null +++ b/samples/CMakeLists.txt @@ -0,0 +1,43 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +add_subdirectory(cpp/beam_search_causal_lm) +add_subdirectory(cpp/chat_sample) +add_subdirectory(cpp/continuous_batching_accuracy) +add_subdirectory(cpp/continuous_batching_benchmark) +add_subdirectory(cpp/greedy_causal_lm) +add_subdirectory(cpp/lora_greedy_causal_lm) +add_subdirectory(cpp/multinomial_causal_lm) +add_subdirectory(cpp/prompt_lookup_decoding_lm) +add_subdirectory(cpp/visual_language_chat) +add_subdirectory(cpp/speculative_decoding_lm) +add_subdirectory(cpp/benchmark_genai) +add_subdirectory(cpp/whisper_speech_recognition) +add_subdirectory(cpp/text2image) + +install(FILES requirements.txt DESTINATION samples + COMPONENT cpp_samples_genai) + +install(DIRECTORY + cpp/beam_search_causal_lm + cpp/chat_sample + cpp/greedy_causal_lm + cpp/multinomial_causal_lm + # Don't install prompt_lookup_decoding_lm and speculative_decoding_lm because they don't use openvino_genai library and arent verifyed yet. + # Don't install continuous_batching_accuracy and continuous_batching_benchmark because CB isn't ready. 
+    cpp/visual_language_chat
+    cpp/whisper_speech_recognition
+    cpp/text2image
+    cpp/lora_greedy_causal_lm
+    DESTINATION samples/cpp COMPONENT cpp_samples_genai)
+
+install(DIRECTORY
+    python/beam_search_causal_lm
+    python/chat_sample
+    python/greedy_causal_lm
+    python/multinomial_causal_lm
+    python/whisper_speech_recognition
+    # python/text2image
+    DESTINATION samples/python COMPONENT cpp_samples_genai
+    USE_SOURCE_PERMISSIONS)
diff --git a/samples/cpp/beam_search_causal_lm/CMakeLists.txt b/samples/cpp/beam_search_causal_lm/CMakeLists.txt new file mode 100644 index 0000000000..9bf1a8aac8 --- /dev/null +++ b/samples/cpp/beam_search_causal_lm/CMakeLists.txt @@ -0,0 +1,22 @@
+# Copyright (C) 2023-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+find_package(OpenVINOGenAI REQUIRED
+    HINTS
+    "${CMAKE_BINARY_DIR}"  # Reuse the package from the build.
+    ${OpenVINO_DIR}  # GenAI may be installed alongside OpenVINO.
+    NO_CMAKE_FIND_ROOT_PATH
+)
+
+add_executable(beam_search_causal_lm beam_search_causal_lm.cpp)
+target_link_libraries(beam_search_causal_lm PRIVATE openvino::genai)
+set_target_properties(beam_search_causal_lm PROPERTIES
+    COMPILE_PDB_NAME beam_search_causal_lm
+    # Ensure out of box LC_RPATH on macOS with SIP
+    INSTALL_RPATH_USE_LINK_PATH ON)
+target_compile_features(beam_search_causal_lm PRIVATE cxx_std_11)
+
+install(TARGETS beam_search_causal_lm
+    RUNTIME DESTINATION samples_bin/
+    COMPONENT samples_bin
+    EXCLUDE_FROM_ALL)
diff --git a/samples/cpp/beam_search_causal_lm/README.md b/samples/cpp/beam_search_causal_lm/README.md new file mode 100644 index 0000000000..0d2ee83bfc --- /dev/null +++ b/samples/cpp/beam_search_causal_lm/README.md @@ -0,0 +1,36 @@
+# Text generation C++ sample that supports most popular models like LLaMA 3
+
+This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. It's only possible to change the device for inference to a different one, GPU for example, from the command line interface. The sample features `ov::genai::LLMPipeline` and configures it to use multiple beam groups. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of an LLM-powered chatbot in Python.
+
+## Download and convert the model and tokenizers
+
+The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
+
+It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.
+
+```sh
+pip install --upgrade-strategy eager -r ../../requirements.txt
+optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
+```
+
+## Run
+
+`beam_search_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"`
+
+
+Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU.
+
+See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models.
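+
+As a rough guide, changing the device amounts to a one-line edit. The following sketch mirrors the configuration used in `beam_search_causal_lm.cpp`, with the model directory and prompt hard-coded purely for illustration:
+
+```cpp
+#include <openvino/genai/llm_pipeline.hpp>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+int main() {
+    // "GPU" selects an available GPU device; the shipped sample defaults to "CPU".
+    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "GPU");
+
+    ov::genai::GenerationConfig config;
+    config.max_new_tokens = 20;
+    config.num_beam_groups = 3;
+    config.num_beams = 15;
+    config.num_return_sequences = config.num_beams;
+
+    std::vector<std::string> prompts{"Why is the Sun yellow?"};
+    std::cout << pipe.generate(prompts, config) << '\n';
+}
+```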
+ +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp new file mode 100644 index 0000000000..5f3187f33b --- /dev/null +++ b/samples/cpp/beam_search_causal_lm/beam_search_causal_lm.cpp @@ -0,0 +1,36 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <openvino/genai/llm_pipeline.hpp> + +int main(int argc, char* argv[]) try { + if (argc < 3) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT 1>' ['<PROMPT 2>' ...]"); + } + auto prompts = std::vector<std::string>(argv + 2, argv + argc); + std::string model_path = argv[1]; + + std::string device = "CPU"; // GPU can be used as well + ov::genai::LLMPipeline pipe(model_path, device); + + ov::genai::GenerationConfig config; + config.max_new_tokens = 20; + config.num_beam_groups = 3; + config.num_beams = 15; + config.num_return_sequences = config.num_beams; + + // Since the streamer is set, the results will + // be printed each time a new token is generated. + auto beams = pipe.generate(prompts, config); + std::cout << beams << '\n'; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/benchmark_genai/CMakeLists.txt b/samples/cpp/benchmark_genai/CMakeLists.txt new file mode 100644 index 0000000000..902a05eee6 --- /dev/null +++ b/samples/cpp/benchmark_genai/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+    NO_CMAKE_FIND_ROOT_PATH
+)
+
+include(FetchContent)
+
+if(POLICY CMP0135)
+    cmake_policy(SET CMP0135 NEW)
+endif()
+
+FetchContent_Declare(cxxopts
+    URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz
+    URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08)
+FetchContent_MakeAvailable(cxxopts)
+
+add_executable(benchmark_genai benchmark_genai.cpp)
+target_link_libraries(benchmark_genai PRIVATE openvino::genai cxxopts::cxxopts)
+set_target_properties(benchmark_genai PROPERTIES
+    COMPILE_PDB_NAME benchmark_genai
+    # Ensure out of box LC_RPATH on macOS with SIP
+    INSTALL_RPATH_USE_LINK_PATH ON)
+
+install(TARGETS benchmark_genai
+    RUNTIME DESTINATION samples_bin/
+    COMPONENT samples_bin
+    EXCLUDE_FROM_ALL)
diff --git a/samples/cpp/benchmark_genai/README.md b/samples/cpp/benchmark_genai/README.md new file mode 100644 index 0000000000..1a46db05d9 --- /dev/null +++ b/samples/cpp/benchmark_genai/README.md @@ -0,0 +1,47 @@
+# LLM benchmarking sample
+
+This sample demonstrates how to benchmark an LLM in OpenVINO GenAI. It includes functionality for warm-up iterations, generating text, and calculating various performance metrics.
+
+## Download and convert the model and tokenizers
+
+The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version.
+
+It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported.
+
+```sh
+pip install --upgrade-strategy eager -r ../../requirements.txt
+optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
+```
+
+## Usage
+
+```sh
+benchmark_genai [OPTIONS]
+```
+
+### Options
+
+- `-m, --model`: Path to the model and tokenizers base directory.
+- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text.
+- `-nw, --num_warmup` (default: `1`): Number of warmup iterations.
+- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate.
+- `-n, --num_iter` (default: `3`): Number of iterations.
+- `-d, --device` (default: `"CPU"`): Device to run the model on.
+
+### Output:
+
+```
+benchmark_genai -m TinyLlama-1.1B-Chat-v1.0 -n 10
+```
+
+```
+Load time: 3405.69 ms
+Generate time: 1430.77 ± 3.04 ms
+Tokenization time: 0.51 ± 0.02 ms
+Detokenization time: 0.37 ± 0.01 ms
+TTFT: 81.60 ± 0.54 ms
+TPOT: 71.52 ± 2.72 ms
+Throughput tokens/s: 13.98 ± 0.53
+```
+
+For more information on how the performance metrics are calculated, please refer to the [performance-metrics tutorial](../../../src/README.md#performance-metrics).
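+
+The mean ± std values above are obtained by accumulating `ov::genai::PerfMetrics` over the measured iterations. Below is a minimal sketch of that accumulation, mirroring `benchmark_genai.cpp` (shown in full next), with the model path hard-coded as a placeholder:
+
+```cpp
+#include "openvino/genai/llm_pipeline.hpp"
+
+#include <iostream>
+
+int main() {
+    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "CPU");  // placeholder model path
+    ov::genai::GenerationConfig config;
+    config.max_new_tokens = 20;
+
+    // The first measured run initializes the metrics ...
+    ov::genai::DecodedResults res = pipe.generate("The Sky is blue because", config);
+    ov::genai::PerfMetrics metrics = res.perf_metrics;
+    // ... and every further run is merged in, so mean/std cover all iterations.
+    for (size_t i = 0; i < 9; ++i) {
+        res = pipe.generate("The Sky is blue because", config);
+        metrics = metrics + res.perf_metrics;
+    }
+    std::cout << "TPOT: " << metrics.get_tpot().mean << " +/- " << metrics.get_tpot().std << " ms/token\n";
+}
+```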
diff --git a/samples/cpp/benchmark_genai/benchmark_genai.cpp b/samples/cpp/benchmark_genai/benchmark_genai.cpp new file mode 100644 index 0000000000..8fadeac444 --- /dev/null +++ b/samples/cpp/benchmark_genai/benchmark_genai.cpp @@ -0,0 +1,74 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" +#include <cxxopts.hpp> + +int main(int argc, char* argv[]) try { + cxxopts::Options options("benchmark_vanilla_genai", "Help command"); + + options.add_options() + ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value(".")) + ("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because")) + ("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1))) + ("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(3))) + ("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value<size_t>()->default_value(std::to_string(20))) + ("d,device", "device", cxxopts::value<std::string>()->default_value("CPU")) + ("h,help", "Print usage"); + + cxxopts::ParseResult result; + try { + result = options.parse(argc, argv); + } catch (const cxxopts::exceptions::exception& e) { + std::cout << e.what() << "\n\n"; + std::cout << options.help() << std::endl; + return EXIT_FAILURE; + } + + if (result.count("help")) { + std::cout << options.help() << std::endl; + return EXIT_SUCCESS; + } + + std::string prompt = result["prompt"].as<std::string>(); + const std::string model_path = result["model"].as<std::string>(); + std::string device = result["device"].as<std::string>(); + size_t num_warmup = result["num_warmup"].as<size_t>(); + size_t num_iter = result["num_iter"].as<size_t>(); + + ov::genai::GenerationConfig config; + config.max_new_tokens = result["max_new_tokens"].as<size_t>(); + + ov::genai::LLMPipeline pipe(model_path, device); + + for (size_t i = 0; i < num_warmup; i++) + pipe.generate(prompt, config); + + ov::genai::DecodedResults res = pipe.generate(prompt, config); + ov::genai::PerfMetrics metrics = res.perf_metrics; + for (size_t i = 0; i < num_iter - 1; i++) { + res = pipe.generate(prompt, config); + metrics = metrics + res.perf_metrics; + } + + std::cout << std::fixed << std::setprecision(2); + std::cout << "Load time: " << metrics.get_load_time() << " ms" << std::endl; + std::cout << "Generate time: " << metrics.get_generate_duration().mean << " ± " << metrics.get_generate_duration().std << " ms" << std::endl; + std::cout << "Tokenization time: " << metrics.get_tokenization_duration().mean << " ± " << metrics.get_tokenization_duration().std << " ms" << std::endl; + std::cout << "Detokenization time: " << metrics.get_detokenization_duration().mean << " ± " << metrics.get_detokenization_duration().std << " ms" << std::endl; + std::cout << "TTFT: " << metrics.get_ttft().mean << " ± " << metrics.get_ttft().std << " ms" << std::endl; + std::cout << "TPOT: " << metrics.get_tpot().mean << " ± " << metrics.get_tpot().std << " ms/token " << std::endl; + std::cout << "Throughput: " << metrics.get_throughput().mean << " ± " << metrics.get_throughput().std << " tokens/s" << std::endl; + + return 0; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/chat_sample/CMakeLists.txt b/samples/cpp/chat_sample/CMakeLists.txt new file mode 100644 index 0000000000..69578dc86c --- /dev/null +++ b/samples/cpp/chat_sample/CMakeLists.txt @@ -0,0 +1,22 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH +) + +add_executable(chat_sample chat_sample.cpp) +target_link_libraries(chat_sample PRIVATE openvino::genai) +set_target_properties(chat_sample PROPERTIES + COMPILE_PDB_NAME chat_sample + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(chat_sample PRIVATE cxx_std_11) + +install(TARGETS chat_sample + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/chat_sample/README.md b/samples/cpp/chat_sample/README.md new file mode 100644 index 0000000000..3f736985c2 --- /dev/null +++ b/samples/cpp/chat_sample/README.md @@ -0,0 +1,44 @@ +# C++ chat_sample that supports most popular models like LLaMA 3 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run: + +`chat_sample TinyLlama-1.1B-Chat-v1.0` + + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. 
Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. + +#### Missing chat template + +If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. +The following template can be used as a default, but it may not work properly with every model: +``` +"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", +``` diff --git a/samples/cpp/chat_sample/chat_sample.cpp b/samples/cpp/chat_sample/chat_sample.cpp new file mode 100644 index 0000000000..827c08ae57 --- /dev/null +++ b/samples/cpp/chat_sample/chat_sample.cpp @@ -0,0 +1,43 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" + +int main(int argc, char* argv[]) try { + if (2 != argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR>"); + } + std::string prompt; + std::string model_path = argv[1]; + + std::string device = "CPU"; // GPU, NPU can be used as well + ov::genai::LLMPipeline pipe(model_path, device); + + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + std::function<bool(std::string)> streamer = [](std::string word) { + std::cout << word << std::flush; + // Return flag corresponds whether generation should be stopped. + // false means continue generation. + return false; + }; + + pipe.start_chat(); + std::cout << "question:\n"; + while (std::getline(std::cin, prompt)) { + pipe.generate(prompt, config, streamer); + std::cout << "\n----------\n" + "question:\n"; + } + pipe.finish_chat(); +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/continuous_batching_accuracy/CMakeLists.txt b/samples/cpp/continuous_batching_accuracy/CMakeLists.txt new file mode 100644 index 0000000000..26dc9bc7b8 --- /dev/null +++ b/samples/cpp/continuous_batching_accuracy/CMakeLists.txt @@ -0,0 +1,30 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# start of dependencies + +include(FetchContent) + +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +FetchContent_Declare(cxxopts + URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz + URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) +FetchContent_MakeAvailable(cxxopts) + +if(NOT TARGET nlohmann_json) + FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) + FetchContent_MakeAvailable(nlohmann_json) +endif() + +find_package(OpenVINO REQUIRED COMPONENTS Runtime) + +# end of dependencies + +set(TARGET_NAME continuous_batching_accuracy) +add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp) +target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai cxxopts::cxxopts) diff --git a/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp b/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp new file mode 100644 index 0000000000..280506333f --- /dev/null +++ b/samples/cpp/continuous_batching_accuracy/continuous_batching_accuracy.cpp @@ -0,0 +1,150 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <openvino/openvino.hpp> +#include <cxxopts.hpp> + +#include "openvino/genai/continuous_batching_pipeline.hpp" + +void print_generation_result(const ov::genai::GenerationResult& generation_result) { + for (size_t output_id = 0; output_id < generation_result.m_generation_ids.size(); ++output_id) { + std::cout << "Answer " << output_id << " (" << generation_result.m_scores[output_id] << ") : " << generation_result.m_generation_ids[output_id] << std::endl; + } +} + +int main(int argc, char* argv[]) try { + // Command line options + + cxxopts::Options options("accuracy_sample", "Help command"); + + options.add_options() + ("n,num_prompts", "A number of prompts", cxxopts::value<size_t>()->default_value("1")) + ("dynamic_split_fuse", "Whether to use dynamic split-fuse or vLLM scheduling", cxxopts::value<bool>()->default_value("false")) + ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value(".")) + ("d,device", "Target device to run the model", cxxopts::value<std::string>()->default_value("CPU")) + ("use_prefix", "Whether to use a prefix or not", cxxopts::value<bool>()->default_value("false")) + ("h,help", "Print usage"); + + cxxopts::ParseResult result; + try { + result = options.parse(argc, argv); + } catch (const cxxopts::exceptions::exception& e) { + std::cout << e.what() << "\n\n"; + std::cout << options.help() << std::endl; + return EXIT_FAILURE; + } + + if (result.count("help")) { + std::cout << options.help() << std::endl; + return EXIT_SUCCESS; + } + + const size_t num_prompts = result["num_prompts"].as<size_t>(); + const bool dynamic_split_fuse = result["dynamic_split_fuse"].as<bool>(); + const std::string models_path = result["model"].as<std::string>(); + const std::string device = 
result["device"].as<std::string>(); + const bool use_prefix = result["use_prefix"].as<bool>(); + + std::string prefix_str = + "You are an advanced language model designed to assist users by providing accurate, " + "relevant, and helpful information. Your responses should be accurate, concise, contextual, " + "respectful, and helpful. The request is: "; + + // create dataset + + std::vector<std::string> prompt_examples = { + "What is OpenVINO?", + "How are you?", + "What is your name?", + "Tell me something about Canada", + "What is OpenVINO?", + }; + + std::vector<ov::genai::GenerationConfig> sampling_params_examples { + ov::genai::beam_search(), + ov::genai::greedy(), + ov::genai::multinomial(), + }; + + std::vector<std::string> prompts(num_prompts); + std::vector<ov::genai::GenerationConfig> sampling_params(num_prompts); + + for (size_t request_id = 0; request_id < num_prompts; ++request_id) { + prompts[request_id] = use_prefix ? prefix_str + prompt_examples[request_id % prompt_examples.size()] + : prompt_examples[request_id % prompt_examples.size()]; + sampling_params[request_id] = sampling_params_examples[request_id % sampling_params_examples.size()]; + } + + // Perform the inference + auto get_default_block_size = [](const std::string& device) { + const size_t cpu_block_size = 32; + const size_t gpu_block_size = 16; + + bool is_gpu = device.find("GPU") != std::string::npos; + + return is_gpu ? gpu_block_size : cpu_block_size; + }; + + ov::genai::SchedulerConfig scheduler_config; + // batch size + scheduler_config.max_num_batched_tokens = use_prefix ? 256 : 32; + // cache params + scheduler_config.num_kv_blocks = 364; + scheduler_config.block_size = get_default_block_size(device); + // mode - vLLM or dynamic_split_fuse + scheduler_config.dynamic_split_fuse = dynamic_split_fuse; + // vLLM specific params + scheduler_config.max_num_seqs = 2; + scheduler_config.enable_prefix_caching = use_prefix; + + // It's possible to construct a Tokenizer from a different path. + // If the Tokenizer isn't specified, it's loaded from the same folder. + ov::genai::ContinuousBatchingPipeline pipe(models_path, ov::genai::Tokenizer{models_path}, scheduler_config, device); + + if (use_prefix) { + std::cout << "Running inference for prefix to compute the shared prompt's KV cache..." << std::endl; + std::vector<ov::genai::GenerationResult> generation_results = pipe.generate({prefix_str}, {ov::genai::greedy()}); + ov::genai::GenerationResult& generation_result = generation_results.front(); + OPENVINO_ASSERT(generation_result.m_status == ov::genai::GenerationStatus::FINISHED); + } + + std::vector<ov::genai::GenerationResult> generation_results = pipe.generate(prompts, sampling_params); + + for (size_t request_id = 0; request_id < generation_results.size(); ++request_id) { + const ov::genai::GenerationResult & generation_result = generation_results[request_id]; + std::cout << "Question: " << prompts[request_id] << std::endl; + switch (generation_result.m_status) + { + case ov::genai::GenerationStatus::FINISHED: + print_generation_result(generation_result); + break; + case ov::genai::GenerationStatus::IGNORED: + std::cout << "Request was ignored due to lack of memory." <<std::endl; + if (generation_result.m_generation_ids.size() > 0) { + std::cout << "Partial result:" << std::endl; + print_generation_result(generation_result); + } + break; + case ov::genai::GenerationStatus::DROPPED_BY_PIPELINE: + std::cout << "Request was aborted." 
<<std::endl; + if (generation_result.m_generation_ids.size() > 0) { + std::cout << "Partial result:" << std::endl; + print_generation_result(generation_result); + } + break; + default: + break; + } + std::cout << std::endl; + } +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/continuous_batching_benchmark/CMakeLists.txt b/samples/cpp/continuous_batching_benchmark/CMakeLists.txt new file mode 100644 index 0000000000..34a15f58d7 --- /dev/null +++ b/samples/cpp/continuous_batching_benchmark/CMakeLists.txt @@ -0,0 +1,31 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# start of dependencies + +include(FetchContent) + +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +FetchContent_Declare(cxxopts + URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz + URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08) +FetchContent_MakeAvailable(cxxopts) + +if(NOT TARGET nlohmann_json) + FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) + FetchContent_MakeAvailable(nlohmann_json) +endif() + +find_package(OpenVINO REQUIRED COMPONENTS Runtime) +find_package(Threads REQUIRED) + +# end of dependencies + +set(TARGET_NAME continuous_batching_benchmark) +add_executable(${TARGET_NAME} ${TARGET_NAME}.cpp) +target_link_libraries(${TARGET_NAME} PRIVATE openvino::genai nlohmann_json::nlohmann_json cxxopts::cxxopts Threads::Threads) diff --git a/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark.cpp b/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark.cpp new file mode 100644 index 0000000000..7c3e75eafa --- /dev/null +++ b/samples/cpp/continuous_batching_benchmark/continuous_batching_benchmark.cpp @@ -0,0 +1,552 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <fstream> +#include <cstdlib> +#include <chrono> +#include <ostream> +#include <random> +#include <stdexcept> +#include <thread> +#include <mutex> +#include <atomic> + +#include <nlohmann/json.hpp> +#include <cxxopts.hpp> + +#include "openvino/genai/cache_eviction.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_handle.hpp" + +namespace { + +class AutoStartTimer { + const decltype(std::chrono::steady_clock::now()) m_start; +public: + AutoStartTimer() : + m_start(std::chrono::steady_clock::now()) { + } + + double current_in_milli() const { + auto m_end = std::chrono::steady_clock::now(); + return std::chrono::duration<double, std::milli>(m_end - m_start).count(); + } +}; + +struct Dataset { + std::vector<std::string> m_prompts; + std::vector<ov::genai::GenerationConfig> m_sampling_params; + std::vector<size_t> m_input_lens, m_output_lens; + + size_t m_total_input_len = 0; + size_t m_total_output_len = 0; + + void reserve(const size_t size) { + m_prompts.reserve(size); + m_sampling_params.reserve(size); + m_input_lens.reserve(size); + m_output_lens.reserve(size); + } + + void push_data(std::string prompt, ov::genai::GenerationConfig sampling_params) { + 
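+        // Keep each prompt and its sampling parameters at the same index so that
+        // they can later be submitted to the pipeline together.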
m_prompts.push_back(prompt); + m_sampling_params.push_back(sampling_params); + } + + void push_lens(size_t input_len, size_t output_len) { + m_input_lens.push_back(input_len); + m_output_lens.push_back(output_len); + + m_total_input_len += input_len; + m_total_output_len += output_len; + } + + float get_average_input_len() const { + OPENVINO_ASSERT(!empty()); + return static_cast<float>(m_total_input_len / size()); + } + + float get_average_output_len() const { + OPENVINO_ASSERT(!empty()); + return static_cast<float>(m_total_output_len / size()); + } + + bool empty() const { + return size() == 0; + } + + size_t size() const { + return m_prompts.size(); + } +}; + +Dataset filtered_dataset(const std::string& models_path, const std::string& dataset_path, const size_t num_prompts, const size_t max_input_len, const size_t max_output_len) { + std::ifstream json_file(dataset_path.c_str()); + OPENVINO_ASSERT(json_file.is_open(), "Cannot open dataset file"); + + // from vLLM tput benchmark + const float dataset_size_coeff = 1.2f; + + nlohmann::json json_dataset = nlohmann::json::parse(json_file); + Dataset sampled_dataset, dataset; + const size_t num_prompt_candidates = static_cast<size_t>(num_prompts * dataset_size_coeff); + sampled_dataset.reserve(num_prompt_candidates); + dataset.reserve(num_prompt_candidates); + + ov::genai::Tokenizer tokenizer(models_path); + + for (auto json_data_iterator = json_dataset.begin(); json_data_iterator != json_dataset.end() && dataset.size() < num_prompt_candidates; ++json_data_iterator) { + auto & json_data = *json_data_iterator; + + // Filter out the conversations with less than 2 turns. + if (json_data["conversations"].size() < 2) + continue; + + // Only keep the first two turns of each conversation. + std::string human_question = json_data["conversations"][0]["value"]; + std::string gpt_answer = json_data["conversations"][1]["value"]; + + ov::Tensor _input_ids_prompt = tokenizer.encode(human_question).input_ids; + size_t input_len = _input_ids_prompt.get_size(); + + ov::Tensor _input_ids_answer = tokenizer.encode(gpt_answer).input_ids; + size_t output_len = _input_ids_answer.get_size(); + + // Prune too short sequences. + if (input_len < 4 || output_len < 4) + continue; + // Prune too long sequences. 
+ if (input_len > max_input_len || (input_len + output_len) > 2048) + continue; + + ov::genai::GenerationConfig greedy_search = ov::genai::greedy(); + greedy_search.max_new_tokens = std::min(max_output_len, output_len); + greedy_search.ignore_eos = true; + greedy_search.repetition_penalty = 1.0; + greedy_search.frequency_penalty = 0.0; + greedy_search.presence_penalty = 0.0; + greedy_search.diversity_penalty = 0.0; + greedy_search.length_penalty = 0.0; + + dataset.push_data(human_question, greedy_search); + dataset.push_lens(input_len, output_len); + } + + // sample dataset + srand(42); + + for (size_t selected_index = rand() % dataset.size(); sampled_dataset.size() < num_prompts; selected_index = rand() % dataset.size()) { + sampled_dataset.push_data(dataset.m_prompts[selected_index], dataset.m_sampling_params[selected_index]); + sampled_dataset.push_lens(dataset.m_input_lens[selected_index], dataset.m_output_lens[selected_index]); + } + + return sampled_dataset; +} + +class GenerationInfo { + + struct SequenceInfo { + std::chrono::milliseconds ttft; + std::chrono::milliseconds cumulated_tpot; + std::chrono::milliseconds mean_tpot; + size_t num_output_tokens; + + std::chrono::steady_clock::time_point start_time; + std::chrono::steady_clock::time_point last_read_time; + + SequenceInfo(std::chrono::steady_clock::time_point& start_time) { + num_output_tokens = 0; + ttft = std::chrono::milliseconds::zero(); + cumulated_tpot = std::chrono::milliseconds::zero(); + this->start_time = start_time; + } + + void update() { + std::chrono::steady_clock::time_point new_read_time = std::chrono::steady_clock::now(); + if (last_read_time.time_since_epoch() == std::chrono::milliseconds::zero()) { + ttft = std::chrono::duration_cast<std::chrono::milliseconds>(new_read_time - start_time); + } else { + cumulated_tpot += std::chrono::duration_cast<std::chrono::milliseconds>(new_read_time - last_read_time); + mean_tpot = cumulated_tpot / num_output_tokens; + + } + num_output_tokens++; + last_read_time = new_read_time; + } + }; + + struct GenerationMetrics { + std::chrono::milliseconds mean_ttft = std::chrono::milliseconds::zero(); + std::chrono::milliseconds mean_tpot = std::chrono::milliseconds::zero(); + size_t num_output_tokens = 0; + size_t num_input_tokens; + }; + + ov::genai::GenerationHandle generation_handle; + std::chrono::steady_clock::time_point start_time; + std::unordered_map<int64_t, SequenceInfo> sequences_info; + bool active = true; + size_t input_len; + +public: + GenerationInfo(ov::genai::GenerationHandle generation_handle, size_t input_len) : input_len(input_len) + { + this->generation_handle = std::move(generation_handle); + start_time = std::chrono::steady_clock::now(); + } + + void update_sequence(int64_t sequence_id) { + if (sequences_info.find(sequence_id) == sequences_info.end()) + sequences_info.emplace(sequence_id, SequenceInfo(start_time)); + sequences_info.at(sequence_id).update(); + } + + void update(ov::genai::GenerationOutputs& outputs){ + for (auto const& output: outputs) { + update_sequence(output.first); + } + } + + ov::genai::GenerationOutputs read() { + return generation_handle->read(); + } + + bool can_read() { + return generation_handle->can_read(); + } + + bool is_finished() { + return generation_handle->get_status() == ov::genai::GenerationStatus::FINISHED; + } + + void set_inactive() { + active = false; + } + + bool is_active() { + return active; + } + + GenerationMetrics get_metrics() { + GenerationMetrics generation_metrics; + if (!sequences_info.empty()) { + for 
(auto& sequenceInfoPair : sequences_info) { + generation_metrics.mean_ttft += sequenceInfoPair.second.ttft; + generation_metrics.mean_tpot += sequenceInfoPair.second.mean_tpot; + generation_metrics.num_output_tokens += sequenceInfoPair.second.num_output_tokens; + } + generation_metrics.mean_ttft /= sequences_info.size(); + generation_metrics.mean_tpot /= sequences_info.size(); + generation_metrics.num_input_tokens = input_len; + } + return generation_metrics; + } +}; + +class GenerationInfoCollector { + std::mutex mutex; + std::vector<GenerationInfo> generations_info; + size_t num_finished = 0; + std::chrono::steady_clock::time_point start_time; + +public: + + void set_start_time(std::chrono::steady_clock::time_point start_time) { + this->start_time = start_time; + } + + void add_generation(ov::genai::ContinuousBatchingPipeline* pipe, Dataset* dataset, size_t request_id) { + ov::genai::GenerationHandle generation_handle = pipe->add_request(request_id, dataset->m_prompts[request_id], dataset->m_sampling_params[request_id]); + std::lock_guard<std::mutex> lock(mutex); + generations_info.emplace_back(std::move(generation_handle), dataset->m_input_lens[request_id]); + } + + size_t run() { + std::lock_guard<std::mutex> lock(mutex); + for (GenerationInfo& generation_info : generations_info) { + if (!generation_info.is_active()) + continue; + + if (generation_info.is_finished()) { + num_finished++; + generation_info.set_inactive(); + } else if (generation_info.can_read()) { + auto outputs = generation_info.read(); + generation_info.update(outputs); + } + } + return num_finished; + } + + void print_statistics() { + std::chrono::seconds total_duration = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::steady_clock::now() - start_time); + std::chrono::milliseconds mean_ttft = std::chrono::milliseconds::zero(); + std::chrono::milliseconds mean_tpot = std::chrono::milliseconds::zero(); + size_t total_input_len = 0; + size_t total_output_len = 0; + + + for (GenerationInfo& generation_info : generations_info){ + auto generation_metrics = generation_info.get_metrics(); + mean_ttft += generation_metrics.mean_ttft; + mean_tpot += generation_metrics.mean_tpot; + total_input_len += generation_metrics.num_input_tokens; + total_output_len += generation_metrics.num_output_tokens; + } + mean_ttft /= generations_info.size(); + mean_tpot /= generations_info.size(); + std::cout << "Benchmark duration: " << total_duration.count() << " s" << std::endl; + std::cout << "Total number of input tokens: " << total_input_len << std::endl; + std::cout << "Total number of output tokens: " << total_output_len << std::endl; + std::cout << "Input throughput: " << total_input_len / total_duration.count() << " tokens / s" << std::endl; + std::cout << "Output throughput: " << total_output_len / total_duration.count() << " tokens / s" << std::endl; + std::cout << "Mean TTFT: " << mean_ttft.count() << " ms" << std::endl; + std::cout << "Mean TPOT: " << mean_tpot.count() << " ms" << std::endl; + } +}; + +void trafficSimulator(ov::genai::ContinuousBatchingPipeline* pipe, Dataset* dataset, std::string request_rate, GenerationInfoCollector* generation_info_collector) { + double numeric_request_rate; + std::random_device rd; + std::mt19937 gen(rd()); + std::exponential_distribution<> distribution; + + if (request_rate == "inf") { + numeric_request_rate = -1.0; + } else { + numeric_request_rate = std::stod(request_rate); + if (numeric_request_rate < 0) + throw std::invalid_argument("request_rate cannot be a negative number"); + 
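+        // Request arrivals are modelled as a Poisson process: the delay between
+        // consecutive requests is drawn from an exponential distribution with the given rate.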
+ distribution = std::exponential_distribution<>(numeric_request_rate); + } + + /* + std::cout << "Total input tokens: " << dataset->m_total_input_len << std::endl; + std::cout << "Total output tokens: " << dataset->m_total_output_len << std::endl; + std::cout << "Average input len: " << dataset->get_average_input_len() << " tokens" << std::endl; + std::cout << "Average output len: " << dataset->get_average_output_len() << " tokens" << std::endl; + */ + + std::cout << "Launching traffic simulator thread with request_rate: " << request_rate << std::endl; + generation_info_collector->set_start_time(std::chrono::steady_clock::now()); + for (size_t request_id = 0; request_id < dataset->size(); ++request_id) { + std::cout << "Traffic thread adding request to the queue..." << std::endl; + generation_info_collector->add_generation(pipe, dataset, request_id); + if (numeric_request_rate > 0) + std::this_thread::sleep_for(std::chrono::milliseconds(int(distribution(gen) * 1000))); + } + std::cout << "All requests sent, traffic simulation finished. Exiting thread." << std::endl; +} + +void llmEngineLoop(ov::genai::ContinuousBatchingPipeline* pipe, Dataset* dataset, std::atomic<bool>* finishThread) { + std::cout << "Launching LLM engine thread" << std::endl; + size_t num_finished = 0; + + while (!(*finishThread)) { + while (pipe->has_non_finished_requests()) { + pipe->step(); + } + } + std::cout << "All requests processed, LLM Engine loop escaped. Exiting thread." << std::endl; +} + +void statisticsReporter(GenerationInfoCollector* generations_info_collector, int num_prompts) { + int num_finished = 0; + while (num_finished < num_prompts) { + num_finished = generations_info_collector->run(); + } + std::cout << "Benchmark finished, summarizing statistics..." << std::endl; + generations_info_collector->print_statistics(); + + std::cout << "Exiting statistics reporter thread." << std::endl; +} + +bool parse_plugin_config_json(nlohmann::json& node, ov::AnyMap& device_config_map) { + if (!node.is_object()) { + std::cout << "Error: nlohmann json object is not an object." 
<< std::endl; + return false; + } + for (auto& element : node.items()) { + if (element.value().is_string()) { + device_config_map[std::string(element.key())] = element.value().get<std::string>(); + std::cout << "Setting plugin config: " << element.key() << " : " << element.value().get<std::string>() << std::endl; + } else if (element.value().is_number_integer()) { + device_config_map[std::string(element.key())] = element.value().get<std::int64_t>(); + std::cout << "Setting plugin config: " << element.key() << " : " << element.value().get<std::int64_t>() << std::endl; + } else if (element.value().is_number_float()) { + device_config_map[std::string(element.key())] = element.value().get<float>(); + std::cout << "Setting plugin config: " << element.key() << " : " << element.value().get<float>() << std::endl; + } else if (element.value().is_number_unsigned()) { + device_config_map[std::string(element.key())] = element.value().get<uint64_t>(); + std::cout << "Setting plugin config: " << element.key() << " : " << element.value().get<uint64_t>() << std::endl; + } else if (element.value().is_boolean()) { + device_config_map[std::string(element.key())] = element.value().get<bool>(); + std::cout << "Setting plugin config: " << element.key() << " : " << element.value().get<bool>() << std::endl; + } else { + std::cout << "Error: nlohmann json type not supported for: " << element.key() << std::endl; + return false; + } + } + + return true; +} + +bool parse_plugin_config_string(const std::string& config_string, ov::AnyMap& device_config_map) { + if (config_string.empty()) { + std::cout << "Empty plugin config string. " << std::endl; + return true; + } + + nlohmann::json node; + try { + node = nlohmann::json::parse(config_string); + } catch (const nlohmann::json::parse_error& e) { + std::cout << "ERROR: Plugin config json parser error - message: " << e.what() << '\n' + << "exception id: " << e.id << '\n' + << "byte position of error: " << e.byte << std::endl; + return false; + } catch (...) { + std::cout << "ERROR: Plugin config json parser error - unknown exception." << std::endl; + return false; + } + + if (node.is_null()) { + std::cout << "Error: nlohmann json object is null." << std::endl; + return false; + } + + return parse_plugin_config_json(node, device_config_map); +} + +} // namespace + +int main(int argc, char* argv[]) try { + // + // Command line options + // + + cxxopts::Options options("benchmark_sample", "Help command"); + + options.add_options() + ("n,num_prompts", "Number of prompts", cxxopts::value<size_t>()->default_value("1000")) + ("b,max_batch_size", "Maximum number of batched tokens", cxxopts::value<size_t>()->default_value("256")) + ("dynamic_split_fuse", "Whether to use dynamic split-fuse or vLLM scheduling", cxxopts::value<bool>()->default_value("true")) + ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value(".")) + ("dataset", "Path to dataset .json file", cxxopts::value<std::string>()->default_value("./ShareGPT_V3_unfiltered_cleaned_split.json")) + ("max_input_len", "Max input length taken from the dataset", cxxopts::value<size_t>()->default_value("1024")) + ("max_output_len", "Max output length", cxxopts::value<size_t>()->default_value("2048")) + ("request_rate", "Number of requests per second. If this is inf, then all the requests are sent at time 0. Otherwise, a Poisson process is used to synthesize the request arrival times.", cxxopts::value<std::string>()->default_value("inf")) + ("cache_size", "Size of memory used for KV cache in GB.
Default: 16", cxxopts::value<size_t>()->default_value("16")) + ("device", "Target device to run the model. Default: CPU", cxxopts::value<std::string>()->default_value("CPU")) + ("device_config", "Plugin configuration JSON. Example: '{\"MODEL_DISTRIBUTION_POLICY\":\"TENSOR_PARALLEL\",\"PERF_COUNT\":true}' Default: {\"PERF_COUNT\":true}", cxxopts::value<std::string>()->default_value("{\"PERF_COUNT\":true}")) + ("use_cache_eviction", "Whether to use cache eviction", cxxopts::value<bool>()->default_value("false")) + ("h,help", "Print usage"); + + cxxopts::ParseResult result; + try { + result = options.parse(argc, argv); + } catch (const cxxopts::exceptions::exception& e) { + std::cout << e.what() << "\n\n"; + std::cout << options.help() << std::endl; + return EXIT_FAILURE; + } + + if (result.count("help")) { + std::cout << options.help() << std::endl; + return EXIT_SUCCESS; + } + + const size_t num_prompts = result["num_prompts"].as<size_t>(); + const size_t max_batch_size = result["max_batch_size"].as<size_t>(); + const bool dynamic_split_fuse = result["dynamic_split_fuse"].as<bool>(); + const std::string models_path = result["model"].as<std::string>(); + const std::string dataset_path = result["dataset"].as<std::string>(); + const size_t max_input_len = result["max_input_len"].as<size_t>(); + const size_t max_output_len = result["max_output_len"].as<size_t>(); + const std::string request_rate = result["request_rate"].as<std::string>(); + const std::string device = result["device"].as<std::string>(); + const std::string device_config = result["device_config"].as<std::string>(); + const size_t cache_size = result["cache_size"].as<size_t>(); + const bool use_cache_eviction = result["use_cache_eviction"].as<bool>(); + + // Create requests for generation + Dataset dataset = filtered_dataset(models_path, dataset_path, num_prompts, max_input_len, max_output_len); + + auto get_default_block_size = [](const std::string& device) { + const size_t cpu_block_size = 32; + const size_t gpu_block_size = 16; + + bool is_gpu = device.find("GPU") != std::string::npos; + + return is_gpu ? gpu_block_size : cpu_block_size; + }; + + // Perform the first inference + ov::genai::SchedulerConfig scheduler_config; + scheduler_config.max_num_batched_tokens = max_batch_size, + scheduler_config.cache_size = cache_size, + scheduler_config.block_size = get_default_block_size(device), + scheduler_config.dynamic_split_fuse = dynamic_split_fuse, + scheduler_config.max_num_seqs = 256; // not used if dynamic_split_fuse=True + if (use_cache_eviction) { + scheduler_config.use_cache_eviction = true; + scheduler_config.cache_eviction_config = ov::genai::CacheEvictionConfig(32, 32, 128, ov::genai::AggregationMode::NORM_SUM); + } + + std::cout << "Benchmarking parameters: " << std::endl; + std::cout << "\tMax number of batched tokens: " << scheduler_config.max_num_batched_tokens << std::endl; + std::cout << "\tScheduling type: " << (scheduler_config.dynamic_split_fuse ? 
"dynamic split-fuse" : "vLLM") << std::endl; + if (!scheduler_config.dynamic_split_fuse) { + std::cout << "\tMax number of batched sequences: " << scheduler_config.max_num_seqs << std::endl; + } + std::cout << "Dataset parameters: " << std::endl; + std::cout << "\tNum prompts: " << num_prompts << std::endl; + std::cout << "\tMax input length: " << max_input_len << std::endl; + std::cout << "\tMax output length: " << max_output_len << std::endl; + std::cout << "\tTarget device: " << device << std::endl; + std::cout << "\tPlugin configuration JSON: " << device_config << std::endl; + + ov::AnyMap device_config_map = {}; + if (!parse_plugin_config_string(device_config, device_config_map)) { + std::cout << "ERROR: Wrong json parameter in device_config." << std::endl; + return EXIT_FAILURE; + } + + // Benchmarking + std::cout << "Loading models, creating pipelines, preparing environment..." << std::endl; + ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config, device, device_config_map); + + std::cout << "Setup finished, launching LLM executor, traffic simulation and statistics reporter threads" << std::endl; + + GenerationInfoCollector generation_info_collector; + + std::atomic<bool> finishGenerationThread{false}; + if (request_rate == "inf") { + std::thread trafficSimulatorThread(trafficSimulator, &pipe, &dataset, request_rate, &generation_info_collector); + trafficSimulatorThread.join(); + } + + std::thread lmmEngineThread(llmEngineLoop, &pipe, &dataset, &finishGenerationThread); + std::thread statisticsReporterThread(statisticsReporter, &generation_info_collector, num_prompts); + if (request_rate != "inf") { + std::thread trafficSimulatorThread(trafficSimulator, &pipe, &dataset, request_rate, &generation_info_collector); + trafficSimulatorThread.join(); + } + statisticsReporterThread.join(); + finishGenerationThread = true; + lmmEngineThread.join(); + + std::cout << "Benchmark finished" << std::endl; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/greedy_causal_lm/CMakeLists.txt b/samples/cpp/greedy_causal_lm/CMakeLists.txt new file mode 100644 index 0000000000..ff5151676f --- /dev/null +++ b/samples/cpp/greedy_causal_lm/CMakeLists.txt @@ -0,0 +1,22 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+ NO_CMAKE_FIND_ROOT_PATH +) + +add_executable(greedy_causal_lm greedy_causal_lm.cpp) +target_link_libraries(greedy_causal_lm PRIVATE openvino::genai) +set_target_properties(greedy_causal_lm PROPERTIES + COMPILE_PDB_NAME greedy_causal_lm + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(greedy_causal_lm PRIVATE cxx_std_11) + +install(TARGETS greedy_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/greedy_causal_lm/README.md b/samples/cpp/greedy_causal_lm/README.md new file mode 100644 index 0000000000..79852e0d10 --- /dev/null +++ b/samples/cpp/greedy_causal_lm/README.md @@ -0,0 +1,36 @@ +# Text generation C++ greedy_causal_lm that supports most popular models like LLaMA 3 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +`greedy_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` + + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when the sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`.
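As a rough illustration of the GPU switch suggested in the README above, a minimal sketch of the sample with the device string changed (the model directory, prompt, and the plain "GPU" device name are assumptions; any device string supported by the local OpenVINO installation can be used):

```cpp
#include <iostream>

#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Same pipeline as in greedy_causal_lm.cpp below, but targeting a GPU device.
    ov::genai::LLMPipeline pipe("TinyLlama-1.1B-Chat-v1.0", "GPU");  // "GPU" is assumed to be available

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;  // greedy decoding is used because do_sample is left unset

    std::cout << pipe.generate("Why is the Sun yellow?", config) << std::endl;
}
```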
diff --git a/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp b/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp new file mode 100644 index 0000000000..09e6af65e8 --- /dev/null +++ b/samples/cpp/greedy_causal_lm/greedy_causal_lm.cpp @@ -0,0 +1,29 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" + +int main(int argc, char* argv[]) try { + if (3 > argc) + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<PROMPT>\""); + + std::string model_path = argv[1]; + std::string prompt = argv[2]; + std::string device = "CPU"; // GPU can be used as well + + ov::genai::LLMPipeline pipe(model_path, device); + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + std::string result = pipe.generate(prompt, config); + std::cout << result << std::endl; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/lora_greedy_causal_lm/CMakeLists.txt b/samples/cpp/lora_greedy_causal_lm/CMakeLists.txt new file mode 100644 index 0000000000..1d3f6307c0 --- /dev/null +++ b/samples/cpp/lora_greedy_causal_lm/CMakeLists.txt @@ -0,0 +1,19 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH +) +add_executable(lora_greedy_causal_lm lora_greedy_causal_lm.cpp) +target_link_libraries(lora_greedy_causal_lm PRIVATE openvino::genai) +set_target_properties(lora_greedy_causal_lm PROPERTIES + COMPILE_PDB_NAME lora_greedy_causal_lm + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(lora_greedy_causal_lm PRIVATE cxx_std_11) +install(TARGETS lora_greedy_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/lora_greedy_causal_lm/lora_greedy_causal_lm.cpp b/samples/cpp/lora_greedy_causal_lm/lora_greedy_causal_lm.cpp new file mode 100644 index 0000000000..c8e6d182f4 --- /dev/null +++ b/samples/cpp/lora_greedy_causal_lm/lora_greedy_causal_lm.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" + +int main(int argc, char* argv[]) try { + if (4 > argc) + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> <ADAPTER_SAFETENSORS_FILE> \"<PROMPT>\""); + + std::string model_path = argv[1]; + std::string adapter_path = argv[2]; + std::string prompt = argv[3]; + std::string device = "CPU"; + + using namespace ov::genai; + + Adapter adapter(adapter_path); + LLMPipeline pipe(model_path, device, adapters(adapter)); // register all required adapters here + + std::cout << "Generate with LoRA adapter and alpha set to 0.75:" << std::endl; + std::cout << pipe.generate(prompt, max_new_tokens(100), adapters(adapter, 0.75)) << std::endl; + + std::cout << "\n-----------------------------"; + std::cout << "\nGenerate without LoRA adapter:" << std::endl; + std::cout << pipe.generate(prompt, max_new_tokens(100), adapters()) << std::endl; + +} catch (const std::exception& error) { + std::cerr << error.what() << '\n'; + 
return EXIT_FAILURE; +} catch (...) { + std::cerr << "Non-exception object thrown\n"; + return EXIT_FAILURE; +} diff --git a/samples/cpp/multinomial_causal_lm/CMakeLists.txt b/samples/cpp/multinomial_causal_lm/CMakeLists.txt new file mode 100644 index 0000000000..83b2335431 --- /dev/null +++ b/samples/cpp/multinomial_causal_lm/CMakeLists.txt @@ -0,0 +1,22 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH +) + +add_executable(multinomial_causal_lm multinomial_causal_lm.cpp) +target_link_libraries(multinomial_causal_lm PRIVATE openvino::genai) +set_target_properties(multinomial_causal_lm PROPERTIES + COMPILE_PDB_NAME multinomial_causal_lm + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(multinomial_causal_lm PRIVATE cxx_std_11) + +install(TARGETS multinomial_causal_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/multinomial_causal_lm/README.md b/samples/cpp/multinomial_causal_lm/README.md new file mode 100644 index 0000000000..21c9a07e77 --- /dev/null +++ b/samples/cpp/multinomial_causal_lm/README.md @@ -0,0 +1,36 @@ +# Text generation C++ multinomial_causal_lm that supports most popular models like LLaMA 3 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +`multinomial_causal_lm TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` + + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. 
Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp b/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp new file mode 100644 index 0000000000..1525cbc38a --- /dev/null +++ b/samples/cpp/multinomial_causal_lm/multinomial_causal_lm.cpp @@ -0,0 +1,40 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/llm_pipeline.hpp" + +int main(int argc, char* argv[]) try { + if (3 != argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT>'"); + } + + std::string model_path = argv[1]; + std::string prompt = argv[2]; + + std::string device = "CPU"; // GPU can be used as well + ov::genai::LLMPipeline pipe(model_path, device); + + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + config.do_sample = true; + config.top_p = 0.9; + config.top_k = 30; + auto streamer = [](std::string subword) { + std::cout << subword << std::flush; + return false; + }; + + // Since the streamer is set, the results will + // be printed each time a new token is generated. + pipe.generate(prompt, config, streamer); +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt new file mode 100644 index 0000000000..c899c6e47b --- /dev/null +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -0,0 +1,30 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading) + +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. 
+ NO_CMAKE_FIND_ROOT_PATH +) + +add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) +target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime openvino::threading) +set_target_properties(prompt_lookup_decoding_lm PROPERTIES + COMPILE_PDB_NAME prompt_lookup_decoding_lm + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(prompt_lookup_decoding_lm PRIVATE cxx_std_17) + +get_target_property(genai_imported openvino::genai IMPORTED_LOCATION) +set(OPENVINO_TOKENIZERS_PATH $<IF:$<BOOL:${genai_imported}>,${genai_imported},$<TARGET_FILE_DIR:openvino::genai>>) +set(OPENVINO_TOKENIZERS_FILENAME "${CMAKE_SHARED_LIBRARY_PREFIX}openvino_tokenizers${CMAKE_SHARED_LIBRARY_SUFFIX}") +target_compile_definitions(prompt_lookup_decoding_lm PRIVATE + OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}/${OPENVINO_TOKENIZERS_FILENAME}") + +install(TARGETS prompt_lookup_decoding_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/prompt_lookup_decoding_lm/README.md b/samples/cpp/prompt_lookup_decoding_lm/README.md new file mode 100644 index 0000000000..c5517c5bf6 --- /dev/null +++ b/samples/cpp/prompt_lookup_decoding_lm/README.md @@ -0,0 +1,39 @@ +# prompt_lookup_decoding_lm C++ sample that supports most popular models like LLaMA 3 + +[Prompt Lookup decoding](https://github.com/apoorvumang/prompt-lookup-decoding) is an [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) technique where the draft model is replaced with simple string matching over the prompt to generate candidate token sequences. This method is highly effective for input grounded generation (summarization, document QA, multi-turn chat, code editing), where there is high n-gram overlap between LLM input (prompt) and LLM output. This could be entity names, phrases, or code chunks that the LLM directly copies from the input while generating the output. Prompt lookup exploits this pattern to speed up autoregressive decoding in LLMs. This results in significant speedups with no effect on output quality. + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +source <INSTALL_DIR>/setupvars.sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +`prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "return 0;"` + + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM.
For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp new file mode 100644 index 0000000000..5e372a3f09 --- /dev/null +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -0,0 +1,345 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <openvino/core/parallel.hpp> +#include <openvino/openvino.hpp> +#include <string_view> + +namespace { + +// only batch_size = 1 currently supported +constexpr size_t BATCH_SIZE = 1; + +size_t get_seq_len_axis(std::shared_ptr<ov::Model> model) { + // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], + // threfore usually seq_length_axis = 2 + size_t seq_length_axis = 2; + + // "ReadValue" node is KV cache representation in stateful model + std::string kv_node_type_name = std::string(ov::op::v6::ReadValue::get_type_info_static().name); + + for (const auto op : model->get_ops()) { + if (op->get_type_name() != kv_node_type_name) { + continue; + } + + // Shape example: [-1,4,0,64] + auto shape = op->get_input_partial_shape(0); + + for (size_t i = 0; i < shape.rank().get_length(); i++) { + // Find axis = 0. This would be sequence length axis. + if (shape[i] == 0) { + seq_length_axis = i; + } + } + break; + } + + return seq_length_axis; +} + +std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { + tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); + tokenizer.infer(); + return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; +} + +std::string detokenize(ov::InferRequest& detokenizer, std::vector<int64_t>& tokens) { + detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {BATCH_SIZE, tokens.size()}, tokens.data()}); + detokenizer.infer(); + return detokenizer.get_output_tensor().data<std::string>()[0]; +} + +// The following reasons require TextStreamer to keep a cache of previous tokens: +// detokenizer removes starting ' '. 
For example detokenize(tokenize(" a")) == "a", +// but detokenize(tokenize("prefix a")) == "prefix a" +// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" +struct TextStreamer { + ov::InferRequest detokenizer; + std::vector<int64_t> token_cache; + size_t print_len = 0; + + void put(int64_t token) { + token_cache.push_back(token); + std::string text = detokenize(detokenizer, token_cache); + if (!text.empty() && '\n' == text.back() && text.size() > print_len) { + // Flush the cache after the new line symbol + std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; + token_cache.clear(); + print_len = 0; + return; + } + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { + // Don't print incomplete text + return; + } else if (text.size() > print_len) { + // It is possible to have a shorter text after adding new token. + // Print to output only if text lengh is increaeseds. + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); + } + } + + void end() { + std::string text = detokenize(detokenizer, token_cache); + if (text.size() <= print_len) + return; + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; + token_cache.clear(); + print_len = 0; + } +}; + +ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) { + // Copy elements from the old to a new tensor and return it. + // Trim kv tensor on sequence length axis + // key/values tensor shape example: [BATCH_SIZE, num_kv_heads, seq_len, head_size] + // Sequense length axis position may vary from one model to another + + auto shape = tensor.get_shape(); + + OPENVINO_ASSERT(seq_len_axis < shape.size(), + "Sequence length axis: ", + seq_len_axis, + " should be less than shape size: ", + shape.size()); + + size_t old_seq_len = shape[seq_len_axis]; + + OPENVINO_ASSERT(new_seq_len <= old_seq_len); + + // if new_seq_len equal to old one no need to copy tensor, return as is + if (old_seq_len == new_seq_len) + return tensor; + + shape[seq_len_axis] = new_seq_len; + + if (seq_len_axis == 0) { + tensor.set_shape(shape); + return tensor; + } + + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{shape}; + + auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end); + + return new_tensor; +} + +void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { + // trim kv_cache values up to the new_seq_len + auto states = request.query_state(); + ov::parallel_for(states.size(), [&](size_t i) { + ov::Tensor old_tensor = states.at(i).get_state(); + states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); + }); +} + +class PromptLookupCandidateGenerator { +private: + const size_t max_ngram_size = 3; + size_t num_pred_tokens = 5; + const size_t max_pred_tokens = 20; + +public: + PromptLookupCandidateGenerator(const size_t max_ngram_size, const size_t num_pred_tokens) + : max_ngram_size{max_ngram_size}, + num_pred_tokens{num_pred_tokens} {}; + + std::vector<int64_t> generate_candidates(const std::vector<int64_t>& input_ids) { + const size_t input_length = input_ids.size(); + + for (int32_t ngram_size = max_ngram_size; ngram_size > 0; ngram_size--) { + // extract last ngram_size tokens as search ngram + std::vector<int64_t> ngram = std::vector<int64_t>{input_ids.cend() - ngram_size, input_ids.cend()}; + + // find ngram match in input_ids + size_t ngram_i = 0; + 
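+                // Scan the prompt left to right; ngram_i counts how many consecutive tokens of the trailing n-gram have matched so far and is reset to 0 on a mismatch.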
for (size_t input_i = 0; input_i < input_length - ngram_size; input_i++) { + if (ngram[ngram_i] != input_ids[input_i]) { + ngram_i = 0; + continue; + } + + ngram_i++; + + if (ngram_i < ngram_size) { + continue; + } + + // match found with the end at input_i + size_t avaliable_num_pred = std::min(input_length - (input_i + 1), num_pred_tokens); + + // return candidates with length of avaliable_num_pred + return std::vector<int64_t>{input_ids.cbegin() + input_i + 1, + input_ids.cbegin() + input_i + 1 + avaliable_num_pred}; + } + } + + return std::vector<int64_t>{}; + } + + void update_candidate_strategy(const size_t num_matches) { + // dynamically adjust number of generated candidates based on number of matches + // we want to balance the benefits of getting assistant tokens correct with the + // cost of forecasting incorrect assistant tokens. + if (num_matches == num_pred_tokens) { + num_pred_tokens = std::min(num_pred_tokens + 2, max_pred_tokens); + } else { + num_pred_tokens = std::max(num_pred_tokens - 1, size_t(1)); + } + } +}; + +int64_t get_eos_token(const std::shared_ptr<ov::Model> tokenizer) { + auto rt_info = tokenizer->get_rt_info(); // Get the runtime info for the model + + auto it = rt_info.find("eos_token_id"); + if (it == rt_info.end()) { + throw std::runtime_error("EOS token ID not found in model's runtime information."); + } + return it->second.as<int64_t>(); +} + +} // namespace + +int main(int argc, char* argv[]) try { + if (argc != 3) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT>'"); + } + + // tokenizer model + ov::Core core; + core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + + const std::string model_dir = std::string{argv[1]}; + + auto tokenizer_model = core.read_model(model_dir + "/openvino_tokenizer.xml"); + // tokenizer and detokenizer work on CPU only + ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); + auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]); + + std::vector<int64_t> full_input_ids{input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size()}; + + ov::InferRequest detokenizer = + core.compile_model(model_dir + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + TextStreamer text_streamer{std::move(detokenizer)}; + + std::shared_ptr<ov::Model> ov_model = core.read_model(model_dir + "/openvino_model.xml"); + + size_t seq_len_axis = get_seq_len_axis(ov_model); + + ov::InferRequest model = core.compile_model(ov_model, "CPU").create_infer_request(); + + model.set_tensor("input_ids", input_ids); + model.set_tensor("attention_mask", attention_mask); + + ov::Tensor position_ids = model.get_tensor("position_ids"); + position_ids.set_shape(input_ids.get_shape()); + std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0); + size_t seq_len = input_ids.get_shape()[1]; + + // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 + model.get_tensor("beam_idx").set_shape({BATCH_SIZE}); + model.get_tensor("beam_idx").data<int32_t>()[0] = 0; + + // To collect kv-cache for the <PROMPT> and to get the next token run the very first infer request + model.infer(); + + // logits shape is [BATCH_SIZE, seq_len, vocab_size] + auto logits = model.get_tensor("logits"); + size_t vocab_size = logits.get_shape().back(); + auto data_logits = logits.data<float>() + (seq_len - 1) * vocab_size; + int64_t out_token = std::max_element(data_logits, 
data_logits + vocab_size) - data_logits; + + full_input_ids.push_back(out_token); + + auto first_token = out_token; + text_streamer.put(out_token); + + const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); + + // Prompt lookup decoding is a speculative decoding technic where the draft model replaced + // with string matching in the prompt to generate candidate token sequences. + int max_sequence_length = 100; + PromptLookupCandidateGenerator candidateGenerator{3, 5}; + + while (out_token != EOS_TOKEN && seq_len < max_sequence_length) { + auto candidates = candidateGenerator.generate_candidates(full_input_ids); + + // cut redundant candidates on last iteration + size_t tokens_to_generate = max_sequence_length - seq_len; + candidates.resize(std::min(candidates.size(), tokens_to_generate - 1)); + size_t candidates_size = candidates.size(); + + // candidates_size + 1 tokens will be fed at once in a single infer request. + input_ids.set_shape({BATCH_SIZE, candidates_size + 1}); + input_ids.data<int64_t>()[0] = first_token; + std::copy_n(candidates.begin(), candidates_size, input_ids.data<int64_t>() + 1); + + attention_mask.set_shape({BATCH_SIZE, seq_len + candidates_size + 1}); + std::fill_n(attention_mask.data<int64_t>(), attention_mask.get_size(), 1); + + position_ids.set_shape({BATCH_SIZE, candidates_size + 1}); + std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), seq_len); + + model.infer(); + + data_logits = logits.data<float>(); // [BATCH_SIZE, 1 + candidates_size, vocab_size] + + // 1. accept current out token (if not eos) + // 2. check if it matches apropriate candidate + // 2.1 if it's match, continue - accept next token + // 2.2 it it's mismatch, stop iteration but still accept current token as it was last token generated by + // model from a valid sequence. + size_t accepted_tokens_number = 0; + for (size_t i = 0; i < candidates_size + 1; i++) { + auto start = data_logits + vocab_size * i; + auto stop = data_logits + vocab_size * (i + 1); + out_token = std::max_element(start, stop) - start; + + if (out_token == EOS_TOKEN) { + break; + } + + text_streamer.put(out_token); + full_input_ids.push_back(out_token); + accepted_tokens_number++; + + if (i == candidates_size || out_token != candidates[i]) { + break; + } + } + + if (accepted_tokens_number > 0) { + candidateGenerator.update_candidate_strategy(accepted_tokens_number - 1); + } + + // After the inference request, key/values have shape [BATCH_SIZE, seq_len + candidates_size, vocab_size]. + // Increment the sequence length by the number of matched tokens, and + // trim the KV cache to match the new sequence length. + seq_len += accepted_tokens_number; + update_kv_cache(model, seq_len_axis, seq_len); + + first_token = out_token; + } + + text_streamer.end(); + // Model is stateful which means that context (kv-cache) which belongs to a particular + // text sequence is accumulated inside the model during the generation loop above. + // This context should be reset before processing the next text sequence. + // While it is not required to reset context in this sample as only one sequence is processed, + // it is called for education purposes: + model.reset_state(); +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt new file mode 100644 index 0000000000..078ac8bb52 --- /dev/null +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -0,0 +1,30 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading) + +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH +) + +add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) +target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime openvino::threading) +set_target_properties(speculative_decoding_lm PROPERTIES + COMPILE_PDB_NAME speculative_decoding_lm + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(speculative_decoding_lm PRIVATE cxx_std_17) + +get_target_property(genai_imported openvino::genai IMPORTED_LOCATION) +set(OPENVINO_TOKENIZERS_PATH $<IF:$<BOOL:${genai_imported}>,${genai_imported},$<TARGET_FILE_DIR:openvino::genai>>) +set(OPENVINO_TOKENIZERS_FILENAME "${CMAKE_SHARED_LIBRARY_PREFIX}openvino_tokenizers${CMAKE_SHARED_LIBRARY_SUFFIX}") +target_compile_definitions(speculative_decoding_lm PRIVATE + OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}/${OPENVINO_TOKENIZERS_FILENAME}") + +install(TARGETS speculative_decoding_lm + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/speculative_decoding_lm/README.md b/samples/cpp/speculative_decoding_lm/README.md new file mode 100644 index 0000000000..644ebd2c94 --- /dev/null +++ b/samples/cpp/speculative_decoding_lm/README.md @@ -0,0 +1,43 @@ +# speculative_decoding_lm C++ sample that supports most popular models like LLaMA 3 + +Speculative decoding (or [assisted-generation](https://huggingface.co/blog/assisted-generation#understanding-text-generation-latency) in HF terminology) is a recent technique that speeds up token generation when an additional, smaller draft model is used alongside the main model. + +Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these predictions and corrects them if necessary. We go through each predicted token, and if a difference is detected between the draft and main model, we stop and keep the last token predicted by the main model. Then the draft model gets the latest main prediction and again tries to predict the next K tokens, repeating the cycle. + +This approach reduces the need for multiple infer requests to the main model, enhancing performance. For instance, in more predictable parts of text generation, the draft model can, in best-case scenarios, generate the next K tokens that exactly match the target. In that case they are validated in a single inference request to the main model (which is bigger, more accurate but slower) instead of running K subsequent requests. More details can be found in the original papers https://arxiv.org/pdf/2211.17192.pdf, https://arxiv.org/pdf/2302.01318.pdf + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature.
The application doesn't have many configuration options to encourage the reader to explore and modify the source code. Loading `openvino_tokenizers` to `ov::Core` enables tokenization. Run `optimum-cli` to generate IRs for the samples. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +optimum-cli export openvino --trust-remote-code --model meta-llama/Llama-2-7b-chat-hf Llama-2-7b-chat-hf +``` + +## Run + +`speculative_decoding_lm TinyLlama-1.1B-Chat-v1.0 Llama-2-7b-chat-hf "Why is the Sun yellow?"` + + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp new file mode 100644 index 0000000000..f26cb6c7c4 --- /dev/null +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -0,0 +1,411 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <cmath> +#include <openvino/core/parallel.hpp> +#include <openvino/openvino.hpp> +#include <random> + +namespace { + +constexpr size_t BATCH_SIZE = 1; + +size_t get_seq_len_axis(std::shared_ptr<ov::Model> model) { + // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], + // threfore usually seq_length_axis = 2 + size_t seq_length_axis = 2; + + // "ReadValue" node is KV cache representation in stateful model + std::string kv_node_type_name = std::string(ov::op::v6::ReadValue::get_type_info_static().name); + + for (const auto op : model->get_ops()) { + if (op->get_type_name() != kv_node_type_name) { + continue; + } + + // Shape example: [-1,4,0,64] + auto shape = op->get_input_partial_shape(0); + + for (size_t i = 0; i < shape.rank().get_length(); i++) { + // Find axis = 0. 
This would be sequence length axis. + if (shape[i] == 0) { + seq_length_axis = i; + } + } + break; + } + + return seq_length_axis; +} + +std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { + tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); + tokenizer.infer(); + return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; +} + +std::string detokenize(ov::InferRequest& detokenizer, std::vector<int64_t>& tokens) { + detokenizer.set_input_tensor(ov::Tensor{ov::element::i64, {BATCH_SIZE, tokens.size()}, tokens.data()}); + detokenizer.infer(); + return detokenizer.get_output_tensor().data<std::string>()[0]; +} + +// The following reasons require TextStreamer to keep a cache of previous tokens: +// detokenizer removes starting ' '. For example detokenize(tokenize(" a")) == "a", +// but detokenize(tokenize("prefix a")) == "prefix a" +// 1 printable token may consist of 2 token ids: detokenize(incomplete_token_idx) == "�" +struct TextStreamer { + ov::InferRequest detokenizer; + std::vector<int64_t> token_cache; + size_t print_len = 0; + + void put(int64_t token) { + token_cache.push_back(token); + std::string text = detokenize(detokenizer, token_cache); + if (!text.empty() && '\n' == text.back() && text.size() > print_len) { + // Flush the cache after the new line symbol + std::cout << std::string_view{text.data() + print_len, text.size() - print_len}; + token_cache.clear(); + print_len = 0; + return; + } + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { + // Don't print incomplete text + return; + } else if (text.size() > print_len) { + // It is possible to have a shorter text after adding new token. + // Print to output only if text lengh is increaesed. + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); + } + } + + void end() { + std::string text = detokenize(detokenizer, token_cache); + if (text.size() <= print_len) + return; + std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; + token_cache.clear(); + print_len = 0; + } +}; + +ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_seq_len) { + // Copy elements from the old to a new tensor and return it. 
+ // Trim kv tensor on sequence length axis + // key/values tensor shape example: [BATCH_SIZE, num_kv_heads, seq_len, head_size] + // Sequense length axis position may vary from one model to another + + auto shape = tensor.get_shape(); + + OPENVINO_ASSERT(seq_len_axis < shape.size(), + "Sequence length axis: ", + seq_len_axis, + " should be less than shape size: ", + shape.size()); + + size_t old_seq_len = shape[seq_len_axis]; + + OPENVINO_ASSERT(new_seq_len <= old_seq_len); + + // if new_seq_len equal to old one no need to copy tensor, return as is + if (old_seq_len == new_seq_len) + return tensor; + + shape[seq_len_axis] = new_seq_len; + + if (seq_len_axis == 0) { + tensor.set_shape(shape); + return tensor; + } + + ov::Coordinate new_shape_begin{0, 0, 0, 0}; + ov::Coordinate new_shape_end{shape}; + + auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end); + + return new_tensor; +} + +void update_kv_cache(ov::InferRequest request, uint64_t seq_len_axis, uint64_t new_seq_len) { + // trim kv_cache values up to the new_seq_len + auto states = request.query_state(); + ov::parallel_for(states.size(), [&](size_t i) { + ov::Tensor old_tensor = states.at(i).get_state(); + states.at(i).set_state(trimm_tensor(old_tensor, seq_len_axis, new_seq_len)); + }); +} + +class AssistedCandidateGenerator { +private: + ov::InferRequest draft_model; + size_t max_seq_length; + size_t num_pred_tokens = 5; + size_t seq_len_axis; + const size_t max_pred_tokens = 10; + int64_t out_of_kv_cache_token = -1; + size_t draft_model_seq_length = 0; + +public: + AssistedCandidateGenerator(ov::InferRequest draft_model, + const size_t max_seq_length, + const size_t num_pred_tokens, + const size_t seq_len_axis) + : draft_model{draft_model}, + max_seq_length{max_seq_length}, + num_pred_tokens{num_pred_tokens}, + seq_len_axis{seq_len_axis} {}; + + int64_t generate_next_token(const std::vector<int64_t> tokens) { + size_t tokens_size = tokens.size(); + auto input_ids = draft_model.get_tensor("input_ids"); + input_ids.set_shape({BATCH_SIZE, tokens_size}); + std::copy_n(tokens.begin(), tokens_size, input_ids.data<int64_t>()); + + auto attention_mask = draft_model.get_tensor("attention_mask"); + attention_mask.set_shape({BATCH_SIZE, draft_model_seq_length + tokens_size}); + std::fill_n(attention_mask.data<int64_t>(), attention_mask.get_size(), 1); + + auto position_ids = draft_model.get_tensor("position_ids"); + position_ids.set_shape({BATCH_SIZE, tokens_size}); + std::iota(position_ids.data<int64_t>(), + position_ids.data<int64_t>() + position_ids.get_size(), + draft_model_seq_length); + + draft_model.get_tensor("beam_idx").set_shape({BATCH_SIZE}); + draft_model.get_tensor("beam_idx").data<int32_t>()[0] = 0; + + draft_model.infer(); + + auto logits = draft_model.get_tensor("logits"); + size_t vocab_size = logits.get_shape().back(); + auto sequence_logits = logits.data<float>() + (tokens_size - 1) * vocab_size; + + draft_model_seq_length += tokens_size; + + return std::max_element(sequence_logits, sequence_logits + vocab_size) - sequence_logits; + } + + std::vector<int64_t> generate_candidates(int64_t out_token) { + std::vector<int64_t> candidates; + + // limit candidates size by num_pred_tokens or by max_seq_length + size_t candidates_to_generate = std::min(num_pred_tokens, max_seq_length - draft_model_seq_length - 1); + + candidates.reserve(candidates_to_generate); + + // generate cadidates + for (size_t i = 0; i < candidates_to_generate; i++) { + // if out_of_kv_cache_token is present, prepend it to out_token in order 
to collect kv cache for it + if (out_of_kv_cache_token != -1) { + out_token = generate_next_token(std::vector{out_of_kv_cache_token, out_token}); + out_of_kv_cache_token = -1; + } else { + out_token = generate_next_token(std::vector{out_token}); + } + + candidates.push_back(out_token); + } + + out_of_kv_cache_token = candidates.back(); + return candidates; + } + + void update_candidate_strategy(const size_t num_matches) { + // dynamically adjust number of generated candidates based on number of matches + // we want to balance the benefits of getting candidates tokens correct with the + // cost of forecasting incorrect candidates tokens. + if (num_matches == num_pred_tokens) { + num_pred_tokens = std::min(num_pred_tokens + 2, max_pred_tokens); + } else { + num_pred_tokens = std::max(int64_t(num_pred_tokens) - 1, int64_t(1)); + } + } + + void update_kv_cache(const size_t seq_length) { + // this is the case when main model accepted all candidates from draft model + // we need to collect kv cache for out_of_kv_cache_token by infering it + // on next candidates generation cycle out_of_kv_cache_token will be prefixed + // to main models's latest out token + if (draft_model_seq_length < seq_length) { + return; + } + + out_of_kv_cache_token = -1; + ::update_kv_cache(draft_model, seq_len_axis, seq_length); + draft_model_seq_length = seq_length; + } +}; + +int64_t get_eos_token(const std::shared_ptr<ov::Model> tokenizer) { + auto rt_info = tokenizer->get_rt_info(); // Get the runtime info for the model + + auto it = rt_info.find("eos_token_id"); + if (it == rt_info.end()) { + throw std::runtime_error("EOS token ID not found in model's runtime information."); + } + return it->second.as<int64_t>(); +} + +} // namespace + +int main(int argc, char* argv[]) try { + if (argc != 4) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <DRAFT MODEL_DIR> <MAIN MODEL_DIR> '<PROMPT>'"); + } + + // tokenizer model + ov::Core core; + core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); + // tokenizer and detokenizer work on CPU only + ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); + auto [input_ids, attention_mask] = tokenize(tokenizer, argv[3]); + ov::InferRequest detokenizer = + core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + TextStreamer text_streamer{std::move(detokenizer)}; + + // draft model (which is smaller, less accurate but faster) + std::shared_ptr<ov::Model> ov_draft_model = core.read_model(std::string{argv[1]} + "/openvino_model.xml"); + + size_t draft_model_seq_len_axis = get_seq_len_axis(ov_draft_model); + + ov::InferRequest draft_model = core.compile_model(ov_draft_model, "CPU").create_infer_request(); + + size_t seq_len = input_ids.get_shape()[1]; + + // main model (which is bigger, more accurate but slower) + std::shared_ptr<ov::Model> ov_main_model = core.read_model(std::string{argv[2]} + "/openvino_model.xml"); + + size_t main_model_seq_len_axis = get_seq_len_axis(ov_main_model); + + ov::InferRequest main_model = core.compile_model(ov_main_model, "CPU").create_infer_request(); + + size_t max_sequence_length = 100; + + AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5, draft_model_seq_len_axis}; + + main_model.set_tensor("input_ids", input_ids); + main_model.set_tensor("attention_mask", attention_mask); + + 
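+    // Prompt token positions are simply 0..seq_len-1, filled via std::iota below.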
auto position_ids = main_model.get_tensor("position_ids"); + position_ids.set_shape(input_ids.get_shape()); + std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0); + + // set beam_idx for stateful model: no beam search is used and BATCH_SIZE = 1 + main_model.get_tensor("beam_idx").set_shape({BATCH_SIZE}); + main_model.get_tensor("beam_idx").data<int32_t>()[0] = 0; + + // To collect the kv-cache for the <PROMPT> and to get the next token, run the very first infer request + candidateGenerator.generate_next_token( + std::vector<int64_t>(input_ids.data<int64_t>(), input_ids.data<int64_t>() + input_ids.get_size())); + + main_model.infer(); + + size_t vocab_size = draft_model.get_tensor("logits").get_shape().back(); + OPENVINO_ASSERT(vocab_size == main_model.get_tensor("logits").get_shape().back(), + "vocab size should be the same for both models"); + + // logits shape is [BATCH_SIZE, seq_len, vocab_size] + auto logits = main_model.get_tensor("logits"); + auto data_logits = logits.data<float>() + (seq_len - 1) * vocab_size; + int64_t out_token = std::max_element(data_logits, data_logits + vocab_size) - data_logits; + + text_streamer.put(out_token); + + const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); + + /* Speculative decoding works the following way. The draft model predicts the next K + tokens one by one in an autoregressive manner, while the main model validates these + predictions and corrects them if necessary. We go through each predicted token, and + if a difference is detected between the draft and main model, we stop and keep the + last token predicted by the main model. Then the draft model gets the latest main + prediction and again tries to predict the next K tokens, repeating the cycle. + + This approach reduces the need for multiple infer requests to the main model, + enhancing performance. For instance, in more predictable parts of text generation, + the draft model can, in best-case scenarios, generate the next K tokens that exactly + match the target. In that case they are validated in a single inference call to + the main model instead of running K subsequent requests. + */ + + while (out_token != EOS_TOKEN && seq_len < max_sequence_length) { + // generate candidates from the draft model + std::vector<int64_t> candidates = candidateGenerator.generate_candidates(out_token); + size_t candidates_size = candidates.size(); + + // For the main network, candidates_size + 1 tokens will be fed at once in a single infer request. + input_ids.set_shape({BATCH_SIZE, candidates_size + 1}); + + input_ids.data<int64_t>()[0] = out_token; + if (candidates_size > 0) { + std::copy_n(candidates.begin(), candidates_size, input_ids.data<int64_t>() + 1); + } + + attention_mask.set_shape({BATCH_SIZE, seq_len + candidates_size + 1}); + std::fill_n(attention_mask.data<int64_t>(), attention_mask.get_size(), 1); + + position_ids.set_shape({BATCH_SIZE, candidates_size + 1}); + std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), seq_len); + + main_model.infer(); + + data_logits = logits.data<float>(); // [BATCH_SIZE, K, vocab_size] + + // match model tokens with candidate tokens + // 1. accept current out token (if not eos) + // 2. check if it matches the appropriate candidate + // 2.1 if it's a match, continue - accept the next token + // 2.2 if it's a mismatch, stop iteration but still accept the current token as it was the last token generated by + // the model from a valid sequence. 
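 + // Worked example (illustrative only; the token IDs below are made up): suppose candidates = {5, 8, 3} and the + // argmax over the four logit rows of the main model yields 5, 8, 7, ... Tokens 5 and 8 match their candidates + // and are accepted; 7 differs from candidate 3, so it is still accepted (it is the main model's own prediction), + // but the loop stops there. accepted_tokens_number becomes 3, seq_len grows by 3, and the draft and main + // kv-caches are trimmed below to the new seq_len, discarding entries for the rejected candidate. 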
+ size_t accepted_tokens_number = 0; + for (size_t i = 0; i < candidates_size + 1; i++) { + auto start = data_logits + vocab_size * i; + auto stop = data_logits + vocab_size * (i + 1); + out_token = std::max_element(start, stop) - start; + + if (out_token == EOS_TOKEN) { + break; + } + + text_streamer.put(out_token); + accepted_tokens_number++; + + if (i == candidates_size || out_token != candidates[i]) { + break; + } + } + + // After the inference request, the KV cache covers seq_len + candidates_size + 1 positions. + // Increment the sequence length by the number of accepted tokens, and + // trim the KV cache to match the new sequence length. + seq_len += accepted_tokens_number; + + if (accepted_tokens_number > 0) { + candidateGenerator.update_candidate_strategy(accepted_tokens_number - 1); + } + + candidateGenerator.update_kv_cache(seq_len); + update_kv_cache(main_model, main_model_seq_len_axis, seq_len); + + candidates.clear(); + } + text_streamer.end(); + // The model is stateful, which means that the context (kv-cache) belonging to a particular + // text sequence is accumulated inside the model during the generation loop above. + // This context should be reset before processing the next text sequence. + // While it is not required to reset the context in this sample, as only one sequence is processed, + // it is called for educational purposes: + draft_model.reset_state(); + main_model.reset_state(); +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/text2image/512x512.bmp b/samples/cpp/text2image/512x512.bmp new file mode 100644 index 0000000000..b89aadec5f --- /dev/null +++ b/samples/cpp/text2image/512x512.bmp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73fc1a2b80048752350d108852f3598395666b9208d5e0ab34c0613cea9cfd04 +size 786486 diff --git a/samples/cpp/text2image/CMakeLists.txt b/samples/cpp/text2image/CMakeLists.txt new file mode 100644 index 0000000000..ca0f832f6d --- /dev/null +++ b/samples/cpp/text2image/CMakeLists.txt @@ -0,0 +1,47 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO. 
+ NO_CMAKE_FIND_ROOT_PATH +) + +# create main sample executable + +add_executable(stable_diffusion + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/imwrite.cpp) + +target_include_directories(stable_diffusion PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(stable_diffusion PRIVATE openvino::genai) + +set_target_properties(stable_diffusion PROPERTIES + COMPILE_PDB_NAME stable_diffusion + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS stable_diffusion + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) + +# create LoRA sample executable + +add_executable(lora_stable_diffusion + ${CMAKE_CURRENT_SOURCE_DIR}/lora.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/imwrite.cpp) + +target_include_directories(lora_stable_diffusion PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(lora_stable_diffusion PRIVATE openvino::genai) + +set_target_properties(lora_stable_diffusion PROPERTIES + COMPILE_PDB_NAME lora_stable_diffusion + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS lora_stable_diffusion + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/text2image/README.md b/samples/cpp/text2image/README.md new file mode 100644 index 0000000000..16b1aff53c --- /dev/null +++ b/samples/cpp/text2image/README.md @@ -0,0 +1,67 @@ +# Text to Image C++ Generation Pipeline + +Examples in this folder showcase inference of text to image models like Stable Diffusion 1.5, 2.1, LCM. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::Text2ImagePipeline` and uses a text prompt as input source. + +There are two sample files: + - [`main.cpp`](./main.cpp) demonstrates basic usage of the text to image pipeline + - [`lora.cpp`](./lora.cpp) shows how to apply LoRA adapters to the pipeline + +Users can change the sample code and play with the following generation parameters: + +- Change width or height of generated image +- Generate multiple images per prompt +- Adjust a number of inference steps +- Play with [guidance scale](https://huggingface.co/spaces/stabilityai/stable-diffusion/discussions/9) (read [more details](https://arxiv.org/abs/2207.12598)) +- (SD 1.x, 2.x only) Add negative prompt when guidance scale > 1 +- Apply multiple different LoRA adapters and mix them with different blending coefficients + +## Download and convert the models and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --model dreamlike-art/dreamlike-anime-1.0 --task stable-diffusion --weight-format fp16 dreamlike_anime_1_0_ov/FP16 +``` + +## Run + +`stable_diffusion ./dreamlike_anime_1_0_ov/FP16 'cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting'` + +### Examples + +Prompt: `cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting` + +  + + +## Run with optional LoRA adapters + +LoRA adapters can be connected to the pipeline and modify generated images to have certain style, details or quality. 
Adapters are supported in Safetensors format and can be downloaded from public sources like [Civitai](https://civitai.com) or [HuggingFace](https://huggingface.co/models), or trained by the user. Only adapters compatible with the base model should be used. A weighted blend of multiple adapters can be applied by specifying multiple adapter files with corresponding alpha parameters on the command line. Check the `lora.cpp` source code to learn how to enable adapters and specify them in each `generate` call. + +Here is an example of how to run the sample with a single adapter. First, download the adapter file from the https://civitai.com/models/67927/soulcard page manually and save it as `soulcard.safetensors`, or download it from the command line: + +`wget -O soulcard.safetensors https://civitai.com/api/download/models/72591` + +Then run the `lora_stable_diffusion` executable: + +`./lora_stable_diffusion dreamlike_anime_1_0_ov/FP16 'curly-haired unicorn in the forest, anime, line' soulcard.safetensors 0.7` + +The sample generates two images, with and without adapters applied, using the same prompt: + - `lora.bmp` with adapters applied + - `baseline.bmp` without adapters applied + +Check the difference: + +With adapter | Without adapter +:---:|:---: + |  + + +## Note + +- The image generated with HuggingFace / Optimum Intel is not the same as the one generated by this C++ sample: + +C++ random generation with MT19937 results differ from `numpy.random.randn()` and `diffusers.utils.randn_tensor`. So, it is expected that the Python and C++ versions produce different images, because the latent images are initialized differently. Users can implement their own random generator derived from `ov::genai::Generator` and pass it to the `Text2ImagePipeline::generate` method. diff --git a/samples/cpp/text2image/baseline.bmp b/samples/cpp/text2image/baseline.bmp new file mode 100644 index 0000000000..c8a60782ba --- /dev/null +++ b/samples/cpp/text2image/baseline.bmp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb8491607e8c2cce4394ac0b796350745dde04dba7d754c3fad24d86e1c4d2e1 +size 1376310 diff --git a/image_generation/common/imwrite/src/imwrite.cpp b/samples/cpp/text2image/imwrite.cpp similarity index 65% rename from image_generation/common/imwrite/src/imwrite.cpp rename to samples/cpp/text2image/imwrite.cpp index e7894790f8..b25db03051 100644 --- a/image_generation/common/imwrite/src/imwrite.cpp +++ b/samples/cpp/text2image/imwrite.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include <fstream> @@ -30,60 +30,59 @@ unsigned char file[14] = { }; unsigned char info[40] = { - 40, - 0, - 0, - 0, // info hd size - 0, - 0, - 0, - 0, // width - 0, - 0, - 0, - 0, // height - 1, - 0, // number color planes - 24, - 0, // bits per pixel - 0, - 0, - 0, - 0, // compression is none - 0, - 0, - 0, - 0, // image bits size - 0x13, - 0x0B, - 0, - 0, // horz resolution in pixel / m - 0x13, - 0x0B, - 0, - 0, // vert resolution (0x03C3 = 96 dpi, 0x0B13 = 72 - // dpi) - 0, - 0, - 0, - 0, // #colors in palette - 0, - 0, - 0, - 0, // #important colors - }; - -} - -void imwrite(const std::string& name, ov::Tensor image, bool convert_bgr2rgb) { - std::ofstream output_file(name, std::ofstream::binary); - OPENVINO_ASSERT(output_file.is_open(), "Failed to open the output BMP image path"); + 40, + 0, + 0, + 0, // info hd size + 0, + 0, + 0, + 0, // width + 0, + 0, + 0, + 0, // height + 1, + 0, // number color planes + 24, + 0, // bits per pixel + 0, + 0, 
+ 0, + 0, // compression is none + 0, + 0, + 0, + 0, // image bits size + 0x13, + 0x0B, + 0, + 0, // horz resolution in pixel / m + 0x13, + 0x0B, + 0, + 0, // vert resolution (0x03C3 = 96 dpi, 0x0B13 = 72 + // dpi) + 0, + 0, + 0, + 0, // #colors in palette + 0, + 0, + 0, + 0, // #important colors +}; +void imwrite_single_image(const std::string& name, ov::Tensor image, bool convert_bgr2rgb) { const ov::Shape shape = image.get_shape(); const size_t width = shape[2], height = shape[1], channels = shape[3]; OPENVINO_ASSERT(image.get_element_type() == ov::element::u8 && shape.size() == 4 && shape[0] == 1 && channels == 3, - "Image of u8 type and [1, H, W, 3] shape is expected"); + "Image of u8 type and [1, H, W, 3] shape is expected.", + "Given image has shape ", shape, " and element type ", image.get_element_type()); + + std::ofstream output_file(name, std::ofstream::binary); + OPENVINO_ASSERT(output_file.is_open(), "Failed to open the output BMP image path"); int padSize = static_cast<int>(4 - (width * channels) % 4) % 4; int sizeData = static_cast<int>(width * height * channels + height * padSize); @@ -131,3 +130,19 @@ void imwrite(const std::string& name, ov::Tensor image, bool convert_bgr2rgb) { output_file.write(reinterpret_cast<const char*>(pad), padSize); } } + +} // namespace + + +void imwrite(const std::string& name, ov::Tensor images, bool convert_bgr2rgb) { + const ov::Shape shape = images.get_shape(), img_shape = {1, shape[1], shape[2], shape[3]}; + uint8_t* img_data = images.data<uint8_t>(); + + for (int img_num = 0, num_images = shape[0], img_size = ov::shape_size(img_shape); img_num < num_images; ++img_num, img_data += img_size) { + char img_name[25]; + sprintf(img_name, name.c_str(), img_num); + + ov::Tensor image(images.get_element_type(), img_shape, img_data); + imwrite_single_image(img_name, image, true); + } +} diff --git a/samples/cpp/text2image/imwrite.hpp b/samples/cpp/text2image/imwrite.hpp new file mode 100644 index 0000000000..9b8752fb07 --- /dev/null +++ b/samples/cpp/text2image/imwrite.hpp @@ -0,0 +1,16 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <string> + +#include "openvino/runtime/tensor.hpp" + +/** + * @brief Writes mutiple images (depending on `image` tensor batch size) to BPM file(s) + * @param name File name or pattern to use to write images + * @param image Image(s) tensor + * @param convert_bgr2rgb Convert BGR to RGB + */ +void imwrite(const std::string& name, ov::Tensor images, bool convert_bgr2rgb); diff --git a/samples/cpp/text2image/lora.bmp b/samples/cpp/text2image/lora.bmp new file mode 100644 index 0000000000..41bde31e7b --- /dev/null +++ b/samples/cpp/text2image/lora.bmp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72760b8ae70a02cf318cfb9a08d520bd4800abb22b5eafe57eafb3cfbed7303d +size 1376310 diff --git a/samples/cpp/text2image/lora.cpp b/samples/cpp/text2image/lora.cpp new file mode 100644 index 0000000000..0db7b55fe9 --- /dev/null +++ b/samples/cpp/text2image/lora.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/pipeline.hpp" + +#include "imwrite.hpp" + +int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>' [<LORA_SAFETENSORS> <ALPHA> ...]]"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used 
as well + + ov::genai::AdapterConfig adapter_config; + // Multiple LoRA adapters applied simultaneously are supported; parse them all, along with the corresponding alphas, from the command-line parameters: + for(size_t i = 0; i < (argc - 3)/2; ++i) { + ov::genai::Adapter adapter(argv[3 + 2*i]); + float alpha = std::atof(argv[3 + 2*i + 1]); + adapter_config.add(adapter, alpha); + } + + // LoRA adapters passed to the constructor will be active by default in subsequent generate() calls + ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config)); + + std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n"; + ov::Tensor image = pipe.generate(prompt, + ov::genai::random_generator(std::make_shared<ov::genai::CppStdGenerator>(42)), + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20)); + imwrite("lora.bmp", image, true); + + std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n"; + image = pipe.generate(prompt, + ov::genai::adapters(), // passing adapters in generate overrides adapters set in the constructor; adapters() means no adapters + ov::genai::random_generator(std::make_shared<ov::genai::CppStdGenerator>(42)), + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20)); + imwrite("baseline.bmp", image, true); + + return EXIT_SUCCESS; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/text2image/main.cpp b/samples/cpp/text2image/main.cpp new file mode 100644 index 0000000000..1cef148796 --- /dev/null +++ b/samples/cpp/text2image/main.cpp @@ -0,0 +1,35 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/pipeline.hpp" + +#include "imwrite.hpp" + +int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " <MODEL_DIR> '<PROMPT>'"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well + + ov::genai::Text2ImagePipeline pipe(models_path, device); + ov::Tensor image = pipe.generate(prompt, + ov::genai::width(512), + ov::genai::height(512), + ov::genai::num_inference_steps(20), + ov::genai::num_images_per_prompt(1)); + + // writes `num_images_per_prompt` images by pattern name + imwrite("image_%d.bmp", image, true); + + return EXIT_SUCCESS; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt new file mode 100644 index 0000000000..9a1b21632f --- /dev/null +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alongside OpenVINO. 
+ NO_CMAKE_FIND_ROOT_PATH +) + +file(DOWNLOAD + https://raw.githubusercontent.com/nothings/stb/f75e8d1cad7d90d72ef7a4661f1b994ef78b4e31/stb_image.h + ${CMAKE_BINARY_DIR}/stb_image.h + EXPECTED_HASH MD5=27932e6fb3a2f26aee2fc33f2cb4e696) + +add_executable(visual_language_chat visual_language_chat.cpp load_image.cpp) +target_include_directories(visual_language_chat PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}") +target_link_libraries(visual_language_chat PRIVATE openvino::genai) + +set_target_properties(visual_language_chat PROPERTIES + COMPILE_PDB_NAME visual_language_chat + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) + +install(TARGETS visual_language_chat + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/visual_language_chat/README.md b/samples/cpp/visual_language_chat/README.md new file mode 100644 index 0000000000..e487d5c1a6 --- /dev/null +++ b/samples/cpp/visual_language_chat/README.md @@ -0,0 +1,37 @@ +# C++ visual language chat + +This example showcases inference of visual language models (VLMs): [`openbmb/MiniCPM-V-2_6`](https://huggingface.co/openbmb/MiniCPM-V-2_6). The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::VLMPipeline` and runs the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/minicpm-v-multimodal-chatbot) which provides an example of a visual-language assistant. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 MiniCPM-V-2_6 --trust-remote-code +``` + +## Run + +[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image. + +`visual_language_chat miniCPM-V-2_6 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg` + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model `llava-hf/llava-v1.6-mistral-7b-hf` can benefit from being run on a dGPU. Modify the source code to change the device for inference to the `GPU`. + +See [SUPPORTED_MODELS.md](../../../src/docs/SUPPORTED_MODELS.md#visual-language-models) for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when the sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. 
Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py b/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py new file mode 100644 index 0000000000..94472bcd77 --- /dev/null +++ b/samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py @@ -0,0 +1,1199 @@ +import argparse +import requests +import torch +from threading import Thread +from copy import deepcopy +import shutil +import json +from PIL import Image +from transformers import AutoModel, AutoTokenizer, AutoProcessor, TextIteratorStreamer +from transformers.generation import GenerationMixin +from transformers import AutoConfig, GenerationConfig +from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPooling +from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask +from pathlib import Path +from huggingface_hub import snapshot_download +import types +from typing import Optional, Tuple, List, Union +from openvino.runtime import opset13 +import openvino as ov +import openvino_tokenizers +import numpy as np +import gc +from openvino.runtime.passes import Manager, MatcherPass, WrapType, Matcher +import time + +text_emb_path = Path("openvino_text_embeddings_model.xml") +image_emb_path = Path("openvino_vision_embeddings_model.xml") +resampler_path = Path("openvino_resampler_model.xml") +llm_path = Path("openvino_language_model.xml") + +class InsertSlice(MatcherPass): + def __init__(self): + MatcherPass.__init__(self) + self.model_changed = False + + param = WrapType("opset10.Result") + + def callback(matcher: Matcher) -> bool: + root = matcher.get_match_root() + if root is None: + return False + if len(root.get_output_partial_shape(0)) == 3: + parent = root.input_value(0).get_node() + grand_parent = parent.input_value(0).get_node() + + grand_parent_output = parent.input(0).get_source_output() + consumers = grand_parent_output.get_target_inputs() + start = np.array([0, -1, 0], dtype=np.int32) + stop = np.array([1, -2, grand_parent_output.get_partial_shape()[-1].get_length()], dtype=np.int32) + step = np.array([1, -1, 1], dtype=np.int32) + axes = np.array([0, 1, 2], dtype=np.int32) + slice = opset13.slice(grand_parent, start, stop, step, axes, name="inserted_slice") + for consumer in consumers: + consumer.replace_source_output(slice.output(0)) + self.model_changed = True + # Use new operation for additional matching + self.register_new_node(slice) + print("applied slice for lm head") + + return True + + self.register_matcher(Matcher(param, "InsertSlice"), callback) + + +def model_has_state(ov_model: ov.Model): + return len(ov_model.get_sinks()) > 0 + + +def model_has_input_output_name(ov_model: ov.Model, name: str): + """ + Helper function for checking that model has specified input or output name + + Parameters: + ov_model (ov.Model): + name (str): + name of input or output + + Returns: + True if input or output with requested name exists else False + """ + return name in sum([list(t.get_names()) for t in ov_model.inputs + ov_model.outputs], []) + + +def fuse_cache_reorder( + ov_model: ov.Model, + not_kv_inputs: List[str], + key_value_input_names: List[str], + gather_dim: int, +): + """ + Fuses reored_cache during generate cycle into ov.Model. Used with stateful models, because we can not modify model state directly. + + Adds a new beam_idx parameter and Gather op per each kv-cache input in a given model. + Should be run before make_stateful. 
Implements optimumum's _reorder_cache + inside the model in the beginning of each iteration. + Gather works along given gather_dim dimension that may vary from model to model. + KV-cache inputs are identified based on names in key_value_input_names. + Append the new beam_idx parameter to not_kv_inputs. + + Parameters: + ov_model (`ov.Model`): + openvino model for processing + not_kv_inputs (`List[str]`): + list of input nodes in model that not related to past key values + key_value_input_names (`List[str]`): + list of names for key value input layers + gather_dim (int): + dimension for gathering cache during reorder pass + """ + + if model_has_input_output_name(ov_model, "beam_idx"): + raise ValueError("Model already has fused cache") + input_batch = ov_model.input("inputs_embeds").get_partial_shape()[0] + beam_idx = opset13.parameter(name="beam_idx", dtype=ov.Type.i32, shape=ov.PartialShape([input_batch])) + beam_idx.output(0).get_tensor().add_names({"beam_idx"}) + ov_model.add_parameters([beam_idx]) + not_kv_inputs.append(ov_model.inputs[-1]) + # Go over all cache parameters and fuse _reorder_cache with indices provided by the new parameter beam_idx + for input_name in key_value_input_names: + parameter_output_port = ov_model.input(input_name) + consumers = parameter_output_port.get_target_inputs() + gather = opset13.gather(parameter_output_port, beam_idx, opset13.constant(gather_dim)) + for consumer in consumers: + consumer.replace_source_output(gather.output(0)) + ov_model.validate_nodes_and_infer_types() + + +def build_state_initializer(ov_model: ov.Model, batch_dim: int): + """ + Build initialization ShapeOf Expression for all ReadValue ops + + Parameters: + ov_model (ov.Model): + openvino model + batch_dim (int): + index of dimension corresponding to batch size + """ + input_ids = ov_model.input("inputs_embeds") + batch = opset13.gather( + opset13.shape_of(input_ids, output_type="i64"), + opset13.constant([0]), + opset13.constant(0), + ) + for op in ov_model.get_ops(): + if op.get_type_name() == "ReadValue": + dims = [dim.min_length for dim in list(op.get_output_partial_shape(0))] + dims[batch_dim] = batch + dims = [(opset13.constant(np.array([dim], dtype=np.int64)) if isinstance(dim, int) else dim) for dim in dims] + shape = opset13.concat(dims, axis=0) + broadcast = opset13.broadcast(opset13.constant(0.0, dtype=op.get_output_element_type(0)), shape) + op.set_arguments([broadcast]) + ov_model.validate_nodes_and_infer_types() + + +def make_stateful( + ov_model: ov.Model, + not_kv_inputs: List[str], + key_value_input_names: List[str], + key_value_output_names: List[str], + batch_dim: int, + num_attention_heads: int, + num_beams_and_batch: int = None, +): + """ + Hides kv-cache inputs and outputs inside the model as variables. 
+ + Parameters: + ov_model (ov.Model): + openvino model + not_kv_inputs (`List[str]`): + list of input nodes in model that not related to past key values + key_value_input_names (`List[str]`): + list of names for key value input layers + key_value_output_names (`List[str]`): + list of names for key value input layers + batch_dim (int): + index of batch dimension in key value layers + num_attention_heads (int): + number of attention heads for batch dimension initialization + num_beams_an_batch (int): + precalculated number of beams and batch for shapes initialization + """ + from openvino._offline_transformations import apply_make_stateful_transformation + + input_output_map = {} + + if num_beams_and_batch is not None: + # Set batch size for input_ids and attention mask to avoid dynamic dimension got propagated from the end of the model back to ReadValue + for input in not_kv_inputs: + shape = input.get_partial_shape() + if shape.rank.get_length() <= 2: # == 1 for beam_index + shape[0] = num_beams_and_batch + input.get_node().set_partial_shape(shape) + for kv_name_pair in zip(key_value_input_names, key_value_output_names): + input_output_map[kv_name_pair[0]] = kv_name_pair[1] + if num_beams_and_batch is not None: + input = ov_model.input(kv_name_pair[0]) + shape = input.get_partial_shape() + shape[batch_dim] = num_beams_and_batch * num_attention_heads + input.get_node().set_partial_shape(shape) + + if num_beams_and_batch is not None: + # Re-validation model if shapes are altered above + ov_model.validate_nodes_and_infer_types() + + apply_make_stateful_transformation(ov_model, input_output_map) + if num_beams_and_batch is None: + build_state_initializer(ov_model, batch_dim) + + +def patch_stateful(ov_model): + key_value_input_names = [key.get_any_name() for key in ov_model.inputs[2:-1]] + key_value_output_names = [key.get_any_name() for key in ov_model.outputs[1:]] + not_kv_inputs = [input for input in ov_model.inputs if not any(name in key_value_input_names for name in input.get_names())] + if not key_value_input_names or not key_value_output_names: + return + batch_dim = 0 + num_attention_heads = 1 + + fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim) + make_stateful( + ov_model, + not_kv_inputs, + key_value_input_names, + key_value_output_names, + batch_dim, + num_attention_heads, + None, + ) + + +def cleanup_torchscript_cache(): + """ + Helper for removing cached model representation + """ + torch._C._jit_clear_class_registry() + torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore() + torch.jit._state._clear_class_state() + + +def get_2d_sincos_pos_embed(embed_dim, image_size): + """ + image_size: image_size or (image_height, image_width) + return: + pos_embed: [image_height, image_width, embed_dim] + """ + if isinstance(image_size, int): + grid_h_size, grid_w_size = image_size, image_size + else: + grid_h_size, grid_w_size = image_size[0], image_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[0]) # (H, W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim // 2, grid[1]) # (H, W, 
D/2) + + emb = np.concatenate([emb_h, emb_w], axis=-1) # (H, W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid_new(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (H, W) + out: (H, W, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + out = np.einsum("hw,d->hwd", pos, omega) # (H, W, D/2), outer product + + # Align with C++ which always uses double + emb_sin = np.sin(out.astype(np.float64)).astype(np.float32) # (H, W, D/2) + emb_cos = np.cos(out.astype(np.float64)).astype(np.float32) # (H, W, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (H, W, D) + return emb + + +def patch_model_code(orig_model_dir): + model_file = orig_model_dir / "modeling_navit_siglip.py" + orig_model_file = model_file.parent / ("orig_" + model_file.name) + if not orig_model_file.exists(): + model_file.rename(orig_model_file) + with orig_model_file.open("r") as f: + content = f.read() + content = content.replace("if is_flash_attn_2_available():", "") + content = content.replace("from flash_attn import flash_attn_func, flash_attn_varlen_func", "") + content = content.replace("from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input", "") + + with model_file.open("w") as out_f: + out_f.write(content) + + +def convert_llm(model, model_dir): + model.llm.config.save_pretrained(model_dir / text_emb_path.parent) + print("⌛ Convert Input embedding model") + ov_model = ov.convert_model(model.llm.model.embed_tokens, example_input=torch.ones([1, 10], dtype=torch.long)) + + ov.save_model(ov_model, model_dir / text_emb_path) + del ov_model + cleanup_torchscript_cache() + gc.collect() + print("✅ Input embedding model successfully converted") + + print("⌛ Convert Language model") + hidden_size = model.llm.config.hidden_size + num_pkv = model.llm.config.num_hidden_layers + pkv_shape = (2, model.llm.config.num_key_value_heads, 2, hidden_size // model.llm.config.num_attention_heads) + + input_embeds = torch.randn((2, 2, hidden_size)) + attention_mask = torch.ones([2, 4], dtype=torch.long) + position_ids = torch.tensor([[2, 3], [2, 3]], dtype=torch.long) + input_names = ["attention_mask", "position_ids"] + output_names = ["logits"] + + past_key_values = [] + for i in range(num_pkv): + kv = [torch.randn(pkv_shape) for _ in range(2)] + past_key_values.append(kv) + input_names.extend([f"past_key_values.{i}.key", f"past_key_values.{i}.value"]) + output_names.extend([f"present.{i}.key", f"present.{i}.value"]) + input_names.append("inputs_embeds") + + example_input = {"inputs_embeds": input_embeds, "attention_mask": attention_mask, "position_ids": position_ids, "past_key_values": past_key_values} + + model.llm.config.torchscript = True + + ov_model = ov.convert_model(model.llm, example_input=example_input) + + for out, out_name in zip(ov_model.outputs, output_names): + out.get_tensor().set_names({out_name}) + + for inp, inp_name in zip(ov_model.inputs, input_names): + inp.get_tensor().set_names({inp_name}) + + patch_stateful(ov_model) + + ov.save_model(ov_model, model_dir / llm_path) + del ov_model + + cleanup_torchscript_cache() + gc.collect() + print("✅ Language model successfully converted") + + +def convert_vision_encoder(model, model_dir): + tgt_sizes = torch.tensor([[23, 45]]) + if not (model_dir / image_emb_path).exists(): + print("⌛ Convert Image embedding model") + def siglip_vis_embed_forward( + self, + 
pixel_values: torch.FloatTensor, + patch_attention_mask: torch.BoolTensor, + tgt_sizes: Optional[torch.IntTensor] = None, + position_ids: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + patch_embeds = self.patch_embedding(pixel_values) + embeddings = patch_embeds.flatten(2).transpose(1, 2) + + if position_ids is None: + batch_size = pixel_values.size(0) + max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) + max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size + boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side) + position_ids = torch.full( + size=( + batch_size, + max_nb_patches_h * max_nb_patches_w, + ), + fill_value=0, + ) + + for batch_idx, p_attn_mask in enumerate(patch_attention_mask): + if tgt_sizes is not None: + nb_patches_h = tgt_sizes[batch_idx][0] + nb_patches_w = tgt_sizes[batch_idx][1] + else: + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() + + fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) + fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) + + bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) + bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) + + pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten() + position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + + position_ids = position_ids.to(self.position_embedding.weight.device) + + embeddings = embeddings + self.position_embedding(position_ids) + return embeddings + + def siglip_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, key_states, value_states, attention_mask, is_causal=attention_mask is None + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None + + def siglip_transformer_forward( + self, + pixel_values, + patch_attention_mask: Optional[torch.BoolTensor] = None, + tgt_sizes: Optional[torch.IntTensor] = None, + position_ids: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size = pixel_values.size(0) + if patch_attention_mask is None: + 
patch_attention_mask = torch.ones( + size=( + batch_size, + pixel_values.size(2) // self.config.patch_size, + pixel_values.size(3) // self.config.patch_size, + ), + dtype=torch.bool, + device=pixel_values.device, + ) + + hidden_states = self.embeddings( + pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, tgt_sizes=tgt_sizes, position_ids=position_ids + ) + + patch_attention_mask = patch_attention_mask.view(batch_size, -1) + attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) if not self._use_flash_attention_2 else patch_attention_mask + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + if not return_dict: + return (last_hidden_state, None) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=None, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + vpm = model.vpm + vpm.embeddings.forward = types.MethodType(siglip_vis_embed_forward, vpm.embeddings) + for layer in vpm.encoder.layers: + layer.self_attn.forward = types.MethodType(siglip_attn_forward, layer.self_attn) + vpm.forward = types.MethodType(siglip_transformer_forward, vpm) + + pixel_values = torch.randn([1, 3, 14, 14490]) + patch_attn_mask = torch.zeros((1, 1, 1035), dtype=torch.bool) + patch_attn_mask[0, 0, : tgt_sizes[0][0] * tgt_sizes[0][1]] = True + position_ids = prepare_vis_position_ids( + pixel_values, patch_attn_mask, tgt_sizes, model.config.vision_config.patch_size, model.config.vision_config.image_size // model.config.patch_size + ) + ov_model = ov.convert_model(vpm, example_input={"pixel_values": pixel_values, "position_ids": position_ids, "patch_attention_mask": patch_attn_mask}) + ov.save_model(ov_model, model_dir / image_emb_path) + del ov_model + cleanup_torchscript_cache() + gc.collect() + print("✅ Image embedding model successfully converted") + + if not (model_dir / resampler_path).exists(): + print("⌛ Convert Resamler model") + + def resampler_forward(self, x, pos_embed, key_padding_mask): + bs = x.shape[0] + x = self.kv_proj(x) # B * L * D + x = self.ln_kv(x).permute(1, 0, 2) # L * B * D + + q = self.ln_q(self.query) # Q * D + + q_bs = q.unsqueeze(1).repeat(1, bs, 1) + + out = self.attn(q_bs, x + pos_embed, x, key_padding_mask=key_padding_mask)[0] # Q * B * D # L * B * D + L * B * D + # out: Q * B * D + x = out.permute(1, 0, 2) # B * Q * D + + x = self.ln_post(x) + x = x @ self.proj + return x + + model.resampler.forward = types.MethodType(resampler_forward, model.resampler) + + pos_embed_base = get_2d_sincos_pos_embed(model.resampler.embed_dim, 70) + + patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] + + max_patch_len = torch.max(patch_len) + key_padding_mask = torch.zeros((1, max_patch_len), dtype=torch.bool) + + pos_embed = [] + tgt_h, tgt_w = tgt_sizes[0] + pos_embed = torch.from_numpy(pos_embed_base[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, 1, -1))) # patches * D + key_padding_mask[0, patch_len:] = True + + ov_model = ov.convert_model(model.resampler, example_input=[torch.randn(1, 1035, 1152), pos_embed, key_padding_mask]) + ov.save_model(ov_model, model_dir / resampler_path) + del ov_model + cleanup_torchscript_cache() + del model.resampler + gc.collect() + print("✅ Resampler model 
successfully converted") + + +def copy_llm_files(model_dir, dst_dir): + shutil.copy(model_dir / text_emb_path, model_dir / dst_dir / text_emb_path.name) + shutil.copy(model_dir / text_emb_path.with_suffix(".bin"), model_dir / dst_dir / text_emb_path.with_suffix(".bin").name) + shutil.copy(model_dir / llm_path.parent / "config.json", model_dir / dst_dir / "config.json") + shutil.copy(model_dir / llm_path.parent / "configuration_minicpm.py", model_dir / dst_dir / "configuration_minicpm.py") + shutil.copy(model_dir / llm_path.parent / "modeling_navit_siglip.py", model_dir / dst_dir / "modeling_navit_siglip.py") + + +def prepare_vis_position_ids(pixel_values, patch_attention_mask, tgt_sizes, patch_size, num_patches_per_side): + batch_size = pixel_values.size(0) + max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3) + max_nb_patches_h, max_nb_patches_w = max_im_h // patch_size, max_im_w // patch_size + boundaries = torch.arange(1 / num_patches_per_side, 1.0, 1 / num_patches_per_side) + position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0) + + for batch_idx, p_attn_mask in enumerate(patch_attention_mask): + if tgt_sizes is not None: + nb_patches_h = tgt_sizes[batch_idx][0] + nb_patches_w = tgt_sizes[batch_idx][1] + else: + nb_patches_h = p_attn_mask[:, 0].sum() + nb_patches_w = p_attn_mask[0].sum() + + fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h) + fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w) + + bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True) + bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True) + + pos_ids = (bucket_coords_h[:, None] * num_patches_per_side + bucket_coords_w).flatten() + position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids + + return position_ids + + +core = ov.Core() + + +class OvModelForCausalLMWithEmb(GenerationMixin): + def __init__(self, model_dir, device="CPU", ov_config=None, compile=True, slice_lm_head=True) -> None: + self._supports_cache_class = False + self.config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) + self.config.is_decoder = True + self.config.is_encoder_decoder = False + self.generation_config = GenerationConfig.from_model_config(self.config) + model_dir = Path(model_dir) + self.model = core.read_model(model_dir / "openvino_language_model.xml") + self.token_emb = core.read_model(model_dir / "openvino_text_embeddings_model.xml") + if slice_lm_head: + self.slice_lm_head() + self.request = None + self.token_emb_request = None + self._device = device.upper() + self.device = torch.device("cpu") + self.ov_config = ov_config + self.next_beam_idx = None + self._past_length = None + self.input_names = [input_t.get_any_name() for input_t in self.model.inputs] + self.main_input_name = "input_ids" + self.llm_times = [] + if compile: + self.compile() + + def slice_lm_head(self): + manager = Manager() + manager.register_pass(InsertSlice()) + manager.run_passes(self.model) + self.model.validate_nodes_and_infer_types() + + def compile(self): + if self.request is None: + self.request = core.compile_model(self.model, self._device, self.ov_config).create_infer_request() + self._compile_token_emb() + + def _compile_token_emb(self): + if self.token_emb_request is None: + self.token_emb_request = core.compile_model(self.token_emb, self._device, self.ov_config) + + def to(self, device: str): + if isinstance(device, str): + self._device = device.upper() + self.clear_requests() + + return self + + def 
clear_requests(self): + del self.request + del self.token_emb_request + self.request = None + self.token_emb_request = None + + def embed_tokens(self, input_ids: torch.LongTensor): + self._compile_token_emb() + res = self.token_emb_request(input_ids, share_inputs=True) + return res[0] + + def prepare_inputs( + self, + input_ids: torch.LongTensor, + attention_mask: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + **kwargs, + ): + batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0] + + inputs = {} + # past_key_values are not used explicitly, instead they are handled inside the model + if past_key_values is None: + self.llm_times = [] + # This is the first iteration in a sequence, reset all states + if self.request is not None: + self.request.reset_state() + # Set initial value for the next beam_idx input that will be used at the current iteration + # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used + self.next_beam_idx = np.arange(batch_size, dtype=int) + self._past_length = 0 + past_len = self._get_past_length(past_key_values) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids if past_key_values is None else input_ids[:, -1:]) + + if hasattr(self.config, "scale_emb"): + inputs_embeds = inputs_embeds * self.config.scale_emb + inputs["inputs_embeds"] = inputs_embeds + + # Add the attention_mask inputs when needed + if "attention_mask" in self.input_names or "position_ids" in self.input_names: + if attention_mask is not None: + attention_mask = np.array(attention_mask) + else: + attention_mask = np.ones((inputs_embeds.shape[0], inputs_embeds.shape[1] + past_len), dtype=int) + + if "attention_mask" in self.input_names: + inputs["attention_mask"] = attention_mask + + if "position_ids" in self.input_names: + if position_ids is not None: + position_ids = np.array(position_ids) + else: + position_ids = np.cumsum(attention_mask, axis=1) - 1 + position_ids[attention_mask == 0] = 1 + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + inputs["position_ids"] = position_ids + + if "beam_idx" in self.input_names: + inputs["beam_idx"] = self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int) + + return inputs + + def forward( + self, + input_ids: torch.LongTensor, + attention_mask: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + **kwargs, + ): + self.compile() + + inputs = self.prepare_inputs( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + **kwargs, + ) + + # Run inference + self.request.start_async(inputs, share_inputs=True) + self.request.wait() + logits = self.request.get_tensor("logits").data + logits = torch.from_numpy(logits).to(self.device) + past_key_values = ((),) + self._past_length += inputs["inputs_embeds"].shape[1] + + return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) + + # Adapted from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): + # 
if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + attention_mask = kwargs.get("attention_mask", None) + use_cache = kwargs.get("use_cache", None) + + if past_key_values is not None: + past_len = self._get_past_length(past_key_values) + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and input_ids is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_len) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif input_ids is not None and past_len < input_ids.shape[1]: + input_ids = input_ids[:, past_len:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None and "position_ids" in self.input_names: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values and input_ids is not None: + position_ids = position_ids[:, -input_ids.shape[1] :] + + model_inputs = { + "input_ids": input_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "position_ids": position_ids, + "attention_mask": attention_mask, + "inputs_embeds": inputs_embeds if past_key_values is None else None, + } + + return model_inputs + + def _get_past_length(self, past_key_values=None): + if past_key_values is None: + return 0 + return self._past_length + + # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache + def _reorder_cache(self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> Tuple[Tuple[torch.Tensor]]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. + This is required to match `past_key_values` with the correct beam_idx at every generation step. 
+ """ + self.next_beam_idx = np.array(beam_idx) # save beam_idx to be used as an input in the next iteration + return past_key_values + + def can_generate(self): + """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" + + return True + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +class OvMiniCPMV: + def __init__(self, config, vpm, resampler, llm, processor): + self.config = config + self.llm = llm + self.vpm = vpm + self.embed_dim = self.llm.config.hidden_size + self._resampler = resampler + self.processor = processor + self._pos_embeds = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, 70)).float() + self.max_size = (70, 70) + + self.terminators = ["<|im_end|>", "<|endoftext|>"] + + def set_decoder(self, decoder): + self.llm = decoder + + def get_decoder(self): + return self.llm + + def resampler(self, x, tgt_sizes): + bs = x.shape[0] + + patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] + + self._adjust_pos_cache(tgt_sizes) + + max_patch_len = torch.max(patch_len) + key_padding_mask = torch.zeros((bs, max_patch_len), dtype=torch.bool) + + pos_embed = [] + for i in range(bs): + tgt_h, tgt_w = tgt_sizes[i] + pos_embed.append(self._pos_embeds[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1))) # patches * D + key_padding_mask[i, patch_len[i] :] = True + + pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(1, 0, 2) # BLD => L * B * D + + res = torch.from_numpy(self._resampler([x, pos_embed, key_padding_mask])[0]) + return res + + def _set_2d_pos_cache(self, max_size): + pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, max_size)).float() + self._pos_embed = pos_embed + + def _adjust_pos_cache(self, tgt_sizes): + max_h = torch.max(tgt_sizes[:, 0]) + max_w = torch.max(tgt_sizes[:, 1]) + if max_h > self.max_size[0] or max_w > self.max_size[1]: + self.max_size = [max(max_h, self.max_size[0]), max(max_w, self.max_size[1])] + self._set_2d_pos_cache(self.max_size) + + def get_vllm_embedding(self, data): + if "vision_hidden_states" not in data: + tgt_sizes = data["tgt_sizes"] + pixel_values_list = data["pixel_values"] + vision_hidden_states = [] + all_pixel_values = [] + img_cnt = [] + for pixel_values in pixel_values_list: + img_cnt.append(len(pixel_values)) + all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_values]) + + # exist image + if all_pixel_values: + tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)] + tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32) + + max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]) + + all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True, padding_value=0.0) + B, L, _ = all_pixel_values.shape + all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L) + + patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool) + for i in range(B): + patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True + + vision_batch_size = 32 + all_pixel_values = all_pixel_values + if B > vision_batch_size: + hs = [] + for i in range(0, B, vision_batch_size): + start_idx = i + end_idx = i + vision_batch_size + block_pxl_values = all_pixel_values[start_idx:end_idx] + block_patch_attn_mask = patch_attn_mask[start_idx:end_idx] + block_tgt_sizes = tgt_sizes[start_idx:end_idx] + block_position_ids = prepare_vis_position_ids( + block_pxl_values, + block_patch_attn_mask, + block_tgt_sizes, + 
self.config.vision_config.patch_size, + self.config.vision_config.image_size // self.config.patch_size, + ) + start = time.perf_counter() + tmp_hs = torch.from_numpy(self.vpm([block_pxl_values, block_patch_attn_mask, block_position_ids])[0]) + self.vpm_times.append(time.perf_counter() - start) + hs.append(tmp_hs) + vision_embedding = torch.cat(hs, dim=0) + else: + position_ids = prepare_vis_position_ids( + all_pixel_values, + patch_attn_mask, + tgt_sizes, + self.config.vision_config.patch_size, + self.config.vision_config.image_size // self.config.patch_size, + ) + start = time.perf_counter() + vision_embedding = torch.from_numpy(self.vpm([all_pixel_values, patch_attn_mask, position_ids])[0]) + vision_embedding = torch.from_numpy(self.vpm([all_pixel_values, patch_attn_mask, position_ids])[0]) + vision_embedding = self.resampler(vision_embedding, tgt_sizes) + + start = 0 + for pixel_values in pixel_values_list: + img_cnt = len(pixel_values) + if img_cnt > 0: + vision_hidden_states.append(vision_embedding[start : start + img_cnt]) + start += img_cnt + else: + vision_hidden_states.append([]) + else: # no image + dummy_feature = [] + for _ in range(len(pixel_values_list)): + vision_hidden_states.append(dummy_feature) + + else: + vision_hidden_states = data["vision_hidden_states"] + + if hasattr(self.llm.config, "scale_emb"): + vllm_embedding = self.llm.embed_tokens(data["input_ids"]) * self.llm.config.scale_emb + else: + vllm_embedding = self.llm.embed_tokens(data["input_ids"]) + + bs = len(data["input_ids"]) + for i in range(bs): + cur_vs_hs = vision_hidden_states[i] + if len(cur_vs_hs) > 0: + cur_vllm_emb = torch.from_numpy(vllm_embedding[i]) + cur_image_bound = data["image_bound"][i] + if len(cur_image_bound) > 0: + image_indices = torch.stack([torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound]) + + cur_vllm_emb.scatter_(0, image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]), cur_vs_hs.view(-1, cur_vs_hs.shape[-1])) + return vllm_embedding + + def forward(self, data, **kwargs): + vllm_embedding = self.get_vllm_embedding(data) + position_ids = data["position_ids"] + if position_ids.dtype != torch.int64: + position_ids = position_ids.long() + + return self.llm(input_ids=None, position_ids=position_ids, inputs_embeds=vllm_embedding, **kwargs) + + def _decode(self, inputs_embeds, tokenizer, attention_mask, decode_text=False, **kwargs): + terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] + output = self.llm.generate( + inputs_embeds=torch.from_numpy(inputs_embeds), pad_token_id=0, eos_token_id=terminators, attention_mask=attention_mask, **kwargs + ) + if decode_text: + return self._decode_text(output, tokenizer) + return output + + def _decode_stream(self, inputs_embeds, tokenizer, **kwargs): + terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] + streamer = TextIteratorStreamer(tokenizer=tokenizer) + generation_kwargs = {"inputs_embeds": torch.from_numpy(inputs_embeds), "pad_token_id": 0, "eos_token_id": terminators, "streamer": streamer} + generation_kwargs.update(kwargs) + + thread = Thread(target=self.llm.generate, kwargs=generation_kwargs) + thread.start() + + return streamer + + def _decode_text(self, result_ids, tokenizer): + terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators] + result_text = [] + for result in result_ids: + result = result[result != 0] + if result[0] == tokenizer.bos_id: + result = result[1:] + if result[-1] in terminators: + result = result[:-1] + 
result_text.append(tokenizer.decode(result).strip()) + return result_text + + def generate( + self, + input_ids=None, + pixel_values=None, + tgt_sizes=None, + image_bound=None, + attention_mask=None, + tokenizer=None, + vision_hidden_states=None, + return_vision_hidden_states=False, + stream=False, + decode_text=False, + **kwargs, + ): + assert input_ids is not None + assert len(input_ids) == len(pixel_values) + + model_inputs = { + "input_ids": input_ids, + "image_bound": image_bound, + } + + if vision_hidden_states is None: + model_inputs["pixel_values"] = pixel_values + model_inputs["tgt_sizes"] = tgt_sizes + else: + model_inputs["vision_hidden_states"] = vision_hidden_states + + with torch.inference_mode(): + model_inputs["inputs_embeds"] = self.get_vllm_embedding(model_inputs) + + if stream: + result = self._decode_stream(model_inputs["inputs_embeds"], tokenizer, **kwargs) + else: + result = self._decode(model_inputs["inputs_embeds"], tokenizer, attention_mask, decode_text=decode_text, **kwargs) + + return result + + def chat( + self, + image, + msgs, + tokenizer, + processor=None, + vision_hidden_states=None, + max_new_tokens=2048, + min_new_tokens=0, + sampling=True, + max_inp_length=8192, + system_prompt="", + stream=False, + max_slice_nums=None, + use_image_id=None, + **kwargs, + ): + self.vpm_times = [] + self.resampler_times = [] + if isinstance(msgs[0], list): + batched = True + else: + batched = False + msgs_list = msgs + images_list = image + + if batched is False: + images_list, msgs_list = [images_list], [msgs_list] + else: + assert images_list is None, "Please integrate image to msgs when using batch inference." + images_list = [None] * len(msgs_list) + assert len(images_list) == len(msgs_list), "The batch dim of images_list and msgs_list should be the same." + + if processor is None: + if self.processor is None: + self.processor = AutoProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True) + processor = self.processor + + assert ( + self.config.query_num == processor.image_processor.image_feature_size + ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." + assert ( + self.config.patch_size == processor.image_processor.patch_size + ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." + assert ( + self.config.use_image_id == processor.image_processor.use_image_id + ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." + assert ( + self.config.slice_config.max_slice_nums == processor.image_processor.max_slice_nums + ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." + assert ( + self.config.slice_mode == processor.image_processor.slice_mode + ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`." 
+ + prompts_lists = [] + input_images_lists = [] + for image, msgs in zip(images_list, msgs_list): + if isinstance(msgs, str): + msgs = json.loads(msgs) + copy_msgs = deepcopy(msgs) + + assert len(msgs) > 0, "msgs is empty" + + if image is not None and isinstance(copy_msgs[0]["content"], str): + copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]] + + images = [] + for i, msg in enumerate(copy_msgs): + role = msg["role"] + content = msg["content"] + assert role in ["user", "assistant"] + if i == 0: + assert role == "user", "The role of first msg should be user" + if isinstance(content, str): + content = [content] + cur_msgs = [] + for c in content: + if isinstance(c, Image.Image): + images.append(c) + cur_msgs.append("(<image>./</image>)") + elif isinstance(c, str): + cur_msgs.append(c) + msg["content"] = "\n".join(cur_msgs) + + if system_prompt: + sys_msg = {"role": "system", "content": system_prompt} + copy_msgs = [sys_msg] + copy_msgs + + prompts_lists.append(processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True)) + input_images_lists.append(images) + + inputs = processor( + prompts_lists, input_images_lists, max_slice_nums=max_slice_nums, use_image_id=use_image_id, return_tensors="pt", max_length=max_inp_length + ) + + if sampling: + generation_config = {"top_p": 0.8, "top_k": 100, "temperature": 0.7, "do_sample": True, "repetition_penalty": 1.05} + else: + generation_config = { + "repetition_penalty": 1.0, + } + + if min_new_tokens > 0: + generation_config["min_new_tokens"] = min_new_tokens + + generation_config.update((k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()) + + inputs.pop("image_sizes") + with torch.inference_mode(): + res = self.generate( + **inputs, + tokenizer=tokenizer, + max_new_tokens=max_new_tokens, + vision_hidden_states=vision_hidden_states, + stream=stream, + decode_text=True, + **generation_config, + ) + + if stream: + + def stream_gen(): + for text in res: + for term in self.terminators: + text = text.replace(term, "") + yield text + + return stream_gen() + + else: + if batched: + answer = res + else: + answer = res[0] + return answer + + +def init_model(model_dir, device): + config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True) + llm = OvModelForCausalLMWithEmb(model_dir, device) + img_emb = core.compile_model(model_dir / image_emb_path, device) + resampler = core.compile_model(model_dir / resampler_path, device) + processor = AutoProcessor.from_pretrained(model_dir, trust_remote_code=True) + + ov_model = OvMiniCPMV(config, img_emb, resampler, llm, processor) + return ov_model + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("model_dir", type=Path) + model_dir = parser.parse_args().model_dir + model_id = "openbmb/MiniCPM-V-2_6" + ckpt = model_dir / "ckpt" + if not ckpt.exists(): + snapshot_download(model_id, local_dir=ckpt, force_download=True) + patch_model_code(ckpt) + model = AutoModel.from_pretrained(ckpt, trust_remote_code=True) + model.eval() + model.config.save_pretrained(model_dir) + tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True) + tokenizer.save_pretrained(model_dir) + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, with_detokenizer=True) + ov.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml") + ov.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml") + processor = AutoProcessor.from_pretrained(ckpt, trust_remote_code=True) + processor.save_pretrained(model_dir) + + 
convert_llm(model, model_dir) + del model.llm + gc.collect() + + convert_vision_encoder(model, model_dir) + # ov_cpm = init_model(model_dir, "CPU") + # print(ov_cpm.chat(Image.open(requests.get("https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", stream=True).raw), [{"role": "user", "content": "What is unusual on this image?"}], ov_cpm.processor.tokenizer, sampling=False)) + +if "__main__" == __name__: + main() diff --git a/samples/cpp/visual_language_chat/load_image.cpp b/samples/cpp/visual_language_chat/load_image.cpp new file mode 100644 index 0000000000..855f7567bf --- /dev/null +++ b/samples/cpp/visual_language_chat/load_image.cpp @@ -0,0 +1,41 @@ + +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#define STB_IMAGE_IMPLEMENTATION +#include "stb_image.h" +#include "load_image.hpp" + +ov::Tensor utils::load_image(const std::filesystem::path& image_path) { + int x = 0, y = 0, channels_in_file = 0; + constexpr int desired_channels = 3; + unsigned char* image = stbi_load( + image_path.string().c_str(), + &x, &y, &channels_in_file, desired_channels); + if (!image) { + throw std::runtime_error{"Failed to load the image."}; + } + struct SharedImageAllocator { + unsigned char* image; + int channels, height, width; + void* allocate(size_t bytes, size_t) const { + if (channels * height * width == bytes) { + return image; + } + throw std::runtime_error{"Unexpected number of bytes was requested to allocate."}; + } + void deallocate(void*, size_t bytes, size_t) { + if (channels * height * width != bytes) { + throw std::runtime_error{"Unexpected number of bytes was requested to deallocate."}; + } + std::free(image); + image = nullptr; + } + bool is_equal(const SharedImageAllocator& other) const noexcept {return this == &other;} + }; + return ov::Tensor( + ov::element::u8, + ov::Shape{1, size_t(desired_channels), size_t(y), size_t(x)}, + SharedImageAllocator{image, desired_channels, y, x} + ); +} diff --git a/samples/cpp/visual_language_chat/load_image.hpp b/samples/cpp/visual_language_chat/load_image.hpp new file mode 100644 index 0000000000..f66dd2caf2 --- /dev/null +++ b/samples/cpp/visual_language_chat/load_image.hpp @@ -0,0 +1,12 @@ + +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <openvino/runtime/tensor.hpp> +#include <filesystem> + +namespace utils { +ov::Tensor load_image(const std::filesystem::path& image_path); +} diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp new file mode 100644 index 0000000000..95342402cb --- /dev/null +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "load_image.hpp" +#include <openvino/genai/visual_language/pipeline.hpp> +#include <openvino/runtime/intel_gpu/properties.hpp> + +bool print_subword(std::string&& subword) { + return !(std::cout << subword << std::flush); +} + +int main(int argc, char* argv[]) try { + if (3 != argc) { + throw std::runtime_error(std::string{"Usage "} + argv[0] + " <MODEL_DIR> <IMAGE_FILE>"); + } + ov::Tensor image = utils::load_image(argv[2]); + std::string device = "CPU"; // GPU can be used as well + ov::AnyMap enable_compile_cache; + if ("GPU" == device) { + // Cache compiled models on disk for GPU to save time on the + // next run. It's not beneficial for CPU. 
+ enable_compile_cache.insert({ov::cache_dir("vlm_cache")}); + } + ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache); + std::string prompt; + + pipe.start_chat(); + std::cout << "question:\n"; + std::getline(std::cin, prompt); + pipe.generate( + prompt, + ov::genai::image(image), + ov::genai::streamer(print_subword) + ); + std::cout << "\n----------\n" + "question:\n"; + while (std::getline(std::cin, prompt)) { + pipe.generate(prompt, ov::genai::streamer(print_subword)); + std::cout << "\n----------\n" + "question:\n"; + } + pipe.finish_chat(); +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} diff --git a/samples/cpp/whisper_speech_recognition/CMakeLists.txt b/samples/cpp/whisper_speech_recognition/CMakeLists.txt new file mode 100644 index 0000000000..39f017adb1 --- /dev/null +++ b/samples/cpp/whisper_speech_recognition/CMakeLists.txt @@ -0,0 +1,40 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +find_package(OpenVINOGenAI REQUIRED + PATHS + "${CMAKE_BINARY_DIR}" # Reuse the package from the build. + ${OpenVINO_DIR} # GenAI may be installed alogside OpenVINO. + NO_CMAKE_FIND_ROOT_PATH +) + +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +if(POLICY CMP0169) + cmake_policy(SET CMP0169 OLD) +endif() + +include(FetchContent) + +if(NOT TARGET dr_libs) + FetchContent_Declare(dr_libs + URL https://github.com/mackron/dr_libs/archive/da35f9d6c7374a95353fd1df1d394d44ab66cf01.tar.gz + URL_HASH SHA256=2704d347f480ca1bc92233fb01747e4550cc8031735b6ea62ca9990ebb8851ae) + FetchContent_MakeAvailable(dr_libs) +endif() + +add_executable(whisper_speech_recognition whisper_speech_recognition.cpp audio_utils.cpp) +target_link_libraries(whisper_speech_recognition PRIVATE openvino::genai) +target_include_directories(whisper_speech_recognition PRIVATE "$<BUILD_INTERFACE:${dr_libs_SOURCE_DIR}>") +set_target_properties(whisper_speech_recognition PROPERTIES + COMPILE_PDB_NAME whisper_speech_recognition + # Ensure out of box LC_RPATH on macOS with SIP + INSTALL_RPATH_USE_LINK_PATH ON) +target_compile_features(whisper_speech_recognition PRIVATE cxx_std_11) + +install(TARGETS whisper_speech_recognition + RUNTIME DESTINATION samples_bin/ + COMPONENT samples_bin + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/whisper_speech_recognition/README.md b/samples/cpp/whisper_speech_recognition/README.md new file mode 100644 index 0000000000..fec5d9194f --- /dev/null +++ b/samples/cpp/whisper_speech_recognition/README.md @@ -0,0 +1,48 @@ +# Whisper automatic speech recognition sample + +This example showcases inference of speech recognition Whisper Models. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::WhisperPipeline` and uses audio file in wav format as an input source. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. 
+ +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model openai/whisper-base whisper-base +``` + +## Prepare audio file + +Prepare audio file in wav format with sampling rate 16k Hz. + +## Run + +`whisper_speech_recognition whisper-base sample.wav` + +Output: text transcription of `sample.wav` + +Models can be downloaded from [OpenAI HuggingFace](https://huggingface.co/openai). + +Supported Models: +[openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) +[openai/whisper-tiny.en](https://huggingface.co/openai/whisper-tiny.en) +[openai/whisper-base](https://huggingface.co/openai/whisper-base) +[openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en) +[openai/whisper-small](https://huggingface.co/openai/whisper-small) +[openai/whisper-small.en](https://huggingface.co/openai/whisper-small.en) +[openai/whisper-medium](https://huggingface.co/openai/whisper-medium) +[openai/whisper-medium.en](https://huggingface.co/openai/whisper-medium.en) +[openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) + +### Troubleshooting + +#### Empty or rubbish output + +Example output: +``` +---------------- +``` + +To resolve this ensure that audio data has 16k Hz sampling rate diff --git a/samples/cpp/whisper_speech_recognition/audio_utils.cpp b/samples/cpp/whisper_speech_recognition/audio_utils.cpp new file mode 100644 index 0000000000..a64db44f3f --- /dev/null +++ b/samples/cpp/whisper_speech_recognition/audio_utils.cpp @@ -0,0 +1,113 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "audio_utils.hpp" + +#include <iostream> +#include <vector> + +#include "openvino/genai/whisper_pipeline.hpp" + +#define DR_WAV_IMPLEMENTATION +#include <dr_wav.h> + +#ifdef _WIN32 +# include <fcntl.h> +# include <io.h> +#endif + +namespace { +bool is_wav_buffer(const std::string buf) { + // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format + // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html + if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") { + return false; + } + + uint32_t chunk_size = *reinterpret_cast<const uint32_t*>(buf.data() + 4); + if (chunk_size + 8 != buf.size()) { + return false; + } + + return true; +} +} // namespace + +namespace utils { +namespace audio { + +#define COMMON_SAMPLE_RATE 16000 + +ov::genai::RawSpeechInput read_wav(const std::string& filename) { + drwav wav; + std::vector<uint8_t> wav_data; // used for pipe input from stdin or ffmpeg decoding output + + if (filename == "-") { + { +#ifdef _WIN32 + _setmode(_fileno(stdin), _O_BINARY); +#endif + + uint8_t buf[1024]; + while (true) { + const size_t n = fread(buf, 1, sizeof(buf), stdin); + if (n == 0) { + break; + } + wav_data.insert(wav_data.end(), buf, buf + n); + } + } + + OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr), + "Failed to open WAV file from stdin"); + + fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size()); + } else if (is_wav_buffer(filename)) { + OPENVINO_ASSERT(drwav_init_memory(&wav, filename.c_str(), filename.size(), nullptr), + "Failed to open WAV file from fname buffer"); + } else if (!drwav_init_file(&wav, filename.c_str(), nullptr)) { +#if defined(WHISPER_FFMPEG) + OPENVINO_ASSERT(ffmpeg_decode_audio(fname, wav_data) == 0, "Failed to ffmpeg decode") + + OPENVINO_ASSERT(drwav_init_memory(&wav, wav_data.data(), wav_data.size(), 
nullptr), + "Failed to read wav data as wav") +#else + throw std::runtime_error("failed to open as WAV file"); +#endif + } + + if (wav.channels != 1 && wav.channels != 2) { + drwav_uninit(&wav); + throw std::runtime_error("WAV file must be mono or stereo"); + } + + if (wav.sampleRate != COMMON_SAMPLE_RATE) { + drwav_uninit(&wav); + throw std::runtime_error("WAV file must be " + std::string{COMMON_SAMPLE_RATE / 1000} + " kHz"); + } + + const uint64_t n = + wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size() / (wav.channels * wav.bitsPerSample / 8); + + std::vector<int16_t> pcm16; + pcm16.resize(n * wav.channels); + drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); + drwav_uninit(&wav); + + // convert to mono, float + std::vector<float> pcmf32; + pcmf32.resize(n); + if (wav.channels == 1) { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[i]) / 32768.0f; + } + } else { + for (uint64_t i = 0; i < n; i++) { + pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; + } + } + + return pcmf32; +} +} // namespace audio +} // namespace utils diff --git a/samples/cpp/whisper_speech_recognition/audio_utils.hpp b/samples/cpp/whisper_speech_recognition/audio_utils.hpp new file mode 100644 index 0000000000..6e4b141d83 --- /dev/null +++ b/samples/cpp/whisper_speech_recognition/audio_utils.hpp @@ -0,0 +1,12 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/whisper_pipeline.hpp" + +namespace utils { +namespace audio { +ov::genai::RawSpeechInput read_wav(const std::string& filename); +} // namespace audio +} // namespace utils diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp new file mode 100644 index 0000000000..f758a16085 --- /dev/null +++ b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp @@ -0,0 +1,50 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "audio_utils.hpp" +#include "openvino/genai/whisper_pipeline.hpp" + +int main(int argc, char* argv[]) try { + if (3 > argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<WAV_FILE_PATH>\""); + } + + std::string model_path = argv[1]; + std::string wav_file_path = argv[2]; + + ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); + + ov::genai::WhisperPipeline pipeline{model_path}; + + ov::genai::WhisperGenerationConfig config{model_path + "/generation_config.json"}; + config.max_new_tokens = 100; + // 'task' and 'language' parameters are supported for multilingual models only + config.language = "<|en|>"; + config.task = "transcribe"; + config.return_timestamps = true; + + auto streamer = [](std::string word) { + std::cout << word; + return false; + }; + + auto result = pipeline.generate(raw_speech, config, streamer); + + std::cout << "\n"; + + for (auto& chunk : *result.chunks) { + std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n"; + } +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) { + } + return EXIT_FAILURE; +} catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) { + } + return EXIT_FAILURE; +} diff --git a/samples/generation.gif b/samples/generation.gif new file mode 100644 index 0000000000..d6434df8c8 --- /dev/null +++ b/samples/generation.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b3ea717def68df6493c629551b80e74f58d03be02d837e6a16541b3d95787df +size 5550657 diff --git a/samples/python/beam_search_causal_lm/README.md b/samples/python/beam_search_causal_lm/README.md new file mode 100644 index 0000000000..7e412db379 --- /dev/null +++ b/samples/python/beam_search_causal_lm/README.md @@ -0,0 +1,36 @@ +# Text generation Python sample that supports most popular models like LLaMA 3 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, you can change the device for inference to a different one, GPU for example, by modifying the source code. The sample features `openvino_genai.LLMPipeline` and configures it to use multiple beam groups. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +`beam_search_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` + + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when the sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting the environment variable `PYTHONIOENCODING="utf8"` (a programmatic alternative is sketched below).
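If editing the sample is an option, a programmatic alternative to the environment variable is to force UTF-8 on the standard streams at startup. This is plain Python 3.7+ behaviour rather than an OpenVINO GenAI API; the snippet below is a minimal sketch to place near the top of `beam_search_causal_lm.py` before anything is printed.

```python
import sys

# Re-open stdout/stderr with UTF-8 encoding so printing generated text that
# contains characters outside the default Windows code page does not raise
# UnicodeEncodeError. "replace" substitutes anything that still cannot be shown.
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
sys.stderr.reconfigure(encoding="utf-8", errors="replace")
```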
diff --git a/samples/python/beam_search_causal_lm/beam_search_causal_lm.py b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py new file mode 100755 index 0000000000..16b8b76175 --- /dev/null +++ b/samples/python/beam_search_causal_lm/beam_search_causal_lm.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino_genai + + +def main(): +    parser = argparse.ArgumentParser() +    parser.add_argument('model_dir') +    parser.add_argument('prompts', nargs='+') +    args = parser.parse_args() + +    device = 'CPU'  # GPU can be used as well +    pipe = openvino_genai.LLMPipeline(args.model_dir, device) + +    config = openvino_genai.GenerationConfig() +    config.max_new_tokens = 20 +    config.num_beam_groups = 3 +    config.num_beams = 15 +    config.num_return_sequences = config.num_beams + +    beams = pipe.generate(args.prompts, config) +    print(beams) + + +if '__main__' == __name__: +    main() diff --git a/samples/python/benchmark_genai/README.md b/samples/python/benchmark_genai/README.md new file mode 100644 index 0000000000..9baf17c4d7 --- /dev/null +++ b/samples/python/benchmark_genai/README.md @@ -0,0 +1,47 @@ +# LLMs benchmarking sample + +This sample script demonstrates how to benchmark LLMs in OpenVINO GenAI. The script includes functionality for warm-up iterations, generating text, and calculating various performance metrics. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Usage + +```sh +python benchmark_genai.py [OPTIONS] +``` + +### Options + +- `-m, --model`: Path to the model and tokenizers base directory. +- `-p, --prompt` (default: `"The Sky is blue because"`): The prompt to generate text. +- `-nw, --num_warmup` (default: `1`): Number of warmup iterations. +- `-n, --num_iter` (default: `3`): Number of iterations. +- `-mt, --max_new_tokens` (default: `20`): Maximal number of new tokens to generate. +- `-d, --device` (default: `"CPU"`): Device to run the model on. + +### Output: + +``` +python benchmark_genai.py -m TinyLlama-1.1B-Chat-v1.0 -n 10 +``` + +``` +Load time: 3405.69 ms +Generate time: 1430.77 ± 3.04 ms +Tokenization time: 0.51 ± 0.02 ms +Detokenization time: 0.37 ± 0.01 ms +TTFT: 81.60 ± 0.54 ms +TPOT: 71.52 ± 2.72 ms +Throughput tokens/s: 13.98 ± 0.53 +``` + +For more information on how performance metrics are calculated, see [performance metrics readme](../../../src/README.md#performance-metrics).
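To make the `mean ± std` figures above easier to interpret, the following self-contained sketch times `generate()` calls by hand and aggregates them with Python's `statistics` module. It is an illustration only: the model folder and iteration count are placeholders, and the sample itself relies on the pipeline's built-in `perf_metrics`, which additionally breaks the time down into TTFT, TPOT, tokenization and detokenization.

```python
import statistics
import time

import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline("TinyLlama-1.1B-Chat-v1.0", "CPU")  # placeholder model folder

config = ov_genai.GenerationConfig()
config.max_new_tokens = 20

pipe.generate(["The Sky is blue because"], config)  # warm-up iteration

# Time several full generate() calls and report them the same way the sample does.
durations_ms = []
for _ in range(10):
    start = time.perf_counter()
    pipe.generate(["The Sky is blue because"], config)
    durations_ms.append((time.perf_counter() - start) * 1000)

print(f"Generate time: {statistics.mean(durations_ms):.2f} ± {statistics.stdev(durations_ms):.2f} ms")
```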
diff --git a/samples/python/benchmark_genai/benchmark_genai.py b/samples/python/benchmark_genai/benchmark_genai.py new file mode 100755 index 0000000000..9851483880 --- /dev/null +++ b/samples/python/benchmark_genai/benchmark_genai.py @@ -0,0 +1,49 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino_genai as ov_genai + +def main(): + parser = argparse.ArgumentParser(description="Help command") + parser.add_argument("-m", "--model", type=str, help="Path to model and tokenizers base directory") + parser.add_argument("-p", "--prompt", type=str, default="The Sky is blue because", help="Prompt") + parser.add_argument("-nw", "--num_warmup", type=int, default=1, help="Number of warmup iterations") + parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations") + parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") + parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") + + args = parser.parse_args() + + # Perf metrics is stored in DecodedResults. + # In order to get DecodedResults instead of a string input should be a list. + prompt = [args.prompt] + model_path = args.model + device = args.device + num_warmup = args.num_warmup + num_iter = args.num_iter + + config = ov_genai.GenerationConfig() + config.max_new_tokens = args.max_new_tokens + + pipe = ov_genai.LLMPipeline(model_path, device) + + for _ in range(num_warmup): + pipe.generate(prompt, config) + + res = pipe.generate(prompt, config) + perf_metrics = res.perf_metrics + for _ in range(num_iter - 1): + res = pipe.generate(prompt, config) + perf_metrics += res.perf_metrics + + print(f"Load time: {perf_metrics.get_load_time():.2f} ms") + print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms") + print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms") + print(f"Detokenization time: {perf_metrics.get_detokenization_duration().mean:.2f} ± {perf_metrics.get_detokenization_duration().std:.2f} ms") + print(f"TTFT: {perf_metrics.get_ttft().mean:.2f} ± {perf_metrics.get_ttft().std:.2f} ms") + print(f"TPOT: {perf_metrics.get_tpot().mean:.2f} ± {perf_metrics.get_tpot().std:.2f} ms") + print(f"Throughput : {perf_metrics.get_throughput().mean:.2f} ± {perf_metrics.get_throughput().std:.2f} tokens/s") + +if __name__ == "__main__": + main() diff --git a/samples/python/chat_sample/README.md b/samples/python/chat_sample/README.md new file mode 100644 index 0000000000..dc2c39b3a5 --- /dev/null +++ b/samples/python/chat_sample/README.md @@ -0,0 +1,44 @@ +# Python chat_sample that supports most popular models like LLaMA 3 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `openvino_genai.LLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. 
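The key point of the chat scenario is that `start_chat()` makes the pipeline keep the conversation history between `generate()` calls, so a follow-up question can refer to the previous answer, and `finish_chat()` drops that history. Below is a minimal non-interactive sketch of this flow; the model folder and questions are placeholders, and the sample itself implements an interactive loop instead.

```python
import openvino_genai

pipe = openvino_genai.LLMPipeline('TinyLlama-1.1B-Chat-v1.0', 'CPU')  # placeholder model folder

pipe.start_chat()  # from here on, previous turns are kept as context
print(pipe.generate('What is the capital of France?', max_new_tokens=50))
# The follow-up only makes sense because the first answer is still in the history.
print(pipe.generate('And roughly how many people live there?', max_new_tokens=50))
pipe.finish_chat()  # clear the history before starting an unrelated conversation
```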
+ +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run: + +`chat_sample.py TinyLlama-1.1B-Chat-v1.0` + + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. + +#### Missing chat template + +If you encounter an exception indicating a missing "chat template" when launching the `ov::genai::LLMPipeline` in chat mode, it likely means the model was not tuned for chat functionality. To work this around, manually add the chat template to tokenizer_config.json of your model. +The following template can be used as a default, but it may not work properly with every model: +``` +"chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user\n' + message['content'] + '<|im_end|>\n<|im_start|>assistant\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|im_end|>\n'}}{% endif %}{% endfor %}", +``` diff --git a/samples/python/chat_sample/chat_sample.py b/samples/python/chat_sample/chat_sample.py new file mode 100755 index 0000000000..eee66fb71d --- /dev/null +++ b/samples/python/chat_sample/chat_sample.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino_genai + + +def streamer(subword): + print(subword, end='', flush=True) + # Return flag corresponds whether generation should be stopped. + # False means continue generation. 
+ return False + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('model_dir') + args = parser.parse_args() + + device = 'CPU' # GPU can be used as well + pipe = openvino_genai.LLMPipeline(args.model_dir, device) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + while True: + try: + prompt = input('question:\n') + except EOFError: + break + pipe.generate(prompt, config, streamer) + print('\n----------') + pipe.finish_chat() + + +if '__main__' == __name__: + main() diff --git a/samples/python/greedy_causal_lm/README.md b/samples/python/greedy_causal_lm/README.md new file mode 100644 index 0000000000..1f0eb333ea --- /dev/null +++ b/samples/python/greedy_causal_lm/README.md @@ -0,0 +1,36 @@ +# Text generation Python greedy_causal_lm that supports most popular models like LLaMA 3 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `openvino_genai.LLMPipeline` and configures it to run the simplest deterministic greedy sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +`greedy_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` + + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. 
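If incremental output is preferred over printing the whole completion at once, the same `generate()` call also accepts a streamer callback, as the chat sample above does. A minimal sketch using the model exported above:

```python
import openvino_genai


def streamer(subword):
    print(subword, end='', flush=True)
    return False  # False means: continue generation


device = 'CPU'  # GPU can be used as well
pipe = openvino_genai.LLMPipeline('TinyLlama-1.1B-Chat-v1.0', device)

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100

pipe.generate('Why is the Sun yellow?', config, streamer)
print()
```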
diff --git a/samples/python/greedy_causal_lm/greedy_causal_lm.py b/samples/python/greedy_causal_lm/greedy_causal_lm.py new file mode 100755 index 0000000000..983195c696 --- /dev/null +++ b/samples/python/greedy_causal_lm/greedy_causal_lm.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino_genai + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('model_dir') + parser.add_argument('prompt') + args = parser.parse_args() + + device = 'CPU' # GPU can be used as well + pipe = openvino_genai.LLMPipeline(args.model_dir, device) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + print(pipe.generate(args.prompt, config)) + + +if '__main__' == __name__: + main() diff --git a/samples/python/multinomial_causal_lm/README.md b/samples/python/multinomial_causal_lm/README.md new file mode 100644 index 0000000000..351773ec0d --- /dev/null +++ b/samples/python/multinomial_causal_lm/README.md @@ -0,0 +1,44 @@ +# Text generation Python multinomial_causal_lm that supports most popular models like LLaMA 3 + +This example showcases inference of text-generation Large Language Models (LLMs): `chatglm`, `LLaMA`, `Qwen` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample fearures `ov::genai::LLMPipeline` and configures it to run random sampling algorithm. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/llm-chatbot) which provides an example of LLM-powered Chatbot in Python. + +This sample also contains example implementation of an iterable streamer with bufferisation. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 +``` + +## Run + +`multinomial_causal_lm.py TinyLlama-1.1B-Chat-v1.0 "Why is the Sun yellow?"` + + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. For example, the model meta-llama/Llama-2-13b-chat-hf can benefit from being run on a dGPU. Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +## Streaming + +This Python example demonstrates custom detokenization with bufferization. The streamer receives integer tokens corresponding to each word or subword, one by one. If tokens are decoded individually, the resulting text misses necessary spaces because of detokenize(tokenize(" a")) == "a". + +To address this, the detokenizer needs a larger context. We accumulate tokens in a tokens_cache buffer and decode multiple tokens together, adding the text to the streaming queue only when a complete decoded chunk is ready. We run a separate thread to print all new elements arriving in this queue from the generation pipeline. 
Each generated chunk of text is put into a synchronized queue, ensuring that all put and get operations are thread-safe and blocked until they can proceed. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/multinomial_causal_lm/multinomial_causal_lm.py b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py new file mode 100755 index 0000000000..6300320264 --- /dev/null +++ b/samples/python/multinomial_causal_lm/multinomial_causal_lm.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino_genai +import queue +import threading + + +class IterableStreamer(openvino_genai.StreamerBase): + """ + A custom streamer class for handling token streaming and detokenization with buffering. + + Attributes: + tokenizer (Tokenizer): The tokenizer used for encoding and decoding tokens. + tokens_cache (list): A buffer to accumulate tokens for detokenization. + text_queue (Queue): A synchronized queue for storing decoded text chunks. + print_len (int): The length of the printed text to manage incremental decoding. + """ + + def __init__(self, tokenizer): + """ + Initializes the IterableStreamer with the given tokenizer. + + Args: + tokenizer (Tokenizer): The tokenizer to use for encoding and decoding tokens. + """ + super().__init__() + self.tokenizer = tokenizer + self.tokens_cache = [] + self.text_queue = queue.Queue() + self.print_len = 0 + + def __iter__(self): + """ + Returns the iterator object itself. + """ + return self + + def __next__(self): + """ + Returns the next value from the text queue. + + Returns: + str: The next decoded text chunk. + + Raises: + StopIteration: If there are no more elements in the queue. + """ + value = self.text_queue.get() # get() will be blocked until a token is available. + if value is None: + raise StopIteration + return value + + def get_stop_flag(self): + """ + Checks whether the generation process should be stopped. + + Returns: + bool: Always returns False in this implementation. + """ + return False + + def put_word(self, word: str): + """ + Puts a word into the text queue. + + Args: + word (str): The word to put into the queue. + """ + self.text_queue.put(word) + + def put(self, token_id: int) -> bool: + """ + Processes a token and manages the decoding buffer. Adds decoded text to the queue. + + Args: + token_id (int): The token_id to process. + + Returns: + bool: True if generation should be stopped, False otherwise. + """ + self.tokens_cache.append(token_id) + text = self.tokenizer.decode(self.tokens_cache) + + word = '' + if len(text) > self.print_len and '\n' == text[-1]: + # Flush the cache after the new line symbol. 
+ word = text[self.print_len:] + self.tokens_cache = [] + self.print_len = 0 + elif len(text) >= 3 and text[-3:] == chr(65533): + # Don't print incomplete text. + pass + elif len(text) > self.print_len: + # It is possible to have a shorter text after adding new token. + # Print to output only if text lengh is increaesed. + word = text[self.print_len:] + self.print_len = len(text) + self.put_word(word) + + if self.get_stop_flag(): + # When generation is stopped from streamer then end is not called, need to call it here manually. + self.end() + return True # True means stop generation + else: + return False # False means continue generation + + def end(self): + """ + Flushes residual tokens from the buffer and puts a None value in the queue to signal the end. + """ + text = self.tokenizer.decode(self.tokens_cache) + if len(text) > self.print_len: + word = text[self.print_len:] + self.put_word(word) + self.tokens_cache = [] + self.print_len = 0 + self.put_word(None) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('model_dir') + parser.add_argument('prompt') + args = parser.parse_args() + + device = 'CPU' # GPU can be used as well + pipe = openvino_genai.LLMPipeline(args.model_dir, device) + + text_print_streamer = IterableStreamer(pipe.get_tokenizer()) + def token_printer(): + # Getting next elements from iterable will be blocked until a new token is available. + for word in text_print_streamer: + print(word, end='', flush=True) + printer_thread = threading.Thread(target=token_printer, daemon=True) + printer_thread.start() + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + config.do_sample = True + config.top_p = 0.9 + config.top_k = 30 + + # Since the streamer is set, the results will be printed + # every time a new token is generated and put into the streamer queue. + pipe.generate(args.prompt, config, text_print_streamer) + printer_thread.join() + +if '__main__' == __name__: + main() diff --git a/samples/python/visual_language_chat/README.md b/samples/python/visual_language_chat/README.md new file mode 100644 index 0000000000..12ffb27f99 --- /dev/null +++ b/samples/python/visual_language_chat/README.md @@ -0,0 +1,38 @@ +# Python vlm_chat_sample that supports VLM models + +This example showcases inference of text-generation Vision Language Models (VLMs): `miniCPM-V-2_6` and other models with the same signature. The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.VLMPipeline` and configures it for the chat scenario. There is also a Jupyter [notebook](https://github.com/openvinotoolkit/openvino_notebooks/tree/latest/notebooks/minicpm-v-multimodal-chatbot) which provides an example of Visual-language assistant. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino -m openbmb/MiniCPM-V-2_6 miniCPM-V-2_6 --trust-remote-code +``` + +## Run: +[This image](https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11) can be used as a sample image. 
+ +`visual_language_chat.py ./miniCPM-V-2_6/ 319483352-d5fbbd1a-d484-415c-88cb-9986625b7b11.jpg` + + +Discrete GPUs (dGPUs) usually provide better performance compared to CPUs. It is recommended to run larger models on a dGPU with 32GB+ RAM. # TODO: examples of larger models +Modify the source code to change the device for inference to the GPU. + +See https://github.com/openvinotoolkit/openvino.genai/blob/master/src/README.md#supported-models for the list of supported models. + +### Troubleshooting + +#### Unicode characters encoding error on Windows + +Example error: +``` +UnicodeEncodeError: 'charmap' codec can't encode character '\u25aa' in position 0: character maps to <undefined> +``` + +If you encounter the error described in the example when sample is printing output to the Windows console, it is likely due to the default Windows encoding not supporting certain Unicode characters. To resolve this: +1. Enable Unicode characters for Windows cmd - open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. +2. Enable UTF-8 mode by setting environment variable `PYTHONIOENCODING="utf8"`. diff --git a/samples/python/visual_language_chat/visual_language_chat.py b/samples/python/visual_language_chat/visual_language_chat.py new file mode 100644 index 0000000000..8d908e0bf7 --- /dev/null +++ b/samples/python/visual_language_chat/visual_language_chat.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse + +import numpy as np +import openvino_genai +from PIL import Image +from openvino import Tensor + + +def streamer(subword: str) -> bool: + ''' + + Args: + subword: sub-word of the generated text. + + Returns: Return flag corresponds whether generation should be stopped. + + ''' + print(subword, end='', flush=True) + + # No value is returned as in this example we don't want to stop the generation in this method. + # "return None" will be treated the same as "return False". + + +def read_image(path: str) -> Tensor: + ''' + + Args: + path: The path to the image. + + Returns: the ov.Tensor containing the image. + + ''' + pic = Image.open(path) + image_data = np.array(pic.getdata()).reshape(1, 3, pic.size[1], pic.size[0]).astype(np.byte) + return Tensor(image_data) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('model_dir') + parser.add_argument('image_dir') + args = parser.parse_args() + + image = read_image(args.image_dir) + + device = 'CPU' # GPU can be used as well + pipe = openvino_genai.VLMPipeline(args.model_dir, device) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + prompt = input('question:\n') + pipe.generate(prompt, image=image, generation_config=config, streamer=streamer) + print('\n----------') + + while True: + try: + prompt = input('question:\n') + except EOFError: + break + pipe.generate(prompt, generation_config=config, streamer=streamer) + print('\n----------') + pipe.finish_chat() + + +if '__main__' == __name__: + main() diff --git a/samples/python/whisper_speech_recognition/README.md b/samples/python/whisper_speech_recognition/README.md new file mode 100644 index 0000000000..fec5d9194f --- /dev/null +++ b/samples/python/whisper_speech_recognition/README.md @@ -0,0 +1,48 @@ +# Whisper automatic speech recognition sample + +This example showcases inference of speech recognition Whisper Models. 
The application doesn't have many configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.WhisperPipeline` and uses an audio file in WAV format as an input source. + +## Download and convert the model and tokenizers + +The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. + +It's not required to install [../../requirements.txt](../../requirements.txt) for deployment if the model has already been exported. + +```sh +pip install --upgrade-strategy eager -r ../../requirements.txt +optimum-cli export openvino --trust-remote-code --model openai/whisper-base whisper-base +``` + +## Prepare audio file + +Prepare an audio file in WAV format with a 16 kHz sampling rate. + +## Run + +`whisper_speech_recognition.py whisper-base sample.wav` + +Output: text transcription of `sample.wav` + +Models can be downloaded from [OpenAI HuggingFace](https://huggingface.co/openai). + +Supported Models: +[openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) +[openai/whisper-tiny.en](https://huggingface.co/openai/whisper-tiny.en) +[openai/whisper-base](https://huggingface.co/openai/whisper-base) +[openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en) +[openai/whisper-small](https://huggingface.co/openai/whisper-small) +[openai/whisper-small.en](https://huggingface.co/openai/whisper-small.en) +[openai/whisper-medium](https://huggingface.co/openai/whisper-medium) +[openai/whisper-medium.en](https://huggingface.co/openai/whisper-medium.en) +[openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) + +### Troubleshooting + +#### Empty or rubbish output + +Example output: +``` +---------------- +``` + +To resolve this, ensure that the audio data has a 16 kHz sampling rate. diff --git a/samples/python/whisper_speech_recognition/whisper_speech_recognition.py b/samples/python/whisper_speech_recognition/whisper_speech_recognition.py new file mode 100755 index 0000000000..f1be0a5b36 --- /dev/null +++ b/samples/python/whisper_speech_recognition/whisper_speech_recognition.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import openvino_genai +import librosa + + +def read_wav(filepath): +    raw_speech, samplerate = librosa.load(filepath, sr=16000) +    return raw_speech.tolist() + + +def main(): +    parser = argparse.ArgumentParser() +    parser.add_argument("model_dir") +    parser.add_argument("wav_file_path") +    args = parser.parse_args() + +    raw_speech = read_wav(args.wav_file_path) + +    pipe = openvino_genai.WhisperPipeline(args.model_dir) + +    def streamer(word: str) -> bool: +        print(word, end="") +        return False + +    result = pipe.generate( +        raw_speech, +        max_new_tokens=100, +        # 'task' and 'language' parameters are supported for multilingual models only +        language="<|en|>", +        task="transcribe", +        return_timestamps=True, +        streamer=streamer, +    ) + +    print() + +    for chunk in result.chunks: +        print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}") + + +if "__main__" == __name__: +    main() diff --git a/samples/requirements.txt b/samples/requirements.txt new file mode 100644 index 0000000000..870597f06f --- /dev/null +++ b/samples/requirements.txt @@ -0,0 +1,8 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +optimum-intel @ git+https://github.com/eaidova/optimum-intel.git@ea/minicpmv +numpy<2.0.0; sys_platform == 'darwin'
+einops==0.8.0 # For Qwen
+transformers_stream_generator==0.0.5 # For Qwen
+diffusers==0.30.3
+librosa # For Whisper
+torchvision # needed for the mini-CPM export script; remove when we switch to exporting with optimum-intel.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000000..d9f3cc64db
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright (C) 2018-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+add_subdirectory(cpp)
+
+if(ENABLE_PYTHON)
+    add_subdirectory(python)
+endif()
diff --git a/src/README.md b/src/README.md
new file mode 100644
index 0000000000..73fc97d1e9
--- /dev/null
+++ b/src/README.md
@@ -0,0 +1,349 @@
+# OpenVINO™ GenAI Library
+
+OpenVINO™ GenAI is a flavor of OpenVINO™ that aims to simplify running inference of generative AI models.
+It hides the complexity of the generation process and minimizes the amount of code required.
+
+## Install OpenVINO™ GenAI
+
+> **NOTE**: Please make sure that you follow the version compatibility rules; refer to the [OpenVINO™ GenAI Dependencies](#openvino-genai-dependencies) section for more information.
+
+The OpenVINO™ GenAI flavor is available for installation via Archive and PyPI distributions.
+To install OpenVINO™ GenAI, refer to the [Install Guide](https://docs.openvino.ai/2024/get-started/install-openvino.html).
+
+To build the OpenVINO™ GenAI library from source, refer to the [Build Instructions](./docs/BUILD.md).
+
+### OpenVINO™ GenAI Dependencies
+
+OpenVINO™ GenAI depends on [OpenVINO](https://github.com/openvinotoolkit/openvino) and [OpenVINO Tokenizers](https://github.com/openvinotoolkit/openvino_tokenizers).
+
+When installing OpenVINO™ GenAI from PyPI, the same versions of OpenVINO and OpenVINO Tokenizers are used (e.g. `openvino==2024.3.0` and `openvino-tokenizers==2024.3.0.0` are installed for `openvino-genai==2024.3.0`).
+If you update one of the dependency packages (e.g. `pip install openvino --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly`), the versions may become incompatible due to ABI differences, and running OpenVINO GenAI can result in errors (e.g. `ImportError: libopenvino.so.2430: cannot open shared object file: No such file or directory`).
+With package versions in the format `<MAJOR>.<MINOR>.<PATCH>.<REVISION>`, only the `<REVISION>` part of the full version can vary while preserving ABI compatibility; changing the `<MAJOR>`, `<MINOR>` or `<PATCH>` parts might break the ABI.
+
+GenAI, Tokenizers, and OpenVINO wheels for Linux on PyPI are compiled with `_GLIBCXX_USE_CXX11_ABI=0` to cover a wider range of platforms. In contrast, C++ archive distributions for Ubuntu are compiled with `_GLIBCXX_USE_CXX11_ABI=1`. It is not possible to mix different Application Binary Interfaces (ABIs) because doing so results in a link error. This incompatibility prevents the use of, for example, OpenVINO from C++ archive distributions alongside GenAI from PyPI.
+
+If you want to try OpenVINO GenAI with different dependency versions (**not** the prebuilt packages distributed as archives or Python wheels), build the OpenVINO GenAI library from source.
+
+## Usage
+
+### Prerequisites
+
+1. Installed OpenVINO™ GenAI
+
+    > To use OpenVINO GenAI with models that are already in OpenVINO format, no additional Python dependencies are needed.
To + > convert models with optimum-cli and to run the examples, install the dependencies in [./samples/requirements.txt](./samples/requirements.txt): + ```sh + # (Optional) Clone OpenVINO GenAI repository if it does not exist + git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git + cd openvino.genai + # Install python dependencies + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt + ``` + +2. A model in OpenVINO IR format + + Download and convert a model with `optimum-cli`: + ``` sh + optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + ``` + +`LLMPipeline` is the main object used for decoding. You can construct it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and default generation configuration. + +### Python + +A simple example: +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") +print(pipe.generate("The Sun is yellow because", max_new_tokens=100)) +``` + +Calling generate with custom generation config parameters, e.g. config for grouped beam search: +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") + +result = pipe.generate("The Sun is yellow because", max_new_tokens=100, num_beam_groups=3, num_beams=15, diversity_penalty=1.5) +print(result) +``` + +output: +``` +'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color. The arrangement of carbon atoms in' +``` + +A simple chat in Python: +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path) + +config = {'max_new_tokens': 100, 'num_beam_groups': 3, 'num_beams': 15, 'diversity_penalty': 1.5} +pipe.set_generation_config(config) + +pipe.start_chat() +while True: + print('question:') + prompt = input() + if prompt == 'Stop!': + break + print(pipe(prompt, max_new_tokens=200)) +pipe.finish_chat() +``` + +Test to compare with Huggingface outputs + +### C++ + +A simple example: +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(256)); +} +``` + +Using group beam search decoding: +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + + ov::genai::GenerationConfig config; + config.max_new_tokens = 256; + config.num_beam_groups = 3; + config.num_beams = 15; + config.diversity_penalty = 1.0f; + + std::cout << pipe.generate("The Sun is yellow because", config); +} +``` + +A simple chat in C++ using grouped beam search decoding: +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string prompt; + + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + config.num_beam_groups = 3; + config.num_beams = 15; + config.diversity_penalty = 1.0f; + + pipe.start_chat(); + for (;;;) { + std::cout << "question:\n"; 
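+        // Read the user's prompt from stdin; entering "Stop!" below exits the chat loop.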
+ std::getline(std::cin, prompt); + if (prompt == "Stop!") + break; + + std::cout << "answer:\n"; + auto answer = pipe(prompt, config); + std::cout << answer << std::endl; + } + pipe.finish_chat(); +} +``` + +Streaming example with lambda function: +```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + + auto streamer = [](std::string word) { + std::cout << word << std::flush; + // Return flag corresponds whether generation should be stopped. + // false means continue generation. + return false; + }; + std::cout << pipe.generate("The Sun is yellow bacause", ov::genai::streamer(streamer), ov::genai::max_new_tokens(200)); +} +``` + +Streaming with a custom class: + +C++ template for a stremer. +```cpp +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +class CustomStreamer: public ov::genai::StreamerBase { +public: + bool put(int64_t token) { + // Custom decoding/tokens processing logic. + + // Returns a flag whether generation should be stoped, if true generation stops. + return false; + }; + + void end() { + // Custom finalization logic. + }; +}; + +int main(int argc, char* argv[]) { + CustomStreamer custom_streamer; + + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(15), ov::genai::streamer(custom_streamer)); +} +``` + +Python template for a streamer. +```py +import openvino_genai as ov_genai + +class CustomStreamer(ov_genai.StreamerBase): + def __init__(self): + super().__init__() + # Initialization logic. + + def put(self, token_id) -> bool: + # Custom decoding/tokens processing logic. + + # Returns a flag whether generation should be stoped, if true generation stops. + return False + + def end(self): + # Custom finalization logic. + +pipe = ov_genai.LLMPipeline(model_path, "CPU") +custom_streamer = CustomStreamer() + +pipe.generate("The Sun is yellow because", max_new_tokens=15, streamer=custom_streamer) +``` +For fully implemented iterable CustomStreamer please refer to [multinomial_causal_lm](https://github.com/openvinotoolkit/openvino.genai/tree/releases/2024/3/samples/python/multinomial_causal_lm/README.md) sample. + + +Continuous batching with LLMPipeline: + +To activate continuous batching please provide additional property to LLMPipeline config: ov::genai::scheduler_config. This property contains struct SchedulerConfig. +```cpp +#include "openvino/genai/llm_pipeline.hpp" + +int main(int argc, char* argv[]) { + ov::genai::SchedulerConfig scheduler_config; + // fill other fields in scheduler_config with custom data if required + scheduler_config.cache_size = 1; // minimal possible KV cache size in GB, adjust as required + + ov::genai::LLMPipeline pipe(model_path, "CPU", ov::genai::scheduler_config(scheduler_config)); +} +``` + +### Performance Metrics + +`openvino_genai.PerfMetrics` (referred as `PerfMetrics` for simplicity) is a structure that holds performance metrics for each generate call. 
`PerfMetrics` holds fields with mean and standard deviations for the following metrics:
+- Time To the First Token (TTFT), ms
+- Time per Output Token (TPOT), ms/token
+- Generate total duration, ms
+- Tokenization duration, ms
+- Detokenization duration, ms
+- Throughput, tokens/s
+
+and:
+- Load time, ms
+- Number of generated tokens
+- Number of tokens in the input prompt
+
+Performance metrics are stored in the `perf_metrics` field of both `DecodedResults` and `EncodedResults`. In addition to the fields mentioned above, `PerfMetrics` has a member `raw_metrics` of type `openvino_genai.RawPerfMetrics` (referred to as `RawPerfMetrics` for simplicity) that contains raw values for the durations of each batch of new token generation, tokenization durations, detokenization durations, and more. These raw metrics are accessible if you wish to calculate your own statistical values such as median or percentiles. However, since mean and standard deviation values are usually sufficient, we will focus on `PerfMetrics`.
+
+```python
+import openvino_genai as ov_genai
+pipe = ov_genai.LLMPipeline(model_path, "CPU")
+result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
+perf_metrics = result.perf_metrics
+
+print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f}')
+print(f'TTFT: {perf_metrics.get_ttft().mean:.2f} ms')
+print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token')
+print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s')
+```
+
+```cpp
+#include "openvino/genai/llm_pipeline.hpp"
+#include <iomanip>
+#include <iostream>
+
+int main(int argc, char* argv[]) {
+    std::string model_path = argv[1];
+    ov::genai::LLMPipeline pipe(model_path, "CPU");
+    auto result = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20));
+    auto perf_metrics = result.perf_metrics;
+
+    std::cout << std::fixed << std::setprecision(2);
+    std::cout << "Generate duration: " << perf_metrics.get_generate_duration().mean << " ms" << std::endl;
+    std::cout << "TTFT: " << perf_metrics.get_ttft().mean << " ms" << std::endl;
+    std::cout << "TPOT: " << perf_metrics.get_tpot().mean << " ms/token " << std::endl;
+    std::cout << "Throughput: " << perf_metrics.get_throughput().mean << " tokens/s" << std::endl;
+}
+```
+output:
+```sh
+Generate duration: 76.28 ms
+TTFT: 42.58 ms
+TPOT: 3.80 ms/token
+```
+
+> **Note**: If the input prompt is just a string, the generate function returns only a string without `perf_metrics`. To obtain `perf_metrics`, provide the prompt as a list with at least one element or call generate with encoded inputs.
+
+Several `perf_metrics` objects can be added to each other. In that case `raw_metrics` are concatenated and the mean/std values are recalculated.
This accumulates statistics from several `generate()` calls.
+
+```cpp
+#include "openvino/genai/llm_pipeline.hpp"
+#include <iomanip>
+#include <iostream>
+
+int main(int argc, char* argv[]) {
+    std::string model_path = argv[1];
+    ov::genai::LLMPipeline pipe(model_path, "CPU");
+    auto result_1 = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20));
+    auto result_2 = pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(20));
+    auto perf_metrics = result_1.perf_metrics + result_2.perf_metrics;
+
+    std::cout << std::fixed << std::setprecision(2);
+    std::cout << "Generate duration: " << perf_metrics.get_generate_duration().mean << " ms" << std::endl;
+    std::cout << "TTFT: " << perf_metrics.get_ttft().mean << " ms" << std::endl;
+    std::cout << "TPOT: " << perf_metrics.get_tpot().mean << " ms/token " << std::endl;
+    std::cout << "Throughput: " << perf_metrics.get_throughput().mean << " tokens/s" << std::endl;
+}
+```
+
+```python
+import openvino_genai as ov_genai
+pipe = ov_genai.LLMPipeline(model_path, "CPU")
+res_1 = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
+res_2 = pipe.generate(["Why Sky is blue because"], max_new_tokens=20)
+perf_metrics = res_1.perf_metrics + res_2.perf_metrics
+
+print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f}')
+print(f'TTFT: {perf_metrics.get_ttft().mean:.2f} ms')
+print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token')
+print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s')
+```
+
+For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/benchmark_genai/README.md) samples.
+
+## How It Works
+
+For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](./docs/HOW_IT_WORKS.md).
+
+## Supported Models
+
+For a list of supported models, refer to the [Supported Models Section](./docs/SUPPORTED_MODELS.md).
diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt
new file mode 100644
index 0000000000..ae40818ed8
--- /dev/null
+++ b/src/cpp/CMakeLists.txt
@@ -0,0 +1,139 @@
+# Copyright (C) 2018-2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+
+# Dependencies
+
+include(FetchContent)
+
+if(NOT TARGET nlohmann_json)
+    FetchContent_Declare(nlohmann_json
+        URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz
+        URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406)
+    FetchContent_MakeAvailable(nlohmann_json)
+endif()
+
+function(ov_genai_build_jinja2cpp)
+    FetchContent_Declare(jinja2cpp
+        URL https://github.com/ilya-lavrenov/Jinja2Cpp/archive/04073b62ec950eab6cdcb2c563c1c9bb7698f1ea.tar.gz
+        URL_HASH SHA256=9f2a346eec91a6a381fe8fd631e9c952fe7087882bbca7f0e4e42e75e680fc1b)
+
+    FetchContent_GetProperties(jinja2cpp)
+    if(NOT jinja2cpp_POPULATED)
+        FetchContent_Populate(jinja2cpp)
+
+        set(BUILD_SHARED_LIBS OFF)
+        set(JINJA2CPP_INSTALL OFF CACHE BOOL "")
+        set(JINJA2CPP_CXX_STANDARD 17 CACHE STRING "")
+        set(JINJA2CPP_BUILD_SHARED OFF CACHE BOOL "")
+        set(JINJA2CPP_USE_REGEX "std" CACHE STRING "")
+        set(JINJA2CPP_WITH_JSON_BINDINGS "none" CACHE STRING "")
+        set(JINJA2CPP_STRICT_WARNINGS OFF CACHE BOOL "")
+        set(JINJA2CPP_PIC ON CACHE BOOL "")
+
+        # options for Jinja2Cpp dependencies
+        option(RAPIDJSON_BUILD_DOC "Build rapidjson documentation."
OFF) + + add_subdirectory("${jinja2cpp_SOURCE_DIR}" "${jinja2cpp_BINARY_DIR}" EXCLUDE_FROM_ALL) + + if(CMAKE_COMPILER_IS_GNUCXX OR OV_COMPILER_IS_CLANG OR (OV_COMPILER_IS_INTEL_LLVM AND UNIX)) + target_compile_options(jinja2cpp PRIVATE -Wno-undef) + endif() + if(SUGGEST_OVERRIDE_SUPPORTED) + target_compile_options(jinja2cpp PRIVATE -Wno-suggest-override) + endif() + endif() +endfunction() + +FetchContent_Declare(safetensors.h + URL https://github.com/hsnyder/safetensors.h/archive/974a85d7dfd6e010558353226638bb26d6b9d756.tar.gz + URL_HASH SHA256=9aaf5961609601cf9aaa96582a207bce7c6e5fbf57ed2cc669bb7bde6a937d4b) +FetchContent_MakeAvailable(safetensors.h) + +ov_genai_build_jinja2cpp() + +# Library + +file(GLOB_RECURSE SOURCE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/src/*.c") + +set(TARGET_NAME openvino_genai) +add_library(${TARGET_NAME} SHARED ${SOURCE_FILES}) +if(TARGET openvino_tokenizers) + add_dependencies(${TARGET_NAME} openvino_tokenizers) +endif() +add_library(openvino::genai ALIAS ${TARGET_NAME}) + +target_include_directories(${TARGET_NAME} + PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" "$<INSTALL_INTERFACE:runtime/include>" + PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src") + +target_include_directories(${TARGET_NAME} SYSTEM PRIVATE "${safetensors.h_SOURCE_DIR}") + +target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json jinja2cpp) + +target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17) + +set_target_properties(${TARGET_NAME} PROPERTIES + EXPORT_NAME genai + ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" +) +# Extract two last digits from OpenVINOGenAI_VERSION_MAJOR because SOVERSION can only contain up to 4 symbols. 
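+# For example, with a hypothetical version 2024.4.0, MAJOR_SUFFIX is "24" and the composed suffix/SOVERSION below is "2440".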
+string(REGEX MATCH [=[[0-9][0-9]$]=] MAJOR_SUFFIX ${OpenVINOGenAI_VERSION_MAJOR}) +if(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME AND LINUX) + # Don't pack symlinks but append version hash to the name for wheel + set_target_properties(${TARGET_NAME} PROPERTIES + SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}.${MAJOR_SUFFIX}${OpenVINOGenAI_VERSION_MINOR}${OpenVINOGenAI_VERSION_PATCH}) +elseif(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME AND APPLE) + set_target_properties(${TARGET_NAME} PROPERTIES + SUFFIX .${MAJOR_SUFFIX}${OpenVINOGenAI_VERSION_MINOR}${OpenVINOGenAI_VERSION_PATCH}${CMAKE_SHARED_LIBRARY_SUFFIX}) +else() + set_target_properties(${TARGET_NAME} PROPERTIES + VERSION ${OpenVINOGenAI_VERSION} + SOVERSION ${MAJOR_SUFFIX}${OpenVINOGenAI_VERSION_MINOR}${OpenVINOGenAI_VERSION_PATCH}) +endif() + +# - Windows: `<openvino_dir>\runtime\bin\intel64\Release\` +# - MacOS_x86: `<openvino_dir>/runtime/lib/intel64/Release` +# - MacOS_arm64: `<openvino_dir>/runtime/lib/arm64/Release/` +# - Linux_x86: `<openvino_dir>/runtime/lib/intel64/` +# - Linux_arm64: `<openvino_dir>/runtime/lib/aarch64/` +string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" ARCH_DIR) +if(ARCH_DIR MATCHES "amd64.*|x86_64.*|AMD64.*") + set(ARCH_DIR intel64) +elseif(ARCH_DIR MATCHES "^(arm64.*|aarch64.*|AARCH64.*|ARM64.*)") + if(APPLE) + set(ARCH_DIR "arm64") + else() + set(ARCH_DIR "aarch64") + endif() +elseif(ARCH_DIR STREQUAL "x86_64" OR ARCH_DIR STREQUAL "amd64" # Windows detects Intel's 64-bit CPU as AMD64 + OR CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64") + set(ARCH_DIR intel64) +endif() +if(MSVC OR APPLE) + set(ARCH_DIR ${ARCH_DIR}/${CMAKE_BUILD_TYPE}) +endif() + +install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets + LIBRARY DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai + NAMELINK_COMPONENT core_genai_dev + ARCHIVE DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai_dev + RUNTIME DESTINATION runtime/bin/${ARCH_DIR} COMPONENT core_genai + INCLUDES DESTINATION runtime/include) + +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ + DESTINATION runtime/include COMPONENT core_genai_dev) +install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake + NAMESPACE openvino:: DESTINATION runtime/cmake + COMPONENT core_genai_dev) + +include(CMakePackageConfigHelpers) +configure_package_config_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/OpenVINOGenAIConfig.cmake.in" + "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" INSTALL_DESTINATION runtime/cmake) +write_basic_package_version_file("${CMAKE_BINARY_DIR}/OpenVINOGenAIConfigVersion.cmake" + VERSION ${OpenVINOGenAI_VERSION} COMPATIBILITY AnyNewerVersion) +install(FILES "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfigVersion.cmake" + DESTINATION runtime/cmake COMPONENT core_genai_dev) +export(EXPORT OpenVINOGenAITargets FILE "${CMAKE_BINARY_DIR}/OpenVINOGenAITargets.cmake" NAMESPACE openvino::) diff --git a/src/cpp/include/openvino/genai/cache_eviction.hpp b/src/cpp/include/openvino/genai/cache_eviction.hpp new file mode 100644 index 0000000000..b8312361eb --- /dev/null +++ b/src/cpp/include/openvino/genai/cache_eviction.hpp @@ -0,0 +1,83 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <cstddef> +#include "openvino/openvino.hpp" + +namespace ov::genai { + /** + * @brief Represents the mode of per-token score aggregation when determining least important tokens for eviction + * from cache + */ + enum class AggregationMode { + SUM, /**< In this mode the importance scores of each 
token will be summed after each step of generation */ + NORM_SUM /**< Same as SUM, but the importance scores are additionally divided by the lifetime (in tokens generated) + * of a given token in cache */ + }; + + /** + * @brief Configuration struct for the cache eviction algorithm. + */ + class CacheEvictionConfig { + public: + CacheEvictionConfig() {}; + CacheEvictionConfig(size_t start_size, size_t recent_size, size_t max_cache_size, AggregationMode aggregation_mode_) : aggregation_mode(aggregation_mode_), m_start_size(start_size), m_recent_size(recent_size), m_max_cache_size(max_cache_size) { + OPENVINO_ASSERT(start_size, "CacheEvictionConfig.start_size must be non-zero"); + OPENVINO_ASSERT(recent_size, "CacheEvictionConfig.recent_size must be non-zero"); + OPENVINO_ASSERT(max_cache_size, "CacheEvictionConfig.max_cache_size must be non-zero"); + + OPENVINO_ASSERT(max_cache_size > (start_size + recent_size), + "CacheEvictionConfig.max_cache_size must be larger than CacheEvictionConfig.start_size + CacheEvictionConfig.recent_size"); + m_evictable_size = m_max_cache_size - m_start_size - m_recent_size; + + } + + /** @return Number of tokens between the "start" and "recent" areas of KV cache that + * will be considered for eviction. */ + std::size_t get_start_size() const { + return m_start_size; + } + + /** @return Number of tokens between the "start" and "recent" areas of KV cache that + * will be considered for eviction. */ + std::size_t get_recent_size() const { + return m_recent_size; + } + + /** @return Number of tokens between the "start" and "recent" areas of KV cache that + * will be considered for eviction. */ + std::size_t get_max_cache_size() const { + return m_max_cache_size; + } + + /** @return Number of tokens between the "start" and "recent" areas of KV cache that + * will be considered for eviction. */ + std::size_t get_evictable_size() const { + return m_evictable_size; + } + + /** The mode used to compute the importance of tokens for eviction */ + AggregationMode aggregation_mode = AggregationMode::NORM_SUM; + private: + /** Number of tokens in the *beginning* of KV cache that should be retained + * in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for + * this pipeline.*/ + std::size_t m_start_size = 32; + + /** Number of tokens in the *end* of KV cache that should be retained + * in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for + * this pipeline.*/ + std::size_t m_recent_size = 128; + + /** + * @brief Maximum cache size (in tokens) that can be occupied by a sequence with cache eviction enabled. + * Actual occupied size may differ from this by no larger than (block_size) tokens. + * Eviction area is computed from this size and the "start"/"recent" area sizes. + * @return Total cache size (in tokens) allowed to be occupied by a sequence. 
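+         * For example, with the defaults below (start_size = 32, recent_size = 128, max_cache_size = 672),
+         * the evictable area is 672 - 32 - 128 = 512 tokens, matching the default m_evictable_size.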
+ */ + std::size_t m_max_cache_size = 672; + std::size_t m_evictable_size = 512; + }; +} diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp new file mode 100644 index 0000000000..efe4bc5e90 --- /dev/null +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -0,0 +1,113 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <memory> +#include <openvino/openvino.hpp> + +#include "openvino/genai/scheduler_config.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/generation_config.hpp" +#include "openvino/genai/generation_handle.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/visibility.hpp" +#include "cache_eviction.hpp" + +namespace ov::genai { + +/** + * @brief Contains general pipeline metrics, either aggregated throughout the lifetime of the generation pipeline + * or measured at the previous generation step. + */ +struct PipelineMetrics { + /** + * Number of requests to be processed by the pipeline. + */ + size_t requests = 0; + + /** + * Number of requests that were scheduled for processing at the previous step of the pipeline. + */ + size_t scheduled_requests = 0; + + /** + * Percentage of KV cache usage in the last generation step. + */ + float cache_usage = 0.0; + + /** + * Max KV cache usage during the lifetime of the pipeline in % + */ + float max_cache_usage = 0.0; + + /** + * Running average of the KV cache usage during the lifetime of the pipeline, with max window size of 1000 steps + */ + float avg_cache_usage = 0.0; +}; + +class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { + class ImplInterface; + class ContinuousBatchingImpl; + std::shared_ptr<ImplInterface> m_impl; + +public: + ContinuousBatchingPipeline(const std::string& models_path, + const SchedulerConfig& scheduler_config, + const std::string& device = "CPU", + const ov::AnyMap& llm_plugin_config = {}, + const ov::AnyMap& tokenizer_plugin_config = {}); + + /** + * @brief Constructs a ContinuousBatchingPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs. + * + * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json + * @param scheduler_config + * @param tokenizer manually initialized ov::genai::Tokenizer + * @param device optional device + * @param plugin_config optional plugin_config + */ + ContinuousBatchingPipeline( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device="CPU", + const ov::AnyMap& plugin_config={} + ); + + ov::genai::Tokenizer get_tokenizer(); + + ov::genai::GenerationConfig get_config() const; + + /** + * Allows to get the current pipeline metrics. + * @return The struct with pipeline metrics for the previous generation step. 
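+     * A minimal usage sketch: `auto metrics = pipeline.get_metrics();` and then read fields such as
+     * `metrics.cache_usage` declared in PipelineMetrics above.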
+ */ + ov::genai::PipelineMetrics get_metrics() const; + + GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params); + GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params); + + void step(); + + bool has_non_finished_requests(); + + // more high level interface, which can process multiple prompts in continuous batching manner + std::vector<EncodedGenerationResult> generate(const std::vector<ov::Tensor>& input_ids, const std::vector<ov::genai::GenerationConfig>& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{}); + std::vector<GenerationResult> generate(const std::vector<std::string>& prompts, const std::vector<ov::genai::GenerationConfig>& sampling_params, const ov::genai::StreamerVariant& streamer=std::monostate{}); + + /** + * @brief start chat with keeping history in kv cache. + * + * @param system_message optional system message. + */ + void start_chat(const std::string& system_message = ""); + + /** + * @brief finish chat and clear kv cache. + */ + void finish_chat(); +}; +} diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp new file mode 100644 index 0000000000..a1244d3d75 --- /dev/null +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -0,0 +1,169 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <limits> +#include <variant> +#include <string> + +#include "openvino/runtime/compiled_model.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "lora_adapter.hpp" + +namespace ov { +namespace genai { + +/** + * @brief controls the stopping condition for grouped beam search. The following values are possible: + * "EARLY" stops as soon as there are `num_beams` complete candidates. + "HEURISTIC" stops when is it unlikely to find better candidates. + "NEVER" stops when there cannot be better candidates. + */ +enum class StopCriteria { EARLY, HEURISTIC, NEVER }; + +/** + * @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + * and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + * be used while greedy and beam search parameters will not affect decoding at all. + * + * Generic parameters: + * @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + * `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. + * @param max_new_tokens the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + * @param ignore_eos if set to true, then generation will not stop even if <eos> token is met. + * @param eos_token_id token_id of <eos> (end of sentence) + * @param min_new_tokens set 0 probability for eos_token_id for the first eos_token_id generated tokens. Ignored for non continuous batching. + * + * @param stop_strings vector of strings that will cause pipeline to stop generating further tokens. Ignored for non continuous batching. 
+ * @param include_stop_str_in_output if set to true, the matched stop string is included in the generation output (default: false)
+ * @param stop_token_ids vector of token ids that will cause the pipeline to stop generating further tokens. Ignored for non continuous batching.
+ *
+ * Beam search specific parameters:
+ * @param num_beams number of beams for beam search. 1 disables beam search.
+ * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
+ * @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from another group at a
+ *        particular time. See https://arxiv.org/pdf/1909.05858.
+ * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
+ *        the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
+ *        likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
+ *        `length_penalty` < 0.0 encourages shorter sequences.
+ * @param num_return_sequences the number of sequences to return for grouped beam search decoding per batch element. num_return_sequences must be less than or equal to num_beams.
+ * @param no_repeat_ngram_size if set to a value > 0, all n-grams of that size can only occur once.
+ * @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values:
+ *        "EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "HEURISTIC", where a
+ *        heuristic is applied and generation stops when it is very unlikely that better candidates will be found;
+ *        "NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
+ *
+ * Random sampling parameters:
+ * @param temperature the value used to modulate token probabilities for random sampling.
+ * @param top_p if set to a float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
+ * @param top_k the number of highest probability vocabulary tokens to keep for top-k filtering.
+ * @param do_sample whether or not to use multinomial random sampling.
+ * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
+ * @param presence_penalty reduces absolute log prob if the token was generated at least once. Ignored for non continuous batching.
+ * @param frequency_penalty reduces absolute log prob as many times as the token was generated. Ignored for non continuous batching.
+ * @param rng_seed initializes the random generator. Ignored for non continuous batching.
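+ *
+ * Example (an illustrative sketch): for reproducible multinomial sampling one might set
+ * do_sample = true, temperature = 0.7f, top_p = 0.9f and rng_seed = 42 on a GenerationConfig instance.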
+ */ + +class OPENVINO_GENAI_EXPORTS GenerationConfig { + +public: + GenerationConfig() = default; + explicit GenerationConfig(const std::string& json_path); + + // Generic + size_t max_new_tokens = SIZE_MAX; + size_t max_length = SIZE_MAX; + bool ignore_eos = false; + size_t min_new_tokens = 0; + + std::set<std::string> stop_strings; + // Default setting in vLLM (and OpenAI API) is not to include stop string in the output + bool include_stop_str_in_output = false; + std::set<int64_t> stop_token_ids; + + // Beam search specific + size_t num_beam_groups = 1; + size_t num_beams = 1; + float diversity_penalty = 1.0f; + float length_penalty = 1.0f; + size_t num_return_sequences = 1; + size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max(); + StopCriteria stop_criteria = StopCriteria::HEURISTIC; + + // Multinomial + float temperature = 1.0f; + float top_p = 1.0f; + size_t top_k = std::numeric_limits<size_t>::max(); + bool do_sample = false; + float repetition_penalty = 1.0f; + float presence_penalty = 0.0; + float frequency_penalty = 0.0f; + size_t rng_seed = 0; + + // EOS special token + int64_t eos_token_id = -1; + + // Optional adapters + AdapterConfig adapters; + + /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. + * Otherwise verifies eos_token_id == tokenizer_eos_token_id. + */ + void set_eos_token_id(size_t tokenizer_eos_token_id); + size_t get_max_new_tokens(size_t prompt_length = 0) const; + + bool is_greedy_decoding() const; + bool is_beam_search() const; + bool is_multinomial() const; + void update_generation_config(const ov::AnyMap& config_map); + + template <typename... Properties> + util::EnableIfAllStringAny<void, Properties...> update_generation_config(Properties&&... properties) { + return update_generation_config(AnyMap{std::forward<Properties>(properties)...}); + } + + /// @brief checks that are no conflicting parameters, e.g. do_sample=true and num_beams > 1. + /// @throws Exception if config is invalid. + void validate() const; +}; + +/* + * utils that allow to use generate and operator() in the following way: + * pipe.generate(input_ids, ov::genai::max_new_tokens(200), ov::genai::temperature(1.0f),...) + * pipe(text, ov::genai::max_new_tokens(200), ov::genai::temperature(1.0f),...) 
+*/ +static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"}; +static constexpr ov::Property<size_t> max_length{"max_length"}; +static constexpr ov::Property<bool> ignore_eos{"ignore_eos"}; +static constexpr ov::Property<size_t> min_new_tokens{"min_new_tokens"}; +static constexpr ov::Property<std::vector<std::string>> stop_strings{"stop_strings"}; +static constexpr ov::Property<bool> include_stop_str_in_output{"include_stop_str_in_output"}; +static constexpr ov::Property<std::vector<std::vector<int64_t>>> stop_token_ids{"stop_token_ids"}; + +static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"}; +static constexpr ov::Property<size_t> num_beams{"num_beams"}; +static constexpr ov::Property<float> diversity_penalty{"diversity_penalty"}; +static constexpr ov::Property<float> length_penalty{"length_penalty"}; +static constexpr ov::Property<size_t> num_return_sequences{"num_return_sequences"}; +static constexpr ov::Property<size_t> no_repeat_ngram_size{"no_repeat_ngram_size"}; +static constexpr ov::Property<StopCriteria> stop_criteria{"stop_criteria"}; + +static constexpr ov::Property<float> temperature{"temperature"}; +static constexpr ov::Property<float> top_p{"top_p"}; +static constexpr ov::Property<int> top_k{"top_k"}; +static constexpr ov::Property<bool> do_sample{"do_sample"}; +static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"}; +static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"}; +static constexpr ov::Property<float> presence_penalty{"presence_penalty"}; +static constexpr ov::Property<float> frequency_penalty{"frequency_penalty"}; +static constexpr ov::Property<size_t> rng_seed{"rng_seed"}; + +// Predefined Configs +OPENVINO_GENAI_EXPORTS GenerationConfig beam_search(); +OPENVINO_GENAI_EXPORTS GenerationConfig greedy(); +OPENVINO_GENAI_EXPORTS GenerationConfig multinomial(); +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp new file mode 100644 index 0000000000..7ff172e645 --- /dev/null +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -0,0 +1,97 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <memory> +#include <unordered_map> + +#include "openvino/genai/generation_config.hpp" +#include "openvino/genai/visibility.hpp" + +namespace ov::genai { +enum class GenerationStatus { + RUNNING = 0, // Default status for ongoing generation + FINISHED = 1, // Status set when generation has been finished + IGNORED = 2, // Status set when generation run into out-of-memory condition and could not be continued + DROPPED_BY_PIPELINE = 3, // Currently not used, TODO: implement abort functionality + DROPPED_BY_HANDLE = 4 // Status set when generation handle is dropped +}; + +struct EncodedGenerationResult { + // request ID - obsolete when handle API is approved as handle will connect results with prompts. + uint64_t m_request_id; + + // in a generic case we have multiple generation results per initial prompt + // depending on sampling parameters (e.g. 
beam search or parallel sampling) + std::vector<std::vector<int64_t>> m_generation_ids; + // scores + std::vector<float> m_scores; + + // Status of generation + GenerationStatus m_status = GenerationStatus::RUNNING; +}; + +enum class GenerationFinishReason { + NONE = 0, // Default value, when generation is not yet finished + STOP = 1, // Generation finished naturally, by reaching end of sequence token + LENGTH = 2 // Generation finished by reaching max_new_tokens limit +}; + +struct GenerationResult { + // request ID - obsolete when handle API is approved as handle will connect results with prompts. + uint64_t m_request_id; + + // in a generic case we have multiple generation results per initial prompt + // depending on sampling parameters (e.g. beam search or parallel sampling) + std::vector<std::string> m_generation_ids; + // scores + std::vector<float> m_scores; + + // Status of generation + GenerationStatus m_status = GenerationStatus::RUNNING; +}; + +struct GenerationOutput { + std::vector<int64_t> generated_ids; + std::vector<float> generated_log_probs; + float score; + GenerationFinishReason finish_reason; +}; + +using GenerationOutputs = std::unordered_map<uint64_t, GenerationOutput>; + +class GenerationStream; + +class OPENVINO_GENAI_EXPORTS GenerationHandleImpl { + std::shared_ptr<GenerationStream> m_generation_stream; + ov::genai::GenerationConfig m_sampling_params; + + bool is_dropped(); + +public: + GenerationHandleImpl(std::shared_ptr<GenerationStream> generation_stream, const ov::genai::GenerationConfig& sampling_params) : + m_generation_stream(std::move(generation_stream)), + m_sampling_params(sampling_params) {}; + + ~GenerationHandleImpl(); + + // There can be only one handle for a request + GenerationHandleImpl(const GenerationHandleImpl&) = delete; + GenerationHandleImpl& operator=(const GenerationHandleImpl&) = delete; + + GenerationStatus get_status(); + + bool can_read(); + + void drop(); + + GenerationOutputs back(); + // Reads result of a generation for single iteration + GenerationOutputs read(); + // Reads all generated tokens for all sequences + std::vector<GenerationOutput> read_all(); +}; + +using GenerationHandle = std::shared_ptr<GenerationHandleImpl>; +} diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp new file mode 100644 index 0000000000..b21fb43bdb --- /dev/null +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -0,0 +1,276 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <optional> +#include <variant> +#include <chrono> + +#include "openvino/core/any.hpp" +#include "openvino/genai/generation_config.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/perf_metrics.hpp" +#include "openvino/genai/scheduler_config.hpp" + +namespace ov { +namespace genai { + +// Return flag corresponds whether generation should be stopped: false means continue generation, true means stop. +using StreamerVariant = std::variant<std::function<bool(std::string)>, std::shared_ptr<StreamerBase>, std::monostate>; +using OptionalGenerationConfig = std::optional<GenerationConfig>; +using EncodedInputs = std::variant<ov::Tensor, TokenizedInputs>; +using StringInputs = std::variant<std::string, std::vector<std::string>>; + +/** +* @brief scheduler_config property serves to activate continuous batching pipeline. +* Create SchedulerConfig and fill it with sutable values. 
Copy or move it to plugin_config. +* And create LLMPipeline instance with this config. +*/ +static constexpr ov::Property<SchedulerConfig> scheduler_config{"scheduler_config"}; + +/** +* @brief Structure to store resulting batched tokens and scores for each batch sequence. +* The first num_return_sequences elements correspond to the first batch element. +* In the case if results decoded with beam search and random sampling scores contain +* sum of logarithmic probabilities for each token in the sequence. In the case +* of greedy decoding scores are filled with zeros. +* +* @param tokens sequence of resulting tokens +* @param scores sum of logarithmic probabilities of all tokens in the sequence +* @param metrics performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics +*/ +class EncodedResults { +public: + std::vector<std::vector<int64_t>> tokens; + std::vector<float> scores; + PerfMetrics perf_metrics; +}; + +/** +* @brief Structure to store resulting batched text outputs and scores for each batch +* The first num_return_sequences elements correspond to the first batch element. +* +* @param texts vector of resulting sequences +* @param scores scores for each sequence +* @param metrics performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics +*/ +class DecodedResults { +public: + std::vector<std::string> texts; + std::vector<float> scores; + PerfMetrics perf_metrics; + + // @brief Convert DecodedResults to a string. + operator std::string() const { + std::stringstream ss; + ss << *this; + return ss.str(); + } + + // @brief Convert DecodedResults to a single string. + // @return std::string containing the texts from the DecodedResults object. + operator std::vector<std::string>() const { + return texts; + } + + // @brief Overloads operator<< to enhance output the contents of DecodedResults. + // @return A reference to the output stream with the concatenated texts. + friend std::ostream& operator<<(std::ostream& os, const DecodedResults& dr) { + OPENVINO_ASSERT( + dr.scores.size() == dr.texts.size(), + "The number of scores and texts doesn't match in DecodedResults." + ); + if (dr.texts.empty()) { + return os; + } + if (dr.texts.size() == 1) { + os << dr.texts[0]; + return os; + } + for (size_t i = 0; i < dr.texts.size() - 1; ++i) { + os << std::to_string(dr.scores[i]) << ": " << dr.texts[i] << '\n'; + } + return os << std::to_string(dr.scores.back()) << ": " << dr.texts.back(); + } +}; + +class LLMPipelineImplBase; + +/** +* @brief This class is used for generation with LLMs. + */ +class OPENVINO_GENAI_EXPORTS LLMPipeline { +public: + /** + * @brief Constructs an LLMPipeline from xml/bin files, tokenizers and configuration in the same dir. + * + * @param model_path Path to the dir model xml/bin files, tokenizers and generation_configs.json + * @param device optional device + * @param plugin_config optional plugin_config + * Add ov::genai::scheduler_config property to plugin_config to create continuous batching pipeline. + * Add ov::genai::adapters property to plugin_config to register LoRA adapters. + */ + LLMPipeline( + const std::string& path, + const std::string& device="CPU", + const ov::AnyMap& plugin_config={} + ); + + /** + * @brief Constructs an LLMPipeline from xml/bin files, tokenizers and configuration in the same dir. + * Accepts arbitrary list of optional properties. 
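+     * For example (a sketch): `ov::genai::LLMPipeline pipe(model_path, "CPU", ov::genai::scheduler_config(scheduler_config));`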
+ * + * @param model_path Path to the dir model xml/bin files, tokenizers and generation_config.json + * @param device optional device + * @param properties optional plugin properties, ov::genai::adapters property for LoRA adapters and + * ov::genai::scheduler_config property to create continuous batching pipeline. Properties can be + * specified in any order. + */ + template <typename... Properties, typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true> + LLMPipeline( + const std::string& path, + const std::string& device="CPU", + Properties&&... properties) + : LLMPipeline(path, device, AnyMap{std::forward<Properties>(properties)...}) { + } + + /** + * @brief Constructs an LLMPipeline from already existing infer InferRequest and Tokenizer + * + * @param request infer request of the model + * @param tokenizer initialized Tokenizer + * @param generation_config optional generation_config, be default will be initialized for greedy decoding + */ + LLMPipeline( + const ov::InferRequest& request, + const ov::genai::Tokenizer& tokenizer, + OptionalGenerationConfig generation_config=std::nullopt + ); + + /** + * @brief Constructs a LLMPipeline when ov::genai::Tokenizer is initialized manually using file from the different dirs. + * + * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json + * @param tokenizer manually initialized ov::genai::Tokenizer + * @param device optional device + * @param plugin_config optional plugin_config + * Add ov::genai::scheduler_config property to plugin_config to create continuous batching pipeline + */ + LLMPipeline( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device="CPU", + const ov::AnyMap& plugin_config = {} + ); + + ~LLMPipeline(); + + /** + * @brief High level generate that receives prompts as a string or a vector of strings and returns decoded output. + * + * @param inputs input prompt or a vector of prompts + * @param generation_config optional GenerationConfig + * @param streamer optional streamer + * @return DecodedResults decoded resulting text + */ + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config=std::nullopt, + StreamerVariant streamer=std::monostate() + ); + + /** + * @brief High level generate that receives prompts as a string or a vector of strings and returns decoded output. + * properties can be in any order pipe.generate(..., ov::genai::max_new_tokens(100), ov::genai::streamer(lambda_func)). + * + * @param inputs input prompt or a vector of prompts + * @param properties properties + * @return DecodedResults decoded resulting text + */ + template <typename... Properties> + util::EnableIfAllStringAny<DecodedResults, Properties...> generate( + StringInputs inputs, + Properties&&... properties) { + return generate(inputs, AnyMap{std::forward<Properties>(properties)...}); + } + DecodedResults generate(StringInputs inputs, const ov::AnyMap& config_map); + + + DecodedResults operator()( + StringInputs inputs, + OptionalGenerationConfig generation_config=std::nullopt, + StreamerVariant streamer=std::monostate() + ) { + return generate(inputs, generation_config, streamer); + } + + template <typename... Properties> + util::EnableIfAllStringAny<DecodedResults, Properties...> operator()( + StringInputs inputs, + Properties&&... 
properties) { + return generate(inputs, AnyMap{std::forward<Properties>(properties)...}); + } + + /** + * @brief Low level generate to be called with already encoded input_ids tokens. + * Streamer cannot be used for multibatch inputs. + * + * @param input_ids or pair of (input_ids, attentino_mask) encoded input prompt tokens + * @param generation_config optional GenerationConfig + * @param streamer optional streamer + * @return EncodedResults a structure with resulting tokens and scores + * @throws Exception if the stremaer is set for inputs_ids with multiple batches + */ + EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config=std::nullopt, + StreamerVariant streamer=std::monostate() + ); + + /** + * @brief Low level generate to be called with already encoded input_ids tokens. + * Streamer cannot be used for multibatch inputs. + * + * @param input_ids or pair of (input_ids, attentino_mask) encoded input prompt tokens + * @param generation config params + * @return EncodedResults a structure with resulting tokens and scores + * @throws Exception if the stremaer is set for inputs_ids with multiple batches + */ + template <typename... Properties> + util::EnableIfAllStringAny<EncodedResults, Properties...> generate( + const EncodedInputs& inputs, + Properties&&... properties) { + return generate(inputs, AnyMap{std::forward<Properties>(properties)...}); + } + EncodedResults generate(const EncodedInputs& inputs, const ov::AnyMap& config_map); + + ov::genai::Tokenizer get_tokenizer(); + GenerationConfig get_generation_config() const; + void set_generation_config(const GenerationConfig& config); + + + /** + * @brief start chat with keeping history in kv cache. + * Turns on keeping KV cache between generate calls and automatic applying of chat templates. + * In case if beam search is used, KV cache is kept fot the generated sequence with maximal scores. + * + * @param system_message optional system message. + */ + void start_chat(const std::string& system_message = ""); + + /** + * @brief finish chat and clear kv cache. + * Turns off keeping KV cache between generate calls. 
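+     * Typical flow (sketch): start_chat() -> one or more generate() calls -> finish_chat().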
+ */ + void finish_chat(); +private: + std::unique_ptr<LLMPipelineImplBase> m_pimpl; +}; + +OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> streamer(StreamerVariant func); +OPENVINO_GENAI_EXPORTS std::pair<std::string, Any> generation_config(const GenerationConfig& config); + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/lora_adapter.hpp b/src/cpp/include/openvino/genai/lora_adapter.hpp new file mode 100644 index 0000000000..5748abb807 --- /dev/null +++ b/src/cpp/include/openvino/genai/lora_adapter.hpp @@ -0,0 +1,186 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <limits> +#include <variant> +#include <string> +#include <optional> + +#include "openvino/op/constant.hpp" +#include "openvino/runtime/compiled_model.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/runtime/properties.hpp" +#include "openvino/genai/tokenizer.hpp" + + +namespace ov { +namespace genai { + +class OPENVINO_GENAI_EXPORTS AdapterController; +struct AdapterControllerImpl; + +// Inmutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier +class OPENVINO_GENAI_EXPORTS Adapter { + class Impl; + std::shared_ptr<Impl> m_pimpl; + friend AdapterController; + friend AdapterControllerImpl; + friend bool operator== (const Adapter& a, const Adapter& b); + friend bool operator< (const Adapter& a, const Adapter& b); +public: + explicit Adapter(const std::string& path); + Adapter() = default; + + operator bool() const { + return bool(m_pimpl); + } +}; + +// bool OPENVINO_GENAI_EXPORTS operator== (const Adapter& a, const Adapter& b); +// bool OPENVINO_GENAI_EXPORTS operator< (const Adapter& a, const Adapter& b); + + +struct OPENVINO_GENAI_EXPORTS AdapterConfig { + enum Mode { + MODE_AUTO, // Automatically selected (depends on the place where this mode is applied and device selection) + MODE_DYNAMIC, // A, B, alpha are fully variable + MODE_STATIC_RANK, // A and B have static shape, alpha is variable // FIXME: WA to unlock experiments, gives a unique perf level + MODE_STATIC, // A, B and alpha are constants + MODE_FUSE // A, B and alpha are constants, fused to main matrix W + }; + + Mode get_mode() const { return mode; } + void set_mode(Mode); + + AdapterConfig (Mode mode = MODE_AUTO); + + AdapterConfig (const Adapter& adapter, float alpha, Mode mode = MODE_AUTO) : AdapterConfig(std::vector<std::pair<Adapter, float>>{{adapter, alpha}}, mode) {} + + AdapterConfig (const Adapter& adapter, Mode mode = MODE_AUTO) : AdapterConfig(std::vector<Adapter>{adapter}, mode) {} + + template <typename AT, typename std::enable_if<std::is_constructible<Adapter, AT>::value, bool>::type = true> + AdapterConfig (const std::initializer_list<AT>& adapters, Mode mode = MODE_AUTO) : AdapterConfig(std::vector<Adapter>(adapters), mode) {} + + AdapterConfig (const std::initializer_list<std::pair<Adapter, float>>& adapters, Mode mode = MODE_AUTO) : AdapterConfig(std::vector<std::pair<Adapter, float>>(adapters), mode) {} + + AdapterConfig (const std::vector<Adapter>& adapters, Mode mode = MODE_AUTO); + + AdapterConfig (const std::vector<std::pair<Adapter, float>>& adapters, Mode mode = MODE_AUTO); + + AdapterConfig& add(const Adapter& adapter, float alpha); + AdapterConfig& add(const Adapter& adapter); + AdapterConfig& set_alpha(const Adapter& adapter, float alpha); + float get_alpha(const Adapter& adapter) const; + AdapterConfig& remove(const Adapter&); + const std::vector<Adapter>& 
get_adapters() const { return adapters; } + + // Returns true if it is not a trivial config + operator bool() const { + return !adapters.empty(); + } + +private: + + Mode mode; + std::vector<Adapter> adapters; + std::vector<float> alphas; + +}; + + +class AdaptersProperty : public ov::Property<AdapterConfig> { +public: + inline constexpr static const char* name () { return "adapters"; } + + constexpr AdaptersProperty() : ov::Property<AdapterConfig>(name()) {} + + inline std::pair<std::string, ov::Any> operator()(const AdapterConfig& config) const { + return ov::Property<AdapterConfig>::operator()(config); + } + + inline std::pair<std::string, ov::Any> operator()() const { + return operator()(AdapterConfig()); + } + + inline std::pair<std::string, ov::Any> operator()(AdapterConfig::Mode mode) const { + return operator()(AdapterConfig(mode)); + } + + inline std::pair<std::string, ov::Any> operator()(const Adapter& adapter, float alpha) const { + return operator()(AdapterConfig(adapter, alpha)); + } + + inline std::pair<std::string, ov::Any> operator()(const Adapter& adapter, float alpha, AdapterConfig::Mode mode) const { + return operator()(AdapterConfig(adapter, alpha, mode)); + } + + inline std::pair<std::string, ov::Any> operator()(const Adapter& adapter, AdapterConfig::Mode mode) const { + return operator()(AdapterConfig(adapter, mode)); + } + + template <typename AT, typename std::enable_if<std::is_constructible<Adapter, AT>::value, bool>::type = true> + inline std::pair<std::string, ov::Any> operator()(const std::initializer_list<AT>& adapters) const { + return operator()(AdapterConfig(adapters)); + } + + template <typename AT, typename std::enable_if<std::is_constructible<Adapter, AT>::value, bool>::type = true> + inline std::pair<std::string, ov::Any> operator()(const std::initializer_list<AT>& adapters, AdapterConfig::Mode mode) const { + return operator()(AdapterConfig(adapters, mode)); + } + + inline std::pair<std::string, ov::Any> operator()(const std::initializer_list<std::pair<Adapter, float>>& adapters) const { + return operator()(AdapterConfig(adapters)); + } + + inline std::pair<std::string, ov::Any> operator()(const std::initializer_list<std::pair<Adapter, float>>& adapters, AdapterConfig::Mode mode) const { + return operator()(AdapterConfig(adapters, mode)); + } + + inline std::pair<std::string, ov::Any> operator()(const std::vector<Adapter>& adapters) const { + return operator()(AdapterConfig(adapters)); + } + + inline std::pair<std::string, ov::Any> operator()(const std::vector<Adapter>& adapters, AdapterConfig::Mode mode) const { + return operator()(AdapterConfig(adapters, mode)); + } + + inline std::pair<std::string, ov::Any> operator()(const std::vector<std::pair<Adapter, float>>& adapters) const { + return operator()(AdapterConfig(adapters)); + } + + inline std::pair<std::string, ov::Any> operator()(const std::vector<std::pair<Adapter, float>>& adapters, AdapterConfig::Mode mode) const { + return operator()(AdapterConfig(adapters, mode)); + } +}; + + +static constexpr AdaptersProperty adapters; + + +class OPENVINO_GENAI_EXPORTS AdapterController { + + std::shared_ptr<AdapterControllerImpl> m_pimpl; + friend AdapterControllerImpl; + +public: + + AdapterController() = default; + + AdapterController(std::shared_ptr<ov::Model> model, const AdapterConfig& config, const std::string& prefix, std::string device = ""); + + // Apply adapters configured in the current config set last time, or set and use new config given as optional `config` argument + void 
apply(ov::InferRequest& request, const std::optional<AdapterConfig>& config = std::nullopt); + + // The next call of apply() will set all adapter tensors regardless of config changes; use this method if a full state.reset() is called for the controlled model + void force_full_apply(bool full_apply = true); + + operator bool() const { + return bool(m_pimpl); + } +}; + + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/perf_metrics.hpp b/src/cpp/include/openvino/genai/perf_metrics.hpp new file mode 100644 index 0000000000..0960bbb4fd --- /dev/null +++ b/src/cpp/include/openvino/genai/perf_metrics.hpp @@ -0,0 +1,154 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <chrono> +#include "openvino/genai/visibility.hpp" +#include <vector> +#include <memory> +#include <optional> + +namespace ov { +namespace genai { + +using TimePoint = std::chrono::steady_clock::time_point; +using MicroSeconds = std::chrono::duration<float, std::ratio<1, 1000000>>; + +/** + * @brief Structure with raw performance metrics for each generation before any statistics are calculated. + * + * @param generate_durations Durations for each generate call in microseconds. + * @param tokenization_durations Durations for the tokenization process in microseconds. + * @param detokenization_durations Durations for the detokenization process in microseconds. + * @param m_times_to_first_token Times to the first token for each call in microseconds. + * @param m_new_token_times Time points for each new token generated. + * @param m_token_infer_durations Inference time for each token in microseconds. + * @param m_batch_sizes Batch sizes for each generate call. + * @param m_durations Total durations for each generate call in microseconds. + * @param m_inference_durations Total inference duration for each generate call in microseconds. + * @param num_generated_tokens Total number of tokens generated. + * @param num_input_tokens Total number of tokens in the input prompt. + */ +struct OPENVINO_GENAI_EXPORTS RawPerfMetrics { + std::vector<MicroSeconds> generate_durations; + std::vector<MicroSeconds> tokenization_durations; + std::vector<MicroSeconds> detokenization_durations; + + std::vector<MicroSeconds> m_times_to_first_token; + std::vector<TimePoint> m_new_token_times; + std::vector<MicroSeconds> m_token_infer_durations; + std::vector<size_t> m_batch_sizes; + std::vector<MicroSeconds> m_durations; + std::vector<MicroSeconds> m_inference_durations; +}; + +/** +* @brief Structure to store mean and standard deviation values. +*/ +struct OPENVINO_GENAI_EXPORTS MeanStdPair { + float mean; + float std; +}; + +/** + * @brief Holds performance metrics for each generate call. + * + * PerfMetrics holds fields with mean and standard deviations for the following metrics: + * - Time To the First Token (TTFT), ms + * - Time per Output Token (TPOT), ms/token + * - Generate total duration, ms + * - Tokenization duration, ms + * - Detokenization duration, ms + * - Throughput, tokens/s + * + * Additional fields include: + * - Load time, ms + * - Number of generated tokens + * - Number of tokens in the input prompt + * + * The preferable way to access values is via the get functions. Getters calculate mean and std values from raw_metrics and return pairs. + * If mean and std were already calculated, getters return cached values. + * @param get_load_time Returns the load time in milliseconds. + * @param get_num_generated_tokens Returns the number of generated tokens.
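Stepping back briefly to the LoRA adapter API declared above (Adapter, AdapterConfig, AdapterController), the following is a hedged usage sketch rather than the library's canonical integration: the model and adapter paths, the tensor-name prefix and the device string are placeholders, and the exact point at which the controller must be created relative to model compilation may differ in the real pipelines.

```cpp
#include <openvino/openvino.hpp>
#include "openvino/genai/lora_adapter.hpp"

int main() {
    ov::Core core;
    // Placeholder paths: an OpenVINO IR and a LoRA adapter file prepared for it.
    std::shared_ptr<ov::Model> model = core.read_model("model.xml");

    // An Adapter is an immutable handle to the adaptation matrices.
    ov::genai::Adapter style_lora("style_lora.safetensors");

    // Blend the adapter in with an explicit alpha; MODE_AUTO lets the library decide
    // how A, B and alpha are materialized (dynamic, static or fused).
    ov::genai::AdapterConfig config(style_lora, 0.75f, ov::genai::AdapterConfig::MODE_AUTO);

    // The controller is responsible for feeding the LoRA tensors into infer requests.
    // "base_model" is a placeholder for the tensor-name prefix expected by the model.
    ov::genai::AdapterController controller(model, config, "base_model", "CPU");

    ov::CompiledModel compiled = core.compile_model(model, "CPU");
    ov::InferRequest request = compiled.create_infer_request();

    // Set the adapter tensors for this request before running inference.
    controller.apply(request);

    // The blend can later be changed without recompiling, e.g. with a weaker alpha.
    ov::genai::AdapterConfig weaker(style_lora, 0.3f);
    controller.apply(request, weaker);
    return 0;
}
```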
+ * @param get_num_input_tokens Returns the number of tokens in the input prompt. + * @param get_ttft Returns the mean and standard deviation of TTFT. + * @param get_tpot Returns the mean and standard deviation of TPOT. + * @param get_throughput Returns the mean and standard deviation of throughput. + * @param get_generate_duration Returns the mean and standard deviation of generate duration. + * @param get_tokenization_duration Returns the mean and standard deviation of tokenization duration. + * @param get_detokenization_duration Returns the mean and standard deviation of detokenization duration. + * @param get_microsec Converts a duration to microseconds. + * @param m_evaluated Flag indicating if raw metrics were evaluated. + * If false, current mean/std TTFT, TPOT, etc. are not actual and evaluate_statistics() should recalculate them. + * @param evaluate_statistics Calculates mean and standard deviation values from raw_metrics. + * Optional start_time can be provided to update durations. + * @param operator+ Adds two PerfMetrics objects. + * @param operator+= Adds and assigns the right-hand PerfMetrics to the current object. + * @param raw_metrics A structure of RawPerfMetrics type that holds raw metrics. + * @param load_time Load time in milliseconds. + * + * Cached mean and standard deviations. + * @param ttft Mean and standard deviation of Time to the First Token (TTFT) in milliseconds. + * @param tpot Mean and standard deviation of Time per Output Token (TPOT) in milliseconds per token. + * @param throughput Mean and standard deviation of tokens per second. + * @param generate_duration Mean and standard deviation of the total duration of generate calls in milliseconds. + * @param tokenization_duration Mean and standard deviation of the tokenization duration in milliseconds. + * @param detokenization_duration Mean and standard deviation of the detokenization duration in milliseconds. + * @param num_generated_tokens Number of generated tokens. + * @param num_input_tokens Number of tokens in the input prompt. + */ +struct OPENVINO_GENAI_EXPORTS PerfMetrics { + float load_time; // Load time in ms. + MeanStdPair ttft; // Time to the first token (in ms) (TTFT). + MeanStdPair tpot; // Time (in ms) per output token (TPOT). + MeanStdPair ipot; // Inference time (in ms) per output token. + MeanStdPair throughput; // Tokens per second. + + MeanStdPair generate_duration; + MeanStdPair inference_duration; + MeanStdPair tokenization_duration = {-1.0f, -1.0f}; + MeanStdPair detokenization_duration = {-1.0f, -1.0f}; + + size_t num_generated_tokens; + size_t num_input_tokens; + + float get_load_time(); // Load time in ms. + size_t get_num_generated_tokens(); + size_t get_num_input_tokens(); + MeanStdPair get_ttft(); // Time to the first token (in ms) (TTFT). + MeanStdPair get_tpot(); // Time (in ms) per output token (TPOT). + MeanStdPair get_ipot(); // Inference time (in ms) per output token. + MeanStdPair get_throughput(); // Tokens per second. + + MeanStdPair get_inference_duration(); // in ms + MeanStdPair get_generate_duration(); // in ms + MeanStdPair get_tokenization_duration(); // in ms + MeanStdPair get_detokenization_duration(); // in ms + + // Flag indicating if raw metrics were evaluated. + // If false means current mean/std ttft, tpot, etc. are not actual + // and evaluate_statistics() should recalculate them. + bool m_evaluated = false; + + /** + * @brief calculates mean/std values from raw_metrics. + * + * @param start_time optional start_time in case if duration needs to be updated. 
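Summarizing the PerfMetrics accessors documented above, here is a small sketch that merges the metrics of two generate calls and prints the derived statistics. It assumes two already-populated PerfMetrics instances (for example taken from pipeline results); nothing is constructed from raw timings by hand.

```cpp
#include <iostream>
#include "openvino/genai/perf_metrics.hpp"

// `metrics_a` and `metrics_b` are assumed to come from two separate generate calls.
void report(ov::genai::PerfMetrics metrics_a, ov::genai::PerfMetrics metrics_b) {
    // operator+ merges the underlying raw metrics; the getters then return statistics
    // over the combined data, recalculating via evaluate_statistics() when needed.
    ov::genai::PerfMetrics total = metrics_a + metrics_b;

    ov::genai::MeanStdPair ttft = total.get_ttft();
    ov::genai::MeanStdPair tpot = total.get_tpot();
    ov::genai::MeanStdPair throughput = total.get_throughput();

    std::cout << "Load time:  " << total.get_load_time() << " ms\n"
              << "TTFT:       " << ttft.mean << " +/- " << ttft.std << " ms\n"
              << "TPOT:       " << tpot.mean << " +/- " << tpot.std << " ms/token\n"
              << "Throughput: " << throughput.mean << " +/- " << throughput.std << " tokens/s\n"
              << "Generated:  " << total.get_num_generated_tokens() << " tokens\n";
}
```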
+ */ + void evaluate_statistics(std::optional<TimePoint> start_time = std::nullopt); + + /** + * @brief convert duration to microseconds + * + * @param duration steady clock duration + */ + static float get_microsec(std::chrono::steady_clock::duration duration); + PerfMetrics operator+(const PerfMetrics& metrics) const; + PerfMetrics& operator+=(const PerfMetrics& right); + + RawPerfMetrics raw_metrics; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/scheduler_config.hpp b/src/cpp/include/openvino/genai/scheduler_config.hpp new file mode 100644 index 0000000000..da9cdd962b --- /dev/null +++ b/src/cpp/include/openvino/genai/scheduler_config.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <cstddef> +#include "cache_eviction.hpp" + +namespace ov::genai { +struct SchedulerConfig { + // a maximum number of tokens to batch + // (in contrast to max_batch_size which combines independent sequences, we consider total amount of tokens in a batch) + // TODO: benchmark this value and understand a required value to ensure inference is not memory bound + std::size_t max_num_batched_tokens = 256; + + // total number of KV blocks available to scheduler logic + std::size_t num_kv_blocks = 0; + + // total size of KV cache in GB + std::size_t cache_size = 0; + + // block size for KV cache + std::size_t block_size = 32; + + // whether to split prompt / generate to different scheduling phases + bool dynamic_split_fuse = true; + + + /** + * Whether to use cache eviction for all sequences processed by this pipeline. When cache eviction is enabled, + * the per-sequence KV cache usage is capped by a user-configurable value, leading to memory savings at the cost + * of some generation quality. + */ + bool use_cache_eviction = false; + + /** + * Configuration struct for the cache eviction algorithm. Setting this has effect only if `use_cache_eviction` is + * set to `true`. + */ + CacheEvictionConfig cache_eviction_config; + + // + // vLLM-like settings + // + + // max number of scheduled sequences (you can think of it as "max batch size") + std::size_t max_num_seqs = 256; + + // Enable caching of KV-blocks. + // When turned on, all previously calculated KV-caches are kept in memory for future use. + // KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released. + // This results in more RAM usage; maximum RAM usage is determined by the cache_size or num_kv_blocks parameters. + // When turned off, only the KV-cache required for batch calculation is kept in memory, and + // when a sequence has finished generation its cache is released. + bool enable_prefix_caching = false; +}; +} diff --git a/src/cpp/include/openvino/genai/streamer_base.hpp b/src/cpp/include/openvino/genai/streamer_base.hpp new file mode 100644 index 0000000000..dc42f047f9 --- /dev/null +++ b/src/cpp/include/openvino/genai/streamer_base.hpp @@ -0,0 +1,30 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/tokenizer.hpp" + +namespace ov { +namespace genai { + +/** + * @brief base class for streamers.
To use it, inherit from this class and implement the put() and end() methods. + * + * @param m_tokenizer tokenizer +*/ +class StreamerBase { +public: + /// @brief put is called every time a new token is decoded. + /// @return bool flag to indicate whether generation should be stopped; if it returns true, generation stops + virtual bool put(int64_t token) = 0; + + /// @brief end is called at the end of generation. It can be used to flush the cache if your own streamer has one + virtual void end() = 0; + + virtual ~StreamerBase() = default; +}; + + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp b/src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp new file mode 100644 index 0000000000..002367a8e0 --- /dev/null +++ b/src/cpp/include/openvino/genai/text2image/autoencoder_kl.hpp @@ -0,0 +1,68 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <vector> +#include <string> + +#include "openvino/core/any.hpp" +#include "openvino/runtime/tensor.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/runtime/properties.hpp" + +#include "openvino/genai/visibility.hpp" + +namespace ov { +namespace genai { + +class OPENVINO_GENAI_EXPORTS AutoencoderKL { +public: + struct Config { + size_t in_channels = 3; + size_t latent_channels = 4; + size_t out_channels = 3; + float scaling_factor = 0.18215f; + std::vector<size_t> block_out_channels = { 64 }; + + explicit Config(const std::string& config_path); + }; + + explicit AutoencoderKL(const std::string& root_dir); + + AutoencoderKL(const std::string& root_dir, + const std::string& device, + const ov::AnyMap& properties = {}); + + template <typename... Properties, + typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true> + AutoencoderKL(const std::string& root_dir, + const std::string& device, + Properties&&... properties) + : AutoencoderKL(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) { } + + AutoencoderKL(const AutoencoderKL&); + + AutoencoderKL& reshape(int batch_size, int height, int width); + + AutoencoderKL& compile(const std::string& device, const ov::AnyMap& properties = {}); + + template <typename... Properties> + ov::util::EnableIfAllStringAny<AutoencoderKL&, Properties...> compile( + const std::string& device, + Properties&&...
properties) { + return compile(device, ov::AnyMap{std::forward<Properties>(properties)...}); + } + + ov::Tensor infer(ov::Tensor latent); + +private: + void merge_vae_image_processor() const; + + Config m_config; + ov::InferRequest m_request; + std::shared_ptr<ov::Model> m_model; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp b/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp new file mode 100644 index 0000000000..1f79b039d7 --- /dev/null +++ b/src/cpp/include/openvino/genai/text2image/clip_text_model.hpp @@ -0,0 +1,74 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <string> + +#include "openvino/genai/visibility.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/lora_adapter.hpp" + +#include "openvino/core/any.hpp" +#include "openvino/runtime/tensor.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/runtime/properties.hpp" + +namespace ov { +namespace genai { + +class OPENVINO_GENAI_EXPORTS CLIPTextModel { +public: + struct Config { + size_t max_position_embeddings = 77; + size_t hidden_size = 512; + size_t num_hidden_layers = 13; + + explicit Config(const std::string& config_path); + }; + + explicit CLIPTextModel(const std::string root_dir); + + CLIPTextModel(const std::string& root_dir, + const std::string& device, + const ov::AnyMap& properties = {}); + + template <typename... Properties, + typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true> + CLIPTextModel(const std::string& root_dir, + const std::string& device, + Properties&&... properties) + : CLIPTextModel(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) { } + + CLIPTextModel(const CLIPTextModel&); + + const Config& get_config() const; + + CLIPTextModel& reshape(int batch_size); + + CLIPTextModel& compile(const std::string& device, const ov::AnyMap& properties = {}); + + template <typename... Properties> + ov::util::EnableIfAllStringAny<CLIPTextModel&, Properties...> compile( + const std::string& device, + Properties&&... 
properties) { + return compile(device, ov::AnyMap{std::forward<Properties>(properties)...}); + } + + void set_adapters(const AdapterConfig& adapters); + + ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance); + + ov::Tensor get_output_tensor(const size_t idx); + +private: + Config m_config; + AdapterController m_adapter_controller; + ov::InferRequest m_request; + std::shared_ptr<ov::Model> m_model; + + Tokenizer m_clip_tokenizer; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp b/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp new file mode 100644 index 0000000000..e46e76f316 --- /dev/null +++ b/src/cpp/include/openvino/genai/text2image/clip_text_model_with_projection.hpp @@ -0,0 +1,70 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <string> + +#include "openvino/genai/visibility.hpp" +#include "openvino/genai/tokenizer.hpp" + +#include "openvino/core/any.hpp" +#include "openvino/runtime/tensor.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/runtime/properties.hpp" + +namespace ov { +namespace genai { + +class OPENVINO_GENAI_EXPORTS CLIPTextModelWithProjection { +public: + struct Config { + size_t max_position_embeddings = 77; + size_t hidden_size = 512; + size_t num_hidden_layers = 33; + + explicit Config(const std::string& config_path); + }; + + explicit CLIPTextModelWithProjection(const std::string root_dir); + + CLIPTextModelWithProjection(const std::string& root_dir, + const std::string& device, + const ov::AnyMap& properties = {}); + + template <typename... Properties, + typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true> + CLIPTextModelWithProjection(const std::string& root_dir, + const std::string& device, + Properties&&... properties) + : CLIPTextModelWithProjection(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) { } + + CLIPTextModelWithProjection(const CLIPTextModelWithProjection&); + + const Config& get_config() const; + + CLIPTextModelWithProjection& reshape(int batch_size); + + CLIPTextModelWithProjection& compile(const std::string& device, const ov::AnyMap& properties = {}); + + template <typename... Properties> + ov::util::EnableIfAllStringAny<CLIPTextModelWithProjection&, Properties...> compile( + const std::string& device, + Properties&&... 
properties) { + return compile(device, ov::AnyMap{std::forward<Properties>(properties)...}); + } + + ov::Tensor infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance); + + ov::Tensor get_output_tensor(const size_t idx); + +private: + Config m_config; + ov::InferRequest m_request; + std::shared_ptr<ov::Model> m_model; + + Tokenizer m_clip_tokenizer; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/text2image/pipeline.hpp b/src/cpp/include/openvino/genai/text2image/pipeline.hpp new file mode 100644 index 0000000000..e3a59cf025 --- /dev/null +++ b/src/cpp/include/openvino/genai/text2image/pipeline.hpp @@ -0,0 +1,187 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <memory> +#include <string> +#include <random> + +#include "openvino/core/any.hpp" +#include "openvino/runtime/properties.hpp" +#include "openvino/runtime/tensor.hpp" + +#include "openvino/genai/visibility.hpp" + +#include "openvino/genai/lora_adapter.hpp" +#include "openvino/genai/text2image/clip_text_model.hpp" +#include "openvino/genai/text2image/clip_text_model_with_projection.hpp" +#include "openvino/genai/text2image/unet2d_condition_model.hpp" +#include "openvino/genai/text2image/autoencoder_kl.hpp" + +namespace ov { +namespace genai { + +// +// Random generators +// + +class OPENVINO_GENAI_EXPORTS Generator { +public: + virtual float next() = 0; + virtual ~Generator(); +}; + +class OPENVINO_GENAI_EXPORTS CppStdGenerator : public Generator { +public: + // creates 'std::mt19937' with initial 'seed' to generate numbers within a range [0.0f, 1.0f] + explicit CppStdGenerator(uint32_t seed); + + virtual float next() override; +private: + std::mt19937 gen; + std::normal_distribution<float> normal; +}; + +// +// Text to image pipeline +// + +class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { +public: + class OPENVINO_GENAI_EXPORTS Scheduler { + public: + enum Type { + AUTO, + LCM, + LMS_DISCRETE, + DDIM, + EULER_DISCRETE + }; + + static std::shared_ptr<Scheduler> from_config(const std::string& scheduler_config_path, + Type scheduler_type = AUTO); + + virtual ~Scheduler(); + }; + + struct OPENVINO_GENAI_EXPORTS GenerationConfig { + // LCM: prompt only w/o negative prompt + // SD XL: prompt2 and negative_prompt2 + // FLUX: prompt2 (prompt if prompt2 is not defined explicitly) + // SD 3: prompt2, prompt3 (with fallback to prompt) and negative_prompt2, negative_prompt3 + std::optional<std::string> prompt_2 = std::nullopt, prompt_3 = std::nullopt; + std::string negative_prompt, negative_prompt_2, negative_prompt_3; + + size_t num_images_per_prompt = 1; + + // random generator to have deterministic results + std::shared_ptr<Generator> random_generator = std::make_shared<CppStdGenerator>(42); + + // the following values depend on HF diffusers class used to perform generation + float guidance_scale = 7.5f; + int64_t height = -1; + int64_t width = -1; + size_t num_inference_steps = 50; + + AdapterConfig adapters; + + void update_generation_config(const ov::AnyMap& config_map); + + // checks whether is config is valid + void validate() const; + + template <typename... Properties> + ov::util::EnableIfAllStringAny<void, Properties...> update_generation_config(Properties&&... 
properties) { + return update_generation_config(ov::AnyMap{std::forward<Properties>(properties)...}); + } + }; + + explicit Text2ImagePipeline(const std::string& root_dir); + + Text2ImagePipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties = {}); + + template <typename... Properties, + typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true> + Text2ImagePipeline(const std::string& root_dir, + const std::string& device, + Properties&&... properties) + : Text2ImagePipeline(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) { } + + // creates either LCM or SD pipeline from building blocks + static Text2ImagePipeline stable_diffusion( + const std::shared_ptr<Scheduler>& scheduler_type, + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder); + + // creates either LCM or SD pipeline from building blocks + static Text2ImagePipeline latent_consistency_model( + const std::shared_ptr<Scheduler>& scheduler_type, + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder); + + // creates SDXL pipeline from building blocks + static Text2ImagePipeline stable_diffusion_xl( + const std::shared_ptr<Scheduler>& scheduler_type, + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder); + + GenerationConfig get_generation_config() const; + void set_generation_config(const GenerationConfig& generation_config); + + // ability to override scheduler + void set_scheduler(std::shared_ptr<Scheduler> scheduler); + + // with static shapes performance is better + void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale); + + void compile(const std::string& device, const ov::AnyMap& properties = {}); + + // Returns a tensor with the following dimensions [num_images_per_prompt, height, width, 3] + ov::Tensor generate(const std::string& positive_prompt, const ov::AnyMap& properties = {}); + + template <typename... Properties> + ov::util::EnableIfAllStringAny<ov::Tensor, Properties...> generate( + const std::string& positive_prompt, + Properties&&... 
properties) { + return generate(positive_prompt, ov::AnyMap{std::forward<Properties>(properties)...}); + } + +private: + class DiffusionPipeline; + std::shared_ptr<DiffusionPipeline> m_impl; + + explicit Text2ImagePipeline(const std::shared_ptr<DiffusionPipeline>& impl); + + class StableDiffusionPipeline; + class StableDiffusionXLPipeline; +}; + +// +// Generation config properties +// + +static constexpr ov::Property<std::string> prompt_2{"prompt_2"}; +static constexpr ov::Property<std::string> prompt_3{"prompt_3"}; + +static constexpr ov::Property<std::string> negative_prompt{"negative_prompt"}; +static constexpr ov::Property<std::string> negative_prompt_2{"negative_prompt_2"}; +static constexpr ov::Property<std::string> negative_prompt_3{"negative_prompt_3"}; + +static constexpr ov::Property<size_t> num_images_per_prompt{"num_images_per_prompt"}; +static constexpr ov::Property<float> guidance_scale{"guidance_scale"}; +static constexpr ov::Property<int64_t> height{"height"}; +static constexpr ov::Property<int64_t> width{"width"}; +static constexpr ov::Property<size_t> num_inference_steps{"num_inference_steps"}; + +static constexpr ov::Property<std::shared_ptr<Generator>> random_generator{"random_generator"}; + +OPENVINO_GENAI_EXPORTS +std::pair<std::string, ov::Any> generation_config(const Text2ImagePipeline::GenerationConfig& generation_config); + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp b/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp new file mode 100644 index 0000000000..b3cfe1d364 --- /dev/null +++ b/src/cpp/include/openvino/genai/text2image/unet2d_condition_model.hpp @@ -0,0 +1,78 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <vector> +#include <string> +#include <memory> + +#include "openvino/genai/visibility.hpp" + +#include "openvino/core/any.hpp" +#include "openvino/core/model.hpp" +#include "openvino/runtime/tensor.hpp" +#include "openvino/runtime/infer_request.hpp" +#include "openvino/runtime/properties.hpp" +#include "openvino/genai/lora_adapter.hpp" + +namespace ov { +namespace genai { + +class OPENVINO_GENAI_EXPORTS UNet2DConditionModel { +public: + struct Config { + size_t in_channels = 4; + size_t sample_size = 0; + std::vector<size_t> block_out_channels = { 320, 640, 1280, 1280 }; + int time_cond_proj_dim = -1; + + explicit Config(const std::string& config_path); + }; + + explicit UNet2DConditionModel(const std::string root_dir); + + UNet2DConditionModel(const std::string& root_dir, + const std::string& device, + const ov::AnyMap& properties = {}); + + template <typename... Properties, + typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true> + UNet2DConditionModel(const std::string& root_dir, + const std::string& device, + Properties&&... properties) + : UNet2DConditionModel(root_dir, device, ov::AnyMap{std::forward<Properties>(properties)...}) { } + + UNet2DConditionModel(const UNet2DConditionModel&); + + const Config& get_config() const; + + size_t get_vae_scale_factor() const; + + UNet2DConditionModel& reshape(int batch_size, int height, int width, int tokenizer_model_max_length); + + UNet2DConditionModel& compile(const std::string& device, const ov::AnyMap& properties = {}); + + template <typename... Properties> + ov::util::EnableIfAllStringAny<UNet2DConditionModel&, Properties...> compile( + const std::string& device, + Properties&&... 
properties) { + return compile(device, ov::AnyMap{std::forward<Properties>(properties)...}); + } + + void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states); + + void set_adapters(const AdapterConfig& adapters); + + ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep); + +private: + Config m_config; + AdapterController m_adapter_controller; + std::shared_ptr<ov::Model> m_model; + ov::InferRequest m_request; + size_t m_vae_scale_factor; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp new file mode 100644 index 0000000000..5aa7655c97 --- /dev/null +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -0,0 +1,136 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <string> +#include <vector> +#include <initializer_list> + +#include "openvino/runtime/tensor.hpp" +#include "openvino/genai/visibility.hpp" +#include <openvino/runtime/properties.hpp> + +namespace ov { +namespace genai { + +using ChatHistory = std::vector<std::unordered_map<std::string, std::string>>; + +struct TokenizedInputs { + ov::Tensor input_ids; + ov::Tensor attention_mask; +}; + +/** +* @brief class is used to encode prompts and decode resulting tokens +*/ +class OPENVINO_GENAI_EXPORTS Tokenizer { +public: + /** + * @brief ov::genai::Tokenizer constructor. + * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path + */ + Tokenizer(const std::string& tokenizer_path, const ov::AnyMap& plugin_config = {}); + + /** + * @brief encode a single prompt + * @param prompt std::string with input prompt + * @param tokenization_params AnyMap with tokenization parameters, e.g. {'add_special_tokens', false} + * @return pair of [input_ids, attention_mask] + */ + TokenizedInputs encode(const std::string prompt, const ov::AnyMap& tokenization_params = {}); + + /** + * @brief encode batch of prompts. Left padding will be applied by default + * @param prompts vector storing batch of prompts + * @param tokenization_params AnyMap with tokenization parameters, e.g. {'add_special_tokens', false} + * @return pair of [input_ids, attention_mask] + */ + TokenizedInputs encode(std::vector<std::string>& prompt, const ov::AnyMap& tokenization_params = {}); + TokenizedInputs encode(std::vector<std::string>&& prompts, const ov::AnyMap& tokenization_params = {}); + TokenizedInputs encode(std::initializer_list<std::string>& prompts, const ov::AnyMap& tokenization_params = {}); + + /** + * @brief encode a single prompt + * @param prompt std::string with input prompt + * @param properties tokenization properties, e.g. ov::genai::add_special_tokens(false) + * @return pair of [input_ids, attention_mask] + */ + template <typename... Properties> + util::EnableIfAllStringAny<TokenizedInputs, Properties...> encode(std::string& prompt, Properties&&... properties) { + return encode(prompt, AnyMap{std::forward<Properties>(properties)...}); + } + + /** + * @brief encode batch of prompts. Left padding will be applied by default + * @param prompts vector storing batch of prompts + * @param properties tokenization properties, e.g. ov::genai::add_special_tokens(false) + * @return pair of [input_ids, attention_mask] + */ + template <typename... Properties> + util::EnableIfAllStringAny<TokenizedInputs, Properties...> encode(std::vector<std::string>& prompts, Properties&&... 
properties) { + return encode(prompts, AnyMap{std::forward<Properties>(properties)...}); + } + + /** + * @brief decode sequence of tokens + * @param tokens vector storing tokens + * @return sequence string + */ + std::string decode(std::vector<int64_t> tokens); + + /** + * @brief decode tokens. + * @param tokens ov::Tensor with tokens with shape [batch_size, seq_len] + * @return vector of std::string, with size = batch_size + */ + std::vector<std::string> decode(ov::Tensor tokens); + + /** + * @brief batched decoding of tokens. + * @param tokens vector of vectors with tokens, tokens.size() is equal to batch_size + * @return vector of std::string, with size equal to batch_size + */ + std::vector<std::string> decode(std::vector<std::vector<int64_t>> tokens); + + /** + * @brief Embeds input prompts with special tags for a chat scenario. + * + * For example, for Qwen family models, the prompt "1+1=" would be transformed into + * <|im_start|>user\n1+1=<|im_end|>\n<|im_start|>assistant\n. + * + * @param history A vector of maps, with chat history, e.g. [{"role": "user", "content": "prompt"}, ...]. + * @param add_generation_prompt Whether to add an ending that indicates the start of generation. + * @param chat_template An optional chat template string, if not specified will be taken from the tokenizer. + * @return A string with the transformed and concatenated prompts from the chat history. + * @throws Exception if the chat template was unable to parse the input history. + */ + std::string apply_chat_template(ChatHistory history, + bool add_generation_prompt, + const std::string& chat_template="") const; + + /// @brief Override a chat_template read from tokenizer_config.json. + /// @param chat_template The new template to override with. + void set_chat_template(const std::string& chat_template); + + // information about <bos>, <eos> tokens should be public, + // they are used at least in StreamerBase descendants + int64_t get_bos_token_id() const; + int64_t get_eos_token_id() const; + int64_t get_pad_token_id() const; + + std::string get_bos_token() const; + std::string get_eos_token() const; + std::string get_pad_token() const; + + Tokenizer() = default; + ~Tokenizer(); +private: + class TokenizerImpl; + std::shared_ptr<TokenizerImpl> m_pimpl; +}; + +static constexpr ov::Property<bool> add_special_tokens{"add_special_tokens"}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/visibility.hpp b/src/cpp/include/openvino/genai/visibility.hpp new file mode 100644 index 0000000000..4a1a60bb61 --- /dev/null +++ b/src/cpp/include/openvino/genai/visibility.hpp @@ -0,0 +1,12 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/core/visibility.hpp" + +#ifdef openvino_genai_EXPORTS +# define OPENVINO_GENAI_EXPORTS OPENVINO_CORE_EXPORTS +#else +# define OPENVINO_GENAI_EXPORTS OPENVINO_CORE_IMPORTS +#endif // openvino_genai_EXPORTS diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp new file mode 100644 index 0000000000..bd83318bb4 --- /dev/null +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -0,0 +1,109 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/tokenizer.hpp" +#include <filesystem> + +namespace ov::genai { +/// @brief A 
Visual language modeling pipeline class used to generate a +/// response or run a chat given a prompt and an image. +class OPENVINO_GENAI_EXPORTS VLMPipeline { +public: + /// @brief Construct a pipeline from a folder containing tokenizer + /// and model IRs. + /// @param model_dir A folder to read tokenizer and model IRs. + /// @param device Inference device. A tokenizer is always compiled + /// for CPU. + /// @param device_config A config to pass to ov::Core::set_property() + /// and ov::Core::compile_model(). + /// @param core ov::Core instance to use. + explicit VLMPipeline( + const std::filesystem::path& model_dir, + const std::string& device="CPU", + const ov::AnyMap device_config={} + ); + + /// @brief Default destructor. + ~VLMPipeline(); + + /// @brief Generate a response given a prompt and any number of + /// uint8 RGB images with [NCHW] or [CHW] layout. + /// @param prompt A prompt to respond to. + /// @param rgbs Images to be prepended to a prompt. + /// @param generation_config A config to follow for text generation. + /// @param streamer A streamer to acquire intermediate results. + /// @return A string generated by a model. + DecodedResults generate( + const std::string& prompt, + const std::vector<ov::Tensor>& rgbs, + const GenerationConfig& generation_config, + const StreamerVariant& streamer + ); + /// @brief Generate a response given a prompt and config. + /// @param prompt A prompt to respond to. + /// @param config_map A config that may contain a GenerationConfig, values + /// for its members, a StreamerVariant, a single image or multiple + /// images. + /// @return A string generated by a model. + DecodedResults generate( + const std::string& prompt, + const ov::AnyMap& config_map + ); + /// @brief Generate a response given a prompt and an arbitrary number + /// of ov::Property instances. + /// Example: + /// generate("text", image(rgb), do_sample(true)); + /// @param prompt A prompt to respond to. + /// @param ...properties ov::Property instances to be combined into + /// ov::AnyMap. + /// @return A string generated by a model. + template <typename... Properties> + util::EnableIfAllStringAny<DecodedResults, Properties...> generate( + const std::string& prompt, + Properties&&... properties + ) { + return generate( + prompt, AnyMap{std::forward<Properties>(properties)...} + ); + } + /// @brief Activate chat mode. Chat preserves previous history and + /// applies chat_template to input prompts. Calling start_chat() + /// again or finish_chat() drops the memorized history. + /// It's possible to disable + /// chat_template application by calling + /// set_chat_template("{% for message in messages %}{{ message['content'] }}{% endfor %}") + /// @param system_message Some chat_templates contain system role + /// in addition to user and assistant roles. Set a message for that + /// role. + void start_chat(const std::string& system_message=""); + /// @brief Deactivate chat mode. + void finish_chat(); + /// @brief Set a custom chat template. Can be used to deactivate + /// chat_template application for chat mode if called with + /// "{% for message in messages %}{{ message['content'] }}{% endfor %}" + /// or to work around unsupported chat_template entries in a default + /// model chat_template. + /// @param new_template A new template to override with. + void set_chat_template(const std::string& new_template); + /// @brief Extract GenerationConfig used to get default values. + /// @return Default values used.
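A sketch of the chat flow documented above. Assumptions are called out inline: the model directory is a placeholder, read_rgb_image() is a hypothetical user-provided helper, and the returned DecodedResults (defined in llm_pipeline.hpp, not part of this excerpt) is left uninspected.

```cpp
#include "openvino/genai/visual_language/pipeline.hpp"

// Hypothetical helper supplied by the user: returns a uint8 ov::Tensor with
// [NCHW] or [CHW] layout, as required by VLMPipeline::generate().
ov::Tensor read_rgb_image(const std::string& path);

int main() {
    // Placeholder folder containing the exported model IRs and tokenizers.
    ov::genai::VLMPipeline pipe("./vlm_model_dir/", "CPU");

    ov::Tensor rgb = read_rgb_image("cat.png");

    // Chat mode memorizes the history and applies the model's chat template.
    pipe.start_chat();

    // Property-based generate(): the image is passed via the ov::genai::image property.
    ov::genai::DecodedResults answer =
        pipe.generate("What is unusual in this picture?", ov::genai::image(rgb));

    // A follow-up turn reuses the memorized history; the AnyMap overload is used
    // here because no extra properties are needed.
    ov::genai::DecodedResults follow_up =
        pipe.generate("Describe it in one sentence.", ov::AnyMap{});

    pipe.finish_chat();
    return 0;
}
```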
+ GenerationConfig get_generation_config() const; + /// @brief Override default values for GenerationConfig + /// @param new_config A config to override default values with. + void set_generation_config(const GenerationConfig& new_config); +private: + class VLMPipelineImpl; + std::unique_ptr<VLMPipelineImpl> m_pimpl; +}; + +/* + * utils that allow to use generate() in the following way: + * pipe.generate(prompt, ov::genai::image(image_tensor)). +*/ +static constexpr ov::Property<ov::Tensor> image{"image"}; +static constexpr ov::Property<std::vector<ov::Tensor>> images{"images"}; +} diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp b/src/cpp/include/openvino/genai/whisper_generation_config.hpp new file mode 100644 index 0000000000..44e3b42571 --- /dev/null +++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp @@ -0,0 +1,124 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <optional> + +#include "openvino/genai/tokenizer.hpp" +#include "openvino/runtime/compiled_model.hpp" + +namespace ov { +namespace genai { + +/** + * @brief Structure to keep whisper generation config parameters. + */ +class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig { +public: + WhisperGenerationConfig() = default; + explicit WhisperGenerationConfig(const std::string& json_path); + + // Generic + + // the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + // `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. + size_t max_new_tokens = SIZE_MAX; + // the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. + // max_new_tokens has priority over max_length. + size_t max_length = SIZE_MAX; + + // Whisper specific + + // Corresponds to the ”<|startoftranscript|>” token. + int64_t decoder_start_token_id = 50258; + + // End of stream token id. + int64_t eos_token_id = 50257; + + // Padding token id. + int64_t pad_token_id = 50257; + + // Translate token id. + int64_t translate_token_id = 50358; + + // Transcribe token id. + int64_t transcribe_token_id = 50359; + + // No timestamps token id. + int64_t no_timestamps_token_id = 50363; + + // Begin timestamps token id. + int64_t begin_timestamps_token_id = 50364; + + size_t max_initial_timestamp_index = 50; + + bool is_multilingual = true; + + // Language token to use for generation in the form of <|en|>. + // You can find all the possible language tokens in the generation_config.json lang_to_id dictionary. + // Can be set for multilingual models only. + std::optional<std::string> language = std::nullopt; + + // Language token to token_id map. Initialized from the generation_config.json lang_to_id dictionary. + std::map<std::string, int64_t> lang_to_id; + + // Task to use for generation, either “translate” or “transcribe”. + // Can be set for multilingual models only. + std::optional<std::string> task = std::nullopt; + + // If `true` the pipeline will return timestamps along the text for *segments* of words in the text. + // For instance, if you get + // WhisperDecodedResultChunk + // start_ts = 0.5 + // end_ts = 1.5 + // text = " Hi there!" + // then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. + // Note that a segment of text refers to a sequence of one or more words, rather than individual words. 
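A brief sketch of how these Whisper options could be filled in before being handed to the WhisperPipeline declared further below; the JSON path and the chosen values are illustrative only.

```cpp
#include "openvino/genai/whisper_generation_config.hpp"

ov::genai::WhisperGenerationConfig make_transcribe_config() {
    // Token ids, suppression lists and lang_to_id are read from the model's
    // generation_config.json; the path here is a placeholder.
    ov::genai::WhisperGenerationConfig config("generation_config.json");

    // Transcribe German speech and return per-segment timestamps.
    config.language = "<|de|>";   // must be a key of config.lang_to_id
    config.task = "transcribe";   // or "translate"
    config.return_timestamps = true;
    config.max_new_tokens = 100;

    // Throws if the configuration is inconsistent, e.g. language/task set
    // for a non-multilingual model.
    config.validate();
    return config;
}
```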
+ bool return_timestamps = false; + + // A list containing tokens that will be suppressed at the beginning of the sampling process. + std::vector<int64_t> begin_suppress_tokens; + + // A list containing the non-speech tokens that will be suppressed during generation. + std::vector<int64_t> suppress_tokens; + + /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. + * Otherwise verifies eos_token_id == tokenizer_eos_token_id. + */ + void set_eos_token_id(int64_t tokenizer_eos_token_id); + size_t get_max_new_tokens(size_t prompt_length = 0) const; + + void update_generation_config(const ov::AnyMap& config_map = {}); + + template <typename... Properties> + util::EnableIfAllStringAny<void, Properties...> update_generation_config(Properties&&... properties) { + return update_generation_config(AnyMap{std::forward<Properties>(properties)...}); + } + + /// @brief checks that there are no conflicting parameters. + /// @throws Exception if config is invalid. + void validate() const; +}; + +/* + * utils that allow using generate() and operator() in the following way: + * pipe.generate(input_ids, ov::genai::max_new_tokens(200),...) + * pipe(text, ov::genai::max_new_tokens(200),...) + */ + +static constexpr ov::Property<std::vector<int64_t>> begin_suppress_tokens{"begin_suppress_tokens"}; +static constexpr ov::Property<std::vector<int64_t>> suppress_tokens{"suppress_tokens"}; +static constexpr ov::Property<int64_t> decoder_start_token_id{"decoder_start_token_id"}; +static constexpr ov::Property<int64_t> pad_token_id{"pad_token_id"}; +static constexpr ov::Property<int64_t> transcribe_token_id{"transcribe_token_id"}; +static constexpr ov::Property<int64_t> translate_token_id{"translate_token_id"}; +static constexpr ov::Property<int64_t> no_timestamps_token_id{"no_timestamps_token_id"}; +static constexpr ov::Property<int64_t> begin_timestamps_token_id{"begin_timestamps_token_id"}; +static constexpr ov::Property<std::string> language{"language"}; +static constexpr ov::Property<std::string> task{"task"}; +static constexpr ov::Property<bool> return_timestamps{"return_timestamps"}; +static constexpr ov::Property<std::map<std::string, int64_t>> lang_to_id{"lang_to_id"}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/include/openvino/genai/whisper_pipeline.hpp b/src/cpp/include/openvino/genai/whisper_pipeline.hpp new file mode 100644 index 0000000000..689dd0eb35 --- /dev/null +++ b/src/cpp/include/openvino/genai/whisper_pipeline.hpp @@ -0,0 +1,101 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <openvino/openvino.hpp> +#include <optional> +#include <variant> + +#include "openvino/core/any.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/whisper_generation_config.hpp" + +namespace ov::genai { + +using OptionalWhisperGenerationConfig = std::optional<WhisperGenerationConfig>; + +using RawSpeechInput = std::vector<float>; + +struct WhisperDecodedResultChunk { + // start of chunk in seconds + float start_ts; + + // end of chunk in seconds + // -1.0f if chunk started but model did not predict an ending timestamp + // can happen if audio is cut off in the middle of a word + float end_ts = -1.0f; + std::string text; +}; + +struct WhisperDecodedResults : public DecodedResults { + std::optional<std::vector<WhisperDecodedResultChunk>> chunks = std::nullopt; +}; + +class OPENVINO_GENAI_EXPORTS WhisperPipeline { + class Impl; + std::unique_ptr<Impl> m_impl; + +public: + /** + * @brief
Constructs a WhisperPipeline from xml/bin files, tokenizers and configuration in the + * same dir. + * + * @param model_path Path to the dir with model xml/bin files, tokenizers and generation_configs.json + * @param device optional device + * @param plugin_config optional plugin_config + */ + WhisperPipeline(const std::string& model_path, + const std::string& device = "CPU", + const ov::AnyMap& plugin_config = {}); + + /** + * @brief Constructs a WhisperPipeline when ov::genai::Tokenizer is initialized manually using files + * from different dirs. + * + * @param model_path Path to the dir with model, tokenizer .xml/.bin files, and generation_configs.json + * @param tokenizer manually initialized ov::genai::Tokenizer + * @param device optional device + * @param plugin_config optional plugin_config + */ + WhisperPipeline(const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device = "CPU", + const ov::AnyMap& plugin_config = {}); + + ~WhisperPipeline(); + + /** + * @brief High level generate that receives raw speech as a vector of floats and returns decoded output. + * + * @param raw_speech_input raw speech input. Required to be normalized to near [-1, 1] range and have 16k Hz + * sampling rate. + * @param generation_config optional GenerationConfig + * @param streamer optional streamer + * @return WhisperDecodedResults decoded resulting text transcription + */ + WhisperDecodedResults generate(const RawSpeechInput& raw_speech_input, + OptionalWhisperGenerationConfig generation_config = std::nullopt, + StreamerVariant streamer = std::monostate()); + + /** + * @brief High level generate that receives raw speech as a vector of floats and returns decoded output. + * Properties can be in any order, e.g. pipe.generate(..., ov::genai::max_new_tokens(100), + * ov::genai::streamer(lambda_func)). + * + * @param raw_speech_input raw speech input + * @param properties properties + * @return WhisperDecodedResults decoded resulting text transcription + */ + template <typename... Properties> + util::EnableIfAllStringAny<WhisperDecodedResults, Properties...> generate(const RawSpeechInput& raw_speech_input, + Properties&&...
properties) { + return generate(raw_speech_input, AnyMap{std::forward<Properties>(properties)...}); + } + WhisperDecodedResults generate(const RawSpeechInput& raw_speech_input, const ov::AnyMap& config_map); + + ov::genai::Tokenizer get_tokenizer(); + WhisperGenerationConfig get_generation_config() const; + void set_generation_config(const WhisperGenerationConfig& config); +}; +} // namespace ov::genai diff --git a/src/cpp/src/attention_output.hpp b/src/cpp/src/attention_output.hpp new file mode 100644 index 0000000000..b46ede11d2 --- /dev/null +++ b/src/cpp/src/attention_output.hpp @@ -0,0 +1,8 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include "openvino/openvino.hpp" +using AttentionScoresForCacheOfSubsequence = ov::Tensor; +using AttentionScoresForEachDecoderLayer = std::vector<AttentionScoresForCacheOfSubsequence>; +using AttentionScoresForEachSubsequence = std::map<size_t, AttentionScoresForEachDecoderLayer>; diff --git a/src/cpp/src/block_manager.hpp b/src/cpp/src/block_manager.hpp new file mode 100644 index 0000000000..555abbdbcb --- /dev/null +++ b/src/cpp/src/block_manager.hpp @@ -0,0 +1,1053 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <memory> +#include <list> +#include <map> +#include <algorithm> +#include <fstream> +#include <chrono> + +#include "sequence_group.hpp" + + +namespace ov::genai { + +class KVCacheBlock { + int m_ref_count; + int m_index; + size_t m_hash; + std::chrono::time_point<std::chrono::system_clock> m_timestamp; +public: + using Ptr = std::shared_ptr<KVCacheBlock>; + using CPtr = std::shared_ptr<const KVCacheBlock>; + + explicit KVCacheBlock(int index) + : m_ref_count(0), + m_index(index), + m_timestamp(std::chrono::system_clock::now()) { } + + int get_index() const { + return m_index; + } + + bool is_free() const { + return m_ref_count == 0; + } + + void increment() { + ++m_ref_count; + } + + void release() { + OPENVINO_ASSERT(m_ref_count > 0); + --m_ref_count; + } + + bool copy_on_write() const { + return m_ref_count > 1; + } + + int get_references_count() const { + return m_ref_count; + } + + size_t get_hash() const { + return m_hash; + } + + void set_hash(size_t hash) { + m_hash = hash; + } + + void set_timestamp(const std::chrono::time_point<std::chrono::system_clock>& timestamp) { + m_timestamp = timestamp; + } + + std::chrono::time_point<std::chrono::system_clock> get_timestamp() { + return m_timestamp; + } +}; + +using BlocksPerLayer = std::vector<KVCacheBlock::Ptr>; + +/** + * @brief Allows to store and retrieve KV-cache blocks based on their content- and position-based hash. + * Blocks with the same prefix in the generated sequence will have the same hash. Blocks within this store + * are not owned by any sequence (but had been once) and may be either selected for overwriting, if the allocator + * runs out of fresh blocks, or reused if their contents match to the prefix-based requested hash. + */ +class OverwritableBlocksHashStore { + std::map<size_t, BlocksPerLayer> m_blocks; + size_t m_num_layers; + public: + /** + * Constructs the BlockHashStore. + * @param num_layers The number of separate attention layers with KV caches in the LLM associated with the pipeline. + */ + explicit OverwritableBlocksHashStore(size_t num_layers = 1) : m_num_layers(num_layers) { OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero"); } + + /** + * Registers allocated KV cache blocks as overwritable. 
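Before moving on, the reference-counting contract of KVCacheBlock above can be illustrated with a short sketch. This is an internal class of the continuous batching implementation; the snippet only demonstrates the semantics of its methods.

```cpp
#include <cassert>
#include "block_manager.hpp"

void kv_cache_block_semantics() {
    // A freshly constructed block is not owned by any sequence.
    auto block = std::make_shared<ov::genai::KVCacheBlock>(/*index=*/0);
    assert(block->is_free());

    // Two sequences referencing the same block => copy-on-write is required
    // before either of them may modify its contents.
    block->increment();
    block->increment();
    assert(block->copy_on_write());

    // Content/position-based hash and the timestamp used for LRU selection
    // in the overwritable block store.
    block->set_hash(0xABCDu);
    block->set_timestamp(std::chrono::system_clock::now());

    // Once every owner releases the block, it can return to the free pool
    // or be handed to the OverwritableBlocksHashStore.
    block->release();
    block->release();
    assert(block->is_free());
}
```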
The blocks must not be owned by any sequence. + * @param blocks_for_all_layers A vector of KV cache blocks (one for each decoder layer) to be added to the store. + * The hash of each block across the vector must be identical. + */ + void add(const BlocksPerLayer& blocks_for_all_layers) { + OPENVINO_ASSERT(blocks_for_all_layers.size() == m_num_layers); + bool is_all_free = std::all_of(blocks_for_all_layers.begin(), blocks_for_all_layers.end(), [](const KVCacheBlock::Ptr& block_ptr) { return block_ptr->is_free(); }); + OPENVINO_ASSERT(is_all_free); + size_t hash = blocks_for_all_layers[0]->get_hash(); + for (const auto& block : blocks_for_all_layers) { + if (block->get_hash() != hash) { + OPENVINO_THROW("internal error - block hashes for all layers must be equal"); + } + } + OPENVINO_ASSERT(m_blocks.count(hash) == 0); + m_blocks[hash] = blocks_for_all_layers; + } + + + /** + * Retrieves KV cache blocks from storage by their hash (expected to be identical for all layers) for their contents + * to be reused by another sequence. Returned blocks will have reference counters equal to 1. + * @param hash The hash value to look up in the store. + * @return A vector of KV cache blocks (one for each decoder layer) previously stored under this hash. + */ + BlocksPerLayer get_block_to_restore(size_t hash) { + auto it = m_blocks.find(hash); + if (it == m_blocks.end()) + { + return {}; + } + BlocksPerLayer blocks_for_all_layers = it->second; + for (auto& block_ptr : blocks_for_all_layers) { + + block_ptr->set_timestamp(std::chrono::system_clock::now()); + block_ptr->increment(); + } + m_blocks.erase(it); + return blocks_for_all_layers; + } + + /** + * Pops the least recently used blocks from the store to be used and overwritten by another sequence. + * Returned blocks will have reference counters equal to 1. + * @return A vector of KV cache blocks (one for each decoder layer) that has least recently been added to the store + * based on the timestamp. + */ + BlocksPerLayer get_lru_block_to_overwrite() { + if (m_blocks.empty()) { + return {}; + } + auto hash_and_blocks_for_all_layers = std::min_element(std::begin(m_blocks), std::end(m_blocks), [](const auto& lhs, const auto& rhs) -> bool { return lhs.second[0]->get_timestamp() < rhs.second[0]->get_timestamp(); }); + auto blocks_for_all_layers = hash_and_blocks_for_all_layers->second; + auto timestamp = std::chrono::system_clock::now(); + for (auto& block_ptr : blocks_for_all_layers) { + block_ptr->set_timestamp(timestamp); + block_ptr->increment(); + } + m_blocks.erase(hash_and_blocks_for_all_layers->first); + return blocks_for_all_layers; + } + + /** + * + * @return Number of blocks (per layer) currently in the store. + */ + size_t num_blocks() const { + return m_blocks.size(); + } + + /** + * @brief Removes blocks matching to the supplied hashes from the store + * @param hashes_to_discard A set of hashes. For each hash, if it is present in the store, the corresponding block will be discarded + * and the block added to the returned vector. If a hash is not present in the store, it is silently ignored. + * @return A vector of blocks, each element corresponding to a removed hash. 
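Continuing with the store API above, here is a sketch of the add / restore cycle; the hashes and block indices are fabricated for illustration.

```cpp
#include "block_manager.hpp"

void hash_store_roundtrip() {
    using ov::genai::KVCacheBlock;

    const size_t num_layers = 2;
    ov::genai::OverwritableBlocksHashStore store(num_layers);

    // One block per decoder layer, all carrying the same prefix hash and not
    // owned by any sequence (reference count == 0), as add() requires.
    ov::genai::BlocksPerLayer blocks;
    for (size_t layer = 0; layer < num_layers; ++layer) {
        auto block = std::make_shared<KVCacheBlock>(static_cast<int>(layer));
        block->set_hash(42);
        blocks.push_back(block);
    }
    store.add(blocks);

    // A sequence whose prefix hashes to 42 takes the cached contents back;
    // the returned blocks come out with a reference count of 1.
    ov::genai::BlocksPerLayer restored = store.get_block_to_restore(42);

    // A miss returns an empty vector; when the allocator runs out of fresh
    // blocks it would instead call get_lru_block_to_overwrite().
    ov::genai::BlocksPerLayer miss = store.get_block_to_restore(43);
    (void)restored;
    (void)miss;
}
```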
+ */ + std::vector<BlocksPerLayer> clean_store(const std::set<uint64_t>& hashes_to_discard) { + std::vector<BlocksPerLayer> retval; + retval.reserve(hashes_to_discard.size()); + for (uint64_t hash : hashes_to_discard) { + auto it = m_blocks.find(hash); + if (it != m_blocks.end()) { + retval.push_back(it->second); + m_blocks.erase(it); + } + } + return retval; + } +}; + +class CacheStateDumper; + +/** + * @brief Maintains a pool of KV cache block descriptors (layered as configured at initialization), freeing or allocating + * them as requested. + */ +class BlockAllocator { + std::vector<std::list<KVCacheBlock::Ptr>> m_free_blocks; + int m_total_num_blocks; + friend class CacheStateDumper; + size_t m_num_layers; + bool m_enable_prefix_caching; + ov::genai::OverwritableBlocksHashStore m_overwriteable_blocks; +public: + /** + * Constructs the BlockAllocator. + * @param num_blocks Number of KV cache blocks in the free block pool to be owned by this allocator. + * @param enable_prefix_caching Whether prefix caching should be enabled for this allocator. + * See also the equivalent parameter in ov::genai::ContinuousBatchingPipeline + * @param num_layers The number of separate attention layers with KV caches in the LLM associated with the pipeline. + * Blocks returned will be vectors with this size, each vector entry to be associated with a separate layer's KV cache. + */ + BlockAllocator(int num_blocks, bool enable_prefix_caching, size_t num_layers = 1) : + m_total_num_blocks(num_blocks), m_num_layers(num_layers), m_enable_prefix_caching(enable_prefix_caching), m_overwriteable_blocks(num_layers) { + OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero"); + m_free_blocks.resize(m_num_layers); + for (auto& per_layer_block_list : m_free_blocks) { + for (int block_id = 0; block_id < m_total_num_blocks; ++block_id) { + per_layer_block_list.push_back(std::make_shared<KVCacheBlock>(block_id)); + } + } + } + + ~BlockAllocator() { + // sanity check to validate that all blocks are freed + // OPENVINO_ASSERT(m_total_num_blocks == m_free_blocks.size()); + } + + /** + * Returns the number of free blocks for a given layer. + * @param layer_idx Index of the layer. + * @return Number of free blocks for this layer. + */ + size_t num_free_blocks(size_t layer_idx) const { + return m_free_blocks[layer_idx].size() + m_overwriteable_blocks.num_blocks(); + } + + /** + * Returns the number of overwriteable blocks (in a prefix caching scenario). + * @return Number of overwriteable blocks for this layer. + */ + size_t num_overwriteable_blocks() const { + return m_overwriteable_blocks.num_blocks(); + } + + /** + * Returns a boolean describing whether a given number of blocks can be allocated, based on the number of currently + * available free blocks. + * @param num_blocks The number of blocks requested to be allocated. + * @return Whether `num_blocks` can be allocated at this time. + */ + bool can_allocate_blocks(size_t num_blocks) const { + bool retval = true; + for (size_t i = 0; i < m_num_layers; i++) retval &= can_allocate_blocks(num_blocks, i); + return retval; + } + + /** + * Returns a boolean describing whether a given number of blocks can be allocated for a given layer, + * based on the number of currently available free blocks for the same layer. + * @param num_blocks The number of blocks requested to be allocated. + * @param layer_idx The index of the layer for which the allocation should occur. + * @return Whether `num_blocks` can be allocated at this time for this layer. 
+ */ + bool can_allocate_blocks(size_t num_blocks, size_t layer_idx) const { + return num_blocks <= num_free_blocks(layer_idx); + } + + /** + * Frees a given block for a given layer. If no sequence is associated with the block after freeing, the block + * is returned to the "free" pool. + * @param block_ptr The block to be freed. + * @param layer_idx The index of the layer with which the block is associated. + */ + void free(KVCacheBlock::Ptr& block_ptr, size_t layer_idx) { + OPENVINO_ASSERT(!m_enable_prefix_caching); + OPENVINO_ASSERT(layer_idx < m_num_layers); + block_ptr->release(); + if (block_ptr->is_free()) { + m_free_blocks[layer_idx].push_back(block_ptr); + } + } + + /** + * Frees a block for each layer. If no sequence is associated with the blocks after freeing, the blocks + * are either returned to the "free" pool, or, if prefix caching is enabled, stored internally for its contents + * to be potentially reused if a prefix of a new sequence matches to the prefix with which the currently freed blocks + * were computed. + * @param blocks_for_all_layers The blocks to be freed (one for each layer). + */ + void free(const BlocksPerLayer& blocks_for_all_layers) { + OPENVINO_ASSERT(blocks_for_all_layers.size() == m_num_layers); + for (size_t i = 0; i < m_num_layers; i++) { + auto& block_ptr = blocks_for_all_layers[i]; + block_ptr->release(); + } + + auto free_predicate = [](const KVCacheBlock::Ptr& block_ptr) { return block_ptr->is_free(); }; + bool is_any_free = std::any_of(blocks_for_all_layers.begin(), blocks_for_all_layers.end(), free_predicate); + bool is_all_free = false; + if (is_any_free && m_num_layers > 1) { + is_all_free = std::all_of(blocks_for_all_layers.begin(), blocks_for_all_layers.end(), free_predicate); + OPENVINO_ASSERT(is_all_free, "blocks across layers must be freed simultaneously"); + } + + if (is_any_free) { + // is_all_free == true due to assert above + if (m_enable_prefix_caching) + { + std::set<uint64_t> hashes_across_blocks; + for (const auto& block : blocks_for_all_layers) { + hashes_across_blocks.insert(block->get_hash()); + } + bool is_all_have_same_hash = (hashes_across_blocks.size() == 1); + if (is_all_have_same_hash) { + // guard against hash collision + auto colliding_blocks = m_overwriteable_blocks.clean_store(hashes_across_blocks); + if (!colliding_blocks.empty()) { + OPENVINO_ASSERT(colliding_blocks.size() == 1); + BlocksPerLayer& colliding_blocks_per_layer = colliding_blocks[0]; + bool is_same_block = true; + for (size_t layer_idx = 0; layer_idx < colliding_blocks_per_layer.size(); layer_idx++) { + if (colliding_blocks_per_layer[layer_idx]->get_index() != blocks_for_all_layers[layer_idx]->get_index()) { + is_same_block = false; + break; + } + } + + if (is_same_block) { + OPENVINO_THROW("internal error - double free when prefix caching"); + } + + // actual collision case + for (size_t layer_idx = 0; layer_idx < colliding_blocks_per_layer.size(); layer_idx++) { + m_free_blocks[layer_idx].push_back(colliding_blocks_per_layer[layer_idx]); + } + } + m_overwriteable_blocks.add(blocks_for_all_layers); + } else { + // This set of blocks to be freed corresponds to blocks from different time steps, and thus not eligible for caching + // TODO (vshampor): more fine-grained hash store control + for (size_t layer_idx = 0; layer_idx < blocks_for_all_layers.size(); layer_idx++) { + m_free_blocks[layer_idx].push_back(blocks_for_all_layers[layer_idx]); + } + } + } + else { + for (size_t layer_idx = 0; layer_idx < blocks_for_all_layers.size(); layer_idx++) { + 
m_free_blocks[layer_idx].push_back(blocks_for_all_layers[layer_idx]); + } + } + } + } + + /** + * Allocates and returns one block for each layer. Can only be used if prefix caching is disabled. + * @return A vector of blocks allocated (one for each layer). + */ + BlocksPerLayer allocate_block() { + BlocksPerLayer retval; + retval.reserve(m_num_layers); + for (size_t i = 0; i < m_num_layers; i++) { + retval.push_back(allocate_block(i)); + } + return retval; + } + + /** + * Allocates and returns one block for a given layer. Can only be used if prefix caching is disabled. + * @return The block allocated for this layer. + */ + KVCacheBlock::Ptr allocate_block(size_t layer_idx) { + OPENVINO_ASSERT(layer_idx < m_free_blocks.size()); + OPENVINO_ASSERT(!m_enable_prefix_caching); + OPENVINO_ASSERT(can_allocate_blocks(1, layer_idx)); + KVCacheBlock::Ptr allocated_block = m_free_blocks[layer_idx].front(); + allocated_block->increment(); + m_free_blocks[layer_idx].pop_front(); + return allocated_block; + } + + /** + * Returns one block for each layer, either by allocating new blocks if the allocator's initial "free" pool is not + * exhausted, or by selecting a least recently used block from the hash store (so that its contents would be overwritten) otherwise. + * Can only be used if prefix caching is enabled. + * @param[in] hash The expected hash of the new block (based on the current sequence prefix). + * @param[in,out] cached_blocks The map of known hashes to already allocated and filled blocks. If the blocks are freshly allocated, + * it is added to this map under `hash`. If the blocks are reused from the internal overwritable block store, + * the previous hash entry for these is deleted and the reused blocks are likewise stored in the map under the (new) `hash`. + * @return A vector of blocks (one for each layer), either freshly allocated or reused for overwriting, + * or an empty vector if cache is exhausted. + */ + BlocksPerLayer allocate_block(size_t hash, std::map<uint64_t, BlocksPerLayer>& cached_blocks) { + OPENVINO_ASSERT(m_enable_prefix_caching); + OPENVINO_ASSERT(can_allocate_blocks(1)); + + if (m_free_blocks[0].size() > 0) { + // allocate new empty block + BlocksPerLayer allocated_blocks; + allocated_blocks.reserve(m_num_layers); + for (size_t i = 0; i < m_num_layers; i++) { + KVCacheBlock::Ptr allocated_block = m_free_blocks[i].front(); + allocated_block->increment(); + allocated_block->set_hash(hash); + allocated_blocks.push_back(allocated_block); + m_free_blocks[i].pop_front(); + } + cached_blocks[hash] = allocated_blocks; + return allocated_blocks; + } + if (m_overwriteable_blocks.num_blocks() > 0) { + // get least recently used block from store and reuse it + BlocksPerLayer blocks_for_all_layers = m_overwriteable_blocks.get_lru_block_to_overwrite(); + cached_blocks.erase(blocks_for_all_layers[0]->get_hash()); + + // update block with new hash + for (auto& block : blocks_for_all_layers) { + block->set_hash(hash); + } + cached_blocks[hash] = blocks_for_all_layers; + return blocks_for_all_layers; + } + // should not be reachable due to the can_allocate_blocks assert in the beginning + return {}; + } + + /** + * Returns the blocks corresponding to a given hash either from the internal allocator store, + * or from the supplied storage map, or nothing if there are no blocks corresponding to this hash. + * + * @param hash The hash of the blocks to be looked up. + * @param cached_blocks The map of known hashes to already allocated and filled blocks. 
+ * @return A vector of blocks (one for each layer) corresponding to this hash, or an empty vector if the hash is not found in the map. + */ + BlocksPerLayer get_cached_block(size_t hash, std::map<uint64_t, BlocksPerLayer>& cached_blocks) { + auto blocks_for_all_layers = m_overwriteable_blocks.get_block_to_restore(hash); + if (!blocks_for_all_layers.empty()) { + // use cached block from internal store + return blocks_for_all_layers; + } + auto it = cached_blocks.find(hash); + if (it != cached_blocks.end()) { + // use cached block from cached_blocks + // TODO: add tokens validation in case of hash collision + blocks_for_all_layers = it->second; + for (auto& block_ptr : cached_blocks[hash]) { + block_ptr->increment(); + } + return blocks_for_all_layers; + } + return {}; + } + + /** + * @return The percentage of the allocator's free block pool utilization. + */ + float get_used_percentage() const { + size_t sum = 0; + for (size_t layer_idx = 0; layer_idx < m_num_layers; layer_idx++) sum += num_free_blocks(layer_idx); + return static_cast<float>(m_num_layers * m_total_num_blocks - sum) / (m_num_layers * m_total_num_blocks) * 100; + } +}; + +/** + * @brief Works with `ov::genai::SequenceGroup`s and individual `ov::genai::Sequence`s to assign KV cache blocks to these + * at each pipeline generation step. A block table is kept for each sequence, storing the indices of "physical" + * KV cache blocks currently allocated to a given sequence. Each block table defines a linear "logical" block space, with positions of + * blocks within the block table being associated with "logical" block indices. + */ +class BlockManager { + friend class CacheStateDumper; + BlockAllocator m_allocator; + bool m_enable_prefix_caching; + size_t m_block_size; + size_t m_num_layers; + // TODO: caching time can probably be improved if we use the prefix tree + std::map<uint64_t, BlocksPerLayer> m_prefix_hash_to_occupied_block_map; + + // stores blocks for each sequence (not sequence group) + // the same block can be seen in multiple block_tables for different sequences + std::map<uint64_t, std::vector<BlocksPerLayer>> m_block_table; + + std::mutex m_cached_blocks_map_mutex; +public: + /** + * Constructs the BlockManager. + * @param num_blocks Number of KV cache blocks available for assignment to the sequences. + * @param enable_prefix_caching Whether prefix caching should be enabled for this allocator. + * See also the equivalent parameter in ov::genai::ContinuousBatchingPipeline + * @param block_size The size of an individual KV cache block in tokens. + * @param num_layers The number of separate attention layers with KV caches in the LLM associated with the pipeline. + * In current implementation each layer must have the same number of logical blocks allocated at all times. + */ + BlockManager(int num_blocks, bool enable_prefix_caching, size_t block_size, size_t num_layers = 1) + : m_allocator(num_blocks, enable_prefix_caching, num_layers), m_enable_prefix_caching(enable_prefix_caching), m_block_size(block_size), + m_num_layers(num_layers) { + OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero"); + } + + ~BlockManager() { + // sanity check that all sequences are freed + // OPENVINO_ASSERT(m_block_table.empty()); + } + + /** + * Gets the block table for a given sequence. + * @param seq_id The identifier of an ov::genai::Sequence. + * @return A vector of per-layer blocks occupied by this sequence. + * Per-layer blocks are themselves each a vector with one KV cache block per layer. 
+     */
+    const std::vector<BlocksPerLayer>& get_block_tables(uint64_t seq_id) const {
+        return m_block_table.at(seq_id);
+    }
+
+    /**
+     * Gets the block table for a given sequence and a given layer.
+     * @param seq_id The identifier of an ov::genai::Sequence.
+     * @param layer_idx The index of a layer.
+     * @return The block table of this sequence for the given layer: a vector of KV cache blocks in logical block order.
+     */
+    const std::vector<KVCacheBlock::Ptr>& get_block_table(uint64_t seq_id, size_t layer_idx) {
+        OPENVINO_ASSERT(m_block_table.count(seq_id) == 1);
+        return m_block_table[seq_id][layer_idx];
+    }
+
+    /**
+     * Frees a number of blocks with the highest logical indices from all sequences within a sequence group.
+     * @param sequence_group The sequence group to free blocks from.
+     * @param num_required_blocks The number of blocks to be freed. Will free an equal
+     * number of blocks from each sequence in the group so that at least this number of blocks is freed in total.
+     * @return Number of blocks freed in each sequence in the group.
+     */
+    const size_t free_group_partially(SequenceGroup::Ptr sequence_group, size_t num_required_blocks) {
+        // round up so that at least num_required_blocks blocks are freed in total across the sequences
+        size_t blocks_num = std::ceil(static_cast<double>(num_required_blocks) / sequence_group->get_not_finished_sequences().size());
+        auto running_sequences = sequence_group->get_not_finished_sequences();
+        for (size_t idx = 0; idx < running_sequences.size(); ++idx) {
+            auto seq_id = running_sequences[idx]->get_id();
+            OPENVINO_ASSERT(m_block_table.count(seq_id) > 0, "Invalid sequence group.");
+            free_sequence_partially(seq_id, blocks_num);
+        }
+        return blocks_num;
+    }
+
+    const size_t free_last_block_from_each_sequence(SequenceGroup::Ptr sequence_group) {
+        size_t blocks_released = 0;
+        auto running_sequences = sequence_group->get_not_finished_sequences();
+        for (size_t idx = 0; idx < running_sequences.size(); ++idx) {
+            auto seq_id = running_sequences[idx]->get_id();
+            OPENVINO_ASSERT(m_block_table.count(seq_id) > 0, "Invalid sequence group.");
+            if (free_last_block(seq_id)) {
+                blocks_released++;
+            }
+        }
+        return blocks_released;
+    }
+
+    bool free_last_block(size_t seq_id) {
+        auto& block_table = m_block_table[seq_id];
+        OPENVINO_ASSERT(block_table[0].size() >= 1);
+        BlocksPerLayer blocks_to_free;
+        blocks_to_free.reserve(m_num_layers);
+        for (size_t layer_idx = 0; layer_idx < m_num_layers; layer_idx++) {
+            blocks_to_free.push_back(block_table[layer_idx].back());
+        }
+        m_allocator.free(blocks_to_free);
+        for (size_t layer_idx = 0; layer_idx < m_num_layers; layer_idx++) {
+            block_table[layer_idx].resize(block_table[layer_idx].size() - 1);
+        }
+
+        if (block_table[0].size() == 0) {
+            OPENVINO_ASSERT(m_block_table.erase(seq_id) == 1);
+        }
+        return blocks_to_free[0]->is_free();
+    }
+
+    const size_t free_partially_beam_search_group(SequenceGroup::Ptr sequence_group, size_t num_required_blocks) {
+        size_t physical_blocks_released = 0;
+        size_t logical_blocks_released = 0;
+        while (num_required_blocks > physical_blocks_released) {
+            size_t released_count = free_last_block_from_each_sequence(sequence_group);
+            logical_blocks_released++;
+            // stop once the whole context would be freed (unsigned-safe comparison)
+            if (sequence_group->get_context_len() <= logical_blocks_released * m_block_size) {
+                break;
+            }
+            physical_blocks_released += released_count;
+        }
+        return logical_blocks_released;
+    }
+
+    /**
+     * Returns the total number of distinct physical blocks occupied by a given sequence group.
+     * @param sequence_group The sequence group.
+     * @return The number of distinct physical blocks occupied by this sequence group.
+ */ + const size_t get_number_of_blocks_occupied_by_sequence(SequenceGroup::Ptr sequence_group) { + auto running_sequences = sequence_group->get_not_finished_sequences(); + std::set<size_t> indices; + for (size_t idx = 0; idx < running_sequences.size(); ++idx) { + auto seq_id = running_sequences[idx]->get_id(); + if (m_block_table.count(seq_id) == 0) { + continue; + } + auto block_table = m_block_table[seq_id][0]; // assuming all layers always have equal sets of blocks + for (const auto& block : block_table) { + indices.insert(block->get_index()); + } + } + return indices.size(); + } + + /** + * @param seq_id The identifier of an ov::genai::Sequence + * @return Whether or not this BlockManager is managing this sequence group. + */ + const bool has_block_table(uint64_t seq_id) { + return m_block_table.count(seq_id) > 0; + } + + /** + * @return The number of KV cache blocks available to be assigned to new sequences. + */ + size_t num_free_blocks() const { + return m_allocator.num_free_blocks(0); // relying on the invariant that all layers have identical number of blocks + } + + /** + * @param num_blocks A number of KV cache blocks + * @return Whether this number of KV cache blocks may be assigned to new sequences. + */ + bool can_allocate_blocks(size_t num_blocks) const { + for (size_t layer_idx = 0; layer_idx < m_num_layers; layer_idx++) { + if (!m_allocator.can_allocate_blocks(num_blocks, layer_idx)) return false; + } + return true; + } + + /** + * Allocates a given number of KV cache blocks to a given sequence. + * @param sequence The sequence for the blocks to be allocated to. + * @param num_blocks The number of KV cache blocks to be allocated. + * @param prompt_ids Raw token values of the prompt for this sequence. Required if prefix caching is enabled. + */ + void allocate(ov::genai::Sequence::Ptr sequence, size_t num_blocks, const ov::genai::TokenIds& prompt_ids = {}) { + OPENVINO_ASSERT(num_blocks > 0 && can_allocate_blocks(num_blocks)); + OPENVINO_ASSERT(!m_enable_prefix_caching || prompt_ids.size() > 0, "prompt_ids should be set for hash calculation."); + + auto sequence_id = sequence->get_id(); + if (m_block_table.find(sequence_id) == m_block_table.end()) { + m_block_table[sequence_id].resize(m_num_layers); + } + + auto& block_table = m_block_table[sequence_id][0]; + auto content_length = sequence->get_generated_len() + prompt_ids.size(); + size_t allocated_blocks = block_table.size(); // assuming all layers have the same number of allocated blocks + size_t num_hashed_tokens = allocated_blocks * m_block_size; + + + if (!m_enable_prefix_caching) { + for (size_t layer_idx = 0; layer_idx < m_block_table[sequence_id].size(); layer_idx++) { + auto block_table = m_block_table[sequence_id][layer_idx]; + for (size_t i = 0; i < num_blocks; ++i) { + ov::genai::KVCacheBlock::Ptr block = m_allocator.allocate_block(layer_idx); + OPENVINO_ASSERT(block != nullptr); + m_block_table[sequence_id][layer_idx].push_back(block); + } + } + } else { + // If last block was restored from cache by using of a partially filled block, + // its hash would correspond to partially filled block. + // In this case hash needs to be updated to the hash of fully filled block. 
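+            // Note: `block_table` below refers to layer 0 and is used as the representative; when the hash of the
+            // last block changes, the update is applied to the last block of every layer, since the per-layer
+            // block tables are kept in lockstep.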
+ if (block_table.size() > 0) { + KVCacheBlock::Ptr last_block = block_table.back(); + auto hash = sequence->get_hash(block_table.size() * m_block_size); + auto prev_hash = last_block->get_hash(); + if (prev_hash != hash) { + BlocksPerLayer last_blocks_vec; + last_blocks_vec.reserve(m_num_layers); + for (size_t layer_idx = 0; layer_idx < m_num_layers; layer_idx++) { + auto& lst_blk = m_block_table[sequence_id][layer_idx].back(); + lst_blk->set_hash(hash); + m_prefix_hash_to_occupied_block_map.erase(prev_hash); + last_blocks_vec.push_back(lst_blk); + } + m_prefix_hash_to_occupied_block_map[hash] = last_blocks_vec; + } + } + for (size_t i = 0; i < num_blocks; ++i) { + num_hashed_tokens += m_block_size; + if (num_hashed_tokens > content_length) { + num_hashed_tokens = content_length; + } + auto hash = sequence->get_hash(num_hashed_tokens); + auto blocks_for_all_layers = m_allocator.allocate_block(hash, m_prefix_hash_to_occupied_block_map); + for (size_t layer_idx = 0; layer_idx < blocks_for_all_layers.size(); layer_idx++) { + m_block_table[sequence_id][layer_idx].push_back(blocks_for_all_layers[layer_idx]); + } + } + } + } + + /** + * @return Percentage of KV cache used by all sequences. + */ + float get_used_percentage() const { + return m_allocator.get_used_percentage(); + } + + /** + * @brief Forks a sequence, establishing a new sequence from an existing one, reusing + * currently allocated blocks of the existing sequence. + * @param parent_id Parent sequence identifier + * @param child_id Sequence identifier for the new, forked sequence. Must be unique across + * other sequences tracked by this BlockManager. + */ + void fork_sequence(uint64_t parent_id, uint64_t child_id) { + OPENVINO_ASSERT(m_block_table.count(child_id) == 0); + m_block_table[child_id].resize(m_num_layers); + for (size_t layer_idx = 0; layer_idx < m_num_layers; layer_idx++) { + m_block_table[child_id][layer_idx].reserve(m_block_table[parent_id][layer_idx].size()); + for (KVCacheBlock::Ptr &block: m_block_table[parent_id][layer_idx]) { + block->increment(); + m_block_table[child_id][layer_idx].push_back(block); + } + } + } + + /** + * @brief Frees all blocks for a given sequence. + * @param seq_id Identifier of the sequence to free. + */ + void free_sequence(size_t seq_id) { + OPENVINO_ASSERT(m_block_table.find(seq_id) != m_block_table.end(), "sequence with id ", seq_id, + " not found in BlockManager, but requested to free"); + auto& block_table = m_block_table[seq_id]; + size_t effective_num_layers = block_table.size(); + size_t num_allocated_blocks = block_table[0].size(); + for (size_t i = 0; i < num_allocated_blocks; i++) { + BlocksPerLayer blocks_to_free; + blocks_to_free.reserve(effective_num_layers); + for (size_t layer_idx = 0; layer_idx < effective_num_layers; layer_idx++) { + blocks_to_free.push_back(block_table[layer_idx][i]); + } + m_allocator.free(blocks_to_free); + } + + OPENVINO_ASSERT(m_block_table.erase(seq_id) == 1); + } + + /** + * Frees a specified number of blocks from the end of a given sequence. + * If a sequence is freed completely, it is removed from this BlockManager. + * @param seq_id Sequence identifier + * @param block_num Number of blocks to be freed from the sequence, starting at + * the highest logical block. 
+ */ + void free_sequence_partially(size_t seq_id, size_t block_num) { + size_t effective_num_layers = m_block_table[seq_id].size(); + for (size_t layer_idx = 0; layer_idx < effective_num_layers; layer_idx++) { + auto& layer_block_table = m_block_table[seq_id][layer_idx]; + OPENVINO_ASSERT(layer_block_table.size() >= block_num); + } + + for (size_t idx = 0; idx < block_num; idx++) { + BlocksPerLayer blocks_to_free; + blocks_to_free.reserve(effective_num_layers); + for (size_t layer_idx = 0; layer_idx < effective_num_layers; layer_idx++) { + auto &layer_block_table = m_block_table[seq_id][layer_idx]; + size_t block_idx = layer_block_table.size() - idx - 1; + blocks_to_free.push_back(layer_block_table[block_idx]); + } + m_allocator.free(blocks_to_free); + } + + for (size_t layer_idx = 0; layer_idx < effective_num_layers; layer_idx++) { + auto& layer_block_table = m_block_table[seq_id][layer_idx]; + layer_block_table.resize(layer_block_table.size() - block_num); + } + + auto empty_predicate = [](const BlocksPerLayer& v) { return v.empty(); }; + bool any_freed_completely = std::any_of(m_block_table[seq_id].begin(), m_block_table[seq_id].end(), empty_predicate); + if (any_freed_completely) { + bool all_freed_completely = std::all_of(m_block_table[seq_id].begin(), m_block_table[seq_id].end(), empty_predicate); + // The invariant must hold at BlockManager level that all per-layer block tables + // must have the same size + OPENVINO_ASSERT(all_freed_completely, "block tables across layers should only be empty all at once"); + OPENVINO_ASSERT(m_block_table.erase(seq_id) == 1); + } + } + + /** + * Frees specific blocks layer-wise from a given sequence. + * @param seq_id Sequence identifier for the blocks to be freed from. + * @param logical_block_index_sets_to_free Sets (one for each layer) of logical block indices to be freed from this sequence. 
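+     *
+     * A hypothetical call sketch (the layer count and indices are made up), freeing logical blocks 2 and 3 from
+     * every layer of sequence `seq_id`:
+     * @code
+     * std::vector<std::set<size_t>> to_free = { {2, 3}, {2, 3} };  // one set of logical indices per layer
+     * block_manager.free_blocks_from_sequence(seq_id, to_free);
+     * @endcode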
+ */ + void free_blocks_from_sequence(size_t seq_id, const std::vector<std::set<size_t>>& logical_block_index_sets_to_free) { + std::vector<std::vector<size_t>> logical_block_indices_to_free(logical_block_index_sets_to_free.size()); + for (size_t i = 0; i < logical_block_index_sets_to_free.size(); i++) { + const auto& index_set = logical_block_index_sets_to_free[i]; + auto& index_vector = logical_block_indices_to_free[i]; + index_vector.resize(index_set.size()); + std::copy(index_set.begin(), index_set.end(), index_vector.begin()); + } + + size_t presumed_num_layers = logical_block_indices_to_free.size(); + OPENVINO_ASSERT(m_num_layers == presumed_num_layers); + for (size_t i = 0; i < presumed_num_layers; i++) { + OPENVINO_ASSERT(logical_block_indices_to_free[i].size() == logical_block_indices_to_free[0].size(), "must free the same amount of blocks per each layer at once"); + } + + if (logical_block_indices_to_free[0].empty()) { + return; + } + + size_t num_blocks_to_free = logical_block_indices_to_free[0].size(); + + // free blocks at the allocator level + for (size_t block_idx = 0; block_idx < num_blocks_to_free; block_idx++) { + BlocksPerLayer per_layer_cache_blocks_to_free; + per_layer_cache_blocks_to_free.reserve(presumed_num_layers); + for (size_t layer_idx = 0; layer_idx < presumed_num_layers; layer_idx++) { + auto& per_layer_block_table = m_block_table[seq_id][layer_idx]; + size_t block_table_size = per_layer_block_table.size(); + size_t logical_block_idx = *(logical_block_indices_to_free[layer_idx].begin() + block_idx); + OPENVINO_ASSERT(logical_block_idx <= block_table_size, + "cannot free logical block ", logical_block_idx, + "from sequence ", seq_id, " since it only has ", block_table_size, "logical blocks"); + auto block = per_layer_block_table[logical_block_idx]; + per_layer_cache_blocks_to_free.push_back(block); + } + m_allocator.free(per_layer_cache_blocks_to_free); + } + + // remove freed entries from the block table at this BlockManager's level + for (size_t layer_idx = 0; layer_idx < presumed_num_layers; layer_idx++) { + auto& per_layer_block_table = m_block_table[seq_id][layer_idx]; + size_t block_table_size = per_layer_block_table.size(); + const auto& per_layer_block_indices_to_free = logical_block_index_sets_to_free[layer_idx]; + BlocksPerLayer new_sequence_blocks; + OPENVINO_ASSERT(per_layer_block_indices_to_free.size() <= block_table_size, "too many blocks to free"); + new_sequence_blocks.reserve(block_table_size - per_layer_block_indices_to_free.size()); + for (size_t logical_block_idx = 0; logical_block_idx < block_table_size; logical_block_idx++) { + if (per_layer_block_indices_to_free.find(logical_block_idx) == per_layer_block_indices_to_free.end()) { + // idx NOT in the requested set to free, need to keep this block + new_sequence_blocks.push_back(per_layer_block_table[logical_block_idx]); + } + } + + per_layer_block_table = new_sequence_blocks; + } + } + + /** + * @param seq_group Pointer to a sequence group. + * @return Whether enough KV cache blocks are available to host the sequences in the group. + */ + bool can_append_slots(SequenceGroup::CPtr seq_group) { + return required_blocks_count(std::move(seq_group)) <= m_allocator.num_free_blocks(0); + } + + /** + * @param seq_group Pointer to a sequence group. + * @return The number of blocks necessary to host the sequences in the group, excluding the already + * allocated ones. 
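+     *
+     * For example, if the running sequences of the group share a copy-on-write last block referenced n times and
+     * need no additional logical blocks, n - 1 blocks are counted so that every sequence can receive its own copy
+     * of the partially filled block.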
+ */ + size_t required_blocks_count(SequenceGroup::CPtr seq_group) { + std::vector<Sequence::CPtr> running_sequences = seq_group->get_running_sequences(); + size_t blocks_count = 0; // total number of needed blocks for sequence group + std::set<size_t> last_block_ids; // unique last block indices + + for (auto seq: running_sequences) { + auto seq_id = seq->get_id(); + if (m_block_table.find(seq_id) == m_block_table.end()) { + // the block table is empty, so we need to allocate the number of blocks equal to number of logical blocks + blocks_count += seq_group->get_num_logical_blocks(); + continue; + } + auto& block_table = m_block_table[seq_id][0]; + size_t num_physical_blocks = block_table.size(); + OPENVINO_ASSERT(num_physical_blocks > 0); + + if (num_physical_blocks > seq_group->get_num_logical_blocks()) + // new blocks are not required + // Case when num_physical_blocks == seq_group->get_num_logical_blocks() may still need block allocation + // (such as when a sequence with an incomplete last block was forked) and is handled further in the + // iteration + continue; + + size_t last_block_id = block_table.back()->get_index(); + + if (last_block_ids.find(last_block_id) != last_block_ids.end()) + // this block was already processed + continue; + last_block_ids.insert(last_block_id); + + size_t needed_blocks_per_sequence = seq_group->get_num_logical_blocks() - num_physical_blocks; + + KVCacheBlock::Ptr last_block = block_table.back(); + if (last_block->copy_on_write()) { + // block is used only by multiple sequences + auto references_count = last_block->get_references_count(); + + if (needed_blocks_per_sequence == 0) { + // case when last block is not completely filled and needs to be copied n - 1 times, where n - references count + blocks_count += references_count - 1; + } + else { + blocks_count += needed_blocks_per_sequence * references_count; + } + } + else { + // block is used only by one sequence + blocks_count += needed_blocks_per_sequence; + } + } + return blocks_count; + } + + /** + * Allocates just enough physical KV cache blocks to a sequence group to be enough for the sequences in it. If the sequences + * in the group were forked before and their last block is a copy-on-write, then the block contents will have to be copied separately + * into the freshly allocated block copies as reported in the returned map. + * @param seq_group Pointer to a sequence group. + * @return A map where each key is an index of a source *physical* block, and the corresponding value is a list of newly allocated *physical* block + * indices into which the source block contents should be copied into separately. + */ + std::map<size_t, std::list<size_t>> append_slots(SequenceGroup::Ptr seq_group) { + // Will always allocate the identical number of new blocks (if any) to each of the "layers" to keep the + // number of blocks occupied by each "layer" identical at all times. 
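+        // Shape of the returned map, with hypothetical block indices: { 5: [12, 13] } means that the CacheManager
+        // must copy the contents of physical block 5 into the newly allocated physical blocks 12 and 13.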
+ size_t num_logical_blocks = seq_group->get_num_logical_blocks(); + std::vector<Sequence::Ptr> running_sequences = seq_group->get_running_sequences(); + + std::map<size_t, std::list<size_t>> copy_blocks_map; + for (size_t i = 0; i < running_sequences.size(); ++i) { + Sequence::Ptr sequence = running_sequences[i]; + auto seq_id = sequence->get_id(); + size_t num_physical_blocks = 0; + + if (m_block_table.find(seq_id) != m_block_table.end()) + { + num_physical_blocks = m_block_table[seq_id][0].size(); + } + + if (num_logical_blocks > num_physical_blocks) { + OPENVINO_ASSERT(can_allocate_blocks(num_logical_blocks - num_physical_blocks)); + allocate(sequence, num_logical_blocks - num_physical_blocks, seq_group->get_prompt_ids()); + } else { + OPENVINO_ASSERT(num_logical_blocks == num_physical_blocks, "A number of physical and logic blocks must be the same in this code path"); + + size_t effective_num_layers = m_block_table[seq_id].size(); + BlocksPerLayer last_blocks; + last_blocks.reserve(m_block_table[seq_id].size()); + for (size_t i = 0; i < effective_num_layers; i++) { + last_blocks.push_back(m_block_table[seq_id][i].back()); + } + + bool is_copy_on_write = last_blocks[0]->copy_on_write(); + + if (is_copy_on_write) { + BlocksPerLayer new_blocks_for_all_layers; + new_blocks_for_all_layers.reserve(effective_num_layers); + if (m_enable_prefix_caching) { + auto hash = sequence->get_hash(); + new_blocks_for_all_layers = m_allocator.allocate_block(hash, m_prefix_hash_to_occupied_block_map); + } else { + for (size_t i = 0; i < effective_num_layers; i++) { + new_blocks_for_all_layers.push_back(m_allocator.allocate_block(i)); + } + } + + for (size_t i = 0; i < effective_num_layers; i++) { + auto& new_block = new_blocks_for_all_layers[i]; + auto& block_table = m_block_table[seq_id][i]; + block_table[num_physical_blocks - 1] = new_blocks_for_all_layers[i]; + auto& last_block = last_blocks[i]; + copy_blocks_map[last_block->get_index()].push_back(new_block->get_index()); + } + m_allocator.free(last_blocks); + } else { + // we are the only users of this block + if (m_enable_prefix_caching) { + // update hash of block + auto prev_hash = last_blocks[0]->get_hash(); + auto hash = sequence->get_hash(); + for (size_t i = 0; i < effective_num_layers; i++) { + auto& last_block = last_blocks[i]; + last_block->set_hash(hash); + } + m_prefix_hash_to_occupied_block_map.erase(prev_hash); + m_prefix_hash_to_occupied_block_map[hash] = last_blocks; + } + } + } + } + + // it returns information which blocks should be forked by CacheManager + return copy_blocks_map; + } + + void restore_cached_blocks(SequenceGroup::Ptr group) { + // When add_request() is executed in multiple threads accessing to cached_blocks causes segfault. + // The mutex is needed to prevent such segfaults. 
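+        // Restoration strategy: walk the prompt in block_size steps, first looking up a fully filled cached block
+        // for the next chunk and, failing that, a partially filled cached block; the walk stops at the first
+        // position for which no fully filled cached block is found.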
+ const std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex); + auto prompt_ids = group->get_prompt_ids(); + auto sequences = group->get_not_finished_sequences(); + OPENVINO_ASSERT(sequences.size() == 1); + auto sequence = sequences[0]; + auto seq_id = sequence->get_id(); + + if (m_block_table.find(seq_id) == m_block_table.end()) { + m_block_table[seq_id].resize(m_num_layers); + } + auto& block_table = m_block_table[seq_id]; + + size_t content_len = 0; + while (content_len < prompt_ids.size()) { + size_t prev_iteration_content_len = content_len; + content_len += m_block_size; + if (content_len > prompt_ids.size()) { + content_len = prompt_ids.size(); + } + // restore fully filled blocks + auto full_block_hash = sequence->get_hash(content_len); + auto blocks = m_allocator.get_cached_block(full_block_hash, m_prefix_hash_to_occupied_block_map); + auto timestamp = std::chrono::system_clock::now(); + if (!blocks.empty()) { + for (size_t layer_idx = 0; layer_idx < block_table.size(); layer_idx++) { + auto& block = blocks[layer_idx]; + block->set_timestamp(timestamp); + block_table[layer_idx].push_back(block); + } + group->update_processed_tokens_num(content_len == prompt_ids.size() ? content_len - 1 : content_len); + } else { + // restore partially filled block + for (size_t i = 1; i < m_block_size; i++) { + if (prev_iteration_content_len + i > prompt_ids.size()) { + break; + } + auto hash = sequence->get_hash(prev_iteration_content_len + i); + auto blocks = m_allocator.get_cached_block(hash, m_prefix_hash_to_occupied_block_map); + if (!blocks.empty()) { + auto timestamp = std::chrono::system_clock::now(); + + for (size_t layer_idx = 0; layer_idx < block_table.size(); layer_idx++) { + auto& block = blocks[layer_idx]; + block->set_timestamp(timestamp); + block_table[layer_idx].push_back(block); + } + + group->update_processed_tokens_num(prev_iteration_content_len + i == prompt_ids.size() ? 
prev_iteration_content_len + i - 1 : prev_iteration_content_len + i); + + break; + } + } + break; + } + } + } +}; + + +} diff --git a/src/cpp/src/cache_eviction.cpp b/src/cpp/src/cache_eviction.cpp new file mode 100644 index 0000000000..2d126510bd --- /dev/null +++ b/src/cpp/src/cache_eviction.cpp @@ -0,0 +1,270 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "cache_eviction.hpp" + +namespace ov::genai { + CacheEvictionAlgorithm::CacheEvictionAlgorithm(const CacheEvictionConfig &eviction_config, size_t block_size, + size_t num_decoder_layers) : + m_eviction_config(eviction_config), m_block_size(block_size), m_num_decoder_layers(num_decoder_layers), + m_cache_counter(num_decoder_layers), m_scores(num_decoder_layers) { + OPENVINO_ASSERT(!(m_eviction_config.get_start_size() % m_block_size), + "CacheEvictionConfig.start_size in tokens must be a multiple of block size ", m_block_size); + OPENVINO_ASSERT(!(m_eviction_config.get_recent_size() % m_block_size), + "CacheEvictionConfig.recent_size in tokens must be a multiple of block size ", m_block_size); + OPENVINO_ASSERT(!(m_eviction_config.get_max_cache_size() % m_block_size), + "CacheEvictionConfig.max_cache_size in tokens must be a multiple of block size ", m_block_size); + OPENVINO_ASSERT(m_num_decoder_layers, "num_decoder_layers must be non-zero"); + } + + std::size_t CacheEvictionAlgorithm::get_max_cache_size_after_eviction() const { + // The cache layout after eviction should have blocks in all 3 areas (start, evictable and recent) fully filled, + // and since we evict full blocks only from the middle, evictable part of the cache, then at least one block + // past the "recent" area should be completely filled with fresh tokens before we can evict at least 1 block + // from the evictable area + return m_eviction_config.get_max_cache_size() + m_block_size - 1; + } + + std::vector<std::set<std::size_t>> CacheEvictionAlgorithm::evict_logical_blocks() { + // Returns the indices of logical KV cache blocks to evict (the rest is to be discarded) for each decoder layer in order. + // The kept indices are determined using `attention_scores`, which is expected to be the + // attention head scores that are already reduced by the batch and head dimensions, i.e. the shape of + // `attention_scores` must be [num_new_tokens, current_seq_len], where `num_new_tokens` is the dimension + // corresponding to the number of freshly generated tokens since the last cache eviction has taken place, + // and the `current_seq_len` is the dimension corresponding to the current sequence length at this stage + // in the generation process, i.e. the dimension over which the attention scores over individual previous + // tokens was being computed. 
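+        // Worked example under hypothetical settings, assuming (per the class documentation) that
+        // max_cache_size = start_size + evictable_size + recent_size: with block_size = 32, start_size = 32,
+        // recent_size = 32 and max_cache_size = 128, get_max_cache_size_after_eviction() returns 128 + 32 - 1 = 159,
+        // so eviction in a layer is only considered once start_size plus the number of tracked tokens exceeds 159,
+        // i.e. once at least one full block has accumulated past the "recent" area.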
+ + std::vector<std::set<size_t>> retval(m_num_decoder_layers); + + + for (size_t decoder_layer_idx = 0; decoder_layer_idx < m_scores.size(); decoder_layer_idx++) { + const auto &accumulated_scores_for_current_decoder_layer = m_scores[decoder_layer_idx]; + auto scores_length = accumulated_scores_for_current_decoder_layer.size(); + if (scores_length + m_eviction_config.get_start_size() <= get_max_cache_size_after_eviction()) { + // KV cache is not yet filled, keep all currently occupied blocks + continue; + } + + // Only the blocks in the "intermediate" part of the logical KV cache will be considered for eviction + auto scores_for_all_evictable_blocks = get_scores_for_all_evictable_blocks(decoder_layer_idx); + size_t num_blocks_to_evict = get_num_blocks_to_evict(decoder_layer_idx); + auto evicted_block_indices = get_indices_of_blocks_to_evict(scores_for_all_evictable_blocks, num_blocks_to_evict); + + m_num_evicted_tokens += evicted_block_indices.size() * m_block_size; + + // No longer need to track the overall "heavy-hitter" attention scores for freshly evicted blocks + remove_scores_of_evicted_blocks(evicted_block_indices, decoder_layer_idx); + + // Adjust indices to account for start area + for (auto &idx: evicted_block_indices) idx += get_num_blocks(m_eviction_config.get_start_size()); + // auto remaining_block_indices = get_remaining_block_indices(evicted_block_indices); + for (auto &idx: evicted_block_indices) retval[decoder_layer_idx].insert(idx); + } + return retval; + } + + CacheEvictionAlgorithm::CacheEvictionRange CacheEvictionAlgorithm::get_evictable_block_range() const { + return get_evictable_block_range(0); + } + + CacheEvictionAlgorithm::CacheEvictionRange CacheEvictionAlgorithm::get_evictable_block_range(size_t layer_idx) const { + std::size_t current_sequence_length = m_eviction_config.get_start_size() + m_scores[layer_idx].size(); + if (current_sequence_length <= get_max_cache_size_after_eviction()) { + return CacheEvictionRange::invalid(); // purposely invalid range since no eviction can take place yet + } + std::size_t start = m_eviction_config.get_start_size() / m_block_size; + std::size_t end = current_sequence_length / m_block_size - (m_eviction_config.get_recent_size() / m_block_size); + return {start, end}; + } + + void CacheEvictionAlgorithm::register_new_token_scores( + const AttentionScoresForEachDecoderLayer &attention_scores_for_all_decoder_layers) { + for (size_t decoder_layer_idx = 0; decoder_layer_idx < m_cache_counter.size(); decoder_layer_idx++) { + + const auto &attention_scores = attention_scores_for_all_decoder_layers[decoder_layer_idx]; + // "Start" tokens are never evicted, won't track scores for these + // "Recent" tokens are also not evicted just yet, but need to accumulate their scores since they may + // ultimately move into the "intermediate" eviction region of cache + // Taking the [1, start_size:seq_len] span of the attention scores: + auto attn_shape = attention_scores.get_shape(); + size_t kv_cache_size_in_tokens = attn_shape[0]; + if (kv_cache_size_in_tokens <= m_eviction_config.get_start_size() + 1) { + return; + } + + auto hh_score = ov::Tensor( + attention_scores, + ov::Coordinate{m_eviction_config.get_start_size()}, + ov::Coordinate{kv_cache_size_in_tokens} + ); + + auto &accumulated_scores_for_current_decoder_layer = m_scores[decoder_layer_idx]; + + if (accumulated_scores_for_current_decoder_layer.empty()) { + accumulated_scores_for_current_decoder_layer = std::vector<double>(hh_score.get_size()); + for (size_t idx = 0; idx < 
accumulated_scores_for_current_decoder_layer.size(); idx++) {
+                    accumulated_scores_for_current_decoder_layer[idx] = hh_score.data<float>()[idx];
+                }
+                if (m_eviction_config.aggregation_mode == AggregationMode::NORM_SUM) {
+                    // New sequence to track - will simulate that the tokens comprising the sequence were added one-by-one
+                    // from the standpoint of the occurrence tracker
+                    std::size_t new_scores_size = hh_score.get_size();
+                    std::vector<std::size_t> counter(new_scores_size);
+                    std::generate(counter.begin(), counter.begin() + new_scores_size,
+                                  [&new_scores_size] { return new_scores_size--; });
+                    m_cache_counter[decoder_layer_idx] = counter;
+                }
+            } else {
+                size_t old_size_in_tokens = accumulated_scores_for_current_decoder_layer.size();
+                size_t num_new_tokens = hh_score.get_size() - accumulated_scores_for_current_decoder_layer.size();
+                if (m_eviction_config.aggregation_mode == AggregationMode::NORM_SUM) {
+                    // Increment occurrence counts of all currently tracked cache blocks
+                    auto &counter_for_current_decoder_layer = m_cache_counter[decoder_layer_idx];
+                    for (auto it = counter_for_current_decoder_layer.begin();
+                         it != counter_for_current_decoder_layer.end(); it++) {
+                        *it += num_new_tokens;
+                    }
+                    // Add occurrence counts for the new tokens, as above
+                    counter_for_current_decoder_layer.resize(hh_score.get_size());
+                    for (size_t i = 0; i < num_new_tokens; i++) {
+                        auto idx = old_size_in_tokens + i;
+                        counter_for_current_decoder_layer[idx] = num_new_tokens - i;
+                    }
+                }
+                accumulated_scores_for_current_decoder_layer.resize(hh_score.get_size());
+                auto hh_score_data = hh_score.data<float>();
+                for (size_t i = 0; i < hh_score.get_size(); ++i) {
+                    accumulated_scores_for_current_decoder_layer[i] += hh_score_data[i];
+                }
+            }
+        }
+    }
+
+    std::size_t CacheEvictionAlgorithm::get_num_blocks(std::size_t num_tokens) const {
+        return static_cast<std::size_t>(std::ceil(((double) num_tokens) / m_block_size));
+    }
+
+    std::size_t CacheEvictionAlgorithm::get_num_evictable_blocks(size_t layer_idx) const {
+        auto range = get_evictable_block_range(layer_idx);
+        return range.second - range.first;
+    }
+
+    std::size_t CacheEvictionAlgorithm::get_num_blocks_to_evict(size_t layer_idx) const {
+        auto num_evictable_blocks = get_num_evictable_blocks(layer_idx);
+        std::size_t num_evictable_blocks_to_keep_after_eviction = get_num_blocks(m_eviction_config.get_evictable_size());
+        if (num_evictable_blocks < num_evictable_blocks_to_keep_after_eviction) {
+            return 0;
+        }
+        return num_evictable_blocks - num_evictable_blocks_to_keep_after_eviction;
+    }
+
+    std::vector<double> CacheEvictionAlgorithm::get_scores_for_all_evictable_blocks(size_t decoder_layer_idx) const {
+        auto accumulated_scores_for_current_decoder_layer = m_scores[decoder_layer_idx];
+        auto num_tracked_tokens = accumulated_scores_for_current_decoder_layer.size();
+        auto counter_for_current_decoder_layer = m_cache_counter[decoder_layer_idx];
+
+        // Make sure that there is at least one block that can be completely evicted
+        OPENVINO_ASSERT((num_tracked_tokens + m_eviction_config.get_start_size()) > get_max_cache_size_after_eviction(),
+                        "KV cache must be filled before scores for evictable blocks can be computed");
+
+        size_t num_evictable_blocks = get_num_evictable_blocks(decoder_layer_idx);
+
+        std::vector<double> block_scores(num_evictable_blocks);
+        for (size_t i = 0; i < num_evictable_blocks; ++i) {
+            double normalized_accumulated_attn_score_for_block = 0.0;
+            for (size_t j = 0; j < m_block_size; ++j) {
+                size_t token_offset = m_block_size * i + j;
+                if
(m_eviction_config.aggregation_mode == AggregationMode::NORM_SUM) { + normalized_accumulated_attn_score_for_block += + accumulated_scores_for_current_decoder_layer[token_offset] / + counter_for_current_decoder_layer[token_offset]; + } else { + normalized_accumulated_attn_score_for_block += accumulated_scores_for_current_decoder_layer[token_offset]; + } + } + block_scores[i] = normalized_accumulated_attn_score_for_block; + } + return block_scores; + } + + std::vector<std::size_t> + CacheEvictionAlgorithm::get_indices_of_blocks_to_evict( + const std::vector<double> &scores_for_each_evictable_block, size_t num_blocks_to_evict) const { + // Returned indices are offsets of blocks to evict, taken from the beginning of the "intermediate", evictable + // part of the logical KV cache. Indices are sorted in the ascending order. + auto current_num_evictable_blocks = scores_for_each_evictable_block.size(); + OPENVINO_ASSERT(current_num_evictable_blocks >= num_blocks_to_evict); + + std::vector<std::pair<double, std::size_t>> evictable_block_score_and_index_pairs; + evictable_block_score_and_index_pairs.reserve(current_num_evictable_blocks); + for (std::size_t i = 0; i < current_num_evictable_blocks; ++i) { + evictable_block_score_and_index_pairs.emplace_back(scores_for_each_evictable_block[i], i); + } + + std::nth_element(evictable_block_score_and_index_pairs.begin(), + evictable_block_score_and_index_pairs.begin() + num_blocks_to_evict, + evictable_block_score_and_index_pairs.end(), + [](const auto &lhs, const auto &rhs) { + if (lhs.first < rhs.first) return true; + if (lhs.first == rhs.first && lhs.second < rhs.second) return true; + return false; + }); + + evictable_block_score_and_index_pairs.resize(num_blocks_to_evict); + + std::vector<std::size_t> evicted_block_indices; + evicted_block_indices.reserve(num_blocks_to_evict); + for (const auto &pair: evictable_block_score_and_index_pairs) { + evicted_block_indices.push_back(pair.second); + } + + std::sort(evicted_block_indices.begin(), evicted_block_indices.end()); + return evicted_block_indices; + } + + void CacheEvictionAlgorithm::remove_scores_of_evicted_blocks(const std::vector<std::size_t> &evicted_block_indices, + size_t decoder_layer_idx) { + if (evicted_block_indices.empty()) { + return; + } + + const auto &accumulated_scores_for_current_decoder_layer = m_scores[decoder_layer_idx]; + const auto &counter_for_current_decoder_layer = m_cache_counter[decoder_layer_idx]; + + if (m_eviction_config.aggregation_mode == AggregationMode::NORM_SUM) { + OPENVINO_ASSERT( + accumulated_scores_for_current_decoder_layer.size() == counter_for_current_decoder_layer.size()); + } + + auto old_size = accumulated_scores_for_current_decoder_layer.size(); + auto new_size = + accumulated_scores_for_current_decoder_layer.size() - evicted_block_indices.size() * m_block_size; + + std::vector<double> new_scores; + new_scores.reserve(new_size); + + std::vector<size_t> new_counter; + + if (m_eviction_config.aggregation_mode == AggregationMode::NORM_SUM) { + new_counter.reserve(new_size); + } + + for (size_t token_idx = 0, evicted_block_idx = 0; token_idx < old_size;) { + if (evicted_block_idx < evicted_block_indices.size() && + token_idx == evicted_block_indices[evicted_block_idx] * m_block_size) { + ++evicted_block_idx; + token_idx += m_block_size; + continue; + } + new_scores.push_back(accumulated_scores_for_current_decoder_layer[token_idx]); + if (m_eviction_config.aggregation_mode == AggregationMode::NORM_SUM) { + 
new_counter.push_back(counter_for_current_decoder_layer[token_idx]); + } + ++token_idx; + } + + m_scores[decoder_layer_idx] = new_scores; + m_cache_counter[decoder_layer_idx] = new_counter; + } +} diff --git a/src/cpp/src/cache_eviction.hpp b/src/cpp/src/cache_eviction.hpp new file mode 100644 index 0000000000..9e41e7951f --- /dev/null +++ b/src/cpp/src/cache_eviction.hpp @@ -0,0 +1,120 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + + +#include <vector> +#include <cstdlib> +#include <cmath> + +#include "openvino/openvino.hpp" +#include "attention_output.hpp" +#include "openvino/genai/cache_eviction.hpp" + +namespace ov::genai { + +/** + * @brief Determines blocks to be evicted from the KV cache of a sequence based on the importance score calculated from the + * attention scores of each token at each attention layer in the LLM. + * + * The KV cache is conceptually divided into three areas as shown below: + * + * ``` + * --> *logical KV cache space in blocks* + * | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + * |<- start area->|<- evictable area ->|<- recent area ->| + * ``` + * + * The sizes of each areas are configurable. Once the sequence KV cache utilization is such that all three areas + * are filled, the algorithm determines the blocks from the *evictable area* that should be freed from this sequence + * based on the importance scores accumulated after each previous generation step in the pipeline. The least important + * tokens according to this score are to be evicted. Only the tokens from the *evictable area* are evicted - the tokens + * in the *start* and *recent* areas are never evicted, but throughout the eviction process the *recent* blocks naturally + * move into the *evictable* area. + * + * Eviction only starts when at least one block *past* the *recent area* is completely filled, and the corresponding number + * of blocks is selected to be evicted, so that the remaining blocks completely fit into the arena defined by the *start*, + * *evictable* and *recent* areas. This effectively caps the cache usage for the sequence by the size of the arena (plus, + * in general, one partially filled block past the recent area). + * + * Sizes of *start*, *evictable* and *recent* areas are configurable, but the *evictable* area size specifies the + * _minimal_ size of the evictable area. When tokens overflow the eviction arena, the acutal evictable area is + * determined as the tokens between the fixed-size *start area* and the fixed-size *end area*, so at a given eviction step + * there are in general more tokens considered for eviction than the specified *evictable* size. + * + */ +class CacheEvictionAlgorithm { +public: + /** + * @brief A pair of indices specifying the logical block interval where the blocks may be evicted at this point in time. + */ + class CacheEvictionRange : public std::pair<std::size_t, std::size_t> { + public: + CacheEvictionRange(std::size_t begin, std::size_t end) : std::pair<std::size_t, std::size_t>(begin, end) {} + static const CacheEvictionRange& invalid() { + static CacheEvictionRange inv(0, 0); + return inv; + } + }; + CacheEvictionAlgorithm() = default; // needed only to satisfy DefaultConstructible so that algo objects may be used as values in std::map + + /** + * Constructs a CacheEvictionAlgorithm. + * @param eviction_config The configuration struct for this algorithm. + * @param block_size Block size of the KV cache to evict from. 
+ * @param num_decoder_layers Number of independent KV caches (each corresponding to a single attention layer) in the underlying LLM. + */ + explicit CacheEvictionAlgorithm(const CacheEvictionConfig& eviction_config, size_t block_size, size_t num_decoder_layers); + + /** + * @return Maximum cache size (in tokens) after each eviction step. Could be used as an estimate of the maximum per-sequence cache usage. + */ + std::size_t get_max_cache_size_after_eviction() const; + + /** + * @return Current logical range of evictable block indices. + */ + CacheEvictionRange get_evictable_block_range() const; + + /** + * Registers attention scores (for each layer) of each token in this sequence that is currently still represented + * (i.e. not evicted) in the corresponding KV cache. Must be called after each generation step to properly keep track of + * the tokens' lifetime in the KV cache and of the accumulated importance score of each token. + * @param attention_scores_for_all_decoder_layers A vector with a size equal to the configured num_decoder_layers, where each entry is a + * vector of per-token attention scores calculated within this layer. + */ + void register_new_token_scores(const AttentionScoresForEachDecoderLayer& attention_scores_for_all_decoder_layers); + + /** + * Returns the per-layer sets of logical block indices that should be evicted according to the internally computed importance scores + * and removes the corresponding blocks from the internal algorithm tracking. + * + * @return A vector with size equal to the configured num_decoder_layers, where each entry is a set of logical indices that are to be + * evicted by the external cache-controlling mechanism. + */ + std::vector<std::set<std::size_t>> evict_logical_blocks(); + + +private: + std::size_t get_num_blocks(std::size_t num_tokens) const; + std::size_t get_num_blocks_to_evict(size_t decoder_layer_idx) const; + std::size_t get_num_evictable_blocks(size_t decoder_layer_idx) const; + + CacheEvictionRange get_evictable_block_range(size_t layer_idx) const; + + std::vector<double> get_scores_for_all_evictable_blocks(size_t decoder_layer_idx) const; + + std::vector<std::size_t> get_indices_of_blocks_to_evict(const std::vector<double>& scores_for_each_evictable_block, size_t num_blocks_to_evict) const; + + void remove_scores_of_evicted_blocks(const std::vector<std::size_t>& evicted_block_indices, size_t decoder_layer_idx); + + CacheEvictionConfig m_eviction_config; + std::size_t m_block_size; + std::size_t m_num_evicted_tokens = 0; + std::size_t m_num_decoder_layers; + std::vector<std::vector<double>> m_scores; + std::vector<std::vector<size_t>> m_cache_counter; +}; + +} diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp new file mode 100644 index 0000000000..a7444555ab --- /dev/null +++ b/src/cpp/src/cache_manager.hpp @@ -0,0 +1,102 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <vector> +#include <list> + +#include "openvino/runtime/tensor.hpp" + +#include "device_config.hpp" + +namespace ov::genai { +class CacheManager { + DeviceConfig m_device_config; + std::vector<ov::Tensor> m_key_cache; + std::vector<ov::Tensor> m_value_cache; + ov::Core m_core; + +public: + explicit CacheManager(const DeviceConfig &device_config, ov::Core core) : + m_device_config(device_config), + m_core(core) { + m_key_cache.reserve(m_device_config.get_num_layers()); + m_value_cache.reserve(m_device_config.get_num_layers()); + + const std::string device_name = 
device_config.get_device(); + if (device_name.find("GPU") == std::string::npos) {// Allocate KV caches + for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { + ov::Tensor key_cache(device_config.get_cache_precision(), device_config.get_key_cache_shape()); + ov::Tensor value_cache(device_config.get_cache_precision(), device_config.get_value_cache_shape()); + + // force allocation + std::memset(key_cache.data(), 0, key_cache.get_byte_size()); + std::memset(value_cache.data(), 0, value_cache.get_byte_size()); + + m_key_cache.emplace_back(key_cache); + m_value_cache.emplace_back(value_cache); + } + } else { + auto remote_context = m_core.get_default_context(device_name); + for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { + ov::Tensor key_cache = remote_context.create_tensor(device_config.get_cache_precision(), + device_config.get_key_cache_shape()); + ov::Tensor value_cache = remote_context.create_tensor(device_config.get_cache_precision(), + device_config.get_value_cache_shape()); + + m_key_cache.emplace_back(key_cache); + m_value_cache.emplace_back(value_cache); + } + } + } + + ov::Tensor get_key_cache(size_t decoder_layer_id) const { + OPENVINO_ASSERT(decoder_layer_id < m_key_cache.size()); + return m_key_cache[decoder_layer_id]; + } + + ov::Tensor get_value_cache(size_t decoder_layer_id) const { + OPENVINO_ASSERT(decoder_layer_id < m_value_cache.size()); + return m_value_cache[decoder_layer_id]; + } + + void copy_blocks(const std::map<size_t, std::list<size_t>>& block_copy_map) { + ov::Shape key_shape = m_device_config.get_key_cache_shape(); + ov::Shape value_shape = m_device_config.get_value_cache_shape(); + + ov::Coordinate key_src_start_roi(key_shape.size(), 0); + ov::Coordinate key_src_end_roi = key_shape; + ov::Coordinate key_dst_start_roi(key_shape.size(), 0); + ov::Coordinate key_dst_end_roi = key_shape; + + ov::Coordinate value_src_start_roi(value_shape.size(), 0); + ov::Coordinate value_src_end_roi = value_shape; + ov::Coordinate value_dst_start_roi(value_shape.size(), 0); + ov::Coordinate value_dst_end_roi = value_shape; + + for (const auto & blocks_pair : block_copy_map) { + size_t src_block_id = blocks_pair.first; + key_src_end_roi[0] = (key_src_start_roi[0] = src_block_id) + 1; + value_src_end_roi[0] = (value_src_start_roi[0] = src_block_id) + 1; + + const std::list<size_t>& dst_block_ids = blocks_pair.second; + for (size_t dst_block_id : dst_block_ids) { + key_dst_end_roi[0] = (key_dst_start_roi[0] = dst_block_id) + 1; + value_dst_end_roi[0] = (value_dst_start_roi[0] = dst_block_id) + 1; + + for (size_t decoder_layer_id = 0; decoder_layer_id < m_device_config.get_num_layers(); ++decoder_layer_id) { + ov::Tensor key_src_cache_roi(m_key_cache[decoder_layer_id], key_src_start_roi, key_src_end_roi); + ov::Tensor key_dst_cache_roi(m_key_cache[decoder_layer_id], key_dst_start_roi, key_dst_end_roi); + + ov::Tensor value_src_cache_roi(m_value_cache[decoder_layer_id], value_src_start_roi, value_src_end_roi); + ov::Tensor value_dst_cache_roi(m_value_cache[decoder_layer_id], value_dst_start_roi, value_dst_end_roi); + + key_src_cache_roi.copy_to(key_dst_cache_roi); + value_src_cache_roi.copy_to(value_dst_cache_roi); + } + } + } + } +}; +} diff --git a/src/cpp/src/cache_state_dumper.hpp b/src/cpp/src/cache_state_dumper.hpp new file mode 100644 index 0000000000..ec6b63fdfb --- /dev/null +++ b/src/cpp/src/cache_state_dumper.hpp @@ -0,0 +1,108 @@ +// Copyright (C) 2023-2024 
Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <string> +#include <filesystem> +#include <vector> + +#include "block_manager.hpp" +#include "sequence_group.hpp" +#include "scheduler.hpp" + +namespace ov::genai { +const std::string DEFAULT_POSTFIX = std::string(); + + +/** Class to dump the current state of the KV block cache to disk as a number of text files, to be further parsed +* and visualized by the `cacheviz` tool. +*/ +class CacheStateDumper { +public: + /** + * Constructs the CacheStateDumper + * @param run_id Identifier of the cache dumping session. The output .txt files will have this run_id as a + * postfix in the name. + */ + CacheStateDumper(const std::string &run_id) : m_run_id(run_id) {} + + std::filesystem::path get_per_layer_folder(size_t layer_idx) { + auto per_layer_folder = std::filesystem::path("debug") / "cache_dump"; + per_layer_folder /= std::to_string(layer_idx); + std::filesystem::create_directories(per_layer_folder); + auto file_path = (per_layer_folder / (m_run_id + ".txt")).string(); + return per_layer_folder; + } + + /** + * Dumps the state of the cache described by a given block manager + * @param block_mgr A block manager owning the caches. + * @param sequence_groups Sequence groups currently utilizing the cache. + */ + void dump_cache_state(const BlockManager &block_mgr, const std::vector <SequenceGroup::Ptr> &sequence_groups, + size_t dump_count) { + for (size_t layer_idx = 0; layer_idx < block_mgr.m_num_layers; layer_idx++) { + auto per_layer_folder = get_per_layer_folder(layer_idx); + auto file_path = (per_layer_folder / (m_run_id + ".txt")).string(); + std::ofstream out_stream(file_path, std::ios::out); + OPENVINO_ASSERT(out_stream.is_open()); + + out_stream << block_mgr.m_allocator.m_total_num_blocks << std::endl; + out_stream << sequence_groups.size() << std::endl; + for (const auto &seq_group_ptr: sequence_groups) { + out_stream << seq_group_ptr->get_request_id() << ' '; + for (const auto &seq_ptr: seq_group_ptr->get_sequences()) { + out_stream << seq_ptr->get_id() << ' '; + } + out_stream << std::endl; + } + for (const auto &seq_id_and_blocks: block_mgr.m_block_table) { + for (const auto &block: seq_id_and_blocks.second[layer_idx]) { + const size_t seq_id = seq_id_and_blocks.first; + out_stream << seq_id << " " << block->get_index() << " " << block->get_references_count() + << std::endl; + } + } + out_stream.flush(); + + auto cache_usage_file_path = (per_layer_folder / ("cache_usage.txt")).string(); + std::ofstream out_stream_cache_usage; + + out_stream_cache_usage.open(cache_usage_file_path, std::ios::app); + out_stream_cache_usage << dump_count << ' ' << block_mgr.get_used_percentage() << std::endl; + out_stream_cache_usage.flush(); + dump_count++; + } + } + + /** + * Dumps the state of the cache described by a given scheduler. + * @param schdl A scheduler managing certain sequence groups. + * @param sequence_groups Sequence groups currently utilizing the cache (managed by the scheduler). + */ + void dump_cache_state(const Scheduler &schdl, const std::vector <SequenceGroup::Ptr> &sequence_groups, + size_t dump_count) { + dump_cache_state(schdl.m_block_manager, sequence_groups, dump_count); + + } + + /** + * @param step Current step number during the generation. + * @param postfix Postfix to the returned string ID. + * @return A string identifier for the current generation step. 
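+     *
+     * For example, a call such as get_run_id_for_generation_step(3, "eviction") yields
+     * "cache_dump_eviction_step_3", while the default empty postfix yields "cache_dump_step_3".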
+ */ + static std::string get_run_id_for_generation_step(size_t step, const std::string &postfix = DEFAULT_POSTFIX) { + std::stringstream ss; + ss << "cache_dump"; + if (!postfix.empty()) { + ss << "_" << postfix; + } + ss << "_step_" << step; + return ss.str(); + } + +private: + std::string m_run_id; +}; +} diff --git a/src/cpp/src/circular_buffer_queue.hpp b/src/cpp/src/circular_buffer_queue.hpp new file mode 100644 index 0000000000..859e4ec670 --- /dev/null +++ b/src/cpp/src/circular_buffer_queue.hpp @@ -0,0 +1,103 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <queue> +#include <mutex> +#include <future> +#include <algorithm> +#include <atomic> + +namespace ov::genai { + +// From OVMS: +// https://github.com/openvinotoolkit/model_server/blob/d73e85cbb8ac1d761754cb2064a00551a9ffc655/src/queue.hpp#L34 +template <typename T> +class CircularBufferQueue +{ + int m_front_idx; + std::atomic<int> m_back_idx; + std::vector<int> m_values; + std::queue<std::promise<int>> m_promises; + std::vector<T> m_data; + std::mutex m_front_mut; + std::mutex m_queue_mutex; + +public: + + CircularBufferQueue(size_t length, const std::function<T()>& create_fn) : + m_values(length), + m_front_idx{0}, + m_back_idx{0} { + std::iota(m_values.begin(), m_values.end(), 0); + m_data.reserve(length); + for (size_t i = 0; i < length; i++) { + m_data.emplace_back(std::move(create_fn())); + } + } + + CircularBufferQueue(const CircularBufferQueue&) = delete; + CircularBufferQueue(const CircularBufferQueue&&) = delete; + CircularBufferQueue& operator=(const CircularBufferQueue&) = delete; + + T& get(int value) { + return m_data[value]; + } + + std::future<int> get_idle() { + int value; + std::promise<int> idle_promise; + std::future<int> idle_future = idle_promise.get_future(); + std::unique_lock<std::mutex> lk(m_front_mut); + if (m_values[m_front_idx] < 0) { + std::unique_lock<std::mutex> queueLock(m_queue_mutex); + m_promises.push(std::move(idle_promise)); + } else { + value = m_values[m_front_idx]; + m_values[m_front_idx] = -1; + m_front_idx = (m_front_idx + 1) % m_values.size(); + lk.unlock(); + idle_promise.set_value(value); + } + return idle_future; + } + + void return_to(int value) { + std::unique_lock<std::mutex> lk(m_queue_mutex); + if (m_promises.size()) { + std::promise<int> promise = std::move(m_promises.front()); + m_promises.pop(); + lk.unlock(); + promise.set_value(value); + return; + } + int old_back = m_back_idx.load(); + while (!m_back_idx.compare_exchange_weak( + old_back, + (old_back + 1) % m_values.size(), + std::memory_order_relaxed)) { + } + m_values[old_back] = value; + } +}; + +template <typename T> +class CircularBufferQueueElementGuard { + CircularBufferQueue<T>* m_queue; + int m_value; +public: + CircularBufferQueueElementGuard(CircularBufferQueue<T>* queue) : m_queue(queue) { + m_value = m_queue->get_idle().get(); // blocking until we get the element + } + + T& get() { + return m_queue->get(m_value); + } + + ~CircularBufferQueueElementGuard() { + m_queue->return_to(m_value); + } +}; + +} diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp new file mode 100644 index 0000000000..dc5a74a475 --- /dev/null +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -0,0 +1,443 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "text_callback_streamer.hpp" +#include "continuous_batching_impl.hpp" +#include "paged_attention_transformations.hpp" 
+#include "utils.hpp" + +namespace ov::genai { +template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;}; +template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>; + +ContinuousBatchingPipeline::ContinuousBatchingImpl::ContinuousBatchingImpl( + const std::string& models_path, + const Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& plugin_config) { + m_tokenizer = tokenizer; + ov::Core core; + + auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config); + core.set_property(core_plugin_config); + + // The model can be compiled for GPU as well + std::shared_ptr<ov::Model> model = core.read_model(models_path + "/openvino_model.xml"); + + DeviceConfig device_config(core, scheduler_config, device, compile_plugin_config); + + bool is_need_per_layer_cache_control = scheduler_config.use_cache_eviction; + apply_paged_attention_transformations(model, device_config, is_need_per_layer_cache_control); + + ov::InferRequest infer_request = core.compile_model(model, device_config.get_device(), compile_plugin_config).create_infer_request(); + + // setup KV caches + m_cache_manager = std::make_shared<CacheManager>(device_config, core); + for (size_t decoder_layer_id = 0; decoder_layer_id < device_config.get_num_layers(); ++decoder_layer_id) { + infer_request.set_tensor(std::string("key_cache.") + std::to_string(decoder_layer_id), m_cache_manager->get_key_cache(decoder_layer_id)); + infer_request.set_tensor(std::string("value_cache.") + std::to_string(decoder_layer_id), m_cache_manager->get_value_cache(decoder_layer_id)); + } + + SchedulerConfig updated_config = scheduler_config; + // update KV number in scheduler config + if (scheduler_config.num_kv_blocks != device_config.get_num_kv_blocks()) { + updated_config.num_kv_blocks = device_config.get_num_kv_blocks(); + } + + bool can_use_partial_preemption = true; + if (device_config.get_device().find("GPU") != std::string::npos && !updated_config.dynamic_split_fuse) { + // in case of executing a `vLLM-like` pipeline, it's better not to use partial eviction on the GPU, + // as it may lead to performance slowdown + can_use_partial_preemption = false; + } + + m_scheduler = std::make_shared<Scheduler>(updated_config, device_config.get_num_layers(), can_use_partial_preemption); + // and finally create model runner + bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction; + if (is_use_cache_eviction) { + m_model_runner = std::make_shared<ModelRunner>(infer_request, updated_config, device_config.get_num_layers(), true); + } else { + m_model_runner = std::make_shared<ModelRunner>(infer_request, updated_config, device_config.get_num_layers()); + } + m_sampler = std::make_shared<Sampler>(m_tokenizer); + m_sampler->set_seed(m_generation_config.rng_seed); + + // read default generation config +} + +GenerationHandle +ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request_id, + const ov::Tensor& input_ids, + ov::genai::GenerationConfig sampling_params) { + sampling_params.set_eos_token_id(m_tokenizer.get_eos_token_id()); + sampling_params.validate(); + SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, input_ids, + sampling_params, + m_scheduler->get_config().block_size, + m_scheduler->get_config().enable_prefix_caching); + sequence_group->set_sequence_group_ptr(sequence_group); + if (m_scheduler->get_config().enable_prefix_caching) { + 
m_scheduler->restore_cached_blocks(sequence_group); + } + + { + std::lock_guard<std::mutex> lock{m_awaiting_requests_mutex}; + m_awaiting_requests.push_back(sequence_group); + } + return std::make_shared<GenerationHandleImpl>(sequence_group->get_generation_stream(), sampling_params); +}; + +GenerationHandle +ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request_id, + const std::string& prompt, + ov::genai::GenerationConfig sampling_params) { + static ManualTimer timer("tokenize"); + timer.start(); + ov::Tensor input_ids = m_tokenizer.encode(prompt).input_ids; + timer.end(); + return add_request(request_id, input_ids, sampling_params); +} + +bool ContinuousBatchingPipeline::ContinuousBatchingImpl::has_non_finished_requests() { + std::lock_guard<std::mutex> lock{m_awaiting_requests_mutex}; + return !m_awaiting_requests.empty() || !m_requests.empty(); +} + +void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() { + static ManualTimer step_timer("step()"); + step_timer.start(); + + // Pull awaiting requests + { + std::lock_guard<std::mutex> lock{m_awaiting_requests_mutex}; + m_requests.insert(m_requests.end(), m_awaiting_requests.begin(), m_awaiting_requests.end()); + m_awaiting_requests.clear(); + } + + m_pipeline_metrics.requests = m_requests.size(); + Scheduler::Output scheduler_output; + { + static ManualTimer timer("scheduling"); + timer.start(); + scheduler_output = m_scheduler->schedule(m_requests); + m_pipeline_metrics.scheduled_requests = scheduler_output.m_scheduled_sequence_groups_ids.size(); + m_pipeline_metrics.cache_usage = scheduler_output.m_cache_usage; + m_pipeline_metrics.max_cache_usage = + std::max(m_pipeline_metrics.max_cache_usage, scheduler_output.m_cache_usage); + _register_step_cache_usage(scheduler_output.m_cache_usage); + m_pipeline_metrics.avg_cache_usage = _get_current_running_average_cache_usage(); + m_cache_manager->copy_blocks(scheduler_output.m_block_copy_map); + timer.end(); + } + + // if no tokens were scheduled, we are out of memory + if (scheduler_output.m_total_num_scheduled_tokens == 0) { + for (size_t i = 0; i < m_requests.size(); ++i) { + SequenceGroup::Ptr sequence_group = m_requests[i]; + sequence_group->set_out_of_memory(); + sequence_group->notify_handle(); + } + _free_non_running_requests(); + return; + } + + ov::Tensor logits; + { + static ManualTimer timer("forward"); + timer.start(); + logits = m_model_runner->forward(m_requests, scheduler_output); + timer.end(); + + ov::InferRequest infer_request = m_model_runner->get_infer_request(); + ov::CompiledModel compiled_model = infer_request.get_compiled_model(); + const bool is_profiling_enabled = compiled_model.get_property(ov::enable_profiling); + + // collect detailed statistic + if (is_profiling_enabled) { + std::vector<ov::ProfilingInfo> profiling_info = m_model_runner->get_infer_request().get_profiling_info(); + for (const ov::ProfilingInfo& info : profiling_info) { + double current_time = info.real_time.count(); + if (info.node_type == "PagedAttentionExtension") { + m_perf.m_paged_attention_time_ms += current_time; + } else if (info.node_type == "FullyConnected") { + m_perf.m_matmul_time_ms += current_time; + } + m_perf.m_infer_total_ms += current_time; + } + } + } + +#ifdef DEBUG_CACHE_STATE_DUMP + + CacheStateDumper dumper(CacheStateDumper::get_run_id_for_generation_step(step_count, "before_eviction")); + dumper.dump_cache_state(*m_scheduler, m_requests, step_count); +#endif + const auto& sched_config = m_scheduler->get_config(); + + // evict unimportant 
blocks from KV cache, if requested + if (sched_config.use_cache_eviction) { + maybe_evict_cache_blocks(sched_config); + } + +#ifdef DEBUG_CACHE_STATE_DUMP + CacheStateDumper dumper_after(CacheStateDumper::get_run_id_for_generation_step(step_count, "eviction")); + dumper_after.dump_cache_state(*m_scheduler, m_requests, step_count); + step_count++; +#endif + + SamplerOutput sampler_output; + { + static ManualTimer timer("sample"); + timer.start(); + sampler_output = m_sampler->sample(m_requests, logits); + timer.end(); + } + + // process sampler_output (e.g. fork or drop sequences from BlockScheduler) + { + static ManualTimer timer("fork / free sequence"); + timer.start(); + + for (const auto& pair : sampler_output.m_forked_sequences) { + uint64_t parent_id = pair.first; + const std::list<uint64_t>& child_ids = pair.second; + for (auto& child_id : child_ids) + m_scheduler->fork_sequence(parent_id, child_id); + } + + for (auto seq_id : sampler_output.m_dropped_sequences) + m_scheduler->free_sequence(seq_id); + + timer.end(); + } + + // notify requests dropped by handle + { + static ManualTimer timer("notify requests dropped by handle"); + timer.start(); + _notify_requests_dropped_by_handle(); + timer.end(); + } + + // free non running requests for current step + + { + static ManualTimer timer("free non running requests"); + timer.start(); + _free_non_running_requests(); + timer.end(); + } + + step_timer.end(); +} + +std::vector<EncodedGenerationResult> +ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<ov::Tensor>& input_ids, + const std::vector<GenerationConfig>& sampling_params, + const StreamerVariant& streamer) { + OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request"); + OPENVINO_ASSERT(input_ids.size() == sampling_params.size()); + const std::shared_ptr<StreamerBase>& streamer_ptr = std::visit(overloaded{ + [](std::monostate) -> std::shared_ptr<StreamerBase> { + return nullptr; + }, + [](const std::shared_ptr<StreamerBase>& streamer) { + return streamer; + }, + [this](const std::function<bool(std::string)>& streamer) -> std::shared_ptr<StreamerBase> { + return std::make_unique<TextCallbackStreamer>(m_tokenizer, streamer); + } + }, streamer); + + std::vector<GenerationHandle> generations; + for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) { + OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch."); + generations.push_back(add_request(request_id, input_ids[request_id], sampling_params[request_id])); + } + + std::vector<EncodedGenerationResult> results; + results.reserve(m_awaiting_requests.size()); + + auto drop_requests = [&] () { + for (const std::shared_ptr<ov::genai::SequenceGroup> request : m_requests) { + for (const auto& sequence: request->get_sequences()) { + if (m_scheduler->has_block_table(sequence->get_id())) { + m_scheduler->free_sequence(sequence->get_id()); + } + } + m_sampler->clear_beam_search_info(request->get_request_id()); + } + m_requests.clear(); + }; + + bool continue_generation = true, step_throws_exception = false; + while (has_non_finished_requests() && continue_generation) { + try { + step(); + } catch (...) 
{ + drop_requests(); + throw; + } + if (streamer_ptr && generations.at(0)->can_read()) { + std::unordered_map<uint64_t, GenerationOutput> token = generations.at(0).get()->back(); + OPENVINO_ASSERT(1 == token.size()); + OPENVINO_ASSERT(1 == token.begin()->second.generated_ids.size()); + continue_generation = !streamer_ptr->put(token.begin()->second.generated_ids.at(0)); + } + } + + if (streamer_ptr) { + streamer_ptr->end(); + } + + if (!continue_generation) { + drop_requests(); + } else { + OPENVINO_ASSERT(m_requests.empty(), "Internal error: current request is supposed to be dropped within step() function as completed"); + } + + for (size_t generation_idx = 0; generation_idx < generations.size(); ++generation_idx) { + const auto& generation = generations[generation_idx]; + EncodedGenerationResult result; + result.m_request_id = 1; + std::vector<GenerationOutput> generation_outputs = generation->read_all(); + std::sort(generation_outputs.begin(), generation_outputs.end(), [=] (GenerationOutput& r1, GenerationOutput& r2) { + return r1.score > r2.score; + }); + + auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size()); + for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) { + const auto& generation_output = generation_outputs[generation_output_idx]; + result.m_generation_ids.push_back(std::move(generation_output.generated_ids)); + result.m_scores.push_back(generation_output.score); + } + result.m_status = generation->get_status(); + results.push_back(std::move(result)); + } + + OPENVINO_ASSERT(results.size() == input_ids.size()); + return results; +} + +std::vector<GenerationResult> +ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector<std::string>& prompts, + std::vector<ov::genai::GenerationConfig> sampling_params, + const StreamerVariant& streamer) { + std::vector<ov::Tensor> input_ids; + static ManualTimer timer("tokenize"); + if (m_is_chat_conversation) { + OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts"); + m_history.push_back({{"role", "user"}, {"content", prompts.at(0)}}); + constexpr bool add_generation_prompt = true; + std::string history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + timer.start(); + // ov::genai::add_special_tokens(false) is aligned with stateful pipeline + input_ids.push_back(m_tokenizer.encode(history, ov::genai::add_special_tokens(false)).input_ids); + timer.end(); + } else { + input_ids.reserve(prompts.size()); + for (const std::string& prompt : prompts) { + timer.start(); + input_ids.push_back(m_tokenizer.encode(prompt).input_ids); + timer.end(); + } + } + std::vector<EncodedGenerationResult> encoded = generate(input_ids, sampling_params, streamer); + std::vector<GenerationResult> decoded; + decoded.reserve(encoded.size()); + for (EncodedGenerationResult& res : encoded) { + std::vector<std::string> generated; + generated.reserve(res.m_generation_ids.size()); + for (size_t idx = 0; idx < res.m_generation_ids.size(); ++idx) { + generated.push_back(m_tokenizer.decode(res.m_generation_ids.at(idx))); + if (m_is_chat_conversation && 0 == idx) { + m_history.push_back({{"role", "assistant"}, {"content", generated.back()}}); + } + } + decoded.push_back(GenerationResult{ + res.m_request_id, + std::move(generated), + std::move(res.m_scores), + res.m_status + }); + } + return decoded; +} + +void ContinuousBatchingPipeline::ContinuousBatchingImpl::_free_non_running_requests() { + 
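+    // Erase every request that has finished, ran out of memory, or whose handle was dropped
+    // by the client: release its per-sequence block tables and beam search state first,
+    // then remove it from m_requests.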
std::vector<SequenceGroup::Ptr>::iterator requests_iterator = m_requests.begin(); + while (requests_iterator != m_requests.end()) { + const auto& request = *requests_iterator; + if(request->has_finished() || request->out_of_memory() || request->handle_dropped()) { + for (const auto& sequence: request->get_sequences()) { + if (m_scheduler->has_block_table(sequence->get_id())) { + m_scheduler->free_sequence(sequence->get_id()); + } + } + m_sampler->clear_beam_search_info(request->get_request_id()); + requests_iterator = m_requests.erase(requests_iterator); + } else { + requests_iterator++; + } + } +} + +void ContinuousBatchingPipeline::ContinuousBatchingImpl::_notify_requests_dropped_by_handle() { + // Notify the last time by pushing empty output + // This causes read() to unblock by adding anything to the queue + for (SequenceGroup::Ptr& request : m_requests) { + if (request->handle_dropped()) + request->push_empty_outputs(); + } +} + +void ContinuousBatchingPipeline::ContinuousBatchingImpl::_register_step_cache_usage(float step_cache_usage) { + if (m_previous_step_cache_usages.size() >= AVG_CACHE_USAGE_WINDOW_SIZE_IN_STEPS) { + m_previous_step_cache_usages.pop_front(); + } + m_previous_step_cache_usages.push_back(step_cache_usage); +} + +float ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_current_running_average_cache_usage() const { + return std::accumulate(m_previous_step_cache_usages.begin(), m_previous_step_cache_usages.end(), 0.0) / m_previous_step_cache_usages.size(); +} + +void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_blocks(const SchedulerConfig& sched_config) { + std::unordered_map<SequenceGroup::Ptr, size_t> seq_group_to_num_blocks_evicted_map; + auto sequence_attention_scores = m_model_runner->get_last_attention_scores(); + for (auto& seq_id_and_attention_scores : sequence_attention_scores) { + auto seq_id = seq_id_and_attention_scores.first; + const auto& attention_scores_for_all_decoder_layers = seq_id_and_attention_scores.second; + if (m_seq_group_id_to_cache_eviction_algo_map.find(seq_id) == m_seq_group_id_to_cache_eviction_algo_map.end()) { + auto num_decoder_layers = attention_scores_for_all_decoder_layers.size(); + + m_seq_group_id_to_cache_eviction_algo_map[seq_id] = CacheEvictionAlgorithm(sched_config.cache_eviction_config, sched_config.block_size, num_decoder_layers); + } + auto& cache_eviction_algo = m_seq_group_id_to_cache_eviction_algo_map[seq_id]; + + cache_eviction_algo.register_new_token_scores(attention_scores_for_all_decoder_layers); + auto logical_blocks_to_evict = cache_eviction_algo.evict_logical_blocks(); + + m_scheduler->free_blocks_from_sequence(seq_id, logical_blocks_to_evict); + + auto seq_group_ptr_it = std::find_if(m_requests.begin(), m_requests.end(), [seq_id](const SequenceGroup::Ptr& val) { return val->has_sequence_with_id(seq_id); }); + OPENVINO_ASSERT(seq_group_ptr_it != m_requests.end(), "could not find sequence group with sequence ", seq_id); + auto seq_group_ptr = *seq_group_ptr_it; + size_t num_blocks_evicted = logical_blocks_to_evict[0].size(); + + if (seq_group_to_num_blocks_evicted_map.find(seq_group_ptr) != seq_group_to_num_blocks_evicted_map.end()) { + OPENVINO_ASSERT(seq_group_to_num_blocks_evicted_map[seq_group_ptr] == num_blocks_evicted, "internal error - each sequence in the same group must have the same number of blocks evicted"); + } else { + seq_group_to_num_blocks_evicted_map[seq_group_ptr] = num_blocks_evicted; + } + + } + for (const auto& seq_group_ptr_and_num_blocks_evicted : 
seq_group_to_num_blocks_evicted_map) { + // Assuming that the evicted blocks are always full (since they by design are only selected from intermediate-age blocks) + auto seq_group_ptr = seq_group_ptr_and_num_blocks_evicted.first; + auto num_blocks_evicted = seq_group_ptr_and_num_blocks_evicted.second; + seq_group_ptr->register_token_eviction(num_blocks_evicted * sched_config.block_size); + } +} +} diff --git a/src/cpp/src/continuous_batching_impl.hpp b/src/cpp/src/continuous_batching_impl.hpp new file mode 100644 index 0000000000..0d170e07ed --- /dev/null +++ b/src/cpp/src/continuous_batching_impl.hpp @@ -0,0 +1,80 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "continuous_batching_impl_interface.hpp" +#include "openvino/genai/continuous_batching_pipeline.hpp" +#include "cache_eviction.hpp" + +namespace ov::genai { +class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatchingPipeline::ImplInterface { +protected: + std::shared_ptr<Scheduler> m_scheduler; + std::shared_ptr<CacheManager> m_cache_manager; + std::shared_ptr<ModelRunner> m_model_runner; + std::shared_ptr<Sampler> m_sampler; + + // current requests to process + std::vector<SequenceGroup::Ptr> m_requests; + // requests added to the pipeline that will be added to m_requests in the next iteration + std::vector<SequenceGroup::Ptr> m_awaiting_requests; + // Mutex protecting access to m_awaiting_requests, so add_request and step methods can be called from different threads + std::mutex m_awaiting_requests_mutex; + + std::map<size_t, CacheEvictionAlgorithm> m_seq_group_id_to_cache_eviction_algo_map; + + static const size_t AVG_CACHE_USAGE_WINDOW_SIZE_IN_STEPS = 1000; + std::deque<float> m_previous_step_cache_usages; + +#ifdef DEBUG_CACHE_STATE_DUMP + size_t step_count = 0; +#endif + + void _free_non_running_requests(); + void _notify_requests_dropped_by_handle(); + void _register_step_cache_usage(float step_cache_usage); + + float _get_current_running_average_cache_usage() const; + + void maybe_evict_cache_blocks(const SchedulerConfig& sched_config); +public: + ContinuousBatchingImpl(const std::string& models_path, + const Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& plugin_config); + + ContinuousBatchingImpl(const std::string& models_path, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& llm_plugin_config, + const ov::AnyMap& tokenizer_plugin_config) + : ContinuousBatchingImpl{ models_path, + Tokenizer(models_path, tokenizer_plugin_config), + scheduler_config, + device, + llm_plugin_config } {}; + + + GenerationHandle add_request(uint64_t request_id, + const ov::Tensor& input_ids, + ov::genai::GenerationConfig sampling_params) override; + GenerationHandle add_request(uint64_t request_id, + const std::string& prompt, + ov::genai::GenerationConfig sampling_params) override; + + bool has_non_finished_requests() override; + + void step() override; + + std::vector<EncodedGenerationResult> + generate(const std::vector<ov::Tensor>& input_ids, + const std::vector<GenerationConfig>& sampling_params, + const StreamerVariant& streamer) override; + std::vector<GenerationResult> + generate(const std::vector<std::string>& prompts, + std::vector<ov::genai::GenerationConfig> sampling_params, + const StreamerVariant& streamer) override; +}; +} \ No newline at end of file diff --git a/src/cpp/src/continuous_batching_impl_interface.cpp 
b/src/cpp/src/continuous_batching_impl_interface.cpp new file mode 100644 index 0000000000..7f7db465fc --- /dev/null +++ b/src/cpp/src/continuous_batching_impl_interface.cpp @@ -0,0 +1,30 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "continuous_batching_impl_interface.hpp" + +namespace ov::genai { +GenerationConfig ContinuousBatchingPipeline::ImplInterface::get_config() const { + return m_generation_config; +} + +PipelineMetrics ContinuousBatchingPipeline::ImplInterface::get_metrics() const { + return m_pipeline_metrics; +} + +Tokenizer ContinuousBatchingPipeline::ImplInterface::get_tokenizer() { + return m_tokenizer; +} + +void ContinuousBatchingPipeline::ImplInterface::start_chat(const std::string& system_message) { + if (!system_message.empty()) { + m_history.push_back({{"role", "system"}, {"content", system_message}}); + } + m_is_chat_conversation = true; +}; + +void ContinuousBatchingPipeline::ImplInterface::finish_chat() { + m_is_chat_conversation = false; + m_history.clear(); +}; +} \ No newline at end of file diff --git a/src/cpp/src/continuous_batching_impl_interface.hpp b/src/cpp/src/continuous_batching_impl_interface.hpp new file mode 100644 index 0000000000..a3615b5828 --- /dev/null +++ b/src/cpp/src/continuous_batching_impl_interface.hpp @@ -0,0 +1,69 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/continuous_batching_pipeline.hpp" + +#include "cache_manager.hpp" +#include "sampler.hpp" +#include "model_runner.hpp" +#include "scheduler.hpp" + +namespace ov::genai { + +class ContinuousBatchingPipeline::ImplInterface { +protected: + Tokenizer m_tokenizer; + + // TODO (mzegla): GenerationConfig is request specific object + // and pipeline only uses default rng_seed. + ov::genai::GenerationConfig m_generation_config; + + PipelineMetrics m_pipeline_metrics; + + struct PerfTime { + float m_paged_attention_time_ms = 0.0f; + float m_matmul_time_ms = 0.0f; + float m_infer_total_ms = 0.0f; + + ~PerfTime() { + std::cout << "Inference requests aggregated statistic: " << std::endl; + std::cout << "Paged attention % of inference execution: " << (m_paged_attention_time_ms / m_infer_total_ms) * 100 << std::endl; + std::cout << "MatMul % of inference execution: " << (m_matmul_time_ms / m_infer_total_ms) * 100 << std::endl; + std::cout << "Total inference execution secs: " << m_infer_total_ms / 1000. 
<< std::endl; + std::cout << std::endl; + } + } m_perf; + bool m_is_chat_conversation = false; + ChatHistory m_history; + +public: + ov::genai::GenerationConfig get_config() const; + PipelineMetrics get_metrics() const; + ov::genai::Tokenizer get_tokenizer(); + + virtual GenerationHandle add_request(uint64_t request_id, + const ov::Tensor& input_ids, + ov::genai::GenerationConfig sampling_params) = 0; + virtual GenerationHandle add_request(uint64_t request_id, + const std::string& prompt, + ov::genai::GenerationConfig sampling_params) = 0; + + virtual bool has_non_finished_requests() = 0; + + virtual void step() = 0; + + virtual std::vector<EncodedGenerationResult> + generate(const std::vector<ov::Tensor>& input_ids, + const std::vector<GenerationConfig>& sampling_params, + const StreamerVariant& streamer) = 0; + virtual std::vector<GenerationResult> + generate(const std::vector<std::string>& prompts, + std::vector<ov::genai::GenerationConfig> sampling_params, + const StreamerVariant& streamer) = 0; + + void start_chat(const std::string& system_message); + void finish_chat(); +}; +} \ No newline at end of file diff --git a/src/cpp/src/continuous_batching_pipeline.cpp b/src/cpp/src/continuous_batching_pipeline.cpp new file mode 100644 index 0000000000..1cfee51652 --- /dev/null +++ b/src/cpp/src/continuous_batching_pipeline.cpp @@ -0,0 +1,78 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <cstdint> +#include <mutex> +#include <memory> +#include <openvino/runtime/properties.hpp> + +#include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_handle.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "continuous_batching_impl.hpp" +#include "timer.hpp" +#include "debug_utils.hpp" +#include "cache_state_dumper.hpp" + +using namespace ov::genai; + +ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::string& models_path, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& llm_plugin_config, + const ov::AnyMap& tokenizer_plugin_config) { + m_impl = std::make_shared<ContinuousBatchingImpl>(models_path, scheduler_config, device, llm_plugin_config, tokenizer_plugin_config); +} + +ContinuousBatchingPipeline::ContinuousBatchingPipeline( + const std::string& model_path, + const Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& plugin_config) { + m_impl = std::make_shared<ContinuousBatchingImpl>(model_path, tokenizer, scheduler_config, device, plugin_config); +} + +ov::genai::Tokenizer ContinuousBatchingPipeline::get_tokenizer() { + return m_impl->get_tokenizer(); +} + +ov::genai::GenerationConfig ContinuousBatchingPipeline::get_config() const{ + return m_impl->get_config(); +} + +PipelineMetrics ContinuousBatchingPipeline::get_metrics() const{ + return m_impl->get_metrics(); +} + +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params) { + return m_impl->add_request(request_id, prompt, sampling_params); +} + +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params) { + return m_impl->add_request(request_id, input_ids, sampling_params); +} + +void ContinuousBatchingPipeline::step() { + m_impl->step(); +} + +bool ContinuousBatchingPipeline::has_non_finished_requests() { + return 
m_impl->has_non_finished_requests(); +} + +std::vector<EncodedGenerationResult> ContinuousBatchingPipeline::generate(const std::vector<ov::Tensor>& input_ids, const std::vector<ov::genai::GenerationConfig>& sampling_params, const StreamerVariant& streamer) { + return m_impl->generate(input_ids, sampling_params, streamer); +} + +std::vector<GenerationResult> ContinuousBatchingPipeline::generate(const std::vector<std::string>& prompts, const std::vector<ov::genai::GenerationConfig>& sampling_params, const StreamerVariant& streamer) { + return m_impl->generate(prompts, sampling_params, streamer); +} + +void ContinuousBatchingPipeline::start_chat(const std::string& system_message) { + m_impl->start_chat(system_message); +}; + +void ContinuousBatchingPipeline::finish_chat() { + m_impl->finish_chat(); +}; diff --git a/src/cpp/src/debug_utils.hpp b/src/cpp/src/debug_utils.hpp new file mode 100644 index 0000000000..948218c671 --- /dev/null +++ b/src/cpp/src/debug_utils.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <string> +#include <iostream> + +#include <openvino/runtime/tensor.hpp> + +template <typename T> +void print_array(T * array, size_t size) { + std::cout << " => [ "; + for (size_t i = 0; i < size; ++i) { + std::cout << array[i] << " "; + } + std::cout << " ] " << std::endl; +} + +inline void print_tensor(std::string name, ov::Tensor tensor) { + std::cout << name; + if (tensor.get_element_type() == ov::element::i32) { + print_array(tensor.data<int>(), tensor.get_size()); + } else if (tensor.get_element_type() == ov::element::i64) { + print_array(tensor.data<int64_t>(), tensor.get_size()); + } else if (tensor.get_element_type() == ov::element::f32) { + print_array(tensor.data<float>(), tensor.get_size()); + } else if (tensor.get_element_type() == ov::element::boolean) { + print_array(tensor.data<bool>(), tensor.get_size()); + } +} diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp new file mode 100644 index 0000000000..945d762462 --- /dev/null +++ b/src/cpp/src/device_config.hpp @@ -0,0 +1,144 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/runtime/core.hpp" +#include "openvino/core/shape.hpp" +#include "openvino/core/type/element_type.hpp" + +#include "openvino/genai/scheduler_config.hpp" + +namespace ov::genai { +class DeviceConfig { + ov::element::Type m_kv_cache_type; + ov::Shape m_key_cache_shape, m_value_cache_shape; + ov::Shape::value_type m_num_kv_heads, m_head_size, m_num_decoder_layers; + size_t m_num_kv_blocks = 0; + size_t m_block_size = 0; + size_t m_cache_size = 0; + std::string m_device; + +public: + DeviceConfig(ov::Core& core, const SchedulerConfig& scheduling_config, const std::string& device, const ov::AnyMap& plugin_config = {}) { + m_device = device; + + // keep information about blocsk + m_block_size = scheduling_config.block_size; + + if (m_device == "CPU") { + auto inference_precision = core.get_property(device, ov::hint::inference_precision); + m_kv_cache_type = inference_precision == ov::element::bf16 ? 
ov::element::bf16 : ov::element::f16; + + // if user sets precision hint, kv cache type should be changed + const auto inference_precision_it = plugin_config.find(ov::hint::inference_precision.name()); + if (inference_precision_it != plugin_config.end()) { + const auto inference_precision = inference_precision_it->second.as<ov::element::Type>(); + if (inference_precision == ov::element::f32) { + m_kv_cache_type = ov::element::f32; + } else if (inference_precision == ov::element::f16) { + m_kv_cache_type = ov::element::f16; + } else if (inference_precision == ov::element::bf16) { + m_kv_cache_type = ov::element::bf16; + } else { + // use default f32 + m_kv_cache_type = ov::element::f32; + } + } + + // if user sets ov::kv_cache_precision hint + const auto kv_cache_precision_it = plugin_config.find(ov::hint::kv_cache_precision.name()); + if (kv_cache_precision_it != plugin_config.end()) { + const auto kv_cache_precision = kv_cache_precision_it->second.as<ov::element::Type>(); + m_kv_cache_type = kv_cache_precision; + } + } else if (m_device.find("GPU") != std::string::npos) { + auto inference_precision = core.get_property(device, ov::hint::inference_precision); + m_kv_cache_type = inference_precision == ov::element::f16 ? ov::element::f16 : ov::element::f32; + + // if user sets precision hint, kv cache type should be changed + const auto inference_precision_it = plugin_config.find(ov::hint::inference_precision.name()); + if (inference_precision_it != plugin_config.end()) { + const auto inference_precision = inference_precision_it->second.as<ov::element::Type>(); + if (inference_precision == ov::element::f16) { + m_kv_cache_type = ov::element::f16; + } else { + // use default f32 + m_kv_cache_type = ov::element::f32; + } + } + } else { + OPENVINO_THROW(m_device, " is not supported by OpenVINO Continuous Batching"); + } + + OPENVINO_ASSERT(scheduling_config.num_kv_blocks > 0 || scheduling_config.cache_size > 0, "num_kv_blocks or cache_size should be more than zero."); + if (scheduling_config.num_kv_blocks > 0) { + m_num_kv_blocks = scheduling_config.num_kv_blocks; + } + else { + m_cache_size = scheduling_config.cache_size; + } + } + + void set_model_params(size_t num_kv_heads, size_t head_size, size_t num_decoder_layers) { + m_num_kv_heads = num_kv_heads; + m_head_size = head_size; + m_num_decoder_layers = num_decoder_layers; + + if (m_device == "CPU") { + // Scale, zero point and quantized data will be stored together. 
+ // The layout for per token per head: + // |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)| + // so, we have to extend head_size by 8, which is sizeof(float) + // for scale and sizeof(float) for zeropoint + if (m_kv_cache_type == ov::element::u8) + m_head_size += 8; + } + + if (m_num_kv_blocks == 0) { + OPENVINO_ASSERT(m_cache_size > 0, "num_kv_blocks or cache_size should be more than zero."); + size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; + m_num_kv_blocks = size_in_bytes / (m_num_decoder_layers * 2 * m_num_kv_heads * m_block_size * m_head_size * m_kv_cache_type.size()); + } + + m_key_cache_shape = m_value_cache_shape = ov::Shape{m_num_kv_blocks, + m_num_kv_heads, + m_block_size, + m_head_size}; + + if (m_device.find("GPU") != std::string::npos) { + // Update key shape, as the key's shape is different from the value's shape + m_key_cache_shape = ov::Shape{m_num_kv_blocks, + m_num_kv_heads, + m_head_size, + m_block_size}; + } + } + + std::string get_device() const { + return m_device; + } + + ov::element::Type get_cache_precision() const { + return m_kv_cache_type; + } + + size_t get_num_layers() const { + return m_num_decoder_layers; + } + + ov::Shape get_key_cache_shape() const { + OPENVINO_ASSERT(!m_key_cache_shape.empty()); + return m_key_cache_shape; + } + + ov::Shape get_value_cache_shape() const { + OPENVINO_ASSERT(!m_value_cache_shape.empty()); + return m_value_cache_shape; + } + + size_t get_num_kv_blocks() const { + return m_num_kv_blocks; + } +}; +} diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp new file mode 100644 index 0000000000..51ca3c3fa2 --- /dev/null +++ b/src/cpp/src/generation_config.cpp @@ -0,0 +1,193 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <fstream> +#include <limits> + +#include <nlohmann/json.hpp> +#include <openvino/runtime/core.hpp> +#include "openvino/genai/generation_config.hpp" +#include "utils.hpp" + + +namespace ov { +namespace genai { + +GenerationConfig::GenerationConfig(const std::string& json_path) { + using utils::read_json_param; + + std::ifstream f(json_path); + OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); + + nlohmann::json data = nlohmann::json::parse(f); + + read_json_param(data, "max_new_tokens", max_new_tokens); + read_json_param(data, "max_length", max_length); + // note that ignore_eos is not present in HF GenerationConfig + read_json_param(data, "ignore_eos", ignore_eos); + read_json_param(data, "min_new_tokens", min_new_tokens); + read_json_param(data, "stop_strings", stop_strings); + // note that include_stop_str_in_output is not present in HF GenerationConfig + read_json_param(data, "include_stop_str_in_output", include_stop_str_in_output); + // note that stop_token_ids is not present in HF GenerationConfig + read_json_param(data, "stop_token_ids", stop_token_ids); + read_json_param(data, "num_beam_groups", num_beam_groups); + read_json_param(data, "num_beams", num_beams); + read_json_param(data, "diversity_penalty", diversity_penalty); + read_json_param(data, "length_penalty", length_penalty); + read_json_param(data, "num_return_sequences", num_return_sequences); + read_json_param(data, "no_repeat_ngram_size", no_repeat_ngram_size); + read_json_param(data, "temperature", temperature); + read_json_param(data, "top_p", top_p); + read_json_param(data, "top_k", top_k); + read_json_param(data, "do_sample", do_sample); + 
read_json_param(data, "repetition_penalty", repetition_penalty); + read_json_param(data, "eos_token_id", eos_token_id); + + if (data.contains("early_stopping")) { + auto field_type = data["early_stopping"].type(); + if (field_type == nlohmann::json::value_t::string && data["early_stopping"] == "never") { + stop_criteria = StopCriteria::NEVER; + } else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == true) { + stop_criteria = StopCriteria::EARLY; + } else if (field_type == nlohmann::json::value_t::boolean && data["early_stopping"] == false) { + stop_criteria = StopCriteria::HEURISTIC; + } + } +} + +void GenerationConfig::set_eos_token_id(size_t tokenizer_eos_token_id) { + if (eos_token_id < 0) { + eos_token_id = tokenizer_eos_token_id; + } else { + OPENVINO_ASSERT(eos_token_id == tokenizer_eos_token_id, + "EOS token ID is different in generation config (", eos_token_id, ") and tokenizer (", + tokenizer_eos_token_id, ")"); + } + // Merge user defined stop tokens with model EOS token + stop_token_ids.insert(eos_token_id); +} + +void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) { + using utils::read_anymap_param; + + read_anymap_param(config_map, "max_new_tokens", max_new_tokens); + read_anymap_param(config_map, "max_length", max_length); + read_anymap_param(config_map, "ignore_eos", ignore_eos); + read_anymap_param(config_map, "min_new_tokens", min_new_tokens); + read_anymap_param(config_map, "stop_strings", stop_strings); + read_anymap_param(config_map, "include_stop_str_in_output", include_stop_str_in_output); + read_anymap_param(config_map, "stop_token_ids", stop_token_ids); + read_anymap_param(config_map, "num_beam_groups", num_beam_groups); + read_anymap_param(config_map, "num_beams", num_beams); + read_anymap_param(config_map, "diversity_penalty", diversity_penalty); + read_anymap_param(config_map, "length_penalty", length_penalty); + read_anymap_param(config_map, "num_return_sequences", num_return_sequences); + read_anymap_param(config_map, "no_repeat_ngram_size", no_repeat_ngram_size); + read_anymap_param(config_map, "stop_criteria", stop_criteria); + read_anymap_param(config_map, "temperature", temperature); + read_anymap_param(config_map, "top_p", top_p); + read_anymap_param(config_map, "top_k", top_k); + read_anymap_param(config_map, "do_sample", do_sample); + read_anymap_param(config_map, "repetition_penalty", repetition_penalty); + read_anymap_param(config_map, "eos_token_id", eos_token_id); + read_anymap_param(config_map, "adapters", adapters); +} + +size_t GenerationConfig::get_max_new_tokens(size_t prompt_length) const { + // max_new_tokens has priority over max_length, only if max_new_tokens was not specified use max_length + if (max_new_tokens != SIZE_MAX) { + return max_new_tokens; + } else { + return max_length - prompt_length; + } +} + +bool GenerationConfig::is_greedy_decoding() const { + return !do_sample && !is_beam_search(); +} + +bool GenerationConfig::is_beam_search() const { + return num_beams > 1; +} + +bool GenerationConfig::is_multinomial() const { + return do_sample; +} + +void GenerationConfig::validate() const { + OPENVINO_ASSERT(!do_sample || num_beams == 1, + "Beam search with sampling is not supported yet. 
" + "Please either set do_sample=false to use beam search " + "or set num_beams=1 if you with to use multinomial sampling."); + OPENVINO_ASSERT(num_return_sequences > 0, "num_return_sequences must be greater than 0"); + OPENVINO_ASSERT(max_new_tokens > 0, "'max_new_tokens' must be greater than 0"); + OPENVINO_ASSERT(min_new_tokens <= max_new_tokens, "min_new_tokens must be less or equal max_new_tokens"); + OPENVINO_ASSERT( + num_beams % num_beam_groups == 0, + "number of beams should be divisible by number of groups" + ); + + // max_new_tokens has priority over max_length + // if max_new_tokens is defined no need to check max_length + OPENVINO_ASSERT(max_new_tokens != SIZE_MAX || max_length > 0, + "'max_length' must be greater than 0 or 'max_new_tokens' should be defined"); + + OPENVINO_ASSERT(!do_sample || top_k > 0, + "top_k must be a strictly positive, but got ", + top_k); + OPENVINO_ASSERT(!do_sample || (top_p > 0 && top_p <= 1.0f), + "top_p must be a positive float > 0 and < 1, but got ", + top_p); + OPENVINO_ASSERT(!do_sample || temperature > 0, + "Temperature must be a strictly positive float, but got ", + temperature); + + OPENVINO_ASSERT(repetition_penalty > 0, + "Repetition penalty must be a strictly positive float, but got ", + repetition_penalty); + + OPENVINO_ASSERT(!ignore_eos || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, + "ignore_eos == true, in this case either 'max_new_tokens', or 'max_length' should be defined."); + + OPENVINO_ASSERT(eos_token_id != -1 || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, + "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined."); + if (is_beam_search()) { + OPENVINO_ASSERT(no_repeat_ngram_size > 0, "no_repeat_ngram_size must be positive"); + } else { + OPENVINO_ASSERT(frequency_penalty >= -2.0f && frequency_penalty <= 2.0f, "frequence_penalty penalty must be a [-2; +2]"); + OPENVINO_ASSERT(presence_penalty >= -2.0f && presence_penalty <= 2.0f, "presence_penalty penalty must be a [-2; +2]"); + } +} + +GenerationConfig beam_search() { + GenerationConfig beam_search_config; + beam_search_config.num_beams = 4; + beam_search_config.num_return_sequences = 3; + beam_search_config.num_beam_groups = 2; + beam_search_config.max_new_tokens = 100; + beam_search_config.diversity_penalty = 2.0f; + return beam_search_config; +} + +GenerationConfig greedy() { + GenerationConfig greedy_config; + greedy_config.max_new_tokens = 30; + return greedy_config; +} + +GenerationConfig multinomial() { + GenerationConfig multinomial_config; + multinomial_config.do_sample = true; + multinomial_config.temperature = 0.9f; + multinomial_config.top_p = 0.9f; + multinomial_config.top_k = 20; + multinomial_config.num_return_sequences = 3; + multinomial_config.presence_penalty = 0.01f; + multinomial_config.frequency_penalty = 0.1f; + multinomial_config.min_new_tokens = 15; + multinomial_config.max_new_tokens = 30; + return multinomial_config; +} +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/generation_handle.cpp b/src/cpp/src/generation_handle.cpp new file mode 100644 index 0000000000..8bf838ef9e --- /dev/null +++ b/src/cpp/src/generation_handle.cpp @@ -0,0 +1,70 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <openvino/openvino.hpp> + +#include "openvino/genai/generation_handle.hpp" +#include "generation_stream.hpp" + +using namespace ov::genai; + +GenerationHandleImpl::~GenerationHandleImpl() { + drop(); +} + +GenerationStatus 
GenerationHandleImpl::get_status() { + return m_generation_stream->get_status(); +} + +bool GenerationHandleImpl::can_read() { + return !is_dropped() && m_generation_stream->can_read(); +} + +bool GenerationHandleImpl::is_dropped() { + return get_status() == GenerationStatus::DROPPED_BY_HANDLE; +} + +void GenerationHandleImpl::drop() { + m_generation_stream->drop(); +} + +std::unordered_map<uint64_t, GenerationOutput> GenerationHandleImpl::back() { + OPENVINO_ASSERT(!is_dropped(), "GenerationHandle cannot be used after it is dropped."); + return m_generation_stream->back(); +} + +std::unordered_map<uint64_t, GenerationOutput> GenerationHandleImpl::read() { + OPENVINO_ASSERT(!is_dropped(), "GenerationHandle cannot be used after it is dropped."); + return m_generation_stream->read(); +} + +void add_partial_result(std::unordered_map<uint64_t, GenerationOutput>& partial_results, std::unordered_map<uint64_t, GenerationOutput>& iteration_results) { + for (auto& iteration_result: iteration_results) { + auto partial_result_iter = partial_results.find(iteration_result.first); + if (partial_result_iter == partial_results.end()) { + partial_results.emplace(iteration_result.first, iteration_result.second); + } else { + partial_result_iter->second.generated_ids.push_back(iteration_result.second.generated_ids[0]); + partial_result_iter->second.generated_log_probs.push_back(iteration_result.second.generated_log_probs[0]); + partial_result_iter->second.score = iteration_result.second.score; + partial_result_iter->second.finish_reason = iteration_result.second.finish_reason; + } + } +} + +std::vector<GenerationOutput> GenerationHandleImpl::read_all() { + OPENVINO_ASSERT(!is_dropped(), "GenerationHandle cannot be used after it is dropped."); + std::vector<GenerationOutput> results; + std::unordered_map<uint64_t, GenerationOutput> partial_results; + // We iterate until generation is running or there are tokens we haven't read yet + while (get_status() == GenerationStatus::RUNNING || can_read()) { + // For unary case there's only one iteration and we get all results in a single read() call + std::unordered_map<uint64_t, GenerationOutput> iteration_results = read(); + add_partial_result(partial_results, iteration_results); + } + + for (auto& partial_result: partial_results) { + results.push_back(partial_result.second); + } + return results; +} diff --git a/src/cpp/src/generation_stream.hpp b/src/cpp/src/generation_stream.hpp new file mode 100644 index 0000000000..c092dd0d64 --- /dev/null +++ b/src/cpp/src/generation_stream.hpp @@ -0,0 +1,60 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include <mutex> +#include <atomic> +#include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_handle.hpp" +#include "synchronized_queue.hpp" + +namespace ov::genai { +class GenerationStream { + std::mutex m_mutex; + GenerationStatus m_status = GenerationStatus::RUNNING; + SynchronizedQueue<GenerationOutputs> m_output_queue; + + std::vector<uint64_t> last_sequence_ids; + +public: + using Ptr = std::shared_ptr<GenerationStream>; + + // Don't use directly + GenerationStream() = default; + + static GenerationStream::Ptr create() { + return std::make_shared<GenerationStream>(); + } + + void push(GenerationOutputs outputs) { + m_output_queue.push(std::move(outputs)); + } + + // Retriving vector of pairs <sequence_id, token_id> as we can generate multiple outputs for a single prompt + GenerationOutputs back() { + return 
m_output_queue.back(); + } + GenerationOutputs read() { + return m_output_queue.pull(); + } + + bool can_read() { + return !m_output_queue.empty(); + } + + void set_generation_status(GenerationStatus status) { + std::lock_guard<std::mutex> lock(m_mutex); + m_status = status; + } + + GenerationStatus get_status() { + std::lock_guard<std::mutex> lock(m_mutex); + return m_status; + } + + void drop() { + std::lock_guard<std::mutex> lock(m_mutex); + m_status = GenerationStatus::DROPPED_BY_HANDLE; + } +}; +} diff --git a/src/cpp/src/greedy_decoding.cpp b/src/cpp/src/greedy_decoding.cpp new file mode 100644 index 0000000000..2f1ed3f89d --- /dev/null +++ b/src/cpp/src/greedy_decoding.cpp @@ -0,0 +1,137 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/perf_metrics.hpp" +#include "utils.hpp" + +namespace ov { +namespace genai { + +EncodedResults greedy_decoding( + ov::InferRequest& m_model_runner, + ov::Tensor input_ids, + ov::Tensor attention_mask, + const ov::genai::GenerationConfig generation_config, + const std::shared_ptr<StreamerBase> streamer, + std::optional<ov::Tensor> position_ids +) { + ov::Shape prompts_shape = input_ids.get_shape(); + const size_t batch_size = prompts_shape[0]; + size_t running_batch_size = batch_size; + size_t prompt_len = prompts_shape[1]; + size_t max_new_tokens = generation_config.get_max_new_tokens(prompt_len); + + // Initialize results and performance metrics. + EncodedResults results; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; + raw_perf_counters.m_new_token_times.reserve(max_new_tokens); + raw_perf_counters.m_batch_sizes.reserve(max_new_tokens); + raw_perf_counters.m_token_infer_durations.reserve(max_new_tokens); + raw_perf_counters.m_inference_durations = {{ MicroSeconds(0.0f) }}; + + results.scores.resize(running_batch_size); + results.tokens.resize(running_batch_size); + std::fill(results.scores.begin(), results.scores.end(), 0); + + m_model_runner.set_tensor("input_ids", input_ids); + m_model_runner.set_tensor("attention_mask", attention_mask); + if (position_ids.has_value()) + m_model_runner.set_tensor("position_ids", *position_ids); + + m_model_runner.get_tensor("beam_idx").set_shape({running_batch_size}); + auto beam_data = m_model_runner.get_tensor("beam_idx").data<int32_t>(); + std::iota(beam_data, beam_data + running_batch_size, 0); + + const auto infer_start = std::chrono::steady_clock::now(); + m_model_runner.infer(); + const auto infer_ms = PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); + raw_perf_counters.m_inference_durations[0] = MicroSeconds(infer_ms); + raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms); + auto logits = m_model_runner.get_tensor("logits"); + + ov::Shape logits_shape = logits.get_shape(); + size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; + m_model_runner.get_tensor("input_ids").set_shape({running_batch_size, 1}); + + std::vector<int64_t> token_iter_results(running_batch_size); // results of a single infer request + std::vector<int> eos_met(running_batch_size, 0); // use int because can not use std::all_of with vector<bool> + for (size_t batch = 0; batch < running_batch_size; ++batch) { + auto out_token = utils::argmax(logits, batch); + results.tokens[batch].emplace_back(out_token); + + token_iter_results[batch] = out_token; + eos_met[batch] = (out_token == generation_config.eos_token_id); + m_model_runner.get_tensor("input_ids").data<int64_t>()[batch] = out_token; + } + 
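+    // The prefill inference above produced the first generated token for each batch entry;
+    // record its timestamp and the batch size so results.perf_metrics can later be derived
+    // from these raw counters.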
raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); + + if (streamer && streamer->put(token_iter_results[0])) { + return results; + } + + bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); + if (!generation_config.ignore_eos && all_are_eos) + return results; + + for (size_t i = 0; i < max_new_tokens - 1; ++i) { + if (position_ids.has_value()) + utils::update_position_ids(m_model_runner.get_tensor("position_ids"), m_model_runner.get_tensor("attention_mask")); + m_model_runner.set_tensor("attention_mask", utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); + + const auto infer_start = std::chrono::steady_clock::now(); + m_model_runner.infer(); + const auto infer_ms = PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); + raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms); + raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms); + + auto logits = m_model_runner.get_tensor("logits"); + + ov::Shape logits_shape = logits.get_shape(); + size_t seq_len = logits_shape[1], vocab_size = logits_shape[2]; + + std::vector<int64_t> token_iter_results(running_batch_size); // results of a single infer request + std::vector<int> eos_met(running_batch_size, 0); // use int because can not use std::all_of with vector<bool> + for (size_t batch = 0; batch < running_batch_size; ++batch) { + auto out_token = ov::genai::utils::argmax(logits, batch); + results.tokens[batch].emplace_back(out_token); + + token_iter_results[batch] = out_token; + eos_met[batch] = (out_token == generation_config.eos_token_id); + + m_model_runner.get_tensor("input_ids").data<int64_t>()[batch] = out_token; + } + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); + + if (streamer && streamer->put(token_iter_results[0])) + return results; + + if (generation_config.ignore_eos) + continue; + + // stop generation when EOS is met in all batches + bool all_are_eos = std::all_of(eos_met.begin(), eos_met.end(), [](int elem) { return elem == 1; }); + if (all_are_eos) + break; + + // Filter out batches where eos is met + std::vector<int32_t> beam_idx(running_batch_size); + std::iota(beam_idx.begin(), beam_idx.end(), 0); + auto end_it = std::remove_if(beam_idx.begin(), beam_idx.end(), [&eos_met](int idx) { return eos_met[idx]; }); + beam_idx.erase(end_it, beam_idx.end()); // Remove the eos met indices + + m_model_runner.get_tensor("beam_idx").set_shape({beam_idx.size()}); + auto beam_data = m_model_runner.get_tensor("beam_idx").data<int32_t>(); + std::copy(beam_idx.begin(), beam_idx.end(), beam_data); + running_batch_size = beam_idx.size(); + } + if (streamer) { + streamer->end(); + } + + return results; +} + +} //namespace genai +} //namespace ov diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp new file mode 100644 index 0000000000..1b9729b2f6 --- /dev/null +++ b/src/cpp/src/group_beam_searcher.cpp @@ -0,0 +1,483 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <openvino/runtime/tensor.hpp> + +#include "openvino/genai/llm_pipeline.hpp" +#include "utils.hpp" + +namespace { + +// Modifyed Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurance in haystack +std::vector<int64_t> kmp_search(const std::vector<int64_t>& haystack, const 
std::vector<int64_t>& needle) { + if (needle.empty()) { // no_repeat_ngram_size == 1, ban every token + return {haystack.begin(), haystack.end()}; + } + std::vector<int> partial_match_table(needle.size() + 1, -1); + int cnd = 0; + for (size_t pos = 1; pos < needle.size(); ++pos) { + if (needle.at(pos) == needle.at(size_t(cnd))) { + partial_match_table.at(pos) = partial_match_table.at(size_t(cnd)); + } else { + partial_match_table.at(pos) = cnd; + while (cnd >= 0 && needle.at(pos) != needle.at(size_t(cnd))) { + cnd = partial_match_table.at(size_t(cnd)); + } + } + ++cnd; + } + partial_match_table.back() = cnd; + std::vector<int64_t> res; + size_t haystack_id = 0; + int needle_id = 0; + while (haystack_id < haystack.size() - 1) { + if (needle.at(size_t(needle_id)) == haystack.at(haystack_id)) { + ++haystack_id; + ++needle_id; + if (needle_id == int(needle.size())) { + res.push_back(haystack.at(haystack_id)); + needle_id = partial_match_table.at(size_t(needle_id)); + } + } else { + needle_id = partial_match_table.at(size_t(needle_id)); + if (needle_id < 0) { + ++haystack_id; + ++needle_id; + } + } + } + return res; +} + +struct Token { + float log_prob; + int64_t idx; +}; + +std::vector<Token> log_softmax(const ov::Tensor& logits, const size_t batch_idx) { + if (logits.get_shape().at(0) <= batch_idx) { + throw std::runtime_error("logits batch size doesn't match the number of beams"); + } + size_t vocab_size = logits.get_shape().back(); + size_t batch_offset = batch_idx * logits.get_shape().at(1) * vocab_size; + size_t sequence_offset = (logits.get_shape().at(1) - 1) * vocab_size; + const float* beam_logits = logits.data<const float>() + batch_offset + sequence_offset; + float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size); + float log_sum = std::log( + std::accumulate(beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_logit); + })); + std::vector<Token> tokens; + tokens.reserve(vocab_size); + for (size_t idx = 0; idx < vocab_size; ++idx) { + tokens.push_back({beam_logits[idx] - max_logit - log_sum, int64_t(idx)}); + } + return tokens; +} + +struct Beam { + float score = -std::numeric_limits<float>::infinity(); // The bigger, the better + std::vector<int64_t> tokens; + size_t global_beam_idx = 0; +}; + +bool greater(const Beam& left, const Beam& right) { + return left.score > right.score; +} + +struct Parameters { + std::vector<std::vector<int64_t>> prompts; + int64_t eos_token_id; + size_t n_groups = 3; + size_t group_size = 5; + float diversity_penalty = 1.0; + size_t max_new_tokens = 20; + ov::genai::StopCriteria stop_criteria = ov::genai::StopCriteria::HEURISTIC; + float length_penalty = 1.0; + size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max(); + + std::function<bool(const Beam&)> early_finish = [](const Beam&) { + return false; + }; +}; + +struct Group { + std::vector<Beam> ongoing; // Best beams in front + std::vector<Beam> min_heap; // The worst of the best completed beams is the first + bool done = false; + + void finish(Beam&& beam, const Parameters& parameters) { + beam.score /= std::pow(float(beam.tokens.size()), parameters.length_penalty); + + min_heap.push_back(std::move(beam)); + std::push_heap(min_heap.begin(), min_heap.end(), greater); + if (min_heap.size() > parameters.group_size) { + std::pop_heap(min_heap.begin(), min_heap.end(), greater); + min_heap.pop_back(); + } + } + void is_done(const Parameters& parameters) { + if (min_heap.size() < 
parameters.group_size) { + return; + } + size_t cur_len = ongoing.front().tokens.size(); + float best_sum_logprobs = ongoing.front().score; + float worst_score = min_heap.front().score; + switch (parameters.stop_criteria) { + case ov::genai::StopCriteria::EARLY: + done = true; + return; + case ov::genai::StopCriteria::HEURISTIC: { + float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty); + done = worst_score >= highest_attainable_score; + return; + } + case ov::genai::StopCriteria::NEVER: { + size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len; + float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty); + done = worst_score >= highest_attainable_score; + return; + } + default: + throw std::runtime_error("Never reached"); + } + } +}; + +// GroupBeamSearcher processes logits produced by a language model and accumulates beams using the group beam search +// algorithm. select_next_tokens() returns token ids selected by the algorithm and the corresponding beam ids. These values +// are used for the next inference. select_next_tokens() returns empty vectors if all groups are completed +struct GroupBeamSearcher { + Parameters parameters; + std::vector<std::vector<Group>> prompts_groups; + + GroupBeamSearcher(Parameters parameters) : parameters{parameters}, prompts_groups{parameters.prompts.size()} { + if (parameters.no_repeat_ngram_size == 0) { + throw std::runtime_error("no_repeat_ngram_size must be positive"); + } + for (std::vector<Group>& prompts_groups : prompts_groups) { + prompts_groups.resize(parameters.n_groups); + for (Group& group : prompts_groups) { + group.ongoing.resize(parameters.group_size); + group.ongoing.front().score = 0.0; + } + } + } + + std::pair<std::vector<int64_t>, std::vector<int32_t>> select_next_tokens(const ov::Tensor& logits) { + std::vector<int64_t> next_tokens; + std::vector<int32_t> next_beams; + + const size_t promts_size = parameters.prompts.size(); + + next_tokens.reserve(promts_size * parameters.n_groups * parameters.group_size); + next_beams.reserve(promts_size * parameters.n_groups * parameters.group_size); + + size_t beam_count = 0; + size_t prompt_id = 0; + for (std::vector<Group>& groups : prompts_groups) { + for (Group& group : groups) { + if (group.done) { + continue; + } + for (Beam& beam : group.ongoing) { + // beam.tokens.empty() holds for the first select_next_tokens() call.
+ // Every beam is constructed from the single batch at first call + if (beam.tokens.empty()) { + beam.global_beam_idx = prompt_id; + } else { + beam.global_beam_idx = beam_count; + ++beam_count; + } + } + } + + prompt_id += 1; + } + + for (int prompt_id = 0; prompt_id < promts_size; prompt_id++) { + const std::vector<int64_t> prompt = parameters.prompts[prompt_id]; + std::vector<Group>& groups = prompts_groups[prompt_id]; + auto [prompt_next_tokens, prompt_next_beams] = select_prompt_next_tokens(logits, prompt, groups); + + next_tokens.insert(next_tokens.end(), prompt_next_tokens.begin(), prompt_next_tokens.end()); + next_beams.insert(next_beams.end(), prompt_next_beams.begin(), prompt_next_beams.end()); + } + + return {next_tokens, next_beams}; + } + + std::pair<std::vector<int64_t>, std::vector<int32_t>> select_prompt_next_tokens(const ov::Tensor& logits, + const std::vector<int64_t>& prompt, + std::vector<Group>& groups) { + std::vector<int64_t> next_tokens; + std::vector<int32_t> next_beams; + next_tokens.reserve(parameters.n_groups * parameters.group_size); + next_beams.reserve(parameters.n_groups * parameters.group_size); + + for (auto group = groups.begin(); group != groups.end(); ++group) { + if (group->done) { + continue; + } + std::vector<Beam> candidates; + candidates.reserve(parameters.group_size * 2 * parameters.group_size); + for (const Beam& beam : group->ongoing) { + std::vector<Token> tokens = log_softmax(logits, beam.global_beam_idx); + for (auto prev_group = groups.cbegin(); prev_group != group; ++prev_group) { + for (const Beam& prev_beam : prev_group->ongoing) { + if (prev_beam.tokens.size() > beam.tokens.size()) { + tokens.at(size_t(prev_beam.tokens.back())).log_prob -= parameters.diversity_penalty; + } + } + } + std::vector<int64_t> full_text{prompt}; + full_text.insert(full_text.end(), beam.tokens.begin(), beam.tokens.end()); + if (full_text.size() > 1 && full_text.size() >= parameters.no_repeat_ngram_size) { + auto tail_start = full_text.end() - ptrdiff_t(parameters.no_repeat_ngram_size) + 1; + for (int64_t banned_token : kmp_search(full_text, {tail_start, full_text.end()})) { + tokens.at(size_t(banned_token)).log_prob = -std::numeric_limits<float>::infinity(); + } + } + std::sort(tokens.begin(), tokens.end(), [](Token left, Token right) { + return left.log_prob > right.log_prob; // Most probable tokens in front + }); + size_t add_count = 0; + for (Token token : tokens) { + Beam new_candidate = beam; + new_candidate.score += token.log_prob; + new_candidate.tokens.push_back(token.idx); + if (parameters.early_finish(new_candidate)) { + group->finish(std::move(new_candidate), parameters); + } else { + candidates.push_back(std::move(new_candidate)); + ++add_count; + if (add_count == 2 * parameters.group_size) { + break; + } + } + } + } + // Sample 2 * group_size highest score tokens to get at least 1 non EOS token per beam + if (candidates.size() < 2 * parameters.group_size) { + throw std::runtime_error("No beams left to search"); + } + auto to_sort = candidates.begin() + ptrdiff_t(2 * parameters.group_size); + std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater); + group->ongoing.clear(); + for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) { + if (parameters.eos_token_id == candidates.at(cand_idx).tokens.back()) { + // If beam_token does not belong to top num_beams tokens, it should not be added + if (cand_idx >= parameters.group_size) { + continue; + } + group->finish(std::move(candidates.at(cand_idx)), parameters); + } else { 
+ group->ongoing.push_back(std::move(candidates.at(cand_idx))); + if (group->ongoing.size() == parameters.group_size) { + break; + } + } + } + group->is_done(parameters); + if (!group->done) { + for (const Beam& beam : group->ongoing) { + next_tokens.push_back(beam.tokens.back()); + next_beams.push_back(int32_t(beam.global_beam_idx)); + } + } + } + return {next_tokens, next_beams}; + } +}; + +// Consume group_beam_searcher because beams are consumed +std::vector<std::vector<std::vector<Beam>>> finalize(GroupBeamSearcher&& group_beam_searcher) { + std::vector<std::vector<std::vector<Beam>>> finalized; + finalized.resize(group_beam_searcher.prompts_groups.size()); + + for (size_t prompt_id = 0; prompt_id < group_beam_searcher.prompts_groups.size(); prompt_id++) { + std::vector<Group>& groups = group_beam_searcher.prompts_groups.at(prompt_id); + finalized.at(prompt_id).reserve(groups.size()); + + for (Group& group : groups) { + if (!group.done) { + for (Beam& beam : group.ongoing) { + group.finish(std::move(beam), group_beam_searcher.parameters); + } + } + finalized.at(prompt_id).push_back(std::move(group.min_heap)); + } + } + + return finalized; +} + +void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector<int32_t> next_beams) { + ov::Tensor original_mask{ov::element::i64, attention_mask.get_shape()}; + ov::Shape original_shape = original_mask.get_shape(); + attention_mask.copy_to(original_mask); + + ov::Shape new_shape{next_beams.size(), original_mask.get_shape().at(1) + 1}; + attention_mask.set_shape(new_shape); + + for (size_t beam_id = 0; beam_id < next_beams.size(); beam_id++) { + const size_t original_prompt_offset = next_beams.at(beam_id) * original_shape.at(1); + const size_t result_prompt_offset = beam_id * new_shape.at(1); + + int64_t* dest = attention_mask.data<int64_t>() + result_prompt_offset; + const int64_t* src = original_mask.data<int64_t>() + original_prompt_offset; + + std::memcpy(dest, src, original_shape.at(1) * sizeof(int64_t)); + attention_mask.data<int64_t>()[result_prompt_offset + new_shape.at(1) - 1] = 1; + } +} + +void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { + const size_t batch_size = attention_mask.get_shape().at(0); + const size_t sequence_length = attention_mask.get_shape().at(1); + position_ids.set_shape({batch_size, 1}); + + for (size_t batch = 0; batch < batch_size; batch++) { + int64_t* mask_start = attention_mask.data<int64_t>() + batch * sequence_length; + position_ids.data<int64_t>()[batch] = std::accumulate(mask_start, mask_start + sequence_length - 1, 0); + } +} + +void reset_all_inputs_to_empty_tensors(ov::InferRequest& request) { + request.set_tensor("input_ids", ov::Tensor(ov::element::i64, {0, 0})); + request.set_tensor("beam_idx", ov::Tensor(ov::element::i32, {0})); + if (request.get_compiled_model().inputs().size() == 4) + request.set_tensor("position_ids", ov::Tensor(ov::element::i64, {0, 0})); +} +} // namespace + +namespace ov { +namespace genai { + +std::pair<EncodedResults, int32_t> beam_search(ov::InferRequest& lm, + ov::Tensor input_ids, + ov::Tensor attention_mask, + GenerationConfig config, + std::optional<ov::Tensor> position_ids, + std::optional<int32_t> selected_beam_idx) { + OPENVINO_ASSERT(config.num_beams % config.num_beam_groups == 0, + "number of beams should be divisible by number of groups"); + + auto batch_size = input_ids.get_shape().at(0); + auto sequence_length = input_ids.get_shape().at(1); + + // Initialize beam search. 
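+    // Illustrative example (not part of the original patch): with config.num_beams = 6 and
+    // config.num_beam_groups = 3 the assertion above holds (6 % 3 == 0), and the Parameters
+    // filled below get n_groups = 3 and group_size = 6 / 3 = 2, i.e. two ongoing beams per
+    // group; diversity_penalty then only penalizes tokens already chosen by earlier groups.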
+ const int64_t* prompt_data = input_ids.data<const int64_t>(); + std::vector<std::vector<int64_t>> prompts; + prompts.reserve(batch_size); + for (size_t batch = 0; batch < batch_size; batch++) { + size_t batch_offset = batch * sequence_length; + const int64_t* prompt_start = prompt_data + batch_offset; + prompts.push_back(std::vector<int64_t>{prompt_start, prompt_start + sequence_length}); + } + + lm.set_tensor("input_ids", input_ids); + lm.set_tensor("attention_mask", attention_mask); + if (position_ids.has_value()) + lm.set_tensor("position_ids", *position_ids); + + ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {batch_size}); + auto beam_data = beam_idx.data<int32_t>(); + if (selected_beam_idx.has_value()) + beam_data[0] = *selected_beam_idx; + else + std::fill_n(beam_data, batch_size, 0); + lm.set_tensor("beam_idx", beam_idx); + + Parameters parameters{std::move(prompts)}; + parameters.max_new_tokens = config.get_max_new_tokens(sequence_length); + parameters.eos_token_id = config.eos_token_id; + parameters.n_groups = config.num_beam_groups; + parameters.group_size = config.num_beams / config.num_beam_groups; + parameters.diversity_penalty = config.diversity_penalty; + parameters.length_penalty = config.length_penalty; + parameters.stop_criteria = config.stop_criteria; + parameters.no_repeat_ngram_size = config.no_repeat_ngram_size; + GroupBeamSearcher group_beam_searcher{parameters}; + + std::vector<int64_t> next_tokens; + std::vector<int32_t> next_beams; + + // Reserve for performance counters. + std::vector<std::chrono::steady_clock::time_point> new_token_times; + std::vector<size_t> batch_sizes; + new_token_times.reserve(parameters.max_new_tokens); + batch_sizes.reserve(parameters.max_new_tokens); + + for (size_t length_count = 0; ; ++length_count) { + lm.infer(); + + std::tie(next_tokens, next_beams) = group_beam_searcher.select_next_tokens(lm.get_tensor("logits")); + new_token_times.emplace_back(std::chrono::steady_clock::now()); + batch_sizes.emplace_back(batch_size); + + if (next_tokens.empty() || length_count == parameters.max_new_tokens - 1) { + // Break the cycle before masks are extended in update_attention_mask_with_beams. + // If generation is continued, attention_mask length should be equal to KV cache size. 
+ break; + } + + size_t running_batch_size = next_tokens.size(); + // Set pointers + lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {running_batch_size, 1}, next_tokens.data()}); + lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {running_batch_size}, next_beams.data()}); + + // Set auxiliary inputs + update_attention_mask_with_beams(lm.get_tensor("attention_mask"), next_beams); + if (position_ids.has_value()) + update_position_ids(lm.get_tensor("position_ids"), lm.get_tensor("attention_mask")); + } + + reset_all_inputs_to_empty_tensors(lm); + + auto scores_comparator = [](Beam& left, Beam& right) { + return (left.score > right.score); + }; + + auto result = finalize(std::move(group_beam_searcher)); + ov::genai::EncodedResults results; + int32_t res_selected_beam_idx = 0; + results.scores.reserve(config.num_return_sequences * result.size()); + results.tokens.reserve(config.num_return_sequences * result.size()); + auto& raw_perf_counters = results.perf_metrics.raw_metrics; + raw_perf_counters.m_new_token_times = new_token_times; + raw_perf_counters.m_batch_sizes = batch_sizes; + + // align output with HF + for (size_t prompt_id = 0; prompt_id < result.size(); prompt_id++) { + auto prompt_group = result.at(prompt_id); + std::vector<std::reference_wrapper<Beam>> plain_beams; + plain_beams.reserve(parameters.n_groups * parameters.group_size); + for (std::vector<Beam>& group : prompt_group) { + for (Beam& beam : group) { + plain_beams.push_back(beam); + } + } + assert(config.num_return_sequences <= plain_beams.size()); + std::partial_sort( + plain_beams.begin(), + plain_beams.begin() + config.num_return_sequences, + plain_beams.end(), + scores_comparator + ); + res_selected_beam_idx = plain_beams.at(0).get().global_beam_idx; + for ( + auto beam = plain_beams.begin(); + beam != plain_beams.begin() + config.num_return_sequences; + ++beam + ) { + results.scores.push_back(beam->get().score); + results.tokens.push_back(std::move(beam->get().tokens)); + } + } + + return {results, res_selected_beam_idx}; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp new file mode 100644 index 0000000000..e3815e5944 --- /dev/null +++ b/src/cpp/src/llm_pipeline.cpp @@ -0,0 +1,599 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <filesystem> +#include <fstream> +#include <variant> +#include <algorithm> +#include <nlohmann/json.hpp> +#include <openvino/openvino.hpp> +#include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_config.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/perf_metrics.hpp" +#include "llm_pipeline_base.hpp" +#include "llm_pipeline_static.hpp" +#include "utils.hpp" +#include "text_callback_streamer.hpp" +#include "openvino/genai/lora_adapter.hpp" +#include "lora_helper.hpp" + +namespace ov { +namespace genai { + +ov::genai::EncodedResults greedy_decoding( + ov::InferRequest& model_runner, + ov::Tensor prompts, + ov::Tensor attention_mask, + const GenerationConfig sampling_params, + const std::shared_ptr<StreamerBase> streamer, + std::optional<ov::Tensor> position_ids +); + +ov::genai::EncodedResults multinominal_decoding( + ov::InferRequest& model_runner, + ov::Tensor prompts, + ov::Tensor attention_mask, + GenerationConfig sampling_params, + std::shared_ptr<StreamerBase> streamer, + std::optional<ov::Tensor> position_ids +); + +std::pair<EncodedResults, int32_t> beam_search( + ov::InferRequest& lm, 
+ ov::Tensor prompts, + ov::Tensor attention_mask, + GenerationConfig config, + std::optional<ov::Tensor> position_ids, + std::optional<int32_t> selected_beam_idx +); + +class StatefulLLMPipeline final : public LLMPipelineImplBase { +public: + ov::InferRequest m_model_runner; + + bool is_chat_conversation = false; + bool m_is_cache_empty = true; + std::optional<int32_t> m_selected_beam = std::nullopt; + ChatHistory m_history; + std::string m_templated_chat_history = ""; + + StatefulLLMPipeline( + const ov::InferRequest& request, + const ov::genai::Tokenizer& tokenizer, + OptionalGenerationConfig generation_config=std::nullopt + ): LLMPipelineImplBase(tokenizer), + m_model_runner(request) { + GenerationConfig default_config; + m_generation_config = (generation_config.has_value()) ? *generation_config : default_config; + } + + StatefulLLMPipeline( + const std::filesystem::path& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config + ): + LLMPipelineImplBase(tokenizer, utils::from_config_json_if_exists(model_path)) + { + ov::Core core; + if(auto filtered_plugin_config = extract_adapters_from_properties(plugin_config, &m_generation_config.adapters)) { + auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(*filtered_plugin_config); + core.set_property(core_plugin_config); + auto model = core.read_model(model_path / "openvino_model.xml"); + m_adapter_controller = AdapterController(model, m_generation_config.adapters, "base_model.model.model.", device); // TODO: Make the prefix name configurable + utils::slice_matmul_statefull_model(model); + m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request(); + m_adapter_controller->apply(m_model_runner, m_generation_config.adapters); + } else { + auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config); + core.set_property(core_plugin_config); + auto model = core.read_model(model_path / "openvino_model.xml"); + utils::slice_matmul_statefull_model(model); + m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request(); + } + + // If eos_token_id was not provided, take value + if (m_generation_config.eos_token_id == -1) + m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); + } + + StatefulLLMPipeline( + const std::filesystem::path& model_path, + const std::string& device, + const ov::AnyMap& plugin_config + ): StatefulLLMPipeline{model_path, Tokenizer(model_path.string()), device, plugin_config} {} + + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override { + auto start_time = std::chrono::steady_clock::now(); + GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; + TokenizedInputs encoded_input; + + if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) { + OPENVINO_ASSERT(!is_chat_conversation, "Can't chat with multiple prompts"); + encoded_input = m_tokenizer.encode(*input_vector); + } else if (auto input_prompt = std::get_if<std::string>(&inputs)) { + std::string& prompt = *input_prompt; + + if (is_chat_conversation) { + // KV cache in model already contains prompts and answers from previous iterations. + // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns + // token_ids = {<bos token>, ...<valuable tokens>}. 
So if tokenizer applies only to the new prompt, + // <bos token> will be inserted on every iteration. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt + // and takes only the difference between them. + // The chat history cannot be saved as already encoded tokens because generate call doesn't return <eos> token, but + // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. + + m_history.push_back({{"role", "user"}, {"content", prompt}}); + constexpr bool add_generation_prompt = true; + auto new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + // Do not add special tokens in chat scenario to be aligned with HF. + bool add_special_tokens = false; + auto new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens)); + if (m_is_cache_empty) { + encoded_input = new_chat_tokens; + } else { + auto prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens)); + encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens); + } + m_templated_chat_history = new_templated_chat_history; + // TODO: Forbid LoRA config change if we are in the chat mode, because it requires regenerating the history with LoRA applied + } else { + encoded_input = m_tokenizer.encode(prompt); + } + } + auto encode_stop_time = std::chrono::steady_clock::now(); + auto encoded_results = generate(encoded_input, config, streamer); + + auto decode_start_time = std::chrono::steady_clock::now(); + DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + auto decode_stop_time = std::chrono::steady_clock::now(); + + if (is_chat_conversation) { + // Tail of chat template is missing in KV cache. + // Find the tail to concatenate it with the next input prompt. + auto answer = decoded_results.texts[0]; + m_templated_chat_history.append(answer); + m_history.push_back({{"role", "assistant"}, {"content", answer}}); + } + + // generate_durations + decoded_results.perf_metrics = encoded_results.perf_metrics; + + auto& raw_counters = decoded_results.perf_metrics.raw_metrics; + auto stop_time = std::chrono::steady_clock::now(); + raw_counters.generate_durations = std::vector<MicroSeconds>(); + raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time)); + raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time)); + + // Added tokenization/detokenization times, and updated generate duration, need to reevaluate statistics. 
+ decoded_results.perf_metrics.m_evaluated = false; + decoded_results.perf_metrics.evaluate_statistics(start_time); + return decoded_results; + } + + EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override { + auto start_time = std::chrono::steady_clock::now(); + ov::Tensor input_ids; + ov::Tensor attention_mask; + if (auto data = std::get_if<ov::Tensor>(&inputs)) { + input_ids = *data; + attention_mask = ov::genai::utils::init_attention_mask(input_ids); + } else if (auto data = std::get_if<TokenizedInputs>(&inputs)) { + input_ids = data->input_ids; + attention_mask = data->attention_mask; + } + + GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; + + // If eos_token_id was not provided, take value from default m_generation_config + if (config.eos_token_id == -1) + config.eos_token_id = m_generation_config.eos_token_id; + config.validate(); + + std::shared_ptr<StreamerBase> streamer_ptr; + if (auto streamer_obj = std::get_if<std::monostate>(&streamer)) { + streamer_ptr = nullptr; + } else if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&streamer)) { + streamer_ptr = *streamer_obj; + } else if (auto callback = std::get_if<std::function<bool(std::string)>>(&streamer)) { + streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback); + } + + auto batch_size = input_ids.get_shape().at(0); + if ((batch_size != 1 || !(config.is_greedy_decoding() || config.is_multinomial())) && streamer_ptr) { + OPENVINO_THROW("Currently streaming is possible only with batch size=1 and " + "only for greedy or multinomial decoding"); + } + + auto num_inputs = m_model_runner.get_compiled_model().inputs().size(); + OPENVINO_ASSERT(num_inputs == 4 || num_inputs == 3, "Model should have 3 or 4 inputs: " + "either (input_ids, attention_mask, beam_idx) or " + "(input_ids, attention_mask, position_ids, beam_idx) " + "but you have '" + std::to_string(num_inputs) + "' inputs"); + + + size_t kv_cache_len = 0; + ov::Tensor concatenated_attention_mask; + if (is_chat_conversation && !m_is_cache_empty) { + OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1"); + // If history is saved in KV cache, concatenate new attention_mask with the already existing. + // Between subsequent runs attention_mask should not be modified. 
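+            // Shape walk-through (illustrative, not part of the original patch): if the stored
+            // history mask already covers 10 tokens (kv_cache_len = 10) and the new prompt adds
+            // 4 more (prompt_len = 4), new_atten_mask below is created with shape {1, 14}; its
+            // first 10 values are copied from the history row selected by *m_selected_beam and
+            // the remaining 4 from the freshly tokenized prompt's attention_mask.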
+ auto atten_mask_history = m_model_runner.get_tensor("attention_mask"); + auto prompt_len = attention_mask.get_shape()[1]; + kv_cache_len = atten_mask_history.get_shape()[1]; + + ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}}; + auto start_atten_hst = atten_mask_history.data<int64_t>() + kv_cache_len * (*m_selected_beam); + std::copy(start_atten_hst, start_atten_hst + kv_cache_len, + new_atten_mask.data<int64_t>()); + std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len, + new_atten_mask.data<int64_t>() + kv_cache_len); + concatenated_attention_mask = new_atten_mask; + } else { + concatenated_attention_mask = attention_mask; + } + + bool position_ids_available = (num_inputs == 4); + std::optional<ov::Tensor> position_ids = std::nullopt; + if (position_ids_available) { + position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()}; + utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len); + } + + if(m_adapter_controller) { + m_adapter_controller->apply(m_model_runner, config.adapters); + } + + ov::genai::EncodedResults result; + if (config.is_greedy_decoding()) { + result = ov::genai::greedy_decoding(m_model_runner, input_ids, concatenated_attention_mask, + config, streamer_ptr, position_ids); + m_selected_beam = 0; + } else if (config.is_beam_search()) { + std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_ids, concatenated_attention_mask, + config, position_ids, m_selected_beam); + } else if (config.is_multinomial()) { + result = multinominal_decoding(m_model_runner, input_ids, concatenated_attention_mask, + config, streamer_ptr, position_ids); + m_selected_beam = 0; + } else { + OPENVINO_THROW("No decoding algorithm found for provided configuration parameters."); + } + + if (!is_chat_conversation) { + // FIXME: Reset only KV cache part of state, there is also can be LoRA applied in the states and full reset will need to reapply LoRA even if the LoRA config is not changed + m_model_runner.reset_state(); + if(m_adapter_controller) { + m_adapter_controller->force_full_apply(); // FIXME: Reset only KV cache part to avoid this call + } + m_selected_beam = std::nullopt; + } else { + m_is_cache_empty = false; + } + auto stop_time = std::chrono::steady_clock::now(); + + // If is called without tokenization then that stat will not be reported. 
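+        // Illustrative usage (not part of the original patch): once generate() returns, a caller
+        // can read the values populated below, e.g.
+        //
+        //     auto res = pipe.generate(input_ids, config);   // hypothetical caller code
+        //     auto n_in = res.perf_metrics.num_input_tokens; // set a few lines below
+        //
+        // evaluate_statistics() then folds the raw per-token timings into aggregate statistics.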
+ auto& metrics = result.perf_metrics; + metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1); + metrics.load_time = this->m_load_time_ms; + metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + metrics.evaluate_statistics(start_time); + return result; + } + + void start_chat(const std::string& system_message) override { + is_chat_conversation = true; + m_selected_beam = std::nullopt; + if (!m_is_cache_empty) { + m_model_runner.reset_state(); + m_is_cache_empty = true; + m_history = {}; + m_templated_chat_history = ""; + } + if (system_message.empty()) + return; + + m_history.push_back({{"role", "system"}, {"content", system_message}}); + constexpr bool add_generation_prompt = false; + + m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + } + + void finish_chat() override { + is_chat_conversation = false; + m_selected_beam = std::nullopt; + if (!m_is_cache_empty) { + m_model_runner.reset_state(); + m_is_cache_empty = true; + m_history = {}; + m_templated_chat_history = ""; + } + } +}; + +DecodedResults LLMPipeline::generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer +) { + return m_pimpl->generate(inputs, generation_config, streamer); +} + +DecodedResults LLMPipeline::generate(StringInputs text, const ov::AnyMap& config_map) { + auto config_arg = utils::get_config_from_map(config_map); + GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); + config.update_generation_config(config_map); + + return m_pimpl->generate(text, config, utils::get_streamer_from_map(config_map)); +} + +EncodedResults LLMPipeline::generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer +) { + return m_pimpl->generate(inputs, generation_config, streamer); +} + +EncodedResults LLMPipeline::generate(const EncodedInputs& inputs, const ov::AnyMap& config_map) { + auto config_arg = utils::get_config_from_map(config_map); + GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); + config.update_generation_config(config_map); + + return m_pimpl->generate(inputs, config, utils::get_streamer_from_map(config_map)); +} + +std::pair<std::string, Any> streamer(StreamerVariant func) { + if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&func)) { + return {utils::STREAMER_ARG_NAME, Any::make<std::shared_ptr<StreamerBase>>(*streamer_obj)}; + } else { + auto callback = std::get<std::function<bool(std::string)>>(func); + return {utils::STREAMER_ARG_NAME, Any::make<std::function<bool(std::string)>>(callback)}; + } +} + +std::pair<std::string, Any> generation_config(const GenerationConfig& config) { + return {utils::CONFIG_ARG_NAME, Any::make<GenerationConfig>(config)}; +} + +} // namespace genai +} // namespace ov + +namespace { +using namespace ov::genai; + +template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;}; +template<class... Ts> overloaded(Ts...) 
-> overloaded<Ts...>; + +Tokenizer dont_construct() { + OPENVINO_THROW("Continuous Batching backend can't be constructed" + "from ireq because the model must be transformed"); +} + +class ContinuousBatchingAdapter final : public LLMPipelineImplBase { +public: + ContinuousBatchingPipeline m_impl; + + ContinuousBatchingAdapter( + const ov::InferRequest& request, + const Tokenizer& tokenizer, + OptionalGenerationConfig generation_config + ): LLMPipelineImplBase{dont_construct()}, m_impl{"", {}} {} + + ContinuousBatchingAdapter( + const std::filesystem::path& model_path, + const Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& plugin_config + ): LLMPipelineImplBase{tokenizer}, m_impl{ + model_path.string(), + tokenizer, + scheduler_config, + device, + plugin_config + } {} + + ContinuousBatchingAdapter( + const std::filesystem::path& model_path, + const SchedulerConfig& scheduler_config, + const std::string& device, + const ov::AnyMap& plugin_config + ): LLMPipelineImplBase{Tokenizer(model_path.string())}, m_impl{ + model_path.string(), + m_tokenizer, + scheduler_config, + device, + plugin_config + } {} + + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override { + std::vector<std::string> prompts = std::visit(overloaded{ + [](const std::string& prompt) { + return std::vector{prompt}; + }, + [](std::vector<std::string>& prompts) { + return prompts; + } + }, inputs); + const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; + // -1 == config.eos_token_id and config.validate() are handled in m_impl. + std::vector<GenerationResult> generated = m_impl.generate( + prompts, + std::vector<GenerationConfig>{prompts.size(), config}, + streamer + ); + std::vector<std::string> plain_replies; + std::vector<float> plain_scores; + for (GenerationResult& res : generated) { + if (GenerationStatus::FINISHED != res.m_status) { + OPENVINO_THROW("Got unfinished GenerationStatus"); + } + std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_replies)); + std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores)); + } + return {std::move(plain_replies), std::move(plain_scores)}; + } + + EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override { + std::vector<ov::Tensor> input_ids = std::visit(overloaded{ + [](const ov::Tensor& inp) { + size_t batch_size = inp.get_shape().at(0); + if (1 == batch_size) { + return std::vector{inp}; + } + std::vector<ov::Tensor> input_ids; + input_ids.reserve(batch_size); + size_t max_len = inp.get_shape().at(1); + const int64_t* const source = inp.data<const int64_t>(); + for (size_t batch_id = 0; batch_id < batch_size; ++batch_id) { + input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len)); + int64_t* destination = input_ids.back().data<int64_t>(); + std::copy_n(source + batch_id * max_len, max_len, destination); + } + return input_ids; + }, + [](const TokenizedInputs& inp) { + size_t batch_size = inp.input_ids.get_shape().at(0); + std::vector<ov::Tensor> input_ids; + input_ids.reserve(batch_size); + size_t max_len = inp.input_ids.get_shape().at(1); + const int64_t* const source = inp.input_ids.data<const int64_t>(); + const int64_t* const attention_mask = inp.attention_mask.data<const int64_t>(); + for (size_t batch_id = 0; batch_id < 
batch_size; ++batch_id) { + input_ids.emplace_back(ov::element::i64, ov::Shape(1, max_len)); + int64_t* destination = input_ids.back().data<int64_t>(); + size_t copy_count = 0; + for (size_t idx = 0; idx < max_len; ++idx) { + if (1 == attention_mask[batch_id * max_len + idx]) { + destination[copy_count++] = source[batch_id * max_len + idx]; + } + } + input_ids.back().set_shape({1, copy_count}); + } + return input_ids; + } + }, inputs); + const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; + // -1 == config.eos_token_id and config.validate() are handled in m_impl. + std::vector<EncodedGenerationResult> generated = m_impl.generate(input_ids, std::vector<GenerationConfig>{input_ids.size(), config}, streamer); + std::vector<std::vector<int64_t>> plain_tokens; + std::vector<float> plain_scores; + for (EncodedGenerationResult& res : generated) { + if (GenerationStatus::FINISHED != res.m_status) { + OPENVINO_THROW("Got unfinished GenerationStatus"); + } + std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens)); + std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores)); + } + return {std::move(plain_tokens), std::move(plain_scores)}; + } + + void start_chat(const std::string& system_message) override { + m_impl.start_chat(); + }; + + void finish_chat() override { + m_impl.finish_chat(); + }; +}; +} + +ov::genai::LLMPipeline::LLMPipeline( + const ov::InferRequest& request, + const ov::genai::Tokenizer& tokenizer, + OptionalGenerationConfig generation_config +) { + auto start_time = std::chrono::steady_clock::now(); + m_pimpl = std::make_unique<StatefulLLMPipeline>(request, tokenizer, generation_config); + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count(); +} + +ov::genai::LLMPipeline::LLMPipeline( + const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config +){ + auto start_time = std::chrono::steady_clock::now(); + if (plugin_config.find(ov::genai::scheduler_config.name()) != plugin_config.end()) { + auto config_without_scheduler_config = plugin_config; + config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); + auto& scheduler_config = plugin_config.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>(); + m_pimpl = std::make_unique<ContinuousBatchingAdapter>(model_path, tokenizer, scheduler_config, device, config_without_scheduler_config); + } else if ("NPU" == device) { + m_pimpl = std::make_unique<StaticLLMPipeline>(model_path, tokenizer, device, plugin_config); + } else { + m_pimpl = std::make_unique<StatefulLLMPipeline>(model_path, tokenizer, device, plugin_config); + } + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count(); +} + +ov::genai::LLMPipeline::LLMPipeline( + const std::string& path, + const std::string& device, + const ov::AnyMap& config +){ + auto start_time = std::chrono::steady_clock::now(); + if (config.find(ov::genai::scheduler_config.name()) != config.end()) { + auto config_without_scheduler_config = config; + config_without_scheduler_config.erase(ov::genai::scheduler_config.name()); + auto& scheduler_config = config.at(ov::genai::scheduler_config.name()).as<SchedulerConfig>(); + m_pimpl = 
std::make_unique<ContinuousBatchingAdapter>(path, scheduler_config, device, config_without_scheduler_config); + } else if ("NPU" == device) { + m_pimpl = std::make_unique<StaticLLMPipeline>(path, device, config); + } else { + m_pimpl = std::make_unique<StatefulLLMPipeline>(path, device, config); + } + auto stop_time = std::chrono::steady_clock::now(); + m_pimpl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count(); +} + +ov::genai::GenerationConfig ov::genai::LLMPipeline::get_generation_config() const { + return m_pimpl->m_generation_config; +} + +ov::genai::Tokenizer ov::genai::LLMPipeline::get_tokenizer() { + return m_pimpl->m_tokenizer; +} + +void ov::genai::LLMPipeline::start_chat(const std::string& system_message) { + m_pimpl->start_chat(system_message); +} + +void ov::genai::LLMPipeline::finish_chat() { + m_pimpl->finish_chat(); +} + +void ov::genai::LLMPipeline::set_generation_config(const GenerationConfig& config) { + int64_t default_eos_token_id = m_pimpl->m_generation_config.eos_token_id; + m_pimpl->m_generation_config = config; + // if eos_token_id was not provided in config forward from default config + if (config.eos_token_id == -1) + m_pimpl->m_generation_config.eos_token_id = default_eos_token_id; + + m_pimpl->m_generation_config.validate(); +} + +ov::genai::LLMPipeline::~LLMPipeline() = default; diff --git a/src/cpp/src/llm_pipeline_base.hpp b/src/cpp/src/llm_pipeline_base.hpp new file mode 100644 index 0000000000..b2ad581e0b --- /dev/null +++ b/src/cpp/src/llm_pipeline_base.hpp @@ -0,0 +1,45 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/llm_pipeline.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "openvino/genai/streamer_base.hpp" + +namespace ov { +namespace genai { + +class LLMPipelineImplBase { +public: + LLMPipelineImplBase(const Tokenizer& tokenizer, + const GenerationConfig& config = {}) + : m_tokenizer(tokenizer), m_generation_config(config) { + } + + virtual DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) = 0; + + virtual EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) = 0; + + virtual void start_chat(const std::string& system_message) = 0; + virtual void finish_chat() = 0; + + virtual ~LLMPipelineImplBase() = default; + + Tokenizer m_tokenizer; + GenerationConfig m_generation_config; + std::optional<AdapterController> m_adapter_controller; + + float m_load_time_ms = 0; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp new file mode 100644 index 0000000000..bc18d254dd --- /dev/null +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -0,0 +1,681 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "llm_pipeline_static.hpp" + +#include <fstream> + +#include "openvino/pass/stateful_to_stateless.hpp" +#include "openvino/runtime/core.hpp" +#include "openvino/opsets/opset13.hpp" +#include "openvino/core/preprocess/pre_post_process.hpp" + +#include <jinja2cpp/user_callable.h> + +#include "text_callback_streamer.hpp" +#include "utils.hpp" + +namespace { + +std::shared_ptr<ov::Model> cvt_kvcache_to_fp16(const std::shared_ptr<ov::Model>& model) { + ov::preprocess::PrePostProcessor ppp(model); + + for (auto tensor : model->inputs()) { + if 
(tensor.get_any_name().find("past_key") != std::string::npos) { + ppp.input(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16); + } + } + + for (auto tensor : model->outputs()) { + if (tensor.get_any_name().find("present") != std::string::npos) { + ppp.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16); + } + } + + return ppp.build(); +} + +void align_u4_zp_constants(const std::shared_ptr<ov::Model>& model) { + for (auto op : model->get_ops()) { + if (ov::op::util::is_constant(op)) { + auto cst_op = std::dynamic_pointer_cast<ov::op::v0::Constant>(op); + const auto cst_op_out = cst_op->output(0); + if (cst_op_out.get_element_type() == ov::element::u4 && ov::shape_size(cst_op_out.get_shape()) == 1u) { + ov::Tensor cst_tensor(ov::element::u4, cst_op_out.get_shape()); + *static_cast<uint8_t*>(cst_tensor.data()) = cst_op->get_vector<uint8_t>()[0] & 0x0f; + auto new_cst_op = std::make_shared<ov::op::v0::Constant>(cst_tensor); + for (auto target_input : cst_op_out.get_target_inputs()) { + target_input.replace_source_output(new_cst_op); + } + } + } + } +} + +bool allow_to_enable_npuw_dq(const std::shared_ptr<ov::Model>& model) { + std::vector<std::string> rt_info_path = {"nncf", "weight_compression", "group_size"}; + if (!model->has_rt_info(rt_info_path)) { + // NB: Model isn't compressed by NNCF - skip + return false; + } + auto group_size = model->get_rt_info<int>(rt_info_path); + if (group_size == -1) { + // NB: Enable DQ for CW quantized models + return true; + } + return false; +} + +std::optional<ov::Any> pop_option(ov::AnyMap& config, const std::string& option_name) { + if (auto it = config.find(option_name); it != config.end()) { + // Copy the value out before erase() invalidates the iterator. + auto option = it->second; + config.erase(it); + return std::make_optional(std::move(option)); + } + return std::nullopt; +} + +void enable_npuw_dq_if_allowed(ov::AnyMap& config, + const std::shared_ptr<ov::Model>& model) { + if (allow_to_enable_npuw_dq(model)) { + config["NPUW_DQ"] = "YES"; + pop_option(config, "NPUW_ONLINE_AVOID"); + } +} + +std::shared_ptr<ov::Model> redirect_new_kv_to_output(const std::shared_ptr<ov::Model>& model) { + const auto kStartOutputKVCacheLayers = 1u; + for (int i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) { + auto kvout = model->output(i); + auto kvrslt = kvout.get_node(); + auto kvcat = kvrslt->inputs()[0].get_source_output().get_node(); + auto kvval = kvcat->inputs()[1].get_source_output(); + kvval.set_names({kvout.get_any_name()}); + kvrslt->inputs()[0].replace_source_output(kvval); + } + model->validate_nodes_and_infer_types(); + return model; +} + +std::shared_ptr<ov::Model> add_slices_to_kvcache_inputs(const std::shared_ptr<ov::Model>& model) { + const auto kvcache_name_pattern = "past_key_values"; + std::vector<std::shared_ptr<ov::opset13::Parameter>> new_params; + for (auto param : model->get_parameters()) { + auto tensor_name = param->get_output_tensor(0).get_any_name(); + if (tensor_name.find(kvcache_name_pattern) == std::string::npos) { + new_params.push_back(param); + continue; + } + auto shape = param->get_output_shape(0); + shape[2] += 1; + + auto new_param = std::make_shared<ov::opset13::Parameter>(param->get_element_type(), shape); + new_param->set_friendly_name(tensor_name); + new_param->outputs().begin()->get_tensor().set_names(param->outputs().begin()->get_tensor().get_names()); + + auto slice_start = std::make_shared<ov::opset13::Constant>( + ov::element::Type_t::i32, ov::Shape{1}, std::vector<int32_t>{1} + ); + auto slice_stop = std::make_shared<ov::opset13::Constant>(
ov::element::Type_t::i32, ov::Shape{1}, std::vector<int32_t>{static_cast<int32_t>(shape[2])} + ); + auto slice_step = std::make_shared<ov::opset13::Constant>( + ov::element::Type_t::i32, ov::Shape{1}, std::vector<int32_t>{1} + ); + auto slice_axes = std::make_shared<ov::opset13::Constant>( + ov::element::Type_t::i32, ov::Shape{1}, std::vector<int32_t>{2} + ); + auto slice_node = std::make_shared<ov::opset13::Slice>( + new_param, slice_start->output(0), slice_stop->output(0), slice_step->output(0), slice_axes->output(0) + ); + slice_node->set_friendly_name(tensor_name + "_Slice"); + for (auto target_input : param->output(0).get_target_inputs()) { + target_input.replace_source_output(slice_node->output(0)); + } + new_params.push_back(new_param); + } + return std::make_shared<ov::Model>(model->get_results(), ov::SinkVector{}, new_params); +} + +struct KVAxesPosition { + uint32_t batch; + uint32_t seq_len; +}; + +KVAxesPosition get_kv_axes(const std::string& model_type) { + KVAxesPosition axes; + if (model_type == "chatglm") { + axes.batch = 1u; + axes.seq_len = 0u; + } else if (model_type == "qwen") { + // Note, qwen2 does not fall into this category and conforms to default layout + axes.batch = 0u; + axes.seq_len = 1u; + } else { + axes.batch = 0u; + axes.seq_len = 2u; + } + return axes; +} + +std::string get_model_type_from_json(const std::filesystem::path& filepath) { + std::ifstream file(filepath); + OPENVINO_ASSERT(file.is_open(), "Could not open file: " + filepath.string()); + nlohmann::json config_data = nlohmann::json::parse(file); + std::string model_type = config_data["model_type"].get<std::string>(); + return model_type; +} + +void reshape_to_static(std::shared_ptr<ov::Model> model, + const uint32_t input_size, + const uint32_t kvcache_size, + const KVAxesPosition& kv_axes_position) { + std::map<std::string, ov::PartialShape> new_shapes; + for (auto input : model->inputs()) { + const auto& input_name = input.get_any_name(); + ov::PartialShape new_shape; + if (input_name.find("input_ids") != std::string::npos) { + new_shape = ov::PartialShape({1, input_size}); + } else if (input_name.find("attention_mask") != std::string::npos) { + new_shape = ov::PartialShape({1, kvcache_size}); + } else if (input_name.find("position_ids") != std::string::npos) { + new_shape = ov::PartialShape({1, input_size}); + } else { + const auto& partial_shape = input.get_partial_shape(); + new_shape = partial_shape; + new_shape[kv_axes_position.batch] = 1; + new_shape[kv_axes_position.seq_len] = kvcache_size - input_size; + } + new_shapes.emplace(input_name, new_shape); + } + model->reshape(new_shapes); +} + +void fill_tensor(ov::Tensor tensor, int64_t fill_val, size_t offset = 0u) { + int64_t* tensor_data = tensor.data<int64_t>(); + std::fill(tensor_data + offset, tensor_data + tensor.get_size(), fill_val); +} + +void copy_with_offset(const ov::Tensor& orig, const std::size_t offset, ov::Tensor& padded) { + int64_t* orig_data = orig.data<int64_t>(); + int64_t* padded_data = padded.data<int64_t>(); + std::copy(orig_data, orig_data + orig.get_size(), padded_data + offset); +} + +void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) { + for (const auto& [key, value] : rhs) { + // NB: Overwrite the value if key already exists + if (auto it = lhs.find(key); it != lhs.end()) { + it->second = value; + } else { + lhs.emplace(key, value); + } + } +} + +ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model) { + ov::AnyMap config = { + { "NPU_USE_NPUW", "YES" }, + { "NPUW_FOLD", "YES" 
}, + { "NPUW_DCOFF_TYPE", "f16" }, + { "NPUW_DCOFF_SCALE", "YES" }, + { "NPUW_WEIGHTS_BANK", "shared" }, + { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add" }, + { "NPUW_ONLINE_AVOID", "P:RMSNorm/NPU" } + }; + enable_npuw_dq_if_allowed(config, model); + return config; +} + +ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model) { + ov::AnyMap config = { + { "NPU_USE_NPUW", "YES" }, + { "NPUW_FOLD", "YES" }, + { "NPUW_DCOFF_TYPE", "f16" }, + { "NPUW_DCOFF_SCALE", "YES" }, + { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add" }, + { "NPUW_PARALLEL_COMPILE", "YES" }, + { "NPUW_FUNCALL_ASYNC", "YES" }, + { "NPUW_WEIGHTS_BANK", "shared" }, + { "NPUW_ONLINE_AVOID", "P:RMSNorm/NPU" } + }; + enable_npuw_dq_if_allowed(config, model); + return config; +} + +template <typename T> +T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) { + auto anyopt = pop_option(config, key); + if (anyopt.has_value()) { + return anyopt.value().as<T>(); + } + return default_value; +} + +ov::Tensor make_tensor_slice(ov::Tensor tensor, size_t dim, size_t start_pos, size_t end_pos) { + ov::Shape start_shape(std::vector<size_t>(tensor.get_shape().size(), 0u)); + start_shape[dim] = start_pos; + ov::Shape end_shape = tensor.get_shape(); + end_shape[dim] = end_pos; + return ov::Tensor(tensor, start_shape, end_shape); +} + +void drop_cache_dir(ov::AnyMap& config) { + if (config.count("NPU_USE_NPUW") != 0u) { + pop_option(config, "CACHE_DIR"); + } +} + +} // anonymous namespace + +namespace ov { +namespace genai { + +StaticLLMPipeline::StaticLLMPipeline( + const std::filesystem::path& path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& config +) : LLMPipelineImplBase(tokenizer, + utils::from_config_json_if_exists(path)) { + auto pipeline_config = config; + /* NB: Static LLM pipeline consists of two models, + first to process the input prompt (prefill), + second to use in generation loop (kvcache) + + There are two ways of how these models can be created + and user chooses one or another via configuration option + "USE_BLOBS": + 1. When both models are created from the provided .xml one, + that is "USE_BLOBS=NO" default way. + 2. When both models are directly imported from provided prefill + and generation precompiled blobs, that is "USE_BLOBS=YES" way. 
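+
+       Illustrative example (not part of the original patch; only the "USE_BLOBS" option name
+       is taken from the code below, the rest is hypothetical caller code):
+
+           ov::AnyMap cfg;
+           cfg["USE_BLOBS"] = true;   // selects the "USE_BLOBS=YES" path handled below
+           ov::genai::LLMPipeline pipe(model_dir, "NPU", cfg);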
+ */ + const auto use_blobs = pop_or_default(pipeline_config, "USE_BLOBS", false); + if (!use_blobs) { + setupAndCompileModels(path, device, pipeline_config); + } else { + setupAndImportModels(path, device, pipeline_config); + } + // Initialize tensors + prepare_for_new_conversation(); +}; + +StaticLLMPipeline::StaticLLMPipeline( + const std::filesystem::path& path, + const std::string& device, + const ov::AnyMap& config +) : StaticLLMPipeline(path, path.string(), device, config) { +} + +void StaticLLMPipeline::setupAndCompileModels( + const std::filesystem::path& path, + const std::string& device, + ov::AnyMap& pipeline_config) { + /* Initialization assumes multiple steps if user passes "USE_BLOBS=NO": + 1) Read the template model - this will be kvcache model + 2) Expose KV-cache input and output layers from kvcache model + 3) Align u4 ZP constants - TODO: get rid of this step in future + 4) Replace KV-cache tensors for the entire cache to tensors only for new token (before concat) + 5) Clone the model - this will be prefill + 6) Reshape both models to static shape + 7) Compile both models + */ + + ov::Core core; + + // (1) Read the template model - this will be kvcache model + m_kvcache_model = core.read_model(path / "openvino_model.xml"); + // (2) Expose KV-cache input and output layers from kvcache model + ov::pass::StatefulToStateless().run_on_model(m_kvcache_model); + // (3) Align u4 ZP constants + align_u4_zp_constants(m_kvcache_model); + // (4) Replace KV-tensors for the entire cache to tensors only for new token + m_kvcache_model = redirect_new_kv_to_output(m_kvcache_model); + // (5) Convert kvcache tensors to fp16 precision + m_kvcache_model = cvt_kvcache_to_fp16(m_kvcache_model); + // (6) Clone the model - this will be prefill + m_prefill_model = m_kvcache_model->clone(); + m_prefill_model->set_friendly_name(m_kvcache_model->get_friendly_name() + "_prefill"); + // (7) Reshape both models to static shape + const auto kMaxPromptLen = pop_or_default(pipeline_config, "MAX_PROMPT_LEN", 1024u); + const auto kMinResponseLen = pop_or_default(pipeline_config, "MIN_RESPONSE_LEN", 150u); + KVAxesPosition axes = get_kv_axes(get_model_type_from_json(path / "config.json")); + m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len }; + reshape_to_static(m_prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); + reshape_to_static(m_kvcache_model, 1u, m_kvcache_desc.total_size, axes); + // (8) Compile both model + auto prefill_config = pop_or_default( + pipeline_config, "PREFILL_CONFIG", get_default_prefill_config(m_prefill_model) + ); + auto generate_config = pop_or_default( + pipeline_config, "GENERATE_CONFIG", get_default_generate_config(m_kvcache_model) + ); + merge_config_with(prefill_config, pipeline_config); + merge_config_with(generate_config, pipeline_config); + // FIXME: Drop CACHE_DIR option if NPUW is enabled + drop_cache_dir(prefill_config); + drop_cache_dir(generate_config); + + m_prefill_request = core.compile_model( + m_prefill_model, device, prefill_config + ).create_infer_request(); + m_kvcache_request = core.compile_model( + m_kvcache_model, device, generate_config + ).create_infer_request(); +} + +void StaticLLMPipeline::setupAndImportModels( + const std::filesystem::path& path, + const std::string& device, + ov::AnyMap& pipeline_config) { + /* To initialize pipeline in case when user passes "USE_BLOBS=YES", + next steps are required: + 1) Check that neither MAX_PROMPT_LEN nor MIN_RESPONSE_LEN is + 
exposed in the config. These parameters will be retrieved + from blobs + 2) Import prefill model from model directory or specified path + 3) Import generate model from model directory or specified path + 4) Fill in m_kvcache_desc + */ + ov::Core core; + + auto import_blob = [this, + &path, + &pipeline_config, + &core, + &device](const std::string& model_name, + ov::AnyMap& model_config) { + auto blob_path = pop_or_default(model_config, "BLOB_PATH", std::string{}); + + if (blob_path.empty()) { + blob_path = (path / + (std::string("openvino_") + model_name + ".blob")).string(); + } + + if (!std::filesystem::exists(blob_path)) { + OPENVINO_THROW("Blob for " + model_name + " model is not found at: " + + blob_path); + } + + merge_config_with(model_config, pipeline_config); + + std::fstream fs(blob_path, std::ios::in | std::ios::binary); + + return core.import_model( + fs, device, model_config); + + }; + + auto get_kvcache_size = [](ov::CompiledModel& model) { + for (auto input : model.inputs()) { + const auto& input_name = input.get_any_name(); + if (input_name.find("attention_mask") != std::string::npos) { + return static_cast<uint32_t>(input.get_shape()[1]); + } + } + OPENVINO_THROW("No attention_mask input is found! Such model isn't supported."); + }; + + // (1) Check that neither MAX_PROMPT_LEN nor MIN_RESPONSE_LEN is + // exposed in the config + if (pipeline_config.count("MAX_PROMPT_LEN") || + pipeline_config.count("MIN_RESPONSE_LEN")) { + OPENVINO_THROW("Neither \"MAX_PROMPT_LEN\" nor \"MIN_RESPONSE_LEN\"" + " can be specified in \"USE_BLOBS=YES\" configuration!"); + } + // (2) Import prefill model from model directory or specified path + auto prefill_config = pop_or_default(pipeline_config, "PREFILL_CONFIG", ov::AnyMap()); + auto prefill_model = import_blob("prefill", prefill_config); + m_prefill_request = prefill_model.create_infer_request(); + // (3) Import generate model from model directory or specified path + auto generate_config = pop_or_default(pipeline_config, "GENERATE_CONFIG", ov::AnyMap()); + auto generate_model = import_blob("generate", generate_config); + m_kvcache_request = generate_model.create_infer_request(); + // (4) Fill in m_kvcache_desc + const uint32_t kMaxPromptLen = get_kvcache_size(prefill_model); + const uint32_t kMinResponseLen = get_kvcache_size(generate_model) - kMaxPromptLen; + // FIXME For some models KV-cache dim != 2u + m_kvcache_desc = KVCacheDesc { kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, 2u }; +} + +void StaticLLMPipeline::start_chat(const std::string& system_message) { + if (!system_message.empty()) { + m_history.push_back({{"role", "system"}, {"content", system_message}}); + } + m_is_chat_conversation = true; +}; + +void StaticLLMPipeline::finish_chat() { + m_is_chat_conversation = false; + m_history.clear(); +}; + +void StaticLLMPipeline::prepare_for_new_conversation() { + fill_tensor(m_prefill_request.get_tensor("input_ids"), m_tokenizer.get_pad_token_id()); + fill_tensor(m_prefill_request.get_tensor("position_ids"), 0u); + fill_tensor(m_prefill_request.get_tensor("attention_mask"), 0u); + fill_tensor(m_kvcache_request.get_tensor("attention_mask"), 0u); + m_kvcache_desc.num_stored_tokens = 0u; +} + +DecodedResults StaticLLMPipeline::generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer +) { + auto start_time = std::chrono::steady_clock::now(); + + GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; + std::string prompt; + if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) { + if (input_vector->size() > 1u) { + OPENVINO_THROW("Currently only batch size=1 is supported"); + } + OPENVINO_ASSERT(!input_vector->empty()); + prompt = std::move(input_vector->front()); + } else { + OPENVINO_ASSERT(std::holds_alternative<std::string>(inputs)); + prompt = std::get<std::string>(inputs); + } + + ov::genai::TokenizedInputs tokenized_input; + if (m_is_chat_conversation) { + m_history.push_back({{"role", "user"}, {"content", prompt}}); + constexpr bool add_generation_prompt = true; + prompt = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + // for chat ov::genai::add_special_tokens(false) is aligned with stateful pipeline and HF + tokenized_input = m_tokenizer.encode(prompt, ov::genai::add_special_tokens(false)); + } else { + tokenized_input = m_tokenizer.encode(prompt); + } + + auto encode_stop_time = std::chrono::steady_clock::now(); + auto encoded_results = generate(tokenized_input, config, streamer); + + auto decode_start_time = std::chrono::steady_clock::now(); + DecodedResults decoded_results = {m_tokenizer.decode(encoded_results.tokens), encoded_results.scores}; + auto decode_stop_time = std::chrono::steady_clock::now(); + + if (m_is_chat_conversation) { + auto answer = decoded_results.texts[0]; + m_history.push_back({{"role", "assistant"}, {"content", answer}}); + } + // generate_durations + decoded_results.perf_metrics = encoded_results.perf_metrics; + auto& raw_counters = decoded_results.perf_metrics.raw_metrics; + auto stop_time = std::chrono::steady_clock::now(); + raw_counters.generate_durations = std::vector<MicroSeconds>(); + raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + raw_counters.tokenization_durations.emplace_back(PerfMetrics::get_microsec(encode_stop_time - start_time)); + raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_stop_time - decode_start_time)); + decoded_results.perf_metrics.m_evaluated = false; + decoded_results.perf_metrics.evaluate_statistics(start_time); + return decoded_results; +} + +EncodedResults StaticLLMPipeline::generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer +) { + auto start_time = std::chrono::steady_clock::now(); + ov::Tensor input_ids; + ov::Tensor attention_mask; + + if (auto data = std::get_if<ov::Tensor>(&inputs)) { + input_ids = *data; + attention_mask = ov::genai::utils::init_attention_mask(input_ids); + } else if (auto data = std::get_if<TokenizedInputs>(&inputs)) { + input_ids = data->input_ids; + attention_mask = data->attention_mask; + } + + if (input_ids.get_shape().at(0) > 1u) { + OPENVINO_THROW("Currently only batch size=1 is supported"); + } + + GenerationConfig config = (generation_config.has_value()) ? 
*generation_config : m_generation_config; + // If eos_token_id was not provided, take value from default m_generation_config + if (config.eos_token_id == -1) + config.set_eos_token_id(m_generation_config.eos_token_id); + config.validate(); + + std::shared_ptr<StreamerBase> streamer_ptr; + if (auto streamer_obj = std::get_if<std::monostate>(&streamer)) { + streamer_ptr = nullptr; + } else if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&streamer)) { + streamer_ptr = *streamer_obj; + } else if (auto callback = std::get_if<std::function<bool(std::string)>>(&streamer)) { + streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback); + } + + if (!config.is_greedy_decoding()) { + OPENVINO_THROW("Currently only greedy decoding is supported"); + } + + ov::Shape prompts_shape = input_ids.get_shape(); + const size_t batch_size = prompts_shape[0]; + ov::genai::EncodedResults results; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; + // NB: Only batch=1 is supported now + results.scores.resize(1u); + results.scores[0] = 0u; + results.tokens.resize(1u); + + // NB: Check if there is enough space in KV-cache to process input prompt + auto prompt_len = input_ids.get_size(); + if (prompt_len > m_kvcache_desc.max_prompt_size) { + OPENVINO_THROW("Static LLM pipeline may only process prompts up to " + + std::to_string(m_kvcache_desc.max_prompt_size) + " tokens. " + + "Set the \"MAX_PROMPT_LEN\" config option to increase the limit."); + } + + // NB: From the "generate" perspective, every call is treated as start of new conversation, + // but if continuation is needed, prompt contains information about the entire conversation. + prepare_for_new_conversation(); + + auto padded_input_ids = m_prefill_request.get_tensor("input_ids"); + const size_t offset = padded_input_ids.get_size() - input_ids.get_size(); + copy_with_offset(input_ids, offset, padded_input_ids); + + auto padded_attention_mask = m_prefill_request.get_tensor("attention_mask"); + fill_tensor(padded_attention_mask, 1u, offset); + + auto padded_position_ids = m_prefill_request.get_tensor("position_ids"); + auto* padded_pos_data = padded_position_ids.data<int64_t>(); + std::iota(padded_pos_data + offset, padded_pos_data + padded_position_ids.get_size(), 0u); + + m_prefill_request.infer(); + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); + + // NB: Now there are prompt_len tokens in KV-cache + m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(prompt_len); + int64_t last_token = utils::argmax(m_prefill_request.get_tensor("logits"), 0); + results.tokens[0].push_back(last_token); + if (streamer_ptr && streamer_ptr->put(last_token)) { + return results; + } + + // Inputs: input_ids, attention_mask, position_ids, ... + // Outputs: logits, ... 
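+    // NB: The offsets below follow the port order noted above: the first three inputs
+    // (input_ids, attention_mask, position_ids) and the first output (logits) are skipped,
+    // and the remaining ports are assumed to be the per-layer KV-cache tensors that are
+    // copied pairwise from the prefill request to the kvcache request.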
+ const auto kStartInputKVCacheLayers = 3u; + const auto kStartOutputKVCacheLayers = 1u; + + // NB: Copy KV-cache tensors from prefill model to kvcache model + const auto& kvcache_compiled = m_kvcache_request.get_compiled_model(); + for (int i = 0; i < kvcache_compiled.outputs().size() - 1; ++i) { + + const auto& output_name = kvcache_compiled.outputs()[kStartOutputKVCacheLayers + i].get_any_name(); + auto prefill_out_tensor = m_prefill_request.get_tensor(output_name); + auto prefill_out_slice = make_tensor_slice( + prefill_out_tensor, m_kvcache_desc.dim, m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens, m_kvcache_desc.max_prompt_size + ); + + const auto& input_name = kvcache_compiled.inputs()[kStartInputKVCacheLayers + i].get_any_name(); + auto kvcache_in_tensor = m_kvcache_request.get_tensor(input_name); + auto kvcache_in_slice = make_tensor_slice( + kvcache_in_tensor, m_kvcache_desc.dim, 0u, m_kvcache_desc.num_stored_tokens + ); + + prefill_out_slice.copy_to(kvcache_in_slice); + } + + auto* input_ids_data = m_kvcache_request.get_tensor("input_ids").data<int64_t>(); + auto* position_ids_data = m_kvcache_request.get_tensor("position_ids").data<int64_t>(); + auto* attention_mask_data = m_kvcache_request.get_tensor("attention_mask").data<int64_t>(); + + // NB: Fill attention mask in the correct format [1, 1 ... 1, 0, 0 ... 0, 1] + std::fill(attention_mask_data, attention_mask_data + m_kvcache_desc.num_stored_tokens - 1u, 1u); + attention_mask_data[m_kvcache_desc.total_size - 1] = 1u; + + const size_t max_tokens = config.get_max_new_tokens(prompt_len); + for (int i = 0; i < max_tokens - 1; ++i) { + input_ids_data[0] = last_token; + position_ids_data[0] = m_kvcache_desc.num_stored_tokens; + attention_mask_data[m_kvcache_desc.num_stored_tokens - 1] = 1u; + + m_kvcache_request.infer(); + m_kvcache_desc.num_stored_tokens += 1; + + last_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0); + results.tokens[0].push_back(last_token); + + raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now()); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); + if (streamer_ptr && streamer_ptr->put(last_token)) { + break; + } + + if (last_token == config.eos_token_id && !config.ignore_eos) { + break; + } + + // NB: KV-cache is full, further generation is impossible + if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) { + break; + } + + // NB: Write KV-cache for the new token to the correct input position for the next iteration + for (int i = 0; i < kvcache_compiled.outputs().size() - 1; ++i) { + const auto& input_name = kvcache_compiled.inputs()[kStartInputKVCacheLayers + i].get_any_name(); + auto kvcache_in_tensor = m_kvcache_request.get_tensor(input_name); + auto kvcache_in_slice = make_tensor_slice( + kvcache_in_tensor, m_kvcache_desc.dim, m_kvcache_desc.num_stored_tokens - 1, m_kvcache_desc.num_stored_tokens + ); + const auto& output_name = kvcache_compiled.outputs()[kStartOutputKVCacheLayers + i].get_any_name(); + m_kvcache_request.get_tensor(output_name).copy_to(kvcache_in_slice); + } + } + auto stop_time = std::chrono::steady_clock::now(); + // If is called without tokenization then that stat will not be reported. 
+ auto& metrics = results.perf_metrics; + metrics.num_input_tokens = batch_size * input_ids.get_shape().at(1); + metrics.load_time = this->m_load_time_ms; + metrics.raw_metrics.generate_durations.emplace_back(PerfMetrics::get_microsec(stop_time - start_time)); + metrics.evaluate_statistics(start_time); + return results; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/llm_pipeline_static.hpp b/src/cpp/src/llm_pipeline_static.hpp new file mode 100644 index 0000000000..55b75ae3b3 --- /dev/null +++ b/src/cpp/src/llm_pipeline_static.hpp @@ -0,0 +1,76 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <filesystem> + +#include "llm_pipeline_base.hpp" + +namespace ov { +namespace genai { + +class StaticLLMPipeline final : public LLMPipelineImplBase { +public: + StaticLLMPipeline( + const std::filesystem::path& path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& config + ); + + StaticLLMPipeline( + const std::filesystem::path& path, + const std::string& device, + const ov::AnyMap& config + ); + + void setupAndCompileModels( + const std::filesystem::path& path, + const std::string& device, + ov::AnyMap& pipeline_config); + + void setupAndImportModels( + const std::filesystem::path& path, + const std::string& device, + ov::AnyMap& pipeline_config); + + DecodedResults generate( + StringInputs inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override; + + EncodedResults generate( + const EncodedInputs& inputs, + OptionalGenerationConfig generation_config, + StreamerVariant streamer + ) override; + + void start_chat(const std::string& system_message) override; + void finish_chat() override; +private: + void prepare_for_new_conversation(); + +private: + struct KVCacheDesc { + uint32_t max_prompt_size; + uint32_t total_size; + uint32_t num_stored_tokens; + uint32_t dim; + }; + + // FIXME: Ideally, we don't need to keep those + std::shared_ptr<ov::Model> m_kvcache_model; + std::shared_ptr<ov::Model> m_prefill_model; + + KVCacheDesc m_kvcache_desc; + ov::InferRequest m_kvcache_request; + ov::InferRequest m_prefill_request; + + bool m_is_chat_conversation = false; + ChatHistory m_history; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/logit_processor.hpp b/src/cpp/src/logit_processor.hpp new file mode 100644 index 0000000000..2e904cb023 --- /dev/null +++ b/src/cpp/src/logit_processor.hpp @@ -0,0 +1,390 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <algorithm> +#include <cmath> + +#include "openvino/genai/generation_config.hpp" + +struct Token { + float m_log_prob = 0.; + int64_t m_index = 0; + + Token(float log_prob, int64_t index) : m_log_prob(log_prob), m_index(index) {} + Token() = default; +}; + +struct Logits { + float * m_data = nullptr; + size_t m_size; + // Late initialized for top_p or top_k transforms + std::vector<Token> m_vector; + + Logits(float* data, size_t size): m_data(data), m_size(size) {} + + + void initialize_vector() { + OPENVINO_ASSERT(m_vector.size() == 0, "Logits vector already initialized"); + m_vector.reserve(m_size); + for (size_t i = 0; i < m_size; i++) + m_vector.emplace_back(m_data[i], i); + } + + bool is_vector_initialized() const { + return m_vector.size() > 0; + } + + void resize(size_t new_size) { + m_size = new_size; + m_vector.resize(new_size); + } +}; + +namespace LogitTransformers { +using TokenIds = 
std::vector<int64_t>;
+
+class ILogitTransformer {
+public:
+    virtual void apply(Logits& logits) = 0;
+
+    virtual bool is_applicable(size_t generated_tokens_cnt = 0) {
+        return true;
+    }
+};
+
+class TopPFilter : public ILogitTransformer {
+public:
+    TopPFilter(double top_p) : m_top_p(top_p) {}
+
+    bool partial_sort_and_resize(Logits& logits) {
+        // Since most of the time a huge part of the logits vector contains minimal values,
+        // expensive sorting of the entire vector might be unnecessary, especially for low values of top_p.
+        // This method partially sorts the vector, finding the top M elements, and stops when the top_p condition is met.
+        // It iterates a few times, starting with M = 16 and multiplying it by 2 each iteration until M = 1024.
+        // If top_p is reached within the considered scope, it resizes the logits vector and returns true. Otherwise it returns false.
+        // Note that it can be less performant than the standard approach if logit values are more evenly distributed across the vector.
+        for (size_t step = 16; step <= 1024; step *= 2) {
+            if (logits.m_vector.size() <= step)
+                break;
+            std::partial_sort(logits.m_vector.begin(), logits.m_vector.begin() + step, logits.m_vector.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
+            float sum = 0.0;
+            for (int i = 0; i < step; i++) {
+                sum += logits.m_vector[i].m_log_prob;
+                if (sum > m_top_p) {
+                    logits.resize(i+1);
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    void full_sort_and_resize(Logits& logits) {
+        std::sort(logits.m_vector.begin(), logits.m_vector.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; });
+        float probability_sum = 0.0f;
+        size_t nucleus_size = 0;
+        for (const auto& logit : logits.m_vector) {
+            probability_sum += logit.m_log_prob;
+            nucleus_size += 1;
+            if (probability_sum > m_top_p) break;
+        }
+        logits.resize(nucleus_size);
+    }
+
+    void apply(Logits& logits) override {
+        // Initialize and sort the vector. Try partial sorting first and, if it's not enough, sort the entire vector.
+ logits.initialize_vector(); + if(!partial_sort_and_resize(logits)) + full_sort_and_resize(logits); + } + +protected: + double m_top_p = 0.f; +}; + +class TopKFilter : public ILogitTransformer { +public: + TopKFilter(size_t top_k) : m_top_k(top_k) {} + + // If this transform is used along with top_p, it should be applied after it since top_p sorts entire vector and top_k does it only partially + void apply(Logits& logits) override { + + if (m_top_k >= logits.m_size) + return; + + // If top_p is also used vector is already initialized and sorted + if (!logits.is_vector_initialized()) { + // Initialize and partially sort vector + logits.initialize_vector(); + std::partial_sort(logits.m_vector.begin(), logits.m_vector.begin() + m_top_k, logits.m_vector.end(), [](const Token& lhs, const Token& rhs) {return lhs.m_log_prob > rhs.m_log_prob; }); + } + logits.resize(m_top_k); + } + +protected: + size_t m_top_k = 0; +}; + +class TemperatureLogitTransform : public ILogitTransformer { +public: + TemperatureLogitTransform(double temperature) : m_temperature(temperature) {}; + + void apply(Logits& logits) override { + float max_logit = -std::numeric_limits<float>::infinity(); + for (size_t i = 0; i < logits.m_size; i++) { + if (logits.m_data[i] > max_logit) { + max_logit = logits.m_data[i]; + } + } + + float norm_sum = 0.0; + for (size_t i = 0; i < logits.m_size; i++) { + logits.m_data[i] = expf((logits.m_data[i] - max_logit) / this->m_temperature); + norm_sum += logits.m_data[i]; + } + + for (size_t i = 0; i < logits.m_size; i++) { + logits.m_data[i] /= norm_sum; + } + } + +protected: + float m_temperature = 0.f; +}; + + +class IPenaltyTransformer : public ILogitTransformer { +public: + void set_unique_generated_token_ids(const std::shared_ptr<std::map<int64_t, size_t>>& unique_generated_token_ids) { + if (unique_generated_token_ids != nullptr) { + m_unique_generated_token_ids = unique_generated_token_ids; + } else { + m_unique_generated_token_ids = std::shared_ptr<std::map<int64_t, size_t>>(new std::map<int64_t, size_t>); + } + } + + void extract_generated_tokens(const TokenIds& input_ids) { + set_unique_generated_token_ids(m_unique_generated_token_ids); + + for (const auto& input_id : input_ids) { + if (m_unique_generated_token_ids->count(input_id)) { + m_unique_generated_token_ids->at(input_id)++; + } else { + m_unique_generated_token_ids->insert({input_id, 1}); + } + } + } + +protected: + std::shared_ptr<std::map<int64_t, size_t>> m_unique_generated_token_ids = nullptr; + double m_penalty = 0.f; +}; + +class RepetitionPenaltyTransform : public IPenaltyTransformer { +public: + RepetitionPenaltyTransform(double repetition_penalty) { + m_penalty = repetition_penalty; + }; + + void apply(Logits& logits) override { + size_t vocab_size = logits.m_size; + for (const auto& prompt_id : *m_unique_prompt_token_ids) { + OPENVINO_ASSERT((prompt_id >= 0) && (prompt_id < vocab_size), "input_ids token out of bounds"); + if (logits.m_data[prompt_id] >= 0) { + logits.m_data[prompt_id] /= m_penalty; + } else { + logits.m_data[prompt_id] *= m_penalty; + }; + } + for (const auto& input_id_pair : *m_unique_generated_token_ids) { + const auto& input_id = input_id_pair.first; + OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds"); + if (logits.m_data[input_id] >= 0) { + logits.m_data[input_id] /= m_penalty; + } else { + logits.m_data[input_id] *= m_penalty; + }; + } + } + + void apply(Logits& logits, const TokenIds& input_ids) { + set_unique_prompt_token_ids(nullptr); + 
extract_generated_tokens(input_ids); + apply(logits); + } + + void set_unique_prompt_token_ids(const std::shared_ptr<std::set<int64_t>>& unique_prompt_token_ids) { + if (unique_prompt_token_ids != nullptr) { + m_unique_prompt_token_ids = unique_prompt_token_ids; + } else { + m_unique_prompt_token_ids = std::shared_ptr<std::set<int64_t>>(new std::set<int64_t>); + } + } + +protected: + std::shared_ptr<std::set<int64_t>> m_unique_prompt_token_ids = nullptr; +}; + +class EOSPenaltyTransform : public ILogitTransformer { +public: + EOSPenaltyTransform(const std::set<int64_t>& stop_token_ids, size_t min_generated_tokens) : + m_stop_token_ids(stop_token_ids), m_applicable_tensor_len(min_generated_tokens) {} + + void apply(Logits& logits) override { + // Since EOS penalty is applied early, the token vector is not initialized yet + // and we can assume element order match token ids. + for (auto stop_token_id: m_stop_token_ids) + logits.m_data[stop_token_id] = 0.f; + } + + + bool is_applicable(size_t generated_tokens_cnt = 0) override { + return generated_tokens_cnt < m_applicable_tensor_len; + } + +protected: + size_t m_applicable_tensor_len = std::numeric_limits<size_t>::max(); + std::set<int64_t> m_stop_token_ids; +}; + +class FrequencyPenaltyTransform : public IPenaltyTransformer { +public: + FrequencyPenaltyTransform(double value) { + m_penalty = value; + }; + + void apply(Logits& logits) override { + size_t vocab_size = logits.m_size; + for (const auto& input_id_pair : *m_unique_generated_token_ids) { + const auto& input_id = input_id_pair.first; + OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds"); + if (logits.m_data[input_id] >= 0) { + logits.m_data[input_id] -= m_penalty * input_id_pair.second; + } else { + logits.m_data[input_id] += m_penalty * input_id_pair.second; + }; + } + } + + void apply(Logits& logits, const TokenIds& input_ids) { + extract_generated_tokens(input_ids); + apply(logits); + } +}; + +class PresencePenaltyTransform : public IPenaltyTransformer { +public: + PresencePenaltyTransform(double value) { + m_penalty = value; + }; + + void apply(Logits& logits) override { + size_t vocab_size = logits.m_size; + for (const auto& input_id_pair : *m_unique_generated_token_ids) { + const auto& input_id = input_id_pair.first; + OPENVINO_ASSERT((input_id >= 0) && (input_id < vocab_size), "input_ids token out of bounds"); + if (logits.m_data[input_id] >= 0) { + logits.m_data[input_id] -= m_penalty; + } else { + logits.m_data[input_id] += m_penalty; + }; + } + } + + void apply(Logits& logits, const TokenIds& input_ids) { + extract_generated_tokens(input_ids); + apply(logits); + } +}; + + +} // namespace LogitTransformers + +class LogitProcessor { +protected: + std::vector<std::shared_ptr<LogitTransformers::ILogitTransformer>> m_logit_transformers; + + std::shared_ptr<std::map<int64_t, size_t>> m_unique_generated_token_ids = std::shared_ptr<std::map<int64_t, size_t>>(new std::map<int64_t, size_t>); + std::shared_ptr<std::set<int64_t>> m_unique_prompt_token_ids = std::shared_ptr<std::set<int64_t>>(new std::set<int64_t>); + size_t m_generated_tokens = 0; + +public: + LogitProcessor(const ov::genai::GenerationConfig& sampling_params, + const LogitTransformers::TokenIds& input_ids) { + for (const auto& input_id : input_ids) { + m_unique_prompt_token_ids->insert(input_id); + } + + if (sampling_params.min_new_tokens > 0) { + m_logit_transformers.emplace_back( + new LogitTransformers::EOSPenaltyTransform(sampling_params.stop_token_ids, 
sampling_params.min_new_tokens) + ); + } + + if (sampling_params.is_multinomial() || sampling_params.is_greedy_decoding()) { + if (sampling_params.repetition_penalty != 1.0f) { + std::shared_ptr<LogitTransformers::RepetitionPenaltyTransform> transformer = + std::shared_ptr<LogitTransformers::RepetitionPenaltyTransform>(new LogitTransformers::RepetitionPenaltyTransform(sampling_params.repetition_penalty)); + transformer->set_unique_prompt_token_ids(m_unique_prompt_token_ids); + transformer->set_unique_generated_token_ids(m_unique_generated_token_ids); + m_logit_transformers.push_back(transformer); + } + if (sampling_params.presence_penalty != 0.0f) { + std::shared_ptr<LogitTransformers::PresencePenaltyTransform> transformer = + std::shared_ptr<LogitTransformers::PresencePenaltyTransform>(new LogitTransformers::PresencePenaltyTransform(sampling_params.presence_penalty)); + transformer->set_unique_generated_token_ids(m_unique_generated_token_ids); + m_logit_transformers.push_back(transformer); + + } + if (sampling_params.frequency_penalty != 0.0f) { + std::shared_ptr<LogitTransformers::FrequencyPenaltyTransform> transformer = + std::shared_ptr<LogitTransformers::FrequencyPenaltyTransform>(new LogitTransformers::FrequencyPenaltyTransform(sampling_params.frequency_penalty)); + transformer->set_unique_generated_token_ids(m_unique_generated_token_ids); + m_logit_transformers.push_back(transformer); + } + + if (sampling_params.is_multinomial()) { + m_logit_transformers.emplace_back(new LogitTransformers::TemperatureLogitTransform(sampling_params.temperature)); + if (sampling_params.top_p != 1.0f) { + m_logit_transformers.emplace_back(new LogitTransformers::TopPFilter(sampling_params.top_p)); + } + if (sampling_params.top_k > 0 && sampling_params.top_k < std::numeric_limits<size_t>::max()) { + m_logit_transformers.emplace_back(new LogitTransformers::TopKFilter(sampling_params.top_k)); + } + } + } + } + + void apply(Logits& logits) { + for (const auto& transformer : m_logit_transformers) { + if (transformer->is_applicable(m_generated_tokens)) { + transformer->apply(logits); + } + } + } + + void update_generated_len(size_t updated_len) { + m_generated_tokens = updated_len; + } + + size_t get_generated_len() { + return m_generated_tokens; + } + + void register_new_generated_token(int64_t new_token_id) { + auto it = m_unique_generated_token_ids->find(new_token_id); + if (it == m_unique_generated_token_ids->end()) { + m_unique_generated_token_ids->insert({new_token_id, 1}); + } else { + it->second++; + } + } + + void decrease_generated_token_occurance(int64_t token_id) { + OPENVINO_ASSERT(m_unique_generated_token_ids->count(token_id) > 0); + m_unique_generated_token_ids->at(token_id)--; + } + +}; diff --git a/src/cpp/src/lora_adapter.cpp b/src/cpp/src/lora_adapter.cpp new file mode 100644 index 0000000000..e66800e1c9 --- /dev/null +++ b/src/cpp/src/lora_adapter.cpp @@ -0,0 +1,1295 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <algorithm> +#include <set> +#include <map> +#include <string> +#include <vector> +#include <fstream> +#include <regex> +#include <optional> + +#include "openvino/op/add.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/convert.hpp" +#include "openvino/op/convolution.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/concat.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/broadcast.hpp" +#include "openvino/op/read_value.hpp" +#include 
"openvino/op/assign.hpp" +#include "openvino/op/transpose.hpp" +#include "openvino/op/util/variable.hpp" +#include "openvino/pass/pattern/matcher.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/graph_rewrite.hpp" +#include "openvino/pass/manager.hpp" +#include "openvino/runtime/core.hpp" + +#include "openvino/genai/lora_adapter.hpp" + +extern "C" { + #include "safetensors.h" +} + +// If set to 1, the empty tensors will be used to switch LoRA adapter off. +// FIXME: Fix the plugins and set to 1 permanently. +#define EMPTY_TENSORS_SUPPORTED_IN_MATMUL 0 + +// If set to 1, LoRA state tensors will have the original type of LoRA adapter come from safetensors file. +// If there are multiple LoRA adapters are applied, then negotiation between them happens. +// If set to 0, LoRA state etnsors are always have type f32. +// FIXME: Fix the plugins and set to 1 permanently. +#define FP16_BF16_TENSORS_SUPPORTED_IN_STATE 0 + +// FIXME: Remove or move to a dedicated common header +#ifdef NDEBUG + #define DEBUG_PRINT(X) do {} while(false) +#else + #define DEBUG_PRINT(X) do { std::cerr << "[ DEBUG ] " << X << "\n"; } while(false) +#endif + +namespace { + +using NodePtr = std::shared_ptr<ov::Node>; +using ov::NodeVector; +using namespace ov::op; + +// FIXME: Use ov::AlignedBuffer instead of std::vector. ov::AlignedBuffer is not available in public OV API +using Buffer = std::vector<char>; +using BufferPtr = std::shared_ptr<Buffer>; +using ConstantVector = std::vector<std::shared_ptr<v0::Constant>>; + + +// Holds usual LoRA parameters alpha, A and B of a given type. +template <typename T> +struct LoRAParts { + T alpha, A, B; + + LoRAParts() = default; + LoRAParts(const T& alpha, const T& A, const T& B) : alpha(alpha), A(A), B(B) {} + + template <typename Other> + LoRAParts(const LoRAParts<Other>& other) : alpha(other.alpha), A(other.A), B(other.B) {} +}; + + +using LoRAWeight = LoRAParts<std::shared_ptr<v0::Constant>>; +using LoRANode = LoRAParts<std::shared_ptr<ov::Node>>; +using LoRAPartsParser = LoRAParts<std::function<std::optional<std::string>(const std::string& name)>>; +using LoRATensors = std::map<std::string, LoRAWeight>; + + +// Read binary file to memory. +BufferPtr read_file_helper(const std::string& filename) { + std::ifstream file(filename, std::ios::binary | std::ios::ate); + OPENVINO_ASSERT(file.is_open(), "Cannot open file with LoRA weights: ", filename); + + size_t filesize = file.tellg(); + auto buffer = std::make_shared<Buffer>(filesize); + file.seekg(0, std::ios::beg); + // TODO: Use mmapped AlignedBuffer as ov::Core::read_model can do, necessary functionality is not available in public OV API. + // LoRA files do not usuall have huge size in comparison to the base models, but it can vary depending on adapter, + // and using mmap will help to optimize memory consumption and could be critical + // when the application at the edge of available memory that is not really uncommon for applications dealing with LLMs. + file.read(&(*buffer)[0], filesize); + + return buffer; +} + + +// Converts Safetensors element type to OV element type. Only part of the types are supported. 
+ov::element::Type safetensors_to_ov_element_type (int dtype) { + switch(dtype) { + case SAFETENSORS_F32: + return ov::element::f32; + case SAFETENSORS_F16: + return ov::element::f16; + case SAFETENSORS_BF16: + return ov::element::bf16; + default: + OPENVINO_THROW("Not supported safetensors dtype: ", dtype); + } +} + + +using ConstantMap = std::map<std::string, std::shared_ptr<ov::op::v0::Constant>>; + + +// Safetensor file parser that deallocates temporary buffers automatically. +// Drop-in replacement for the third party safetensors_File struct. +struct AutoSafetensor: public safetensors_File { + ~AutoSafetensor () { + std::free(tensors); + std::free(metadata); + } +}; + + +// Reads a file with a given filename expecting Safetensors file format. +// The data is read to a solid memory block and the function returns a map of OV Constants allocated on top of that block. +// The key in the map is a tensor name and the Constant uses a region of memory from the memory block. +// Each Constant holds a shared pointer to the block in the runtime info. +// The memory block will be deallocated when the last Constant is destroyed. +ConstantMap read_safetensors(const std::string& filename) { + auto buffer = read_file_helper(filename); + AutoSafetensor safe_tensors_file{}; + + OPENVINO_ASSERT( + safetensors_file_init(&(*buffer)[0], buffer->size(), &safe_tensors_file) == nullptr, + "Cannot parse ", filename, " as a Safetensors file format. Safetensors file format is supported only" + ); + + ConstantMap tensors; + for (int i = 0; i < safe_tensors_file.num_tensors; i++) { + safetensors_TensorDescriptor tensor = safe_tensors_file.tensors[i]; + std::string name(tensor.name.ptr, tensor.name.ptr + tensor.name.len); + ov::Shape shape(tensor.shape, tensor.shape + tensor.n_dimensions); + void* ptr = tensor.ptr; // FIXME: needs a non-constant pointer because Tensor doesn't accept a constant pointer + + OPENVINO_ASSERT( + ov::shape_size(shape) <= tensor.end_offset_bytes - tensor.begin_offset_bytes, + "Tensor shape ", ov::shape_size(shape), " for tensor \"", name, "\" from Safetensors file \"", filename, "\" doesn't match the expected tensor size ", + tensor.end_offset_bytes - tensor.begin_offset_bytes); + + auto type = safetensors_to_ov_element_type(tensor.dtype); + auto constant = + std::make_shared<v0::Constant>(type, shape, ptr, nullptr); // wraps existing memory, no ownership + constant->get_rt_info()["__safetensors_buffer_holder"] = buffer; // to automatically deallocate underlying memory buffer when last constant that holds it is destoyed + tensors[name] = constant; + } + return tensors; +} + + +// Holds a compiled regex pattern and an index to a particular capture group +// operator() takes a string, parses it with that regex pattern and returns the capture group value +struct RegexParser { + std::regex pattern; + size_t capture_index; + RegexParser (const std::string& pattern, size_t capture_index) : pattern(pattern), capture_index(capture_index) {} + std::optional<std::string> operator() (const std::string& name) { + std::smatch match; + if(std::regex_match(name, match, pattern)) { + return match[capture_index]; + } + return std::nullopt; + } +}; + + +// Default LoRA tensor name patterns observed in the existing LoRA adapters, captures the prefix that should correspond to a layer name in the base model +LoRAPartsParser default_lora_patterns () { + return LoRAPartsParser( + RegexParser(R"((.*)\.alpha)", 1), + RegexParser(R"((.*)\.(lora_(A|down)\.weight))", 1), + 
RegexParser(R"((.*)\.(lora_(B|up)\.weight))", 1) + ); +} + + +// Group tensors loaded from LoRA adapter file into triads A, B and alpha grouped by layer names. +LoRATensors group_lora_tensors(const ConstantMap& tensors, const LoRAPartsParser& parts_parser) { + LoRATensors result; + for(const auto& named_tensor: tensors) { + if(auto parsed = parts_parser.A(named_tensor.first)) { + result[*parsed].A = named_tensor.second; + } else if(auto parsed = parts_parser.B(named_tensor.first)) { + result[*parsed].B = named_tensor.second; + } else if(auto parsed = parts_parser.alpha(named_tensor.first)) { + result[*parsed].alpha = named_tensor.second; + } else { + DEBUG_PRINT("Ignored LoRA tensor \"" << named_tensor.first << "\" because couldn't recognize expected name pattern." ); + } + } + + // Check that A and B exist for each LoRA entry + for(const auto& lora_tensor: result) { + OPENVINO_ASSERT(lora_tensor.second.A && lora_tensor.second.B, "Either A, B or both matrices are missing in LoRA tensors for layer: ", lora_tensor.first); + } + return result; +} + + +// Squeeze all dimensions from the right of the shape producing a tensor of 2D shape. +NodePtr squeeze_2d (const ov::Output<ov::Node>& input) { + auto shape = v0::Constant::create(ov::element::i32, {2}, std::vector<int>{0, 0}); + auto reshape = std::make_shared<v1::Reshape>(input, shape->output(0), true); + return reshape; +} + + +// Unsqueeze shape to add dimensions to the right of the shape to have a tensor of a given rank. +NodePtr unsqueeze (const ov::Output<ov::Node>& input, unsigned int rank) { + auto src_rank = input.get_partial_shape().rank().get_length(); + std::vector<unsigned int> dims(rank); + std::fill(dims.begin() + src_rank, dims.end(), 1); + auto shape = v0::Constant::create(ov::element::i32, {rank}, dims); + auto reshape = std::make_shared<v1::Reshape>(input, shape->output(0), true); + return reshape; +} + + +using LoRAWeightGetter = std::function<std::optional<LoRANode>(const std::string&)>; +using LoRAWeightByNodeGetter = std::function<std::optional<LoRANode>(NodePtr)>; + + +// LoRA adapter parameters applied to a specific place in the model. +// Depending on LoRA mode can have static or dynamic LoRA rank that accumulates +// the ranks from all applicable LoRA tensors (if there are multiple LoRA adapters). +struct LoRAParameters { + ov::Dimension rank; // accumulated LoRA rank, could be dynamic if rank is not known or DYNAMIC mode is applied + ov::element::Type type; // element type of a tensor that will be applied to the model, negotiated based on multple LoRA adapters + bool fine_grained_alpha; // use 1D tensor of the same rank for alpha instead of a scalar to blend multiple weighted LoRAs + // TODO: flag to have various alphas over the batch +}; + +using LoRAParametersGetter = std::function<std::optional<LoRAParameters>(NodePtr node)>; + +// Maps a given layer name to corresponding LoRA tensors based on the default name mapping schema. +// Layer name should start with a given prefix that is eliminated from the name before search for matching LoRA tensor. +// It works for a single LoRA adapter. +// Returns std::nullopt, if there is no LoRA adapter for a given layer name. 
+struct LoRAWeightGetterDefault { + // TODO: Add filtering by tensor name prefix + const LoRATensors* lora_tensors; + const std::string prefix; + mutable std::set<std::string> used_tensors; + + LoRAWeightGetterDefault (const LoRATensors* lora_tensors, const std::string& prefix) : lora_tensors(lora_tensors), prefix(prefix) {} + + std::optional<LoRANode> operator() (const std::string& name) const { + std::string name_with_underscores = name; + // TODO: Investigate what is the root cause for this replacement in the name. Customize mapping or change PT FE to produce correct weight names. + std::replace(name_with_underscores.begin(), name_with_underscores.end(), '.', '_'); + auto it = std::find_if(lora_tensors->begin(), lora_tensors->end(), [this, name, name_with_underscores](const LoRATensors::value_type& pair){ + std::string lora_name = pair.first; + // TODO: Make this filtering for prefix once in ctor as a more efficient solution + if(lora_name.find(prefix) == 0) { + lora_name = lora_name.substr(prefix.length()); + } else { + return false; + } + // TODO: Should it be an exact match instead of substring taking into account that we should provide custom mapper for names? + return name.find(lora_name) != std::string::npos || name_with_underscores.find(lora_name) != std::string::npos; + }); + if(it != lora_tensors->end()) { + used_tensors.insert(it->first); + return it->second; + } + return std::nullopt; + } +}; + + +// Maps a node in the base model to LoRA parameters object that describes how the LoRA tensors should be injected for that node. +// Works with multiple LoRAs accumulating their properties into a single LoRAParameter instance. +// Returns std::nullopt, if there is no LoRA adapter for a given node. +struct LoRAParametersByWeightGetter { + std::vector<LoRAWeightGetter> weight_getter; + bool dynamic_lora_rank = true; + bool fine_grained_alpha = true; + ov::element::Type type; + + std::optional<LoRAParameters> operator() (NodePtr node) const { + // If at least one weight_getter gives the weight for the node, then this node should be processed. + + ov::Dimension rank = ov::Dimension::dynamic(); + if(dynamic_lora_rank) { + // Leave rank dynamic if at least one adapter exist for a give node. + if(weight_getter.end() == + std::find_if(weight_getter.begin(), weight_getter.end(), [node](const LoRAWeightGetter& getter) { + return bool(getter(node->get_friendly_name())); + })) { + return std::nullopt; + } + } else { + // Accumulates all ranks from all adapters applicable for a given node. + auto size = std::accumulate(weight_getter.begin(), weight_getter.end(), 0u, [node](unsigned int acc, const LoRAWeightGetter& getter) { + if(auto nodes = getter(node->get_friendly_name())) { + return static_cast<unsigned int>(acc + nodes->A->get_output_partial_shape(0)[0].get_length()); + } else { + return acc; + } + }); + if(size == 0) { + // as LoRA adapters with 0 rank cannot exist, 0 menas there are no adapters for a given node + return std::nullopt; + } + rank = size; + } + + LoRAParameters result; + result.rank = rank; + result.type = type; + result.fine_grained_alpha = fine_grained_alpha; + return result; + } +}; + + +// TODO: There is possible simplification if a new feature is implemented in OpenVINO: +// move name from LoRAVarIDs to to LoRAIndices when the order of tensors in the model state in OV infer request will +// be the same as the order of variables, remove LoRAVarsIDs in this case. 
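+// Bookkeeping for the LoRA tensors injected per adapted layer: the alpha/A/B triple is tracked
+// either by position (LoRAIndices) or by the variable descriptors used to expose it as model
+// state (LoRAVarIDs); both structs are keyed by the layer name.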
+ +struct LoRAIndices : public LoRAParts<size_t> { + std::string name; +}; + +struct LoRAVarIDs : public LoRAParts<ov::op::util::VariableInfo> { + std::string name; // layer name where LoRA with given variables is attached +}; + + +// Deduce expected LoRA input and output static dimensions based on a given node where LoRA is applied +// A given node should be MatMul or Convolution +void deduce_input_output_dims(NodePtr node, ov::Dimension& input_dim, ov::Dimension& output_dim) { + if(std::dynamic_pointer_cast<v1::Convolution>(node)) { + input_dim = node->get_input_partial_shape(1)[1]; + output_dim = node->get_input_partial_shape(1)[0]; + } else if(auto matmul = std::dynamic_pointer_cast<v0::MatMul>(node)) { + input_dim = node->get_input_partial_shape(1)[matmul->get_transpose_b()]; + output_dim = node->get_input_partial_shape(1)[!matmul->get_transpose_b()]; + } else { + OPENVINO_THROW( + "deduce_input_output_dims expects MatMul or Convolution, but got ", node, + ". Given LoRA adapter is unsupported." + ); + } +} + + +// Creates ReadValue and Assign nodes to inject LoRA tensors as variables for a given node but +// doesn't connect them to the model returning as LoRANode instance. +struct LoRAWeightStateGetter { + LoRAParametersGetter params_getter; + std::shared_ptr<ov::Model> model; + std::vector<LoRAVarIDs>& variable_ids; + // TODO: Use variable indices instead of variable_id for faster search for a state tensor + + LoRAWeightStateGetter (const LoRAParametersGetter& params_getter, std::shared_ptr<ov::Model> model, std::vector<LoRAVarIDs>& variable_ids) : + params_getter(params_getter), model(model), variable_ids(variable_ids) {} + + std::optional<LoRANode> operator() (NodePtr node) const { + if(auto params = params_getter(node)) { + ov::Dimension input_dim, output_dim; + deduce_input_output_dims(node, input_dim, output_dim); + + std::string name = node->get_friendly_name(); + // FIXME: Potential name conflict if LoRA is applied multiple times by using this infrastrcuture independently each time (not a recommended approach). + // TODO: Check for name collisions searching for existing variables with the same names. + std::string variable_id_prefix = "lora_state_" + std::to_string(model->get_sinks().size()) + name; + LoRANode result; + LoRAVarIDs var_ids; + var_ids.name = name; + + // FIXME: No guarantees on ordering of state in InferRequest makes impossible using indices of variables later, forced to use variable_id instead + //indices.A = model->get_variables().size(); + var_ids.A = ov::op::util::VariableInfo{ + ov::PartialShape{params->rank, input_dim}, // Will be used with transpose_b == true + params->type, + variable_id_prefix + ".A" + }; + result.A = add_variable(var_ids.A); + // FIXME: No guarantees on ordering of state in InferRequest makes impossible using indices of variables later, forced to use variable_id instead + //indices.A = model->get_variables().size(); + var_ids.alpha = ov::op::util::VariableInfo{ + params->fine_grained_alpha ? 
ov::PartialShape{1, params->rank} : ov::PartialShape{}, + ov::element::f32, // alpha is always f32 because it is set from host as float data type + variable_id_prefix + ".alpha" + }; + result.alpha = add_variable(var_ids.alpha); + // FIXME: No guarantees on ordering of state in InferRequest makes impossible using indices of variables later, forced to use variable_id instead + //indices.B = model->get_variables().size(); + var_ids.B = ov::op::util::VariableInfo{ + ov::PartialShape{output_dim, params->rank}, // Will be used with transpose_b == true + params->type, + variable_id_prefix + ".B" + }; + result.B = add_variable(var_ids.B); + variable_ids.emplace_back(var_ids); + return result; + } else { + return std::nullopt; + } + } + + NodePtr add_variable(const ov::op::util::VariableInfo& variable_info) const { + auto variable = std::make_shared<ov::op::util::Variable>(variable_info); + model->add_variables({variable}); + #if 0 + // Attempt to pre-build initialization expression with empty tensors that should discard LoRA effect by default + // FIXME: CPU plugin fails when there is no initialization expression is given and type is not fp32 + ov::Shape init_shape(shape.rank().get_length()); + for(size_t i = 0; i < shape.size(); ++i) { + init_shape[i] = shape[i].get_min_length(); + } + auto init = v0::Constant::create(type, init_shape, std::vector<float>(ov::shape_size(init_shape), 0)); + auto read_value = std::make_shared<v6::ReadValue>(init, variable); + #else + auto read_value = std::make_shared<v6::ReadValue>(variable); + #endif + model->add_sinks({std::make_shared<v6::Assign>(read_value, variable)}); // FIXME: Required? -- Yes, create ticket against CPU + return read_value; + } +}; + + +// Transformation that injects LoRA tensors or tensors entry points into the base model. +// The exact form of injection is implemented in the derived classes via overriding `apply` method +// that is called for each applicable node in the base model. +// Detects if a given node requires adaptation based on LoRAWeightByNodeGetter object which maps +// a node to LoRA parameters object. +// Applies only for MatMul and Convolution nodes. +class LoRATransformBase : public ov::pass::MatcherPass { +public: + + OPENVINO_RTTI("LoRATransformBase"); + + LoRATransformBase(const LoRAWeightByNodeGetter& lora_weight_getter) { + register_matcher( + std::make_shared<ov::pass::pattern::Matcher>(ov::pass::pattern::wrap_type<v0::MatMul, v1::Convolution>(), this->get_type_info().name), + ([lora_weight_getter, this](ov::pass::pattern::Matcher& m) { + auto node = m.get_match_root(); + try { + if(auto lora_weight = lora_weight_getter(node)) { + if(apply(node, *lora_weight)) { + ++applied; // FIXME: For debugging purposes only + return true; + } + } + return false; + } catch(const std::exception& exception) { + DEBUG_PRINT("Exception happens on layer: " << node << " with exception message: " << exception.what()); + throw; + } catch(...) { + DEBUG_PRINT("Unknown exception happens on layer: " << node); + throw; + } + }) + ); + } + + ~LoRATransformBase () { + DEBUG_PRINT("LoRA applied for " << applied << " layers"); // For debugging purposes only + } + +protected: + + virtual bool apply(NodePtr node, const LoRANode& lora_weight) = 0; + +private: + + size_t applied = 0; // For debug statistics only + +}; + + +// Builds LoRA subgraph that consists of several matrix and element-wise multiplications with optional data type conversions and reshapes +// to build a consistent graph. 
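+// For example, when called from LoRASeparateTransform below with multipliers {A, alpha, B} and
+// transposed weights, the chain effectively computes the usual low-rank LoRA delta
+// ((x * A^T) * alpha) * B^T, which is then added to the original layer output `target`.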
+NodePtr tensors_multiplication(NodePtr input, const NodeVector multipliers, ov::Output<ov::Node> target, bool transpose_weights, size_t alpha_pos, bool transpose_in_end) {
+    const auto target_type = target.get_element_type();
+    const auto target_shape = target.get_partial_shape();
+    const auto target_rank = target_shape.rank().get_length();
+    for(size_t i = 0; i < multipliers.size(); ++i) {
+        NodePtr normalized = multipliers[i];
+        if(normalized->get_output_element_type(0) != target_type) {
+            normalized = std::make_shared<v0::Convert>(normalized, target_type);
+        }
+        if(normalized->get_output_partial_shape(0).rank().get_length() > 2) {
+            // FIXME: Any other shape patterns possible?
+            normalized = squeeze_2d(normalized);
+        }
+        if(input) {
+            if(i == alpha_pos) {
+                // TODO: Apply alpha multiplication separately
+                input = std::make_shared<v1::Multiply>(input, normalized);
+            } else {
+                input = std::make_shared<v0::MatMul>(input, normalized, /*transpose_a = */false, transpose_weights);  // FIXME: verify transpose_a == true
+            }
+        } else {
+            input = normalized;
+        }
+    }
+
+    if(transpose_in_end) {
+        // FIXME: Check the dimensions we really need to move; currently it is hardcoded to 2 + 2 dimensions, which usually appear in the 2D Convolution case
+        // where we need to apply LoRA for the first two dimensions (channels) while interpreting the two last dimensions (spatial).
+        // TODO: Stash transposition constant to reuse
+        auto transposition = v0::Constant::create(ov::element::i32, ov::Shape{4}, std::vector<int>{2, 3, 0, 1});
+        input = std::make_shared<v1::Transpose>(input, transposition);
+    } else if(input->get_output_partial_shape(0).rank().get_length() != target_rank) {
+        input = unsqueeze(input, target_rank);
+    }
+
+    input = std::make_shared<v1::Add>(target, input);
+
+    return input;
+}
+
+
+// Given a node, detects an optional weight decompression pattern Constant -> Convert.
+// Returns a pointer to the Convert node if it exists, or nullptr if there is no Convert.
+// If an unsupported decompression pattern is used, throws an exception.
+NodePtr decompression_convert (NodePtr node) {
+    auto convert = std::dynamic_pointer_cast<v0::Convert>(node);
+    if(convert) {
+        node = convert->get_input_node_shared_ptr(0);
+    }
+    OPENVINO_ASSERT(
+        std::dynamic_pointer_cast<v0::Constant>(node),
+        "Not supported decompression pattern at the weight input (presumably low-bit compression). Use f32/f16/bf16 weights only."
+    );
+    return convert;
+}
+
+
+// Cache of infer requests for on-demand built and compiled helper models for weight modification.
+// It maps a model signature, which is an arbitrary string, to an OpenVINO infer request.
+// Defines an `evaluate` method that computes a model by a given signature and input tensors.
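+// The signature is built from the element types and shapes of the involved tensors (see
+// LoRAFuseTransform::signature_push_back below), so layers whose weights and LoRA tensors have
+// matching types and shapes reuse a single compiled helper model instead of compiling a new one.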
+class InferRequestSignatureCache {
+public:
+    using Signature = std::string;
+
+    InferRequestSignatureCache (const std::string& device) : device(device) {}
+
+    bool exist (const Signature& signature) {
+        return requests.count(signature);
+    }
+
+    void insert (const Signature& signature, std::shared_ptr<ov::Model> model) {
+        requests[signature] = core.compile_model(model, device).create_infer_request();
+    }
+
+    ov::InferRequest& at(const Signature& signature) {
+        return requests.at(signature);
+    }
+
+    void evaluate(const Signature& signature, const ov::TensorVector& inputs, ov::TensorVector& outputs) {
+        auto& request = at(signature);
+        auto compiled_model = request.get_compiled_model();
+        OPENVINO_ASSERT(inputs.size() == compiled_model.inputs().size());
+        OPENVINO_ASSERT(outputs.size() == compiled_model.outputs().size());
+        for(size_t i = 0; i < inputs.size(); ++i) {
+            request.set_input_tensor(i, inputs[i]);
+        }
+        for(size_t i = 0; i < outputs.size(); ++i) {
+            auto target_shape = request.get_compiled_model().output(i).get_partial_shape();
+            if(target_shape != outputs[i].get_shape() && target_shape.is_static()) {
+                // do it for the static case only, because if the target shape is dynamic, the plugin is allowed to set the shape on its own
+                outputs[i].set_shape(target_shape.get_shape());
+            }
+            request.set_output_tensor(i, outputs[i]);
+        }
+        request.infer();    // TODO: Consider using async to increase throughput; requires more complicated orchestration
+    }
+
+private:
+
+    ov::Core core;
+    std::unordered_map<Signature, ov::InferRequest> requests;
+    std::string device;
+};
+
+
+// Transformation that modifies existing weights in the base model, fusing an arbitrary number of LoRA adapters.
+// This is one-way LoRA fusion that cannot be undone.
+// By default it uses the CPU plugin to modify the base model weights.
+// TODO: This transformation unpacks weights potentially compressed to f16/bf16 into f32;
+// we should pack them back into the original precision to maintain the same weight size.
+// But it will work well if all plugins equally support fp-compressed weights and can unpack them on-line.
+class LoRAFuseTransform : public LoRATransformBase {
+
+    InferRequestSignatureCache fusers;
+
+    void signature_push_back(InferRequestSignatureCache::Signature& signature, ov::Output<ov::Node> input) const {
+        // TODO: Define hash function on vector<tuple<element_type, PartialShape>> to make it C++ish
+        signature += "(el: " + input.get_element_type().get_type_name() + ", shape: " + input.get_partial_shape().to_string() + ")";
+    }
+
+public:
+
+    OPENVINO_RTTI("LoRAFuseTransform");
+
+    LoRAFuseTransform(const LoRAWeightByNodeGetter& lora_weight_getter, const std::string& device_for_fusion = "CPU") :
+        LoRATransformBase(lora_weight_getter),
+        fusers(device_for_fusion)
+    {}
+
+    bool apply (NodePtr node, const LoRANode& lora_weight) override {
+        auto weights_input = node->input_value(1);
+        auto weights_input_type = weights_input.get_element_type();
+        auto weights_convert = decompression_convert(weights_input.get_node_shared_ptr());
+        auto weights_constant = weights_convert ? weights_convert->input_value(0) : weights_input;
+        ConstantVector adapter = {
+            std::dynamic_pointer_cast<v0::Constant>(lora_weight.alpha),
+            std::dynamic_pointer_cast<v0::Constant>(lora_weight.B),
+            std::dynamic_pointer_cast<v0::Constant>(lora_weight.A)};
+        InferRequestSignatureCache::Signature signature;
+        signature_push_back(signature, weights_input);
+        for(auto multiplier : adapter) {
+            signature_push_back(signature, multiplier);
+        }
+
+        // TODO: If compressed repacking of the newly created weights is retained,
+        // replace weights_input with weights_constant to keep the decompression Convert in the model.
+        auto consumers = weights_input.get_target_inputs();
+
+        if(!fusers.exist(signature)) {
+            // Build a small model for weight and LoRA fusion, and stash it into the `fusers` cache.
+            ov::ParameterVector parameters;
+            auto target_parameter = std::make_shared<v0::Parameter>(weights_constant.get_element_type(), weights_constant.get_partial_shape());
+            parameters.push_back(target_parameter);   // original weights input is one of the parameters
+            ov::Output<ov::Node> target = weights_convert ? weights_convert->clone_with_new_inputs({target_parameter}) : target_parameter;
+            for(auto multiplier : adapter) {
+                parameters.push_back(std::make_shared<v0::Parameter>(multiplier->get_output_element_type(0), multiplier->get_output_partial_shape(0)));
+            }
+            auto result = std::make_shared<v0::Result>(tensors_multiplication(nullptr, NodeVector{parameters.begin() + 1, parameters.end()}, target, false, 1, false));
+            auto weights_model = std::make_shared<ov::Model>(ov::ResultVector{result}, parameters);
+            fusers.insert(signature, weights_model);
+        }
+
+        // The newly created constants in the next line are not mmapped, unlike the original weights, so this will inflate the required memory,
+        // eventually allocating up to 2x of the base model size.
+        // The 2x is due to the compression usually applied in the base model, which is not retained in the current version of this code.
+        // But even if compression is used, a copy of all weights affected by the LoRA adapters is still allocated in memory.
+        // FIXME: Provide a way for postponed weight repacking that will be triggered by the plugin in the compile_model call for the base model.
+        // Constant sub-expressions could be a solution, but that requires improvements inside the plugins, because currently it works extremely slowly.
+        auto replacement_const = std::make_shared<v0::Constant>(weights_input.get_element_type(), weights_input.get_shape());
+
+        ov::TensorVector outputs{replacement_const->get_tensor_view()};
+        // set input constants
+        ov::TensorVector inputs;
+        inputs.reserve(1 + adapter.size());
+        inputs.push_back(std::dynamic_pointer_cast<v0::Constant>(weights_constant.get_node_shared_ptr())->get_tensor_view());
+        for(size_t i = 0; i < adapter.size(); ++i) {
+            inputs.push_back(adapter[i]->get_tensor_view());
+        }
+        fusers.evaluate(signature, inputs, outputs);
+
+        for (auto consumer : consumers) {
+            consumer.replace_source_output(replacement_const->output(0));
+        }
+        return true;
+    }
+};
+
+
+// Transformation that modifies the base model by inserting new nodes that do LoRA matrix multiplications alongside the original MatMul/Convolution.
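+// Unlike LoRAFuseTransform above, the base weights stay untouched here: the LoRA branch takes its
+// alpha/A/B tensors from the nodes provided by the weight getter (typically ReadValue state nodes
+// created by LoRAWeightStateGetter), computes the low-rank delta at inference time and adds it to
+// the layer output.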
+class LoRASeparateTransform : public LoRATransformBase { +public: + + OPENVINO_RTTI("LoRASeparateTransform"); + + LoRASeparateTransform(const LoRAWeightByNodeGetter& lora_getter) : LoRATransformBase(lora_getter) {} + + bool apply (NodePtr node, const LoRANode& lora_weight) override { + auto activations = node->input_value(0); // FIXME: consider MatMul.transpose_a + auto weights_input = node->input_value(1); + auto weights_input_type = weights_input.get_element_type(); + //DEBUG_PRINT("WEIGHTS SHAPE: " << weights_input.get_partial_shape()); + NodePtr add_term = nullptr; + NodePtr replacement = nullptr; + + auto target = node->output(0); + + auto target_rank = target.get_partial_shape().rank().get_length(); + auto consumers = target.get_target_inputs(); + bool transpose_in_end = false; + + // FIXME: Should check rank of activations instead of target rank + if(target_rank == 4 && target.get_partial_shape()[target_rank - 3].get_length() > 1) { + // FIXME: Check the dimensions we really need to move, currently it is hardcoded 2 + 2 dimensions + // FIXME: Stash transposition constant to reuse + auto transposition = v0::Constant::create(ov::element::i32, ov::Shape{4}, std::vector<int>{2, 3, 0, 1}); + auto transpose = register_new_node<v1::Transpose>(activations, transposition); + activations = transpose; + transpose_in_end = true; + } + + NodeVector lora_variables{lora_weight.A, lora_weight.alpha, lora_weight.B}; + replacement = tensors_multiplication(activations.get_node_shared_ptr(), lora_variables, target, true, 1, transpose_in_end); + + for (auto consumer : consumers) { + consumer.replace_source_output(replacement->output(0)); + } + + return true; + } +}; + + +std::shared_ptr<v0::Constant> alpha_as_constant(float alpha) { + return v0::Constant::create(ov::element::f32, ov::Shape{1}, {alpha}); +} + + +} // namespace + + +namespace ov { +namespace genai { + + +class Adapter::Impl { +public: + Impl(const std::string& path) : + tensors(group_lora_tensors(read_safetensors(path), default_lora_patterns())) + {} + + LoRATensors tensors; +}; + + +Adapter::Adapter(const std::string& path) : + m_pimpl(std::make_shared<Adapter::Impl>(path)) { +} + + +bool operator== (const Adapter& a, const Adapter& b) { + return a.m_pimpl == b.m_pimpl; +} + + +bool operator< (const Adapter& a, const Adapter& b) { + return a.m_pimpl < b.m_pimpl; +} + + +struct AdapterControllerImpl { + std::vector<LoRAVarIDs> variable_ids; + const std::string prefix; + AdapterConfig current_config; + bool need_full_apply = true; + InferRequestSignatureCache lora_state_evaluators; + + AdapterControllerImpl(std::shared_ptr<ov::Model> model, const AdapterConfig& config, const std::string& prefix) : + prefix(prefix), + current_config(config), // FIXME: Compare current and passed configs and change incrementally + lora_state_evaluators("CPU") // FIXME: Try to run on the same device that is used for model inference + { + LoRAParametersByWeightGetter params_getter; + #if FP16_BF16_TENSORS_SUPPORTED_IN_STATE + params_getter.type = ov::element::dynamic; + #else + params_getter.type = ov::element::f32; + #endif + + for(auto const& adapter : current_config.get_adapters()) { + auto adapter_impl = get_adapter_impl(adapter); + params_getter.weight_getter.push_back(LoRAWeightGetterDefault(&adapter_impl->tensors, prefix)); + // TODO: Instead of aggregating types over all tensors in each adapter, make decision per node in LoRAWeightStateGetter + /*if(params_getter.type != ov::element::f32)*/ { // FIXME: Implement element_type tolerant code when state is 
set and uncomment this condition + for(auto const& tensor : adapter_impl->tensors) { + auto lora_tensor_type = tensor.second.A->get_output_element_type(0); + OPENVINO_ASSERT(lora_tensor_type == tensor.second.B->get_output_element_type(0)); + if(params_getter.type == ov::element::dynamic) { + params_getter.type = lora_tensor_type; + } else if(params_getter.type != lora_tensor_type) { + // if types are not match among multiple LoRA tensos then fall back to f32 + // TODO: Provide a more smart negotiation between multiple LoRAs: check ranges, try to pack to f16 + params_getter.type = ov::element::f32; + break; + } + } + } + } + + auto weight_as_constant = [&, this](NodePtr node) -> std::optional<LoRANode> { + // FIXME: lora_placeholder is for passing element type only + LoRAParts<ov::Tensor> lora_placeholder{ + ov::Tensor(ov::element::f32, Shape{0}), + ov::Tensor(params_getter.type, ov::Shape{0}), + ov::Tensor(params_getter.type, ov::Shape{0}) + }; + auto name = node->get_friendly_name(); + auto lora_weight = prepare_lora_tensors(name, params_getter.weight_getter, lora_placeholder, false); + if(lora_weight.alpha) { + return LoRANode( + // TODO: Make sure that tensors will not be disposed during constant life time + std::make_shared<v0::Constant>(lora_weight.alpha), + std::make_shared<v0::Constant>(lora_weight.A), + std::make_shared<v0::Constant>(lora_weight.B) + ); + } else { + return std::nullopt; + } + }; + + ov::pass::Manager pm; + auto mode = current_config.get_mode(); + if(mode == AdapterConfig::MODE_DYNAMIC || mode == AdapterConfig::MODE_STATIC_RANK || mode == AdapterConfig::MODE_AUTO) { + // State mode + params_getter.dynamic_lora_rank = (mode != AdapterConfig::MODE_STATIC_RANK); + pm.register_pass<LoRASeparateTransform>(LoRAWeightStateGetter(params_getter, model, variable_ids)); + } else if(mode == AdapterConfig::MODE_STATIC) { + // Separate constant mode + pm.register_pass<LoRASeparateTransform>(weight_as_constant); + } else if(mode == AdapterConfig::MODE_FUSE) { + // Fuse mode + pm.register_pass<LoRAFuseTransform>(weight_as_constant); + } else { + OPENVINO_THROW("Unrecognized AdapterConfig::Mode was used: ", mode); + } + + pm.run_passes(model); + model->validate_nodes_and_infer_types(); // FIXME: For debugging purposes only + } + + static std::shared_ptr<Adapter::Impl> get_adapter_impl(const Adapter& adapter) { + return adapter.m_pimpl; + } + + struct ConfigChanged { + bool mode = false; + bool alpha = false; + bool adapter = false; + + operator bool() const { + return mode || alpha || adapter; + } + }; + + ConfigChanged compare_configs(const AdapterConfig& config1, const AdapterConfig& config2) { + ConfigChanged diff; + diff.mode = config1.get_mode() != config2.get_mode(); + // TODO: Use `set` from this commented block when the config change tracking is implemented at adapter granularity and will track order of adapters correctly + // std::set<Adapter> + // adapters1(config1.adapters.begin(), config1.adapters.end()), + // adapters2(config2.adapters.begin(), config2.adapters.end()); + const auto& adapters1 = config1.get_adapters(), adapters2 = config2.get_adapters(); + + if(adapters1 != adapters2) { + diff.adapter = true; + diff.alpha = true; + } else { + for(auto const& adapter: adapters1) { + diff.alpha = config1.get_alpha(adapter) != config2.get_alpha(adapter); + } + } + return diff; + } + + void apply (ov::InferRequest& infer_request, std::optional<AdapterConfig> config) { + // FIXME: If a part of LoRA state tensors are not set here, then need to carefully reset state in 
LLMPipeline where global reset is called after the generation + ConfigChanged diff; + if(config) { + diff = compare_configs(current_config, *config); + OPENVINO_ASSERT( + !diff.mode || config->get_mode() == AdapterConfig::MODE_AUTO, // MODE_AUTO in this call means that mode is not changed + "AdapterConfig::mode cannot be changed and should be configured once for a model at the initialization"); + OPENVINO_ASSERT( + config->get_mode() == AdapterConfig::MODE_AUTO || config->get_mode() == AdapterConfig::MODE_DYNAMIC || config->get_mode() == AdapterConfig::MODE_STATIC_RANK || (!diff.alpha && !diff.adapter), + "Cannot change adapters and/or the alphas when not one of the dynamic modes are used."); + current_config = *config; + } + if(need_full_apply) { + need_full_apply = false; + set_new_adapter_tensors(infer_request); + } else if(diff) { + if(diff.adapter) { + set_new_adapter_tensors(infer_request); + } else { + OPENVINO_ASSERT(diff.alpha); + set_new_adapter_alphas(infer_request); + } + } + } + + void force_full_apply(bool full_apply) { + need_full_apply = full_apply; + } + + void set_new_adapter_alphas (ov::InferRequest& infer_request) { + // FIXME: Provide more economical way to update only alphas + set_new_adapter_tensors(infer_request); + } + + void set_new_adapter_tensors (ov::InferRequest& infer_request) { + if(current_config.get_mode() != AdapterConfig::MODE_AUTO && current_config.get_mode() != AdapterConfig::MODE_DYNAMIC && current_config.get_mode() != AdapterConfig::MODE_STATIC_RANK ) { + return; + } + + std::vector<LoRAWeightGetter> weight_getters; + const auto& adapters = current_config.get_adapters(); + weight_getters.reserve(adapters.size()); + for(const auto& adapter: adapters) { + weight_getters.emplace_back(LoRAWeightGetterDefault(&get_adapter_impl(adapter)->tensors, prefix)); + } + + auto state = infer_request.query_state(); + + // TODO: Forced to use variable_id instead of index to address the state tensors, require the same order for state as for variables from plugins + + // Convert LoRAVarIDs to LoRAIndices to speedup search for state with a given name + // TODO: If state order is stable, then the mapping should be done once for a given infer request, TODO: cache it based on the infer request + std::map<std::string, size_t> state_name_to_index; + for(size_t i = 0; i < state.size(); ++i) { + auto name = state[i].get_name(); + state_name_to_index[name] = i; + } + + for(const auto& lora_var_ids : variable_ids) { + // FIXME: Remove this mapping when the order of state will be the same as the order of variables + LoRAIndices lora_indices; + lora_indices.alpha = state_name_to_index.at(lora_var_ids.alpha.variable_id); + lora_indices.A = state_name_to_index.at(lora_var_ids.A.variable_id); + lora_indices.B = state_name_to_index.at(lora_var_ids.B.variable_id); + lora_indices.name = lora_var_ids.name; // TODO: Redundant? 
+ + set_lora_tensors(state, lora_var_ids, lora_indices, weight_getters); + } + } + + std::vector<LoRAWeight> collect_applicable_tensors (const std::string& lora_name, const std::vector<LoRAWeightGetter>& weight_getters) { + const auto& adapters = current_config.get_adapters(); + OPENVINO_ASSERT(weight_getters.size() == adapters.size()); + std::vector<LoRAWeight> result; + result.reserve(weight_getters.size()); + for(size_t i = 0; i < adapters.size(); ++i) { + if(auto lora_tensors = weight_getters[i](lora_name)) { + // FIXME: Introduce more flexible logic of setting alpha based on alpha set in the adapter file itself, now it is ignored and only alpha from config is used + OPENVINO_ASSERT(lora_tensors->A); + OPENVINO_ASSERT(lora_tensors->B); + lora_tensors->alpha = alpha_as_constant(current_config.get_alpha(adapters[i])); + result.push_back(LoRAWeight( + std::dynamic_pointer_cast<v0::Constant>(lora_tensors->alpha), + std::dynamic_pointer_cast<v0::Constant>(lora_tensors->A), + std::dynamic_pointer_cast<v0::Constant>(lora_tensors->B) + )); + } + } + return result; + } + + InferRequestSignatureCache::Signature get_tensor_signature(const ov::Output<ov::Node>& output) { + return get_tensor_signature(output.get_element_type(), output.get_partial_shape()); + } + + InferRequestSignatureCache::Signature get_tensor_signature(const ov::element::Type& type, const ov::PartialShape& shape) { + return type.get_type_name() + shape.to_string(); + } + + InferRequestSignatureCache::Signature get_lora_signature(const std::vector<LoRAWeight>& inputs, const LoRAParts<ov::Tensor>& outputs) { + InferRequestSignatureCache::Signature signature; + for(const auto& input: inputs) { + signature += + std::string("(") + + "(" + get_tensor_signature(input.alpha) + ")" + + "(" + get_tensor_signature(input.A) + ")" + // TODO: Adjust shape to have a dynamic low-rank LoRA dimension in case of fully static shape doesn't have significant speedup + "(" + get_tensor_signature(input.B) + ")" + // TODO: Adjust shape to have a dynamic low-rank LoRA dimension in case of fully static shape doesn't have significant speedup + ")"; + } + for(const auto& input: inputs) { + signature += + std::string("(") + + // Shape is set to be dynamic because it doesn't mater for signature as it is completelly determined by the corresponding model + "(" + get_tensor_signature(outputs.alpha.get_element_type(), ov::PartialShape::dynamic(1)) + ")" + + "(" + get_tensor_signature(outputs.A.get_element_type(), ov::PartialShape::dynamic(2)) + ")" + + "(" + get_tensor_signature(outputs.B.get_element_type(), ov::PartialShape::dynamic(2)) + ")" + + ")"; + } + return signature; + } + + ov::TensorVector to_tensor_vector(const std::vector<LoRAWeight>& v) { + ov::TensorVector result; + result.reserve(v.size()*3); + for(auto const& lora_weights: v) { + result.push_back(lora_weights.alpha->get_tensor_view()); + result.push_back(lora_weights.A->get_tensor_view()); + result.push_back(lora_weights.B->get_tensor_view()); + } + return result; + } + + ov::TensorVector to_tensor_vector(const LoRAParts<ov::Tensor>& lora_tensors) { + ov::TensorVector result; + result.reserve(3); + result.push_back(lora_tensors.alpha); + result.push_back(lora_tensors.A); + result.push_back(lora_tensors.B); + return result; + } + + void build_concat_model( + ov::ParameterVector& parameters, + ov::ResultVector& results, + const std::vector<LoRAWeight>& inputs, + ov::Tensor output, + size_t offset, + size_t concat_axis, + std::function<std::shared_ptr<v0::Parameter>(const LoRAWeight&)> 
input_accessor, + std::function<NodePtr(const LoRAWeight&, NodePtr)> parameter_postprocessing = [](const LoRAWeight&, NodePtr node) { return node; } + ) { + ov::OutputVector concat_inputs; + concat_inputs.reserve(inputs.size()); + for(size_t i = 0; i < inputs.size(); ++i) { + NodePtr input = parameters[3*i + offset] = input_accessor(inputs[i]); + if(input->get_output_element_type(0) != output.get_element_type()) { + input = std::make_shared<v0::Convert>(input, output.get_element_type()); + } + if(input->get_output_partial_shape(0).rank().get_length() > 2) { + input = squeeze_2d(input); + } + input = parameter_postprocessing(inputs[i], input); + concat_inputs.push_back(input); + } + + NodePtr result; + if(concat_inputs.size() > 1) { + result = std::make_shared<v0::Concat>(concat_inputs, concat_axis); + } else { + result = concat_inputs.front().get_node_shared_ptr(); + // FIXME: Workaround CPU plugin bug with Parameter -> Result models: add a small constant to force copying input to output + // FIXME: Do it differently: not use model-based evaluation in this case but just pass lora tensor directly as a new state value + if(result == parameters[offset]) { + result = std::make_shared<v1::Add>(result, v0::Constant::create(result->get_output_element_type(0), Shape{}, {1e-37f})); + } + } + + results[offset] = std::make_shared<v0::Result>(result); + + // TODO: Optimize trivial Parameter->Result cases + } + + LoRAParts<ov::Tensor> empty_adapters(const std::vector<LoRAWeight>& inputs, LoRAParts<ov::Tensor>& outputs) { + #if EMPTY_TENSORS_SUPPORTED_IN_MATMUL + + outputs.alpha.set_shape({1, 0}); + outputs.A.set_shape({0, outputs.A.get_shape()[1]}); + outputs.B.set_shape({outputs.B.get_shape()[0], 0}); + + #else + + // TODO: As ov::Tensor lacks a convenient constructor to fill all elements with the same scalar value, do it via Constant that has such constructor + // FIXME: It's a huge overhead for setting just a scalar 0 + + ov::Shape + alpha_shape{1, 1}, + A_shape{1, outputs.A.get_shape()[1]}, + B_shape{outputs.B.get_shape()[0], 1}; + + outputs.alpha.set_shape(alpha_shape); + outputs.A.set_shape(A_shape); + outputs.B.set_shape(B_shape); + std::make_shared<v0::Constant>(outputs.alpha.get_element_type(), alpha_shape, 0)->get_tensor_view().copy_to(outputs.alpha); + // Element values for A and B don't matter as we are multiplying by 0 in alpha anyway + std::make_shared<v0::Constant>(outputs.A.get_element_type(), A_shape, 0)->get_tensor_view().copy_to(outputs.A); + std::make_shared<v0::Constant>(outputs.B.get_element_type(), B_shape, 0)->get_tensor_view().copy_to(outputs.B); + + #endif + + return outputs; + } + + LoRAParts<ov::Tensor> concat_adapters(const std::vector<LoRAWeight>& inputs, LoRAParts<ov::Tensor>& outputs) { + auto signature = get_lora_signature(inputs, outputs); + if(!lora_state_evaluators.exist(signature)) { + // Prepare LoRA state evaluate model + ov::ParameterVector parameters(3*inputs.size()); + ov::ResultVector results(3); + + build_concat_model(parameters, results, inputs, outputs.alpha, 0, 1, + [](const LoRAWeight& lora_weight) { + return std::make_shared<v0::Parameter>( + lora_weight.alpha->get_output_element_type(0), + lora_weight.alpha->get_output_partial_shape(0)); // TODO: Consider using dynamic LoRA rank dimension instead of static dimension + }, + [](const LoRAWeight& lora_weight, NodePtr parameter) { + // TODO: This code should be modified if dynamic LoRA rank is used in the evaluator + auto lora_rank = lora_weight.A->get_output_partial_shape(0)[0].get_length(); + // Broadcast 
a single alpha element to shape [lora_rank] + auto lora_rank_constant = v0::Constant::create(ov::element::u32, Shape{2}, std::vector<decltype(lora_rank)>{1, lora_rank}); + return std::make_shared<v3::Broadcast>(parameter, lora_rank_constant); + }); + + build_concat_model(parameters, results, inputs, outputs.A, 1, 0, + [](const LoRAWeight& lora_weight) { + return std::make_shared<v0::Parameter>( + lora_weight.A->get_output_element_type(0), + lora_weight.A->get_output_partial_shape(0)); // TODO: Consider using dynamic LoRA rank dimension instead of static dimension + } + ); + + build_concat_model(parameters, results, inputs, outputs.B, 2, 1, + [](const LoRAWeight& lora_weight) { + return std::make_shared<v0::Parameter>( + lora_weight.B->get_output_element_type(0), + lora_weight.B->get_output_partial_shape(0)); // TODO: Consider using dynamic LoRA rank dimension instead of static dimension + } + ); + + lora_state_evaluators.insert(signature, std::make_shared<ov::Model>(results, parameters)); + } + auto output_tensors = to_tensor_vector(outputs); + lora_state_evaluators.evaluate(signature, to_tensor_vector(inputs), output_tensors); + return outputs; + } + + ov::Shape dynamic_to_static(const ov::PartialShape& pshape) { + ov::Shape shape(pshape.rank().get_length()); + for(size_t i = 0; i < pshape.rank().get_length(); ++i) { + shape[i] = pshape[i].is_dynamic() ? 0 : pshape[i].get_length(); + } + return shape; + } + + void set_lora_tensors(std::vector<VariableState>& state, const LoRAVarIDs& lora_var_ids, const LoRAIndices& lora_indices, const std::vector<LoRAWeightGetter>& weight_getters) { + LoRAParts<ov::Tensor> lora_state_tensors{ + ov::Tensor(lora_var_ids.alpha.data_type, dynamic_to_static(lora_var_ids.alpha.data_shape)), + ov::Tensor(lora_var_ids.A.data_type, dynamic_to_static(lora_var_ids.A.data_shape)), + ov::Tensor(lora_var_ids.B.data_type, dynamic_to_static(lora_var_ids.B.data_shape)) + }; + auto new_tensors = prepare_lora_tensors(lora_indices.name, weight_getters, lora_state_tensors); + + state[lora_indices.alpha].set_state(new_tensors.alpha); + state[lora_indices.A].set_state(new_tensors.A); + state[lora_indices.B].set_state(new_tensors.B); + } + + LoRAParts<ov::Tensor> prepare_lora_tensors ( + const std::string& name, + const std::vector<LoRAWeightGetter>& weight_getters, + LoRAParts<ov::Tensor>& output, + bool set_empty_adapters = true + ) { + auto lora_tensors = collect_applicable_tensors(name, weight_getters); + LoRAParts<ov::Tensor> new_tensors; + if(!lora_tensors.empty()) { + new_tensors = concat_adapters(lora_tensors, output); + } else if(set_empty_adapters) { // FIXME: Make it as a separate step outside of this function + new_tensors = empty_adapters(lora_tensors, output); + } + return new_tensors; + } +}; + + +AdapterController::AdapterController(std::shared_ptr<ov::Model> model, const AdapterConfig& config, const std::string& prefix, std::string device) +{ + // If AdapterConfig::MODE_AUTO is used, then set real mode depending on the device capabilities + // TODO: Remove this code when devices become aligned on their capabilities for LoRA adapaters + if (config.get_mode() == AdapterConfig::MODE_AUTO) { + static const std::map<std::string, AdapterConfig::Mode> default_modes { + {"CPU", AdapterConfig::MODE_DYNAMIC}, + {"GPU", AdapterConfig::MODE_STATIC_RANK}, + {"NPU", AdapterConfig::MODE_STATIC}, + }; + if(device.find("GPU") != std::string::npos) { // to handle GPU device variants which doesn't matter for adapter mode + device = "GPU"; + } + auto default_mode = 
default_modes.find(device); + if(default_mode != default_modes.end()) { + AdapterConfig updated_config = config; + updated_config.set_mode(default_mode->second); + m_pimpl = std::make_shared<AdapterControllerImpl>(model, updated_config, prefix); + return; + } else { + std::string device_msg; + if(device.empty()) { + device_msg = "No device set"; + } else { + device_msg = "Device \"" + device + "\" is unrecognized"; + } + std::cout + << "[ WARNING ] " << device_msg << " to deduce default device-dependent LoRA application mode.\n" + << "This warning appears because no specific LoRA mode was set in AdapterConfig or MODE_AUTO was used explicitly.\n" + << "To avoid this warning set one of the AdapterConfig::Mode values except MODE_AUTO."; + } + } + m_pimpl = std::make_shared<AdapterControllerImpl>(model, config, prefix); +} + + +// Call it every time when adapter config is changed; if adapter was configured as a static one, this call is not required +void AdapterController::apply(ov::InferRequest& request, const std::optional<AdapterConfig>& config) { + OPENVINO_ASSERT(m_pimpl || !config || !*config, + "Adapters are passed to AdapterController but it was not configured to use adapters. " + "Enable using adapters by pass them in the constructor first."); + if (m_pimpl) { + m_pimpl->apply(request, config); + } +} + + +void AdapterController::force_full_apply(bool full_apply) { + return m_pimpl->force_full_apply(full_apply); +} + + +void AdapterConfig::set_mode(Mode _mode) { + mode = _mode; +} + + +AdapterConfig::AdapterConfig (const std::vector<Adapter>& adapters, Mode mode) : mode(mode), adapters(adapters) { + alphas.reserve(adapters.size()); + for(const auto& adapter: adapters) { + auto const alpha = 1; + alphas.push_back(alpha); + } +} + + +AdapterConfig::AdapterConfig (const std::vector<std::pair<Adapter, float>>& _adapters, Mode mode) : mode(mode) { + adapters.reserve(_adapters.size()); + alphas.reserve(_adapters.size()); + for(auto const& adapter_and_alpha: _adapters) { + adapters.push_back(adapter_and_alpha.first); + alphas.push_back(adapter_and_alpha.second); + } +} + + +AdapterConfig::AdapterConfig(Mode mode) : mode(mode) {} + + +AdapterConfig& AdapterConfig::add(const Adapter& adapter, float alpha) { + OPENVINO_ASSERT(adapters.size() == alphas.size()); + OPENVINO_ASSERT(adapters.end() == std::find(adapters.begin(), adapters.end(), adapter), "Adapter object passed to AdapterConfig::add was already registered"); + adapters.push_back(adapter); + alphas.push_back(alpha); + return *this; +} + + +AdapterConfig& AdapterConfig::add(const Adapter& adapter) { + return add(adapter, 1); +} + + +AdapterConfig& AdapterConfig::set_alpha(const Adapter& adapter, float alpha) { + OPENVINO_ASSERT(adapters.size() == alphas.size()); + auto it = std::find(adapters.begin(), adapters.end(), adapter); + OPENVINO_ASSERT(adapters.end() != it, "Unknown adapter object passed to AdapterConfig::set_alpha, register adapter object first with AdapterConfig::add"); + auto index = it - adapters.begin(); + alphas[index] = alpha; + return *this; +} + + +float AdapterConfig::get_alpha(const Adapter& adapter) const { + OPENVINO_ASSERT(adapters.size() == alphas.size()); + auto it = std::find(adapters.begin(), adapters.end(), adapter); + OPENVINO_ASSERT(adapters.end() != it, "Unknown adapter object passed to AdapterConfig::get_alpha, alpha can be retrieved for previously registered adatpers only"); + return alphas[it - adapters.begin()]; +} + + +AdapterConfig& AdapterConfig::remove(const Adapter& adapter) { + 
OPENVINO_ASSERT(adapters.size() == alphas.size()); + auto it = std::find(adapters.begin(), adapters.end(), adapter); + OPENVINO_ASSERT(adapters.end() != it, "Unknown adapter object passed to AdapterConfig::remove, you can remove previously registered adapters only"); + auto index = it - adapters.begin(); + alphas.erase(alphas.begin() + index); + adapters.erase(it); + return *this; +} + + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/lora_helper.cpp b/src/cpp/src/lora_helper.cpp new file mode 100644 index 0000000000..7e7a6e613c --- /dev/null +++ b/src/cpp/src/lora_helper.cpp @@ -0,0 +1,28 @@ +#include "lora_helper.hpp" + + +namespace ov { +namespace genai { + +std::optional<AnyMap> extract_adapters_from_properties (const AnyMap& properties, AdapterConfig* adapter_config) { + auto adapters_iter = properties.find(AdaptersProperty::name()); + if (adapters_iter != properties.end()) { + if(adapter_config) { + *adapter_config = adapters_iter->second.as<AdapterConfig>(); + } + auto filtered_properties = properties; + filtered_properties.erase(AdaptersProperty::name()); + return filtered_properties; + } + return std::nullopt; +} + +void update_adapters_from_properties (const AnyMap& properties, AdapterConfig& adapter_config) { + auto adapters_iter = properties.find(AdaptersProperty::name()); + if (adapters_iter != properties.end()) { + adapter_config = adapters_iter->second.as<AdapterConfig>(); + } +} + +} +} \ No newline at end of file diff --git a/src/cpp/src/lora_helper.hpp b/src/cpp/src/lora_helper.hpp new file mode 100644 index 0000000000..b9e41e8b4c --- /dev/null +++ b/src/cpp/src/lora_helper.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include <optional> + +#include "openvino/genai/lora_adapter.hpp" + + +namespace ov { +namespace genai { + +// Search for `adapters` property in `properties` map. If it is found and `adapter_config` is not nullptr, +// set `adapter_config` with found value, and return a copy of `properties` with the `adapters` property removed. +// If there is no `adapters` property, `adapter_config` is left unchanged and std::nullopt is returned. +std::optional<AnyMap> extract_adapters_from_properties (const AnyMap& properties, AdapterConfig* adapter_config = nullptr); + +// Search for `adapters` property in `properties` map. If it is found, set `adapter_config` with found value. +// If `adapters` property is not found, do nothing. 
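+// A hypothetical call site (sketch only, the surrounding names are illustrative): a pipeline constructor can
+// strip the `adapters` entry before the property map reaches the plugin, keeping the config for an AdapterController:
+//     AdapterConfig adapter_config;
+//     auto filtered = extract_adapters_from_properties(properties, &adapter_config);
+//     compiled_model = core.compile_model(model, device, filtered.value_or(properties));
+// while update_adapters_from_properties() below is the lighter variant that refreshes an already stored config
+// without filtering the map.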
+void update_adapters_from_properties (const AnyMap& properties, AdapterConfig& adapter_config); + +} +} \ No newline at end of file diff --git a/src/cpp/src/make_combine_segments_stateful.cpp b/src/cpp/src/make_combine_segments_stateful.cpp new file mode 100644 index 0000000000..2285c172dc --- /dev/null +++ b/src/cpp/src/make_combine_segments_stateful.cpp @@ -0,0 +1,46 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "make_combine_segments_stateful.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/select.hpp" +#include "openvino/op/read_value.hpp" +#include "openvino/op/assign.hpp" + + +using namespace ov; +using namespace ov::op; + +bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) { + + std::shared_ptr<ov::Node> combine_seg_node; + for (auto node: model->get_ordered_ops()) { + if (strcmp(node->get_type_info().name, "CombineSegments") == 0) { + combine_seg_node = node; + } + } + if (!combine_seg_node || combine_seg_node->input_value(1).get_element_type() != ov::element::i32) { + return false; + } + + std::shared_ptr<v0::Constant> input_1_const = std::dynamic_pointer_cast<v0::Constant>(combine_seg_node->get_input_node_shared_ptr(1)); + if (!input_1_const) { + return false; + } + + op::util::VariableInfo var_info{ov::Shape{}, ov::element::boolean, ADD_SPECIAL_TOKENS_VAR_ID}; + auto variable = std::make_shared<op::util::Variable>(var_info); + + // Default mode is add_special_tokens. + auto default_mode_const = std::make_shared<v0::Constant>(ov::element::boolean, ov::Shape{}, std::vector{true}); + auto read_value = std::make_shared<v6::ReadValue>(default_mode_const, variable); + auto zero_constant = std::make_shared<v0::Constant>(ov::element::i32, ov::Shape{}, std::vector{0}); + auto select_node = std::make_shared<v1::Select>(read_value, input_1_const, zero_constant); + combine_seg_node->input(1).replace_source_output(select_node->output(0)); + + auto assign = std::make_shared<v6::Assign>(read_value, variable); + + model->add_sinks({assign}); + model->add_variables({variable}); + return true; +} diff --git a/src/cpp/src/make_combine_segments_stateful.hpp b/src/cpp/src/make_combine_segments_stateful.hpp new file mode 100644 index 0000000000..f81f8f08d6 --- /dev/null +++ b/src/cpp/src/make_combine_segments_stateful.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/op/constant.hpp" +#include "openvino/pass/pass.hpp" + +namespace ov { +namespace genai { + +/** + * @brief This pass modifies tokenizer ov::Model so that special tokens adding will be + * enabled or diabled depending on stateful value. 
+ * + * +--------------+ + * | DefaultMode | + * +--------------+ + * | + * | + * v + * +--------------+ +--------+ +------------------+ + * | ReadValue | | ends | | const value = 0 | + * +--------------+ +--------+ +------------------+ + * \ | / + * \ | / + * v v v + * +--------------+ + * | Select | + * +--------------+ + * | + * v + * +-------------------------+ + * | CombineSegments | + * +-------------------------+ +**/ +class MakeCombineSegmentsSatateful : public ov::pass::ModelPass { +public: + OPENVINO_RTTI("MakeCombineSegmentsSatateful", "0"); + bool run_on_model(const std::shared_ptr<ov::Model>& model) override; +}; + +const std::string ADD_SPECIAL_TOKENS_VAR_ID = "add_special_tokens"; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/model_runner.hpp b/src/cpp/src/model_runner.hpp new file mode 100644 index 0000000000..e3d482b958 --- /dev/null +++ b/src/cpp/src/model_runner.hpp @@ -0,0 +1,276 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <vector> +#include <cstdlib> + +#include <openvino/runtime/infer_request.hpp> + +#include "debug_utils.hpp" +#include "sequence_group.hpp" +#include "scheduler.hpp" +#include "timer.hpp" + +#include "attention_output.hpp" + +namespace ov::genai { + +inline std::string get_paged_attention_score_output_for_decoder_layer(size_t decoder_layer_id) { + std::stringstream ss; + ss << "scores." << decoder_layer_id; + return ss.str(); +} + +/** + * @brief Runs the LLM infer request, parsing the continuous batching scheduler output into proper inputs in terms of OV API (e.g. token input IDs, + * KV cache block indices etc.) and returning the logit scores for the next token to be generated for each of the currently scheduled sequences. + */ +class ModelRunner { + ov::InferRequest m_request; + SchedulerConfig m_scheduler_config; + AttentionScoresForEachSubsequence m_last_attention_scores; + size_t m_num_decoder_layers; + bool m_collect_attention_scores; +public: + /** + * Constructs the ModelRunner. + * @param request The ov::InferRequest for the LLM to be inferred in the continous batching mode. + * @param scheduler_config Configuration struct for the scheduler that is to be used with this ModelRunner. + * @param num_decoder_layers Number of decoder attention layers in the LLM corresponding to the request. + * @param collect_attention_scores If true, then after each `forward` call the ModelRunner will collect and make available the per-token attention + * scores for each decoder layer, so that these can be used in per-step cache optimizations (such as cache eviction algorithm). + */ + ModelRunner(ov::InferRequest request, const SchedulerConfig& scheduler_config, size_t num_decoder_layers = 1, bool collect_attention_scores = false) : + m_request(std::move(request)), + m_scheduler_config(scheduler_config), + m_num_decoder_layers(num_decoder_layers), + m_collect_attention_scores(collect_attention_scores) { + OPENVINO_ASSERT(m_num_decoder_layers != 0, "num_decoder_layers must be non-zero"); + } + + /** + * @return The ov::InferRequest this ModelRunner is handling. + */ + ov::InferRequest get_infer_request() const { + return m_request; + } + + /** + * @return A map of sequence IDs to vectors of ov::Tensor per-token attention scores. Each vector element is associated with its own + * decoder layer, in order of their execution in the model. Each ov::Tensor has a shape of {N_k}, where N_k is the length of + * a sequence with ID k processed during the previous `forward` call. 
+ */ + const AttentionScoresForEachSubsequence& get_last_attention_scores() const { + return m_last_attention_scores; + } + + /** + * Runs the forward inference call on the underlying LLM's ov::InferRequest, scheduling for inferencing tokens for given sequences + * taking into account the supplied scheduler output struct. + * @param sequence_groups A vector of pointers to sequence groups to be processed during this `forward` call + * @param scheduler_output The scheduler output struct with information on the specifics of the token scheduling during this forward call + * @return An ov::Tensor with next-token logit scores for each sequence processed during this `forward` call. + */ + ov::Tensor forward(const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output) { + size_t num_sequence_groups = scheduler_output.m_scheduled_sequence_groups_ids.size(); + size_t batch_size_in_sequences = 0; + size_t total_num_tokens = 0, total_num_blocks = 0; + size_t max_context_len_val = 0; + + // compute aggregated values + for (size_t i = 0; i < num_sequence_groups; ++i) { + size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i]; + SequenceGroup::CPtr sequence_group = sequence_groups[seq_group_id]; + size_t num_sequences = sequence_group->num_running_seqs(); + batch_size_in_sequences += num_sequences; + total_num_tokens += sequence_group->get_num_scheduled_tokens() * num_sequences; + total_num_blocks += sequence_group->get_num_blocks() * num_sequences; + max_context_len_val = std::max(max_context_len_val, sequence_group->get_context_len()); + } + + ov::Tensor + input_ids(ov::element::i64, {total_num_tokens}), + position_ids(ov::element::i64, {total_num_tokens}), + // PA specific parameters + past_lens(ov::element::i32, {batch_size_in_sequences}), + subsequence_begins(ov::element::i32, {batch_size_in_sequences + 1}), + // block_indices are handled in a special fashion below + block_indices_begins(ov::element::i32, {batch_size_in_sequences + 1}), + max_context_len(ov::element::i32, {}); + + max_context_len.data<int32_t>()[0] = max_context_len_val; + + // get raw pointers to copy to + int64_t + * input_ids_data = input_ids.data<int64_t>(), + * position_ids_data = position_ids.data<int64_t>(); + int32_t + * past_lens_data = past_lens.data<int32_t>(), + * subsequence_begins_data = subsequence_begins.data<int32_t>(), + * block_indices_begins_data = block_indices_begins.data<int32_t>(); + + // sub-sequence data starts with 0 + subsequence_begins_data[0] = 0; + block_indices_begins_data[0] = 0; + + for (size_t i = 0; i < num_sequence_groups; ++i) { + size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i]; + SequenceGroup::CPtr sequence_group = sequence_groups[seq_group_id]; + std::vector<Sequence::CPtr> running_sequences = sequence_group->get_running_sequences(); + size_t num_running_sequences = running_sequences.size(); + size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); + size_t group_position_id = sequence_group->get_num_processed_tokens(); + + // spec: In case of multiple input tokens for current sequence (prompt_len > 1), + // context_len corresponds to first token within subgroup of scheduled tokens + size_t group_context_len = group_position_id; + + for (size_t seq_id = 0; seq_id < num_running_sequences; ++seq_id) { + Sequence::CPtr sequence = running_sequences[seq_id]; + + for (size_t token_id = 0, position_id = group_position_id; token_id < num_scheduled_tokens; ++token_id, ++position_id) { + // compute 
token for current sequence + input_ids_data[token_id] = position_id < sequence_group->get_prompt_len() ? + sequence_group->get_prompt_ids()[position_id] : + sequence->get_generated_ids()[position_id - sequence_group->get_prompt_len()]; + + position_ids_data[token_id] = position_id; + } + + size_t expected_kv_cache_size = sequence_group->get_num_processed_tokens() - sequence_group->get_num_evicted_tokens(); + past_lens_data[0] = expected_kv_cache_size; + + subsequence_begins_data[1] = subsequence_begins_data[0] + num_scheduled_tokens; + + size_t num_blocks = (sequence_group->get_context_len() - sequence_group->get_num_evicted_tokens() + m_scheduler_config.block_size - 1) / m_scheduler_config.block_size; + block_indices_begins_data[1] = block_indices_begins_data[0] + num_blocks; + + // apply strides to shift to a next sequence + input_ids_data += num_scheduled_tokens; + position_ids_data += num_scheduled_tokens; + past_lens_data += 1; + subsequence_begins_data += 1; + block_indices_begins_data += 1; + } + } + + // typical LLM parameters + m_request.set_tensor("input_ids", input_ids); + m_request.set_tensor("position_ids", position_ids); + + // PA specific parameters + m_request.set_tensor("past_lens", past_lens); + m_request.set_tensor("subsequence_begins", subsequence_begins); + + _set_block_indices(m_request, sequence_groups, scheduler_output, total_num_blocks); + + m_request.set_tensor("block_indices_begins", block_indices_begins); + m_request.set_tensor("max_context_len", max_context_len); + + // print_tensor("input_ids", input_ids); + // print_tensor("position_ids", position_ids); + + // print_tensor("past_lens", past_lens); + // print_tensor("subsequence_begins", subsequence_begins); + // print_tensor("block_indices", block_indices); + // print_tensor("block_indices_begins", block_indices_begins); + // print_tensor("max_context_len", max_context_len); + + { + static ManualTimer timer("pure generate inference"); + timer.start(); + m_request.infer(); + timer.end(); + } + + if (m_collect_attention_scores && m_scheduler_config.use_cache_eviction) { + _collect_attention_scores(sequence_groups, scheduler_output); + } + + // return logits + return m_request.get_tensor("logits"); + } + +private: + void _set_block_indices(ov::InferRequest& infer_request, const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output, + size_t total_num_blocks) { + size_t num_sequence_groups = scheduler_output.m_scheduled_sequence_groups_ids.size(); + std::vector<std::string> tensor_names = {"block_indices"}; + + if (m_scheduler_config.use_cache_eviction) { + tensor_names.resize(m_num_decoder_layers); + for (size_t i = 0; i < tensor_names.size(); i++) { + tensor_names[i] = std::string("block_indices.") + std::to_string(i); + } + } + + for (auto& name : tensor_names) { + m_request.get_tensor(name).set_shape({total_num_blocks}); + } + + size_t block_offset = 0; + for (size_t i = 0; i < num_sequence_groups; ++i) { + size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i]; + SequenceGroup::CPtr sequence_group = sequence_groups[seq_group_id]; + std::vector<Sequence::CPtr> running_sequences = sequence_group->get_running_sequences(); + size_t num_running_sequences = running_sequences.size(); + + for (size_t seq_id = 0; seq_id < num_running_sequences; ++seq_id) { + Sequence::CPtr sequence = running_sequences[seq_id]; + + size_t num_blocks = (sequence_group->get_context_len() - sequence_group->get_num_evicted_tokens() + m_scheduler_config.block_size - 1) / 
m_scheduler_config.block_size; + const auto & kv_blocks = scheduler_output.m_block_tables.at(sequence->get_id()); + + for (size_t layer_idx = 0; layer_idx < tensor_names.size(); layer_idx++) { + auto input_tensor = infer_request.get_tensor(tensor_names[layer_idx]); + auto block_indices_data = input_tensor.data<int32_t>() + block_offset; + for (size_t block_id = 0; block_id < num_blocks; ++block_id) + // In case no cache eviction is requested, all per-layer block tables are expected to be identical + // at all times + block_indices_data[block_id] = kv_blocks[layer_idx][block_id]->get_index(); + } + + block_offset += num_blocks; + } + } + } + + void _collect_attention_scores(const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output) { + m_last_attention_scores.clear(); + size_t num_sequence_groups = scheduler_output.m_scheduled_sequence_groups_ids.size(); + using IndexSpan = std::pair<size_t, size_t>; + std::list<std::pair<size_t, IndexSpan>> running_sequence_group_ids_and_kvcache_spans; + size_t offset = 0; + for (size_t i = 0; i < num_sequence_groups; ++i) { + size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i]; + SequenceGroup::CPtr sequence_group = sequence_groups[seq_group_id]; + std::vector<Sequence::CPtr> running_sequences = sequence_group->get_running_sequences(); + + for (size_t seq_id = 0; seq_id < running_sequences.size(); ++seq_id) { + Sequence::CPtr sequence = running_sequences[seq_id]; + size_t subsequence_length = sequence_group->get_context_len() - sequence_group->get_num_evicted_tokens(); + IndexSpan span = {offset, offset + subsequence_length}; + size_t global_sequence_id = sequence->get_id(); + running_sequence_group_ids_and_kvcache_spans.emplace_back(global_sequence_id, span); + offset += subsequence_length; + } + } + + for (const auto& seq_id_and_score_span : running_sequence_group_ids_and_kvcache_spans) { + auto attention_scores_across_decoder_layers_for_current_sequence = AttentionScoresForEachDecoderLayer(m_num_decoder_layers); + size_t global_sequence_id = seq_id_and_score_span.first; + IndexSpan span = seq_id_and_score_span.second; + for (size_t decoder_layer_id = 0; decoder_layer_id < m_num_decoder_layers; decoder_layer_id++) { + auto attention_score = m_request.get_tensor(get_paged_attention_score_output_for_decoder_layer(decoder_layer_id)); + auto scores_for_cache_of_current_sequence_group = ov::Tensor(attention_score, ov::Coordinate{span.first}, ov::Coordinate{span.second}); + auto copied_tensor = ov::Tensor(scores_for_cache_of_current_sequence_group.get_element_type(), ov::Shape{span.second - span.first}); + scores_for_cache_of_current_sequence_group.copy_to(copied_tensor); + attention_scores_across_decoder_layers_for_current_sequence[decoder_layer_id] = scores_for_cache_of_current_sequence_group; + } + m_last_attention_scores[global_sequence_id] = attention_scores_across_decoder_layers_for_current_sequence; + } + } +}; +} diff --git a/src/cpp/src/multinomial_decoding.cpp b/src/cpp/src/multinomial_decoding.cpp new file mode 100644 index 0000000000..7e298b5598 --- /dev/null +++ b/src/cpp/src/multinomial_decoding.cpp @@ -0,0 +1,268 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <algorithm> +#include <cmath> +#include <iostream> +#include <numeric> +#include <random> +#include <regex> +#include <vector> + +#include "openvino/genai/llm_pipeline.hpp" +#include "utils.hpp" + + +namespace { + +struct TokenIdScore { + int64_t id; + float score; + + bool 
operator<(const TokenIdScore& other) const { + return score < other.score; + } + + bool operator>(const TokenIdScore& other) const { + return score > other.score; + } +}; + +void apply_softmax_inplace(std::vector<TokenIdScore>& tokens) { + float max_score = std::max_element(tokens.begin(), tokens.end())->score; + float sum = 0.f; + + for (auto& token : tokens) { + float s = std::exp(token.score - max_score); + token.score = s; + sum += s; + } + + float inv_sum = 1.f / sum; + + for (auto& token : tokens) { + token.score *= inv_sum; + } +} + +TokenIdScore* sample_top_p(TokenIdScore* first, TokenIdScore* last, float top_p) { + // sort score + std::sort(first, last, std::greater<TokenIdScore>()); + + int tokens_size = last - first; + std::vector<TokenIdScore> token_scores(tokens_size); + for (size_t i = 0; i < tokens_size; i++) { + token_scores[i] = first[i]; + } + + // calculate softmax + apply_softmax_inplace(token_scores); + + float prefix_sum = 0.0f; + + // top_p + for (size_t i = 0; i < tokens_size; i++) { + prefix_sum += token_scores[i].score; + if (prefix_sum >= top_p) { + return first + (i + 1); + } + } + + return last; +} + +void apply_repetition_penalty(float* first, float* last, const std::vector<int64_t>& input_ids, float penalty) { + const float inv_penalty = 1.f / penalty; + const int vocab_size = last - first; + std::vector<bool> occurrence(vocab_size, false); + for (const int64_t id : input_ids) { + if (!occurrence[id]) { + first[id] *= (first[id] > 0) ? inv_penalty : penalty; + } + occurrence[id] = true; + } +} + +void apply_inv_temperature(float* first, float* last, float inv_temperature) { + for (float* it = first; it != last; it++) { + *it *= inv_temperature; + } +} + +struct RandomSampling { + const size_t top_k; + const float top_p; + const float inv_temperature; + const float repetition_penalty; + + std::mt19937 gen{std::random_device{}()}; + + RandomSampling(ov::genai::GenerationConfig generation_config) + : top_k{generation_config.top_k}, + top_p{generation_config.top_p}, + inv_temperature{1.f / generation_config.temperature}, + repetition_penalty{generation_config.repetition_penalty} { + } + + TokenIdScore get_out_token(float* logits, size_t vocab_size, const std::vector<int64_t>& tokens) { + // logits pre-process + if (repetition_penalty != 1.0f) { + apply_repetition_penalty(logits, logits + vocab_size, tokens, repetition_penalty); + } + + if (inv_temperature != 1.0f) { + apply_inv_temperature(logits, logits + vocab_size, inv_temperature); + } + + std::vector<TokenIdScore> token_scores(vocab_size); + for (size_t i = 0; i < vocab_size; i++) { + token_scores[i] = TokenIdScore{int64_t(i), logits[i]}; + } + + // top_k sampling + if (0 < top_k && top_k < token_scores.size()) { + std::nth_element(token_scores.data(), + token_scores.data() + top_k, + token_scores.data() + token_scores.size(), + std::greater<TokenIdScore>()); + token_scores.resize(top_k); + } + + // top_p sampling + if (0.f < top_p && top_p < 1.0f) { + auto pos = sample_top_p(token_scores.data(), token_scores.data() + token_scores.size(), top_p); + token_scores.resize(pos - token_scores.data()); + } + + // sample next token + apply_softmax_inplace(token_scores); + for (size_t i = 0; i < token_scores.size(); i++) { + logits[i] = token_scores[i].score; + } + + std::discrete_distribution<> dist(logits, logits + token_scores.size()); + return token_scores[dist(gen)]; + } +}; +} // namespace + +namespace ov { +namespace genai { + +ov::genai::EncodedResults multinominal_decoding(ov::InferRequest& m_model_runner, + 
ov::Tensor input_ids, + ov::Tensor attention_mask, + ov::genai::GenerationConfig config, + std::shared_ptr<ov::genai::StreamerBase> streamer, + std::optional<ov::Tensor> position_ids) { + ov::Shape prompts_shape = input_ids.get_shape(); + const size_t batch_size = prompts_shape[0]; + + OPENVINO_ASSERT(batch_size == 1, "Only batch size = 1 supported for multinomial decoding"); + + const size_t prompt_len = prompts_shape[1]; + const size_t max_new_tokens = config.get_max_new_tokens(prompt_len); + + // Initialize results and performance metrics. + EncodedResults results; + auto& raw_perf_counters = results.perf_metrics.raw_metrics; + raw_perf_counters.m_new_token_times.reserve(max_new_tokens); + raw_perf_counters.m_batch_sizes.reserve(max_new_tokens); + raw_perf_counters.m_token_infer_durations.reserve(max_new_tokens); + raw_perf_counters.m_inference_durations = {{ MicroSeconds(0.0f) }}; + results.scores.resize(batch_size, 0); + results.tokens.resize(batch_size); + + // Initialize inputs + m_model_runner.set_tensor("input_ids", input_ids); + m_model_runner.set_tensor("attention_mask", attention_mask); + + if (position_ids.has_value()) + m_model_runner.set_tensor("position_ids", *position_ids); + + // Input values are persistent between inference calls. + // That allows to set values, which aren't going to change, only once + m_model_runner.get_tensor("beam_idx").set_shape({batch_size}); + m_model_runner.get_tensor("beam_idx").data<int32_t>()[0] = 0; + + const auto infer_start = std::chrono::steady_clock::now(); + m_model_runner.infer(); + const auto infer_end = std::chrono::steady_clock::now(); + const auto infer_ms = PerfMetrics::get_microsec(infer_end - infer_start); + raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_ms); + raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms); + raw_perf_counters.m_new_token_times.emplace_back(infer_end); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); + + auto logits_tensor = m_model_runner.get_tensor("logits"); + + int64_t sequence_offset = logits_tensor.get_shape().at(1) - 1; + size_t vocab_size = logits_tensor.get_shape().back(); + + float* logits = logits_tensor.data<float>() + sequence_offset * vocab_size; + + const int64_t* input_ids_data = input_ids.data<const int64_t>(); + + std::vector<int64_t> tokens{input_ids_data, input_ids_data + input_ids.get_size()}; + + RandomSampling sampling{config}; + + TokenIdScore out_token = sampling.get_out_token(logits, vocab_size, tokens); + + tokens.push_back(out_token.id); + results.tokens[0].push_back(out_token.id); + results.scores[0] += out_token.score; + + if (streamer && streamer->put(out_token.id)) { + return results; + } + + if (!config.ignore_eos && out_token.id == config.eos_token_id) { + return results; + } + + m_model_runner.get_tensor("input_ids").set_shape({batch_size, 1}); + + for (size_t i = 0; i < max_new_tokens - 1; i++) { + if (position_ids.has_value()) { + ov::genai::utils::update_position_ids(m_model_runner.get_tensor("position_ids"), + m_model_runner.get_tensor("attention_mask")); + } + m_model_runner.set_tensor("attention_mask", + ov::genai::utils::extend_attention(m_model_runner.get_tensor("attention_mask"))); + + m_model_runner.get_tensor("input_ids").data<int64_t>()[0] = out_token.id; + + const auto infer_start = std::chrono::steady_clock::now(); + m_model_runner.infer(); + const auto infer_end = std::chrono::steady_clock::now(); + const auto infer_ms = PerfMetrics::get_microsec(infer_end - infer_start); + raw_perf_counters.m_inference_durations[0] 
+= MicroSeconds(infer_ms); + raw_perf_counters.m_token_infer_durations.emplace_back(infer_ms); + raw_perf_counters.m_new_token_times.emplace_back(infer_end); + raw_perf_counters.m_batch_sizes.emplace_back(batch_size); + + logits = m_model_runner.get_tensor("logits").data<float>(); + out_token = sampling.get_out_token(logits, vocab_size, tokens); + + tokens.push_back(out_token.id); + results.tokens[0].push_back(out_token.id); + results.scores[0] += out_token.score; + + if (streamer && streamer->put(out_token.id)) { + return results; + } + + if (!config.ignore_eos && out_token.id == config.eos_token_id) { + break; + } + } + + if (streamer) { + streamer->end(); + } + + return results; +} +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/paged_attention_transformations.cpp b/src/cpp/src/paged_attention_transformations.cpp new file mode 100644 index 0000000000..28dda4dea3 --- /dev/null +++ b/src/cpp/src/paged_attention_transformations.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/pass/manager.hpp" +#include "openvino/pass/sdpa_to_paged_attention.hpp" + +#include "paged_attention_transformations.hpp" +#include "cache_manager.hpp" + +namespace ov::genai { +inline ov::PartialShape to_partial_with_dyn_0_dim(const ov::Shape& static_shape) { + ov::PartialShape partial_shape = static_shape; + partial_shape[0] = ov::Dimension::dynamic(); + return partial_shape; +} + +/** Applies transformations to the ov::Model to enable paged attention inference. + * @param model Pointer to the ov::Model representing one of the supported LLM architectures. + * @param device_config Configuration struct for inferencing device specifics. + * @param per_layer_cache_control If true, then the transformations will enable per-layer control of KV cache blocks, allowing to specify + * different sets of KV cache blocks for different attention layers. If false, then the KV cache block structure will be identical across all + * decoder layers. 
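+ *
+ * A sketch of the expected usage (illustrative only; the real call site lives in the continuous batching
+ * pipeline): the transformation is applied before compilation and feeds the discovered num_kv_heads /
+ * head_size / num_layers into `device_config`, which the subsequent KV cache allocation relies on, e.g.
+ *     apply_paged_attention_transformations(model, device_config, scheduler_config.use_cache_eviction);
+ *     // ...allocate KV cache blocks based on device_config, then compile the model and create the infer request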
+ */ +void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, DeviceConfig& device_config, bool per_layer_cache_control) { + const ov::op::util::VariableVector& variables = model->get_variables(); + OPENVINO_ASSERT(!variables.empty(), "Model is supposed to be stateful"); + + bool use_block_indices_inputs = per_layer_cache_control; + bool use_score_outputs = per_layer_cache_control; + ov::pass::SDPAToPagedAttention(use_block_indices_inputs, use_score_outputs).run_on_model(model); + + const ov::ParameterVector& parameters = model->get_parameters(); + + std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>> key_cache_params; + std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>> value_cache_params; + for (const auto& param_ptr : parameters) { + const auto& name = param_ptr->get_friendly_name(); + if (name.find("key_cache.") == 0) { + key_cache_params[name] = param_ptr; + } + else if (name.find("value_cache.") == 0) { + value_cache_params[name] = param_ptr; + } + } + + OPENVINO_ASSERT(key_cache_params.size() == value_cache_params.size()); + OPENVINO_ASSERT(key_cache_params.size() > 0); + + size_t num_layers = key_cache_params.size(); + // extract num_kv_heads and head_size + std::string key_cache_param_name = "key_cache.0"; + OPENVINO_ASSERT(key_cache_params.count(key_cache_param_name) != 0, "key_cache.0 tensor not found among model parameters"); + ov::PartialShape k_shape = key_cache_params[key_cache_param_name]->get_partial_shape(); + OPENVINO_ASSERT(k_shape.rank().get_length() == 3, "KV cache shape is expected to have rank 3, while shape is ", k_shape); + size_t num_kv_heads = k_shape[1].get_length(), head_size = k_shape[2].get_length(); + + device_config.set_model_params(num_kv_heads, head_size, num_layers); + + for (auto it_k = key_cache_params.begin(), it_v = value_cache_params.begin(); it_k != key_cache_params.end();++it_k, ++it_v) { + it_k->second->set_element_type(device_config.get_cache_precision()); + it_v->second->set_element_type(device_config.get_cache_precision()); + // TODO: CVS-145270 + it_k->second->set_partial_shape(to_partial_with_dyn_0_dim(device_config.get_key_cache_shape())); + it_v->second->set_partial_shape(to_partial_with_dyn_0_dim(device_config.get_value_cache_shape())); + } + + model->validate_nodes_and_infer_types(); +} +} diff --git a/src/cpp/src/paged_attention_transformations.hpp b/src/cpp/src/paged_attention_transformations.hpp new file mode 100644 index 0000000000..a7bce23757 --- /dev/null +++ b/src/cpp/src/paged_attention_transformations.hpp @@ -0,0 +1,11 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/core/model.hpp" +#include "device_config.hpp" + +namespace ov::genai { +void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, DeviceConfig& device_config, bool per_layer_cache_control = false); +} \ No newline at end of file diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp new file mode 100644 index 0000000000..3bdc1b27a5 --- /dev/null +++ b/src/cpp/src/perf_metrics.cpp @@ -0,0 +1,178 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/perf_metrics.hpp" +#include "openvino/openvino.hpp" +#include <tuple> +#include <numeric> +#include <cmath> + +namespace { + +ov::genai::MeanStdPair calc_mean_and_std(const std::vector<ov::genai::MicroSeconds>& durations) { + if (durations.size() == 0) { + return {-1, -1}; + } + // Accepts time durations 
in microseconds and returns standard deviation and mean in milliseconds. + float mean = std::accumulate(durations.begin(), durations.end(), 0.0f, + [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { + return acc + duration.count() / 1000.0f; + }); + mean /= durations.size(); + + float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, + [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { + auto d = duration.count() / 1000.0f; + return acc + d * d; + }); + float std = std::sqrt(sum_square_durations / durations.size() - mean * mean); + return {mean, std}; +} + + +} // namespace + +namespace ov { +namespace genai { + +float PerfMetrics::get_load_time() { + return load_time; +} + +size_t PerfMetrics::get_num_generated_tokens() { + evaluate_statistics(); + return num_generated_tokens; +} + +size_t PerfMetrics::get_num_input_tokens() { + evaluate_statistics(); + return num_input_tokens; +} + +MeanStdPair PerfMetrics::get_ttft() { + evaluate_statistics(); + return ttft; +} + +MeanStdPair PerfMetrics::get_tpot() { + evaluate_statistics(); + return tpot; +} + +MeanStdPair PerfMetrics::get_ipot() { + evaluate_statistics(); + return ipot; +} + +MeanStdPair PerfMetrics::get_throughput() { + evaluate_statistics(); + return throughput; +} + +MeanStdPair PerfMetrics::get_generate_duration() { + evaluate_statistics(); + return generate_duration; +} + +MeanStdPair PerfMetrics::get_tokenization_duration() { + evaluate_statistics(); + return tokenization_duration; +} + +MeanStdPair PerfMetrics::get_detokenization_duration() { + evaluate_statistics(); + return detokenization_duration; +} + +MeanStdPair PerfMetrics::get_inference_duration() { + evaluate_statistics(); + return inference_duration; +} + +float PerfMetrics::get_microsec(std::chrono::steady_clock::duration duration) { + return std::chrono::duration_cast<std::chrono::microseconds>(duration).count(); +} + +void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) { + if (m_evaluated){ + return; + } + // If start_tiem is specified then recalcualte durations according to start times and calculate statistics only after that. + if (start_time.has_value()) { + auto start_time_val = *start_time; + auto& tok_times = raw_metrics.m_new_token_times; + auto& batch_sizes = raw_metrics.m_batch_sizes; + raw_metrics.m_durations = std::vector<MicroSeconds>(tok_times.size()); + + auto ttft = tok_times[0] - start_time_val; + raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>(); + raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]); + num_generated_tokens = 0; + for (size_t i = 0; i < tok_times.size(); ++i) { + raw_metrics.m_durations[i] = tok_times[i] - start_time_val; + + // If in 10 ms a batch of 5 new tokens is generated then TPOT is 10 / 5 = 2 tok/ms. + raw_metrics.m_durations[i] /= batch_sizes[i]; + num_generated_tokens += batch_sizes[i]; + start_time_val = tok_times[i]; + } + } + + // calc_mean_and_std will convert microsecond to milliseconds. 
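+    // A short note on the derivation used below (an explanatory sketch, not extra functionality): TPOT is kept
+    // in ms/token, so throughput = 1000 / tpot.mean tokens/s, and its std follows from first-order error
+    // propagation, d(1000/x) = -(1000/x^2) dx, giving std_throughput ≈ 1000 * tpot.std / tpot.mean^2,
+    // which is exactly the pair assigned to `throughput` at the end of this function.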
+    tpot = calc_mean_and_std(raw_metrics.m_durations);
+    ipot = calc_mean_and_std(raw_metrics.m_token_infer_durations);
+    ttft = calc_mean_and_std(raw_metrics.m_times_to_first_token);
+
+    generate_duration = calc_mean_and_std(raw_metrics.generate_durations);
+    tokenization_duration = calc_mean_and_std(raw_metrics.tokenization_durations);
+    detokenization_duration = calc_mean_and_std(raw_metrics.detokenization_durations);
+    inference_duration = calc_mean_and_std(raw_metrics.m_inference_durations);
+
+    // tokens per second
+    throughput = {1000.0f / tpot.mean, (tpot.std * 1000.0f) / (tpot.mean * tpot.mean)};
+    m_evaluated = true;
+}
+
+PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const {
+    OPENVINO_ASSERT(right.load_time == load_time, "generation metrics can be accumulated only for the same pipeline");
+
+    // Copy left value to res.
+    PerfMetrics res = *this;
+
+    // Concatenate durations, batch sizes and first token times.
+    auto& new_durations = res.raw_metrics.m_durations;
+    auto& new_batch_sizes = res.raw_metrics.m_batch_sizes;
+    auto& new_times_to_first_token = res.raw_metrics.m_times_to_first_token;
+    auto& right_durations = right.raw_metrics.m_durations;
+    auto& right_batch_sizes = right.raw_metrics.m_batch_sizes;
+    auto& right_times_to_first_token = right.raw_metrics.m_times_to_first_token;
+
+    new_durations.insert(new_durations.end(), right_durations.begin(), right_durations.end());
+    new_times_to_first_token.insert(new_times_to_first_token.end(), right_times_to_first_token.begin(), right_times_to_first_token.end());
+    new_batch_sizes.insert(new_batch_sizes.end(), right_batch_sizes.begin(), right_batch_sizes.end());
+
+    // Concatenate tokenization/detokenization and total generation times.
+    auto& new_tok_durations = res.raw_metrics.tokenization_durations;
+    auto& new_detok_durations = res.raw_metrics.detokenization_durations;
+    auto& new_gen_durations = res.raw_metrics.generate_durations;
+    auto& right_tok_durations = right.raw_metrics.tokenization_durations;
+    auto& right_detok_durations = right.raw_metrics.detokenization_durations;
+    auto& right_gen_durations = right.raw_metrics.generate_durations;
+
+    new_tok_durations.insert(new_tok_durations.end(), right_tok_durations.begin(), right_tok_durations.end());
+    new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end());
+    new_gen_durations.insert(new_gen_durations.end(), right_gen_durations.begin(), right_gen_durations.end());
+
+    res.num_generated_tokens += right.num_generated_tokens;
+    res.num_input_tokens += right.num_input_tokens;
+    res.m_evaluated = false;
+    return res;
+}
+
+PerfMetrics& PerfMetrics::operator+=(const PerfMetrics& right) {
+    *this = *this + right;
+    return *this;
+}
+
+}  // namespace genai
+}  // namespace ov
diff --git a/src/cpp/src/safetensors.c b/src/cpp/src/safetensors.c
new file mode 100644
index 0000000000..61559882c6
--- /dev/null
+++ b/src/cpp/src/safetensors.c
@@ -0,0 +1,2 @@
+#define SAFETENSORS_IMPLEMENTATION
+#include "safetensors.h"
\ No newline at end of file
diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp
new file mode 100644
index 0000000000..5ae604c725
--- /dev/null
+++ b/src/cpp/src/sampler.cpp
@@ -0,0 +1,834 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "sampler.hpp"
+
+namespace ov::genai {
+// Modified Knuth–Morris–Pratt algorithm which returns the tokens following every needle occurrence in the haystack
+std::vector<int64_t> kmp_search(const
std::vector<int64_t>& haystack, const std::vector<int64_t>& needle) { + if (needle.empty()) { // no_repeat_ngram_size == 1, ban every token + return {haystack.begin(), haystack.end()}; + } + std::vector<int> partial_match_table(needle.size() + 1, -1); + int cnd = 0; + for (size_t pos = 1; pos < needle.size(); ++pos) { + if (needle.at(pos) == needle.at(size_t(cnd))) { + partial_match_table.at(pos) = partial_match_table.at(size_t(cnd)); + } else { + partial_match_table.at(pos) = cnd; + while (cnd >= 0 && needle.at(pos) != needle.at(size_t(cnd))) { + cnd = partial_match_table.at(size_t(cnd)); + } + } + ++cnd; + } + partial_match_table.back() = cnd; + std::vector<int64_t> res; + size_t haystack_id = 0; + int needle_id = 0; + while (haystack_id < haystack.size() - 1) { + if (needle.at(size_t(needle_id)) == haystack.at(haystack_id)) { + ++haystack_id; + ++needle_id; + if (needle_id == int(needle.size())) { + res.push_back(haystack.at(haystack_id)); + needle_id = partial_match_table.at(size_t(needle_id)); + } + } else { + needle_id = partial_match_table.at(size_t(needle_id)); + if (needle_id < 0) { + ++haystack_id; + ++needle_id; + } + } + } + return res; +} + +std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) { + ov::Shape shape = logits.get_shape(); + OPENVINO_ASSERT(shape.size() == 3); + size_t batch = shape[0], seq_len = shape[1], vocab_size = shape[2]; + OPENVINO_ASSERT(batch_idx < batch, "Logits batch size doesn't match the number of beams"); + + size_t batch_offset = batch_idx * seq_len * vocab_size, sequence_offset = (seq_len - 1) * vocab_size; + const float* beam_logits = logits.data<const float>() + batch_offset + sequence_offset; + float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size); + float log_sum = std::log(std::accumulate( + beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_logit); + })); + + std::vector<Token> tokens; + tokens.reserve(vocab_size); + for (size_t idx = 0; idx < vocab_size; ++idx) + tokens.push_back({beam_logits[idx] - max_logit - log_sum, int64_t(idx)}); + + return tokens; +} + +std::vector<int64_t> wrap_tokens(const std::vector<int64_t>& tokens, const std::vector<int64_t>& prefix_tokens, const std::vector<int64_t>& suffix_tokens) { + std::vector<int64_t> all_tokens = prefix_tokens; + all_tokens.insert(all_tokens.end(), tokens.begin(), tokens.end()); + all_tokens.insert(all_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); + return all_tokens; +} + +std::string clean_wrapped_text(const std::string& wrapped_text, const std::string& prefix, const std::string& suffix) { + auto prefix_pos = wrapped_text.find(prefix); + OPENVINO_ASSERT(prefix_pos != std::string::npos); + auto suffix_pos = wrapped_text.rfind(suffix); + OPENVINO_ASSERT(suffix_pos != std::string::npos); + auto clean_text_start = prefix_pos + prefix.size(); + auto clean_text_length = suffix_pos - clean_text_start; + std::string clean_text = wrapped_text.substr(clean_text_start, clean_text_length); + return clean_text; +} + +// Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned. +int match_stop_string(Tokenizer & tokenizer, const TokenIds & generated_tokens, const std::set<std::string> & stop_strings) { + /* + For catching stop_string hit we run comparisons character-wise to catch cases where stop string + overlaps with part of another token on both sides or is just a part of a single token. 
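+    For example, a stop string "stop" may be hidden inside one longer token (e.g. "nonstop") or split
+    across several tokens ("st" + "op"), so decoded characters are compared instead of raw token ids.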
+ For every stop_string we iterate over generated tokens starting from the last one and going backwards. + Every token is wrapped with prefix tokens to ensure tokenizer doesn't remove prefix whitespace of the actual token. + After that all tokens are decoded and prefix is removed from the decoded text, so we end up with decoded token. + Its characters are compared to the stop_string character at a current_position + (position of a character in the stop_string counting from the last one) - at the begining position is 0. + When characters match we increase current_position and check if we have a full match already, if not we continue. + If we have already matched some characters (current_position > 0) and next character is not matching + before we reach the full match, then we reset current_position to 0. + */ + std::string prefix = "a"; + auto prefix_ov = tokenizer.encode(prefix).input_ids; + std::vector<int64_t> prefix_tokens(prefix_ov.data<int64_t>(), prefix_ov.data<int64_t>() + prefix_ov.get_size()); + std::string suffix = "b"; + auto suffix_ov = tokenizer.encode(suffix).input_ids; + std::vector<int64_t> suffix_tokens(suffix_ov.data<int64_t>(), suffix_ov.data<int64_t>() + suffix_ov.get_size()); + + // Since whitespace can be added at the beginning of the suffix we also try to capture that behavior here + // and get suffix string that will actually be part of the decoded string so we can remove it correctly + auto wrapped_suffix_tokens = suffix_tokens; + wrapped_suffix_tokens.insert(wrapped_suffix_tokens.begin(), prefix_tokens.begin(), prefix_tokens.end()); + std::string wrapped_suffix = tokenizer.decode(wrapped_suffix_tokens); + auto wrapper_pos = wrapped_suffix.find(prefix); + suffix = wrapped_suffix.substr(wrapper_pos + prefix.size()); + + for (auto stop_string: stop_strings) { + int current_position = 0; + int num_matched_tokens = 0; + // Getting reverse iterator to check tokens starting from the last one generated and going backwards + auto generated_tokens_rit = generated_tokens.rbegin(); + std::vector<int64_t> tokens_buffer; + while (generated_tokens_rit != generated_tokens.rend()) { + num_matched_tokens++; + tokens_buffer.insert(tokens_buffer.begin(), *generated_tokens_rit); + + std::vector<int64_t> wrapped_tokens = wrap_tokens(tokens_buffer, prefix_tokens, suffix_tokens); + std::string wrapped_text = tokenizer.decode(wrapped_tokens); + std::string clean_text = clean_wrapped_text(wrapped_text, prefix, suffix); + + if (clean_text == "" || (clean_text.size() >= 3 && (clean_text.compare(clean_text.size() - 3, 3, "�") == 0))) { + generated_tokens_rit++; + continue; + } else { + tokens_buffer.clear(); + } + // Checking clean_text characters starting from the last one + for (auto clean_text_rit = clean_text.rbegin(); clean_text_rit != clean_text.rend(); clean_text_rit++) { + // On character match increment current_position for the next comparisons + if (*clean_text_rit == *(stop_string.rbegin() + current_position)) { + current_position++; + // If this is the last character from the stop_string we have a match + if ((stop_string.rbegin() + current_position) == stop_string.rend()) { + return num_matched_tokens; + } + } else if (current_position) { + // Already found matching characters, but the last one didn't match, so we reset current_position + current_position = 0; + // Looking for the match will start over from this character so we decrement iterator + clean_text_rit--; + } + } + generated_tokens_rit++; + } + } + return 0; +} + +// Return number of last tokens that match one of the 
stop_strings. If there's no match 0 is returned. +// Number of tokens might not be exact as if there's no direct token match, we decode generated tokens incrementally expanding decoding scope +// with 4 next tokens with each iteration until we check all tokens. +int match_stop_string2(Tokenizer & tokenizer, const TokenIds & generated_tokens, const std::set<std::string> & stop_strings) { + for (auto stop_string: stop_strings) { + auto stop_tokens_ov = tokenizer.encode(stop_string).input_ids; + size_t num_tokens = stop_tokens_ov.get_size(); + if(num_tokens > generated_tokens.size()) + continue; + + // Check direct token match + std::vector<int64_t> stop_tokens(stop_tokens_ov.data<int64_t>(), stop_tokens_ov.data<int64_t>() + num_tokens); + std::vector<int64_t> last_generated_tokens(generated_tokens.end()-num_tokens, generated_tokens.end()); + if (stop_tokens == last_generated_tokens) + return num_tokens; + + // Continue checking chunks of 4 tokens + num_tokens += 4; + while (num_tokens <= generated_tokens.size()) { + std::vector<int64_t> last_generated_tokens(generated_tokens.end()-num_tokens, generated_tokens.end()); + std::string decoded_last_tokens = tokenizer.decode(last_generated_tokens); + if (decoded_last_tokens.find(stop_string) != std::string::npos) { + return num_tokens; + } + num_tokens += 4; + } + } + return 0; +} + +void Sampler::GroupBeamSearcher::finalize(SamplerOutput& sampler_output) { + for (Group& group : m_groups) { + if (!group.done) { + for (Beam& beam : group.ongoing) { + uint64_t sequence_id = beam.m_sequence->get_id(); + + int64_t preempted_id = group.finish(beam, m_parameters); + if (preempted_id >= 0) { + // remove preempted one + m_sequence_group->remove_sequence(preempted_id); + } + + // mark current sequence as finished + beam.m_sequence->set_status(SequenceStatus::FINISHED); + // Setting length since this function is used when sequence generated tokens number reaches max_new_tokens + beam.m_sequence->set_finish_reason(GenerationFinishReason::LENGTH); + // we also need to drop add ongoing / forked sequences from scheduler + sampler_output.m_dropped_sequences.push_back(sequence_id); + } + } + } +} + +Sampler::GroupBeamSearcher::GroupBeamSearcher(SequenceGroup::Ptr sequence_group, Tokenizer tokenizer) + : m_sequence_group(sequence_group), + m_parameters{m_sequence_group->get_sampling_parameters()}, + m_groups{m_parameters.num_beam_groups}, + m_tokenizer(tokenizer) { + OPENVINO_ASSERT(m_sequence_group->num_running_seqs() == 1); + assert(m_parameters.num_beams % m_parameters.num_beam_groups == 0 && + "number of beams should be divisible by number of groups"); + size_t group_size = m_parameters.num_beams / m_parameters.num_beam_groups; + + for (Group& group : m_groups) { + group.ongoing.reserve(group_size); + // initially we just add our "base" sequence to beams inside each group + for (size_t i = 0; i < group_size; ++i) + group.ongoing.push_back(Beam((*sequence_group)[0])); + // to avoid selecting the same tokens for beams within group, let's just initialize score + // for the front one + group.ongoing.front().m_score = 0.0f; + } +} + +void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output) { + assert(m_parameters.num_beams % m_parameters.num_beam_groups == 0 && + "number of beams should be divisible by number of groups"); + size_t group_size = m_parameters.num_beams / m_parameters.num_beam_groups; + std::vector<int64_t> next_tokens; + std::vector<int32_t> next_beams; + next_tokens.reserve(m_parameters.num_beams); 
+ next_beams.reserve(m_parameters.num_beams); + + // parent sequence ID -> number of child sequences + std::map<uint64_t, uint64_t> parent_2_num_childs_map; + + for (Group& group : m_groups) { + if (!group.done) { + for (Beam& beam : group.ongoing) { + uint64_t parent_seq_id = beam.m_sequence->get_id(); + + // here we need to map index of sequence in beam search group(s) and sequence group + beam.m_global_beam_idx = [this] (uint64_t seq_id) -> size_t { + std::vector<Sequence::Ptr> running_seqs = m_sequence_group->get_running_sequences(); + for (size_t seq_global_index = 0; seq_global_index < running_seqs.size(); ++seq_global_index) { + if (seq_id == running_seqs[seq_global_index]->get_id()) + return seq_global_index; + } + OPENVINO_THROW("Internal error in beam search: should not be here"); + } (parent_seq_id); + + // zero out all parent forks counts + parent_2_num_childs_map[parent_seq_id] = 0; + } + } + } + + auto try_to_finish_candidate = [&] (Group& group, Beam& candidate, bool include_candidate_token = true) -> void { + uint64_t seq_id = candidate.m_sequence->get_id(); + // try to finish candidate + int64_t preempted_seq_id = group.finish(candidate, m_parameters); + + // if candidate has lower score than others finished + if (preempted_seq_id == seq_id) { + // do nothing and just ignore current finished candidate + } else { + if (preempted_seq_id >= 0) { + m_sequence_group->remove_sequence(preempted_seq_id); + } + + // need to insert candidate to a sequence group + Sequence::Ptr forked_sequence = m_sequence_group->fork_sequence(candidate.m_sequence); + // and finish immidiately + forked_sequence->set_status(SequenceStatus::FINISHED); + // Setting stop since this function is used when sequence generated eos token + forked_sequence->set_finish_reason(GenerationFinishReason::STOP); + + // TODO: make it more simplier + // currently, we finish sequence and then fork it in current code + { + for (size_t i = 0; i < group.min_heap.size(); ++i) { + if (group.min_heap[i].m_sequence->get_id() == seq_id) { + group.min_heap[i].m_sequence = forked_sequence; + break; + } + } + } + + // append token from candidate to actual sequence + if (include_candidate_token) + forked_sequence->append_token(candidate.m_token_id, candidate.m_log_prob); + } + }; + + // group ID => child beams + std::map<int, std::vector<Beam>> child_beams_per_group; + + for (size_t group_id = 0; group_id < m_groups.size(); ++group_id) { + Group & group = m_groups[group_id]; + if (group.done) + continue; + + std::vector<Beam> candidates; + candidates.reserve(group_size * 2 * group_size); + for (const Beam& beam : group.ongoing) { + std::vector<Token> tokens = log_softmax(logits, beam.m_global_beam_idx); + + // apply diversity penalty + for (auto prev_group_id = 0; prev_group_id < group_id; ++prev_group_id) { + for (const Beam& prev_beam : child_beams_per_group[prev_group_id]) { + tokens[prev_beam.m_token_id].m_log_prob -= m_parameters.diversity_penalty; + } + } + + // apply n_gramm + std::vector<int64_t> full_text{m_sequence_group->get_prompt_ids()}; + full_text.insert(full_text.end(), beam.m_sequence->get_generated_ids().begin(), beam.m_sequence->get_generated_ids().end()); + if (full_text.size() > 1 && full_text.size() >= m_parameters.no_repeat_ngram_size) { + auto tail_start = full_text.end() - ptrdiff_t(m_parameters.no_repeat_ngram_size) + 1; + for (int64_t banned_token : kmp_search(full_text, {tail_start, full_text.end()})) { + tokens[banned_token].m_log_prob = -std::numeric_limits<float>::infinity(); + } + } + + // sort tokens 
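+            // (in descending log-probability order, so the loop below can greedily take the
+            //  2 * group_size most probable continuations of this beam and stop early)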
+ std::sort(tokens.begin(), tokens.end(), [](Token left, Token right) { + return left.m_log_prob > right.m_log_prob; // Most probable tokens in front + }); + + size_t add_count = 0; + for (Token token : tokens) { + Beam new_candidate = beam; + new_candidate.m_score += new_candidate.m_log_prob = token.m_log_prob; + new_candidate.m_token_id = token.m_index; + + // TODO: fix it + // and ensure cumulative_log prob is used + if (/* m_parameters.early_finish(new_candidate) */ false) { + try_to_finish_candidate(group, new_candidate); + } else { + candidates.push_back(new_candidate); + if (++add_count == 2 * group_size) { + break; + } + } + } + } + + // Sample 2 * group_size highest score tokens to get at least 1 non EOS token per beam + OPENVINO_ASSERT(candidates.size() >= 2 * group_size, "No beams left to search"); + + auto to_sort = candidates.begin() + ptrdiff_t(2 * group_size); + std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater); + + for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) { + Beam & candidate = candidates[cand_idx]; + if (is_stop_token_id_hit(candidate.m_token_id, m_sequence_group->get_sampling_parameters().stop_token_ids)) { + // If beam_token does not belong to top num_beams tokens, it should not be added + if (cand_idx >= group_size) + continue; + + // try to finish candidate + try_to_finish_candidate(group, candidate); + continue; + } + + if (!m_parameters.stop_strings.empty()) { + // We need to include candidate token to already generated tokens to check if stop string has been generated + // There's probably a better way to do that, than copying whole vector... + std::vector<int64_t> token_ids = candidate.m_sequence->get_generated_ids(); + token_ids.push_back(candidate.m_token_id); + int num_last_matched_tokens = match_stop_string(m_tokenizer, token_ids, m_sequence_group->get_sampling_parameters().stop_strings); + if (num_last_matched_tokens) { + // If beam_token does not belong to top num_beams tokens, it should not be added + if (cand_idx >= group_size) + continue; + + if(!m_parameters.include_stop_str_in_output) { + // remove tokens that match stop_string from output (last token is not included in candidate.m_sequence at this point) + candidate.m_sequence->remove_last_tokens(num_last_matched_tokens - 1); + } + + // try to finish candidate + try_to_finish_candidate(group, candidate, m_parameters.include_stop_str_in_output); + continue; + } + } + + parent_2_num_childs_map[candidate.m_sequence->get_id()] += 1; + child_beams_per_group[group_id].push_back(candidate); + + // if num childs are enough + if (child_beams_per_group[group_id].size() == group_size) { + break; + } + } + + // check whether group has finished + group.is_done(m_parameters); + + // group cannot continue if there are no valid child beams + if (child_beams_per_group[group_id].size() == 0) { + group.done = true; + } + + if (group.done) { + // group has finished, group all running sequences + for (const Beam& beam : group.ongoing) { + uint64_t seq_id = beam.m_sequence->get_id(); + m_sequence_group->remove_sequence(seq_id); + sampler_output.m_dropped_sequences.push_back(seq_id); + } + group.ongoing.clear(); + } + } + + // fork child sequences for non-finished groups + + for (size_t group_id = 0; group_id < m_groups.size(); ++group_id) { + Group & group = m_groups[group_id]; + + if (!group.done) { + for (Beam& child_beam : child_beams_per_group[group_id]) { + uint64_t parent_sequence_id = child_beam.m_sequence->get_id(); + uint64_t& num_childs = 
parent_2_num_childs_map[parent_sequence_id]; + + // if current beam is forked multiple times + if (num_childs > 1) { + child_beam.m_sequence = m_sequence_group->fork_sequence(child_beam.m_sequence); + child_beam.m_sequence->append_token(child_beam.m_token_id, child_beam.m_log_prob); + + // reduce forks count, since fork already happened and next loop iteration + // will go by the second branch (num_childs == 1) + --num_childs; + + // fill out sampler output + sampler_output.m_forked_sequences[parent_sequence_id].push_back(child_beam.m_sequence->get_id()); + } else if (num_childs == 1) { + // keep current sequence going and add a new token + child_beam.m_sequence->append_token(child_beam.m_token_id, child_beam.m_log_prob); + } + } + + // drop beams which are not forked by current group + for (const Beam& beam : group.ongoing) { + size_t num_childs = parent_2_num_childs_map[beam.m_sequence->get_id()]; + if (num_childs == 0) { + // drop sequence as not forked + sampler_output.m_dropped_sequences.push_back(beam.m_sequence->get_id()); + m_sequence_group->remove_sequence(beam.m_sequence->get_id()); + } + } + + // child become parents + group.ongoing = child_beams_per_group[group_id]; + } + } +} + +Logits Sampler::_get_logit_vector(ov::Tensor logits, size_t batch_idx, size_t token_idx) { + ov::Shape logits_shape = logits.get_shape(); + size_t batch_size = logits_shape[0], seq_len = logits_shape[1], vocab_size = logits_shape[2]; + OPENVINO_ASSERT(batch_idx <= batch_size); + OPENVINO_ASSERT(token_idx < seq_len); + size_t batch_offset = batch_idx * seq_len * vocab_size; + size_t sequence_offset = (seq_len - token_idx - 1) * vocab_size; + float* logits_data = logits.data<float>() + batch_offset + sequence_offset; + + return Logits{logits_data, vocab_size}; +} + +Token Sampler::_greedy_sample(const Logits& logits) const { + // For greedy sampling we do not expect sorting or shrinking considered tokens + // so we can operate directly on the data buffer + float max_value = -std::numeric_limits<float>::infinity(); + size_t max_index = 0; + for (size_t i = 0; i < logits.m_size; ++i) { + if (logits.m_data[i] > max_value) { + max_value = logits.m_data[i]; + max_index = i; + } + } + + // apply log softmax to max value + float log_sum = std::log(std::accumulate( + logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_value); + })); + max_value = -log_sum; + + return Token(max_value, max_index); +} + +std::vector<Token> Sampler::_multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence) { + // If top_p or top_k was applied we use sorted vector, if not we go with original buffer. + std::vector<float> multinomial_weights; + multinomial_weights.reserve(logits.m_size); + if (logits.is_vector_initialized()) + for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob); + else + multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size); + + // std::discrete_distribution returns corrupted results when applied to log probabilies + // which result returning NAN only logprobs. 
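+    // (std::discrete_distribution treats its input range as non-negative weights; e.g. a sketch with
+    //  plain probabilities w = {0.7f, 0.2f, 0.1f}: std::discrete_distribution<size_t> d(w.begin(), w.end())
+    //  picks index 0 in roughly 70% of draws, whereas negative log-probabilities are not valid weights),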
+ // so log() is applied after this line + auto dist = std::discrete_distribution<size_t>(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1 + + std::vector<Token> out_tokens; + for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) { + size_t element_to_pick = dist(rng_engine); + if (logits.is_vector_initialized()) { + auto logit = logits.m_vector[element_to_pick]; + logit.m_log_prob = std::log(logit.m_log_prob); + out_tokens.push_back(logit); + } + else + out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick); + } + return out_tokens; +} + +std::vector<int64_t> Sampler::_try_finish_generation(SequenceGroup::Ptr & sequence_group) { + auto sampling_params = sequence_group->get_sampling_parameters(); + std::vector<int64_t> dropped_seq_ids; + for (auto& running_sequence : sequence_group->get_running_sequences()) { + const auto generated_len = running_sequence->get_generated_len(); + if (sampling_params.max_new_tokens == generated_len || + is_stop_token_id_hit(running_sequence->get_generated_ids().back(), sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { + // stop sequence by max_new_tokens or stop token (eos included) + running_sequence->set_status(SequenceStatus::FINISHED); + + if (is_stop_token_id_hit(running_sequence->get_generated_ids().back(), sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { + running_sequence->set_finish_reason(GenerationFinishReason::STOP); + } else if (sampling_params.max_new_tokens == generated_len) { + running_sequence->set_finish_reason(GenerationFinishReason::LENGTH); + } + + dropped_seq_ids.push_back(running_sequence->get_id()); + continue; + } + + if (!sampling_params.stop_strings.empty()) { + int num_matched_last_tokens = match_stop_string(m_tokenizer, running_sequence->get_generated_ids(), sampling_params.stop_strings); + if (num_matched_last_tokens) { + if (!sampling_params.include_stop_str_in_output) + running_sequence->remove_last_tokens(num_matched_last_tokens); + running_sequence->set_status(SequenceStatus::FINISHED); + running_sequence->set_finish_reason(GenerationFinishReason::STOP); + dropped_seq_ids.push_back(running_sequence->get_id()); + } + } + } + return dropped_seq_ids; +} + +void register_new_token(const Token& sampled_token_id, + Sequence::Ptr running_sequence, + LogitProcessor& logit_processor, + bool is_extend_sequence, + bool is_update_len_logit_processor) { + logit_processor.register_new_generated_token(sampled_token_id.m_index); + size_t generated_len = logit_processor.get_generated_len(); + if (is_extend_sequence) { + running_sequence->append_token(sampled_token_id.m_index, sampled_token_id.m_log_prob); + } else { + // just update the token log prob in case of successfully validated token + OPENVINO_ASSERT(generated_len < running_sequence->get_generated_len()); + running_sequence->update_generated_log_prob(generated_len, sampled_token_id.m_log_prob); + } + // increment seq len only for one sequence in sequence group to sync them + if (is_update_len_logit_processor) { + logit_processor.update_generated_len(++generated_len); + } +}; + +std::list<uint64_t> +create_n_forked_sequences(SequenceGroup::Ptr sequence_group, + LogitProcessor& logit_processor, + const std::vector<Token>& sampled_tokens) { + const auto& running_sequences = sequence_group->get_running_sequences(); + OPENVINO_ASSERT(running_sequences.size() == 1); + Sequence::Ptr sequence_to_fork = running_sequences.front(); + 
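+    // When multinomial sampling requests several return sequences, the single running sequence is
+    // forked once per extra sampled token below; the first sampled token is registered on the
+    // original sequence by the caller.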
std::list<uint64_t> forked_seq_ids; + for (size_t i = 1; i < sampled_tokens.size(); ++i) { + const auto forked_sequence = sequence_group->fork_sequence(sequence_to_fork); + const auto forked_seq_id = forked_sequence->get_id(); + forked_seq_ids.push_back(forked_seq_id); + register_new_token(sampled_tokens[i], forked_sequence, logit_processor, true, false); + } + return forked_seq_ids; +} + +bool +is_continue_to_sample_tokens(Sequence::Ptr running_sequence, + size_t token_idx, + size_t max_gen_len, + size_t& decrease_context_len_per_seq_group) { + if (max_gen_len == 0) { + running_sequence->remove_last_tokens(token_idx); + decrease_context_len_per_seq_group = std::max(decrease_context_len_per_seq_group, token_idx); + return false; + } + return true; +} + +bool +validate_candidate(Sequence::Ptr running_sequence, + size_t& token_idx, + Token& sampled_token, + bool& is_extend_sequence, + size_t& decrease_context_len_per_seq_group) { + if (token_idx > 0) { + const auto& generated_tokens = running_sequence->get_generated_ids(); + auto it = generated_tokens.rbegin(); + std::advance(it, token_idx - 1); + // to validate candidates from assisting model and remove incorrect ones from generated sequence + if (*it != sampled_token.m_index) { + running_sequence->remove_last_tokens(token_idx); + decrease_context_len_per_seq_group = std::max(decrease_context_len_per_seq_group, token_idx); + is_extend_sequence = true; + return false; + } else { + sampled_token.m_index = *it; + } + } + return true; + +} + +SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups, + ov::Tensor logits, + bool is_validation_mode_enabled) { + const float * logits_data = logits.data<float>(); + ov::Shape logits_shape = logits.get_shape(); + OPENVINO_ASSERT(logits_shape.size() == 3); + size_t batch_seq_len = logits_shape[1], vocab_size = logits_shape[2]; + + SamplerOutput sampler_output; + for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { + SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; + if (!sequence_group->is_scheduled()) + continue; + + size_t num_running_sequences = sequence_group->num_running_seqs(); + size_t actual_seq_len = sequence_group->get_num_scheduled_tokens(); // points to a token which needs to be sampled + size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len); + const ov::genai::GenerationConfig& sampling_params = sequence_group->get_sampling_parameters(); + + const auto request_id = sequence_group->get_request_id(); + if (!m_logit_processors.count(request_id)) { + m_logit_processors.insert({request_id, LogitProcessor(sampling_params, sequence_group->get_prompt_ids())}); + } + auto& logit_processor = m_logit_processors.at(request_id); + + const void * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens; + ov::Tensor sequence_group_logits(ov::element::f32, ov::Shape{num_running_sequences, actual_seq_len, vocab_size}, (void *)sequence_group_logits_data); + size_t decrease_context_len_per_seq_group = 0; + if (sequence_group->requires_sampling()) { + // get number of token to be validated + auto num_tokens_to_process = sequence_group->get_num_tokens_to_validate(); + if (sampling_params.is_greedy_decoding() || sampling_params.is_multinomial()) { + std::vector<Sequence::Ptr> running_sequences = sequence_group->get_running_sequences(); + if (sampling_params.is_greedy_decoding()) { + OPENVINO_ASSERT(num_running_sequences == 1); + } + 
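+                // The loop below walks every running sequence; with speculative decoding enabled
+                // (is_validation_mode_enabled) it first validates the draft-model candidates from the
+                // oldest to the newest logit position and then samples one extra token from the main model.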
for (size_t running_sequence_id = 0; running_sequence_id < num_running_sequences; ++running_sequence_id) { + auto& running_sequence = running_sequences[running_sequence_id]; + // make `num_tokens_to_process` iteration to validate a candidate generated by `draft_model` + 1 iteration to generate one more token by `main_model` + for (size_t i = 0; i <= num_tokens_to_process; ++i) { + // calculate token offset from the end of logit + size_t token_offset = num_tokens_to_process - i; + // max counter of needed to be sampled tokens + size_t max_num_sampled_token = sampling_params.max_new_tokens + token_offset - running_sequence->get_generated_len(); + if (!is_continue_to_sample_tokens(running_sequence, token_offset, max_num_sampled_token, decrease_context_len_per_seq_group)) { + break; + } + + // do sampling only for token validation/generation. + // continue in case of extending draft model sequences by main model generated tokens which + // should be taken to KV cache without validation + if (!is_validation_mode_enabled && token_offset > 0) { + continue; + } + + auto logit_vector = _get_logit_vector(sequence_group_logits, running_sequence_id, token_offset); + logit_processor.apply(logit_vector); + + Token sampled_token_id; + if (sampling_params.is_greedy_decoding()) { + sampled_token_id = _greedy_sample(logit_vector); + } else { + // is_multinomial() + const bool is_generate_n_tokens = sequence_group->num_total_seqs() == 1; + const size_t num_tokens_per_sequence = is_generate_n_tokens ? sampling_params.num_return_sequences : 1; + auto sampled_token_ids = _multinomial_sample(logit_vector, num_tokens_per_sequence); + OPENVINO_ASSERT(sampled_token_ids.size(), num_tokens_per_sequence); + if (is_generate_n_tokens) { + const auto forked_seq_ids = create_n_forked_sequences(sequence_group, logit_processor, sampled_token_ids); + sampler_output.m_forked_sequences.insert({running_sequences[0]->get_id(), forked_seq_ids}); + } + sampled_token_id = sampled_token_ids.front(); + } + // flag to add sampled token to generated sequence or extend logit processors only + bool is_extend_sequence = token_offset == 0, + // flag to update generated length of sequence group in logit processor + is_update_len_logit_processor = running_sequence_id == num_running_sequences - 1, + is_validation_passed = true; + if (is_validation_mode_enabled) { + is_validation_passed = validate_candidate(running_sequences[running_sequence_id], token_offset, sampled_token_id, is_extend_sequence, decrease_context_len_per_seq_group); + } + register_new_token(sampled_token_id, running_sequences[running_sequence_id], logit_processor, is_extend_sequence, is_update_len_logit_processor); + // to exit from sampling in case of failed token validation + if (!is_validation_passed) { + break; + } + } + } + for (const auto& dropped_seq_id : _try_finish_generation(sequence_group)) { + sampler_output.m_dropped_sequences.push_back(dropped_seq_id); + } + } else if (sampling_params.is_beam_search()) { + uint64_t request_id = sequence_group->get_request_id(); + + // create beam search info if we are on the first generate + if (m_beam_search_info.find(request_id) == m_beam_search_info.end()) { + m_beam_search_info.emplace(request_id, GroupBeamSearcher(sequence_group, m_tokenizer)); + } + + // current algorithm already adds new tokens to running sequences and + m_beam_search_info.at(request_id).select_next_tokens(sequence_group_logits, sampler_output); + + // check max length stop criteria + std::vector<Sequence::Ptr> running_sequences = 
sequence_group->get_running_sequences(); + if (!sequence_group->has_finished() && + running_sequences[0]->get_generated_len() == sampling_params.max_new_tokens) { + // stop sequence by max_new_tokens + m_beam_search_info.at(request_id).finalize(sampler_output); + } + } + // Notify handle after sampling is done. + // For non-streaming this is effective only when the generation is finished. + sequence_group->notify_handle(); + } else { + // we are in prompt processing phase when prompt is split into chunks and processed step by step + } + + // NOTE: it should be before 'get_num_scheduled_tokens' is used + // update internal state of sequence group to reset scheduler tokens and update currently processed ones + sequence_group->finish_iteration(); + // decrease sequence_group context in case of candidates generated by draft_model were not accepted by main_model + if (decrease_context_len_per_seq_group) { + const auto num_processed_tokens = sequence_group->get_num_processed_tokens(); + OPENVINO_ASSERT(num_processed_tokens >= decrease_context_len_per_seq_group); + OPENVINO_ASSERT(sequence_group->get_context_len() >= decrease_context_len_per_seq_group); + sequence_group->update_processed_tokens_num(num_processed_tokens - decrease_context_len_per_seq_group); + } + + // accumulate a number of processed tokens + currently_processed_tokens += (padded_amount_of_processed_tokens - decrease_context_len_per_seq_group) * num_running_sequences; + } + + return sampler_output; +} + +void Sampler::update_logit_processor(uint64_t request_id, uint64_t token_id) { + OPENVINO_ASSERT(m_logit_processors.count(request_id)); + auto& logit_processor = m_logit_processors.at(request_id); + logit_processor.decrease_generated_token_occurance(token_id); + auto gen_size = logit_processor.get_generated_len(); + logit_processor.update_generated_len(gen_size - 1); +} + +void Sampler::clear_beam_search_info(uint64_t request_id) { + m_beam_search_info.erase(request_id); +} + +int64_t Sampler::GroupBeamSearcher::Group::finish(Beam beam, const ov::genai::GenerationConfig& sampling_params) { + int64_t preeempted_sequence_id = -1; + float generated_len = beam.get_generated_len() + (is_stop_token_id_hit(beam.m_token_id, sampling_params.stop_token_ids) ? 
1 : 0); // HF counts EOS token in generation length + beam.m_score /= std::pow(generated_len, sampling_params.length_penalty); + + min_heap.push_back(beam); + std::push_heap(min_heap.begin(), min_heap.end(), greater); + assert(sampling_params.num_beams % sampling_params.num_beam_groups == 0 && + "number of beams should be divisible by number of groups"); + size_t group_size = sampling_params.num_beams / sampling_params.num_beam_groups; + if (min_heap.size() > group_size) { + std::pop_heap(min_heap.begin(), min_heap.end(), greater); + preeempted_sequence_id = min_heap.back().m_sequence->get_id(); + min_heap.pop_back(); + } + + return preeempted_sequence_id; +} + +void Sampler::GroupBeamSearcher::Group::is_done(const ov::genai::GenerationConfig& sampling_params) { + assert(sampling_params.num_beams % sampling_params.num_beam_groups == 0 && + "number of beams should be divisible by number of groups"); + size_t group_size = sampling_params.num_beams / sampling_params.num_beam_groups; + if (min_heap.size() < group_size) + return; + + const Beam& best_running_sequence = ongoing.front(), & worst_finished_sequence = min_heap.front(); + size_t cur_len = best_running_sequence.m_sequence->get_generated_len(); + float best_sum_logprobs = best_running_sequence.m_score; + float worst_score = worst_finished_sequence.m_score; + switch (sampling_params.stop_criteria) { + case ov::genai::StopCriteria::EARLY: + done = true; + return; + case ov::genai::StopCriteria::HEURISTIC: { + float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), sampling_params.length_penalty); + done = worst_score >= highest_attainable_score; + return; + } + case ov::genai::StopCriteria::NEVER: { + size_t length = sampling_params.length_penalty > 0.0 ? sampling_params.max_new_tokens : cur_len; + float highest_attainable_score = best_sum_logprobs / std::pow(float(length), sampling_params.length_penalty); + done = worst_score >= highest_attainable_score; + return; + } + default: + OPENVINO_THROW("Beam search internal error: unkown mode"); + } +} +} diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp new file mode 100644 index 0000000000..8188b35573 --- /dev/null +++ b/src/cpp/src/sampler.hpp @@ -0,0 +1,113 @@ + +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <list> +#include <cassert> +#include <cstdlib> +#include <limits> +#include <map> +#include <algorithm> +#include <cmath> +#include <random> +#include <set> + +#include "openvino/runtime/tensor.hpp" + +#include "logit_processor.hpp" +#include "scheduler.hpp" +#include "sequence_group.hpp" + +namespace ov::genai { +// Handle stop_token_ids +inline bool is_stop_token_id_hit(int64_t generated_token, const std::set<int64_t> & stop_token_ids) { + for (auto & stop_token_id : stop_token_ids) { + if (generated_token == stop_token_id) + return true; + } + return false; +} + +std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx); + +struct SamplerOutput { + // IDs of sequences that need to be dropped + std::vector<uint64_t> m_dropped_sequences; + // IDs of sequences that need to be forked (note, the same sequence can be forked multiple times) + // it will later be used by scheduler to fork block_tables for child sequences + std::unordered_map<uint64_t, std::list<uint64_t>> m_forked_sequences; +}; + +class Sampler { + class GroupBeamSearcher; + + Logits _get_logit_vector(ov::Tensor logits, size_t batch_idx, size_t token_idx); + Token _greedy_sample(const Logits& logits) 
const; + std::vector<Token> _multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence); + std::vector<int64_t> _try_finish_generation(SequenceGroup::Ptr & sequence_group); + void update_logit_processor(uint64_t request_id, uint64_t token_id); + + // request ID => beam search tracking information + std::map<uint64_t, GroupBeamSearcher> m_beam_search_info; + + std::mt19937 rng_engine; + // { request_id, logit_processor } + std::map<uint64_t, LogitProcessor> m_logit_processors; + + Tokenizer m_tokenizer; + +public: + Sampler() = default; + Sampler(Tokenizer & tokenizer) : m_tokenizer(tokenizer) {}; + + SamplerOutput sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits, bool is_validation_mode_enabled = false); + void set_seed(size_t seed) { rng_engine.seed(seed); } + void clear_beam_search_info(uint64_t request_id); +}; + +class Sampler::GroupBeamSearcher { + struct Beam { + Sequence::Ptr m_sequence; + size_t m_global_beam_idx = 0; + + // beam is made on top of sequence + float m_log_prob = 0.0f; + int64_t m_token_id = -1; + + // cumulative log probabilities + float m_score = -std::numeric_limits<float>::infinity(); + + Beam(Sequence::Ptr sequence) + : m_sequence(std::move(sequence)) { } + + size_t get_generated_len() const { + return m_sequence->get_generated_len(); + } + }; + + static bool greater(const Beam& left, const Beam& right) { + return left.m_score > right.m_score; + } + + struct Group { + std::vector<Beam> ongoing; // Best beams in front + std::vector<Beam> min_heap; // The worst of the best completed beams is the first + bool done = false; + + int64_t finish(Beam beam, const ov::genai::GenerationConfig& sampling_params); + void is_done(const ov::genai::GenerationConfig& sampling_params); + }; + + SequenceGroup::Ptr m_sequence_group; + ov::genai::GenerationConfig m_parameters; + std::vector<Group> m_groups; + Tokenizer m_tokenizer; +public: + explicit GroupBeamSearcher(SequenceGroup::Ptr sequence_group, Tokenizer tokenizer); + + void select_next_tokens(const ov::Tensor& logits, SamplerOutput& sampler_output); + void finalize(SamplerOutput& sampler_output); +}; +} diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp new file mode 100644 index 0000000000..e53d4c14bc --- /dev/null +++ b/src/cpp/src/scheduler.hpp @@ -0,0 +1,400 @@ + +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <cstdlib> +#include <vector> + +#include "openvino/genai/scheduler_config.hpp" +#include "device_config.hpp" +#include "block_manager.hpp" +#include "sequence_group.hpp" + +namespace ov::genai { +class Scheduler { + bool m_can_use_partial_preemption; + + SchedulerConfig m_config; + BlockManager m_block_manager; + friend class CacheStateDumper; + +public: + struct Output { + // IDs of scheduled groups + std::vector<uint64_t> m_scheduled_sequence_groups_ids; + // map of src -> dst blocks copies, which need to be performed by CacheManager + std::map<size_t, std::list<size_t>> m_block_copy_map; + // block tables for scheduled sequences per each attention layer in the model + std::map<uint64_t, std::vector<BlocksPerLayer>> m_block_tables; + // total number of scheduled tokens + size_t m_total_num_scheduled_tokens = 0; + // dedicated prompt phase + bool is_prompt = false; + // current cache usage + float m_cache_usage = 0.0; + }; + + explicit Scheduler(const SchedulerConfig & config = {}, size_t num_layers = 1, bool can_use_partial_preemption = true) : + 
m_can_use_partial_preemption(can_use_partial_preemption), + m_config(config), + m_block_manager(m_config.num_kv_blocks, m_config.enable_prefix_caching, m_config.block_size, num_layers) { + OPENVINO_ASSERT(num_layers != 0, "num_layers must be non-zero"); + } + + Output schedule(std::vector<SequenceGroup::Ptr>& sequence_groups) { + Output scheduler_output; + + if (m_config.dynamic_split_fuse) { + // deepspeed-mii case + // generation phase is always scheduled first + _schedule_generate_phase_dynamic_split_fuse(sequence_groups, scheduler_output); + // some tokens from generation prompt are also scheduled + _schedule_prompt_phase_dynamic_split_fuse(sequence_groups, scheduler_output); + } else { + // vLLM case + // schedule prompt phase using whole prompt's input_ids + + _schedule_prompt_phase_vllm(sequence_groups, scheduler_output); + + if (!scheduler_output.is_prompt) { + // prompt sequences are not scheduler => scheduler generation phase by dynamic_split_fuse implementation + _schedule_generate_phase_dynamic_split_fuse(sequence_groups, scheduler_output); + } + } + + _clear_waiting_sequences(sequence_groups); + scheduler_output.m_cache_usage = m_block_manager.get_used_percentage(); + + return scheduler_output; + } + + const std::vector<BlocksPerLayer>& get_block_tables(const Sequence& seq) const { + return m_block_manager.get_block_tables(seq.get_id()); + } + + const bool has_block_table(uint64_t seq_id) { + return m_block_manager.has_block_table(seq_id); + } + + void free_sequence(uint64_t seq_id) { + m_block_manager.free_sequence(seq_id); + } + + void fork_sequence(uint64_t parent_id, uint64_t child_id) { + m_block_manager.fork_sequence(parent_id, child_id); + } + + void restore_cached_blocks(const SequenceGroup::Ptr& sequence_group) { + m_block_manager.restore_cached_blocks(sequence_group); + } + + const SchedulerConfig& get_config() const { + return m_config; + } + + void free_blocks_from_sequence(size_t seq_id, const std::vector<std::set<size_t>>& per_layer_logical_block_indices_to_free) { + m_block_manager.free_blocks_from_sequence(seq_id, per_layer_logical_block_indices_to_free); + } + +private: + static size_t _num_running_sequence_groups(const std::vector<SequenceGroup::Ptr>& sequence_groups) { + size_t num_running = 0; + for (const SequenceGroup::CPtr& seq_group : sequence_groups) { + if (seq_group->can_generate_tokens()) + ++num_running; + } + + return num_running; + } + + + bool _preempt_by_recompute(SequenceGroup::Ptr sequence_group, size_t blocks_needed) { + size_t processed_tokens = sequence_group->get_num_processed_tokens(); + size_t block_size = m_config.block_size; + size_t prev_blocks_count = m_block_manager.num_free_blocks(); + size_t preempted_tokens = 0; + size_t num_blocks_occupied_by_sequence = m_block_manager.get_number_of_blocks_occupied_by_sequence(sequence_group); + bool was_evicted_from = (sequence_group->get_num_evicted_tokens() != 0); + + if (num_blocks_occupied_by_sequence <= blocks_needed || !m_can_use_partial_preemption || was_evicted_from) { + auto sequences = sequence_group->get_not_finished_sequences(); + for (size_t s = 0; s < sequences.size(); ++s) { + auto seq_id = sequences[s]->get_id(); + m_block_manager.free_sequence(seq_id); + } + sequence_group->preempt_tokens(processed_tokens); + if (was_evicted_from) { + sequence_group->reset_eviction_token_count(); + } + sequence_group->set_waiting(); + return m_block_manager.num_free_blocks() > prev_blocks_count; + } + + size_t logical_blocks_released; + if 
(sequence_group->get_sampling_parameters().is_beam_search()) { + logical_blocks_released = m_block_manager.free_partially_beam_search_group(sequence_group, blocks_needed); + } + else { + logical_blocks_released = m_block_manager.free_group_partially(sequence_group, blocks_needed); + } + + // calculate the number of preempted tokens + auto tokens_in_last_block = processed_tokens % block_size; + if (tokens_in_last_block == 0) { + tokens_in_last_block = block_size; + } + preempted_tokens = tokens_in_last_block + std::max<size_t>((int)logical_blocks_released - 1, 0) * block_size; + + // case when preemption requires preempt prompt tokens + if (!m_config.dynamic_split_fuse && processed_tokens - preempted_tokens < sequence_group->get_prompt_len()) { + // preempt prompt fully to not leave partially generated prompt + preempted_tokens = processed_tokens; + for (auto sequence: sequence_group->get_not_finished_sequences()) { + auto seq_id = sequence->get_id(); + if (m_block_manager.has_block_table(seq_id)) { + m_block_manager.free_sequence(seq_id); + } + } + } + sequence_group->preempt_tokens(preempted_tokens); + sequence_group->set_waiting(); + return m_block_manager.num_free_blocks() > prev_blocks_count; + } + + static size_t _get_low_priority_sequence_group_id(const std::vector<SequenceGroup::Ptr>& sequence_groups) { + for (size_t seq_group_id = 0, num_groups = sequence_groups.size(); seq_group_id < num_groups; ++seq_group_id) { + size_t group_idx = num_groups - seq_group_id - 1; + SequenceGroup::CPtr sequence_group = sequence_groups[group_idx]; + if (sequence_group->get_num_processed_tokens() > 0) { + // we are here, because current sequence group has some reserved KV blocks in block manager + // which can be freed + return group_idx; + } + } + + return std::numeric_limits<size_t>::max(); + } + + void _apply_preemption(size_t sequence_group_id, const std::vector<SequenceGroup::Ptr>& sequence_groups) { + SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; + + // check whether current sequence requires a new slot / block + while (!m_block_manager.can_append_slots(sequence_group)) { + // let's run a sequence for eviction + size_t evicted_sequence_group_id = _get_low_priority_sequence_group_id(sequence_groups); + + if (evicted_sequence_group_id <= sequence_group_id) { + // we have a cycle when current group need to evict itself to be in a running state + break; + } + size_t blocks_needed = m_block_manager.required_blocks_count(sequence_group); + if (!_preempt_by_recompute(sequence_groups[evicted_sequence_group_id], blocks_needed)){ + break; + } + } + } + + void _schedule_prompt_phase_dynamic_split_fuse(std::vector<SequenceGroup::Ptr>& sequence_groups, Output& scheduler_output) { + // in the current method we need to balance multiple prompts (or parts of prompts) between + // available amount of tokens in megabatch + // Considerations: + // 1. To reduce discrepancy between ragged dimensions (context lengths) in Attention module + // we can slice prompt on chunks and schedule only portion of each prompt instead of + // greedy scheduling of prompt with higher priority + // 2. 
The machanism below performs greedy scheduling of high priority prompts + + for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { + SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; + if (!sequence_group->can_generate_tokens() && !sequence_group->is_waiting()) { + size_t num_running_seqs = sequence_group->num_running_seqs(); + // prompt phases can have a single running sequence + OPENVINO_ASSERT(num_running_seqs == 1); + Sequence::Ptr sequence = (*sequence_group)[0]; + uint64_t seq_id = sequence->get_id(); + + size_t num_tokens_in_megabatch = m_config.max_num_batched_tokens - scheduler_output.m_total_num_scheduled_tokens; + size_t num_available_tokens = sequence_group->get_num_available_tokens_for_batching(); + + // apply megabatch limitations + size_t num_scheduled_tokens = std::min(num_tokens_in_megabatch, num_available_tokens); + + // apply KV cache limitations + size_t currently_allocated_token_slots = sequence_group->get_num_blocks() * m_config.block_size; + size_t occupied_token_slots = sequence_group->get_num_processed_tokens() - sequence_group->get_num_evicted_tokens(); + OPENVINO_ASSERT(currently_allocated_token_slots >= occupied_token_slots, "internal error"); + size_t available_slots = currently_allocated_token_slots - occupied_token_slots, + required_slots = num_scheduled_tokens > available_slots ? num_scheduled_tokens - available_slots : 0; + size_t num_required_blocks = (required_slots + m_config.block_size - 1) / m_config.block_size, num_free_blocks = m_block_manager.num_free_blocks(); + size_t num_scheduled_blocks = std::min(num_required_blocks, num_free_blocks); + // some scheduled blocks can be no fully occupied, so we need to take min between num_scheduled_blocks + // and total "scheduled capacity" + num_scheduled_tokens = std::min(num_scheduled_tokens, available_slots + num_scheduled_blocks * m_config.block_size); + + if (num_scheduled_tokens > 0) { + // allocate KV blocks if required + if (num_scheduled_blocks > 0) + m_block_manager.allocate(sequence, num_scheduled_blocks, sequence_group->get_prompt_ids()); + // and schedule tokens + sequence_group->schedule_tokens(num_scheduled_tokens); + + // add information to scheduler_output + { + scheduler_output.m_scheduled_sequence_groups_ids.push_back(sequence_group_id); + scheduler_output.m_block_tables[seq_id] = m_block_manager.get_block_tables(seq_id); + scheduler_output.m_total_num_scheduled_tokens += num_scheduled_tokens * num_running_seqs; + } + } + + // if we added maximum amount of tokens to compute + if (scheduler_output.m_total_num_scheduled_tokens == m_config.max_num_batched_tokens) + break; + } + } + } + + void _schedule_generate_phase_dynamic_split_fuse(const std::vector<SequenceGroup::Ptr>& sequence_groups, Output& scheduler_output) { + for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) { + SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id]; + // Note, that can_generate_tokens will mix preempted sequence groups + // and real generate ones + // Question: do we need to schedule preeempted first as it's done in vLLM? + // Answer: preempted sequences have low priority, so they should be after "running" ones. 
So, here we + // keep latencies for sequence groups of high priority + if (sequence_group->can_generate_tokens() && !sequence_group->is_waiting()) { + OPENVINO_ASSERT(!sequence_group->has_finished()); + size_t num_running_seqs = sequence_group->num_running_seqs(); + size_t num_tokens_in_megabatch = m_config.max_num_batched_tokens - scheduler_output.m_total_num_scheduled_tokens; + size_t available_tokens_per_seq_in_megabatch = num_tokens_in_megabatch / num_running_seqs; + + // we cannot schedule even a single token per each sequence in a group + if (!available_tokens_per_seq_in_megabatch) + continue; + + // Note: current function can return more than 1 token even for generation phase in case of some tokens + // of current sequence group were evicted before + size_t num_available_tokens_per_seq = sequence_group->get_num_available_tokens_for_batching(); + + size_t num_scheduled_tokens_per_seq = std::min(available_tokens_per_seq_in_megabatch, num_available_tokens_per_seq); + sequence_group->schedule_tokens(num_scheduled_tokens_per_seq); + + _apply_preemption(sequence_group_id, sequence_groups); + + // if we can't preemt any more sequences, clear scheduled tokens and move to next sequence + if (!m_block_manager.can_append_slots(sequence_group)){ + sequence_group->clear_scheduled_tokens(); + continue; + } + + // allocate new slots + std::map<size_t, std::list<size_t>> copy_blocks_map = m_block_manager.append_slots(sequence_group); + + // add information to scheduler_output + { + auto request_id = sequence_group->get_request_id(); + scheduler_output.m_scheduled_sequence_groups_ids.push_back(sequence_group_id); + scheduler_output.m_total_num_scheduled_tokens += num_scheduled_tokens_per_seq * num_running_seqs; + + // block tables for each running sequence within a group + std::vector<Sequence::Ptr> running_seqs = sequence_group->get_running_sequences(); + for (const auto & seq : sequence_group->get_running_sequences()) { + scheduler_output.m_block_tables[seq->get_id()] = m_block_manager.get_block_tables(seq->get_id()); + } + + // merge copy_blocks + for (const auto& src_dst : copy_blocks_map) { + size_t src_index = src_dst.first; + const std::list<size_t>& dst_indexes = src_dst.second; + for (const auto dst_index : dst_indexes) + scheduler_output.m_block_copy_map[src_index].push_back(dst_index); + } + } + + // if we added maximum amount of tokens to compute + if (scheduler_output.m_total_num_scheduled_tokens == m_config.max_num_batched_tokens) + break; + } + } + } + + void _schedule_prompt_phase_vllm(std::vector<SequenceGroup::Ptr>& sequence_groups, Output& scheduler_output) { + // Current scheduling method schedules prompts only in a manner similar to vLLM: + // - Limits max batch size by: + // - max_num_seqs (256 in vLLM's defaults) + // - max_num_batched_tokens (max_model_length (and at least 2048) in vLLM's defaults) + + OPENVINO_ASSERT(!m_config.dynamic_split_fuse, "Internal error: we are in vLLM scheduling"); + OPENVINO_ASSERT(m_config.max_num_seqs <= m_config.max_num_batched_tokens, "Max num batched tokens (", m_config.max_num_batched_tokens, + ") must be greater or equal to max num sequences (", m_config.max_num_seqs, ")"); + OPENVINO_ASSERT(scheduler_output.m_scheduled_sequence_groups_ids.empty(), "Internal error: in vLLM scheduling, prompt phase is always first one"); + + // TODO: it currently does not handle beam search, where beam width should contribute to total number of "num running sequences" + size_t num_running_sequence_groups = _num_running_sequence_groups(sequence_groups); + + 
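+        // The loop below admits prompt-phase groups in order until a limit is hit: max_num_seqs,
+        // the remaining max_num_batched_tokens budget, or the number of free KV blocks; in this
+        // (non dynamic_split_fuse) mode each prompt is scheduled as a whole, never in chunks.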
+        for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) {
+            SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id];
+            const bool recompute_evicted_sequences = sequence_group->get_num_processed_tokens() == 0 && !m_can_use_partial_preemption;
+            if ((!sequence_group->can_generate_tokens() || recompute_evicted_sequences) && !sequence_group->is_waiting()) {
+                size_t num_running_seqs = sequence_group->num_running_seqs();
+                // prompt phases can have a single running sequence
+                OPENVINO_ASSERT(num_running_seqs == 1);
+                // here we also assume that the sequence must be scheduled in a single shot and has no already generated context
+                if (!m_config.enable_prefix_caching)
+                    OPENVINO_ASSERT(sequence_group->get_context_len() == 0);
+
+                size_t num_available_tokens_in_megabatch = m_config.max_num_batched_tokens - scheduler_output.m_total_num_scheduled_tokens;
+                size_t sequence_len = sequence_group->get_num_available_tokens_for_batching();
+
+                // TODO: better handling
+                // e.g. return a status that the sequence is ignored and cannot be processed by the current scheduling algorithm
+                OPENVINO_ASSERT(m_config.max_num_batched_tokens >= sequence_len, "Sequence length (", sequence_len, ") is longer than max number of tokens in batch (", m_config.max_num_batched_tokens, ")");
+
+                // if we are limited by the max_num_seqs condition
+                if (num_running_sequence_groups >= m_config.max_num_seqs)
+                    break;
+
+                // apply max num batched tokens limitation
+                if (num_available_tokens_in_megabatch < sequence_len)
+                    break;
+
+                // apply KV cache limitations
+                const size_t num_required_blocks = (sequence_len + m_config.block_size - 1) / m_config.block_size;
+                if (!m_block_manager.can_allocate_blocks(num_required_blocks))
+                    break;
+
+                // add scheduling information
+                {
+                    Sequence::Ptr sequence = (*sequence_group)[0];
+                    uint64_t seq_id = sequence->get_id();
+
+                    // and schedule tokens
+                    sequence_group->schedule_tokens(sequence_len);
+
+                    // allocate KV blocks
+                    m_block_manager.append_slots(sequence_group);
+
+                    // add information to scheduler_output
+                    {
+                        scheduler_output.m_scheduled_sequence_groups_ids.push_back(sequence_group_id);
+                        scheduler_output.m_block_tables[seq_id] = m_block_manager.get_block_tables(seq_id);
+                        scheduler_output.m_total_num_scheduled_tokens += sequence_len;
+                    }
+
+                    // update "is_prompt" flag
+                    scheduler_output.is_prompt = true;
+                }
+
+                num_running_sequence_groups += 1;
+            }
+        }
+    }
+
+    void _clear_waiting_sequences(const std::vector<SequenceGroup::Ptr>& sequence_groups) {
+        for (size_t sequence_group_id = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) {
+            sequence_groups[sequence_group_id]->clear_waiting_sequences();
+        }
+    }
+};
+
+}
diff --git a/src/cpp/src/sequence_group.cpp b/src/cpp/src/sequence_group.cpp
new file mode 100644
index 0000000000..854fc85777
--- /dev/null
+++ b/src/cpp/src/sequence_group.cpp
@@ -0,0 +1,63 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <string_view>
+#include "sequence_group.hpp"
+
+namespace ov {
+namespace genai {
+
+std::mutex Sequence::m_counter_mutex;
+
+size_t Sequence::_make_hash(size_t content_length) {
+    auto sequence_group = get_sequence_group_ptr();
+    auto block_size = sequence_group->get_block_size();
+    size_t block_start_idx = content_length - (content_length % block_size);
+    if (block_start_idx == content_length) {
+        block_start_idx -= block_size;
+    }
+
+    // hash of current block depends on prefix hashes
+    std::vector<int64_t> content;
+    size_t
prefix_hashes_needed_count = block_start_idx / block_size; + OPENVINO_ASSERT(prefix_hashes_needed_count <= m_prefix_hashes.size()); + content.insert(content.end(), m_prefix_hashes.begin(), m_prefix_hashes.begin() + prefix_hashes_needed_count); + + // get tokens corresponding to current block + const auto prompt_ids = sequence_group->get_prompt_ids(); + OPENVINO_ASSERT(content_length <= prompt_ids.size() + m_generated_ids.size()); + if (block_start_idx < prompt_ids.size()) { + content.insert(content.end(), prompt_ids.begin() + block_start_idx, prompt_ids.begin() + std::min(prompt_ids.size(), content_length)); + } + if (content_length > prompt_ids.size()) { + size_t start = block_start_idx < prompt_ids.size() ? 0 : block_start_idx - prompt_ids.size(); + content.insert(content.end(), m_generated_ids.begin() + start, m_generated_ids.begin() + content_length - prompt_ids.size()); + } + const char* data = reinterpret_cast<const char*>(content.data()); + std::size_t size = content.size() * sizeof(content[0]); + return std::hash<std::string_view>{}(std::string_view(data, size)); +} + +// Each KV block can be uniquely identified by +// the tokens within the block and the tokens in the prefix before the block. +// hash(prefix tokens + block tokens) <--> KV Block +size_t Sequence::get_hash(size_t content_length) { + + auto sequence_group = get_sequence_group_ptr(); + OPENVINO_ASSERT(sequence_group, "Hash computation requires setting of sequence_group ptr."); + auto content_len = content_length == 0 ? sequence_group->get_context_len() : content_length; + auto block_size = sequence_group->get_block_size(); + size_t cur_content = block_size * (m_prefix_hashes.size() + 1); + while (cur_content <= content_len) + { + m_prefix_hashes.push_back(_make_hash(cur_content)); + cur_content += block_size; + } + if (content_len % block_size == 0) { + return m_prefix_hashes[content_len / block_size - 1]; + } + + return _make_hash(content_len); +} +} // namespace genai +} // namespace ov \ No newline at end of file diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp new file mode 100644 index 0000000000..5c87e8ebfa --- /dev/null +++ b/src/cpp/src/sequence_group.hpp @@ -0,0 +1,603 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <vector> +#include <set> +#include <cstdlib> +#include <string_view> + +#include "openvino/genai/generation_handle.hpp" +#include "openvino/genai/generation_config.hpp" +#include "generation_stream.hpp" + +namespace ov::genai { +enum class SequenceStatus { + RUNNING = 0, + FINISHED = 1, + OUT_OF_MEMORY = 2, + WAITING = 3 +}; + +using TokenIds = std::vector<int64_t>; +using LogProbs = std::vector<float>; +class SequenceGroup; + +class Sequence { + // This can be a problem if we launch two pipelines in the same application. 
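+    // The counter below is a process-wide, function-local static guarded by m_counter_mutex, so
+    // sequence IDs stay unique across all pipelines in the process, but they are not stable between runs.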
+ static uint64_t _get_next_global_sequence_id() { + const std::lock_guard<std::mutex> lock(m_counter_mutex); + static uint64_t m_counter = 0; + return m_counter++; + } + + TokenIds m_generated_ids; + LogProbs m_generated_log_probs; + uint64_t m_grouped_id; + uint64_t m_id = _get_next_global_sequence_id(); + SequenceStatus m_status = SequenceStatus::RUNNING; + GenerationFinishReason m_finish_reason = GenerationFinishReason::NONE; + float m_cumulative_log_prob = 0.0f; + std::vector<int64_t> m_prefix_hashes; + std::weak_ptr<SequenceGroup> m_sequence_group; + static std::mutex m_counter_mutex; + + size_t _make_hash(size_t content_length); +public: + using Ptr = std::shared_ptr<Sequence>; + using CPtr = std::shared_ptr<const Sequence>; + + // don't use directly + Sequence(const uint64_t id) : m_grouped_id(id) {}; + + // don't use directly + Sequence(const Sequence& seq, const uint64_t id) : + m_generated_ids(seq.m_generated_ids), + m_grouped_id(id), + m_status(seq.m_status), + m_cumulative_log_prob(seq.m_cumulative_log_prob){ + OPENVINO_ASSERT(seq.m_id != m_id); + } + + static Sequence::Ptr create(const uint64_t id) { + return std::make_shared<Sequence>(id); + } + + static Sequence::Ptr fork(Sequence::CPtr sequence, const uint64_t id) { + return std::make_shared<Sequence>(*sequence, id); + } + + bool operator ==(const Sequence& other) const { + return other.m_id == m_id; + } + + uint64_t get_id() const { + return m_id; + } + + uint64_t get_grouped_id() const { + return m_grouped_id; + } + + bool has_finished() const { + return m_status == SequenceStatus::FINISHED; + } + + bool is_running() const { + return m_status == SequenceStatus::RUNNING; + } + + bool out_of_memory() const { + return m_status == SequenceStatus::OUT_OF_MEMORY; + } + + bool is_waiting() const { + return m_status == SequenceStatus::WAITING; + } + + void set_status(SequenceStatus status) { + m_status = status; + } + + GenerationFinishReason get_finish_reason() const { + return m_finish_reason; + } + + void set_finish_reason(GenerationFinishReason finish_reason) { + m_finish_reason = finish_reason; + } + + // appends new tokens to a generated part + void append_token(int64_t token_id, float log_prob) { + m_cumulative_log_prob += log_prob; + m_generated_log_probs.push_back(log_prob); + m_generated_ids.push_back(token_id); + } + + // removes n last tokens and updates cumulative log prob + // used to remove stop_string from the output + void remove_last_tokens(int n) { + OPENVINO_ASSERT(m_generated_ids.size() >= n, "Cannot remove more tokens than has been generated"); + for (int i = 0; i < n; i++) { + m_cumulative_log_prob -= m_generated_log_probs.back(); + m_generated_log_probs.pop_back(); + m_generated_ids.pop_back(); + } + } + + GenerationOutput get_last_generation_output() { + GenerationOutput output; + OPENVINO_ASSERT(m_generated_ids.size()); + output.score = get_cumulative_log_probs(); + output.generated_ids = std::vector<int64_t> {m_generated_ids.back()}; + output.generated_log_probs = std::vector<float> {m_generated_log_probs.back()}; + output.finish_reason = get_finish_reason(); + return output; + } + + size_t get_generated_len() const { + return m_generated_ids.size(); + } + + const TokenIds & get_generated_ids() const { + return m_generated_ids; + } + + const LogProbs & get_generated_log_probs() const { + return m_generated_log_probs; + } + + float get_cumulative_log_probs() const { + return m_cumulative_log_prob; + } + + void update_generated_log_prob(size_t idx, float log_prob) { + OPENVINO_ASSERT(idx < 
m_generated_log_probs.size()); + m_generated_log_probs[idx] = log_prob; + } + + float get_beam_search_score(const ov::genai::GenerationConfig& sampling_params) const { + float cumulative_log_prob = get_cumulative_log_probs(), current_length = get_generated_len(); + float score = cumulative_log_prob / std::pow(current_length, sampling_params.length_penalty); + return score; + } + + + + // Each KV block can be uniquely identified by + void set_sequence_group_ptr(std::shared_ptr<SequenceGroup> sequence_group) { + m_sequence_group = sequence_group; + } + + std::shared_ptr<SequenceGroup> get_sequence_group_ptr() const { + OPENVINO_ASSERT(!m_sequence_group.expired()); + return m_sequence_group.lock(); + } + + // Each KV block can be uniquely identified by + // the tokens within the block and the tokens in the prefix before the block. + // hash(prefix tokens + block tokens) <--> KV Block + size_t get_hash(size_t content_length = 0); +}; + +// contains a list of Sequences in generic case (beam search or parallel sampling) +// - each sequence shares the same prompt and KV-caches for promp +// - in case of beam search each sequence also shares specific part of generic phase +// via reference counter machanism on BlockManager level +class SequenceGroup { + uint64_t m_request_id; + std::vector<Sequence::Ptr> m_sequences; + ov::genai::GenerationConfig m_sampling_params; + std::size_t m_block_size; + TokenIds m_prompt_ids; + GenerationStream::Ptr m_generation_stream; + bool m_enable_prefix_caching; + size_t m_num_evicted_tokens = 0; + + uint64_t m_next_sequence_id = 0; + + // amount of processed tokens, e.g. prompt can be processed using multiple consequence inferences + // so, we need to track which part of the prompt we have already processed + size_t m_num_processed_tokens = 0; + // a number of scheduled tokens by Scheduler::schedule logic + size_t m_num_scheduled_tokens = 0; + // context length of longest sequence within a group + size_t m_max_content_len = 0; + // max validation length within a group to check generated tokens + size_t m_num_validated_tokens = 0; + + + SequenceGroup(uint64_t request_id, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size, bool enable_prefix_caching) + : m_request_id(request_id), + m_sampling_params(sampling_params), + m_block_size(block_size), + m_enable_prefix_caching(enable_prefix_caching) { + m_generation_stream = GenerationStream::create(); + } + +public: + using Ptr = std::shared_ptr<SequenceGroup>; + using CPtr = std::shared_ptr<const SequenceGroup>; + + SequenceGroup(uint64_t request_id, const TokenIds& input_ids, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size, bool enable_prefix_caching) + : SequenceGroup(request_id, ov::Tensor(ov::element::i64, ov::Shape{input_ids.size()}, (void *)input_ids.data()), sampling_params, block_size, enable_prefix_caching) { + } + + SequenceGroup(uint64_t request_id, const ov::Tensor input_ids, const ov::genai::GenerationConfig& sampling_params, std::size_t block_size, bool enable_prefix_caching) + : SequenceGroup(request_id, sampling_params, block_size, enable_prefix_caching) { + add_sequence(Sequence::create(m_next_sequence_id++)); + + m_prompt_ids.resize(input_ids.get_size()); + std::copy_n(input_ids.data<int64_t>(), input_ids.get_size(), m_prompt_ids.begin()); + } + + void add_sequence(const Sequence::Ptr & sequence) { + m_sequences.emplace_back(sequence); + } + + void remove_sequence(uint64_t sequence_id) { + auto remove_it = std::remove_if(m_sequences.begin(), 
m_sequences.end(), [sequence_id] (Sequence::Ptr seq) { + return seq->get_id() == sequence_id; + }); + OPENVINO_ASSERT(remove_it != m_sequences.end(), "Failed to remove sequence with specified ID"); + m_sequences.erase(remove_it); + } + + size_t get_prompt_len() const { + return m_prompt_ids.size(); + } + + // a sequence group can generate new tokens if it already proccessed m_max_content_len before + bool can_generate_tokens() const { + return m_max_content_len >= get_prompt_len(); + } + + Sequence::Ptr operator[] (size_t index) { + OPENVINO_ASSERT(m_sequences.size() > index); + return m_sequences[index]; + } + + Sequence::CPtr operator[] (size_t index) const { + OPENVINO_ASSERT(m_sequences.size() > index); + return m_sequences[index]; + } + + size_t num_total_seqs() const { + return m_sequences.size(); + } + + size_t num_finished_seqs() const { + return std::count_if(m_sequences.begin(), m_sequences.end(), [] (Sequence::CPtr seq) { + return seq->has_finished(); + }); + } + + size_t num_running_seqs() const { + return num_total_seqs() - num_finished_seqs(); + } + + bool has_finished() const { + return num_running_seqs() == 0; + } + + bool is_running() const { + return !has_finished(); + } + + const std::vector<Sequence::Ptr>& get_sequences() const { + return m_sequences; + } + + /** + * @param seq_id Sequence identifier + * @return Whether this group has the sequence with this ID. + */ + bool has_sequence_with_id(size_t seq_id) const { + auto it = std::find_if(m_sequences.begin(), m_sequences.end(), [seq_id](const Sequence::Ptr& val) {return val->get_id() == seq_id;}); + return it != m_sequences.end(); + } + + + /** + * @param seq_id Sequence identifier + * @return Pointer to the sequence with this ID. + * @throw ov::Exception if the sequence with ID seq_id is not in this SequenceGroup + */ + Sequence::Ptr get_sequence_by_id(size_t seq_id) const { + auto it = std::find_if(m_sequences.begin(), m_sequences.end(), [seq_id](const Sequence::Ptr& val) {return val->get_id() == seq_id;}); + OPENVINO_ASSERT(it != m_sequences.end(), "sequence with id ", seq_id, " not found in sequence group with request id ", m_request_id); + return *it; + } + + std::vector<Sequence::CPtr> get_finished_sequences() const { + std::vector<Sequence::CPtr> finished_seqs; + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->has_finished() || m_sequences[seq_id]->out_of_memory()) { + finished_seqs.push_back(m_sequences[seq_id]); + } + } + + // do we need to sort sequences here or sampler can handle it for us? 
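+        // For now the group sorts them itself: descending by beam search score, so the best finished
+        // sequence is always at index 0 and callers can take the top-N outputs without re-sorting.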
+ std::sort(finished_seqs.begin(), finished_seqs.end(), [=] (Sequence::CPtr s1, Sequence::CPtr s2) { + return s1->get_beam_search_score(m_sampling_params) > s2->get_beam_search_score(m_sampling_params); + }); + + return finished_seqs; + } + + std::vector<Sequence::Ptr> get_running_sequences() { + std::vector<Sequence::Ptr> running_seqs; + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_running()) { + running_seqs.emplace_back(m_sequences[seq_id]); + } + } + + return running_seqs; + } + + std::vector<Sequence::Ptr> get_not_finished_sequences() { + std::vector<Sequence::Ptr> running_seqs; + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (!m_sequences[seq_id]->has_finished()) { + running_seqs.emplace_back(m_sequences[seq_id]); + } + } + + return running_seqs; + } + + std::vector<Sequence::CPtr> get_running_sequences() const { + std::vector<Sequence::CPtr> running_seqs; + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_running()) { + running_seqs.emplace_back(m_sequences[seq_id]); + } + } + + return running_seqs; + } + + uint64_t get_request_id() const { + return m_request_id; + } + + size_t get_num_scheduled_tokens() const { + return m_num_scheduled_tokens; + } + + size_t get_num_processed_tokens() const { + return m_num_processed_tokens; + } + + /** + * Registers within the sequence group that a given amount of tokens + * has been evicted from the underlying KV cache. + * NB: no per-layer or per-sequence indexing is required since current invariant is that + * there is always the same amount of KV cache blocks for each layer (i.e. eviction algo + * always evicts the same amount of blocks for each layer in each eviction step) and for each sequence in a group + * @param num_evicted_tokens Number of tokens evicted for this sequence at this generation step. + */ + void register_token_eviction(size_t num_evicted_tokens) { + m_num_evicted_tokens += num_evicted_tokens; + } + + + /** + * Resets the eviction tracking on this sequence to the state prior to any eviction taking place. + */ + void reset_eviction_token_count() { + m_num_evicted_tokens = 0; + } + + /** + * @return Number of tokens evicted for this sequence since the start of the processing for this sequence + */ + size_t get_num_evicted_tokens() const { + return m_num_evicted_tokens; + } + + void preempt_tokens(size_t num_preempt_tokens) { + OPENVINO_ASSERT(num_preempt_tokens <= m_num_processed_tokens); + m_num_processed_tokens -= num_preempt_tokens; + } + + // returns context length taking into account scheduled tokens + size_t get_context_len() const { + OPENVINO_ASSERT(!has_finished()); + return get_num_processed_tokens() + get_num_scheduled_tokens(); + } + + + bool requires_sampling() const { + return get_context_len() >= get_prompt_len() && get_context_len() > m_max_content_len; + } + + void schedule_tokens(size_t num_tokens) { + m_num_scheduled_tokens = num_tokens; + } + + void clear_scheduled_tokens() { + m_num_scheduled_tokens = 0; + m_num_validated_tokens = 0; + } + + bool is_scheduled() const { + return m_num_scheduled_tokens > 0; + } + + void set_num_validated_tokens(size_t k) { + // in case of non-prompt we need to take prev tokens + token to validate + // m_num_validated_tokens = get_num_processed_tokens() ? 
k + 1 : k; + m_num_validated_tokens = k; + } + + size_t get_num_tokens_to_validate() { + return m_num_validated_tokens; + } + + size_t get_num_available_tokens_for_batching() const { + OPENVINO_ASSERT(!has_finished(), "Internal error: this function cannot be called on finished sequence group"); + OPENVINO_ASSERT(get_num_scheduled_tokens() == 0, "Internal error: this function cannot be called when we are already in scheduling phase"); + // if sequence group has not finished, it has at least one token to process + size_t num_available_tokens = std::max(get_prompt_len(), m_max_content_len); + return std::max<size_t>(num_available_tokens - m_num_processed_tokens, 1u) + m_num_validated_tokens; + } + + // mark current schedule phase as finished and updates internal counters + void finish_iteration() { + m_num_processed_tokens += m_num_scheduled_tokens; + // if some processed tokens were evicted, max content len is greater than number of processed tokens + m_max_content_len = std::max(m_max_content_len, m_num_processed_tokens); + clear_scheduled_tokens(); + } + + void update_processed_tokens_num(size_t processed_tokens) { + m_num_processed_tokens = processed_tokens; + m_max_content_len = processed_tokens; + } + + void clear_waiting_sequences() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_waiting()) { + m_sequences[seq_id]->set_status(SequenceStatus::RUNNING); + } + } + } + + const TokenIds& get_prompt_ids() const { + return m_prompt_ids; + } + + /** + * @return The number of logical KV cache blocks required to host all the tokens in this sequence group, taking into account previous token evictions. + */ + size_t get_num_logical_blocks() const { + return (get_context_len() - get_num_evicted_tokens() + m_block_size - 1) / m_block_size; + } + + + // requires number of physical blocks for next generation + size_t get_num_blocks() const { + return get_num_logical_blocks(); + } + + size_t get_block_size() const { + return m_block_size; + } + + Sequence::Ptr fork_sequence(Sequence::CPtr sequence) { + auto ptr = sequence->get_sequence_group_ptr(); + m_sequences.emplace_back(Sequence::fork(std::move(sequence), m_next_sequence_id++)); + set_sequence_group_ptr(ptr); + return m_sequences.back(); + } + + const ov::genai::GenerationConfig& get_sampling_parameters() const { + return m_sampling_params; + } + + void set_out_of_memory() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_running()) { + m_sequences[seq_id]->set_status(SequenceStatus::OUT_OF_MEMORY); + } + } + } + + void set_waiting() { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_running()) { + m_sequences[seq_id]->set_status(SequenceStatus::WAITING); + } + } + } + + bool out_of_memory() const { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->out_of_memory()) { + return true; + } + } + return false; + } + + bool is_waiting() const { + for (size_t seq_id = 0; seq_id < m_sequences.size(); ++seq_id) { + if (m_sequences[seq_id]->is_waiting()) { + return true; + } + } + return false; + } + + void set_sequence_group_ptr(std::shared_ptr<SequenceGroup> sequence_group) { + for (auto sequence: m_sequences) { + sequence->set_sequence_group_ptr(sequence_group); + } + } + + GenerationStream::Ptr get_generation_stream() { + return m_generation_stream; + } + + void set_generation_status(GenerationStatus status) { + m_generation_stream->set_generation_status(status); + 
} + + bool handle_dropped() { + return m_generation_stream->get_status() == GenerationStatus::DROPPED_BY_HANDLE; + } + + void push_empty_outputs() { + m_generation_stream->push({}); + } + + void push_outputs() { + GenerationOutputs outputs; + for (auto& sequence: m_sequences) { + GenerationOutput output; + output.generated_ids = sequence->get_generated_ids(); + output.generated_log_probs = sequence->get_generated_log_probs(); + output.score = m_sampling_params.is_beam_search() ? sequence->get_beam_search_score(m_sampling_params) : sequence->get_cumulative_log_probs(); + output.finish_reason = sequence->get_finish_reason(); + outputs.emplace(sequence->get_grouped_id(), output); + } + m_generation_stream->push(std::move(outputs)); + } + + void push_partial_outputs() { + GenerationOutputs outputs; + for (auto& sequence : m_sequences) { + // todo: check seq.is_finished() to generate without several </s> + // or is it ok to use padding? + const auto last_gen_token = sequence->get_last_generation_output(); + outputs.emplace(sequence->get_grouped_id(), last_gen_token); + } + m_generation_stream->push(std::move(outputs)); + } + + void notify_handle() { + if (out_of_memory()) { + set_generation_status(GenerationStatus::IGNORED); + } else if (has_finished()) { + set_generation_status(GenerationStatus::FINISHED); + } + // For beam search streaming is not available, so we notify only upon finishing + if(m_sampling_params.is_beam_search()) { + if (has_finished() || out_of_memory()) { + push_outputs(); + } + } else if (m_sampling_params.is_greedy_decoding() || m_sampling_params.is_multinomial()) { + // We can stream only when one sequence is returned and we don't use stop strings that would be excluded from the output + // (after stop string is detected its tokens are already sent) + if (num_total_seqs() == 1&& (m_sampling_params.stop_strings.empty() || m_sampling_params.include_stop_str_in_output)) { + push_partial_outputs(); + } else if (has_finished() || out_of_memory()) { + push_outputs(); + } + } + } +}; +} diff --git a/src/cpp/src/synchronized_queue.hpp b/src/cpp/src/synchronized_queue.hpp new file mode 100644 index 0000000000..55649a7032 --- /dev/null +++ b/src/cpp/src/synchronized_queue.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <queue> +#include <mutex> +#include <condition_variable> + +template <typename T> +class SynchronizedQueue +{ + std::queue<T> m_queue; + std::mutex m_mutex; + std::condition_variable m_cv; + +public: + SynchronizedQueue() = default; + SynchronizedQueue(const SynchronizedQueue&) = delete; + SynchronizedQueue(const SynchronizedQueue&&) = delete; + SynchronizedQueue& operator=(const SynchronizedQueue&) = delete; + + T back() { + std::unique_lock<std::mutex> lock(m_mutex); + m_cv.wait(lock, [this]{return !m_queue.empty();}); + return m_queue.back(); + } + + T pull() { + std::unique_lock<std::mutex> lock(m_mutex); + m_cv.wait(lock, [this]{return !m_queue.empty();}); + auto val = m_queue.front(); + m_queue.pop(); + return val; + } + + void push(const T& item) { + std::unique_lock<std::mutex> lock(m_mutex); + m_queue.push(item); + m_cv.notify_one(); + } + + bool empty() { + std::unique_lock<std::mutex> lock(m_mutex); + return m_queue.empty(); + } +}; diff --git a/src/cpp/src/text2image/diffusion_pipeline.hpp b/src/cpp/src/text2image/diffusion_pipeline.hpp new file mode 100644 index 0000000000..58843b8667 --- /dev/null +++ b/src/cpp/src/text2image/diffusion_pipeline.hpp @@ -0,0 +1,83 @@ 
+// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <fstream> + +#include "text2image/schedulers/ischeduler.hpp" +#include "openvino/genai/text2image/pipeline.hpp" + +#include "utils.hpp" + +namespace { + +void batch_copy(ov::Tensor src, ov::Tensor dst, size_t src_batch, size_t dst_batch, size_t batch_size = 1) { + const ov::Shape src_shape = src.get_shape(), dst_shape = dst.get_shape(); + ov::Coordinate src_start(src_shape.size(), 0), src_end = src_shape; + ov::Coordinate dst_start(dst_shape.size(), 0), dst_end = dst_shape; + + src_start[0] = src_batch; + src_end[0] = src_batch + batch_size; + + dst_start[0] = dst_batch; + dst_end[0] = dst_batch + batch_size; + + ov::Tensor(src, src_start, src_end).copy_to(ov::Tensor(dst, dst_start, dst_end)); +} + +const std::string get_class_name(const std::string& root_dir) { + const std::string model_index_path = root_dir + "/model_index.json"; + std::ifstream file(model_index_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); + + nlohmann::json data = nlohmann::json::parse(file); + using ov::genai::utils::read_json_param; + + return data["_class_name"].get<std::string>(); +} + +} // namespace + + +namespace ov { +namespace genai { + +class Text2ImagePipeline::DiffusionPipeline { +public: + GenerationConfig get_generation_config() const { + return m_generation_config; + } + + void set_generation_config(const GenerationConfig& generation_config) { + m_generation_config = generation_config; + m_generation_config.validate(); + } + + void set_scheduler(std::shared_ptr<Scheduler> scheduler) { + auto casted = std::dynamic_pointer_cast<IScheduler>(scheduler); + OPENVINO_ASSERT(casted != nullptr, "Passed incorrect scheduler type"); + m_scheduler = casted; + } + + virtual void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) = 0; + + virtual void compile(const std::string& device, const ov::AnyMap& properties) = 0; + + virtual ov::Tensor generate(const std::string& positive_prompt, const ov::AnyMap& properties) = 0; + + virtual ~DiffusionPipeline() = default; + +protected: + virtual void initialize_generation_config(const std::string& class_name) = 0; + + virtual void check_image_size(const int height, const int width) const = 0; + + virtual void check_inputs(const GenerationConfig& generation_config) const = 0; + + std::shared_ptr<IScheduler> m_scheduler; + GenerationConfig m_generation_config; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/models/autoencoder_kl.cpp b/src/cpp/src/text2image/models/autoencoder_kl.cpp new file mode 100644 index 0000000000..30b378963b --- /dev/null +++ b/src/cpp/src/text2image/models/autoencoder_kl.cpp @@ -0,0 +1,115 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/autoencoder_kl.hpp" + +#include <fstream> +#include <memory> + +#include "openvino/runtime/core.hpp" +#include "openvino/core/preprocess/pre_post_process.hpp" +#include "openvino/op/clamp.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/constant.hpp" + +#include "utils.hpp" +#include "lora_helper.hpp" + +namespace ov { +namespace genai { + +AutoencoderKL::Config::Config(const std::string& config_path) { + std::ifstream file(config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using 
utils::read_json_param; + + read_json_param(data, "in_channels", in_channels); + read_json_param(data, "latent_channels", latent_channels); + read_json_param(data, "out_channels", out_channels); + read_json_param(data, "scaling_factor", scaling_factor); + read_json_param(data, "block_out_channels", block_out_channels); +} + +AutoencoderKL::AutoencoderKL(const std::string& root_dir) + : m_config(root_dir + "/config.json") { + m_model = ov::Core().read_model(root_dir + "/openvino_model.xml"); + // apply VaeImageProcessor postprocessing steps by merging them into the VAE decoder model + merge_vae_image_processor(); +} + +AutoencoderKL::AutoencoderKL(const std::string& root_dir, + const std::string& device, + const ov::AnyMap& properties) + : AutoencoderKL(root_dir) { + if(auto filtered_properties = extract_adapters_from_properties(properties)) { + compile(device, *filtered_properties); + } else { + compile(device, properties); + } +} + +AutoencoderKL::AutoencoderKL(const AutoencoderKL&) = default; + +AutoencoderKL& AutoencoderKL::reshape(int batch_size, int height, int width) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot reshape already compiled model"); + + const size_t vae_scale_factor = std::pow(2, m_config.block_out_channels.size() - 1); + + height /= vae_scale_factor; + width /= vae_scale_factor; + + ov::PartialShape input_shape = m_model->input(0).get_partial_shape(); + std::map<size_t, ov::PartialShape> idx_to_shape{{0, {batch_size, input_shape[1], height, width}}}; + m_model->reshape(idx_to_shape); + + return *this; +} + +AutoencoderKL& AutoencoderKL::compile(const std::string& device, const ov::AnyMap& properties) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model"); + ov::CompiledModel compiled_model = ov::Core().compile_model(m_model, device, properties); + m_request = compiled_model.create_infer_request(); + // release the original model + m_model.reset(); + + return *this; +} + +ov::Tensor AutoencoderKL::infer(ov::Tensor latent) { + OPENVINO_ASSERT(m_request, "VAE decoder model must be compiled first. 
Cannot infer non-compiled model"); + + m_request.set_input_tensor(latent); + m_request.infer(); + return m_request.get_output_tensor(); +} + +void AutoencoderKL::merge_vae_image_processor() const { + ov::preprocess::PrePostProcessor ppp(m_model); + + // scale input before VAE encoder + ppp.input().preprocess().scale(m_config.scaling_factor); + + // apply VaeImageProcessor normalization steps + // https://github.com/huggingface/diffusers/blob/v0.30.1/src/diffusers/image_processor.py#L159 + ppp.output().postprocess().custom([](const ov::Output<ov::Node>& port) { + auto constant_0_5 = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, 0.5f); + auto constant_255 = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, 255.0f); + auto scaled_0_5 = std::make_shared<ov::op::v1::Multiply>(port, constant_0_5); + auto added_0_5 = std::make_shared<ov::op::v1::Add>(scaled_0_5, constant_0_5); + auto clamped = std::make_shared<ov::op::v0::Clamp>(added_0_5, 0.0f, 1.0f); + return std::make_shared<ov::op::v1::Multiply>(clamped, constant_255); + }); + ppp.output().postprocess().convert_element_type(ov::element::u8); + // layout conversion + // https://github.com/huggingface/diffusers/blob/v0.30.1/src/diffusers/image_processor.py#L144 + ppp.output().model().set_layout("NCHW"); + ppp.output().tensor().set_layout("NHWC"); + + ppp.build(); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/models/clip_text_model.cpp b/src/cpp/src/text2image/models/clip_text_model.cpp new file mode 100644 index 0000000000..06cbdd1852 --- /dev/null +++ b/src/cpp/src/text2image/models/clip_text_model.cpp @@ -0,0 +1,120 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/clip_text_model.hpp" + +#include <fstream> + +#include "openvino/runtime/core.hpp" + +#include "utils.hpp" +#include "lora_helper.hpp" + +namespace ov { +namespace genai { + +CLIPTextModel::Config::Config(const std::string& config_path) { + std::ifstream file(config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "max_position_embeddings", max_position_embeddings); + read_json_param(data, "hidden_size", hidden_size); + read_json_param(data, "num_hidden_layers", num_hidden_layers); +} + +CLIPTextModel::CLIPTextModel(const std::string root_dir) : + m_clip_tokenizer(root_dir + "/../tokenizer"), + m_config(root_dir + "/config.json") { + m_model = ov::Core().read_model(root_dir + "/openvino_model.xml"); +} + +CLIPTextModel::CLIPTextModel(const std::string& root_dir, + const std::string& device, + const ov::AnyMap& properties) : + CLIPTextModel(root_dir) { + AdapterConfig adapters; + if(auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) { + m_adapter_controller = AdapterController(m_model, adapters, "lora_te", device); + compile(device, *filtered_properties); + } else { + compile(device, properties); + } +} + +CLIPTextModel::CLIPTextModel(const CLIPTextModel&) = default; + +const CLIPTextModel::Config& CLIPTextModel::get_config() const { + return m_config; +} + +CLIPTextModel& CLIPTextModel::reshape(int batch_size) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. 
Cannot reshape already compiled model"); + + ov::PartialShape input_shape = m_model->input(0).get_partial_shape(); + input_shape[0] = batch_size; + input_shape[1] = m_config.max_position_embeddings; + std::map<size_t, ov::PartialShape> idx_to_shape{{0, input_shape}}; + m_model->reshape(idx_to_shape); + + return *this; +} + +CLIPTextModel& CLIPTextModel::compile(const std::string& device, const ov::AnyMap& properties) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model"); + ov::CompiledModel compiled_model = ov::Core().compile_model(m_model, device, properties); + m_request = compiled_model.create_infer_request(); + // release the original model + m_model.reset(); + + return *this; +} + +void CLIPTextModel::set_adapters(const AdapterConfig& adapters) { + m_adapter_controller.apply(m_request, adapters); +} + +ov::Tensor CLIPTextModel::infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance) { + OPENVINO_ASSERT(m_request, "CLIP text encoder model must be compiled first. Cannot infer non-compiled model"); + + const int32_t pad_token_id = m_clip_tokenizer.get_pad_token_id(); + const size_t text_embedding_batch_size = do_classifier_free_guidance ? 2 : 1; + + auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) { + std::fill_n(input_ids.data<int32_t>(), input_ids.get_size(), pad_token_id); + + ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids; + std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<std::int32_t>()); + }; + + ov::Tensor input_ids(ov::element::i32, {text_embedding_batch_size, m_config.max_position_embeddings}); + size_t current_batch_idx = 0; + + if (do_classifier_free_guidance) { + perform_tokenization(neg_prompt, + ov::Tensor(input_ids, {current_batch_idx , 0}, + {current_batch_idx + 1, m_config.max_position_embeddings})); + ++current_batch_idx; + } else { + // Negative prompt is ignored when --guidanceScale < 1.0 + } + + perform_tokenization(pos_prompt, + ov::Tensor(input_ids, {current_batch_idx , 0}, + {current_batch_idx + 1, m_config.max_position_embeddings})); + + // text embeddings + m_request.set_tensor("input_ids", input_ids); + m_request.infer(); + + return m_request.get_output_tensor(0); +} + +ov::Tensor CLIPTextModel::get_output_tensor(const size_t idx) { + return m_request.get_output_tensor(idx); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp b/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp new file mode 100644 index 0000000000..6a268402e1 --- /dev/null +++ b/src/cpp/src/text2image/models/clip_text_model_with_projection.cpp @@ -0,0 +1,109 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/clip_text_model_with_projection.hpp" + +#include <fstream> + +#include "openvino/runtime/core.hpp" + +#include "utils.hpp" + +namespace ov { +namespace genai { + +CLIPTextModelWithProjection::Config::Config(const std::string& config_path) { + std::ifstream file(config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "max_position_embeddings", max_position_embeddings); + read_json_param(data, "hidden_size", hidden_size); + read_json_param(data, "num_hidden_layers", num_hidden_layers); +} + 
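+// Note: unlike CLIPTextModel, which loads its tokenizer from "../tokenizer", this projection variant is
+// paired with the pipeline's second tokenizer, loaded from "../tokenizer_2" relative to the model directory.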
+CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string root_dir) : + m_clip_tokenizer(root_dir + "/../tokenizer_2"), + m_config(root_dir + "/config.json") { + m_model = ov::Core().read_model(root_dir + "/openvino_model.xml"); +} + +CLIPTextModelWithProjection::CLIPTextModelWithProjection(const std::string& root_dir, + const std::string& device, + const ov::AnyMap& properties) : + CLIPTextModelWithProjection(root_dir) { + compile(device, properties); +} + +CLIPTextModelWithProjection::CLIPTextModelWithProjection(const CLIPTextModelWithProjection&) = default; + +const CLIPTextModelWithProjection::Config& CLIPTextModelWithProjection::get_config() const { + return m_config; +} + +CLIPTextModelWithProjection& CLIPTextModelWithProjection::reshape(int batch_size) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot reshape already compiled model"); + + ov::PartialShape input_shape = m_model->input(0).get_partial_shape(); + input_shape[0] = batch_size; + input_shape[1] = m_config.max_position_embeddings; + std::map<size_t, ov::PartialShape> idx_to_shape{{0, input_shape}}; + m_model->reshape(idx_to_shape); + + return *this; +} + +CLIPTextModelWithProjection& CLIPTextModelWithProjection::compile(const std::string& device, const ov::AnyMap& properties) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model"); + ov::CompiledModel compiled_model = ov::Core().compile_model(m_model, device, properties); + m_request = compiled_model.create_infer_request(); + // release the original model + m_model.reset(); + + return *this; +} + +ov::Tensor CLIPTextModelWithProjection::infer(const std::string& pos_prompt, const std::string& neg_prompt, bool do_classifier_free_guidance) { + OPENVINO_ASSERT(m_request, "CLIP text encoder model must be compiled first. Cannot infer non-compiled model"); + + const int32_t pad_token_id = m_clip_tokenizer.get_pad_token_id(); + const size_t text_embedding_batch_size = do_classifier_free_guidance ? 
2 : 1; + + auto perform_tokenization = [&](const std::string& prompt, ov::Tensor input_ids) { + std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), pad_token_id); + + ov::Tensor input_ids_token = m_clip_tokenizer.encode(prompt).input_ids; + std::copy_n(input_ids_token.data<std::int64_t>(), input_ids_token.get_size(), input_ids.data<std::int64_t>()); + }; + + ov::Tensor input_ids(ov::element::i64, {text_embedding_batch_size, m_config.max_position_embeddings}); + size_t current_batch_idx = 0; + + if (do_classifier_free_guidance) { + perform_tokenization(neg_prompt, + ov::Tensor(input_ids, {current_batch_idx , 0}, + {current_batch_idx + 1, m_config.max_position_embeddings})); + ++current_batch_idx; + } else { + // Negative prompt is ignored when --guidanceScale < 1.0 + } + + perform_tokenization(pos_prompt, + ov::Tensor(input_ids, {current_batch_idx , 0}, + {current_batch_idx + 1, m_config.max_position_embeddings})); + + // text embeddings + m_request.set_tensor("input_ids", input_ids); + m_request.infer(); + + return m_request.get_output_tensor(0); +} + +ov::Tensor CLIPTextModelWithProjection::get_output_tensor(const size_t idx) { + return m_request.get_output_tensor(idx); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/models/unet2d_condition_model.cpp b/src/cpp/src/text2image/models/unet2d_condition_model.cpp new file mode 100644 index 0000000000..d356515678 --- /dev/null +++ b/src/cpp/src/text2image/models/unet2d_condition_model.cpp @@ -0,0 +1,118 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/unet2d_condition_model.hpp" + +#include <fstream> + +#include "openvino/runtime/core.hpp" + +#include "utils.hpp" +#include "lora_helper.hpp" + +namespace ov { +namespace genai { + +UNet2DConditionModel::Config::Config(const std::string& config_path) { + std::ifstream file(config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "in_channels", in_channels); + read_json_param(data, "sample_size", sample_size); + read_json_param(data, "block_out_channels", block_out_channels); + read_json_param(data, "time_cond_proj_dim", time_cond_proj_dim); +} + +UNet2DConditionModel::UNet2DConditionModel(const std::string root_dir) : + m_config(root_dir + "/config.json") { + m_model = ov::Core().read_model(root_dir + "/openvino_model.xml"); + // compute VAE scale factor + m_vae_scale_factor = std::pow(2, m_config.block_out_channels.size() - 1); +} + +UNet2DConditionModel::UNet2DConditionModel(const std::string& root_dir, + const std::string& device, + const ov::AnyMap& properties) : + UNet2DConditionModel(root_dir) { + AdapterConfig adapters; + if(auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) { + m_adapter_controller = AdapterController(m_model, adapters, "lora_unet", device); + compile(device, *filtered_properties); + } else { + compile(device, properties); + } +} + +UNet2DConditionModel::UNet2DConditionModel(const UNet2DConditionModel&) = default; + +const UNet2DConditionModel::Config& UNet2DConditionModel::get_config() const { + return m_config; +} + +size_t UNet2DConditionModel::get_vae_scale_factor() const { + return m_vae_scale_factor; +} + +UNet2DConditionModel& UNet2DConditionModel::reshape(int batch_size, int height, int width, int tokenizer_model_max_length) { + OPENVINO_ASSERT(m_model, "Model has been already 
compiled. Cannot reshape already compiled model"); + + height /= m_vae_scale_factor; + width /= m_vae_scale_factor; + + std::map<std::string, ov::PartialShape> name_to_shape; + + for (auto && input : m_model->inputs()) { + std::string input_name = input.get_any_name(); + name_to_shape[input_name] = input.get_partial_shape(); + if (input_name == "timestep") { + name_to_shape[input_name][0] = 1; + } else if (input_name == "sample") { + name_to_shape[input_name] = {batch_size, name_to_shape[input_name][1], height, width}; + } else if (input_name == "time_ids" || input_name == "text_embeds") { + name_to_shape[input_name][0] = batch_size; + } else if (input_name == "encoder_hidden_states") { + name_to_shape[input_name][0] = batch_size; + name_to_shape[input_name][1] = tokenizer_model_max_length; + } + } + + m_model->reshape(name_to_shape); + + return *this; +} + +UNet2DConditionModel& UNet2DConditionModel::compile(const std::string& device, const ov::AnyMap& properties) { + OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model"); + ov::CompiledModel compiled_model = ov::Core().compile_model(m_model, device, properties); + m_request = compiled_model.create_infer_request(); + // release the original model + m_model.reset(); + + return *this; +} + +void UNet2DConditionModel::set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) { + OPENVINO_ASSERT(m_request, "UNet model must be compiled first"); + m_request.set_tensor(tensor_name, encoder_hidden_states); +} + +void UNet2DConditionModel::set_adapters(const AdapterConfig& adapters) { + m_adapter_controller.apply(m_request, adapters); +} + +ov::Tensor UNet2DConditionModel::infer(ov::Tensor sample, ov::Tensor timestep) { + OPENVINO_ASSERT(m_request, "UNet model must be compiled first. Cannot infer non-compiled model"); + + m_request.set_tensor("sample", sample); + m_request.set_tensor("timestep", timestep); + + m_request.infer(); + + return m_request.get_output_tensor(); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/numpy_utils.cpp b/src/cpp/src/text2image/numpy_utils.cpp new file mode 100644 index 0000000000..9554681820 --- /dev/null +++ b/src/cpp/src/text2image/numpy_utils.cpp @@ -0,0 +1,79 @@ +#include "text2image/numpy_utils.hpp" +#include "openvino/core/except.hpp" + +namespace ov { +namespace genai { +namespace numpy_utils { + +void rescale_zero_terminal_snr(std::vector<float>& betas) { + // Convert betas to alphas_bar_sqrt + std::vector<float> alphas, alphas_bar_sqrt; + for (float b : betas) { + alphas.push_back(1.0f - b); + } + + for (size_t i = 1; i <= alphas.size(); ++i) { + float alpha_cumprod = + std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies<float>{}); + alphas_bar_sqrt.push_back(std::sqrt(alpha_cumprod)); + } + + float alphas_bar_sqrt_0 = alphas_bar_sqrt[0]; + float alphas_bar_sqrt_T = alphas_bar_sqrt[alphas_bar_sqrt.size() - 1]; + + for (float& x : alphas_bar_sqrt) { + // Shift so the last timestep is zero. + x = x - alphas_bar_sqrt_T; + // Scale so the first timestep is back to the old value. 
+ x *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T); + // Revert sqrt + x = std::pow(x, 2); + } + + // Revert cumprod + std::vector<float> end = alphas_bar_sqrt, begin = alphas_bar_sqrt; + end.erase(end.begin()); + begin.pop_back(); + + alphas[0] = alphas_bar_sqrt[0]; + for (size_t i = 1; i < alphas.size(); ++i) { + alphas[i] = end[i - 1] / begin[i - 1]; + } + + std::transform(alphas.begin(), alphas.end(), betas.begin(), [](float x) { + return (1 - x); + }); +} + +std::vector<float> interp(const std::vector<std::int64_t>& x, const std::vector<size_t>& xp, const std::vector<float>& fp) { + OPENVINO_ASSERT(xp.size() == fp.size(), "`xp` and `fp`vectors must have the same sizes"); + + std::vector<float> interp_res; + + for (const auto& i : x) { + if (i <= xp[0]) { + interp_res.push_back(fp[0]); + } else if (i >= xp[xp.size() - 1]) { + interp_res.push_back(fp[fp.size() - 1]); + } else { + // Find the first xp element that is not less than x[i] + auto it = std::lower_bound(xp.begin(), xp.end(), i); + + // idx of the left boundary + size_t idx = std::distance(xp.begin(), it) - 1; + + float x0 = xp[idx], x1 = xp[idx + 1]; + float y0 = fp[idx], y1 = fp[idx + 1]; + + float interp_val = (y1 - y0) / (x1 - x0) * (i - x0) + y0; + + interp_res.push_back(interp_val); + } + } + + return interp_res; +} + +} // namespace ov +} // namespace genai +} // namespace numpy_utils diff --git a/src/cpp/src/text2image/numpy_utils.hpp b/src/cpp/src/text2image/numpy_utils.hpp new file mode 100644 index 0000000000..d6144eeb99 --- /dev/null +++ b/src/cpp/src/text2image/numpy_utils.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <vector> +#include <cstddef> +#include <cstdint> +#include <numeric> +#include <algorithm> +#include <cmath> + +namespace ov { +namespace genai { +namespace numpy_utils { + +// https://gist.github.com/lorenzoriano/5414671 +template <typename T, typename U> +std::vector<T> linspace(U start, U end, size_t num, bool endpoint = false) { + std::vector<T> indices; + if (num != 0) { + if (num == 1) + indices.push_back(static_cast<T>(start)); + else { + if (endpoint) + --num; + + U delta = (end - start) / static_cast<U>(num); + for (size_t i = 0; i < num; i++) + indices.push_back(static_cast<T>(start + delta * i)); + + if (endpoint) + indices.push_back(static_cast<T>(end)); + } + } + return indices; +} + +// Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) +void rescale_zero_terminal_snr(std::vector<float>& betas); + +// np.interp(...) 
implementation +std::vector<float> interp(const std::vector<std::int64_t>& x, const std::vector<size_t>& xp, const std::vector<float>& fp); + +} // namespace ov +} // namespace genai +} // namespace numpy_utils diff --git a/src/cpp/src/text2image/schedulers/ddim.cpp b/src/cpp/src/text2image/schedulers/ddim.cpp new file mode 100644 index 0000000000..a25cf7227e --- /dev/null +++ b/src/cpp/src/text2image/schedulers/ddim.cpp @@ -0,0 +1,202 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <cassert> +#include <random> +#include <fstream> +#include <iterator> + +#include "text2image/schedulers/ddim.hpp" +#include "utils.hpp" +#include "text2image/numpy_utils.hpp" + +namespace ov { +namespace genai { + +DDIMScheduler::Config::Config(const std::string& scheduler_config_path) { + std::ifstream file(scheduler_config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "num_train_timesteps", num_train_timesteps); + read_json_param(data, "beta_start", beta_start); + read_json_param(data, "beta_end", beta_end); + read_json_param(data, "beta_schedule", beta_schedule); + read_json_param(data, "trained_betas", trained_betas); + read_json_param(data, "clip_sample", clip_sample); + read_json_param(data, "set_alpha_to_one", set_alpha_to_one); + read_json_param(data, "steps_offset", steps_offset); + read_json_param(data, "prediction_type", prediction_type); + read_json_param(data, "thresholding", thresholding); + read_json_param(data, "dynamic_thresholding_ratio", dynamic_thresholding_ratio); + read_json_param(data, "clip_sample_range", clip_sample_range); + read_json_param(data, "sample_max_value", sample_max_value); + read_json_param(data, "timestep_spacing", timestep_spacing); + read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr); +} + +DDIMScheduler::DDIMScheduler(const std::string scheduler_config_path) + : DDIMScheduler(Config(scheduler_config_path)) { +} + +DDIMScheduler::DDIMScheduler(const Config& scheduler_config) + : m_config(scheduler_config) { + + std::vector<float> alphas, betas; + + using numpy_utils::linspace; + + if (!m_config.trained_betas.empty()) { + betas = m_config.trained_betas; + } else if (m_config.beta_schedule == BetaSchedule::LINEAR) { + betas = linspace<float>(m_config.beta_start, m_config.beta_end, m_config.num_train_timesteps); + } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) { + float start = std::sqrt(m_config.beta_start); + float end = std::sqrt(m_config.beta_end); + betas = linspace<float>(start, end, m_config.num_train_timesteps); + std::for_each(betas.begin(), betas.end(), [] (float & x) { x *= x; }); + } else { + OPENVINO_THROW("'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'. Please, add support of other types"); + } + + if (m_config.rescale_betas_zero_snr) { + using numpy_utils::rescale_zero_terminal_snr; + rescale_zero_terminal_snr(betas); + } + + std::transform(betas.begin(), betas.end(), std::back_inserter(alphas), [] (float b) { return 1.0f - b; }); + + for (size_t i = 1; i <= alphas.size(); i++) { + float alpha_cumprod = + std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies<float>{}); + m_alphas_cumprod.push_back(alpha_cumprod); + } + + m_final_alpha_cumprod = m_config.set_alpha_to_one ? 
1 : m_alphas_cumprod[0]; +} + +void DDIMScheduler::set_timesteps(size_t num_inference_steps) { + m_timesteps.clear(); + + OPENVINO_ASSERT(num_inference_steps <= m_config.num_train_timesteps, + "`num_inference_steps` cannot be larger than `m_config.num_train_timesteps`"); + + m_num_inference_steps = num_inference_steps; + + switch (m_config.timestep_spacing) { + case TimestepSpacing::LINSPACE: + { + using numpy_utils::linspace; + float end = static_cast<float>(m_config.num_train_timesteps - 1); + auto linspaced = linspace<float>(0.0f, end, num_inference_steps, true); + for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) { + m_timesteps.push_back(static_cast<int64_t>(std::round(*it))); + } + break; + } + case TimestepSpacing::LEADING: + { + size_t step_ratio = m_config.num_train_timesteps / m_num_inference_steps; + for (size_t i = num_inference_steps - 1; i != -1; --i) { + m_timesteps.push_back(i * step_ratio + m_config.steps_offset); + } + break; + } + case TimestepSpacing::TRAILING: + { + float step_ratio = static_cast<float>(m_config.num_train_timesteps) / static_cast<float>(m_num_inference_steps); + for (float i = m_config.num_train_timesteps; i > 0; i-=step_ratio){ + m_timesteps.push_back(static_cast<int64_t>(std::round(i)) - 1); + } + break; + } + default: + OPENVINO_THROW("Unsupported value for 'timestep_spacing'"); + } +} + +std::map<std::string, ov::Tensor> DDIMScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) { + // noise_pred - model_output + // latents - sample + // inference_step + + size_t timestep = get_timesteps()[inference_step]; + + // get previous step value (=t-1) + int prev_timestep = timestep - m_config.num_train_timesteps / m_num_inference_steps; + + // compute alphas, betas + float alpha_prod_t = m_alphas_cumprod[timestep]; + float alpha_prod_t_prev = (prev_timestep >= 0) ? m_alphas_cumprod[prev_timestep] : m_final_alpha_cumprod; + float beta_prod_t = 1 - alpha_prod_t; + + // compute predicted original sample from predicted noise also called + // "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + std::vector<float> pred_original_sample, pred_epsilon; + float pos_val, pe_val; + for (size_t j = 0; j < noise_pred.get_size(); j++) { + switch (m_config.prediction_type) { + case PredictionType::EPSILON: + pos_val = (latents.data<float>()[j] - std::sqrt(beta_prod_t) * noise_pred.data<float>()[j]) / std::sqrt(alpha_prod_t); + pe_val = noise_pred.data<float>()[j]; + pred_original_sample.push_back(pos_val); + pred_epsilon.push_back(pe_val); + break; + case PredictionType::SAMPLE: + pos_val = noise_pred.data<float>()[j]; + pe_val = (latents.data<float>()[j] - std::sqrt(alpha_prod_t) * pos_val) / std::sqrt(beta_prod_t); + pred_original_sample.push_back(pos_val); + pred_epsilon.push_back(pe_val); + break; + case PredictionType::V_PREDICTION: + pos_val = std::sqrt(alpha_prod_t) * latents.data<float>()[j] - std::sqrt(beta_prod_t) * noise_pred.data<float>()[j]; + pe_val = std::sqrt(alpha_prod_t) * noise_pred.data<float>()[j] + std::sqrt(beta_prod_t) * latents.data<float>()[j]; + pred_original_sample.push_back(pos_val); + pred_epsilon.push_back(pe_val); + break; + default: + OPENVINO_THROW("Unsupported value for 'PredictionType'"); + } + } + + // TODO: support m_config.thresholding + OPENVINO_ASSERT(!m_config.thresholding, + "Parameter 'thresholding' is not supported. Please, add support."); + // TODO: support m_config.clip_sample + OPENVINO_ASSERT(!m_config.clip_sample, + "Parameter 'clip_sample' is not supported. 
Please, add support."); + + // compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + std::vector<float> pred_sample_direction(pred_epsilon.size()); + std::transform(pred_epsilon.begin(), pred_epsilon.end(), pred_sample_direction.begin(), [alpha_prod_t_prev](auto x) { + return std::sqrt(1 - alpha_prod_t_prev) * x; + }); + + // compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + ov::Tensor prev_sample(latents.get_element_type(), latents.get_shape()); + float* prev_sample_data = prev_sample.data<float>(); + for (size_t i = 0; i < prev_sample.get_size(); ++i) { + prev_sample_data[i] = std::sqrt(alpha_prod_t_prev) * pred_original_sample[i] + pred_sample_direction[i]; + } + + std::map<std::string, ov::Tensor> result{{"latent", prev_sample}}; + + return result; +} + +std::vector<int64_t> DDIMScheduler::get_timesteps() const { + return m_timesteps; +} + +float DDIMScheduler::get_init_noise_sigma() const { + return 1.0f; +} + +void DDIMScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) { + return; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/ddim.hpp b/src/cpp/src/text2image/schedulers/ddim.hpp new file mode 100644 index 0000000000..936f4991ea --- /dev/null +++ b/src/cpp/src/text2image/schedulers/ddim.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <list> +#include <string> + +#include "text2image/schedulers/types.hpp" +#include "text2image/schedulers/ischeduler.hpp" + +namespace ov { +namespace genai { + +class DDIMScheduler : public IScheduler { +public: + struct Config { + int32_t num_train_timesteps = 1000; + float beta_start = 0.0001f, beta_end = 0.02f; + BetaSchedule beta_schedule = BetaSchedule::SCALED_LINEAR; + std::vector<float> trained_betas = {}; + bool clip_sample = true, set_alpha_to_one = true; + size_t steps_offset = 0; + PredictionType prediction_type = PredictionType::EPSILON; + bool thresholding = false; + float dynamic_thresholding_ratio = 0.995f, clip_sample_range = 1.0f, sample_max_value = 1.0f; + TimestepSpacing timestep_spacing = TimestepSpacing::LEADING; + bool rescale_betas_zero_snr = false; + + Config() = default; + explicit Config(const std::string& scheduler_config_path); + }; + + explicit DDIMScheduler(const std::string scheduler_config_path); + explicit DDIMScheduler(const Config& scheduler_config); + + void set_timesteps(size_t num_inference_steps) override; + + std::vector<std::int64_t> get_timesteps() const override; + + float get_init_noise_sigma() const override; + + void scale_model_input(ov::Tensor sample, size_t inference_step) override; + + std::map<std::string, ov::Tensor> step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) override; + +private: + Config m_config; + + std::vector<float> m_alphas_cumprod; + float m_final_alpha_cumprod; + + size_t m_num_inference_steps; + std::vector<int64_t> m_timesteps; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/euler_discrete.cpp b/src/cpp/src/text2image/schedulers/euler_discrete.cpp new file mode 100644 index 0000000000..2af3b83637 --- /dev/null +++ b/src/cpp/src/text2image/schedulers/euler_discrete.cpp @@ -0,0 +1,284 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "text2image/schedulers/euler_discrete.hpp" + +#include <cassert> +#include <fstream> +#include 
<iterator> +#include <random> + +#include "text2image/numpy_utils.hpp" +#include "utils.hpp" + +namespace ov { +namespace genai { + +EulerDiscreteScheduler::Config::Config(const std::string& scheduler_config_path) { + std::ifstream file(scheduler_config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "num_train_timesteps", num_train_timesteps); + read_json_param(data, "beta_start", beta_start); + read_json_param(data, "beta_end", beta_end); + read_json_param(data, "beta_schedule", beta_schedule); + read_json_param(data, "trained_betas", trained_betas); + read_json_param(data, "final_sigmas_type", final_sigmas_type); + read_json_param(data, "interpolation_type", interpolation_type); + read_json_param(data, "sigma_max", sigma_max); + read_json_param(data, "sigma_min", sigma_min); + read_json_param(data, "steps_offset", steps_offset); + read_json_param(data, "prediction_type", prediction_type); + read_json_param(data, "timestep_spacing", timestep_spacing); + read_json_param(data, "timestep_type", timestep_type); + read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr); + read_json_param(data, "use_karras_sigmas", use_karras_sigmas); + read_json_param(data, "use_exponential_sigmas", use_exponential_sigmas); + read_json_param(data, "use_beta_sigmas", use_beta_sigmas); +} + +EulerDiscreteScheduler::EulerDiscreteScheduler(const std::string scheduler_config_path) + : EulerDiscreteScheduler(Config(scheduler_config_path)) {} + +EulerDiscreteScheduler::EulerDiscreteScheduler(const Config& scheduler_config) : m_config(scheduler_config) { + std::vector<float> alphas, betas; + + using numpy_utils::linspace; + + if (!m_config.trained_betas.empty()) { + betas = m_config.trained_betas; + } else if (m_config.beta_schedule == BetaSchedule::LINEAR) { + betas = linspace<float>(m_config.beta_start, m_config.beta_end, m_config.num_train_timesteps); + } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) { + float start = std::sqrt(m_config.beta_start); + float end = std::sqrt(m_config.beta_end); + betas = linspace<float>(start, end, m_config.num_train_timesteps); + std::for_each(betas.begin(), betas.end(), [](float& x) { + x *= x; + }); + } else { + OPENVINO_THROW( + "'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'. 
Please, add support of other types"); + } + + if (m_config.rescale_betas_zero_snr) { + using numpy_utils::rescale_zero_terminal_snr; + rescale_zero_terminal_snr(betas); + } + + std::transform(betas.begin(), betas.end(), std::back_inserter(alphas), [](float b) { + return 1.0f - b; + }); + + for (size_t i = 1; i <= alphas.size(); ++i) { + float alpha_cumprod = + std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies<float>{}); + m_alphas_cumprod.push_back(alpha_cumprod); + } + + if (m_config.rescale_betas_zero_snr) { + m_alphas_cumprod.back() = std::pow(2, -24); + } + + for (auto it = m_alphas_cumprod.rbegin(); it != m_alphas_cumprod.rend(); ++it) { + float sigma = std::pow(((1 - (*it)) / (*it)), 0.5); + m_sigmas.push_back(sigma); + } + + auto linspaced = + linspace<float>(0.0f, static_cast<float>(m_config.num_train_timesteps - 1), m_config.num_train_timesteps, true); + for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) { + m_timesteps.push_back(static_cast<int64_t>(std::round(*it))); + } + + OPENVINO_ASSERT( + m_config.timestep_type != TimestepType::CONTINUOUS || m_config.prediction_type != PredictionType::V_PREDICTION, + "This case isn't supported: `timestep_type=continuous` and `prediction_type=v_prediction`. Please, add " + "support."); + + m_sigmas.push_back(0); + + m_step_index = -1; +} + +void EulerDiscreteScheduler::set_timesteps(size_t num_inference_steps) { + // TODO: support `timesteps` and `sigmas` inputs + m_timesteps.clear(); + m_sigmas.clear(); + m_step_index = -1; + + m_num_inference_steps = num_inference_steps; + std::vector<float> sigmas; + + OPENVINO_ASSERT( + m_config.timestep_type != TimestepType::CONTINUOUS || m_config.prediction_type != PredictionType::V_PREDICTION, + "This case isn't supported: `timestep_type=continuous` and `prediction_type=v_prediction`. 
Please, add " + "support."); + + switch (m_config.timestep_spacing) { + case TimestepSpacing::LINSPACE: { + using numpy_utils::linspace; + float end = static_cast<float>(m_config.num_train_timesteps - 1); + auto linspaced = linspace<float>(0.0f, end, num_inference_steps, true); + for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) { + m_timesteps.push_back(static_cast<int64_t>(std::round(*it))); + } + break; + } + case TimestepSpacing::LEADING: { + size_t step_ratio = m_config.num_train_timesteps / m_num_inference_steps; + for (size_t i = num_inference_steps - 1; i != -1; --i) { + m_timesteps.push_back(i * step_ratio + m_config.steps_offset); + } + break; + } + case TimestepSpacing::TRAILING: { + float step_ratio = static_cast<float>(m_config.num_train_timesteps) / static_cast<float>(m_num_inference_steps); + for (float i = m_config.num_train_timesteps; i > 0; i -= step_ratio) { + m_timesteps.push_back(static_cast<int64_t>(std::round(i)) - 1); + } + break; + } + default: + OPENVINO_THROW("Unsupported value for 'timestep_spacing'"); + } + + for (const float& i : m_alphas_cumprod) { + float sigma = std::pow(((1 - i) / i), 0.5); + sigmas.push_back(sigma); + } + + switch (m_config.interpolation_type) { + case InterpolationType::LINEAR: { + using numpy_utils::interp; + + std::vector<size_t> x_data_points(sigmas.size()); + std::iota(x_data_points.begin(), x_data_points.end(), 0); + m_sigmas = interp(m_timesteps, x_data_points, sigmas); + break; + } + case InterpolationType::LOG_LINEAR: { + using numpy_utils::linspace; + + m_sigmas = linspace<float>(std::log(sigmas.back()), std::log(sigmas[0]), num_inference_steps + 1, true); + std::transform(m_sigmas.begin(), m_sigmas.end(), m_sigmas.begin(), [](float x) { + return std::exp(x); + }); + break; + } + default: + OPENVINO_THROW("Unsupported value for 'interpolation_type'"); + } + + OPENVINO_ASSERT(!m_config.use_karras_sigmas, + "Parameter 'use_karras_sigmas' is not supported. Please, add support."); + + OPENVINO_ASSERT(!m_config.use_exponential_sigmas, + "Parameter 'use_exponential_sigmas' is not supported. Please, add support."); + + OPENVINO_ASSERT(!m_config.use_beta_sigmas, "Parameter 'use_beta_sigmas' is not supported. Please, add support."); + + float sigma_last = 0; + switch (m_config.final_sigmas_type) { + case FinalSigmaType::SIGMA_MIN: + sigma_last = std::pow(((1 - m_alphas_cumprod[0]) / m_alphas_cumprod[0]), 0.5); + break; + case FinalSigmaType::ZERO: + break; + default: + OPENVINO_THROW("Unsupported value for 'final_sigmas_type'"); + } + m_sigmas.push_back(sigma_last); +} + +std::map<std::string, ov::Tensor> EulerDiscreteScheduler::step(ov::Tensor noise_pred, + ov::Tensor latents, + size_t inference_step) { + // noise_pred - model_output + // latents - sample + // inference_step + + size_t timestep = get_timesteps()[inference_step]; + + if (m_step_index == -1) + m_step_index = 0; + + float sigma = m_sigmas[m_step_index]; + // TODO: hardcoded gamma + float gamma = 0.0f; + float sigma_hat = sigma * (gamma + 1); + + float* model_output_data = noise_pred.data<float>(); + float* sample_data = latents.data<float>(); + + ov::Tensor pred_original_sample(noise_pred.get_element_type(), noise_pred.get_shape()); + float* pred_original_sample_data = pred_original_sample.data<float>(); + + ov::Tensor prev_sample(noise_pred.get_element_type(), noise_pred.get_shape()); + float* prev_sample_data = prev_sample.data<float>(); + + // 1. 
compute predicted original sample (x_0) from sigma-scaled predicted noise + switch (m_config.prediction_type) { + case PredictionType::EPSILON: + for (size_t i = 0; i < noise_pred.get_size(); ++i) { + pred_original_sample_data[i] = sample_data[i] - model_output_data[i] * sigma_hat; + } + break; + case PredictionType::SAMPLE: + for (size_t i = 0; i < noise_pred.get_size(); ++i) { + pred_original_sample_data[i] = model_output_data[i]; + } + break; + case PredictionType::V_PREDICTION: + for (size_t i = 0; i < noise_pred.get_size(); ++i) { + pred_original_sample_data[i] = model_output_data[i] * (-sigma / std::pow((std::pow(sigma, 2) + 1), 0.5)) + + (sample_data[i] / (std::pow(sigma, 2) + 1)); + } + break; + default: + OPENVINO_THROW("Unsupported value for 'PredictionType'"); + } + + float dt = m_sigmas[m_step_index + 1] - sigma_hat; + + // 2. Convert to an ODE derivative + for (size_t i = 0; i < prev_sample.get_size(); ++i) { + prev_sample_data[i] = ((sample_data[i] - pred_original_sample_data[i]) / sigma_hat) * dt + sample_data[i]; + } + + m_step_index += 1; + + return {{"latent", prev_sample}, {"denoised", pred_original_sample}}; +} + +std::vector<int64_t> EulerDiscreteScheduler::get_timesteps() const { + return m_timesteps; +} + +float EulerDiscreteScheduler::get_init_noise_sigma() const { + float max_sigma = *std::max_element(m_sigmas.begin(), m_sigmas.end()); + + if (m_config.timestep_spacing == TimestepSpacing::LINSPACE || + m_config.timestep_spacing == TimestepSpacing::TRAILING) { + return max_sigma; + } + + return std::sqrt(max_sigma * max_sigma + 1); +} + +void EulerDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) { + if (m_step_index == -1) + m_step_index = 0; + + float sigma = m_sigmas[m_step_index]; + float* sample_data = sample.data<float>(); + for (size_t i = 0; i < sample.get_size(); i++) { + sample_data[i] /= std::pow((std::pow(sigma, 2) + 1), 0.5); + } +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/euler_discrete.hpp b/src/cpp/src/text2image/schedulers/euler_discrete.hpp new file mode 100644 index 0000000000..96c3fbfbdf --- /dev/null +++ b/src/cpp/src/text2image/schedulers/euler_discrete.hpp @@ -0,0 +1,60 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <list> +#include <string> + +#include "text2image/schedulers/types.hpp" +#include "text2image/schedulers/ischeduler.hpp" + +namespace ov { +namespace genai { + +class EulerDiscreteScheduler : public IScheduler { +public: + struct Config { + int32_t num_train_timesteps = 1000; + float beta_start = 0.0001f, beta_end = 0.02f; + BetaSchedule beta_schedule = BetaSchedule::SCALED_LINEAR; + std::vector<float> trained_betas = {}; + FinalSigmaType final_sigmas_type = FinalSigmaType::ZERO; + InterpolationType interpolation_type = InterpolationType::LINEAR; + float sigma_max = 0.0f, sigma_min = 0.0f; + size_t steps_offset = 0; + PredictionType prediction_type = PredictionType::EPSILON; + TimestepSpacing timestep_spacing = TimestepSpacing::LEADING; + TimestepType timestep_type = TimestepType::DISCRETE; + bool rescale_betas_zero_snr = false; + bool use_karras_sigmas = false, use_exponential_sigmas = false, use_beta_sigmas = false; + + Config() = default; + explicit Config(const std::string& scheduler_config_path); + }; + + explicit EulerDiscreteScheduler(const std::string scheduler_config_path); + explicit EulerDiscreteScheduler(const Config& scheduler_config); + + void set_timesteps(size_t 
num_inference_steps) override; + + std::vector<std::int64_t> get_timesteps() const override; + + float get_init_noise_sigma() const override; + + void scale_model_input(ov::Tensor sample, size_t inference_step) override; + + std::map<std::string, ov::Tensor> step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) override; + +private: + Config m_config; + + std::vector<float> m_alphas_cumprod, m_sigmas; + std::vector<int64_t> m_timesteps; + size_t m_num_inference_steps; + + size_t m_step_index; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/ischeduler.hpp b/src/cpp/src/text2image/schedulers/ischeduler.hpp new file mode 100644 index 0000000000..51039765bf --- /dev/null +++ b/src/cpp/src/text2image/schedulers/ischeduler.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <cstdint> +#include <vector> + +#include "openvino/genai/text2image/pipeline.hpp" + +namespace ov { +namespace genai { + +class IScheduler : public Text2ImagePipeline::Scheduler { +public: + virtual void set_timesteps(size_t num_inference_steps) = 0; + + virtual std::vector<std::int64_t> get_timesteps() const = 0; + + virtual float get_init_noise_sigma() const = 0; + + virtual void scale_model_input(ov::Tensor sample, size_t inference_step) = 0; + + virtual std::map<std::string, ov::Tensor> step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) = 0; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/lcm.cpp b/src/cpp/src/text2image/schedulers/lcm.cpp new file mode 100644 index 0000000000..f9a87da8fb --- /dev/null +++ b/src/cpp/src/text2image/schedulers/lcm.cpp @@ -0,0 +1,251 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <cassert> +#include <random> +#include <fstream> +#include <iterator> + +#include "text2image/schedulers/lcm.hpp" +#include "utils.hpp" +#include "text2image/numpy_utils.hpp" + + +namespace ov { +namespace genai { + +LCMScheduler::Config::Config(const std::string scheduler_config_path) { + std::ifstream file(scheduler_config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "beta_start", beta_start); + read_json_param(data, "beta_end", beta_end); + read_json_param(data, "clip_sample", clip_sample); + read_json_param(data, "clip_sample_range", clip_sample_range); + read_json_param(data, "dynamic_thresholding_ratio", dynamic_thresholding_ratio); + read_json_param(data, "num_train_timesteps", num_train_timesteps); + read_json_param(data, "original_inference_steps", original_inference_steps); + read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr); + read_json_param(data, "sample_max_value", sample_max_value); + read_json_param(data, "set_alpha_to_one", set_alpha_to_one); + read_json_param(data, "steps_offset", steps_offset); + read_json_param(data, "thresholding", thresholding); + read_json_param(data, "timestep_scaling", timestep_scaling); + read_json_param(data, "trained_betas", trained_betas); + read_json_param(data, "beta_schedule", beta_schedule); + read_json_param(data, "prediction_type", prediction_type); + read_json_param(data, "timestep_spacing", timestep_spacing); +} + +LCMScheduler::LCMScheduler(const std::string scheduler_config_path) : + LCMScheduler(Config(scheduler_config_path)) { 
+} + +LCMScheduler::LCMScheduler(const Config& scheduler_config) + : m_config(scheduler_config), + m_seed(42), + m_gen(m_seed), + m_normal(0.0f, 1.0f) { + + m_sigma_data = 0.5f; // Default: 0.5 + + std::vector<float> alphas, betas; + + if (!m_config.trained_betas.empty()) { + betas = m_config.trained_betas; + } else if (m_config.beta_schedule == BetaSchedule::LINEAR) { + for (size_t i = 0; i < m_config.num_train_timesteps; i++) { + betas.push_back(m_config.beta_start + (m_config.beta_end - m_config.beta_start) * i / (m_config.num_train_timesteps - 1)); + } + } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) { + float start = std::sqrt(m_config.beta_start); + float end = std::sqrt(m_config.beta_end); + + using numpy_utils::linspace; + std::vector<float> temp = linspace<float, float>(start, end, m_config.num_train_timesteps, true); + for (float b : temp) { + betas.push_back(b * b); + } + } else { + OPENVINO_THROW("'beta_schedule' must be one of 'LINEAR' or 'SCALED_LINEAR'"); + } + + for (float b : betas) { + alphas.push_back(1.0f - b); + } + + for (size_t i = 1; i <= alphas.size(); i++) { + float alpha_cumprod = + std::accumulate(std::begin(alphas), std::begin(alphas) + i, 1.0, std::multiplies<float>{}); + m_alphas_cumprod.push_back(alpha_cumprod); + } + + m_final_alpha_cumprod = m_config.set_alpha_to_one ? 1 : m_alphas_cumprod[0]; +} + +void LCMScheduler::set_timesteps(size_t num_inference_steps) { + m_num_inference_steps = num_inference_steps; + const float strength = 1.0f; + + // LCM Timesteps Setting + size_t k = m_config.num_train_timesteps / m_config.original_inference_steps; + + size_t origin_timesteps_size = m_config.original_inference_steps * strength; + std::vector<size_t> lcm_origin_timesteps(origin_timesteps_size); + std::iota(lcm_origin_timesteps.begin(), lcm_origin_timesteps.end(), 1); + std::transform(lcm_origin_timesteps.begin(), lcm_origin_timesteps.end(), lcm_origin_timesteps.begin(), [&k](auto& x) { + return x * k - 1; + }); + + size_t skipping_step = origin_timesteps_size / m_num_inference_steps; + assert(skipping_step >= 1 && "The combination of `original_steps x strength` is smaller than `num_inference_steps`"); + + // LCM Inference Steps Schedule + std::reverse(lcm_origin_timesteps.begin(),lcm_origin_timesteps.end()); + + using numpy_utils::linspace; + // v1. based on https://github.com/huggingface/diffusers/blame/2a7f43a73bda387385a47a15d7b6fe9be9c65eb2/src/diffusers/schedulers/scheduling_lcm.py#L387 + std::vector<size_t> inference_indices = linspace<size_t, float>(0, origin_timesteps_size, m_num_inference_steps); + for (size_t i : inference_indices){ + m_timesteps.push_back(lcm_origin_timesteps[i]); + } + + // // v2. based on diffusers==0.23.1 + // std::vector<float> temp; + // for(size_t i = 0; i < lcm_origin_timesteps.size(); i+=skipping_step) + // temp.push_back(lcm_origin_timesteps[i]); + // for(size_t i = 0; i < num_inference_steps; i++) + // m_timesteps.push_back(temp[i]); + +} + +std::map<std::string, ov::Tensor> LCMScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) { + ov::Shape shape = latents.get_shape(); + size_t batch_size = shape[0], latent_size = ov::shape_size(shape) / batch_size; + float* noise_pred_data = noise_pred.data<float>(); + float* latents_data = latents.data<float>(); + + // 1. get previous step value + int64_t prev_step_index = inference_step + 1; + int64_t curr_step = m_timesteps[inference_step]; + int64_t prev_timestep = prev_step_index < static_cast<int64_t>(m_timesteps.size()) ? 
m_timesteps[prev_step_index] : curr_step; + + // 2. compute alphas, betas + float alpha_prod_t = m_alphas_cumprod[curr_step]; + float alpha_prod_t_prev = (prev_timestep >= 0) ? m_alphas_cumprod[prev_timestep] : m_final_alpha_cumprod; + float alpha_prod_t_sqrt = std::sqrt(alpha_prod_t); + float alpha_prod_t_prev_sqrt = std::sqrt(alpha_prod_t_prev); + float beta_prod_t_sqrt = std::sqrt(1 - alpha_prod_t); + float beta_prod_t_prev_sqrt = std::sqrt(1 - alpha_prod_t_prev); + + // 3. Get scalings for boundary conditions + // get_scalings_for_boundary_condition_discrete(...) + float scaled_timestep = curr_step * m_config.timestep_scaling; + float c_skip = std::pow(m_sigma_data, 2) / (std::pow(scaled_timestep, 2) + std::pow(m_sigma_data, 2)); + float c_out = scaled_timestep / std::sqrt((std::pow(scaled_timestep, 2) + std::pow(m_sigma_data, 2))); + + // 4. Compute the predicted original sample x_0 based on the model parameterization + std::vector<std::vector<float>> predicted_original_sample(batch_size); + // "epsilon" by default + if (m_config.prediction_type == PredictionType::EPSILON) { + for (std::size_t i = 0; i < batch_size; ++i) { + std::vector<float>& predicted_original_sample_l = predicted_original_sample[i]; + predicted_original_sample_l.resize(latent_size); + + for (std::size_t j = 0; j < latent_size; ++j) + predicted_original_sample_l[j] = (latents_data[i * latent_size + j] - + beta_prod_t_sqrt * noise_pred_data[i * latent_size + j]) / alpha_prod_t_sqrt; + } + } + + // 5. Clip or threshold "predicted x_0" + if (m_config.thresholding) { + for (std::size_t i = 0; i < batch_size; ++i) { + predicted_original_sample[i] = threshold_sample(predicted_original_sample[i]); + } + } else if (m_config.clip_sample) { + for (std::size_t i = 0; i < batch_size; ++i) { + for (float& value : predicted_original_sample[i]) { + value = std::clamp(value, - m_config.clip_sample_range, m_config.clip_sample_range); + } + } + } + + // 6. Denoise model output using boundary conditions + ov::Tensor denoised(latents.get_element_type(), latents.get_shape()); + float* denoised_data = denoised.data<float>(); + for (std::size_t i = 0; i < batch_size; ++i) { + for (std::size_t j = 0; j < latent_size; ++j) { + denoised_data[i * latent_size + j] = c_out * predicted_original_sample[i][j] + c_skip * latents_data[i * latent_size + j]; + } + } + + /// 7. Sample and inject noise z ~ N(0, I) for MultiStep Inference + // Noise is not used on the final timestep of the timestep schedule. + // This also means that noise is not used for one-step sampling. 
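+    // Concretely, for every step except the last one the next latent is re-noised as
+    //     prev_sample = sqrt(alpha_prod_t_prev) * denoised + sqrt(1 - alpha_prod_t_prev) * z,  z ~ N(0, I),
+    // while the last step returns `denoised` unchanged.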
+ + ov::Tensor prev_sample(latents.get_element_type(), latents.get_shape()); + float* prev_sample_data = prev_sample.data<float>(); + + if (inference_step != m_num_inference_steps - 1) { + for (std::size_t i = 0; i < batch_size * latent_size; ++i) { + float gen_noise = m_normal(m_gen); + prev_sample_data[i] = alpha_prod_t_prev_sqrt * denoised_data[i] + beta_prod_t_prev_sqrt * gen_noise; + } + } else { + std::copy_n(denoised_data, denoised.get_size(), prev_sample_data); + } + + return { + {"latent", prev_sample}, + {"denoised", denoised} + }; +} + +std::vector<int64_t> LCMScheduler::get_timesteps() const { + return m_timesteps; +} + +float LCMScheduler::get_init_noise_sigma() const { + return 1.0f; +} + +void LCMScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) { + return; +} + +// Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample +std::vector<float> LCMScheduler::threshold_sample(const std::vector<float>& flat_sample) { + /* + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." + https://arxiv.org/abs/2205.11487 + */ + + std::vector<float> thresholded_sample = flat_sample; + // Calculate abs + std::vector<float> abs_sample(flat_sample.size()); + std::transform(flat_sample.begin(), flat_sample.end(), abs_sample.begin(), [](float val) { return std::abs(val); }); + + // Calculate s, the quantile threshold + std::sort(abs_sample.begin(), abs_sample.end()); + const int s_index = std::min(static_cast<int>(std::round(m_config.dynamic_thresholding_ratio * flat_sample.size())), + static_cast<int>(flat_sample.size()) - 1); + float s = abs_sample[s_index]; + s = std::clamp(s, 1.0f, m_config.sample_max_value); + + // Threshold and normalize the sample + for (float& value : thresholded_sample) { + value = std::clamp(value, -s, s) / s; + } + + return thresholded_sample; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/lcm.hpp b/src/cpp/src/text2image/schedulers/lcm.hpp new file mode 100644 index 0000000000..8abbcd3e29 --- /dev/null +++ b/src/cpp/src/text2image/schedulers/lcm.hpp @@ -0,0 +1,72 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <string> +#include <random> +#include <vector> + +#include "text2image/schedulers/types.hpp" +#include "text2image/schedulers/ischeduler.hpp" + +namespace ov { +namespace genai { + +class LCMScheduler : public IScheduler { +public: + // values from https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_lcm.py#L190 + struct Config { + size_t num_train_timesteps = 1000; + float beta_start = 0.00085f, beta_end = 0.012f; + BetaSchedule beta_schedule = BetaSchedule::SCALED_LINEAR; + std::vector<float> trained_betas = {}; + size_t original_inference_steps = 50; + bool clip_sample = false; + float clip_sample_range = 1.0f; + bool set_alpha_to_one = true; + size_t steps_offset = 0; + PredictionType prediction_type = PredictionType::EPSILON; + bool thresholding = false; + float dynamic_thresholding_ratio = 0.995f; +
float sample_max_value = 1.0f; + TimestepSpacing timestep_spacing = TimestepSpacing::LEADING; + float timestep_scaling = 10.0f; + bool rescale_betas_zero_snr = false; + + Config() = default; + explicit Config(const std::string scheduler_config_path); + }; + + explicit LCMScheduler(const std::string scheduler_config_path); + explicit LCMScheduler(const Config& scheduler_config); + + void set_timesteps(size_t num_inference_steps) override; + + std::vector<std::int64_t> get_timesteps() const override; + + float get_init_noise_sigma() const override; + + void scale_model_input(ov::Tensor sample, size_t inference_step) override; + + std::map<std::string, ov::Tensor> step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) override; + +private: + Config m_config; + + std::vector<float> m_alphas_cumprod; + float m_final_alpha_cumprod; + size_t m_num_inference_steps; + float m_sigma_data; + + std::vector<int64_t> m_timesteps; + + uint32_t m_seed; + std::mt19937 m_gen; + std::normal_distribution<float> m_normal; + + std::vector<float> threshold_sample(const std::vector<float>& flat_sample); +}; + +} // namespace genai +} // namespace ov diff --git a/image_generation/common/diffusers/src/scheduler_lms_discrete.cpp b/src/cpp/src/text2image/schedulers/lms_discrete.cpp similarity index 56% rename from image_generation/common/diffusers/src/scheduler_lms_discrete.cpp rename to src/cpp/src/text2image/schedulers/lms_discrete.cpp index ca21913797..3032822a73 100644 --- a/image_generation/common/diffusers/src/scheduler_lms_discrete.cpp +++ b/src/cpp/src/text2image/schedulers/lms_discrete.cpp @@ -1,9 +1,12 @@ -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "scheduler_lms_discrete.hpp" +#include "text2image/schedulers/lms_discrete.hpp" #include <cmath> +#include <fstream> + +#include "utils.hpp" namespace { @@ -46,8 +49,8 @@ Real trapezoidal(F f, Real a, Real b, Real tol = 1e-6, int max_refinements = 100 return I0; } -float lms_derivative_function(float tau, int32_t order, int32_t curr_order, const std::vector<float>& sigma_vec, int32_t t) { - float prod = 1.0; +float lms_derivative(float tau, int32_t order, int32_t curr_order, const std::vector<float>& sigma_vec, int32_t t) { + float prod = 1.0f; for (int32_t k = 0; k < order; k++) { if (curr_order == k) { @@ -58,7 +61,11 @@ float lms_derivative_function(float tau, int32_t order, int32_t curr_order, cons return prod; } -} +} // namespace + + +namespace ov { +namespace genai { int64_t LMSDiscreteScheduler::_sigma_to_t(float sigma) const { double log_sigma = std::log(sigma); @@ -86,50 +93,67 @@ int64_t LMSDiscreteScheduler::_sigma_to_t(float sigma) const { return timestep; } -LMSDiscreteScheduler::LMSDiscreteScheduler(int32_t num_train_timesteps, - float beta_start, - float beta_end, - BetaSchedule beta_schedule, - PredictionType prediction_type, - const std::vector<float>& trained_betas) { - std::vector<float> alphas, betas; - - if (!trained_betas.empty()) { - betas = trained_betas; - } else if (beta_schedule == BetaSchedule::LINEAR) { - for (int32_t i = 0; i < num_train_timesteps; i++) { - betas.push_back(beta_start + (beta_end - beta_start) * i / (num_train_timesteps - 1)); - } - } else if (beta_schedule == BetaSchedule::SCALED_LINEAR) { - float start = std::sqrt(beta_start); - float end = std::sqrt(beta_end); - std::vector<float> temp = linspace(start, end, num_train_timesteps); - for (float b : temp) { - betas.push_back(b * b); - } 
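+// Reads LMSDiscreteScheduler settings from a Diffusers-style scheduler_config.json (same field names as the
+// diffusers scheduler config).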
+LMSDiscreteScheduler::Config::Config(const std::string& scheduler_config_path) { + std::ifstream file(scheduler_config_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + read_json_param(data, "beta_start", beta_start); + read_json_param(data, "beta_end", beta_end); + read_json_param(data, "trained_betas", trained_betas); + read_json_param(data, "beta_schedule", beta_schedule); + read_json_param(data, "prediction_type", prediction_type); + read_json_param(data, "num_train_timesteps", num_train_timesteps); + read_json_param(data, "steps_offset", steps_offset); +} + +LMSDiscreteScheduler::LMSDiscreteScheduler(const std::string scheduler_config_path) + : LMSDiscreteScheduler(Config(scheduler_config_path)) { +} + +LMSDiscreteScheduler::LMSDiscreteScheduler(const Config& scheduler_config) + : m_config(scheduler_config) { + if (!m_config.trained_betas.empty()) { + m_betas = m_config.trained_betas; + } else if (m_config.beta_schedule == BetaSchedule::LINEAR) { + m_betas = linspace<float>(m_config.beta_start, m_config.beta_end, m_config.num_train_timesteps); + } else if (m_config.beta_schedule == BetaSchedule::SCALED_LINEAR) { + float start = std::sqrt(m_config.beta_start); + float end = std::sqrt(m_config.beta_end); + m_betas = linspace<float>(start, end, m_config.num_train_timesteps); + std::for_each(m_betas.begin(), m_betas.end(), [] (float & x) { x *= x; }); } else { - OPENVINO_THROW("'beta_schedule' must be one of 'EPSILON' or 'SCALED_LINEAR'"); + OPENVINO_THROW("'beta_schedule' must be one of 'EPSILON' or 'SCALED_LINEAR'. Please, add support of other types"); } - for (float b : betas) { - alphas.push_back(1.0f - b); - } + // generates alphas + std::transform(m_betas.begin(), m_betas.end(), std::back_inserter(m_alphas), [] (float b) { return 1.0f - b; }); std::vector<float> log_sigma_vec; - for (size_t i = 1; i <= alphas.size(); i++) { + for (size_t i = 1; i <= m_alphas.size(); i++) { float alphas_cumprod = - std::accumulate(alphas.begin(), alphas.begin() + i, 1.0f, std::multiplies<float>{}); + std::accumulate(m_alphas.begin(), m_alphas.begin() + i, 1.0f, std::multiplies<float>{}); float sigma = std::sqrt((1 - alphas_cumprod) / alphas_cumprod); m_log_sigmas.push_back(std::log(sigma)); } } float LMSDiscreteScheduler::get_init_noise_sigma() const { - return m_sigmas[0]; + float max_sigma = *std::max_element(m_sigmas.begin(), m_sigmas.end()); + + if (m_config.timestep_spacing == TimestepSpacing::LINSPACE || + m_config.timestep_spacing == TimestepSpacing::TRAILING) { + return max_sigma; + } + + return std::sqrt(max_sigma * max_sigma + 1); } void LMSDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference_step) { const double scale = 1.0 / std::sqrt((m_sigmas[inference_step] * m_sigmas[inference_step] + 1)); + float* sample_data = sample.data<float>(); for (size_t i = 0; i < sample.get_size(); i++) { sample_data[i] *= scale; @@ -137,6 +161,9 @@ void LMSDiscreteScheduler::scale_model_input(ov::Tensor sample, size_t inference } void LMSDiscreteScheduler::set_timesteps(size_t num_inference_steps) { + m_timesteps.clear(); + m_derivative_list.clear(); + float delta = -999.0f / (num_inference_steps - 1); // transform interpolation to time range for (size_t i = 0; i < num_inference_steps; i++) { @@ -147,8 +174,9 @@ void LMSDiscreteScheduler::set_timesteps(size_t num_inference_steps) { float sigma = std::exp((1 - w) * m_log_sigmas[low_idx] + w * 
m_log_sigmas[high_idx]); m_sigmas.push_back(sigma); } - m_sigmas.push_back(0.f); + m_sigmas.push_back(0.f); + // initialize timesteps for (size_t i = 0; i < num_inference_steps; ++i) { int64_t timestep = _sigma_to_t(m_sigmas[i]); @@ -160,16 +188,33 @@ std::vector<int64_t> LMSDiscreteScheduler::get_timesteps() const { return m_timesteps; } -ov::Tensor LMSDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) { +std::map<std::string, ov::Tensor> LMSDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) { + const float sigma = m_sigmas[inference_step]; + // LMS step function: std::vector<float> derivative; derivative.reserve(latents.get_size()); for (size_t j = 0; j < latents.get_size(); j++) { - // 1. compute predicted original sample (x_0) from sigma-scaled predicted noise default "epsilon" - float pred_latent = latents.data<float>()[j] - m_sigmas[inference_step] * noise_pred.data<float>()[j]; + // 1. compute predicted original sample (x_0) from sigma-scaled predicted noise + float pred_latent = 0; + switch (m_config.prediction_type) { + case PredictionType::EPSILON: + pred_latent = latents.data<float>()[j] - sigma * noise_pred.data<float>()[j]; + break; + case PredictionType::SAMPLE: + pred_latent = noise_pred.data<float>()[j]; + break; + case PredictionType::V_PREDICTION: + // pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) + pred_latent = noise_pred.data<float>()[j] * (-sigma / std::sqrt(sigma * sigma + 1.0f) + + latents.data<float>()[j] / (sigma * sigma + 1.0f)); + break; + default: + OPENVINO_THROW("Unsupported value for 'PredictionType'"); + } // 2. Convert to an ODE derivative - derivative.push_back((latents.data<float>()[j] - pred_latent) / m_sigmas[inference_step]); + derivative.push_back((latents.data<float>()[j] - pred_latent) / sigma); } m_derivative_list.push_back(derivative); @@ -185,15 +230,16 @@ ov::Tensor LMSDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, std::vector<float> lms_coeffs(order); for (size_t curr_order = 0; curr_order < order; curr_order++) { auto lms_derivative_functor = [order, curr_order, sigmas = this->m_sigmas, inference_step] (float tau) { - return lms_derivative_function(tau, order, curr_order, sigmas, inference_step); + return lms_derivative(tau, order, curr_order, sigmas, inference_step); }; - lms_coeffs[curr_order] = trapezoidal(lms_derivative_functor, static_cast<double>(m_sigmas[inference_step]), static_cast<double>(m_sigmas[inference_step + 1]), 1e-4); + // integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] + lms_coeffs[curr_order] = trapezoidal(lms_derivative_functor, static_cast<double>(sigma), static_cast<double>(m_sigmas[inference_step + 1]), 1e-4); } // 4. 
Compute previous sample based on the derivatives path // prev_sample = sample + sum(coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives))) ov::Tensor prev_sample(latents.get_element_type(), latents.get_shape()); - float * prev_sample_data = prev_sample.data<float>(); + float* prev_sample_data = prev_sample.data<float>(); const float* latents_data = latents.data<const float>(); for (size_t i = 0; i < prev_sample.get_size(); ++i) { float derivative_sum = 0.0f; @@ -204,5 +250,10 @@ ov::Tensor LMSDiscreteScheduler::step(ov::Tensor noise_pred, ov::Tensor latents, prev_sample_data[i] = latents_data[i] + derivative_sum; } - return prev_sample; + std::map<std::string, ov::Tensor> result{{"latent", prev_sample}}; + + return result; } + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/lms_discrete.hpp b/src/cpp/src/text2image/schedulers/lms_discrete.hpp new file mode 100644 index 0000000000..a8eacc4759 --- /dev/null +++ b/src/cpp/src/text2image/schedulers/lms_discrete.hpp @@ -0,0 +1,55 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <list> +#include <string> + +#include "text2image/schedulers/types.hpp" +#include "text2image/schedulers/ischeduler.hpp" + +namespace ov { +namespace genai { + +class LMSDiscreteScheduler : public IScheduler { +public: + struct Config { + int32_t num_train_timesteps = 1000; + float beta_start = 0.00085f, beta_end = 0.012f; + BetaSchedule beta_schedule = BetaSchedule::SCALED_LINEAR; + PredictionType prediction_type = PredictionType::EPSILON; + std::vector<float> trained_betas = {}; + TimestepSpacing timestep_spacing = TimestepSpacing::LINSPACE; + size_t steps_offset = 0; + + Config() = default; + explicit Config(const std::string& scheduler_config_path); + }; + + explicit LMSDiscreteScheduler(const std::string scheduler_config_path); + explicit LMSDiscreteScheduler(const Config& scheduler_config); + + void set_timesteps(size_t num_inference_steps) override; + + std::vector<std::int64_t> get_timesteps() const override; + + float get_init_noise_sigma() const override; + + void scale_model_input(ov::Tensor sample, size_t inference_step) override; + + std::map<std::string, ov::Tensor> step(ov::Tensor noise_pred, ov::Tensor latents, size_t inference_step) override; + +private: + Config m_config; + + std::vector<float> m_alphas, m_betas, m_alphas_cumprod; + std::vector<float> m_sigmas, m_log_sigmas; + std::vector<int64_t> m_timesteps; + std::list<std::vector<float>> m_derivative_list; + + int64_t _sigma_to_t(float sigma) const; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/scheduler.cpp b/src/cpp/src/text2image/schedulers/scheduler.cpp new file mode 100644 index 0000000000..44b08d67fc --- /dev/null +++ b/src/cpp/src/text2image/schedulers/scheduler.cpp @@ -0,0 +1,51 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/text2image/pipeline.hpp" + +#include <fstream> + +#include "utils.hpp" + +#include "text2image/schedulers/lcm.hpp" +#include "text2image/schedulers/lms_discrete.hpp" +#include "text2image/schedulers/ddim.hpp" +#include "text2image/schedulers/euler_discrete.hpp" + +namespace ov { +namespace genai { + +std::shared_ptr<Text2ImagePipeline::Scheduler> Text2ImagePipeline::Scheduler::from_config(const std::string& scheduler_config_path, Type scheduler_type) { + std::ifstream file(scheduler_config_path); + 
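+    // Note: with Scheduler::AUTO the concrete scheduler type is deduced below from the `_class_name` field of
+    // scheduler_config.json (e.g. "DDIMScheduler", "EulerDiscreteScheduler", "LCMScheduler", "LMSDiscreteScheduler").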
OPENVINO_ASSERT(file.is_open(), "Failed to open ", scheduler_config_path); + + if (scheduler_type == Scheduler::AUTO) { + nlohmann::json data = nlohmann::json::parse(file); + auto it = data.find("_class_name"); + OPENVINO_ASSERT(it != data.end(), "Failed to find '_class_name' field in ", scheduler_config_path); + + ov::genai::utils::read_json_param(data, "_class_name", scheduler_type); + OPENVINO_ASSERT(scheduler_type != Scheduler::AUTO, "Failed to guess scheduler based on its config ", scheduler_config_path); + } + + std::shared_ptr<Scheduler> scheduler = nullptr; + if (scheduler_type == Scheduler::Type::LCM) { + // TODO: do we need to pass RNG generator somehow to LCM? + scheduler = std::make_shared<LCMScheduler>(scheduler_config_path); + } else if (scheduler_type == Scheduler::Type::LMS_DISCRETE) { + scheduler = std::make_shared<LMSDiscreteScheduler>(scheduler_config_path); + } else if (scheduler_type == Scheduler::Type::DDIM) { + scheduler = std::make_shared<DDIMScheduler>(scheduler_config_path); + } else if (scheduler_type == Scheduler::Type::EULER_DISCRETE) { + scheduler = std::make_shared<EulerDiscreteScheduler>(scheduler_config_path); + } else { + OPENVINO_THROW("Unsupported scheduler type '", scheduler_type, ". Please, manually create scheduler via supported one"); + } + + return scheduler; +} + +Text2ImagePipeline::Scheduler::~Scheduler() = default; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text2image/schedulers/types.cpp b/src/cpp/src/text2image/schedulers/types.cpp new file mode 100644 index 0000000000..0ca970f359 --- /dev/null +++ b/src/cpp/src/text2image/schedulers/types.cpp @@ -0,0 +1,137 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "text2image/schedulers/types.hpp" + +namespace ov { +namespace genai { +namespace utils { + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, BetaSchedule& param) { + if (data.contains(name) && data[name].is_string()) { + std::string beta_schedule_str = data[name].get<std::string>(); + if (beta_schedule_str == "linear") + param = BetaSchedule::LINEAR; + else if (beta_schedule_str == "scaled_linear") + param = BetaSchedule::SCALED_LINEAR; + else if (beta_schedule_str == "squaredcos_cap_v2") + param = BetaSchedule::SQUAREDCOS_CAP_V2; + else if (!beta_schedule_str.empty()) { + OPENVINO_THROW("Unsupported value for 'beta_schedule' ", beta_schedule_str); + } + } +} + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, PredictionType& param) { + if (data.contains(name) && data[name].is_string()) { + std::string prediction_type_str = data[name].get<std::string>(); + if (prediction_type_str == "epsilon") + param = PredictionType::EPSILON; + else if (prediction_type_str == "sample") + param = PredictionType::SAMPLE; + else if (prediction_type_str == "v_prediction") + param = PredictionType::V_PREDICTION; + else if (!prediction_type_str.empty()) { + OPENVINO_THROW("Unsupported value for 'prediction_type' ", prediction_type_str); + } + } +} + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, Text2ImagePipeline::Scheduler::Type& param) { + if (data.contains(name) && data[name].is_string()) { + std::string scheduler_type_str = data[name].get<std::string>(); + if (scheduler_type_str == "LCMScheduler") + param = Text2ImagePipeline::Scheduler::LCM; + else if (scheduler_type_str == "DDIMScheduler") + param = Text2ImagePipeline::Scheduler::DDIM; + else if 
(scheduler_type_str == "LMSDiscreteScheduler") + param = Text2ImagePipeline::Scheduler::LMS_DISCRETE; + else if (scheduler_type_str == "EulerDiscreteScheduler") + param = Text2ImagePipeline::Scheduler::EULER_DISCRETE; + else if (!scheduler_type_str.empty()) { + OPENVINO_THROW("Unsupported value for 'prediction_type' ", scheduler_type_str); + } + } +} + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, TimestepSpacing& param) { + if (data.contains(name) && data[name].is_string()) { + std::string timestep_spacing_str = data[name].get<std::string>(); + if (timestep_spacing_str == "linspace") + param = TimestepSpacing::LINSPACE; + else if (timestep_spacing_str == "trailing") + param = TimestepSpacing::TRAILING; + else if (timestep_spacing_str == "leading") + param = TimestepSpacing::LEADING; + else if (!timestep_spacing_str.empty()) { + OPENVINO_THROW("Unsupported value for 'timestep_spacing' ", timestep_spacing_str); + } + } +} + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, InterpolationType& param) { + if (data.contains(name) && data[name].is_string()) { + std::string interpolation_type = data[name].get<std::string>(); + if (interpolation_type == "linear") + param = InterpolationType::LINEAR; + else if (interpolation_type == "log_linear") + param = InterpolationType::LOG_LINEAR; + else if (!interpolation_type.empty()) { + OPENVINO_THROW("Unsupported value for 'interpolation_type' ", interpolation_type); + } + } +} + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, FinalSigmaType& param) { + if (data.contains(name) && data[name].is_string()) { + std::string final_sigma_type = data[name].get<std::string>(); + if (final_sigma_type == "zero") + param = FinalSigmaType::ZERO; + else if (final_sigma_type == "sigma_min") + param = FinalSigmaType::SIGMA_MIN; + else if (!final_sigma_type.empty()) { + OPENVINO_THROW("Unsupported value for 'final_sigma_type' ", final_sigma_type); + } + } +} + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, TimestepType& param) { + if (data.contains(name) && data[name].is_string()) { + std::string timestep_type = data[name].get<std::string>(); + if (timestep_type == "discrete") + param = TimestepType::DISCRETE; + else if (timestep_type == "continuous") + param = TimestepType::CONTINUOUS; + else if (!timestep_type.empty()) { + OPENVINO_THROW("Unsupported value for 'timestep_type' ", timestep_type); + } + } +} + +} // namespace utils +} // namespace genai +} // namespace ov + +std::ostream& operator<<(std::ostream& os, const ov::genai::Text2ImagePipeline::Scheduler::Type& scheduler_type) { + switch (scheduler_type) { + case ov::genai::Text2ImagePipeline::Scheduler::Type::LCM: + return os << "LCMScheduler"; + case ov::genai::Text2ImagePipeline::Scheduler::Type::LMS_DISCRETE: + return os << "LMSDiscreteScheduler"; + case ov::genai::Text2ImagePipeline::Scheduler::Type::DDIM: + return os << "DDIMScheduler"; + case ov::genai::Text2ImagePipeline::Scheduler::Type::EULER_DISCRETE: + return os << "EulerDiscreteScheduler"; + case ov::genai::Text2ImagePipeline::Scheduler::Type::AUTO: + return os << "AutoScheduler"; + default: + OPENVINO_THROW("Unsupported scheduler type value"); + } +} diff --git a/src/cpp/src/text2image/schedulers/types.hpp b/src/cpp/src/text2image/schedulers/types.hpp new file mode 100644 index 0000000000..74fde4f993 --- /dev/null +++ b/src/cpp/src/text2image/schedulers/types.hpp @@ -0,0 +1,75 @@ +// 
Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <ostream> + +#include "openvino/genai/text2image/pipeline.hpp" + +#include "utils.hpp" + +namespace ov { +namespace genai { + +enum class BetaSchedule { + LINEAR, + SCALED_LINEAR, + SQUAREDCOS_CAP_V2 +}; + +enum class PredictionType { + EPSILON, + SAMPLE, + V_PREDICTION +}; + +enum class TimestepSpacing { + LINSPACE, + TRAILING, + LEADING +}; + +enum class InterpolationType { + LINEAR, + LOG_LINEAR +}; + +enum class FinalSigmaType { + ZERO, + SIGMA_MIN +}; + +enum class TimestepType { + DISCRETE, + CONTINUOUS +}; + +namespace utils { + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, BetaSchedule& param); + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, PredictionType& param); + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, Text2ImagePipeline::Scheduler::Type& param); + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, TimestepSpacing& param); + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, InterpolationType& param); + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, FinalSigmaType& param); + +template <> +void read_json_param(const nlohmann::json& data, const std::string& name, TimestepType& param); + +} // namespace utils +} // namespace genai +} // namespace ov + +std::ostream& operator<<(std::ostream& os, const ov::genai::Text2ImagePipeline::Scheduler::Type& scheduler_type); diff --git a/src/cpp/src/text2image/stable_diffusion_pipeline.hpp b/src/cpp/src/text2image/stable_diffusion_pipeline.hpp new file mode 100644 index 0000000000..f2543474ec --- /dev/null +++ b/src/cpp/src/text2image/stable_diffusion_pipeline.hpp @@ -0,0 +1,306 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "text2image/diffusion_pipeline.hpp" + +#include <ctime> +#include <cassert> + +#include "utils.hpp" +#include "lora_helper.hpp" + +namespace ov { +namespace genai { + +namespace { + +ov::Tensor get_guidance_scale_embedding(float guidance_scale, uint32_t embedding_dim) { + float w = guidance_scale * 1000; + uint32_t half_dim = embedding_dim / 2; + float emb = std::log(10000) / (half_dim - 1); + + ov::Shape embedding_shape = {1, embedding_dim}; + ov::Tensor w_embedding(ov::element::f32, embedding_shape); + float* w_embedding_data = w_embedding.data<float>(); + + for (size_t i = 0; i < half_dim; ++i) { + float temp = std::exp((i * (-emb))) * w; + w_embedding_data[i] = std::sin(temp); + w_embedding_data[i + half_dim] = std::cos(temp); + } + + if (embedding_dim % 2 == 1) + w_embedding_data[embedding_dim - 1] = 0; + + return w_embedding; +} + +} // namespace + +class Text2ImagePipeline::StableDiffusionPipeline : public Text2ImagePipeline::DiffusionPipeline { +public: + explicit StableDiffusionPipeline(const std::string& root_dir) { + const std::string model_index_path = root_dir + "/model_index.json"; + std::ifstream file(model_index_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + set_scheduler(Scheduler::from_config(root_dir + "/scheduler/scheduler_config.json")); + + const std::string text_encoder = data["text_encoder"][1].get<std::string>(); + if (text_encoder == "CLIPTextModel") { + 
m_clip_text_encoder = std::make_shared<CLIPTextModel>(root_dir + "/text_encoder"); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string unet = data["unet"][1].get<std::string>(); + if (unet == "UNet2DConditionModel") { + m_unet = std::make_shared<UNet2DConditionModel>(root_dir + "/unet"); + } else { + OPENVINO_THROW("Unsupported '", unet, "' UNet type"); + } + + const std::string vae = data["vae"][1].get<std::string>(); + if (vae == "AutoencoderKL") { + m_vae_decoder = std::make_shared<AutoencoderKL>(root_dir + "/vae_decoder"); + } else { + OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); + } + + // initialize generation config + initialize_generation_config(data["_class_name"].get<std::string>()); + } + + StableDiffusionPipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) { + const std::string model_index_path = root_dir + "/model_index.json"; + std::ifstream file(model_index_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + set_scheduler(Scheduler::from_config(root_dir + "/scheduler/scheduler_config.json")); + + const std::string text_encoder = data["text_encoder"][1].get<std::string>(); + if (text_encoder == "CLIPTextModel") { + m_clip_text_encoder = std::make_shared<CLIPTextModel>(root_dir + "/text_encoder", device, properties); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string unet = data["unet"][1].get<std::string>(); + if (unet == "UNet2DConditionModel") { + m_unet = std::make_shared<UNet2DConditionModel>(root_dir + "/unet", device, properties); + } else { + OPENVINO_THROW("Unsupported '", unet, "' UNet type"); + } + + const std::string vae = data["vae"][1].get<std::string>(); + if (vae == "AutoencoderKL") { + m_vae_decoder = std::make_shared<AutoencoderKL>(root_dir + "/vae_decoder", device, properties); + } else { + OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); + } + + // initialize generation config + initialize_generation_config(data["_class_name"].get<std::string>()); + + update_adapters_from_properties(properties, m_generation_config.adapters); + } + + StableDiffusionPipeline( + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder) + : m_clip_text_encoder(std::make_shared<CLIPTextModel>(clip_text_model)), + m_unet(std::make_shared<UNet2DConditionModel>(unet)), + m_vae_decoder(std::make_shared<AutoencoderKL>(vae_decoder)) { } + + void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) override { + check_image_size(height, width); + + const size_t batch_size_multiplier = do_classifier_free_guidance(guidance_scale) ? 
2 : 1; // Unet accepts 2x batch in case of CFG + m_clip_text_encoder->reshape(batch_size_multiplier); + m_unet->reshape(num_images_per_prompt * batch_size_multiplier, height, width, m_clip_text_encoder->get_config().max_position_embeddings); + m_vae_decoder->reshape(num_images_per_prompt, height, width); + } + + void compile(const std::string& device, const ov::AnyMap& properties) override { + m_clip_text_encoder->compile(device, properties); + m_unet->compile(device, properties); + m_vae_decoder->compile(device, properties); + } + + ov::Tensor generate(const std::string& positive_prompt, + const ov::AnyMap& properties) override { + GenerationConfig generation_config = m_generation_config; + generation_config.update_generation_config(properties); + + // Stable Diffusion pipeline + // see https://huggingface.co/docs/diffusers/using-diffusers/write_own_pipeline#deconstruct-the-stable-diffusion-pipeline + + const auto& unet_config = m_unet->get_config(); + const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1; // Unet accepts 2x batch in case of CFG + const size_t vae_scale_factor = m_unet->get_vae_scale_factor(); + + if (generation_config.height < 0) + generation_config.height = unet_config.sample_size * vae_scale_factor; + if (generation_config.width < 0) + generation_config.width = unet_config.sample_size * vae_scale_factor; + check_inputs(generation_config); + + m_clip_text_encoder->set_adapters(generation_config.adapters); + m_unet->set_adapters(generation_config.adapters); + + if (generation_config.random_generator == nullptr) { + uint32_t seed = time(NULL); + generation_config.random_generator = std::make_shared<CppStdGenerator>(seed); + } + + ov::Tensor encoder_hidden_states = m_clip_text_encoder->infer(positive_prompt, generation_config.negative_prompt, + batch_size_multiplier > 1); + + // replicate encoder hidden state to UNet model + if (generation_config.num_images_per_prompt == 1) { + // reuse output of text encoder directly w/o extra memory copy + m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states); + } else { + ov::Shape enc_shape = encoder_hidden_states.get_shape(); + enc_shape[0] *= generation_config.num_images_per_prompt; + + ov::Tensor encoder_hidden_states_repeated(encoder_hidden_states.get_element_type(), enc_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, 0, n); + if (batch_size_multiplier > 1) { + batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, + 1, generation_config.num_images_per_prompt + n); + } + } + + m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states_repeated); + } + + if (unet_config.time_cond_proj_dim >= 0) { // LCM + ov::Tensor guidance_scale_embedding = get_guidance_scale_embedding(generation_config.guidance_scale, unet_config.time_cond_proj_dim); + m_unet->set_hidden_states("timestep_cond", guidance_scale_embedding); + } + + m_scheduler->set_timesteps(generation_config.num_inference_steps); + std::vector<std::int64_t> timesteps = m_scheduler->get_timesteps(); + + // latents are multiplied by 'init_noise_sigma' + ov::Shape latent_shape{generation_config.num_images_per_prompt, unet_config.in_channels, + generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor}; + ov::Shape latent_shape_cfg = latent_shape; + latent_shape_cfg[0] *= batch_size_multiplier; + + ov::Tensor latent(ov::element::f32, latent_shape), 
latent_cfg(ov::element::f32, latent_shape_cfg);
+ std::generate_n(latent.data<float>(), latent.get_size(), [&]() -> float {
+ return generation_config.random_generator->next() * m_scheduler->get_init_noise_sigma();
+ });
+
+ ov::Tensor denoised, noisy_residual_tensor(ov::element::f32, {});
+ for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; inference_step++) {
+ // concat the same latent twice along a batch dimension in case of CFG
+ if (batch_size_multiplier > 1) {
+ batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt);
+ batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt);
+ } else {
+ // just assign to save memory copy
+ latent_cfg = latent;
+ }
+
+ m_scheduler->scale_model_input(latent_cfg, inference_step);
+
+ ov::Tensor timestep(ov::element::i64, {1}, &timesteps[inference_step]);
+ ov::Tensor noise_pred_tensor = m_unet->infer(latent_cfg, timestep);
+
+ ov::Shape noise_pred_shape = noise_pred_tensor.get_shape();
+ noise_pred_shape[0] /= batch_size_multiplier;
+ noisy_residual_tensor.set_shape(noise_pred_shape);
+
+ if (batch_size_multiplier > 1) {
+ // perform guidance
+ float* noisy_residual = noisy_residual_tensor.data<float>();
+ const float* noise_pred_uncond = noise_pred_tensor.data<const float>();
+ const float* noise_pred_text = noise_pred_uncond + noisy_residual_tensor.get_size();
+
+ for (size_t i = 0; i < noisy_residual_tensor.get_size(); ++i) {
+ noisy_residual[i] = noise_pred_uncond[i] +
+ generation_config.guidance_scale * (noise_pred_text[i] - noise_pred_uncond[i]);
+ }
+ } else {
+ noisy_residual_tensor = noise_pred_tensor;
+ }
+
+ auto scheduler_step_result = m_scheduler->step(noisy_residual_tensor, latent, inference_step);
+ latent = scheduler_step_result["latent"];
+
+ // check whether scheduler returns "denoised" image, which should be passed to VAE decoder
+ const auto it = scheduler_step_result.find("denoised");
+ denoised = it != scheduler_step_result.end() ? it->second : latent;
+ }
+
+ return m_vae_decoder->infer(denoised);
+ }
+
+private:
+ bool do_classifier_free_guidance(float guidance_scale) const {
+ return guidance_scale >= 1.0f && m_unet->get_config().time_cond_proj_dim < 0;
+ }
+
+ void initialize_generation_config(const std::string& class_name) override {
+ assert(m_unet != nullptr);
+ const auto& unet_config = m_unet->get_config();
+ const size_t vae_scale_factor = m_unet->get_vae_scale_factor();
+
+ m_generation_config.height = unet_config.sample_size * vae_scale_factor;
+ m_generation_config.width = unet_config.sample_size * vae_scale_factor;
+
+ if (class_name == "StableDiffusionPipeline") {
+ m_generation_config.guidance_scale = 7.5f;
+ m_generation_config.num_inference_steps = 50;
+ } else if (class_name == "LatentConsistencyModelPipeline") {
+ m_generation_config.guidance_scale = 7.5f;
+ m_generation_config.num_inference_steps = 50;
+ } else {
+ OPENVINO_THROW("Unsupported class_name '", class_name, "'. 
Please contact OpenVINO GenAI developers");
+ }
+ }
+
+ void check_image_size(const int height, const int width) const override {
+ assert(m_unet != nullptr);
+ const size_t vae_scale_factor = m_unet->get_vae_scale_factor();
+ OPENVINO_ASSERT((height % vae_scale_factor == 0 || height < 0) &&
+ (width % vae_scale_factor == 0 || width < 0), "Both 'width' and 'height' must be divisible by ",
+ vae_scale_factor);
+ }
+
+ void check_inputs(const GenerationConfig& generation_config) const override {
+ check_image_size(generation_config.height, generation_config.width);
+
+ const bool is_classifier_free_guidance = do_classifier_free_guidance(generation_config.guidance_scale);
+ const bool is_lcm = m_unet->get_config().time_cond_proj_dim > 0;
+ const char * const pipeline_name = is_lcm ? "Latent Consistency Model" : "Stable Diffusion";
+
+ OPENVINO_ASSERT(generation_config.prompt_2 == std::nullopt, "Prompt 2 is not used by ", pipeline_name);
+ OPENVINO_ASSERT(generation_config.prompt_3 == std::nullopt, "Prompt 3 is not used by ", pipeline_name);
+ if (is_lcm) {
+ OPENVINO_ASSERT(generation_config.negative_prompt.empty(), "Negative prompt is not used by ", pipeline_name);
+ } else if (!is_classifier_free_guidance) {
+ OPENVINO_ASSERT(generation_config.negative_prompt.empty(), "Negative prompt is not used when guidance scale < 1.0");
+ }
+ OPENVINO_ASSERT(generation_config.negative_prompt_2.empty(), "Negative prompt 2 is not used by ", pipeline_name);
+ OPENVINO_ASSERT(generation_config.negative_prompt_3.empty(), "Negative prompt 3 is not used by ", pipeline_name);
+ }
+
+ std::shared_ptr<CLIPTextModel> m_clip_text_encoder;
+ std::shared_ptr<UNet2DConditionModel> m_unet;
+ std::shared_ptr<AutoencoderKL> m_vae_decoder;
+};
+
+} // namespace genai
+} // namespace ov
diff --git a/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp b/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp
new file mode 100644
index 0000000000..15c82fc36a
--- /dev/null
+++ b/src/cpp/src/text2image/stable_diffusion_xl_pipeline.hpp
@@ -0,0 +1,357 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "text2image/diffusion_pipeline.hpp"
+
+#include <ctime>
+#include <cassert>
+
+#include "utils.hpp"
+
+namespace ov {
+namespace genai {
+
+class Text2ImagePipeline::StableDiffusionXLPipeline : public Text2ImagePipeline::DiffusionPipeline {
+public:
+ explicit StableDiffusionXLPipeline(const std::string& root_dir) {
+ const std::string model_index_path = root_dir + "/model_index.json";
+ std::ifstream file(model_index_path);
+ OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path);
+
+ nlohmann::json data = nlohmann::json::parse(file);
+ using utils::read_json_param;
+
+ set_scheduler(Scheduler::from_config(root_dir + "/scheduler/scheduler_config.json"));
+
+ const std::string text_encoder = data["text_encoder"][1].get<std::string>();
+ if (text_encoder == "CLIPTextModel") {
+ m_clip_text_encoder = std::make_shared<CLIPTextModel>(root_dir + "/text_encoder");
+ } else {
+ OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type");
+ }
+
+ const std::string text_encoder_2 = data["text_encoder_2"][1].get<std::string>();
+ if (text_encoder_2 == "CLIPTextModelWithProjection") {
+ m_clip_text_encoder_with_projection = std::make_shared<CLIPTextModelWithProjection>(root_dir + "/text_encoder_2");
+ } else {
+ OPENVINO_THROW("Unsupported '", text_encoder_2, "' text encoder type");
+ }
+
+ const std::string unet = data["unet"][1].get<std::string>();
+ if (unet == 
"UNet2DConditionModel") { + m_unet = std::make_shared<UNet2DConditionModel>(root_dir + "/unet"); + } else { + OPENVINO_THROW("Unsupported '", unet, "' UNet type"); + } + + const std::string vae = data["vae"][1].get<std::string>(); + if (vae == "AutoencoderKL") { + m_vae_decoder = std::make_shared<AutoencoderKL>(root_dir + "/vae_decoder"); + } else { + OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); + } + + // initialize generation config + initialize_generation_config(data["_class_name"].get<std::string>()); + } + + StableDiffusionXLPipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) { + const std::string model_index_path = root_dir + "/model_index.json"; + std::ifstream file(model_index_path); + OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); + + nlohmann::json data = nlohmann::json::parse(file); + using utils::read_json_param; + + set_scheduler(Scheduler::from_config(root_dir + "/scheduler/scheduler_config.json")); + + const std::string text_encoder = data["text_encoder"][1].get<std::string>(); + if (text_encoder == "CLIPTextModel") { + m_clip_text_encoder = std::make_shared<CLIPTextModel>(root_dir + "/text_encoder", device, properties); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string text_encoder_2 = data["text_encoder_2"][1].get<std::string>(); + if (text_encoder_2 == "CLIPTextModelWithProjection") { + m_clip_text_encoder_with_projection = std::make_shared<CLIPTextModelWithProjection>(root_dir + "/text_encoder_2", device, properties); + } else { + OPENVINO_THROW("Unsupported '", text_encoder, "' text encoder type"); + } + + const std::string unet = data["unet"][1].get<std::string>(); + if (unet == "UNet2DConditionModel") { + m_unet = std::make_shared<UNet2DConditionModel>(root_dir + "/unet", device, properties); + } else { + OPENVINO_THROW("Unsupported '", unet, "' UNet type"); + } + + const std::string vae = data["vae"][1].get<std::string>(); + if (vae == "AutoencoderKL") { + m_vae_decoder = std::make_shared<AutoencoderKL>(root_dir + "/vae_decoder", device, properties); + } else { + OPENVINO_THROW("Unsupported '", vae, "' VAE decoder type"); + } + + // initialize generation config + initialize_generation_config(data["_class_name"].get<std::string>()); + } + + StableDiffusionXLPipeline( + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder) + : m_clip_text_encoder(std::make_shared<CLIPTextModel>(clip_text_model)), + m_clip_text_encoder_with_projection(std::make_shared<CLIPTextModelWithProjection>(clip_text_model_with_projection)), + m_unet(std::make_shared<UNet2DConditionModel>(unet)), + m_vae_decoder(std::make_shared<AutoencoderKL>(vae_decoder)) { } + + void reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) override { + check_image_size(height, width); + + const size_t batch_size_multiplier = do_classifier_free_guidance(guidance_scale) ? 
2 : 1; // Unet accepts 2x batch in case of CFG + m_clip_text_encoder->reshape(batch_size_multiplier); + m_clip_text_encoder_with_projection->reshape(batch_size_multiplier); + m_unet->reshape(num_images_per_prompt * batch_size_multiplier, height, width, m_clip_text_encoder->get_config().max_position_embeddings); + m_vae_decoder->reshape(num_images_per_prompt, height, width); + } + + void compile(const std::string& device, const ov::AnyMap& properties) override { + m_clip_text_encoder->compile(device, properties); + m_clip_text_encoder_with_projection->compile(device, properties); + m_unet->compile(device, properties); + m_vae_decoder->compile(device, properties); + } + + ov::Tensor generate(const std::string& positive_prompt, + const ov::AnyMap& properties) override { + GenerationConfig generation_config = m_generation_config; + generation_config.update_generation_config(properties); + + // Stable Diffusion pipeline + // see https://huggingface.co/docs/diffusers/using-diffusers/write_own_pipeline#deconstruct-the-stable-diffusion-pipeline + + const auto& unet_config = m_unet->get_config(); + const size_t batch_size_multiplier = do_classifier_free_guidance(generation_config.guidance_scale) ? 2 : 1; // Unet accepts 2x batch in case of CFG + const size_t vae_scale_factor = m_unet->get_vae_scale_factor(); + + if (generation_config.height < 0) + generation_config.height = unet_config.sample_size * vae_scale_factor; + if (generation_config.width < 0) + generation_config.width = unet_config.sample_size * vae_scale_factor; + check_image_size(generation_config.height, generation_config.width); + + if (generation_config.random_generator == nullptr) { + uint32_t seed = time(NULL); + generation_config.random_generator = std::make_shared<CppStdGenerator>(seed); + } + + std::vector<float> time_ids = {static_cast<float>(generation_config.width), + static_cast<float>(generation_config.height), + 0, + 0, + static_cast<float>(generation_config.width), + static_cast<float>(generation_config.height), + }; + ov::Tensor add_time_ids(ov::element::f32, {batch_size_multiplier, time_ids.size()}); + float* add_time_ids_data = add_time_ids.data<float>(); + std::copy(time_ids.begin(), time_ids.end(), add_time_ids_data); + + if (batch_size_multiplier > 1) { + std::copy(time_ids.begin(), time_ids.end(), add_time_ids_data + time_ids.size()); + } + + ov::Tensor add_text_embeds = m_clip_text_encoder_with_projection->infer(positive_prompt, generation_config.negative_prompt, batch_size_multiplier > 1); + m_clip_text_encoder->infer(positive_prompt, generation_config.negative_prompt, batch_size_multiplier > 1); + + // prompt_embeds = prompt_embeds.hidden_states[-2] + size_t idx_hidden_state_1 = m_clip_text_encoder->get_config().num_hidden_layers; + ov::Tensor encoder_hidden_states_1 = m_clip_text_encoder->get_output_tensor(idx_hidden_state_1); + size_t idx_hidden_state_2 = m_clip_text_encoder_with_projection->get_config().num_hidden_layers; + ov::Tensor encoder_hidden_states_2 = m_clip_text_encoder_with_projection->get_output_tensor(idx_hidden_state_2); + + ov::Shape ehs_1_shape = encoder_hidden_states_1.get_shape(); + ov::Shape ehs_2_shape = encoder_hidden_states_2.get_shape(); + + OPENVINO_ASSERT(ehs_1_shape[0] == ehs_2_shape[0] && ehs_1_shape[1] == ehs_2_shape[1], + "Tensors for concatenation must have the same dimensions"); + + // concatenate hidden_states from two encoders + ov::Shape encoder_hidden_states_shape = {ehs_1_shape[0], ehs_1_shape[1], ehs_1_shape[2] + ehs_2_shape[2]}; + ov::Tensor 
encoder_hidden_states(encoder_hidden_states_1.get_element_type(), encoder_hidden_states_shape); + + const float* ehs_1_data = encoder_hidden_states_1.data<const float>(); + const float* ehs_2_data = encoder_hidden_states_2.data<const float>(); + float* encoder_hidden_states_data = encoder_hidden_states.data<float>(); + + for (size_t i = 0; i < ehs_1_shape[0]; ++i) { + for (size_t j = 0; j < ehs_1_shape[1]; ++j) { + size_t offset_1 = (i * ehs_1_shape[1] + j) * ehs_1_shape[2]; + size_t offset_2 = (i * ehs_2_shape[1] + j) * ehs_2_shape[2]; + + size_t step = (i * ehs_1_shape[1] + j) * (ehs_1_shape[2] + ehs_2_shape[2]); + + std::memcpy(encoder_hidden_states_data + step, ehs_1_data + offset_1, ehs_1_shape[2] * sizeof(float)); + std::memcpy(encoder_hidden_states_data + step + ehs_1_shape[2], ehs_2_data + offset_2, ehs_2_shape[2] * sizeof(float)); + } + } + + // replicate encoder hidden state to UNet model + if (generation_config.num_images_per_prompt == 1) { + // reuse output of text encoder directly w/o extra memory copy + m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states); + m_unet->set_hidden_states("text_embeds", add_text_embeds); + m_unet->set_hidden_states("time_ids", add_time_ids); + + } else { + ov::Shape enc_shape = encoder_hidden_states.get_shape(); + enc_shape[0] *= generation_config.num_images_per_prompt; + + ov::Tensor encoder_hidden_states_repeated(encoder_hidden_states.get_element_type(), enc_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, 0, n); + if (batch_size_multiplier > 1) { + batch_copy(encoder_hidden_states, encoder_hidden_states_repeated, + 1, generation_config.num_images_per_prompt + n); + } + } + + m_unet->set_hidden_states("encoder_hidden_states", encoder_hidden_states_repeated); + + ov::Shape t_emb_shape = add_text_embeds.get_shape(); + t_emb_shape[0] *= generation_config.num_images_per_prompt; + + ov::Tensor add_text_embeds_repeated(add_text_embeds.get_element_type(), t_emb_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + batch_copy(add_text_embeds, add_text_embeds_repeated, 0, n); + if (batch_size_multiplier > 1) { + batch_copy(add_text_embeds, add_text_embeds_repeated, + 1, generation_config.num_images_per_prompt + n); + } + } + + m_unet->set_hidden_states("text_embeds", add_text_embeds_repeated); + + ov::Shape t_ids_shape = add_time_ids.get_shape(); + t_ids_shape[0] *= generation_config.num_images_per_prompt; + ov::Tensor add_time_ids_repeated(add_time_ids.get_element_type(), t_ids_shape); + for (size_t n = 0; n < generation_config.num_images_per_prompt; ++n) { + batch_copy(add_time_ids, add_time_ids_repeated, 0, n); + if (batch_size_multiplier > 1) { + batch_copy(add_time_ids, add_time_ids_repeated, + 1, generation_config.num_images_per_prompt + n); + } + } + + m_unet->set_hidden_states("time_ids", add_time_ids_repeated); + } + + m_scheduler->set_timesteps(generation_config.num_inference_steps); + std::vector<std::int64_t> timesteps = m_scheduler->get_timesteps(); + + // latents are multiplied by 'init_noise_sigma' + ov::Shape latent_shape{generation_config.num_images_per_prompt, unet_config.in_channels, + generation_config.height / vae_scale_factor, generation_config.width / vae_scale_factor}; + ov::Shape latent_shape_cfg = latent_shape; + latent_shape_cfg[0] *= batch_size_multiplier; + + ov::Tensor latent(ov::element::f32, latent_shape), latent_cfg(ov::element::f32, latent_shape_cfg); + 
std::generate_n(latent.data<float>(), latent.get_size(), [&]() -> float {
+ return generation_config.random_generator->next() * m_scheduler->get_init_noise_sigma();
+ });
+
+ ov::Tensor denoised, noisy_residual_tensor(ov::element::f32, {});
+ for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; inference_step++) {
+ // concat the same latent twice along a batch dimension in case of CFG
+ if (batch_size_multiplier > 1) {
+ batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt);
+ batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt);
+ } else {
+ // just assign to save memory copy
+ latent_cfg = latent;
+ }
+
+ m_scheduler->scale_model_input(latent_cfg, inference_step);
+
+ ov::Tensor timestep(ov::element::i64, {1}, &timesteps[inference_step]);
+ ov::Tensor noise_pred_tensor = m_unet->infer(latent_cfg, timestep);
+
+ ov::Shape noise_pred_shape = noise_pred_tensor.get_shape();
+ noise_pred_shape[0] /= batch_size_multiplier;
+ noisy_residual_tensor.set_shape(noise_pred_shape);
+
+ if (batch_size_multiplier > 1) {
+ // perform guidance
+ float* noisy_residual = noisy_residual_tensor.data<float>();
+ const float* noise_pred_uncond = noise_pred_tensor.data<const float>();
+ const float* noise_pred_text = noise_pred_uncond + noisy_residual_tensor.get_size();
+
+ for (size_t i = 0; i < noisy_residual_tensor.get_size(); ++i) {
+ noisy_residual[i] = noise_pred_uncond[i] +
+ generation_config.guidance_scale * (noise_pred_text[i] - noise_pred_uncond[i]);
+ }
+ } else {
+ noisy_residual_tensor = noise_pred_tensor;
+ }
+
+ auto scheduler_step_result = m_scheduler->step(noisy_residual_tensor, latent, inference_step);
+ latent = scheduler_step_result["latent"];
+
+ // check whether scheduler returns "denoised" image, which should be passed to VAE decoder
+ const auto it = scheduler_step_result.find("denoised");
+ denoised = it != scheduler_step_result.end() ? it->second : latent;
+ }
+
+ return m_vae_decoder->infer(denoised);
+ }
+
+private:
+ bool do_classifier_free_guidance(float guidance_scale) const {
+ return guidance_scale >= 1.0f && m_unet->get_config().time_cond_proj_dim < 0;
+ }
+
+ void initialize_generation_config(const std::string& class_name) override {
+ assert(m_unet != nullptr);
+ const auto& unet_config = m_unet->get_config();
+ const size_t vae_scale_factor = m_unet->get_vae_scale_factor();
+
+ m_generation_config.height = unet_config.sample_size * vae_scale_factor;
+ m_generation_config.width = unet_config.sample_size * vae_scale_factor;
+
+ if (class_name == "StableDiffusionXLPipeline") {
+ m_generation_config.guidance_scale = 5.0f;
+ m_generation_config.num_inference_steps = 50;
+ } else {
+ OPENVINO_THROW("Unsupported class_name '", class_name, "'. 
Please contact OpenVINO GenAI developers");
+ }
+ }
+
+ void check_image_size(const int height, const int width) const override {
+ assert(m_unet != nullptr);
+ const size_t vae_scale_factor = m_unet->get_vae_scale_factor();
+ OPENVINO_ASSERT((height % vae_scale_factor == 0 || height < 0) &&
+ (width % vae_scale_factor == 0 || width < 0), "Both 'width' and 'height' must be divisible by ",
+ vae_scale_factor);
+ }
+
+ void check_inputs(const GenerationConfig& generation_config) const override {
+ check_image_size(generation_config.height, generation_config.width);
+
+ const bool is_classifier_free_guidance = do_classifier_free_guidance(generation_config.guidance_scale);
+ const char * const pipeline_name = "Stable Diffusion XL";
+
+ OPENVINO_ASSERT(generation_config.prompt_3 == std::nullopt, "Prompt 3 is not used by ", pipeline_name);
+ OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt.empty(), "Negative prompt is not used when guidance scale < 1.0");
+ OPENVINO_ASSERT(is_classifier_free_guidance || generation_config.negative_prompt_2.empty(), "Negative prompt 2 is not used when guidance scale < 1.0");
+ OPENVINO_ASSERT(generation_config.negative_prompt_3.empty(), "Negative prompt 3 is not used by ", pipeline_name);
+ }
+
+ std::shared_ptr<CLIPTextModel> m_clip_text_encoder;
+ std::shared_ptr<CLIPTextModelWithProjection> m_clip_text_encoder_with_projection;
+ std::shared_ptr<UNet2DConditionModel> m_unet;
+ std::shared_ptr<AutoencoderKL> m_vae_decoder;
+};
+
+} // namespace genai
+} // namespace ov
diff --git a/src/cpp/src/text2image/text2image_pipeline.cpp b/src/cpp/src/text2image/text2image_pipeline.cpp
new file mode 100644
index 0000000000..04422ef12f
--- /dev/null
+++ b/src/cpp/src/text2image/text2image_pipeline.cpp
@@ -0,0 +1,157 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "text2image/stable_diffusion_pipeline.hpp"
+#include "text2image/stable_diffusion_xl_pipeline.hpp"
+
+#include <ctime>
+#include <cstdlib>
+
+#include "utils.hpp"
+
+namespace ov {
+namespace genai {
+
+static constexpr char SD_GENERATION_CONFIG[] = "SD_GENERATION_CONFIG";
+
+Generator::~Generator() = default;
+
+CppStdGenerator::CppStdGenerator(uint32_t seed)
+ : gen(seed), normal(0.0f, 1.0f) {
+}
+
+float CppStdGenerator::next() {
+ return normal(gen);
+}
+
+//
+// GenerationConfig
+//
+
+std::pair<std::string, ov::Any> generation_config(const Text2ImagePipeline::GenerationConfig& generation_config) {
+ return {SD_GENERATION_CONFIG, ov::Any::make<Text2ImagePipeline::GenerationConfig>(generation_config)};
+}
+
+void Text2ImagePipeline::GenerationConfig::update_generation_config(const ov::AnyMap& properties) {
+ using utils::read_anymap_param;
+
+ // override whole generation config first
+ read_anymap_param(properties, SD_GENERATION_CONFIG, *this);
+
+ // then try per-parameter values
+ read_anymap_param(properties, "prompt_2", prompt_2);
+ read_anymap_param(properties, "prompt_3", prompt_3);
+ read_anymap_param(properties, "negative_prompt", negative_prompt);
+ read_anymap_param(properties, "negative_prompt_2", negative_prompt_2);
+ read_anymap_param(properties, "negative_prompt_3", negative_prompt_3);
+ read_anymap_param(properties, "num_images_per_prompt", num_images_per_prompt);
+ read_anymap_param(properties, "random_generator", random_generator);
+ read_anymap_param(properties, "guidance_scale", guidance_scale);
+ read_anymap_param(properties, "height", height);
+ read_anymap_param(properties, "width", width);
+ 
read_anymap_param(properties, "num_inference_steps", num_inference_steps); + read_anymap_param(properties, "adapters", adapters); + + validate(); +} + +void Text2ImagePipeline::GenerationConfig::validate() const { + OPENVINO_ASSERT(guidance_scale >= 1.0f || negative_prompt.empty(), "Guidance scale < 1.0 ignores negative prompt"); +} + +// +// Text2ImagePipeline +// + +Text2ImagePipeline::Text2ImagePipeline(const std::string& root_dir) { + const std::string class_name = get_class_name(root_dir); + + if (class_name == "StableDiffusionPipeline" || + class_name == "LatentConsistencyModelPipeline") { + m_impl = std::make_shared<StableDiffusionPipeline>(root_dir); + } else if (class_name == "StableDiffusionXLPipeline") { + m_impl = std::make_shared<StableDiffusionXLPipeline>(root_dir); + } else { + OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'"); + } +} + +Text2ImagePipeline::Text2ImagePipeline(const std::string& root_dir, const std::string& device, const ov::AnyMap& properties) { + const std::string class_name = get_class_name(root_dir); + + if (class_name == "StableDiffusionPipeline" || + class_name == "LatentConsistencyModelPipeline") { + m_impl = std::make_shared<StableDiffusionPipeline>(root_dir, device, properties); + } else if (class_name == "StableDiffusionXLPipeline") { + m_impl = std::make_shared<StableDiffusionXLPipeline>(root_dir, device, properties); + } else { + OPENVINO_THROW("Unsupported text to image generation pipeline '", class_name, "'"); + } +} + +Text2ImagePipeline::Text2ImagePipeline(const std::shared_ptr<DiffusionPipeline>& impl) + : m_impl(impl) { + assert(m_impl != nullptr); +} + +Text2ImagePipeline Text2ImagePipeline::stable_diffusion( + const std::shared_ptr<Scheduler>& scheduler, + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder) { + auto impl = std::make_shared<StableDiffusionPipeline>(clip_text_model, unet, vae_decoder); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return Text2ImagePipeline(impl); +} + +Text2ImagePipeline Text2ImagePipeline::latent_consistency_model( + const std::shared_ptr<Scheduler>& scheduler, + const CLIPTextModel& clip_text_model, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder) { + return stable_diffusion(scheduler, clip_text_model, unet, vae_decoder); +} + +Text2ImagePipeline Text2ImagePipeline::stable_diffusion_xl( + const std::shared_ptr<Scheduler>& scheduler, + const CLIPTextModel& clip_text_model, + const CLIPTextModelWithProjection& clip_text_model_with_projection, + const UNet2DConditionModel& unet, + const AutoencoderKL& vae_decoder) { + auto impl = std::make_shared<StableDiffusionXLPipeline>(clip_text_model, clip_text_model_with_projection, unet, vae_decoder); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return Text2ImagePipeline(impl); +} + +Text2ImagePipeline::GenerationConfig Text2ImagePipeline::get_generation_config() const { + return m_impl->get_generation_config(); +} + +void Text2ImagePipeline::set_generation_config(const GenerationConfig& generation_config) { + m_impl->set_generation_config(generation_config); +} + +void Text2ImagePipeline::set_scheduler(std::shared_ptr<Scheduler> scheduler) { + m_impl->set_scheduler(scheduler); +} + +void Text2ImagePipeline::reshape(const int num_images_per_prompt, const int height, const int width, const float guidance_scale) { + m_impl->reshape(num_images_per_prompt, height, width, guidance_scale); +} + +void 
Text2ImagePipeline::compile(const std::string& device, const ov::AnyMap& properties) { + m_impl->compile(device, properties); +} + +ov::Tensor Text2ImagePipeline::generate(const std::string& positive_prompt, const ov::AnyMap& properties) { + return m_impl->generate(positive_prompt, properties); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp new file mode 100644 index 0000000000..b2b5c9a463 --- /dev/null +++ b/src/cpp/src/text_callback_streamer.cpp @@ -0,0 +1,52 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "text_callback_streamer.hpp" + +namespace ov { +namespace genai { + +TextCallbackStreamer::TextCallbackStreamer(const Tokenizer& tokenizer, std::function<bool(std::string)> callback) { + m_tokenizer = tokenizer; + on_finalized_subword_callback = callback; +} + +bool TextCallbackStreamer::put(int64_t token) { + std::stringstream res; + m_tokens_cache.push_back(token); + std::string text = m_tokenizer.decode(m_tokens_cache); + if (!text.empty() && '\n' == text.back() && text.size() > print_len) { + // Flush the cache after the new line symbol + res << std::string_view{text.data() + print_len, text.size() - print_len}; + m_tokens_cache.clear(); + print_len = 0; + return on_finalized_subword_callback(res.str()); + } + + if (text.size() >= 3 && text.compare(text.size() - 3, 3, "�") == 0) { + // Don't print incomplete text + return on_finalized_subword_callback(res.str()); + } else if (text.size() > print_len) { + // It is possible to have a shorter text after adding new token. + // Print to output only if text lengh is increaesed. + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + print_len = text.size(); + } + + return on_finalized_subword_callback(res.str()); +} + +void TextCallbackStreamer::end() { + std::stringstream res; + std::string text = m_tokenizer.decode(m_tokens_cache); + if (text.size() <= print_len) + return ; + res << std::string_view{text.data() + print_len, text.size() - print_len} << std::flush; + m_tokens_cache.clear(); + print_len = 0; + on_finalized_subword_callback(res.str()); + return; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp new file mode 100644 index 0000000000..7afc52b4f6 --- /dev/null +++ b/src/cpp/src/text_callback_streamer.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/tokenizer.hpp" + +namespace ov { +namespace genai { + +class TextCallbackStreamer: public StreamerBase { +public: + bool put(int64_t token) override; + void end() override; + + TextCallbackStreamer(const Tokenizer& tokenizer, std::function<bool(std::string)> callback); + + std::function<bool(std::string)> on_finalized_subword_callback = [](std::string words)->bool { return false; }; +private: + Tokenizer m_tokenizer; + std::vector<int64_t> m_tokens_cache; + size_t print_len = 0; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/timer.hpp b/src/cpp/src/timer.hpp new file mode 100644 index 0000000000..c4893acd1c --- /dev/null +++ b/src/cpp/src/timer.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <string> +#include <chrono> +#include <iostream> + +class ManualTimer { 
+ double m_total; + decltype(std::chrono::steady_clock::now()) m_start; + std::string m_title; +public: + ManualTimer(const std::string& title) : + m_total(0.), + m_title(title) { + } + + void start() { + m_start = std::chrono::steady_clock::now(); + } + + void end() { + auto m_end = std::chrono::steady_clock::now(); + m_total += std::chrono::duration<double, std::milli>(m_end - m_start).count(); + } + + ~ManualTimer() { + std::cout << m_title << ": " << m_total / 1000. << " secs" << std::endl; + } +}; diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp new file mode 100644 index 0000000000..8563ab26d2 --- /dev/null +++ b/src/cpp/src/tokenizer.cpp @@ -0,0 +1,562 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <filesystem> +#include <fstream> +#include <memory> +#include <jinja2cpp/template.h> +#include <jinja2cpp/template_env.h> +#include <jinja2cpp/user_callable.h> +#include <jinja2cpp/generic_list.h> +#include <jinja2cpp/generic_list_iterator.h> + +#include "openvino/pass/manager.hpp" +#include "openvino/runtime/core.hpp" +#include "openvino/genai/tokenizer.hpp" + +#include "make_combine_segments_stateful.hpp" +#include "tokenizers_path.hpp" +#include "circular_buffer_queue.hpp" +#include "utils.hpp" + +namespace { + +// todo: remove when openvino-tokenizers will support left padding +ov::genai::TokenizedInputs pad_left(ov::Tensor& input_ids, ov::Tensor& attention_mask) { + const size_t batch_size = input_ids.get_shape()[0]; + const size_t sequence_length = input_ids.get_shape()[1]; + int64_t* inputs_data = input_ids.data<int64_t>(); + int64_t* attention_mask_data = attention_mask.data<int64_t>(); + + for (size_t batch = 0; batch < batch_size; batch++) { + const size_t batch_offset = batch * sequence_length; + + // last token in the sequence is not a PAD_TOKEN, skipping + if (attention_mask_data[batch_offset + sequence_length - 1] == 1) + continue; + + size_t pad_tokens_number = 0; + for (int i = sequence_length - 1; i >= 0; i--) { + const size_t token_offset = batch_offset + i; + + // count pad tokens + if (attention_mask_data[token_offset] == 0) + continue; + + if (pad_tokens_number == 0) + pad_tokens_number = sequence_length - i - 1; + + std::swap(inputs_data[token_offset], inputs_data[token_offset + pad_tokens_number]); + std::swap(attention_mask_data[token_offset], attention_mask_data[token_offset + pad_tokens_number]); + } + } + + return {input_ids, attention_mask}; +} + +constexpr char bos_token_key_name[] = "bos_token"; +constexpr char eos_token_key_name[] = "eos_token"; +constexpr char pad_token_key_name[] = "pad_token"; + +} // namespace + +namespace ov { +namespace genai { + +class Tokenizer::TokenizerImpl { +public: + ov::CompiledModel m_tokenizer; + ov::CompiledModel m_detokenizer; + + std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_tokenizer; + std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_detokenizer; + // To change the adding special tokens mode we use a statefull subgraph, + // this flag holds the current state value of the CompiledModel. 
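+ // set_state_if_necessary() compares the requested mode with this cached value and
+ // updates the infer request state only when the mode actually changes.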
+ bool m_add_special_tokens = true; + + int64_t m_pad_token_id = -1; + int64_t m_bos_token_id = -1; + int64_t m_eos_token_id = -1; + + std::string m_pad_token = ""; + std::string m_bos_token = ""; + std::string m_eos_token = ""; + + std::string m_chat_template = ""; + + void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, bool add_special_tokens) { + // If user requested add_special_tokens mode different from the current one, + // need to set state variable. + // If requested mode matches the stored state set, then don't touch states. + if (add_special_tokens == m_add_special_tokens) { + return; + } + + // auto states = m_ireq_queue_tokenizer->get(0).query_state(); + ov::Tensor add_special_tensor = ov::Tensor(ov::element::boolean, {}); + *add_special_tensor.data<bool>() = add_special_tokens; + + for (auto& state: infer_request_guard.get().query_state()) { + if (state.get_name().find(ov::genai::ADD_SPECIAL_TOKENS_VAR_ID) == std::string::npos) { + // It's not add_special_tokens flag state. + continue; + } + state.set_state(add_special_tensor); + break; + } + m_add_special_tokens = add_special_tokens; + } + + TokenizerImpl() = default; + + TokenizerImpl(std::filesystem::path tokenizer_path, const ov::AnyMap& plugin_config) + : m_chat_template{chat_template_from_tokenizer_json_if_exists(tokenizer_path)} { + ov::Core core; + + OPENVINO_ASSERT(tokenizer_path.extension() != ".xml", "ov_tokenizers_path should be a path to a dir not a xml file"); + + const char* ov_tokenizers_path = getenv(ScopedVar::ENVIRONMENT_VARIABLE_NAME); + OPENVINO_ASSERT(ov_tokenizers_path, "openvino_tokenizers path is not set"); + core.add_extension(ov_tokenizers_path); + + read_config(tokenizer_path); + read_special_tokens_map(tokenizer_path); + + // Try to read tokenizer_config if some token ids or token str are not defined. + read_tokenizer_config_if_necessary(tokenizer_path); + + auto device = "CPU"; // currently openvino_tokenizer supports only CPU + auto ov_tokenizer = core.read_model(tokenizer_path / "openvino_tokenizer.xml"); + + ov::pass::Manager manager; + manager.register_pass<MakeCombineSegmentsSatateful>(); + manager.run_passes(ov_tokenizer); + + m_tokenizer = core.compile_model(ov_tokenizer, device, plugin_config); + if (std::filesystem::exists(tokenizer_path / "openvino_detokenizer.xml")) { + m_detokenizer = core.compile_model(tokenizer_path / "openvino_detokenizer.xml", device, plugin_config); + } + + + const size_t INFER_REQUEST_QUEUE_SIZE = m_tokenizer.get_property(ov::optimal_number_of_infer_requests); + m_ireq_queue_tokenizer = std::make_unique<CircularBufferQueue<ov::InferRequest>>( + INFER_REQUEST_QUEUE_SIZE, + [this]() -> ov::InferRequest { + return std::move(this->m_tokenizer.create_infer_request()); + }); + if (m_detokenizer) { + m_ireq_queue_detokenizer = std::make_unique<CircularBufferQueue<ov::InferRequest>>( + INFER_REQUEST_QUEUE_SIZE, + [this]() -> ov::InferRequest { + return std::move(this->m_detokenizer.create_infer_request()); + }); + } + + // Get special token ids by inference if they are not defined. + infer_special_tokens_if_necessary(); + // Initialize tokenizer's cache to save time later. + // infer_special_tokens_if_necessary() already could do that + // but it didn't run decode() for sure. + // TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup. 
+ auto tokenized_input = encode("non empty string").input_ids; + if (m_detokenizer) + decode(tokenized_input); + } + + // load special tokens ids from config.json + void read_config(const std::filesystem::path& tokenizer_path) { + auto config_file_path = tokenizer_path / "config.json"; + if (!std::filesystem::exists(config_file_path)) + return ; + std::ifstream file(config_file_path); + if (!file.is_open()) + return ; + + nlohmann::json data = nlohmann::json::parse(file); + using ov::genai::utils::read_json_param; + + read_json_param(data, "pad_token_id", m_pad_token_id); + read_json_param(data, "bos_token_id", m_bos_token_id); + read_json_param(data, "eos_token_id", m_eos_token_id); + } + + // Reads the string representation of special tokens if they exist. + void read_special_tokens_map(const std::filesystem::path& tokenizer_path) { + auto special_tokens_file_path = tokenizer_path / "special_tokens_map.json"; + if (!std::filesystem::exists(special_tokens_file_path)) + return ; + std::ifstream f(special_tokens_file_path); + if (!f.is_open()) + return ; + + nlohmann::json data = nlohmann::json::parse(f); + + using ov::genai::utils::read_json_param; + // they are in the format {"bos_token": { "content": "<s>",... }} + auto read_token_content_str = [&data](std::string key_name, std::string& val) { + if (val == "" && data.contains(key_name)) { read_json_param(data[key_name], "content", val); } + }; + read_token_content_str(pad_token_key_name, m_pad_token); + read_token_content_str(bos_token_key_name, m_bos_token); + read_token_content_str(eos_token_key_name, m_eos_token); + } + + // Read string representation of special tokens if they exist. + // Also tries to load special token ids from added_tokens_decoder if they exist. + // Will not override special token strings or ids if they already exist. + void read_tokenizer_config_if_necessary(const std::filesystem::path& tokenizer_path) { + if (m_pad_token_id != -1 && m_bos_token_id != -1 && m_eos_token_id != -1 && + !m_pad_token.empty() && !m_bos_token.empty() && !m_eos_token.empty()) { + return ; + } + + auto tokenizer_config_file_path = tokenizer_path / "tokenizer_config.json"; + if (!std::filesystem::exists(tokenizer_config_file_path)) + return ; + std::ifstream f(tokenizer_config_file_path); + if (!f.is_open()) + return ; + + nlohmann::json data = nlohmann::json::parse(f); + + // read special tokens string representation + // if they are presented directly {"bos_token": "<bos>"} + using ov::genai::utils::read_json_param; + auto read_token_str = [&data](std::string key_name, std::string& val) { + if (val.empty()) { read_json_param(data, key_name, val); } + }; + read_token_str(pad_token_key_name, m_pad_token); + read_token_str(bos_token_key_name, m_bos_token); + read_token_str(eos_token_key_name, m_eos_token); + + // if special tokens are not loaded directly, try to read + // if they are in the format {"bos_token": { "content": "<s>",... 
}} + auto read_token_content_str = [&data](std::string key_name, std::string& val) { + if (val.empty() && data.contains(key_name)) { read_json_param(data[key_name], "content", val); } + }; + read_token_content_str(pad_token_key_name, m_pad_token); + read_token_content_str(bos_token_key_name, m_bos_token); + read_token_content_str(eos_token_key_name, m_eos_token); + + // if pad_token not found use eos_token as pad_token + if (m_pad_token.empty() && !m_eos_token.empty()) { + m_pad_token = m_eos_token; + } + + // special token ids integer representation are already defined + if (m_pad_token_id != -1 && m_bos_token_id != -1 && m_eos_token_id != -1) + return ; + + // values are stored as {"added_tokens_decoder": {"0": {"content": "<pad>"}}} + // token id is a key in the form of a string, need to do std::stoi + std::string spec_tokens_key_name = "added_tokens_decoder"; + if (!data.contains(spec_tokens_key_name)) + return ; + + // if added_tokens_decoder has different format items() will not fail + for (auto& [key, value] : data[spec_tokens_key_name].items()) { + if (!value.contains("content")) + continue; + auto content = value["content"]; + if (m_pad_token_id == -1 && content == m_pad_token) + m_pad_token_id = std::stoi(key); + if (m_bos_token_id == -1 && content == m_bos_token) + m_bos_token_id = std::stoi(key); + if (m_eos_token_id == -1 && content == m_eos_token) + m_eos_token_id = std::stoi(key); + } + + // if pad_token_id not found use eos_token_id as pad_token_id + // todo: read m_pad_token_id from tokenizer rt_info once implemented in tokenizers (CVS-144174) + if (m_pad_token_id == -1 && m_eos_token_id != -1) { + m_pad_token_id = m_eos_token_id; + } + } + + // tokenize str representation to get special tokens integer values + void infer_special_tokens_if_necessary() { + auto get_id_from_str = [this](std::string token_str, int64_t& token_val) { + if (token_val != -1 || token_str.empty()) + return ; + auto token_ids_tensor = this->encode(token_str).input_ids; + auto data = token_ids_tensor.data<int64_t>(); + auto data_len = token_ids_tensor.get_shape()[1]; + token_val = data[data_len - 1]; + }; + get_id_from_str(m_pad_token, m_pad_token_id); + get_id_from_str(m_bos_token, m_bos_token_id); + get_id_from_str(m_eos_token, m_eos_token_id); + } + + TokenizedInputs encode(std::string prompt, const ov::AnyMap& tokenization_params = {}) { + bool add_special_tokens_flag = true; + ov::genai::utils::read_anymap_param(tokenization_params, add_special_tokens.name(), add_special_tokens_flag); + + CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get()); + set_state_if_necessary(infer_request_guard, add_special_tokens_flag); + size_t batch_size = 1; + infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {batch_size}, &prompt}); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + return get_copied_results( + infer_request_guard.get().get_tensor("input_ids"), + infer_request_guard.get().get_tensor("attention_mask") + ); + } + + TokenizedInputs encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params = {}) { + + TokenizedInputs unpadded; + { + bool add_special_tokens_flag = true; + ov::genai::utils::read_anymap_param(tokenization_params, add_special_tokens.name(), add_special_tokens_flag); + + CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_tokenizer.get()); + set_state_if_necessary(infer_request_guard, add_special_tokens_flag); + 
infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::string, {prompts.size()}, prompts.data()}); + auto size_ = infer_request_guard.get().get_input_tensor().get_shape(); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + + unpadded = get_copied_results( + infer_request_guard.get().get_tensor("input_ids"), + infer_request_guard.get().get_tensor("attention_mask") + ); + } + return pad_left(unpadded.input_ids, unpadded.attention_mask); + } + + TokenizedInputs get_copied_results(ov::Tensor input_ids, ov::Tensor attention_mask) { + ov::Tensor input_ids_ = ov::Tensor(input_ids.get_element_type(), input_ids.get_shape()); + ov::Tensor attention_mask_ = ov::Tensor(attention_mask.get_element_type(), attention_mask.get_shape()); + input_ids.copy_to(input_ids_); + attention_mask.copy_to(attention_mask_); + + return {input_ids_, attention_mask_}; + } + + std::string decode(std::vector<int64_t> tokens) { + OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available"); + + CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get()); + size_t batch_size = 1; + infer_request_guard.get().set_input_tensor(ov::Tensor{ov::element::i64, {batch_size, tokens.size()}, tokens.data()}); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + return infer_request_guard.get().get_output_tensor().data<std::string>()[0]; + } + + std::vector<std::string> decode(ov::Tensor tokens) { + OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. Tokenizer::decode is not available"); + OPENVINO_ASSERT(tokens.get_element_type() == ov::element::i64, "tokens tensor element type should be an i64"); + OPENVINO_ASSERT(tokens.get_shape().size() == 2, "tokens tensor should of rank 2 with shape [batch_size, seq_len]"); + + CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get()); + infer_request_guard.get().set_input_tensor(tokens); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + + auto res = infer_request_guard.get().get_output_tensor(); + auto res_data = res.data<std::string>(); + return std::vector<std::string>(res_data, res_data + res.get_shape()[0]); + } + + std::vector<std::string> decode(std::vector<std::vector<int64_t>> lines) { + OPENVINO_ASSERT(m_detokenizer, "Detokenize model has not been provided. 
Tokenizer::decode is not available"); + + auto compare_lengths = [](const std::vector<int64_t>& a, const std::vector<int64_t>& b) { + return a.size() < b.size(); + }; + size_t max_len = std::max_element(lines.begin(), lines.end(), compare_lengths)->size(); + + ov::Tensor tokens = ov::Tensor{ov::element::i64, {lines.size(), max_len}}; + auto tokens_data = tokens.data<int64_t>(); + + for (size_t i = 0; i < lines.size(); ++i) { + const auto& line = lines[i]; + size_t line_len = line.size(); + std::copy(line.begin(), line.end(), tokens_data + i * max_len); + std::fill(tokens_data + i * max_len + line_len, tokens_data + (i + 1) * max_len, m_pad_token_id); + } + + CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard(this->m_ireq_queue_detokenizer.get()); + infer_request_guard.get().set_input_tensor(tokens); + infer_request_guard.get().start_async(); + infer_request_guard.get().wait(); + auto res = infer_request_guard.get().get_output_tensor(); + auto res_data = res.data<std::string>(); + return std::vector<std::string>(res_data, res_data + res.get_shape()[0]); + } + + std::string patch_chat_template(std::string template_str) { + // Replace what jinja2cpp doesn't support + std::pair<std::string, std::string> replace_str_map[] = { + {"'}", "' }"}, + {"{'", "{ '"}, + {".strip()", ""}, + {"is not none", "is defined"}, + {"is none", "is undefined"}, + {"= none", "= undefined"}, + // Jinja2Cpp does not support Python-style slicing, e.g. [1:]. + // If chat template contains such slicing, we replace it with + // a placeholder at the moment. + {"messages[1:]", "slice(messages, 1)"}, + }; + + for (const auto& [from, to] : replace_str_map) { + size_t pos = 0; + while ((pos = template_str.find(from, pos)) != std::string::npos) { + template_str.replace(pos, from.size(), to); + pos += to.size(); + } + } + return template_str; + } + + std::string chat_template_from_tokenizer_json_if_exists(const std::filesystem::path& path) { + auto tokenizer_config_file_path = path / "tokenizer_config.json"; + if (!std::filesystem::exists(tokenizer_config_file_path)) + return ""; + + std::ifstream file(tokenizer_config_file_path); + if (!file.is_open()) + return ""; + + std::string res = ""; + ov::genai::utils::read_json_param(nlohmann::json::parse(file), "chat_template", res); + if (res.empty()) + return res; + + return patch_chat_template(res); + } + + std::string apply_chat_template(ChatHistory history, + bool add_generation_prompt, + const std::string& chat_template) const { + auto chat_tpl = chat_template.empty() ? m_chat_template : chat_template; + OPENVINO_ASSERT(!chat_tpl.empty(), + "Chat template wasn't found. This may indicate that the model wasn't trained for chat scenario." + " Please add 'chat_template' to tokenizer_config.json to use the model in chat scenario." 
+ " For more information see the section Troubleshooting in README.md"); + jinja2::TemplateEnv env; + env.GetSettings().lstripBlocks = true; + env.GetSettings().trimBlocks = true; + jinja2::Template tpl(&env); + tpl.Load(chat_tpl); + + jinja2::UserCallable slice_callable = jinja2::MakeCallable( + [](const jinja2::GenericList& messages, const size_t& start) { + jinja2::ValuesList result; + + size_t iter_num = 0; + for (auto message = messages.begin(); message != messages.end(); message++, iter_num++) { + if (iter_num < start) + continue; + result.emplace_back(*message); + } + + return result; + }, + jinja2::ArgInfo{"messages"}, jinja2::ArgInfo{"start"} + ); + + jinja2::ValuesList jinja_messages; + jinja2::ValuesMap jinja_message; + for (const auto& message : history) { + jinja_message = {{"role", message.at("role")}, {"content", message.at("content")}}; + jinja_messages.emplace_back(jinja_message); + } + + jinja2::ValuesMap params = { + {"messages", jinja_messages}, + {"bos_token", m_bos_token}, + {"eos_token", m_eos_token}, + {"pad_token", m_pad_token}, + {"add_generation_prompt", add_generation_prompt}, + {"slice", slice_callable}, + }; + + try { + return tpl.RenderAsString(params).value(); + } catch (const std::exception& error) { + OPENVINO_THROW("Chat template for the current model is not supported by Jinja2Cpp. " + "Please apply template manually to your prompt before calling generate. " + "For exmaple: <start_of_turn>user{user_prompt}<end_of_turn><start_of_turn>model"); + } + } + + void set_chat_template(const std::string& chat_template) { + m_chat_template = patch_chat_template(chat_template); + } +}; + +Tokenizer::Tokenizer(const std::string& tokenizer_path, const ov::AnyMap& plugin_config) { + ScopedVar env_manager(tokenizers_relative_to_genai().string()); + m_pimpl = std::make_shared<TokenizerImpl>(tokenizer_path, plugin_config); +} + +TokenizedInputs Tokenizer::encode(const std::string prompt, const ov::AnyMap& tokenization_params) { + return m_pimpl->encode(std::move(prompt), tokenization_params); +} + +TokenizedInputs Tokenizer::encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params) { + return m_pimpl->encode(prompts, tokenization_params); +} + +TokenizedInputs Tokenizer::encode(std::vector<std::string>&& prompts, const ov::AnyMap& tokenization_params) { + return m_pimpl->encode(prompts, tokenization_params); +} + +TokenizedInputs Tokenizer::encode(std::initializer_list<std::string>& text, const ov::AnyMap& tokenization_params) { + return encode(std::vector<std::string>(text.begin(), text.end()), tokenization_params); +} + +std::string Tokenizer::decode(std::vector<int64_t> tokens) { + return m_pimpl->decode(tokens); +} + +std::vector<std::string> Tokenizer::decode(ov::Tensor tokens) { + return m_pimpl->decode(tokens); +} + +std::vector<std::string> Tokenizer::decode(std::vector<std::vector<int64_t>> lines) { + return m_pimpl->decode(lines); +} + +int64_t Tokenizer::get_bos_token_id() const { + return m_pimpl->m_bos_token_id; +} + +int64_t Tokenizer::get_eos_token_id() const { + return m_pimpl->m_eos_token_id; +} + +int64_t Tokenizer::get_pad_token_id() const { + return m_pimpl->m_pad_token_id; +} + +std::string Tokenizer::get_pad_token() const { + return m_pimpl->m_pad_token; +} + +std::string Tokenizer::get_bos_token() const { + return m_pimpl->m_bos_token; +} + +std::string Tokenizer::get_eos_token() const { + return m_pimpl->m_eos_token; +} + +std::string Tokenizer::apply_chat_template(ChatHistory history, + bool add_generation_prompt, + const 
std::string& chat_template) const { + return m_pimpl->apply_chat_template(history, add_generation_prompt, chat_template); +} + +void Tokenizer::set_chat_template(const std::string& chat_template) { + m_pimpl->set_chat_template(chat_template); +} + +Tokenizer::~Tokenizer() = default; +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/tokenizers_path.cpp b/src/cpp/src/tokenizers_path.cpp new file mode 100644 index 0000000000..d0cd82c265 --- /dev/null +++ b/src/cpp/src/tokenizers_path.cpp @@ -0,0 +1,80 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "tokenizers_path.hpp" + +#include <sstream> +#ifdef _WIN32 +# include <windows.h> +# define MAX_ABS_PATH _MAX_PATH +# define get_absolute_path(result, path) _fullpath(result, path.c_str(), MAX_ABS_PATH) +#else +# include <dlfcn.h> +# include <limits.h> +# include <string.h> +# define MAX_ABS_PATH PATH_MAX +# define get_absolute_path(result, path) realpath(path.c_str(), result) +#endif + +namespace { +#ifndef _WIN32 +std::string get_absolute_file_path(const std::string& path) { + std::string absolutePath; + absolutePath.resize(MAX_ABS_PATH); + std::ignore = get_absolute_path(&absolutePath[0], path); + if (!absolutePath.empty()) { + // on Linux if file does not exist or no access, function will return NULL, but + // `absolutePath` will contain resolved path + absolutePath.resize(absolutePath.find('\0')); + return std::string(absolutePath); + } + std::stringstream ss; + ss << "Can't get absolute file path for [" << path << "], err = " << strerror(errno); + throw std::runtime_error(ss.str()); +} +#endif + +std::string get_ov_genai_library_path() { +#ifdef _WIN32 + CHAR genai_library_path[MAX_PATH]; + HMODULE hm = NULL; + if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + reinterpret_cast<LPSTR>(get_ov_genai_library_path), + &hm)) { + std::stringstream ss; + ss << "GetModuleHandle returned " << GetLastError(); + throw std::runtime_error(ss.str()); + } + GetModuleFileNameA(hm, (LPSTR)genai_library_path, sizeof(genai_library_path)); + return std::string(genai_library_path); +#elif defined(__APPLE__) || defined(__linux__) || defined(__EMSCRIPTEN__) + Dl_info info; + dladdr(reinterpret_cast<void*>(get_ov_genai_library_path), &info); + return get_absolute_file_path(info.dli_fname).c_str(); +#else +# error "Unsupported OS" +#endif // _WIN32 +} + +std::filesystem::path with_openvino_tokenizers(const std::filesystem::path& path) { +#if !defined(NDEBUG) && (defined(__APPLE__) || defined(_WIN32)) +# define LIB_POSTFIX "d" +#else +# define LIB_POSTFIX "" +#endif +#ifdef _WIN32 + constexpr char tokenizers[] = "openvino_tokenizers" LIB_POSTFIX ".dll"; +#elif defined(__linux__) + constexpr char tokenizers[] = "libopenvino_tokenizers" LIB_POSTFIX ".so"; +#elif defined(__APPLE__) + constexpr char tokenizers[] = "libopenvino_tokenizers" LIB_POSTFIX ".dylib"; +#else +# error "Unsupported OS" +#endif + return path.parent_path() / tokenizers; +} +} + +std::filesystem::path tokenizers_relative_to_genai() { + return with_openvino_tokenizers(get_ov_genai_library_path()); +} diff --git a/src/cpp/src/tokenizers_path.hpp b/src/cpp/src/tokenizers_path.hpp new file mode 100644 index 0000000000..5557b099f1 --- /dev/null +++ b/src/cpp/src/tokenizers_path.hpp @@ -0,0 +1,52 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <filesystem> + +#include "openvino/genai/visibility.hpp" + +// Returns an 
absolute path. The path is this library's directory +// concatenated with openvino_tokenizers OS specific +// * name (.so, .dll, .dylib, lib prefix). This is part of the interface +// because it's reused in Python bindings. +// tokenizers_relative_to_genai() and ScopedVar allow passing a path to +// openvino_tokenizers through env var removing one argument from +// Tokenizer's constructor. +OPENVINO_GENAI_EXPORTS +std::filesystem::path tokenizers_relative_to_genai(); + +namespace { +// Sets ENVIRONMENT_VARIABLE_NAME to environment_variable_value +// and unsets in destructor. Does nothing if ENVIRONMENT_VARIABLE_NAME +// was already defined. +class ScopedVar { +public: + bool was_already_set{false}; + static constexpr char ENVIRONMENT_VARIABLE_NAME[] = "OPENVINO_TOKENIZERS_PATH_GENAI"; + explicit ScopedVar(const std::string& environment_variable_value) { +#ifdef _WIN32 + char* value = nullptr; + size_t len = 0; + _dupenv_s(&value, &len, ENVIRONMENT_VARIABLE_NAME); + if (value == nullptr) + _putenv_s(ENVIRONMENT_VARIABLE_NAME, environment_variable_value.c_str()); +#else + if (!getenv(ENVIRONMENT_VARIABLE_NAME)) + setenv(ENVIRONMENT_VARIABLE_NAME, environment_variable_value.c_str(), 1); +#endif + else + was_already_set = true; + } + ~ScopedVar() { + if (!was_already_set) { +#ifdef _WIN32 + _putenv_s(ENVIRONMENT_VARIABLE_NAME, ""); +#else + unsetenv(ENVIRONMENT_VARIABLE_NAME); +#endif + } + } +}; +} diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp new file mode 100644 index 0000000000..e7f58a015e --- /dev/null +++ b/src/cpp/src/utils.cpp @@ -0,0 +1,264 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "utils.hpp" + +#include <fstream> + +#include "openvino/op/add.hpp" +#include "openvino/op/divide.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/matmul.hpp" +#include "openvino/op/slice.hpp" +#include "openvino/op/tanh.hpp" +#include "openvino/op/transpose.hpp" + +namespace ov { +namespace genai { +namespace utils { + +Tensor init_attention_mask(const Tensor& input_ids) { + auto shape = input_ids.get_shape(); + auto attention_mask = ov::Tensor{input_ids.get_element_type(), shape}; + std::fill_n(attention_mask.data<int64_t>(), shape[0] * shape[1], 1); + return attention_mask; +} + +void print_tensor(const ov::Tensor& tensor) { + std::vector<int64_t> res; + + auto t_shape = tensor.get_shape(); + std::cout << "["; + for (size_t i = 0; i < t_shape[0]; ++i) { + std::cout << "|"; + for (size_t j = 0; j < t_shape[1]; ++j) { + if (tensor.get_element_type() == ov::element::i64) { + res.emplace_back(tensor.data<int64_t>()[t_shape[1] * i + j]); + std::cout << tensor.data<int64_t>()[t_shape[1] * i + j] << " "; + } + } + std::cout << "|"; + } + std::cout << "]" << std::endl; +} + +int64_t argmax(const ov::Tensor& logits, const size_t batch_idx) { + if (logits.get_shape()[0] <= batch_idx) { + OPENVINO_THROW("logits batch size doesn't match the number of beams"); + } + + size_t vocab_size = logits.get_shape().back(); + size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; + size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size; + const float* logits_data = logits.data<const float>() + batch_offset + sequence_offset; + + int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + float max_logit = logits_data[out_token]; + + return out_token; +} + +/** + * Initializes position ids based on attention mask and starting position + */ +void initialize_position_ids(ov::Tensor& 
position_ids, const ov::Tensor& attention_mask, int64_t start_pos) { + OPENVINO_ASSERT(position_ids.get_element_type() == ov::element::i64, + "position_ids tensor element type should be an i64"); + OPENVINO_ASSERT(position_ids.get_shape().size() == 2, + "position_ids tensor should of rank 2 with shape [batch_size, seq_len]"); + OPENVINO_ASSERT(attention_mask.get_element_type() == ov::element::i64, + "attention_mask tensor element type should be an i64"); + OPENVINO_ASSERT(attention_mask.get_shape().size() == 2, + "attention_mask tensor should of rank 2 with shape [batch_size, seq_len]"); + + const size_t batch_size = attention_mask.get_shape()[0]; + const size_t seq_length = attention_mask.get_shape()[1]; + + const int64_t* attention_mask_data = attention_mask.data<int64_t>(); + int64_t* position_ids_data = position_ids.data<int64_t>(); + + for (size_t batch = 0; batch < batch_size; batch++) { + size_t sum = start_pos; + for (size_t i = 0; i < seq_length; i++) { + const size_t element_offset = batch * seq_length + i; + position_ids_data[element_offset] = sum; + if (attention_mask_data[element_offset] == 1) { + sum += 1; + } + } + } +} + +void initialize_beam_inputs(const ov::Tensor& input_ids, const ov::Tensor& attention_mask, ov::InferRequest& request) { + request.set_tensor("input_ids", input_ids); + request.set_tensor("attention_mask", attention_mask); + + ov::Shape input_shape = input_ids.get_shape(); + + ov::Tensor position_ids = request.get_tensor("position_ids"); + position_ids.set_shape(input_shape); + initialize_position_ids(position_ids, attention_mask); + + ov::Tensor beam_idx = request.get_tensor("beam_idx"); + beam_idx.set_shape({input_shape.at(0)}); + std::fill_n(beam_idx.data<int32_t>(), input_shape.at(0), 0); +} + +void set_attention_mask(ov::Tensor&& attention_mask, std::vector<int32_t> next_beams) { + ov::Tensor original_mask{ov::element::i64, attention_mask.get_shape()}; + ov::Shape original_shape = original_mask.get_shape(); + attention_mask.copy_to(original_mask); + + ov::Shape new_shape{next_beams.size(), original_mask.get_shape().at(1) + 1}; + attention_mask.set_shape(new_shape); + + for (size_t beam_id = 0; beam_id < next_beams.size(); beam_id++) { + const size_t original_prompt_offset = next_beams.at(beam_id) * original_shape.at(1); + const size_t result_prompt_offset = beam_id * new_shape.at(1); + + int64_t* dest = attention_mask.data<int64_t>() + result_prompt_offset; + const int64_t* src = original_mask.data<int64_t>() + original_prompt_offset; + + std::memcpy(dest, src, original_shape.at(1) * sizeof(int64_t)); + attention_mask.data<int64_t>()[result_prompt_offset + new_shape.at(1) - 1] = 1; + } +} + +/** + * Set position ids tensor data for next token inference based on provided attention mask + * Supports multi batch + * Supports sparse attention_mask + */ +void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { + const size_t batch_size = attention_mask.get_shape().at(0); + const size_t atten_length = attention_mask.get_shape().at(1); + position_ids.set_shape({batch_size, 1}); + + for (size_t batch = 0; batch < batch_size; batch++) { + int64_t* start = attention_mask.data<int64_t>() + batch * atten_length; + // todo: be careful with start + atten_length, probably need to replace with start + atten_length -1 + position_ids.data<int64_t>()[batch] = std::accumulate(start, start + atten_length, 0); + } +} + +/** + * Get attention mask tensor for next token inference + * Supports multi batch + * Supports sparse attention_mask + */ 
+ov::Tensor extend_attention(ov::Tensor attention_mask) { + auto shape = attention_mask.get_shape(); + auto batch_size = shape[0]; + auto seq_len = shape[1]; + + ov::Tensor new_atten_mask = ov::Tensor{attention_mask.get_element_type(), {batch_size, seq_len + 1}}; + auto old_data = attention_mask.data<int64_t>(); + auto new_data = new_atten_mask.data<int64_t>(); + for (size_t batch = 0; batch < batch_size; ++batch) { + std::memcpy(new_data + batch * (seq_len + 1), old_data + batch * seq_len, seq_len * sizeof(int64_t)); + new_data[batch * (seq_len + 1) + seq_len] = 1; + } + return new_atten_mask; +} + +ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map) { + ov::genai::StreamerVariant streamer = std::monostate(); + + if (config_map.count(STREAMER_ARG_NAME)) { + auto any_val = config_map.at(STREAMER_ARG_NAME); + if (any_val.is<std::shared_ptr<ov::genai::StreamerBase>>()) { + streamer = any_val.as<std::shared_ptr<ov::genai::StreamerBase>>(); + } else if (any_val.is<std::function<bool(std::string)>>()) { + streamer = any_val.as<std::function<bool(std::string)>>(); + } + } + return streamer; +} + +ov::genai::OptionalGenerationConfig get_config_from_map(const ov::AnyMap& config_map) { + if (config_map.count(CONFIG_ARG_NAME)) + return config_map.at(CONFIG_ARG_NAME).as<ov::genai::GenerationConfig>(); + else + return std::nullopt; +} + +ProcessorConfig from_any_map( + const ov::AnyMap& config_map, + const ProcessorConfig& initial +) { + auto iter = config_map.find("processor_config"); + ProcessorConfig extracted_config = config_map.end() != iter ? + iter->second.as<ProcessorConfig>() : initial; + using utils::read_anymap_param; + read_anymap_param(config_map, "patch_size", extracted_config.patch_size); + read_anymap_param(config_map, "scale_resolution", extracted_config.scale_resolution); + read_anymap_param(config_map, "max_slice_nums", extracted_config.max_slice_nums); + read_anymap_param(config_map, "norm_mean", extracted_config.norm_mean); + read_anymap_param(config_map, "norm_std", extracted_config.norm_std); + return extracted_config; +} + +/** + * Split config by core and compile configs + * There are not supported by `core.compile` function plugin options like `ENABLE_MMAP` + * Move this options to `core.set_property` config + */ +std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& plugin_config) { + const std::vector<std::string> unsupported_by_compile_options{"ENABLE_MMAP"}; + ov::AnyMap core_config; + ov::AnyMap compile_config{plugin_config}; + + for (const auto option : unsupported_by_compile_options) { + auto iter = plugin_config.find(option); + if (iter != plugin_config.end()) { + core_config[option] = iter->second; + compile_config.erase(option); + } + } + + return {core_config, compile_config}; +}; + +ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend) { + auto minuend_size = minuend.input_ids.get_size(); + auto subtrahend_size = subtrahend.input_ids.get_size(); + ov::Shape new_shape{1, minuend_size - subtrahend_size}; + + ov::Tensor new_input_ids(ov::element::i64, new_shape); + auto data_ptr = minuend.input_ids.data<int64_t>(); + std::copy(data_ptr + subtrahend_size, data_ptr + minuend_size, new_input_ids.data<int64_t>()); + + ov::Tensor new_attention_mask(ov::element::i64, new_shape); + std::fill_n(new_attention_mask.data<int64_t>(), new_shape[1], 1); + + return {new_input_ids, new_attention_mask}; +} + +void 
slice_matmul_statefull_model(std::shared_ptr<ov::Model> model) { + ov::Node* matmul = nullptr; + auto last_node = model->output(0).get_node()->input_value(0).get_node(); + if (matmul = dynamic_cast<ov::op::v0::MatMul*>(last_node)) { + } else if(auto add = dynamic_cast<ov::op::v1::Add*>(last_node)) { + matmul = dynamic_cast<ov::op::v0::MatMul*>(add->input_value(0).get_node()); + } else if (auto transpose = dynamic_cast<ov::op::v1::Transpose*>(last_node)) { + matmul = dynamic_cast<ov::op::v0::MatMul*>(transpose->input_value(0).get_node()); + } else if (auto multiply = dynamic_cast<ov::op::v1::Multiply*>(last_node)) { + if (auto tanh = dynamic_cast<ov::op::v0::Tanh*>(multiply->input_value(0).get_node())) { + if (auto divide = dynamic_cast<ov::op::v1::Divide*>(tanh->input_value(0).get_node())) { + matmul = dynamic_cast<ov::op::v0::MatMul*>(divide->input_value(0).get_node()); + } + } + } + + if (matmul && matmul->input(0).get_partial_shape().rank().get_length() == 3) { + auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1}); + auto stop = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-2}); + auto step = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1}); + auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{1}); + auto slice = std::make_shared<ov::op::v8::Slice>(matmul->input_value(0), start, stop, step, axis); + matmul->input(0).replace_source_output(slice); + } +} +} // namespace utils +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp new file mode 100644 index 0000000000..7a0f3ddef2 --- /dev/null +++ b/src/cpp/src/utils.hpp @@ -0,0 +1,94 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <nlohmann/json.hpp> + +#include "openvino/genai/llm_pipeline.hpp" +#include "visual_language/processor_config.hpp" + +namespace ov { +namespace genai { +namespace utils { + +Tensor init_attention_mask(const Tensor& position_ids); + +void print_tensor(const ov::Tensor& tensor); + +int64_t argmax(const ov::Tensor& logits, const size_t batch_idx); + +void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0); + +ov::Tensor extend_attention(ov::Tensor attention_mask); + +void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask); + +/// @brief reads value to param if T argument type is aligned with value stores in json +/// if types are not compatible leave param unchanged +template <typename T> +void read_json_param(const nlohmann::json& data, const std::string& name, T& param) { + if (data.contains(name)) { + if (data[name].is_number() || data[name].is_boolean() || data[name].is_string() || data[name].is_object()) { + param = data[name].get<T>(); + } + } else if (name.find(".") != std::string::npos) { + size_t delimiter_pos = name.find("."); + std::string key = name.substr(0, delimiter_pos); + if (!data.contains(key)) { + return; + } + std::string rest_key = name.substr(delimiter_pos + 1); + + read_json_param(data[key], rest_key, param); + } +} + +template <typename V> +void read_json_param(const nlohmann::json& data, const std::string& name, std::vector<V>& param) { + if (data.contains(name) && data[name].is_array()) { + param.resize(0); + for (const auto elem : data[name]) { + param.push_back(elem.get<V>()); + } + } +} 
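The dotted-name branch of read_json_param above lets callers pull a nested JSON value with a single flat key, recursing one object level per '.' segment. A minimal sketch of that behavior, assuming this header is on the include path; the JSON content and the function name below are hypothetical, and a missing key or an incompatible type simply leaves the destination variable at its previous value:

    #include <nlohmann/json.hpp>
    #include "utils.hpp"  // ov::genai::utils::read_json_param

    size_t example_patch_size() {
        nlohmann::json data = nlohmann::json::parse(R"({"vision_config": {"patch_size": 14}})");
        size_t patch_size = 0;  // retained if the key is absent or has an unexpected type
        // Splits on '.', descends into data["vision_config"], then reads "patch_size".
        ov::genai::utils::read_json_param(data, "vision_config.patch_size", patch_size);
        return patch_size;  // 14
    }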
+ +template <typename T> +void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& param) { + auto it = config_map.find(name); + if (it != config_map.end()) { + param = it->second.as<T>(); + } +} + +const std::string STREAMER_ARG_NAME = "streamer"; +const std::string CONFIG_ARG_NAME = "generation_config"; + +template<typename Config=ov::genai::GenerationConfig> +Config from_config_json_if_exists(const std::filesystem::path& model_path, const char config_name[]="generation_config.json") { + auto config_file_path = model_path / config_name; + if (std::filesystem::exists(config_file_path)) { + return Config{(config_file_path).string()}; + } else { + return Config{}; + } +} + +ov::genai::StreamerVariant get_streamer_from_map(const ov::AnyMap& config_map); + +ov::genai::OptionalGenerationConfig get_config_from_map(const ov::AnyMap& config_map); + +ProcessorConfig from_any_map( + const ov::AnyMap& config_map, + const ProcessorConfig& initial +); + +std::pair<ov::AnyMap, ov::AnyMap> split_core_complile_config(const ov::AnyMap& plugin_config); + +ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend); + +void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model); +} // namespace utils +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/visual_language/clip.cpp b/src/cpp/src/visual_language/clip.cpp new file mode 100644 index 0000000000..93adc26eb2 --- /dev/null +++ b/src/cpp/src/visual_language/clip.cpp @@ -0,0 +1,316 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +// NOTE: This is modified from clip.cpp only for LLaVA, +// so there might be still unnecessary artifacts hanging around +// I'll gradually clean and extend it +// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch + +#include <cassert> +#include <cmath> +#include <cstdlib> +#include <cstring> +#include <fstream> +#include <map> +#include <regex> +#include <stdexcept> +#include <vector> +#include <sstream> +#include <cinttypes> +#include <limits> + +#include "clip.hpp" + +#include <openvino/openvino.hpp> + +struct clip_hparams { + int32_t image_size; + int32_t hidden_size; + int32_t n_intermediate; + int32_t projection_dim; + int32_t n_head; + int32_t n_layer; + + float eps; + + char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default) + + int32_t image_grid_pinpoints[32]; + int32_t image_crop_resolution; +}; + +// Linear interpolation between two points +inline float clip_lerp(float s, float e, float t) { + return s + (e - s) * t; +} +// Bilinear resize function +static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float x_ratio = static_cast<float>(src.nx - 1) / target_width; + float y_ratio = static_cast<float>(src.ny - 1) / target_height; + + for (int y = 0; y < target_height; y++) { + for (int x = 0; x < target_width; x++) { + float px = x_ratio * x; + float py = y_ratio * y; + int x_floor = static_cast<int>(px); + int y_floor = static_cast<int>(py); + float x_lerp = px - x_floor; + float y_lerp = py - y_floor; + + for (int c = 0; c < 3; c++) { + float top = clip_lerp( + static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]), + 
static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + float bottom = clip_lerp( + static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), + static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), + x_lerp + ); + dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp)); + } + } + } +} + +// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not +static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) { + dst->nx = src->nx; + dst->ny = src->ny; + dst->buf.resize(src->buf.size()); + + for (size_t i = 0; i < src->buf.size(); ++i) { + int c = i % 3; // rgb + dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c]; + } +} + +template<typename NUM> +NUM clip(NUM x, NUM lower, NUM upper) { + return std::max(lower, std::min(x, upper)); +} + +bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) { + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float Cc; + float C[5]; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) { + for (j = 0; j < target_width; j++) { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + for (k = 0; k < 3; k++) { + for (jj = 0; jj <= 3; jj++) { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + + C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; + + d0 = C[0] - C[1]; + d2 = C[2] - C[1]; + d3 = C[3] - C[1]; + a0 = C[1]; + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; + Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; + + const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); + dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); + } + } + } + } + + return true; +} + +// llava-1.6 type of resize_and_pad (black) +static void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair<int, int>& target_resolution) { + int target_width = target_resolution.first; + int target_height = target_resolution.second; + + float scale_w = static_cast<float>(target_width) / image.nx; + float scale_h = static_cast<float>(target_height) / 
image.ny; + + int new_width, new_height; + + if (scale_w < scale_h) { + new_width = target_width; + new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height); + } else { + new_height = target_height; + new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width); + } + + clip_image_u8 resized_image; + // bilinear_resize(image, resized_image, new_width, new_height); + bicubic_resize(image, resized_image, new_width, new_height); + + clip_image_u8 padded_image; + padded_image.nx = target_width; + padded_image.ny = target_height; + padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black + + // Calculate padding offsets + int pad_x = (target_width - new_width) / 2; + int pad_y = (target_height - new_height) / 2; + + // Copy the resized image into the center of the padded buffer + for (int y = 0; y < new_height; ++y) { + for (int x = 0; x < new_width; ++x) { + for (int c = 0; c < 3; ++c) { + padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c]; + } + } + } + image_output = std::move(padded_image); +} + +/** + * Selects the best resolution from a list of possible resolutions based on the original size. + * + * @param original_size The original size of the image in the format (width, height). + * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...]. + * @return The best fit resolution in the format (width, height). + */ +static std::pair<int, int> select_best_resolution(const std::pair<int, int> & original_size, const std::vector<std::pair<int, int>> & possible_resolutions) { + int original_width = original_size.first; + int original_height = original_size.second; + std::pair<int, int> best_fit; + int max_effective_resolution = 0; + int min_wasted_resolution = std::numeric_limits<int>::max(); + + for (const auto& resolution : possible_resolutions) { + int width = resolution.first; + int height = resolution.second; + float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height); + int downscaled_width = static_cast<int>(original_width * scale); + int downscaled_height = static_cast<int>(original_height * scale); + int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); + int wasted_resolution = (width * height) - effective_resolution; + // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { + max_effective_resolution = effective_resolution; + min_wasted_resolution = wasted_resolution; + best_fit = resolution; + } + } + + return best_fit; +} + +// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector +// res_imgs memory is being allocated here, previous allocations will be freed if found +clip_image_f32 clip_image_preprocess(clip_ctx& ctx, const clip_image_u8& img) { + bool pad_to_square = true; + + // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) + // see 
https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 + + clip_image_u8 temp; // we will keep the input image data here temporarily + temp.nx = img.nx; + temp.ny = img.ny; + temp.buf.resize(img.buf.size()); + memcpy(temp.buf.data(), img.buf.data(), temp.buf.size()); + + + const int nx = temp.nx; + const int ny = temp.ny; + // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp"); + + const int nx2 = temp.nx; + const int ny2 = temp.ny; + + clip_image_f32 res; + res.nx = nx2; + res.ny = ny2; + res.buf.resize(3 * nx2 * ny2); + + // const float scale = std::max(nx, ny) / (float)ctx.vision_model.hparams.image_size; + + // const int nx3 = int(nx / scale + 0.5f); + // const int ny3 = int(ny / scale + 0.5f); + + const int nx3 = nx; + const int ny3 = ny; + + const auto& m3 = ctx.image_mean; // {0.48145466f, 0.4578275f, 0.40821073f}; + const auto& s3 = ctx.image_std; // {0.26862954f, 0.26130258f, 0.27577711f}; + + for (int y = 0; y < ny3; y++) { + for (int x = 0; x < nx3; x++) { + for (int c = 0; c < 3; c++) { + // linear interpolation + const float sx = x; + const float sy = y; + + const int x0 = std::max(0, (int)std::floor(sx)); + const int y0 = std::max(0, (int)std::floor(sy)); + + const int x1 = std::min(x0 + 1, nx - 1); + const int y1 = std::min(y0 + 1, ny - 1); + + const float dx = sx - x0; + const float dy = sy - y0; + + const int j00 = 3 * (y0 * nx + x0) + c; + const int j01 = 3 * (y0 * nx + x1) + c; + const int j10 = 3 * (y1 * nx + x0) + c; + const int j11 = 3 * (y1 * nx + x1) + c; + + const float v00 = temp.buf[j00]; + const float v01 = temp.buf[j01]; + const float v10 = temp.buf[j10]; + const float v11 = temp.buf[j11]; + + const float v0 = v00 * (1.0f - dx) + v01 * dx; + const float v1 = v10 * (1.0f - dx) + v11 * dx; + + const float v = v0 * (1.0f - dy) + v1 * dy; + + const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f); + + //rgb hwc ->chw + //const int i = 3 * (y * nx3 + x) + c; + const int i = (y * nx3 + x) + c * nx3 * ny3; + + res.buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c]; + } + } + } + return res; +} diff --git a/src/cpp/src/visual_language/clip.hpp b/src/cpp/src/visual_language/clip.hpp new file mode 100644 index 0000000000..bce6cc8970 --- /dev/null +++ b/src/cpp/src/visual_language/clip.hpp @@ -0,0 +1,54 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <vector> +#include <numeric> + +//#define CLIP_DEBUG_FUNCTIONS +enum projector_type { + PROJECTOR_TYPE_RESAMPLER, + PROJECTOR_TYPE_UNKNOWN, +}; + +struct clip_ctx { + bool has_text_encoder = false; + bool has_vision_encoder = false; + bool has_minicpmv_projector = false; + + float image_mean[3]; + float image_std[3]; + int32_t ftype = 1; + + std::vector<uint8_t> buf_compute_meta; + + projector_type proj_type = PROJECTOR_TYPE_RESAMPLER; + size_t patch_size = 0; + size_t image_size = 0; +}; + +// RGB uint8 image +struct clip_image_u8 { + int nx; + int ny; + + std::vector<uint8_t> buf; +}; + +// RGB float32 image (NHWC) +// Memory layout: RGBRGBRGB... 
+struct clip_image_f32 { + int nx; + int ny; + + std::vector<float> buf; +}; + +/** interpret bytes as an image file with length bytes_length, and use the result to populate img */ +bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); + +bool bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_width, int target_height); + +/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */ +clip_image_f32 clip_image_preprocess(struct clip_ctx& ctx, const clip_image_u8& img); diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp new file mode 100644 index 0000000000..6d0223f79c --- /dev/null +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -0,0 +1,791 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/visual_language/pipeline.hpp" +#include "openvino/genai/tokenizer.hpp" +#include "vlm_sampling.hpp" +#include "clip.hpp" +#include "text_callback_streamer.hpp" +#include "utils.hpp" +#include "vision_encoder.hpp" +#include "vlm_config.hpp" +#include <openvino/openvino.hpp> +#include <optional> +#include <random> + +using namespace ov::genai; + +namespace { +template<class... Ts> struct overloaded : Ts... {using Ts::operator()...;}; +template<class... Ts> overloaded(Ts...) -> overloaded<Ts...>; + +constexpr size_t BATCH_SIZE = 1; + +struct Args { + bool do_sample = false; + int top_k = 0; + float top_p = 0.7f; + float temp = 0.95f; + float repeat_penalty = 1.0f; +}; + +int64_t get_out_token_id(const std::vector<int>& input_ids, float* logits, size_t vocab_size, Args args) { + int64_t out_token; + + // logits pre-process + if (args.repeat_penalty != 1.f) { + sampling_repetition_penalty(logits, logits + vocab_size, input_ids, args.repeat_penalty); + } + + if (args.do_sample) + { + if (args.temp > 0) { + sampling_temperature(logits, logits + vocab_size, args.temp); + } + + std::vector<TokenIdScore> token_scores(vocab_size); + for (int i = 0; i < vocab_size; i++) { + token_scores[i] = TokenIdScore(i, logits[i]); + } + + // top_k sampling + if (0 < args.top_k && args.top_k < (int)token_scores.size()) { + sampling_top_k(token_scores.data(), token_scores.data() + args.top_k, + token_scores.data() + token_scores.size()); + token_scores.resize(args.top_k); + } + + // top_p sampling + if (0.f < args.top_p && args.top_p < 1.f) { + auto pos = sampling_top_p(token_scores.data(), token_scores.data() + token_scores.size(), args.top_p); + token_scores.resize(pos - token_scores.data()); + } + + // sample next token + sampling_softmax_inplace(token_scores.data(), token_scores.data() + token_scores.size()); + for (size_t i = 0; i < token_scores.size(); i++) { + logits[i] = token_scores[i].score; + } + + thread_local std::random_device rd; + thread_local std::mt19937 gen(rd()); + + std::discrete_distribution<> dist(logits, logits + token_scores.size()); + out_token = token_scores[dist(gen)].id; + } + else { + out_token = std::max_element(logits, logits + vocab_size) - logits; + } + + return out_token; +} + +ov::Tensor process_prompt(ov::InferRequest& embedding, const ov::Tensor& prompt, float scale_emb) { + embedding.set_input_tensor(prompt); + embedding.infer(); + + const ov::Tensor& embed_output_tensor = embedding.get_output_tensor(); + + ov::Shape out_shape = embed_output_tensor.get_shape(); + float* data = embed_output_tensor.data<float>(); + + //embedding * 
scale_emb + for (size_t idx = 0; idx < embed_output_tensor.get_size(); idx++) { + data[idx] = data[idx] * scale_emb; + } + return embed_output_tensor; +} + +ov::Tensor concatenate_last_dim(const ov::Tensor& first, const ov::Tensor& second) { + size_t res_d_0 = first.get_shape().at(0); + size_t res_d_1 = first.get_shape().at(1); + OPENVINO_ASSERT(second.get_shape().at(0) == res_d_0); + OPENVINO_ASSERT(second.get_shape().at(1) == res_d_1); + size_t res_d_2 = first.get_shape().at(2) + second.get_shape().at(2); + ov::Tensor res{first.get_element_type(), {res_d_0, res_d_1, res_d_2}}; + float* first_data = first.data<float>(); + float* second_data = second.data<float>(); + float* res_data = res.data<float>(); + for (size_t i = 0; i < res_d_0; ++i) { + for (size_t j = 0; j < res_d_1; ++j) { + size_t k = 0; + for (; k < first.get_shape().at(2); ++k) { + res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] + = first_data[i * res_d_1 * first.get_shape().at(2) + j * first.get_shape().at(2) + k]; + } + for (size_t l = 0; l < second.get_shape().at(2); ++l, ++k) { + res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] + = second_data[i * res_d_1 * second.get_shape().at(2) + j * second.get_shape().at(2) + l]; + } + } + } + return res; +} + +ov::Tensor concatenate_mid_dim(const ov::Tensor& first, const ov::Tensor& second) { + size_t res_d_0 = first.get_shape().at(0); + size_t res_d_2 = first.get_shape().at(2); + OPENVINO_ASSERT(second.get_shape().at(0) == res_d_0); + OPENVINO_ASSERT(second.get_shape().at(2) == res_d_2); + size_t res_d_1 = first.get_shape().at(1) + second.get_shape().at(1); + ov::Tensor res{first.get_element_type(), {res_d_0, res_d_1, res_d_2}}; + float* first_data = first.data<float>(); + float* second_data = second.data<float>(); + float* res_data = res.data<float>(); + for (size_t i = 0; i < res_d_0; ++i) { + size_t j = 0; + for (; j < first.get_shape().at(1); ++j) { + std::copy_n( + first_data + i * first.get_shape().at(1) * res_d_2 + j * res_d_2, + res_d_2, + res_data + i * res_d_1 * res_d_2 + j * res_d_2 + ); + } + for (size_t k = 0; k < second.get_shape().at(1); ++k, ++j) { + std::copy_n( + second_data + i * second.get_shape().at(1) * res_d_2 + k * res_d_2, + res_d_2, + res_data + i * res_d_1 * res_d_2 + j * res_d_2 + ); + } + } + return res; +} + +/// embed_dim: output dimension for each position +/// pos: a list of positions to be encoded: size (H, W) +/// out: (H, W, D) +ov::Tensor get_1d_sincos_pos_embed_from_grid_new(size_t embed_dim, const ov::Tensor& pos) { + OPENVINO_ASSERT(embed_dim % 2 == 0); + ov::Shape pos_shape = pos.get_shape(); + size_t H = pos_shape[0]; + size_t W = pos_shape[1]; + + std::vector<float> omega(embed_dim / 2); + for (size_t i = 0; i < omega.size(); ++i) { + omega[i] = 1.0f / std::pow(10000.0f, float(i) / (embed_dim / 2)); + } + + std::vector<size_t> out_shape = {H, W, embed_dim}; + ov::Tensor emb(ov::element::f32, out_shape); + + float* pos_data = pos.data<float>(); + float* emb_data = emb.data<float>(); + + size_t counter = 0; + for (size_t h = 0; h < H; ++h) { + for (size_t w = 0; w < W; ++w) { + for (size_t d = 0; d < embed_dim / 2; ++d) { + // Correctly access the 2D position grid + float value = omega[d] * pos_data[h * W + w]; + // There should be sinf() and cosf(), but they don't exist on default Ubuntu20 gcc. 
+ emb_data[h * W * embed_dim + w * embed_dim + d] = std::sin(double(value)); + emb_data[h * W * embed_dim + w * embed_dim + d + (embed_dim / 2)] = std::cos(double(value)); + } + } + } + return emb; +} + +ov::Tensor get_2d_sincos_pos_embed_from_grid(size_t embed_dim, const ov::Tensor& grid) { + OPENVINO_ASSERT(embed_dim % 2 == 0); + ov::Shape grid_shape = grid.get_shape(); + float* grid_data = grid.data<float>(); + ov::Shape plane_shape{grid_shape.at(1), grid_shape.at(2)}; + ov::Tensor emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{ + ov::element::f32, + plane_shape, + grid_data + }); // (H, W, D/2) + ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{ + ov::element::f32, + plane_shape, + grid_data + plane_shape.at(0) * plane_shape.at(1) + }); // (H, W, D/2) + return concatenate_last_dim(emb_h, emb_w); +} + +/// image_size: image_size or (image_height, image_width) +/// return: +/// pos_embed: [image_height, image_width, embed_dim] +ov::Tensor get_2d_sincos_pos_embed(size_t embed_dim, const ImageSize& image_size) { + size_t grid_h_size = image_size.height, grid_w_size = image_size.width; + ov::Tensor grid(ov::element::f32, {2, grid_h_size, grid_w_size}); + float* data = grid.data<float>(); + for (size_t y = 0; y < grid_h_size; ++y) { + std::iota(data, data + grid_w_size, 0.0f); + data += grid_w_size; + } + for (float y = 0.0f; y < grid_h_size; ++y) { + std::fill(data, data + grid_w_size, y); + data += grid_w_size; + } + return get_2d_sincos_pos_embed_from_grid(embed_dim, grid); +} + +void adjust_pos_cache( + const std::vector<ImageSize>& target_sizes, + size_t hidden_size, + ov::Tensor& pos_embed_cache +) { + size_t max_h = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { + return left.height < right.height; + })->height; + size_t max_w = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { + return left.width < right.width; + })->width; + size_t allocated_height, allocated_width; + if (pos_embed_cache) { + const ov::Shape& allocated_shape = pos_embed_cache.get_shape(); + allocated_height = allocated_shape.at(0); + allocated_width = allocated_shape.at(1); + } else { + allocated_height = allocated_width = 70; + } + if (max_h > allocated_height || max_w > allocated_width) { + allocated_height = std::max(max_h, allocated_height); + allocated_width = std::max(max_w, allocated_width); + pos_embed_cache = get_2d_sincos_pos_embed( + hidden_size, {allocated_height, allocated_width} + ); + } +} + +ov::Tensor merge_text_and_image_embeddings_llava( + const ov::Tensor& input_ids, + const ov::Tensor& text_embeds, + const ov::Tensor& image_embeds, + int64_t image_token_index +) { + auto text_embeds_shape = text_embeds.get_shape(); + auto image_embeds_shape = image_embeds.get_shape(); + + OPENVINO_ASSERT( + text_embeds_shape[2] == image_embeds_shape[2], + "Incompatible shapes between text_embeds and image_embeds" + ); + + size_t text_embeds_seq_length = text_embeds_shape[1]; + size_t hidden_size = text_embeds_shape[2]; + size_t image_embeds_seq_length = image_embeds_shape[1]; + + size_t merged_seq_length = text_embeds_seq_length + (image_embeds_seq_length - 1); + + ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); + + const int64_t* input_ids_data = input_ids.data<const int64_t>(); + const float* text_embeds_data = text_embeds.data<const float>(); + const float* 
image_embeds_data = image_embeds.data<const float>(); + float* merged_data = merged_embeds.data<float>(); + + + size_t merged_idx = 0; + for (size_t s = 0; s < text_embeds_seq_length; ++s) { + if (input_ids_data[s] == image_token_index) { + for (size_t i = 0; i < image_embeds_seq_length; ++i) { + std::copy_n(image_embeds_data + i * hidden_size, + hidden_size, + merged_data + merged_idx * hidden_size); + merged_idx++; + } + } else { + std::copy_n(text_embeds_data + s * hidden_size, + hidden_size, + merged_data + merged_idx * hidden_size); + merged_idx++; + } + } + + return merged_embeds; +} + +ov::Core singleton_core() { + static ov::Core core; + return core; +} +} + +class ov::genai::VLMPipeline::VLMPipelineImpl { +public: + // A config to follow for LLM input construction. + VLMConfig m_vlm_config; + // A config to follow for text generation. + GenerationConfig m_generation_config; + // A tokenizer encoding a prompt. + Tokenizer m_tokenizer; + // An encoder to infer embeddings of an image. + VisionEncoder m_vision_encoder; + // A resampler model to resample image embeddings. + // [N, H*W, old_hidden_size] is the input shape. + // [N, query_num, hidden_size] is the output shape. + ov::InferRequest m_resampler; + // A model to compute token embeddings. + // Input shape: [N, conversation length]. + // Output shape: [1, conversation length, hidden_size]. + ov::InferRequest m_embedding; + // A language model used to generate a response. + // Input shapes: inputs_embeds[N, conversation length, hidden_size], + // position_ids[N, conversation length], beam_idx[N]. + // Output shape: logits[N, conversation length, vocab_size]. + ov::InferRequest m_language; + // Precomputed positional embeddings for the resampler. + // [70, 70, hidden_size]. 70 is the initial guess of the image + // height and width after dividing by patch_size. + ov::Tensor m_pos_embed_cache; + // True if chat mode is activated to save conversation + // history between generate() calls. + bool m_is_chat_conversation; + ChatHistory m_history; + std::string m_templated_chat_history; + size_t m_image_id; // Used to insert <image_id>i</image_id> per image (not a slice). 
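    // The members above back the public ov::genai::VLMPipeline API whose thin wrappers
    // appear at the end of this file. A minimal, hypothetical usage sketch (the model
    // directory, device, image size and prompt below are placeholders, not taken from
    // this patch; std::monostate{} means "no streamer"):
    //
    //     ov::genai::VLMPipeline pipe("./MiniCPM-V-2_6/", "CPU", {});
    //     ov::Tensor rgb(ov::element::u8, {448, 448, 3});  // HWC uint8 image, filled by the caller
    //     ov::genai::DecodedResults res = pipe.generate(
    //         "Describe this image.", {rgb}, pipe.get_generation_config(), std::monostate{});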
+ + VLMPipelineImpl( + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config + ) : + m_vlm_config{ + utils::from_config_json_if_exists<ov::genai::VLMConfig>( + model_dir, "config.json" + ) + }, + m_tokenizer{Tokenizer(model_dir.string(), device_config)}, + m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, singleton_core()), + m_is_chat_conversation{false}, + m_image_id{0} { + ov::Core core = singleton_core(); + if (m_vlm_config.model_type == VLMModelType::MINICPM) { + m_resampler = core.compile_model( + model_dir / "openvino_resampler_model.xml", device, device_config + ).create_infer_request(); + + m_embedding = core.compile_model( + model_dir / "openvino_text_embeddings_model.xml", device, device_config + ).create_infer_request(); + + m_language = core.compile_model( + model_dir / "openvino_language_model.xml", device, device_config + ).create_infer_request(); + + m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); + } else if (m_vlm_config.model_type == VLMModelType::LLAVA) { + m_language = core.compile_model( + model_dir / "openvino_language_model.xml", device, device_config + ).create_infer_request(); + + // Reusing the same m_embedding for llava text_embeddings model + m_embedding = core.compile_model( + model_dir / "openvino_text_embeddings_model.xml", device, device_config + ).create_infer_request(); + } + + m_language.get_tensor("attention_mask").set_shape({1, 0}); + } + + DecodedResults generate( + const std::string& prompt, + const std::vector<ov::Tensor>& rgbs, + const GenerationConfig& generation_config, + const StreamerVariant& streamer + ) { + ov::Tensor inputs_embeds; + if (m_vlm_config.model_type == VLMModelType::MINICPM) { + inputs_embeds = get_inputs_embeds_minicpm(prompt, rgbs); + } else if (m_vlm_config.model_type == VLMModelType::LLAVA) { + inputs_embeds = get_inputs_embeds_llava(prompt, rgbs); + } + + m_language.set_tensor("inputs_embeds", inputs_embeds); + size_t history_len = m_language.get_tensor("attention_mask").get_shape().at(1); + m_language.get_tensor("attention_mask").set_shape({1, history_len + inputs_embeds.get_shape()[1]}); + std::fill_n(m_language.get_tensor("attention_mask").data<int64_t>(), m_language.get_tensor("attention_mask").get_size(), 1); + + m_language.get_tensor("position_ids").set_shape({1, inputs_embeds.get_shape().at(1)}); + std::iota(m_language.get_tensor("position_ids").data<int64_t>(), m_language.get_tensor("position_ids").data<int64_t>() + m_language.get_tensor("position_ids").get_size(), history_len); + + m_language.get_tensor("beam_idx").set_shape({ BATCH_SIZE }); + m_language.get_tensor("beam_idx").data<int32_t>()[0] = 0; + + m_language.infer(); + + ov::Shape logits_shape = m_language.get_tensor("logits").get_shape(); + auto attention_size = m_language.get_tensor("attention_mask").get_size(); + + int64_t sequence_len = m_language.get_tensor("logits").get_shape().at(1) - 1; + size_t vocab_size = m_language.get_tensor("logits").get_shape().back(); + float* logits = m_language.get_tensor("logits").data<float>() + sequence_len * vocab_size; + int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; + + m_language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size}); + m_language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 }); + + m_embedding.get_input_tensor().set_shape({ 1, 1 }); + + int64_t eos_token_id = m_tokenizer.get_eos_token_id(); + std::shared_ptr<StreamerBase> 
streamer_ptr = std::visit(overloaded{ + [&m_tokenizer = m_tokenizer]( + const std::function<bool(std::string)>& callback + ) -> std::shared_ptr<StreamerBase> { + return std::make_shared<TextCallbackStreamer>(m_tokenizer, callback); + }, + [](const std::shared_ptr<StreamerBase>& ptr) { + return ptr; + }, + [](std::monostate) { + return std::shared_ptr<StreamerBase>{nullptr}; + }, + }, streamer); + std::vector<int64_t> generated; + while (true) { //(out_token != eos_token_id) + m_embedding.get_input_tensor().data<int64_t>()[0] = out_token; + m_embedding.infer(); + const ov::Tensor& embed_prompt_tensor = m_embedding.get_output_tensor(); + float* embed_data = embed_prompt_tensor.data<float>(); + for (auto idx = 0; idx < embed_prompt_tensor.get_size(); idx++) { + embed_data[idx] = embed_data[idx] * m_vlm_config.scale_emb; + } + + m_language.set_tensor("inputs_embeds", embed_prompt_tensor); + m_language.get_tensor("attention_mask").set_shape({ BATCH_SIZE, m_language.get_tensor("attention_mask").get_shape()[1] + 1 }); + std::fill_n(m_language.get_tensor("attention_mask").data<int64_t>(), m_language.get_tensor("attention_mask").get_size(), 1); + m_language.get_tensor("position_ids").data<int64_t>()[0] = int64_t(m_language.get_tensor("attention_mask").get_size() - 1); + + m_language.infer(); + + generated.push_back(out_token); + if (streamer_ptr && streamer_ptr->put(out_token)) { + break; + } + logits = m_language.get_tensor("logits").data<float>(); + + out_token = std::max_element(logits, logits + vocab_size) - logits; + if (out_token == eos_token_id) { + break; + } + } + + if (streamer_ptr) { + streamer_ptr->end(); + } + + std::string decoded_results = m_tokenizer.decode(generated); + if (m_is_chat_conversation) { + // Tail of chat template is missing in KV cache. + // Find the tail to concatenate it with the next input prompt. + m_templated_chat_history.append(decoded_results); + m_history.push_back({{"role", "assistant"}, {"content", decoded_results}}); + } else { + for (auto& variable : m_language.query_state()) { + variable.reset(); + } + m_language.get_tensor("attention_mask").set_shape({1, 0}); + } + std::cout << '\n'; + std::cout << eos_token_id << '\n'; + std::cout << decoded_results << '\n'; + return {{std::move(decoded_results)}}; + } + + DecodedResults generate( + const std::string& prompt, + const ov::AnyMap& config_map + ) { + auto image = config_map.find(ov::genai::image.name()); + auto images = config_map.find(ov::genai::images.name()); + OPENVINO_ASSERT( + config_map.end() == image || config_map.end() == images, + "Only one property can be set: image of images." + ); + std::vector<ov::Tensor> rgbs; + if (config_map.end() != image) { + rgbs = {image->second.as<ov::Tensor>()}; + } if (config_map.end() != images) { + rgbs = images->second.as<std::vector<ov::Tensor>>(); + } + ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map); + GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); + config.update_generation_config(config_map); + return generate( + prompt, + rgbs, + config, + utils::get_streamer_from_map(config_map) + ); + } + + void start_chat(const std::string& system_message) { + m_is_chat_conversation = true; + bool have_state = 0 != m_language.get_tensor("attention_mask").get_size(); + if (have_state) { + // Resetting state may be slow. + for (ov::VariableState& variable : m_language.query_state()) { + variable.reset(); + } + // Since if is already introduced, move all resetting here. 
+ m_language.get_tensor("attention_mask").set_shape({1, 0}); + m_history.clear(); + m_templated_chat_history.clear(); + } + if (system_message.empty()) { + return; + } + m_history = {{{"role", "system"}, {"content", system_message}}}; + constexpr bool add_generation_prompt = false; + m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + } + + void finish_chat() {m_is_chat_conversation = false;} + + void set_chat_template(const std::string& new_template) { + m_tokenizer.set_chat_template(new_template); + } + + GenerationConfig get_generation_config() const { + return m_generation_config; + } + + void set_generation_config(const GenerationConfig& new_config) { + m_generation_config = new_config; + } + + ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector<ov::Tensor>& images) { + std::string image_token = "<image>"; // TODO Consider getting from vlm_config or json + std::string formatted_prompt = "USER: " + (images.empty() ? prompt : image_token + "\n" + prompt) + " ASSISTANT:"; + ov::Tensor input_ids = m_tokenizer.encode(formatted_prompt).input_ids; + if (images.empty()) { + return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); + } else { + OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed"); + EncodedImage encoded_image = m_vision_encoder.encode(images.at(0)); + ov::Tensor image_embeds = encoded_image.resized_source; + + ov::Tensor text_embeds = process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); + + int64_t image_token_index = 32000; // TODO Consider getting from m_vlm_config.image_token_index or config.json + + return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_index); + } + } + + ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector<ov::Tensor>& images) { + std::string images_prompt; + std::vector<EncodedImage> embeds; + for (const ov::Tensor& rgb : images) { + ov::Tensor reshaped = rgb; + ov::Shape rgb_shape = rgb.get_shape(); + switch (rgb_shape.size()) { + case 3: + reshaped.set_shape({1, rgb_shape.at(0), rgb_shape.at(1), rgb_shape.at(2)}); + break; + case 4: break; + default: OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); + } + ov::Shape reshaped_shape = reshaped.get_shape(); + for (size_t batch_idx = 0; batch_idx < reshaped_shape.at(0); ++batch_idx) { + ov::Tensor single_image{ + ov::element::u8, + {1, reshaped_shape.at(1), reshaped_shape.at(2), reshaped_shape.at(3)}, + reshaped.data<uint8_t>() + batch_idx * reshaped_shape.at(1) * reshaped_shape.at(1) * reshaped_shape.at(1) + }; + EncodedImage encoded_image = m_vision_encoder.encode(single_image); + if (m_vlm_config.use_image_id) { + images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end; + ++m_image_id; + } + std::string unk64; + for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { + unk64 += m_vlm_config.unk; + } + images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; + if (encoded_image.slices) { + ov::Shape slices_shape = encoded_image.slices.get_shape(); + for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { + for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { + images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; + } + images_prompt += '\n'; + } + } + if ('\n' != *(images_prompt.end() - 1)) { + // Image wasn't sliced, add \n to the end of image anyway. + // Strangely, \n isn't placed between </image><slice>. 
+ images_prompt += '\n'; + } + embeds.push_back(std::move(encoded_image)); + } + } + images_prompt += prompt; + ov::Tensor encoded_input; + if (m_is_chat_conversation) { + // KV cache in model already contains prompts and answers from previous iterations. + // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns + // token_ids = {<bos token>, ...<valuable tokens>}. So if tokenizer applies only to the new prompt, + // <bos token> will be inserted on every iteration. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt + // and takes only the difference between them. + // The chat history cannot be saved as already encoded tokens because generate call doesn't return <eos> token, but + // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. + m_history.push_back({{"role", "user"}, {"content", images_prompt}}); + constexpr bool add_generation_prompt = true; + std::string new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids; + if (0 == m_language.get_tensor("attention_mask").get_shape().at(1)) { + encoded_input = new_chat_tokens; + } else { + TokenizedInputs prev_chat_tokens = m_tokenizer.encode( + m_templated_chat_history + ); + encoded_input = utils::subtract_chat_tokenized_inputs( + {new_chat_tokens}, prev_chat_tokens + ).input_ids; + } + m_templated_chat_history = std::move(new_templated_chat_history); + } else { + encoded_input = m_tokenizer.encode(images_prompt).input_ids; + } + m_embedding.set_input_tensor(encoded_input); + m_embedding.infer(); + ov::Tensor inputs_embeds = m_embedding.get_output_tensor(); + OPENVINO_ASSERT( + m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), + "Unexpected embedding size" + ); + ov::Tensor special_tokens = m_tokenizer.encode( + m_vlm_config.im_start + + m_vlm_config.im_end + + m_vlm_config.slice_start + + m_vlm_config.slice_end + ).input_ids; + OPENVINO_ASSERT( + 4 == special_tokens.get_shape().at(1), + "Every special token must be represented with a single int." 
+ ); + int64_t im_start_id = special_tokens.data<int64_t>()[0]; + int64_t im_end_id = special_tokens.data<int64_t>()[1]; + int64_t slice_start_id = special_tokens.data<int64_t>()[2]; + int64_t slice_end_id = special_tokens.data<int64_t>()[3]; + int64_t im_start_pos = 0, slice_start_pos = 0; + int64_t* begin = encoded_input.data<int64_t>(); + int64_t* ids = begin; + size_t encoded_input_size = encoded_input.get_size(); + int64_t* end = ids + encoded_input_size; + float* inputs_embeds_data = inputs_embeds.data<float>(); + for (const EncodedImage& encoded_image : embeds) { + const ov::Tensor& resampled_source = resample(*this, encoded_image.resized_source, {encoded_image.resized_source_size}); + float* emb = resampled_source.data<float>(); + ids = std::find(ids, end, im_start_id); + OPENVINO_ASSERT(end != ids); + ++ids; + std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + ids += m_vlm_config.query_num; + if (encoded_image.slices) { + size_t token_idx = 0; + const ov::Shape& slices_shape = encoded_image.slices.get_shape(); + for (size_t i = 0; i < slices_shape.at(0); ++i) { + for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { + size_t d2 = slices_shape.at(2); + size_t d3 = slices_shape.at(3); + ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data<float>() + (i * slices_shape.at(1) + ja) * d2 * d3}; + const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {encoded_image.slices_size}); + ids = std::find(ids, end, slice_start_id); + OPENVINO_ASSERT(end != ids); + ++ids; + std::copy_n(vision_embed_tensor_i_j.data<float>(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + ids += m_vlm_config.query_num; + } + } + } + } + + return inputs_embeds; + } + + ov::Tensor resample(VLMPipeline::VLMPipelineImpl& pipe, const ov::Tensor& encoded_image, const std::vector<ImageSize>& target_sizes) { + size_t bs = encoded_image.get_shape().at(0); + std::vector<size_t> patch_len{target_sizes.size()}; + std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const ImageSize& height_width) { + return height_width.height * height_width.width; + }); + adjust_pos_cache( + target_sizes, + pipe.m_vlm_config.hidden_size, + pipe.m_pos_embed_cache + ); + size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end()); + ov::Tensor key_padding_mask(ov::element::f32, {bs, max_patch_len}); + float* mask_data = key_padding_mask.data<float>(); + size_t embed_len = pipe.m_pos_embed_cache.get_shape().at(2); + ov::Tensor pos_embed(ov::element::f32, {max_patch_len, bs, embed_len}); // BLD => L * B * D + float* pos_embed_data = pos_embed.data<float>(); + float* cache_data = pipe.m_pos_embed_cache.data<float>(); + size_t _d0 = pipe.m_pos_embed_cache.get_shape().at(0); + size_t _d1 = pipe.m_pos_embed_cache.get_shape().at(1); + for (size_t i = 0; i < bs; ++i) { + size_t target_h = target_sizes.at(i).height; + size_t target_w = target_sizes.at(i).width; + for (size_t h_idx = 0; h_idx < target_h; ++h_idx) { + for (size_t w_idx = 0; w_idx < target_w; ++w_idx) { + std::copy_n( + cache_data + (h_idx * _d1 + w_idx) * embed_len, + embed_len, + pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * embed_len + ); + } + } + for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) { + std::fill_n(pos_embed_data + flat * bs * embed_len + i * embed_len, embed_len, 0.0f); + } + std::fill_n(mask_data + i * 
max_patch_len, patch_len[i], 0.0f); + std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], 1.0f); + } + pipe.m_resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size] + pipe.m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] + pipe.m_resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W] + pipe.m_resampler.infer(); + return pipe.m_resampler.get_output_tensor(); // [N, query_num, new_hidden_size] + } +}; + +VLMPipeline::VLMPipeline( + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config +) : m_pimpl{std::make_unique<VLMPipelineImpl>(model_dir, device, device_config)} {} + +ov::genai::VLMPipeline::~VLMPipeline() = default; + +DecodedResults VLMPipeline::generate( + const std::string& prompt, + const std::vector<ov::Tensor>& rgbs, + const GenerationConfig& generation_config, + const StreamerVariant& streamer +) { + return m_pimpl->generate(prompt, rgbs, generation_config, streamer); +} + +DecodedResults VLMPipeline::generate( + const std::string& prompt, + const ov::AnyMap& config_map +) { + return m_pimpl->generate(prompt, config_map); +} + +void VLMPipeline::start_chat(const std::string& system_message) { + m_pimpl->start_chat(system_message); +} + +void VLMPipeline::finish_chat() { + m_pimpl->finish_chat(); +} + +void VLMPipeline::set_chat_template(const std::string& new_template) { + m_pimpl->set_chat_template(new_template); +} + +GenerationConfig VLMPipeline::get_generation_config() const { + return m_pimpl->get_generation_config(); +} + +void VLMPipeline::set_generation_config(const GenerationConfig& new_config) { + m_pimpl->set_generation_config(new_config); +} diff --git a/src/cpp/src/visual_language/processor_config.cpp b/src/cpp/src/visual_language/processor_config.cpp new file mode 100644 index 0000000000..22d068feaf --- /dev/null +++ b/src/cpp/src/visual_language/processor_config.cpp @@ -0,0 +1,38 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "processor_config.hpp" +#include "utils.hpp" +#include <fstream> + +ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_path) { + std::ifstream stream(json_path); + OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config"); + nlohmann::json parsed = nlohmann::json::parse(stream); + using ov::genai::utils::read_json_param; + read_json_param(parsed, "patch_size", patch_size); // For llava - stored in config.json vision_config + read_json_param(parsed, "scale_resolution", scale_resolution); + read_json_param(parsed, "max_slice_nums", max_slice_nums); + if (parsed.contains("norm_mean")) { + norm_mean = parsed.at("norm_mean").get<std::array<float, 3>>(); + } + if (parsed.contains("norm_std")) { + norm_std = parsed.at("norm_std").get<std::array<float, 3>>(); + } + + // Setting llava config params + if (parsed.contains("image_mean")) { + image_mean = parsed.at("image_mean").get<std::array<float, 3>>(); + } + if (parsed.contains("image_std")) { + image_std = parsed.at("image_std").get<std::array<float, 3>>(); + } + + if (parsed.contains("crop_size")) { + crop_size_height = parsed.at("crop_size").at("height"); + crop_size_width = parsed.at("crop_size").at("width"); + } + if (parsed.contains("size")) { + size_shortest_edge = parsed.at("size").at("shortest_edge"); + } +} diff --git a/src/cpp/src/visual_language/processor_config.hpp 
b/src/cpp/src/visual_language/processor_config.hpp new file mode 100644 index 0000000000..f4fc5d33ec --- /dev/null +++ b/src/cpp/src/visual_language/processor_config.hpp @@ -0,0 +1,55 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/visibility.hpp" +#include <openvino/runtime/properties.hpp> +#include <array> +#include <filesystem> + +namespace ov::genai { +/// @brief A Configuration class passed to VisionEncoder and used to +/// change VisionEncoder's behavior. Corresponds to +/// preprocessor_config.json. +class OPENVINO_GENAI_EXPORTS ProcessorConfig { +public: + size_t image_size = 980; + /// @brief Dimensions of the smaller, non-overlapping patches that the + /// input image is divided into before being fed into the + /// transformer model. Used to divide image height and width. + size_t patch_size = 14; + /// @brief A recommended size to resize an input image. + /// llava calls it crop_size[height, width]. + size_t scale_resolution = 448; + /// @brief Maximum allowed number of intput image slices. + /// 0 disables slicing. + /// llava has image_grid_pinpoints instead. + size_t max_slice_nums = 0; + /// @brief RGB values to be subtracted from image pixel values. + /// Applied before norm_std. + /// llava calls it image_mean. + std::array<float, 3> norm_mean{0.0f, 0.0f, 0.0f}; + /// @brief RGB values to divide image pixel values. + /// Applied after norm_mean. + /// llava calls it image_std. + std::array<float, 3> norm_std{1.0f, 1.0f, 1.0f}; + + // llava specific config params + std::array<float, 3> image_mean{0.0f, 0.0f, 0.0f}; + std::array<float, 3> image_std{1.0f, 1.0f, 1.0f}; + size_t crop_size_height = 336; + size_t crop_size_width = 336; + size_t size_shortest_edge = 336; + + /// @brief Default constructor + ProcessorConfig() = default; + /// @brief Construct ProcessorConfig from values in json_path. + /// Keys in the file must match the ProcessorConfig's members. + /// @param json_path A path to a file to extract the values from. + explicit ProcessorConfig(const std::filesystem::path& json_path); + /// @brief Default copy constructor. + /// @param A config to copy from. 
+ ProcessorConfig(const ProcessorConfig&) = default; +}; +} // namespace ov::genai diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp new file mode 100644 index 0000000000..d7308e6534 --- /dev/null +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -0,0 +1,478 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "vision_encoder.hpp" +#include "visual_language/clip.hpp" +#include "utils.hpp" + +using namespace ov::genai; + +namespace { +int ensure_divide(int length, int patch_size) { + return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size); +} + +std::pair<int, int> find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale=false) { + int width = original_size.first; + int height = original_size.second; + if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { + float r = static_cast<float>(width) / height; + height = static_cast<int>(scale_resolution / std::sqrt(r)); + width = static_cast<int>(height * r); + } + int best_width = ensure_divide(width, patch_size); + int best_height = ensure_divide(height, patch_size); + return std::make_pair(best_width, best_height); +} + +std::pair<int, int> get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale) { + int width, height; + std::tie(width, height) = original_size; + int grid_x, grid_y; + std::tie(grid_x, grid_y) = grid; + + int refine_width = ensure_divide(width, grid_x); + int refine_height = ensure_divide(height, grid_y); + + int grid_width = refine_width / grid_x; + int grid_height = refine_height / grid_y; + + auto best_grid_size = find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); + int best_grid_width, best_grid_height; + std::tie(best_grid_width, best_grid_height) = best_grid_size; + + std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); + return refine_size; +} + +std::vector<std::vector<clip_image_u8>> slice_image(const clip_image_u8& img, const int max_slice_nums, const int scale_resolution, const int patch_size, const bool never_split) { + const std::pair<int, int> original_size{img.nx, img.ny}; + const int original_width = img.nx; + const int original_height = img.ny; + const float log_ratio = log(1.0f * original_width / original_height); + const float ratio = 1.0f * original_width * original_height / (scale_resolution * scale_resolution); + const int multiple = std::min(int(ceil(ratio)), max_slice_nums); + + std::vector<std::vector<clip_image_u8>> images; + images.push_back(std::vector<clip_image_u8>{}); + + if (multiple <= 1) { + auto best_size = find_best_resize(original_size, scale_resolution, patch_size, true); + images.back().push_back(clip_image_u8{}); + bicubic_resize(img, images.back().back(), best_size.first, best_size.second); + } + else if (multiple > 1) { + + std::vector<int> candidate_split_grids_nums; + for (int i : {multiple - 1, multiple, multiple + 1}) { + if (i == 1 || i > max_slice_nums) { + continue; + } + candidate_split_grids_nums.push_back(i); + } + + auto best_size = find_best_resize(original_size, scale_resolution, patch_size); + images.back().push_back(clip_image_u8{}); + bicubic_resize(img, images.back().back(), best_size.first, best_size.second); + + std::vector<std::pair<int, int>> 
candidate_grids; + + for (int split_grids_nums : candidate_split_grids_nums) { + int m = 1; + while (m <= split_grids_nums) { + if (split_grids_nums % m == 0) { + candidate_grids.emplace_back(m, split_grids_nums / m); + } + ++m; + } + } + + std::pair<int, int> best_grid{ 1, 1 }; + float min_error = std::numeric_limits<float>::infinity(); + + for (const auto& grid : candidate_grids) { + float error = std::abs(log_ratio - std::log(1.0f * grid.first / grid.second)); + if (error < min_error) { + best_grid = grid; + min_error = error; + } + } + auto refine_size = get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); + clip_image_u8 refine_image; + bicubic_resize(img, refine_image, refine_size.first, refine_size.second); + + // split_to_patches + int width = refine_image.nx; + int height = refine_image.ny; + int grid_x = int(width / best_grid.first); + int grid_y = int(height / best_grid.second); + for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1) { + images.push_back(std::vector<clip_image_u8>{}); + for (int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1) { + images.back().push_back(clip_image_u8{}); + clip_image_u8& patch = images.back().back(); + patch.nx = grid_x; + patch.ny = grid_y; + patch.buf.resize(3 * patch.nx * patch.ny); + for (int y = patches_i; y < patches_i + grid_y; ++y) { + for (int x = patches_j; x < patches_j + grid_x; ++x) { + const int i = 3 * (y * refine_image.nx + x); + const int j = 3 * ((y - patches_i) * patch.nx + (x - patches_j)); + patch.buf[j] = refine_image.buf[i]; + patch.buf[j + 1] = refine_image.buf[i + 1]; + patch.buf[j + 2] = refine_image.buf[i + 2]; + } + } + } + } + } + + return images; +} + +// Reimplemented https://pytorch.org/docs/stable/generated/torch.nn.Unfold.html#torch.nn.Unfold +// in shape [NCHW], out shape: [N, C*kernel*kernel, H*W/kernel/kernel] +ov::Tensor unfold(const ov::Tensor& images_tensor, size_t kernel) { + ov::Shape images_shape = images_tensor.get_shape(); + + OPENVINO_ASSERT(4 == images_shape.size(), "Input tensor must be 4D (NCHW)."); + + const size_t bs = images_shape.at(0); + const size_t images_c = images_shape.at(1); + const size_t images_h = images_shape.at(2); + const size_t images_w = images_shape.at(3); + + OPENVINO_ASSERT(images_h >= kernel && images_w >= kernel, "Input height and width must be greater than or equal to kernel size."); + + const size_t new_c = images_c * kernel * kernel; + const size_t output_h = (images_h - kernel) / kernel + 1; + const size_t output_w = (images_w - kernel) / kernel + 1; + const size_t kernels_per_plane = output_h * output_w; + + ov::Tensor unfolded_tensor(ov::element::f32, {bs, new_c, kernels_per_plane}); + const float* images = images_tensor.data<float>(); + float* unfolded = unfolded_tensor.data<float>(); + for (size_t batch_idx = 0; batch_idx < bs; ++batch_idx) { + for (size_t c_idx = 0; c_idx < images_c; ++c_idx) { + for (size_t h_out = 0; h_out < output_h; ++h_out) { + for (size_t w_out = 0; w_out < output_w; ++w_out) { + size_t h_idx = h_out * kernel; // Calculate input height index + size_t w_idx = w_out * kernel; // Calculate input width index + + for (size_t kh = 0; kh < kernel; ++kh) { + for (size_t kw = 0; kw < kernel; ++kw) { + size_t input_idx = (batch_idx * images_c * images_h * images_w) + + (c_idx * images_h * images_w) + + ((h_idx + kh) * images_w) + + (w_idx + kw); + + size_t unfolded_c_idx = (c_idx * kernel * kernel) + (kh * kernel) + kw; + 
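                            // This reproduces torch.nn.Unfold's channel ordering: each output
                            // "channel" packs one (c, kh, kw) combination, so with C = 3 and,
                            // for example, kernel = 14 (the vision patch size) there are
                            // 3 * 14 * 14 = 588 output channels, and the value copied below lands at
                            // [batch, c * kernel * kernel + kh * kernel + kw, h_out * output_w + w_out].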
size_t unfolded_idx = (batch_idx * new_c * kernels_per_plane) + + unfolded_c_idx * kernels_per_plane + + (h_out * output_w + w_out); + + unfolded[unfolded_idx] = images[input_idx]; + } + } + } + } + } + } + return unfolded_tensor; +} + +ov::Tensor preprocess_for_encoder(const ov::Tensor& images, size_t kernel) { + ov::Shape images_shape = images.get_shape(); + OPENVINO_ASSERT(4 == images_shape.size()); + ov::Tensor unfolded_tensor = unfold(images, kernel); + const ov::Shape& unfolded_shape = unfolded_tensor.get_shape(); // [N, C*kernel*kernel, H*W/kernel/kernel] + const size_t bs = unfolded_shape[0]; + const size_t d1 = unfolded_shape[1]; + const size_t d2 = unfolded_shape[2]; + const size_t channels = 3; + const size_t new_len = d2 * kernel; + + ov::Tensor permuted_tensor{ov::element::f32, {bs, channels, kernel, new_len}}; + const float* unfolded = unfolded_tensor.data<float>(); + float* permuted = permuted_tensor.data<float>(); + for (size_t b_idx = 0; b_idx < bs; ++b_idx) { + for (size_t c_idx = 0; c_idx < channels; ++c_idx) { + for (size_t k1_idx = 0; k1_idx < kernel; ++k1_idx) { + for (size_t d2_idx = 0; d2_idx < d2; ++d2_idx) { + for (size_t k2_idx = 0; k2_idx < kernel; ++k2_idx) { + size_t unfolded_idx = b_idx * d1 * d2 + + (c_idx * kernel * kernel + k1_idx * kernel + k2_idx) * d2 + + d2_idx; + size_t permuted_idx = b_idx * channels * kernel * new_len + + c_idx * kernel * new_len + + k1_idx * new_len + + d2_idx * kernel + k2_idx; + permuted[permuted_idx] = unfolded[unfolded_idx]; + } + } + } + } + } + return permuted_tensor; +} + +// torch.bucketize(fractional_coords, boundaries, right=True) +std::vector<int64_t> bucket_size_right(const std::vector<float>& fractional_coords, const std::vector<float>& boundaries) { + std::vector<int64_t> bucket_coords(fractional_coords.size()); + std::transform(fractional_coords.begin(), fractional_coords.end(), bucket_coords.begin(), [&boundaries](float fractional_coord) { + return std::distance(boundaries.begin(), std::upper_bound(boundaries.begin(), boundaries.end(), fractional_coord)); + }); + return bucket_coords; +} + +ov::Tensor prepare_vis_position_ids( + const ov::Tensor& pixel_values, + const ov::Tensor& patch_attention_mask, + const std::vector<ImageSize> tgt_sizes, + size_t patch_size, + size_t num_patches_per_side +) { + size_t batch_size = pixel_values.get_shape().at(0); + size_t max_im_h = pixel_values.get_shape().at(2), max_im_w = pixel_values.get_shape().at(3); + size_t max_nb_patches_h = max_im_h / patch_size, max_nb_patches_w = max_im_w / patch_size; + std::vector<float> boundaries(1.0f * num_patches_per_side - 1); + std::generate(boundaries.begin(), boundaries.end(), [num_patches_per_side, val = 0.0f]() mutable { + val += 1.0f / num_patches_per_side; + return val; + }); + size_t position_ids_batch_elem = max_nb_patches_h * max_nb_patches_w; + ov::Tensor position_ids{ov::element::i64, {batch_size, position_ids_batch_elem}}; + // throw std::runtime_error(""); + int64_t* res_data = position_ids.data<int64_t>(); + std::fill_n(res_data, position_ids.get_size(), 0); + + for (size_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) { + size_t nb_patches_h = tgt_sizes.at(batch_idx).height; + size_t nb_patches_w = tgt_sizes.at(batch_idx).width; + + std::vector<float> fractional_coords_h(nb_patches_h); + std::generate(fractional_coords_h.begin(), fractional_coords_h.end(), [nb_patches_h, val = -1.0f / nb_patches_h]() mutable { + val += 1.0f / nb_patches_h; + return val; + }); + std::vector<float> fractional_coords_w(nb_patches_w); + 
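        // fractional_coords_{h,w} hold the normalized start offset of every patch row/column
        // (0, 1/n, 2/n, ...). bucket_size_right() then assigns each offset to one of
        // num_patches_per_side bins (torch.bucketize with right=True), and the position id of a
        // patch becomes bucket_h * num_patches_per_side + bucket_w.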
std::generate(fractional_coords_w.begin(), fractional_coords_w.end(), [nb_patches_w, val = -1.0f / nb_patches_w]() mutable { + val += 1.0f / nb_patches_w; + return val; + }); + + std::vector<int64_t> bucket_coords_h = bucket_size_right(fractional_coords_h, boundaries); + std::vector<int64_t> bucket_coords_w = bucket_size_right(fractional_coords_w, boundaries); + + std::vector<int64_t> pos_ids(bucket_coords_h.size() * bucket_coords_w.size()); + for (size_t col = 0; col < bucket_coords_h.size(); ++col) { + for (size_t row = 0; row < bucket_coords_w.size(); ++row) {; + pos_ids.at(col * bucket_coords_w.size() + row) = bucket_coords_h.at(col) * num_patches_per_side + bucket_coords_w.at(row); + } + } + std::copy(pos_ids.begin(), pos_ids.end(), res_data + batch_idx * position_ids_batch_elem); + } + return position_ids; +} + +EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const ov::Tensor& img, ov::InferRequest& encoder, int max_slice_nums, int scale_resolution, size_t patch_size, bool never_split) { + clip_image_u8 source{ + int(img.get_shape().at(3)), + int(img.get_shape().at(2)), + {img.data<uint8_t>(), img.data<uint8_t>() + img.get_size()} + }; + std::vector<std::vector<clip_image_u8>> imgs = ::slice_image(source, max_slice_nums, scale_resolution, patch_size, never_split); + std::vector<std::vector<ov::Tensor>> results; + std::vector<std::vector<ImageSize>> sizes; + + // std::vector<clip_image_f32*> img_res_v; // format N x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 + std::vector<std::vector<clip_image_f32>> preprocessed{imgs.size()}; + std::transform(imgs.begin(), imgs.end(), preprocessed.begin(), [&ctx_clip](const std::vector<clip_image_u8>& row) { + std::vector<clip_image_f32> processed_row{row.size()}; + std::transform(row.begin(), row.end(), processed_row.begin(), [&ctx_clip](const clip_image_u8& raw) { + return clip_image_preprocess(ctx_clip, raw); + }); + return processed_row; + }); + + const clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0); + ImageSize resized_source_size{resized_preprocessed.ny / patch_size, resized_preprocessed.nx / patch_size}; + ov::Tensor input_tensor{ov::element::f32, {1, 3, size_t(resized_preprocessed.ny), size_t(resized_preprocessed.nx)}, (void*)(resized_preprocessed.buf.data())}; + ov::Tensor pixel_values = preprocess_for_encoder(input_tensor, patch_size); + encoder.set_tensor("pixel_values", pixel_values); + ov::Tensor patch_attention_mask{ov::element::f32, {pixel_values.get_shape().at(0), 1, resized_source_size.height * resized_source_size.width}}; + std::fill_n(patch_attention_mask.data<float>(), patch_attention_mask.get_size(), 1.0f); + encoder.set_tensor("patch_attention_mask", patch_attention_mask); + ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {resized_source_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size); + encoder.set_tensor("position_ids", position_ids); + encoder.infer(); + const ov::Tensor& output_tensor = encoder.get_output_tensor(); + ov::Tensor resized_source{ov::element::f32, output_tensor.get_shape()}; + output_tensor.copy_to(resized_source); + + if (1 == preprocessed.size()) { + return {std::move(resized_source), resized_source_size}; + } + + ImageSize raw_size{ + size_t(preprocessed.at(1).at(0).ny), + size_t(preprocessed.at(1).at(0).nx) + }; + ImageSize slices_size{ + raw_size.height / patch_size, + raw_size.width / patch_size + }; + size_t n_patches = 
slices_size.height * slices_size.width, + old_hidden_size = resized_source.get_shape().at(2); + ov::Tensor encoded_slices{ov::element::f32, {preprocessed.size() - 1, preprocessed.at(1).size(), n_patches, old_hidden_size}}; + for (size_t row = 1; row < preprocessed.size(); ++row) { + for (size_t col = 0; col < preprocessed.at(row).size(); ++col) { + clip_image_f32& elem = preprocessed.at(row).at(col); + ov::Tensor pixel_values = preprocess_for_encoder( + {ov::element::f32, {1, 3, size_t(elem.ny), size_t(elem.nx)}, elem.buf.data()}, + patch_size + ); + encoder.set_tensor("pixel_values", pixel_values); + ov::Tensor patch_attention_mask{ov::element::f32, {1, 1, slices_size.height * slices_size.width}}; + std::fill_n(patch_attention_mask.data<float>(), patch_attention_mask.get_size(), 1.0f); + encoder.set_tensor("patch_attention_mask", patch_attention_mask); + ov::Tensor position_ids = prepare_vis_position_ids(pixel_values, patch_attention_mask, {slices_size}, ctx_clip.patch_size, ctx_clip.image_size / ctx_clip.patch_size); + encoder.set_tensor("position_ids", position_ids); + const ov::Tensor& old = encoder.get_output_tensor(); + encoder.set_output_tensor({ov::element::f32, {1, n_patches, old_hidden_size}, encoded_slices.data<float>() + ((row - 1) * preprocessed.at(row).size() + col) * n_patches * old_hidden_size}); + encoder.infer(); + encoder.set_output_tensor(old); + } + } + return {resized_source, resized_source_size, encoded_slices, slices_size}; +} + +ProcessorConfig from_any_map( + const ov::AnyMap& config_map, + const ProcessorConfig& initial +) { + auto iter = config_map.find("processor_config"); + ProcessorConfig extracted_config = config_map.end() != iter ? + iter->second.as<ProcessorConfig>() : initial; + using utils::read_anymap_param; + read_anymap_param(config_map, "patch_size", extracted_config.patch_size); + read_anymap_param(config_map, "scale_resolution", extracted_config.scale_resolution); + read_anymap_param(config_map, "max_slice_nums", extracted_config.max_slice_nums); + read_anymap_param(config_map, "norm_mean", extracted_config.norm_mean); + read_anymap_param(config_map, "norm_std", extracted_config.norm_std); + return extracted_config; +} + + +ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig& config) { + bool do_resize = true; + bool do_center_crop = true; + + // ov::Tensor to clip_image_u8 + clip_image_u8 input_image{ + int(image.get_shape().at(3)), + int(image.get_shape().at(2)), + {image.data<uint8_t>(), image.data<uint8_t>() + image.get_size()} + }; + + // Resize + clip_image_u8 resized_image; + if (do_resize) { + int target_size = config.size_shortest_edge; + float scale = static_cast<float>(target_size) / std::min(input_image.nx, input_image.ny); + int new_width = static_cast<int>(input_image.nx * scale); + int new_height = static_cast<int>(input_image.ny * scale); + bicubic_resize(input_image, resized_image, new_width, new_height); + } else { + resized_image = input_image; + } + + // Center crop + clip_image_u8 cropped_image; + if (do_center_crop) { + int crop_height = config.crop_size_height; + int crop_width = config.crop_size_width; + int start_x = (resized_image.nx - crop_width) / 2; + int start_y = (resized_image.ny - crop_height) / 2; + + cropped_image.nx = crop_width; + cropped_image.ny = crop_height; + cropped_image.buf.resize(3 * crop_width * crop_height); + + for (int y = 0; y < crop_height; ++y) { + for (int x = 0; x < crop_width; ++x) { + for (int c = 0; c < 3; ++c) { + cropped_image.buf[(y * crop_width + x) * 3 + c] 
= + resized_image.buf[((start_y + y) * resized_image.nx + (start_x + x)) * 3 + c]; + } + } + } + } else { + cropped_image = resized_image; + } + + // Normalize + clip_ctx ctx; + std::copy(config.image_mean.begin(), config.image_mean.end(), ctx.image_mean); + std::copy(config.image_std.begin(), config.image_std.end(), ctx.image_std); + + clip_image_f32 normalized_image = clip_image_preprocess(ctx, cropped_image); + + // Convert clip_image_f32 to ov::Tensor + ov::Tensor result( + ov::element::f32, + {1, 3, size_t(normalized_image.ny), size_t(normalized_image.nx)}, + (void*)(normalized_image.buf.data()) + ); + + return result; +} +} + +VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) : + model_type(model_type) { + if (model_type == VLMModelType::MINICPM) { + m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); + } else if (model_type == VLMModelType::LLAVA) { + // Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel + m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); + } + m_processor_config = ov::genai::utils::from_config_json_if_exists<ov::genai::ProcessorConfig>( + model_dir, "preprocessor_config.json" + ); +} + +EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfig& config) { + if (model_type == VLMModelType::MINICPM) { + return encode_minicpm(image, config); + } else if (model_type == VLMModelType::LLAVA) { + return encode_llava(image, config); + } +} + +EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) { + return encode(image, from_any_map( + config_map, m_processor_config + )); +} + +EncodedImage VisionEncoder::encode_minicpm(const ov::Tensor& image, const ProcessorConfig& config) { + clip_ctx ctx_clip; + ctx_clip.patch_size = m_processor_config.patch_size; + ctx_clip.image_size = m_processor_config.image_size; + std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean); + std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std); + return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_vision_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums); +} + +EncodedImage VisionEncoder::encode_llava(const ov::Tensor& image, const ProcessorConfig& config) { + ov::Tensor preprocessed_image = preprocess_image_llava(image, config); + + m_vision_encoder.set_tensor("pixel_values", preprocessed_image); + m_vision_encoder.infer(); + + ov::Tensor image_features = m_vision_encoder.get_output_tensor(); + ImageSize resized_source_size{config.crop_size_height / config.patch_size, config.crop_size_width / config.patch_size}; + + return {image_features, resized_source_size}; +} diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp new file mode 100644 index 0000000000..446c093093 --- /dev/null +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -0,0 +1,134 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <openvino/openvino.hpp> +#include "visual_language/processor_config.hpp" +#include "visual_language/vlm_model_type.hpp" + +namespace ov::genai { +/// @brief A pair describing 
image size. +struct ImageSize { + /// @brief Height of a corresponding image. + size_t height; + /// @brief Width of a corresponding image. + size_t width; +}; + +/// @brief Embeddings of a given image. The number of slices is no +/// greater than ProcessorConfig's max_slice_nums. +struct EncodedImage { + /// @brief Embeddings of a resized image based on ProcessorConfig's + /// scale_resolution. The tensor's shape is + /// [N, H*W, hidden_size]. [N, 1014, 1152] is a possible example for + /// openbmb/MiniCPM-V-2. Only batch 1 is supported. + ov::Tensor resized_source; + /// @brief A size of an image used to compute embeddings for + /// divided by ProcessorConfig's patch_size. + ImageSize resized_source_size; + /// @brief Embeddings of images obtained from a source image by + /// slicing at no more than max_slice_nums pieces and resizing. + /// The tensor's shape is + /// [slice_y, slice_x, number_of_embeddings, embedding_size]. + /// slices_sizes.size() == slice_y * slice_x. + ov::Tensor slices; + /// @brief A size of images used to compute embeddings + /// stored in slices member divided by ProcessorConfig's patch_size. + ImageSize slices_size; +}; + +/// @brief A class used to infer embeddings of an image using +/// ov::InferRequest and configured by ProcessorConfig. +class OPENVINO_GENAI_EXPORTS VisionEncoder { +public: + /// @brief A enum denoting model type. + VLMModelType model_type; + /// @brief A model for image encoding. + ov::InferRequest m_vision_encoder; + /// @brief A config to follow. + ProcessorConfig m_processor_config; + + /// @brief Construct from an already compiled model and a config. + /// @param encoder Compiled model. + /// @param processor_config Initial config. + explicit VisionEncoder( + const ov::InferRequest& encoder, + const ProcessorConfig& processor_config=ProcessorConfig{} + ) : m_vision_encoder{encoder}, m_processor_config{processor_config} {} + + /// @brief Construct the encoder from model_dir. + /// @param model_dir A folder containing openvino_embedding.xml and + /// preprocessor_config.json. + /// @param device A device to compile the encoder for. + /// @param device_config A config to be passed to + /// ov::Core::compile_model(). + /// @param core ov::Core to be used to compile the model. + explicit VisionEncoder( + const std::filesystem::path& model_dir, + const VLMModelType model_type, + const std::string& device="CPU", + const ov::AnyMap device_config={}, + ov::Core core=ov::Core{} + ); + + /// @brief Compute embeddings of an image. + /// @param image An image to infer embeddings for. Image shape must be + /// [1CHW]. Only batch 1 is supported. + /// @return Resulting embeddings for the resized source image and + /// its slices. + EncodedImage encode(const ov::Tensor& image) { + return encode(image, m_processor_config); + } + + /// @brief Compute embeddings of an image given ProcessorConfig. + /// @param image An image to infer embeddings for. Image shape must be + /// [1CHW]. Only batch 1 is supported. + /// @param config A config to follow instead of the config obtained + /// in constructors. + /// @return Resulting embeddings for the resized source image and + /// its slices. + EncodedImage encode( + const ov::Tensor& image, const ProcessorConfig& config + ); + + /// @brief Compute embeddings of an image given + /// ProcessorConfig members. + /// @param image An image to infer embeddings for. Image shape must be + /// [1CHW]. Only batch 1 is supported. 
+ /// @param config_map A config or its members values to follow + /// instead of the config obtained in constructors. + /// @return Resulting embeddings for the resized source image and + /// its slices. + EncodedImage encode( + const ov::Tensor& image, const ov::AnyMap& config_map + ); + + /// @brief Compute embeddings of an image given + /// ProcessorConfig members. + /// @param image An image to infer embeddings for. Image shape must be + /// [1CHW]. Only batch 1 is supported. + /// @param ...properties A config or its members values to follow + /// instead of the config obtained in constructors. + /// @return Resulting embeddings for the resized source image and + /// its slices. + template <typename... Properties> + util::EnableIfAllStringAny<EncodedImage, Properties...> encode( + const ov::Tensor& image, + Properties&&... properties + ) { + return encode( + image, AnyMap{std::forward<Properties>(properties)...} + ); + } + +private: + EncodedImage encode_minicpm( + const ov::Tensor& image, const ProcessorConfig& config + ); + + EncodedImage encode_llava( + const ov::Tensor& image, const ProcessorConfig& config + ); +}; +} diff --git a/src/cpp/src/visual_language/vlm_config.cpp b/src/cpp/src/visual_language/vlm_config.cpp new file mode 100644 index 0000000000..f3a54c5ec7 --- /dev/null +++ b/src/cpp/src/visual_language/vlm_config.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "vlm_config.hpp" +#include "utils.hpp" +#include <fstream> + +ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) { + std::ifstream stream(json_path); + OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config"); + nlohmann::json parsed = nlohmann::json::parse(stream); + using ov::genai::utils::read_json_param; + model_type = to_vlm_model_type(parsed.at("model_type")); + read_json_param(parsed, "hidden_size", hidden_size); + read_json_param(parsed, "scale_emb", scale_emb); + read_json_param(parsed, "query_num", query_num); + read_json_param(parsed, "use_image_id", use_image_id); +} diff --git a/src/cpp/src/visual_language/vlm_config.hpp b/src/cpp/src/visual_language/vlm_config.hpp new file mode 100644 index 0000000000..5a954c07ee --- /dev/null +++ b/src/cpp/src/visual_language/vlm_config.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "openvino/genai/visibility.hpp" +#include "visual_language/vlm_model_type.hpp" +#include <openvino/runtime/properties.hpp> +#include <filesystem> + +namespace ov::genai { +/// @brief A Configuration class passed to VLMPipeline and used to +/// change VLMPipeline's behavior. Corresponds to config.json. +class OPENVINO_GENAI_EXPORTS VLMConfig { +public: + /// @brief A enum denoting model type. + VLMModelType model_type; + /// @brief A size of a single embedding returned by a resampler. + /// Used to initialize positional embeddings for resampler input. + size_t hidden_size = 3584; + /// @brief Multiply embeddings by this value. + float scale_emb = 1.0f; + /// @brief A number of embedding vectors representing an image + /// slice. + size_t query_num = 64; + /// @brief A string token denoting start of image embeddings for an + /// LLM. + std::string im_start = "<image>"; + /// @brief A string token denoting end of image embeddings for an + /// LLM. 
+ std::string im_end = "</image>"; + /// @brief A string token denoting start of image slices row + /// embeddings for an LLM. + std::string slice_start = "<slice>"; + /// @brief A string token denoting end of image slices row + /// embeddings for LLM. + std::string slice_end = "</slice>"; + /// @brief Start each image (not a slice) with + /// <image_id>i</image_id>. i is a number. + bool use_image_id = true; + /// @brief A string token denoting start of image number region. + std::string im_id_start = "<image_id>"; + /// @brief A string token denoting end of image number region. + std::string im_id_end = "</image_id>"; + /// @brief A placeholder for image embeddings in text. + std::string unk = "<unk>"; + /// @brief Default constructor. + VLMConfig() = default; + /// @brief Construct VLMConfig from values in json_path. + /// Keys in the file must match the VLMConfig's members. + /// @param json_path A path to a file to extract the values from. + explicit VLMConfig(const std::filesystem::path& config_path); + /// @brief Default copy constructor. + /// @param A config to copy from. + VLMConfig(const VLMConfig&) = default; +}; +} // namespace ov::genai diff --git a/src/cpp/src/visual_language/vlm_model_type.hpp b/src/cpp/src/visual_language/vlm_model_type.hpp new file mode 100644 index 0000000000..0f811a116a --- /dev/null +++ b/src/cpp/src/visual_language/vlm_model_type.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <string> +#include <unordered_map> + +#include "openvino/genai/visibility.hpp" +#include <openvino/core/except.hpp> + +namespace ov::genai { + +enum class OPENVINO_GENAI_EXPORTS VLMModelType { + MINICPM, + LLAVA, +}; + +inline VLMModelType to_vlm_model_type(const std::string& value) { + static const std::unordered_map<std::string, VLMModelType> model_types_map = { + {"minicpmv", VLMModelType::MINICPM}, + {"llava", VLMModelType::LLAVA} + }; + + auto it = model_types_map.find(value); + if (it != model_types_map.end()) { + return it->second; + } + OPENVINO_THROW("Unsupported '", value, "' VLM model type"); +} +} \ No newline at end of file diff --git a/src/cpp/src/vlm_sampling.hpp b/src/cpp/src/vlm_sampling.hpp new file mode 100644 index 0000000000..b0a7d2341f --- /dev/null +++ b/src/cpp/src/vlm_sampling.hpp @@ -0,0 +1,96 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <iostream> +#include <cmath> +#include <algorithm> +#include <numeric> +#include <vector> + +struct TokenIdScore { + int id; + float score; + + TokenIdScore() = default; + TokenIdScore(int id, float score) : id(id), score(score) {} + + bool operator<(const TokenIdScore& other) const { return score < other.score; } + bool operator>(const TokenIdScore& other) const { return score > other.score; } + + friend std::ostream& operator<<(std::ostream& os, const TokenIdScore& self) { + return os << "TokenIdScore(id=" << self.id << ", score=" << self.score << ")"; + } +}; + +void sampling_softmax_inplace(TokenIdScore* first, TokenIdScore* last) { + float max_score = std::max_element(first, last)->score; + float sum = 0.f; + for (TokenIdScore* p = first; p != last; p++) { + float s = std::exp(p->score - max_score); + p->score = s; + sum += s; + } + float inv_sum = 1.f / sum; + for (TokenIdScore* p = first; p != last; p++) { + p->score *= inv_sum; + } +} + +void sampling_top_k(TokenIdScore* first, TokenIdScore* kth, TokenIdScore* last) { + std::nth_element(first, kth, last, 
std::greater<TokenIdScore>()); +} + +TokenIdScore* sampling_top_p(TokenIdScore* first, TokenIdScore* last, float top_p) { + // fast top_p in expected O(n) time complexity + sampling_softmax_inplace(first, last); + + while (first + 1 < last) { + const float pivot_score = (last - 1)->score; // use mid score? + TokenIdScore* mid = + std::partition(first, last - 1, [pivot_score](const TokenIdScore& x) { return x.score > pivot_score; }); + std::swap(*mid, *(last - 1)); + + const float prefix_sum = + std::accumulate(first, mid, 0.f, [](float sum, const TokenIdScore& x) { return sum + x.score; }); + if (prefix_sum >= top_p) { + last = mid; + } + else if (prefix_sum + mid->score < top_p) { + first = mid + 1; + top_p -= prefix_sum + mid->score; + } + else { + return mid + 1; + } + } + return last; +} + +void sampling_repetition_penalty(float* first, float* last, const std::vector<int>& input_ids, + float penalty) { + if (penalty < 0) { + std::cout << "penalty must be a positive float, but got " << penalty; + return; + } + const float inv_penalty = 1.f / penalty; + const ptrdiff_t vocab_size = last - first; + std::vector<bool> occurrence(vocab_size, false); + for (const int id : input_ids) { + if (!occurrence[id]) { + first[id] *= (first[id] > 0) ? inv_penalty : penalty; + } + occurrence[id] = true; + } +} + +void sampling_temperature(float* first, float* last, float temp) { + const float inv_temp = 1.f / temp; + for (float* it = first; it != last; it++) { + *it *= inv_temp; + } +} + + + diff --git a/src/cpp/src/whisper/logit_processor.cpp b/src/cpp/src/whisper/logit_processor.cpp new file mode 100644 index 0000000000..c31bf8bb44 --- /dev/null +++ b/src/cpp/src/whisper/logit_processor.cpp @@ -0,0 +1,118 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <openvino/openvino.hpp> + +#include "openvino/genai/whisper_generation_config.hpp" +#include "sampler.hpp" + +namespace ov { +namespace genai { + +void do_suppress_tokens(ov::Tensor& logits, const size_t batch_idx, const std::vector<int64_t>& suppress_tokens) { + OPENVINO_ASSERT(logits.get_shape()[0] >= batch_idx, "logits batch size doesn't match the batch number"); + + size_t vocab_size = logits.get_shape().back(); + size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; + size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size; + float* logits_data = logits.data<float>() + batch_offset + sequence_offset; + + for (auto supress_token : suppress_tokens) { + logits_data[supress_token] = -std::numeric_limits<float>::infinity(); + } +} + +void process_whisper_timestamp_logits(ov::Tensor& logits, + const size_t batch_idx, + const ov::genai::WhisperGenerationConfig& config, + const std::vector<int64_t>& generated_tokens, + bool initial_step = false) { + const size_t batch_size = logits.get_shape().at(0); + OPENVINO_ASSERT(batch_size == 1, "Batch != 1 is not supported"); + + size_t vocab_size = logits.get_shape().back(); + size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; + size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size; + float* logits_data = logits.data<float>() + batch_offset + sequence_offset; + + // supress<|notimestamps|> + logits_data[config.no_timestamps_token_id] = -std::numeric_limits<float>::infinity(); + + size_t timestamp_begin = config.no_timestamps_token_id + 1; + + // timestamps have to appear in pairs, except directly before eos_token; mask logits accordingly + size_t generated_length = generated_tokens.size(); + bool 
last_was_timestamp = generated_length >= 1 && generated_tokens[generated_length - 1] >= timestamp_begin; + bool penultimate_was_timestamp = generated_length < 2 || generated_tokens[generated_length - 2] >= timestamp_begin; + + if (last_was_timestamp) { + if (penultimate_was_timestamp) { + // has to be non-timestamp + for (size_t i = timestamp_begin; i < vocab_size; i++) { + logits_data[i] = -std::numeric_limits<float>::infinity(); + } + } else { + // cannot be normal text token + for (size_t i = 0; i < config.eos_token_id; i++) { + logits_data[i] = -std::numeric_limits<float>::infinity(); + } + } + } + + // filter generated timestaps + std::vector<int64_t> timestamps; + for (const auto token : generated_tokens) { + if (token >= timestamp_begin) { + timestamps.push_back(token); + } + } + + if (timestamps.size() > 0) { + size_t timestamp_last; + // `timestamps` shouldn't decrease; forbid timestamp tokens smaller than the last + // The following lines of code are copied from: https://github.com/openai/whisper/pull/914/files#r1137085090 + if (last_was_timestamp && !penultimate_was_timestamp) { + timestamp_last = timestamps.back(); + } else { + // Avoid to emit <|0.00|> again + timestamp_last = timestamps.back() + 1; + } + + for (size_t i = timestamp_begin; i < timestamp_last; i++) { + logits_data[i] = -std::numeric_limits<float>::infinity(); + } + } + + // apply the `max_initial_timestamp` option + if (initial_step) { + for (size_t i = 0; i < timestamp_begin; i++) { + logits_data[i] = -std::numeric_limits<float>::infinity(); + } + + size_t last_allowed = timestamp_begin + config.max_initial_timestamp_index; + for (size_t i = last_allowed + 1; i < vocab_size; i++) { + logits_data[i] = -std::numeric_limits<float>::infinity(); + } + } + + auto tokens = ov::genai::log_softmax(logits, 0); + float timestamp_exp_prov_sum = 0; + + for (size_t i = timestamp_begin; i < vocab_size; i++) { + timestamp_exp_prov_sum += std::exp(tokens[i].m_log_prob); + } + float timestamp_logprob = std::log(timestamp_exp_prov_sum); + + auto max_logprob_token = std::max_element(tokens.begin(), tokens.end(), [](const Token& left, const Token& right) { + return left.m_log_prob < right.m_log_prob; + }); + + if (timestamp_logprob > max_logprob_token->m_log_prob) { + for (size_t i = 0; i < timestamp_begin; i++) { + logits_data[i] = -std::numeric_limits<float>::infinity(); + } + } +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/logit_processor.hpp b/src/cpp/src/whisper/logit_processor.hpp new file mode 100644 index 0000000000..ee51f905b5 --- /dev/null +++ b/src/cpp/src/whisper/logit_processor.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <openvino/openvino.hpp> + +#include "openvino/genai/whisper_generation_config.hpp" + +namespace ov { +namespace genai { + +void do_suppress_tokens(ov::Tensor& logits, const size_t batch_idx, const std::vector<int64_t>& suppress_tokens); + +void process_whisper_timestamp_logits(ov::Tensor& logits, + const size_t batch_idx, + const ov::genai::WhisperGenerationConfig& config, + const std::vector<int64_t>& generated_tokens, + bool initial_step = false); + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/timestamps.cpp b/src/cpp/src/whisper/timestamps.cpp new file mode 100644 index 0000000000..ca0723f717 --- /dev/null +++ b/src/cpp/src/whisper/timestamps.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 
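The segment extraction implemented in timestamps.cpp below follows the rule set enforced by the timestamp logit processing above: a closing timestamp ends one segment and, if followed by another timestamp, opens the next, while a trailing unclosed segment signals that speech continues into the next chunk. A minimal sketch of the expected behaviour, where the 0.02 s step, the nb_max_frames value and the non-timestamp token ids are illustrative assumptions rather than values read from a real model:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <vector>

#include "timestamps.hpp"

void extract_segments_sketch(const ov::genai::WhisperGenerationConfig& config) {
    const int64_t t_0_00 = config.begin_timestamps_token_id;  // <|0.00|>
    const int64_t t_2_00 = t_0_00 + 100;                      // <|2.00|>, assuming 0.02 s per timestamp step
    // <|0.00|> tok tok <|2.00|> <|2.00|> tok  ->  one closed segment plus an unfinished trailing one
    const std::vector<int64_t> tokens{t_0_00, 123, 456, t_2_00, t_2_00, 789};

    auto extracted = ov::genai::extract_segments(tokens, config, /*nb_max_frames=*/3000, /*time_precision=*/0.02f);

    assert(extracted.segments.size() == 1);  // only the closed [0.00 s, 2.00 s] segment is reported
    assert(std::fabs(extracted.segments.front().m_start - 0.0f) < 1e-4f);
    assert(std::fabs(extracted.segments.front().m_end - 2.0f) < 1e-4f);
    assert((extracted.non_timestamp_tokens == std::vector<int64_t>{123, 456}));
    assert(extracted.last_offset == 200);  // each 0.02 s timestamp step maps to 2 frames in last_offset
}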
+ +#include "timestamps.hpp" + +namespace ov { +namespace genai { + +ov::genai::ExtractedSegments extract_segments(const std::vector<int64_t>& tokens, + const ov::genai::WhisperGenerationConfig& config, + const size_t nb_max_frames, + const float time_precision) { + ov::genai::ExtractedSegments extracted_segments; + std::optional<int64_t> token_start = std::nullopt; + size_t idx_start = 0; + + for (size_t i = 0; i < tokens.size(); i++) { + int64_t token = tokens[i]; + + bool is_timestamp = token >= config.begin_timestamps_token_id; + + if (!is_timestamp) { + continue; + } + + if (!token_start.has_value()) { + token_start = token; + idx_start = i; + } else { + if (token_start == token) { + // from HF: + // https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/tokenization_whisper.py#L1020 + // This is a bug in timestamp token output where we're taking the duplicate token as a stop where it + // should be a start. This is an issue in the underlying model output. Let's just skip it so it becomes + // de-factor a start again. + continue; + } + + ov::genai::Segment segment; + segment.m_tokens = {tokens.begin() + idx_start + 1, tokens.begin() + i}; + segment.m_start = (*token_start - config.begin_timestamps_token_id) * time_precision; + segment.m_end = (token - config.begin_timestamps_token_id) * time_precision; + extracted_segments.segments.push_back(segment); + + // each next timestamp token represents .02 time diff + extracted_segments.last_offset = (token - config.begin_timestamps_token_id) * 2; + + extracted_segments.non_timestamp_tokens.insert(extracted_segments.non_timestamp_tokens.end(), + tokens.begin() + idx_start + 1, + tokens.begin() + i); + + token_start = std::nullopt; + } + } + + // segment started but has no closing timestamp + // add new segment only if it has non timestamps tokens + // do not add new segment if previous segments exists + bool has_tokens_to_add = idx_start < tokens.size() - 1; + bool has_previous_segments = extracted_segments.segments.size() > 0; + if (token_start.has_value() && has_tokens_to_add && !has_previous_segments) { + ov::genai::Segment segment; + segment.m_tokens = {tokens.begin() + idx_start + 1, tokens.end()}; + segment.m_start = (*token_start - config.begin_timestamps_token_id) * time_precision; + segment.m_end = -1.0f; + extracted_segments.segments.push_back(segment); + + extracted_segments.last_offset = nb_max_frames; + + extracted_segments.non_timestamp_tokens.insert(extracted_segments.non_timestamp_tokens.end(), + tokens.begin() + idx_start + 1, + tokens.end()); + } + + // last timestamps generated in pairs <ts><ts><eos> -> speech segment continuation to the next chunk -> token_start will have value + // single ending timestamp <ts><eos> -> no more speech till the end of current chunk -> set offset to the end of frame + if (!token_start.has_value()) { + extracted_segments.last_offset = nb_max_frames; + } + + return extracted_segments; +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/timestamps.hpp b/src/cpp/src/whisper/timestamps.hpp new file mode 100644 index 0000000000..fc4ca7b10b --- /dev/null +++ b/src/cpp/src/whisper/timestamps.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <openvino/openvino.hpp> + +#include "whisper.hpp" + +namespace ov { +namespace genai { + +struct ExtractedSegments { + std::vector<ov::genai::Segment> segments; + size_t last_offset; + std::vector<int64_t> 
non_timestamp_tokens; +}; + +ExtractedSegments extract_segments(const std::vector<int64_t>& tokens, + const ov::genai::WhisperGenerationConfig& config, + const size_t nb_max_frames, + const float time_precision); + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp new file mode 100644 index 0000000000..51a617673a --- /dev/null +++ b/src/cpp/src/whisper/whisper.cpp @@ -0,0 +1,319 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "whisper.hpp" + +#include <iostream> +#include <openvino/openvino.hpp> +#include <regex> +#include <thread> + +#include "../utils.hpp" +#include "logit_processor.hpp" +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/whisper_generation_config.hpp" +#include "openvino/genai/whisper_pipeline.hpp" +#include "timestamps.hpp" +#include "whisper_config.hpp" +#include "whisper_feature_extractor.hpp" +#include "whisper_models.hpp" + +namespace { + +ov::Tensor encode(ov::InferRequest& request, + std::vector<float>& mel_data, + const size_t feature_size, + const size_t nb_max_frames) { + OPENVINO_ASSERT(mel_data.size() == feature_size * nb_max_frames, + "Mel spectrogram required size: ", + feature_size, + " * ", + nb_max_frames, + ". Actual size: ", + mel_data.size(), + "."); + + ov::Tensor input_tensor(ov::element::f32, {1, feature_size, nb_max_frames}, mel_data.data()); + + request.set_tensor("input_features", input_tensor); + + request.infer(); + + // reset input tensor + request.set_tensor("input_features", ov::Tensor(ov::element::f32, {0, feature_size, nb_max_frames})); + + return request.get_tensor("last_hidden_state"); +} + +void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) { + // source outputs: + // present.0.decoder.key + // present.0.decoder.value + // present.0.encoder.key + // present.0.encoder.value + + // dest inputs: + // past_key_values.0.decoder.key + // past_key_values.0.decoder.value + // past_key_values.0.encoder.key + // past_key_values.0.encoder.value + + for (auto& source_output : source.get_compiled_model().outputs()) { + std::string source_output_name = source_output.get_any_name(); + if (source_output_name.find("logits") != std::string::npos) { + continue; + } + + std::string with_past_input_name = + std::regex_replace(source_output_name, std::regex("present"), "past_key_values"); + + auto kv_tensor = source.get_tensor(source_output_name); + dest.set_tensor(with_past_input_name, ov::Tensor{kv_tensor}); + } +} + +int64_t decode(ov::Tensor& encoder_hidden_state, + ov::InferRequest& decoder, + std::vector<int64_t>& input_ids, + const ov::genai::WhisperGenerationConfig& config, + const bool apply_logit_processors = true, + const bool return_timestamps = false) { + decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state}); + + ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data()); + decoder.set_tensor("input_ids", input_ids_tensor); + + decoder.infer(); + + auto output_tensor = decoder.get_tensor("logits"); + + if (apply_logit_processors) { + ov::genai::do_suppress_tokens(output_tensor, 0, config.begin_suppress_tokens); + ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens); + + if (return_timestamps) { + ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, {}, true); + } + } + + int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); + + return output_token; +} + +int64_t 
decode_with_past(ov::Tensor& encoder_hidden_state, + ov::InferRequest& decoder_with_past, + int64_t input_id, + const size_t cache_position, + const ov::genai::WhisperGenerationConfig& config, + const bool return_timestamps, + const std::vector<int64_t>& generated_tokens) { + decoder_with_past.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state}); + + std::vector<int64_t> input_ids = {input_id}; + ov::Tensor input_ids_tensor(ov::element::i64, {1, 1}, input_ids.data()); + decoder_with_past.set_tensor("input_ids", input_ids_tensor); + + ov::Tensor cache_position_tensor = decoder_with_past.get_tensor("cache_position"); + cache_position_tensor.set_shape({1}); + cache_position_tensor.data<int64_t>()[0] = cache_position; + + decoder_with_past.infer(); + + auto output_tensor = decoder_with_past.get_tensor("logits"); + + ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens); + + if (return_timestamps) { + ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, generated_tokens); + } + + int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); + + return output_token; +} + +int64_t detect_language(ov::Tensor& encoder_hidden_state, + ov::InferRequest decoder, + const ov::genai::WhisperGenerationConfig& config) { + std::vector<int64_t> input_ids{config.decoder_start_token_id}; + int64_t output_token = decode(encoder_hidden_state, decoder, input_ids, config, false, false); + + return output_token; +} + +std::vector<int64_t> prepare_init_ids(ov::Tensor& encoder_hidden_state, + ov::InferRequest decoder, + const ov::genai::WhisperGenerationConfig& config, + const bool return_timestamps) { + if (!config.is_multilingual) { + return std::vector<int64_t>{config.decoder_start_token_id, config.no_timestamps_token_id}; + } + + int64_t language_token_id; + if (config.language.has_value()) { + std::string language = *config.language; + if (config.lang_to_id.count(language)) { + language_token_id = config.lang_to_id.at(language); + } + } else { + language_token_id = detect_language(encoder_hidden_state, decoder, config); + } + + int64_t task_token_id = config.transcribe_token_id; + if (config.task.has_value() && *config.task == "translate") { + task_token_id = config.translate_token_id; + } + + if (return_timestamps) { + return std::vector<int64_t>{config.decoder_start_token_id, language_token_id, task_token_id}; + } + + return std::vector<int64_t>{config.decoder_start_token_id, + language_token_id, + task_token_id, + config.no_timestamps_token_id}; +} + +std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_state, + const ov::genai::WhisperGenerationConfig& config, + ov::genai::WhisperInitializedModels& models, + std::vector<int64_t> init_ids, + const size_t max_new_tokens, + const bool return_timestamps, + const std::shared_ptr<ov::genai::StreamerBase> streamer) { + int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps); + + std::vector<int64_t> output_tokens{output_token}; + + bool is_timestamp = output_token >= config.begin_timestamps_token_id; + if (!is_timestamp && streamer && streamer->put(output_token)) { + return {true, output_tokens}; + } + + if (max_new_tokens == 1) { + return {false, output_tokens}; + } + + set_past_key_value(models.decoder, models.decoder_with_past); + + for (size_t i = 0; i < max_new_tokens - 1; i++) { + auto output_token = decode_with_past(encoder_hidden_state, + models.decoder_with_past, + output_tokens.back(), + init_ids.size() + 
output_tokens.size() - 1, + config, + return_timestamps, + output_tokens); + + if (i == 0) { + set_past_key_value(models.decoder_with_past, models.decoder_with_past); + } + + if (output_token == config.eos_token_id) { + break; + } + + output_tokens.push_back(output_token); + bool is_timestamp = output_token >= config.begin_timestamps_token_id; + + if (!is_timestamp && streamer && streamer->put(output_token)) { + return {true, output_tokens}; + } + } + + return {false, output_tokens}; +} + +} // namespace + +namespace ov { +namespace genai { + +std::pair<std::vector<int64_t>, std::optional<std::vector<Segment>>> whisper_generate( + const ov::genai::WhisperGenerationConfig& config, + const ov::genai::WhisperConfig& model_config, + const RawSpeechInput& raw_speech, + ov::genai::WhisperInitializedModels& models, + WhisperFeatureExtractor& feature_extractor, + const std::shared_ptr<StreamerBase> streamer) { + auto input_features = feature_extractor.extract(raw_speech); + + const bool is_shortform = input_features.n_frames <= feature_extractor.nb_max_frames; + // long-form audio processing requires timestamps to be enabled + const bool return_timestamps = config.return_timestamps || !is_shortform; + + std::vector<int64_t> init_ids; + std::vector<int64_t> output_tokens; + size_t max_new_tokens = config.get_max_new_tokens(); + + std::vector<Segment> segments; + + // 0.02 by default + const float time_precision = static_cast<float>(feature_extractor.chunk_length) / model_config.max_source_positions; + size_t segment_offset = 0; + + for (size_t chunk_offset = 0; chunk_offset < input_features.n_frames; chunk_offset += segment_offset) { + if (output_tokens.size() >= max_new_tokens) { + break; + } + + auto input_features_chunk = input_features.get_data_with_offset(chunk_offset, feature_extractor.nb_max_frames); + + ov::Tensor hidden_state_tensor = encode(models.encoder, + input_features_chunk, + feature_extractor.feature_size, + feature_extractor.nb_max_frames); + + // prepare init_ids just once for whole input + if (init_ids.empty()) { + init_ids = prepare_init_ids(hidden_state_tensor, models.decoder, config, return_timestamps); + } + + auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor, + config, + models, + init_ids, + max_new_tokens - output_tokens.size(), + return_timestamps, + streamer); + + if (return_timestamps) { + auto extracted_segments = ov::genai::extract_segments(chunk_output_tokens, + config, + feature_extractor.nb_max_frames, + time_precision); + + segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end()); + + output_tokens.insert(output_tokens.end(), + extracted_segments.non_timestamp_tokens.begin(), + extracted_segments.non_timestamp_tokens.end()); + + segment_offset = extracted_segments.last_offset; + } else { + output_tokens.insert(output_tokens.end(), chunk_output_tokens.begin(), chunk_output_tokens.end()); + } + + if (is_shortform) { + segment_offset = input_features.n_frames; + } + + if (cancelled) { + break; + } + } + + if (streamer) { + streamer->end(); + } + + // if return_timestamps wasn't enabled by user + if (!config.return_timestamps) { + return {output_tokens, std::nullopt}; + } + + return {output_tokens, segments}; +} +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/whisper.hpp b/src/cpp/src/whisper/whisper.hpp new file mode 100644 index 0000000000..c99f0a3caa --- /dev/null +++ b/src/cpp/src/whisper/whisper.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2023-2024 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <openvino/openvino.hpp> + +#include "openvino/genai/whisper_generation_config.hpp" +#include "openvino/genai/whisper_pipeline.hpp" +#include "whisper_config.hpp" +#include "whisper_feature_extractor.hpp" +#include "whisper_models.hpp" + +namespace ov { +namespace genai { + +struct Segment { + float m_start; + float m_end; + std::vector<int64_t> m_tokens; +}; + +std::pair<std::vector<int64_t>, std::optional<std::vector<Segment>>> whisper_generate( + const ov::genai::WhisperGenerationConfig& config, + const ov::genai::WhisperConfig& model_config, + const ov::genai::RawSpeechInput& raw_speech, + ov::genai::WhisperInitializedModels& models, + ov::genai::WhisperFeatureExtractor& feature_extractor, + const std::shared_ptr<StreamerBase> streamer); + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/whisper_config.cpp b/src/cpp/src/whisper/whisper_config.cpp new file mode 100644 index 0000000000..a46fb36aa8 --- /dev/null +++ b/src/cpp/src/whisper/whisper_config.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "whisper_config.hpp" + +#include <fstream> +#include <nlohmann/json.hpp> +#include <openvino/runtime/core.hpp> + +#include "utils.hpp" + +namespace ov { +namespace genai { + +WhisperConfig::WhisperConfig(const std::string& json_path) { + // preprocessor_config.json not found. Skip parameters initialization from file, use defaults. + if (!std::filesystem::exists(json_path)) { + return; + } + + using ov::genai::utils::read_json_param; + + std::ifstream f(json_path); + OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with config"); + + nlohmann::json data = nlohmann::json::parse(f); + + read_json_param(data, "max_source_positions", max_source_positions); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/whisper_config.hpp b/src/cpp/src/whisper/whisper_config.hpp new file mode 100644 index 0000000000..31f8cd7618 --- /dev/null +++ b/src/cpp/src/whisper/whisper_config.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <string> + +namespace ov { +namespace genai { + +/** + * @brief Structure to keep whisper config parameters. 
+ */ +class WhisperConfig { +public: + explicit WhisperConfig(const std::string& json_path); + + size_t max_source_positions = 1500; +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/whisper_feature_extractor.cpp b/src/cpp/src/whisper/whisper_feature_extractor.cpp new file mode 100644 index 0000000000..0299272c81 --- /dev/null +++ b/src/cpp/src/whisper/whisper_feature_extractor.cpp @@ -0,0 +1,496 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#ifdef _WIN32 +# define _USE_MATH_DEFINES +#endif + +#include "whisper_feature_extractor.hpp" + +#include <algorithm> +#include <cassert> +#include <cmath> +#include <fstream> +#include <iostream> +#include <nlohmann/json.hpp> +#include <openvino/core/except.hpp> +#include <openvino/openvino.hpp> +#include <string> +#include <thread> +#include <vector> + +#include "../utils.hpp" +#include "openvino/genai/visibility.hpp" + +namespace { +using ov::genai::WhisperFeatures; + +static bool hann_window(const size_t length, const bool periodic, std::vector<float>& output) { + if (output.size() < length) { + output.resize(length); + } + int offset = -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + } + + return true; +} + +// naive Discrete Fourier Transform +// input is real-valued +// output is complex-valued +static void dft(const std::vector<float>& in, + std::vector<float>& out, + const std::vector<float>& sin_vals, + const std::vector<float>& cos_vals, + const size_t n_fft) { + int N = in.size(); + + out.resize(N * 2); + const int sin_cos_step = n_fft / N; + + for (int k = 0; k < N; k++) { + float re = 0; + float im = 0; + + for (int n = 0; n < N; n++) { + int idx = (k * n * sin_cos_step) % (n_fft); // t = 2*M_PI*k*n/N + re += in[n] * cos_vals[idx]; // cos(t) + im -= in[n] * sin_vals[idx]; // sin(t) + } + + out[k * 2 + 0] = re; + out[k * 2 + 1] = im; + } +} + +// Cooley-Tukey FFT +// input is real-valued +// output is complex-valued +static void fft(const std::vector<float>& in, + std::vector<float>& out, + const std::vector<float>& sin_vals, + const std::vector<float>& cos_vals, + const size_t n_fft) { + out.resize(in.size() * 2); + + int N = in.size(); + + if (N == 1) { + out[0] = in[0]; + out[1] = 0; + return; + } + + if (N % 2 == 1) { + dft(in, out, sin_vals, cos_vals, n_fft); + return; + } + + std::vector<float> even; + std::vector<float> odd; + + even.reserve(N / 2); + odd.reserve(N / 2); + + for (int i = 0; i < N; i++) { + if (i % 2 == 0) { + even.push_back(in[i]); + } else { + odd.push_back(in[i]); + } + } + + std::vector<float> even_fft; + std::vector<float> odd_fft; + + fft(even, even_fft, sin_vals, cos_vals, n_fft); + fft(odd, odd_fft, sin_vals, cos_vals, n_fft); + + const int sin_cos_step = n_fft / N; + for (int k = 0; k < N / 2; k++) { + int idx = k * sin_cos_step; // t = 2*M_PI*k/N + float re = cos_vals[idx]; // cos(t) + float im = -sin_vals[idx]; // sin(t) + + float re_odd = odd_fft[2 * k + 0]; + float im_odd = odd_fft[2 * k + 1]; + + out[2 * k + 0] = even_fft[2 * k + 0] + re * re_odd - im * im_odd; + out[2 * k + 1] = even_fft[2 * k + 1] + re * im_odd + im * re_odd; + + out[2 * (k + N / 2) + 0] = even_fft[2 * k + 0] - re * re_odd + im * im_odd; + out[2 * (k + N / 2) + 1] = even_fft[2 * k + 1] - re * im_odd - im * re_odd; + } +} + +static void log_mel_spectrogram_worker_thread(int ith, + const std::vector<float>& hann, + const std::vector<float>& samples, + int 
n_samples, + int frame_size, + int frame_step, + int n_threads, + const std::vector<float>& mel_filter, + WhisperFeatures& features, + const std::vector<float>& sin_vals, + const std::vector<float>& cos_vals) { + std::vector<float> fft_in(frame_size, 0.0); + std::vector<float> fft_out(2 * frame_size); + int n_fft = 1 + (frame_size / 2); + int i = ith; + + OPENVINO_ASSERT(mel_filter.size() == n_fft * features.feature_size); + + // calculate FFT only when fft_in are not all zero + for (; i < std::min(n_samples / frame_step + 1, int(features.n_frames)); i += n_threads) { + const int offset = i * frame_step; + + // apply Hanning window (~10% faster) + for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) { + fft_in[j] = hann[j] * samples[offset + j]; + } + // fill the rest with zeros + if (n_samples - offset < frame_size) { + std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); + } + + // FFT + fft(fft_in, fft_out, sin_vals, cos_vals, frame_size); + + // Calculate modulus^2 of complex numbers + // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. + for (int j = 0; j < n_fft; j++) { + fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + } + + // mel spectrogram + for (int j = 0; j < features.feature_size; j++) { + double sum = 0.0; + + // unroll loop (suggested by GH user @lunixbochs) + int k = 0; + for (k = 0; k < n_fft - 3; k += 4) { + sum += fft_out[k + 0] * mel_filter[j * n_fft + k + 0] + fft_out[k + 1] * mel_filter[j * n_fft + k + 1] + + fft_out[k + 2] * mel_filter[j * n_fft + k + 2] + fft_out[k + 3] * mel_filter[j * n_fft + k + 3]; + } + + // handle n_fft remainder + for (; k < n_fft; k++) { + sum += fft_out[k] * mel_filter[j * n_fft + k]; + } + + sum = log10(std::max(sum, 1e-10)); + + features.data[j * features.n_frames + i] = sum; + } + } + + // Otherwise fft_out are all zero + double sum = log10(1e-10); + for (; i < features.n_frames; i += n_threads) { + for (int j = 0; j < features.feature_size; j++) { + features.data[j * features.n_frames + i] = sum; + } + } +} + +// python implementation: https://github.com/huggingface/transformers/blob/check_gemma/src/transformers/audio_utils.py + +float hertz_to_mel(const float freq) { + constexpr float min_log_hertz = 1000.0; + constexpr float min_log_mel = 15.0; + const float logstep = 27.0 / log(6.4); + float mel = 3.0 * freq / 200.0; + + if (freq >= min_log_hertz) { + mel = min_log_mel + log(freq / min_log_hertz) * logstep; + } + return mel; +} + +float mel_to_hertz(const float mel) { + constexpr float min_log_hertz = 1000.0; + constexpr float min_log_mel = 15.0; + const float logstep = log(6.4) / 27.0; + float freq = 200.0 * mel / 3.0; + + if (mel >= min_log_mel) { + freq = min_log_hertz * exp(logstep * (mel - min_log_mel)); + } + + return freq; +} + +std::vector<std::vector<float>> create_triangular_filter_bank(const std::vector<float>& fft_freqs, + const std::vector<float>& filter_freqs) { + std::vector<float> filter_diff(filter_freqs.size() - 1); + for (size_t i = 0; i < filter_diff.size(); i++) { + filter_diff[i] = filter_freqs[i + 1] - filter_freqs[i]; + } + + std::vector<std::vector<float>> slopes(fft_freqs.size(), std::vector<float>(filter_freqs.size())); + for (size_t row = 0; row < slopes.size(); row++) { + for (size_t col = 0; col < slopes[0].size(); col++) { + slopes[row][col] = filter_freqs[col] - fft_freqs[row]; + } + } + + std::vector<std::vector<float>> down_slopes(fft_freqs.size(), 
std::vector<float>(filter_freqs.size() - 2)); + for (size_t row = 0; row < down_slopes.size(); row++) { + for (size_t col = 0; col < down_slopes[0].size(); col++) { + down_slopes[row][col] = -slopes[row][col] / filter_diff[col]; + } + } + + std::vector<std::vector<float>> up_slopes(fft_freqs.size(), std::vector<float>(filter_freqs.size() - 2)); + for (size_t row = 0; row < up_slopes.size(); row++) { + for (size_t col = 0; col < up_slopes[0].size(); col++) { + up_slopes[row][col] = slopes[row][col + 2] / filter_diff[col + 1]; + } + } + + std::vector<std::vector<float>> result(fft_freqs.size(), std::vector<float>(filter_freqs.size() - 2)); + for (size_t row = 0; row < result.size(); row++) { + for (size_t col = 0; col < result[0].size(); col++) { + result[row][col] = std::max(float(0), std::min(down_slopes[row][col], up_slopes[row][col])); + } + } + + return result; +} + +std::vector<std::vector<float>> mel_filter_bank(const int64_t num_frequency_bins, + const int64_t num_mel_filters, + const int64_t sampling_rate, + const float min_frequency = 0.0f, + const float max_frequency = 8000.0f) { + OPENVINO_ASSERT(max_frequency <= (sampling_rate / 2), "max_frequency should be less or equal sampling_rate / 2"); + + const float mel_min = hertz_to_mel(min_frequency); + const float mel_max = hertz_to_mel(max_frequency); + + const float mel_freqs_step = (mel_max - mel_min) / float(num_mel_filters + 1); + std::vector<float> filter_freqs(num_mel_filters + 2); + for (size_t i = 0; i < filter_freqs.size(); i++) { + filter_freqs[i] = mel_to_hertz(mel_min + i * mel_freqs_step); + } + + std::vector<float> fft_freqs(num_frequency_bins); + const float fft_freq_step = float(sampling_rate / 2) / float(num_frequency_bins - 1); + for (size_t i = 0; i < num_frequency_bins; i++) { + fft_freqs[i] = i * fft_freq_step; + } + + auto mel_filters = create_triangular_filter_bank(fft_freqs, filter_freqs); + + std::vector<float> enorm(num_mel_filters); + for (size_t i = 0; i < enorm.size(); i++) { + enorm[i] = 2.0f / (filter_freqs[i + 2] - filter_freqs[i]); + } + + for (size_t row = 0; row < mel_filters.size(); row++) { + for (size_t col = 0; col < mel_filters[0].size(); col++) { + mel_filters[row][col] *= enorm[col]; + } + } + + return mel_filters; +} + +// In FFT, we frequently use sine and cosine operations with the same values. +// We can use precalculated values to speed up the process. 
+void fill_sin_cos_table(std::vector<float>& sin_vals, std::vector<float>& cos_vals, const size_t n_fft) { + sin_vals.resize(n_fft); + cos_vals.resize(n_fft); + + for (size_t i = 0; i < n_fft; i++) { + double theta = (2 * M_PI * i) / n_fft; + sin_vals[i] = sinf(theta); + cos_vals[i] = cosf(theta); + } +} + +std::vector<float> pad(const std::vector<float>& raw_speech, + const size_t minimum_length, + const size_t reflect_pad_size) { + // pad to minimum length if needed + size_t total_pad_length = std::max(raw_speech.size(), minimum_length) + 2 * reflect_pad_size; + + std::vector<float> padded_raw_speech(total_pad_length, 0.f); + + std::copy(raw_speech.begin(), raw_speech.end(), padded_raw_speech.begin() + reflect_pad_size); + + // reflect pad + std::reverse_copy(padded_raw_speech.begin() + reflect_pad_size + 1, + padded_raw_speech.begin() + reflect_pad_size + 1 + reflect_pad_size, + padded_raw_speech.begin()); + + std::reverse_copy(padded_raw_speech.end() - reflect_pad_size - 1 - reflect_pad_size, + padded_raw_speech.end() - reflect_pad_size - 1, + padded_raw_speech.end() - reflect_pad_size); + + return padded_raw_speech; +} + +WhisperFeatures mel_spectrogram_convert_audio(const std::vector<float>& raw_speech, + const size_t sampling_rate, + const size_t feature_size, + const size_t n_fft, + const size_t hop_length, + const size_t n_threads, + const std::vector<float>& mel_filter, + const std::vector<float>& sin_vals, + const std::vector<float>& cos_vals) { + // Hanning window (Use cosf to eliminate difference) + // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html + // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 + std::vector<float> hann; + hann_window(n_fft, true, hann); + + const size_t reflect_pad_size = n_fft / 2; + auto padded_raw_speech = pad(raw_speech, sampling_rate * 30, reflect_pad_size); + + WhisperFeatures features; + features.feature_size = feature_size; + // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936 + // Calculate number of frames + remove the last frame + features.n_frames = (padded_raw_speech.size() - n_fft) / hop_length; + features.data.resize(features.feature_size * features.n_frames); + + { + std::vector<std::thread> workers(n_threads - 1); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw] = std::thread(log_mel_spectrogram_worker_thread, + iw + 1, + std::cref(hann), + padded_raw_speech, + raw_speech.size() + reflect_pad_size, + n_fft, + hop_length, + n_threads, + std::cref(mel_filter), + std::ref(features), + std::cref(sin_vals), + std::cref(cos_vals)); + } + + // main thread + log_mel_spectrogram_worker_thread(0, + hann, + padded_raw_speech, + raw_speech.size() + reflect_pad_size, + n_fft, + hop_length, + n_threads, + mel_filter, + features, + sin_vals, + cos_vals); + + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw].join(); + } + } + + // clamping and normalization + double mmax = -1e20; + for (int i = 0; i < features.feature_size * features.n_frames; i++) { + if (features.data[i] > mmax) { + mmax = features.data[i]; + } + } + + mmax -= 8.0; + + for (int i = 0; i < features.feature_size * features.n_frames; i++) { + if (features.data[i] < mmax) { + features.data[i] = mmax; + } + + features.data[i] = (features.data[i] + 4.0) / 4.0; + } + + return features; +} + +} // namespace + +namespace ov { +namespace genai { + +std::vector<float> WhisperFeatures::get_data_with_offset(const size_t frame_offset, const size_t min_frames) { + OPENVINO_ASSERT(n_frames > 
frame_offset); + + size_t copy_size = std::min(n_frames - frame_offset, min_frames); + std::vector<float> offset_data; + + for (size_t i = 0; i < feature_size; i++) { + size_t offset = frame_offset + (i * n_frames); + std::copy(data.begin() + offset, data.begin() + offset + copy_size, std::back_inserter(offset_data)); + if (copy_size < min_frames) { + std::fill_n(std::back_inserter(offset_data), min_frames - copy_size, 0); + } + } + + return offset_data; +} + +WhisperFeatureExtractor::WhisperFeatureExtractor(const std::string& preprocessor_json_path) { + init_parameters(preprocessor_json_path); + fill_sin_cos_table(sin_vals, cos_vals, n_fft); + init_mel_filter(); +} + +void WhisperFeatureExtractor::init_parameters(const std::string& preprocessor_json_path) { + // preprocessor_config.json not found. Skip parameters initialization from file, use defaults. + if (!std::filesystem::exists(preprocessor_json_path)) { + return; + } + + using ov::genai::utils::read_json_param; + + std::ifstream f(preprocessor_json_path); + OPENVINO_ASSERT(f.is_open(), "Failed to open '" + preprocessor_json_path + "' with preprocessor config"); + + nlohmann::json data = nlohmann::json::parse(f); + + read_json_param(data, "feature_size", feature_size); + read_json_param(data, "sampling_rate", sampling_rate); + read_json_param(data, "hop_length", hop_length); + read_json_param(data, "n_fft", n_fft); + read_json_param(data, "chunk_length", chunk_length); + read_json_param(data, "n_samples", n_samples); + read_json_param(data, "nb_max_frames", nb_max_frames); +}; + +void WhisperFeatureExtractor::init_mel_filter() { + auto mel_data = mel_filter_bank(1 + n_fft / 2, feature_size, sampling_rate); + mel_filter.resize(mel_data.size() * mel_data[0].size()); + + for (size_t col = 0; col < mel_data[0].size(); col++) { + for (size_t row = 0; row < mel_data.size(); row++) { + mel_filter[col * mel_data.size() + row] = mel_data[row][col]; + } + } +} + +WhisperFeatures WhisperFeatureExtractor::extract(const std::vector<float>& raw_speech) { + size_t n_threads = std::min(4, (int32_t)std::thread::hardware_concurrency()); + return mel_spectrogram_convert_audio(raw_speech, + sampling_rate, + feature_size, + n_fft, + hop_length, + n_threads, + mel_filter, + sin_vals, + cos_vals); +} + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/whisper_feature_extractor.hpp b/src/cpp/src/whisper/whisper_feature_extractor.hpp new file mode 100644 index 0000000000..b34b66c608 --- /dev/null +++ b/src/cpp/src/whisper/whisper_feature_extractor.hpp @@ -0,0 +1,64 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <string> +#include <vector> + +#include "openvino/genai/visibility.hpp" + +namespace ov { +namespace genai { + +struct WhisperFeatures { + size_t feature_size; + size_t n_frames; + + // flattened 2d array with shape [feature_size, n_frames] + std::vector<float> data; + + /** + * Return frames with specific offset + * Pad to min_frames if needed + * + * v offset + * ****xxxxx**** + * ****xxxxx**** + * ****xxxxx**** + * + */ + std::vector<float> get_data_with_offset(const size_t frame_offset, const size_t min_frames); +}; + +class WhisperFeatureExtractor { +public: + size_t feature_size = 80; + size_t sampling_rate = 16000; + size_t hop_length = 160; + size_t n_fft = 400; + size_t chunk_length = 30; + size_t n_samples = 480000; + size_t nb_max_frames = 3000; + + explicit WhisperFeatureExtractor(const std::string& preprocessor_json_path); + + /** + * @brief 
Create a flattened 2d log-mel spectrogram [feature_size, n_frames] from raw speech data + * + * @see [huggingface introduction to audio + * data](https://huggingface.co/learn/audio-course/chapter1/audio_data#mel-spectrogram) + */ + WhisperFeatures extract(const std::vector<float>& raw_speech); + +private: + std::vector<float> sin_vals; + std::vector<float> cos_vals; + std::vector<float> mel_filter; + + void init_mel_filter(); + void init_parameters(const std::string& preprocessor_json_path); +}; + +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper/whisper_models.hpp b/src/cpp/src/whisper/whisper_models.hpp new file mode 100644 index 0000000000..576bdb9dc7 --- /dev/null +++ b/src/cpp/src/whisper/whisper_models.hpp @@ -0,0 +1,17 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include <openvino/openvino.hpp> + +namespace ov { +namespace genai { + +struct WhisperInitializedModels { + ov::InferRequest encoder; + ov::InferRequest decoder; + ov::InferRequest decoder_with_past; +}; +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper_generation_config.cpp b/src/cpp/src/whisper_generation_config.cpp new file mode 100644 index 0000000000..792c940f06 --- /dev/null +++ b/src/cpp/src/whisper_generation_config.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/whisper_generation_config.hpp" + +#include <fstream> +#include <limits> +#include <nlohmann/json.hpp> +#include <openvino/runtime/core.hpp> + +#include "utils.hpp" + +namespace ov { +namespace genai { + +WhisperGenerationConfig::WhisperGenerationConfig(const std::string& json_path) { + using ov::genai::utils::read_json_param; + + std::ifstream f(json_path); + OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); + + nlohmann::json data = nlohmann::json::parse(f); + + read_json_param(data, "max_new_tokens", max_new_tokens); + read_json_param(data, "max_length", max_length); + read_json_param(data, "begin_suppress_tokens", begin_suppress_tokens); + read_json_param(data, "suppress_tokens", suppress_tokens); + read_json_param(data, "decoder_start_token_id", decoder_start_token_id); + read_json_param(data, "eos_token_id", eos_token_id); + read_json_param(data, "pad_token_id", pad_token_id); + read_json_param(data, "no_timestamps_token_id", no_timestamps_token_id); + read_json_param(data, "begin_timestamps_token_id", begin_timestamps_token_id); + read_json_param(data, "max_initial_timestamp_index", max_initial_timestamp_index); + + read_json_param(data, "is_multilingual", is_multilingual); + if (is_multilingual) { + read_json_param(data, "task_to_id.transcribe", transcribe_token_id); + read_json_param(data, "task_to_id.translate", translate_token_id); + } + + read_json_param(data, "lang_to_id", lang_to_id); +} + +void WhisperGenerationConfig::set_eos_token_id(int64_t tokenizer_eos_token_id) { + if (eos_token_id < 0) { + eos_token_id = tokenizer_eos_token_id; + } else { + OPENVINO_ASSERT(eos_token_id == tokenizer_eos_token_id, + "EOS token ID is different in generation config (", + eos_token_id, + ") and tokenizer (", + tokenizer_eos_token_id, + ")"); + } +} + +void WhisperGenerationConfig::update_generation_config(const ov::AnyMap& config_map) { + using ov::genai::utils::read_anymap_param; + + read_anymap_param(config_map, "max_new_tokens", max_new_tokens); + read_anymap_param(config_map, "max_length", max_length); + 
read_anymap_param(config_map, "begin_suppress_tokens", begin_suppress_tokens); + read_anymap_param(config_map, "suppress_tokens", suppress_tokens); + read_anymap_param(config_map, "decoder_start_token_id", decoder_start_token_id); + read_anymap_param(config_map, "eos_token_id", eos_token_id); + read_anymap_param(config_map, "pad_token_id", pad_token_id); + read_anymap_param(config_map, "transcribe_token_id", transcribe_token_id); + read_anymap_param(config_map, "translate_token_id", translate_token_id); + read_anymap_param(config_map, "no_timestamps_token_id", no_timestamps_token_id); + read_anymap_param(config_map, "begin_timestamps_token_id", begin_timestamps_token_id); + read_anymap_param(config_map, "max_initial_timestamp_index", max_initial_timestamp_index); + read_anymap_param(config_map, "is_multilingual", is_multilingual); + read_anymap_param(config_map, "language", language); + read_anymap_param(config_map, "lang_to_id", lang_to_id); + read_anymap_param(config_map, "task", task); + read_anymap_param(config_map, "return_timestamps", return_timestamps); +} + +size_t WhisperGenerationConfig::get_max_new_tokens(size_t prompt_length) const { + // max_new_tokens has priority over max_length, only if max_new_tokens was not specified use max_length + if (max_new_tokens != SIZE_MAX) { + return max_new_tokens; + } else { + return max_length - prompt_length; + } +} + +void WhisperGenerationConfig::validate() const { + OPENVINO_ASSERT(max_new_tokens > 0, "'max_new_tokens' must be greater than 0"); + + // max_new_tokens has priority over max_length + // if max_new_tokens is defined no need to check max_length + OPENVINO_ASSERT(max_new_tokens != SIZE_MAX || max_length > 0, + "'max_length' must be greater than 0 or 'max_new_tokens' should be defined"); + + OPENVINO_ASSERT(eos_token_id != -1 || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, + "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined."); + + if (is_multilingual && language.has_value()) { + OPENVINO_ASSERT(lang_to_id.count(*language), + "'language' " + *language + " must be provided in generation_config.json 'lang_to_id' map."); + } + + if (is_multilingual && task.has_value()) { + OPENVINO_ASSERT(*task == "transcribe" || *task == "translate", + "'task' mast be 'transcribe' or 'translate'. 
Task provided: '", + *task, + "'."); + } + + if (!is_multilingual) { + OPENVINO_ASSERT(!language.has_value(), "Cannot specify 'language' for not multilingual model."); + OPENVINO_ASSERT(!task.has_value(), "Cannot specify 'task' for not multilingual model."); + } +} +} // namespace genai +} // namespace ov diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp new file mode 100644 index 0000000000..72d7003b30 --- /dev/null +++ b/src/cpp/src/whisper_pipeline.cpp @@ -0,0 +1,176 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "openvino/genai/whisper_pipeline.hpp" + +#include <algorithm> +#include <filesystem> +#include <openvino/openvino.hpp> +#include <variant> + +#include "text_callback_streamer.hpp" +#include "utils.hpp" +#include "whisper/whisper.hpp" +#include "whisper/whisper_config.hpp" +#include "whisper/whisper_feature_extractor.hpp" +#include "whisper/whisper_models.hpp" + +namespace { +ov::genai::WhisperGenerationConfig from_config_json_if_exists(const std::filesystem::path& model_path) { + auto config_file_path = model_path / "generation_config.json"; + if (std::filesystem::exists(config_file_path)) { + return ov::genai::WhisperGenerationConfig((config_file_path).string()); + } else { + return ov::genai::WhisperGenerationConfig{}; + } +} + +ov::genai::OptionalWhisperGenerationConfig get_config_from_map(const ov::AnyMap& config_map) { + if (config_map.count("generation_config")) { + return config_map.at("generation_config").as<ov::genai::WhisperGenerationConfig>(); + } else { + return std::nullopt; + } +} +} // namespace + +namespace ov { +namespace genai { + +class WhisperPipeline::Impl { +private: + ov::genai::WhisperConfig m_model_config; + +public: + ov::genai::WhisperGenerationConfig m_generation_config; + ov::genai::WhisperInitializedModels m_models; + ov::genai::WhisperFeatureExtractor m_feature_extractor; + Tokenizer m_tokenizer; + float m_load_time_ms = 0; + + Impl(const std::filesystem::path& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config) + : m_generation_config{from_config_json_if_exists(model_path)}, + m_tokenizer{tokenizer}, + m_feature_extractor{(model_path / "preprocessor_config.json").string()}, + m_model_config{(model_path / "config.json").string()} { + ov::Core core; + auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_complile_config(plugin_config); + core.set_property(core_plugin_config); + + m_models.encoder = core.compile_model(model_path / "openvino_encoder_model.xml", device, compile_plugin_config) + .create_infer_request(); + m_models.decoder = core.compile_model(model_path / "openvino_decoder_model.xml", device, compile_plugin_config) + .create_infer_request(); + m_models.decoder_with_past = + core.compile_model(model_path / "openvino_decoder_with_past_model.xml", device, compile_plugin_config) + .create_infer_request(); + + // If eos_token_id was not provided, take value + if (m_generation_config.eos_token_id == -1) { + m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); + } + } + + Impl(const std::filesystem::path& model_path, const std::string& device, const ov::AnyMap& plugin_config) + : Impl{model_path, Tokenizer(model_path.string()), device, plugin_config} {} + + WhisperDecodedResults generate(const RawSpeechInput& raw_speech_input, + OptionalWhisperGenerationConfig generation_config, + StreamerVariant streamer) { + auto start_time = 
std::chrono::steady_clock::now(); + WhisperGenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config; + config.validate(); + + std::shared_ptr<StreamerBase> streamer_ptr; + if (auto streamer_obj = std::get_if<std::monostate>(&streamer)) { + streamer_ptr = nullptr; + } else if (auto streamer_obj = std::get_if<std::shared_ptr<StreamerBase>>(&streamer)) { + streamer_ptr = *streamer_obj; + } else if (auto callback = std::get_if<std::function<bool(std::string)>>(&streamer)) { + streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback); + } + + auto [output_tokens, segments] = ov::genai::whisper_generate(config, + m_model_config, + raw_speech_input, + m_models, + m_feature_extractor, + streamer_ptr); + + WhisperDecodedResults decoded_results{std::vector{m_tokenizer.decode(output_tokens)}, std::vector{1.f}}; + if (!segments.has_value()) { + return decoded_results; + } + + std::vector<WhisperDecodedResultChunk> chunks; + chunks.reserve((*segments).size()); + + for (auto& segment : *segments) { + chunks.push_back( + WhisperDecodedResultChunk{segment.m_start, segment.m_end, m_tokenizer.decode(segment.m_tokens)}); + } + + decoded_results.chunks = chunks; + return decoded_results; + } +}; + +} // namespace genai +} // namespace ov + +ov::genai::WhisperPipeline::WhisperPipeline(const std::string& model_path, + const ov::genai::Tokenizer& tokenizer, + const std::string& device, + const ov::AnyMap& plugin_config) { + auto start_time = std::chrono::steady_clock::now(); + m_impl = std::make_unique<WhisperPipeline::Impl>(model_path, tokenizer, device, plugin_config); + auto stop_time = std::chrono::steady_clock::now(); + m_impl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count(); +} + +ov::genai::WhisperPipeline::WhisperPipeline(const std::string& model_path, + const std::string& device, + const ov::AnyMap& plugin_config) { + auto start_time = std::chrono::steady_clock::now(); + m_impl = std::make_unique<WhisperPipeline::Impl>(model_path, device, plugin_config); + auto stop_time = std::chrono::steady_clock::now(); + m_impl->m_load_time_ms = std::chrono::duration_cast<std::chrono::milliseconds>(stop_time - start_time).count(); +} + +ov::genai::WhisperDecodedResults ov::genai::WhisperPipeline::generate(const RawSpeechInput& raw_speech_input, + OptionalWhisperGenerationConfig generation_config, + StreamerVariant streamer) { + return m_impl->generate(raw_speech_input, generation_config, streamer); +} + +ov::genai::WhisperDecodedResults ov::genai::WhisperPipeline::generate(const RawSpeechInput& raw_speech_input, + const ov::AnyMap& config_map) { + auto config_arg = get_config_from_map(config_map); + WhisperGenerationConfig config = (config_arg.has_value()) ? 
*config_arg : get_generation_config(); + config.update_generation_config(config_map); + + return m_impl->generate(raw_speech_input, config, utils::get_streamer_from_map(config_map)); +} + +ov::genai::WhisperGenerationConfig ov::genai::WhisperPipeline::get_generation_config() const { + return m_impl->m_generation_config; +} + +ov::genai::Tokenizer ov::genai::WhisperPipeline::get_tokenizer() { + return m_impl->m_tokenizer; +} + +void ov::genai::WhisperPipeline::set_generation_config(const WhisperGenerationConfig& config) { + int64_t default_eos_token_id = m_impl->m_generation_config.eos_token_id; + m_impl->m_generation_config = config; + // if eos_token_id was not provided in config forward from default config + if (config.eos_token_id == -1) + m_impl->m_generation_config.eos_token_id = default_eos_token_id; + + m_impl->m_generation_config.validate(); +} + +ov::genai::WhisperPipeline::~WhisperPipeline() = default; diff --git a/src/docs/BUILD.md b/src/docs/BUILD.md new file mode 100644 index 0000000000..77657620a0 --- /dev/null +++ b/src/docs/BUILD.md @@ -0,0 +1,227 @@ +# How to Build OpenVINO™ GenAI + +> **NOTE**: There is a known Python API issue with `ov::Tensor`. The issue is reproduced when building OpenVINO GenAI from sources while using OpenVINO from archives. Using `ov::Tensor` with OpenVINO GenAI fails. Possible errors: `TypeError: generate(): incompatible function arguments.`, `TypeError: __init__(): incompatible constructor arguments.`, `TypeError: Unregistered type : ov::Tensor`. +The preferred approach is to build both OpenVINO and OpenVINO GenAI from sources using the same build environment. Or to install prebuilt OpenVINO GenAI from [distribution channels](https://docs.openvino.ai/2024/get-started/install-openvino.html). + +## Software Requirements + +### Linux + +- [CMake](https://cmake.org/download/) 3.23 or higher +- GCC 7.5 or higher +- Python 3.9 or higher +- Git + +### Windows + +- [CMake](https://cmake.org/download/) 3.23 or higher +- Microsoft Visual Studio 2019 or higher, version 16.3 or later +- Python 3.9 or higher +- Git for Windows + +### macOS + +- [CMake](https://cmake.org/download/) 3.23 or higher +- [brew](https://brew.sh/) package manager to install additional dependencies: + ```sh + brew install coreutils scons + ``` +- Clang compiler and other command line tools from Xcode 10.1 or higher: + ```sh + xcode-select --install + ``` +- Python 3.9 or higher +- Git + + +## Build Instructions + +### Build OpenVINO GenAI as OpenVINO Extra Module + +OpenVINO GenAI can be built as an extra module during the OpenVINO build process. This method simplifies the build process by integrating OpenVINO GenAI directly into the OpenVINO build. + +1. Clone OpenVINO and OpenVINO GenAI repositories: + ```sh + git clone --recursive https://github.com/openvinotoolkit/openvino.git + git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git + ``` +2. Configure CMake with OpenVINO extra modules: + ```sh + cmake -DOPENVINO_EXTRA_MODULES=./openvino.genai -DCPACK_ARCHIVE_COMPONENT_INSTALL=OFF -S ./openvino -B ./build + ``` +3. Build OpenVINO archive with GenAI: + ```sh + cmake --build ./build --target package -j + ``` + +After the build process completes, you should find the packaged OpenVINO with GenAI in the `build` directory. +Follow the OpenVINO [build instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build) and [install instructions](https://github.com/openvinotoolkit/openvino/blob/master/docs/dev/installing.md) for additional information. 
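+
+To sanity-check the resulting package, you can try building a minimal program against it. The snippet below is only a rough sketch (it is not one of the official samples): it assumes the installed GenAI headers and libraries are visible to your compiler, and `<MODEL_DIR>` is a placeholder for a folder with a model already exported to OpenVINO IR, for example with `optimum-cli`.
+```cpp
+// smoke_test.cpp - minimal link/run check for the OpenVINO GenAI package (illustrative sketch).
+#include <iostream>
+#include <string>
+
+#include "openvino/genai/llm_pipeline.hpp"
+
+int main(int argc, char* argv[]) {
+    if (argc != 2) {
+        std::cerr << "Usage: " << argv[0] << " <MODEL_DIR>" << std::endl;
+        return 1;
+    }
+    // <MODEL_DIR> is a placeholder for a directory with the exported model and its tokenizer.
+    std::string models_path = argv[1];
+    ov::genai::LLMPipeline pipe(models_path, "CPU");
+
+    ov::genai::GenerationConfig config;
+    config.max_new_tokens = 16;
+    std::string output = pipe.generate("Hello,", config);
+    std::cout << output << std::endl;
+    return 0;
+}
+```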
+ +### Build OpenVINO, OpenVINO Tokenizers, and OpenVINO GenAI From Source + +1. Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build). +The path to the OpenVINO install directory is referred as `<INSTALL_DIR>` throughout the document. +2. Clone OpenVINO GenAI repository and init submodules: + ```sh + git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git + cd openvino.genai + ``` +3. Set up the environment: + + #### Option 1 - using OpenVINO `setupvars` script: + + Linux and macOS: + ```sh + source <INSTALL_DIR>/setupvars.sh + ``` + + Windows Command Prompt: + ```cmd + call <INSTALL_DIR>\setupvars.bat + ``` + + Windows PowerShell: + ```cmd + . <INSTALL_DIR>/setupvars.ps1 + ``` + + #### Option 2 - setting environment variables manually: + + Linux: + ```sh + export OpenVINO_DIR=<INSTALL_DIR>/runtime + export PYTHONPATH=<INSTALL_DIR>/python:./build/:$PYTHONPATH + export LD_LIBRARY_PATH=<INSTALL_DIR>/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` + + macOS: + ```sh + export OpenVINO_DIR=<INSTALL_DIR>/runtime + export PYTHONPATH=<INSTALL_DIR>/python:./build/:$PYTHONPATH + export DYLD_LIBRARY_PATH=<INSTALL_DIR>/runtime/lib/intel64:$LD_LIBRARY_PATH + ``` + + Windows Command Prompt: + ```cmd + set OpenVINO_DIR=<INSTALL_DIR>\runtime + set PYTHONPATH=<INSTALL_DIR>\python;%CD%\build;%PYTHONPATH% + set OPENVINO_LIB_PATHS=<INSTALL_DIR>\bin\intel64\Release;%OPENVINO_LIB_PATHS% + set PATH=%OPENVINO_LIB_PATHS%;%PATH% + ``` + + Windows PowerShell: + ```sh + $env:OpenVINO_DIR = "<INSTALL_DIR>\runtime" + $env:PYTHONPATH = "<INSTALL_DIR>\python;$PWD\build;$env:PYTHONPATH" + $env:OPENVINO_LIB_PATHS = "<INSTALL_DIR>\bin\intel64\Release;$env:OPENVINO_LIB_PATHS" + $env:PATH = "$env:OPENVINO_LIB_PATHS;$env:PATH" + ``` + +4. Build the project: + ```sh + cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ + cmake --build ./build/ --config Release -j + ``` + +5. Install OpenVINO GenAI: + + #### Option 1 - using cmake: + + The following command will store built OpenVINO GenAI artifacts along with OpenVINO in `<INSTALL_DIR>`: + + ```sh + cmake --install ./build/ --config Release --prefix <INSTALL_DIR> + ``` + + #### Option 2 - setting paths to built OpenVINO GenAI artifacts manually: + + The path to the OpenVINO GenAI root directory is referred as `<GENAI_ROOT_DIR>` throughout the document. + + Linux: + ```sh + export PYTHONPATH=<GENAI_ROOT_DIR>/build/:$PYTHONPATH + export LD_LIBRARY_PATH=<GENAI_ROOT_DIR>/build/openvino_genai/:$LD_LIBRARY_PATH + ``` + + macOS: + ```sh + export PYTHONPATH=<GENAI_ROOT_DIR>/build:$PYTHONPATH + export DYLD_LIBRARY_PATH=<GENAI_ROOT_DIR>/build/openvino_genai:$DYLD_LIBRARY_PATH + ``` + + Windows Command Prompt: + ```cmd + set PYTHONPATH=<GENAI_ROOT_DIR>\build;%PYTHONPATH% + set PATH=<GENAI_ROOT_DIR>\build\openvino_genai;%PATH% + ``` + + Windows PowerShell: + ```sh + $env:PYTHONPATH = "<GENAI_ROOT_DIR>\build;$env:PYTHONPATH" + $env:PATH = "<GENAI_ROOT_DIR>\build\openvino_genai;$env:PATH" + ``` + +To optimize the package size, you can reduce the ICU (International Components for Unicode) data size when OpenVINO Tokenizers are built as a submodule of OpenVINO GenAI. +For more information please refer to the [OpenVINO Tokenizers instructions](https://github.com/openvinotoolkit/openvino_tokenizers?tab=readme-ov-file#reducing-the-icu-data-size). + + +### Build OpenVINO GenAI Wheel + +1. 
Build and install OpenVINO from sources following the [instructions](https://github.com/openvinotoolkit/openvino/wiki#how-to-build).
+The path to the OpenVINO install directory is referred to as `<INSTALL_DIR>` throughout the document.
+2. Clone OpenVINO GenAI repository and init submodules:
+   ```sh
+   git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git
+   cd openvino.genai
+   ```
+3. Set up the environment:
+   - Option 1 - using OpenVINO `setupvars.sh` script:
+   ```sh
+   source <INSTALL_DIR>/setupvars.sh
+   ```
+   - Option 2 - setting environment variables manually:
+   ```sh
+   export OpenVINO_DIR=<INSTALL_DIR>/runtime
+   export PYTHONPATH=<INSTALL_DIR>/python:./build/:$PYTHONPATH
+   export LD_LIBRARY_PATH=<INSTALL_DIR>/runtime/lib/intel64:$LD_LIBRARY_PATH
+   ```
+4. Upgrade pip to ensure you have the latest version:
+   ```sh
+   python -m pip install --upgrade pip
+   ```
+5. Build the wheel in the `dist` directory:
+   ```sh
+   python -m pip wheel . -w dist/ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release
+   ```
+
+### Install OpenVINO GenAI From Source
+
+1. Clone OpenVINO GenAI repository and init submodules:
+   ```sh
+   git clone --recursive https://github.com/openvinotoolkit/openvino.genai.git
+   cd openvino.genai
+   ```
+2. Set up the environment:
+   - Option 1 - using OpenVINO `setupvars.sh` script:
+   ```sh
+   source <INSTALL_DIR>/setupvars.sh
+   ```
+   - Option 2 - setting environment variables manually:
+   ```sh
+   export OpenVINO_DIR=<INSTALL_DIR>/runtime
+   export PYTHONPATH=<INSTALL_DIR>/python:./build/:$PYTHONPATH
+   export LD_LIBRARY_PATH=<INSTALL_DIR>/runtime/lib/intel64:$LD_LIBRARY_PATH
+   ```
+3. Upgrade pip to ensure you have the latest version:
+   ```sh
+   python -m pip install --upgrade pip
+   ```
+4. Install the package directly from source:
+   ```sh
+   python -m pip install .
+   ```
+5. To verify the installation, run a simple Python script:
+   ```python
+   import openvino_genai
+   print(openvino_genai.__version__)
+   ```
diff --git a/src/docs/DOCKER.md b/src/docs/DOCKER.md
new file mode 100644
index 0000000000..38764864ad
--- /dev/null
+++ b/src/docs/DOCKER.md
@@ -0,0 +1,88 @@
+# Building openvino_llm:latest genai docker image
+```Bash
+git clone --branch ct-beam-search https://github.com/ilya-lavrenov/openvino.genai.git
+git submodule update --remote --init
+cd text_generation/causal_lm/cpp/continuous_batching/
+make
+```
+
+```Bash
+cd ../../../..
+docker run -it -v `pwd`:/workspace/openvino.genai/ openvino_llm:latest
+cd /workspace/openvino.genai/
+cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ && cmake --build ./build/ -j
+```
+
+# Downloading LLM models
+```Bash
+cd /workspace/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/
+optimum-cli export openvino --model facebook/opt-125m ./ov_model
+```
+
+# Running throughput benchmark application
+```Bash
+cd /workspace/openvino.genai/
+./build/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark --model /workspace/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/ov_model --dataset /workspace/ShareGPT_V3_unfiltered_cleaned_split.json --dynamic_split_fuse --num_prompts 100 --device CPU --plugin_config '{"ENABLE_PROFILING":true}'
+```
+
+
+# How to create environment to debug and develop continuous batching project with OpenVINO:
+
+1. Build OpenVINO with python bindings:
+```
+cd /path/to/openvino
+mkdir build
+cd build
+cmake -DCMAKE_BUILD_TYPE={ov_build_type} ..
+make -j24
+```
+2. Set the PYTHONPATH, LD_LIBRARY_PATH, and OpenVINO_DIR environment variables:
+```
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/path/to/openvino/bin/intel64/{ov_build_type}
+export PYTHONPATH=${PYTHONPATH}:/path/to/openvino/bin/intel64/Release/python:/path/to/openvino/tools/ovc
+export OpenVINO_DIR=/path/to/openvino/{ov_build_type}
+```
+3. Build OpenVINO tokenizers:
+```
+cd /path/to/openvino.genai/thirdparty/openvino_tokenizers
+mkdir build
+cd build
+cmake -DCMAKE_BUILD_TYPE={ov_build_type} ..
+make -j24
+```
+4. Create virtual environment to generate models and run python tests:
+> NOTE: Comment out the `openvino` and `openvino_tokenizers` entries in `/path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt` so they are not installed into your environment.
+```
+cd /path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching
+python3 -m venv .env
+source .env/bin/activate
+pip3 install -r python/tests/requirements.txt
+```
+5. Install `openvino_tokenizers` into your virtual environment:
+```
+cd /path/to/openvino.genai/thirdparty/openvino_tokenizers
+export OpenVINO_DIR=/path/to/openvino/build
+pip install --no-deps .
+```
+6. Create a build directory in the `continuous batching` project:
+```
+mkdir /path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/build
+```
+7. Generate cmake project:
+```
+cd build
+cmake -DCMAKE_BUILD_TYPE=Debug -DOpenVINO_DIR=/path/to/openvino/build ..
+```
+8. Build the project:
+```
+make -j24
+```
+9. Extend `PYTHONPATH` with the `continuous batching` Python build directory:
+```
+export PYTHONPATH=${PYTHONPATH}:/path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/build/python
+```
+10. Run python tests:
+```
+cd python/tests
+pytest .
+```
diff --git a/src/docs/HOW_IT_WORKS.md b/src/docs/HOW_IT_WORKS.md
new file mode 100644
index 0000000000..f0d2f68b95
--- /dev/null
+++ b/src/docs/HOW_IT_WORKS.md
@@ -0,0 +1,25 @@
+# OpenVINO™ GenAI: How it works
+
+## Stateful LLM
+
+A common optimization for LLM inference is using a past KV (key/value)-cache. This cache is represented by the corresponding inputs and outputs in a model originally implemented in a DL framework (e.g. PyTorch models from Hugging Face). For further optimization and easier use, the model is transformed to a stateful form. This transformation improves inference performance and decreases the allocated runtime memory in long-running text generation scenarios. It is achieved by hiding inputs and outputs of the model that represent past KV-cache tensors and handling them inside the model in a more efficient way, although the cache is still accessible through the state API. This is in contrast to the stateless model approach, which requires manipulating these inputs and outputs explicitly. An introduction to stateful models can be found in the [Stateful Models article](https://docs.openvino.ai/2024/openvino-workflow/running-inference/stateful-models.html).
+
+Hiding the KV-cache introduces a peculiarity for the beam search algorithm. Beam search suggests batched inference of multiple beams. The design described here so far would result in generating multiple independent sequences of tokens. The beam search algorithm, on the other hand, requires removing some of the ongoing beams and splitting other beams into multiple branches. Beam removal requires deleting the corresponding KV-cache entry, and beam splitting requires copying the corresponding KV-cache values.
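+
+As a rough, non-normative illustration of how these cache operations can be expressed without touching the model's internal state, the sketch below fills the dedicated `beam_idx` input (described in the next paragraphs) on an `ov::InferRequest` created from such a stateful model. The `i32` element type and the preparation of the remaining inputs are assumptions that depend on the exported model.
+```cpp
+// Sketch: selecting which KV-cache entries survive to the next generation step via beam_idx.
+#include <algorithm>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+// Copy the chosen batch indices into the model's beam_idx input before the next infer() call.
+void set_beam_idx(ov::InferRequest& request, const std::vector<int32_t>& beams) {
+    ov::Tensor tensor(ov::element::i32, ov::Shape{beams.size()});
+    std::copy(beams.begin(), beams.end(), tensor.data<int32_t>());
+    request.set_tensor("beam_idx", tensor);
+}
+
+// Usage during beam search (input_ids, attention_mask and position_ids are prepared elsewhere):
+//   set_beam_idx(request, {0, 0});  // split beam 0 into two branches that share its KV-cache
+//   set_beam_idx(request, {0, 1});  // keep both beams as they are
+//   set_beam_idx(request, {1});     // drop beam 0 and keep only beam 1
+```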
+ +To provide the possibility to implement beam search without accessing model's internal state, a stateful LLM converted with `optimum-intel` or [llm_bench](../../llm_bench/python/) introduces an additional 1-dimentional `beam_idx` input. `beam_idx` must contain indexes of elements in a batch which are intended to be selected and will evolve during the next beam search iteration. There's only one beam when the generation starts. That beam corresponds to the initial prompt. `beam_idx` must have values: `[0, 0]` to keep the initial beam and introduce its copy. The dynamic batch size enables to change the number of beams dynamically. `beam_idx` must have `[1]` as the value to remove zeroth sequence and keep the second beam only. + +Assume there are two running beams. To proceed with generating both beams at the next iteration, `beam_idx` values must be `[0, 1]`, pointing to batch elements `0` and `1`. To drop the last beam and split the other beam in two, `beam_idx` must be set to `[0, 0]`. This results in utilizing only the part of KV cache corresponding to the zeroth element in the batch. The process of selecting proper entries in cache is called Cache Reorder. + + + + +The images below represent stateless and stateful LLM pipelines. The model has 4 inputs: +1. `input_ids` contains the next selected token +2. `attention_mask` is filled with `1` +3. `position_ids` encodes a position of currently generating token in the sequence +4. `beam_idx` selects beams + +The model has 1 output `logits` describing the predicted distribution over the next tokens. And there's KV cache state. + + + diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md new file mode 100644 index 0000000000..d9dddc64b7 --- /dev/null +++ b/src/docs/SUPPORTED_MODELS.md @@ -0,0 +1,231 @@ +# OpenVINO™ GenAI: Supported Models + +## Large language models + +<table> + <tbody style="vertical-align: top;"> + <tr> + <th>Architecture</th> + <th>Models</th> + <th>Example HuggingFace Models</th> + </tr> + <tr> + <td><code>ChatGLMModel</code></td> + <td>ChatGLM</td> + <td> + <ul> + <li><a href="https://huggingface.co/THUDM/chatglm2-6b"><code>THUDM/chatglm2-6b</code></a></li> + <li><a href="https://huggingface.co/THUDM/chatglm3-6b"><code>THUDM/chatglm3-6b</code></a></li> + </ul> + </td> + </tr> + <tr> + <td><code>GemmaForCausalLM</code></td> + <td>Gemma</td> + <td> + <ul> + <li><a href="https://huggingface.co/google/gemma-2b-it"><code>google/gemma-2b-it</code></a></li> + </ul> + </td> + </tr> + <tr> + <td rowspan="2"><code>GPTNeoXForCausalLM</code></td> + <td>Dolly</td> + <td> + <ul> + <li><a href="https://huggingface.co/databricks/dolly-v2-3b"><code>databricks/dolly-v2-3b</code></a></li> + </ul> + </td> + </tr> + <tr> + <!-- <td><code>GPTNeoXForCausalLM</code></td> --> + <td> RedPajama</td> + <td> + <ul> + <li><a href="https://huggingface.co/ikala/redpajama-3b-chat"><code>ikala/redpajama-3b-chat</code></a></li> + </ul> + </td> + </tr> + <tr> + <td rowspan="4" vertical-align="top"><code>LlamaForCausalLM</code></td> + <td>Llama 3</td> + <td> + <ul> + <li><a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B"><code>meta-llama/Meta-Llama-3-8B</code></a></li> + <li><a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><code>meta-llama/Meta-Llama-3-8B-Instruct</code></a></li> + <li><a href="https://huggingface.co/meta-llama/Meta-Llama-3-70B"><code>meta-llama/Meta-Llama-3-70B</code></a></li> + <li><a 
href="https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct"><code>meta-llama/Meta-Llama-3-70B-Instruct</code></a></li> + </ul> + </td> + </tr> + <tr> + <!-- <td><code>LlamaForCausalLM</code></td> --> + <td>Llama 2</td> + <td> + <ul> + <li><a href="https://huggingface.co/meta-llama/Llama-2-13b-chat-hf"><code>meta-llama/Llama-2-13b-chat-hf</code></a></li> + <li><a href="https://huggingface.co/meta-llama/Llama-2-13b-hf"><code>meta-llama/Llama-2-13b-hf</code></a></li> + <li><a href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf"><code>meta-llama/Llama-2-7b-chat-hf</code></a></li> + <li><a href="https://huggingface.co/meta-llama/Llama-2-7b-hf"><code>meta-llama/Llama-2-7b-hf</code></a></li> + <li><a href="https://huggingface.co/meta-llama/Llama-2-70b-chat-hf"><code>meta-llama/Llama-2-70b-chat-hf</code></a></li> + <li><a href="https://huggingface.co/meta-llama/Llama-2-70b-hf"><code>meta-llama/Llama-2-70b-hf</code></a></li> + <li><a href="https://huggingface.co/microsoft/Llama2-7b-WhoIsHarryPotter"><code>microsoft/Llama2-7b-WhoIsHarryPotter</code></a></li> + </ul> + </td> + </tr> + <tr> + <!-- <td><code>LlamaForCausalLM</code></td> --> + <td>OpenLLaMA</td> + <td> + <ul> + <li><a href="https://huggingface.co/openlm-research/open_llama_13b"><code>openlm-research/open_llama_13b</code></a></li> + <li><a href="https://huggingface.co/openlm-research/open_llama_3b"><code>openlm-research/open_llama_3b</code></a></li> + <li><a href="https://huggingface.co/openlm-research/open_llama_3b_v2"><code>openlm-research/open_llama_3b_v2</code></a></li> + <li><a href="https://huggingface.co/openlm-research/open_llama_7b"><code>openlm-research/open_llama_7b</code></a></li> + <li><a href="https://huggingface.co/openlm-research/open_llama_7b_v2"><code>openlm-research/open_llama_7b_v2</code></a></li> + </ul> + </td> + </tr> + <tr> + <!-- <td><code>LlamaForCausalLM</code></td> --> + <td>TinyLlama</td> + <td> + <ul> + <li><a href="https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0"><code>TinyLlama/TinyLlama-1.1B-Chat-v1.0</code></a></li> + </ul> + </td> + </tr> + <tr> + <td rowspan="3"><code>MistralForCausalLM</code></td> + <td>Mistral</td> + <td> + <ul> + <li><a href="https://huggingface.co/mistralai/Mistral-7B-v0.1"><code>mistralai/Mistral-7B-v0.1</code></a></li> + </ul> + </td> + </tr> + <tr> + <!-- <td><code>MistralForCausalLM</code></td> --> + <td>Notus</td> + <td> + <ul> + <li><a href="https://huggingface.co/argilla/notus-7b-v1"><code>argilla/notus-7b-v1</code></a></li> + </ul> + </td> + </tr> + <tr> + <!-- <td><code>MistralForCausalLM</code></td> --> + <td>Zephyr </td> + <td> + <ul> + <li><a href="https://huggingface.co/HuggingFaceH4/zephyr-7b-beta"><code>HuggingFaceH4/zephyr-7b-beta</code></a></li> + </ul> + </td> + </tr> + <tr> + <td><code>PhiForCausalLM</code></td> + <td>Phi</td> + <td> + <ul> + <li><a href="https://huggingface.co/microsoft/phi-2"><code>microsoft/phi-2</code></a></li> + <li><a href="https://huggingface.co/microsoft/phi-1_5"><code>microsoft/phi-1_5</code></a></li> + </ul> + </td> + </tr> + <tr> + <td><code>QWenLMHeadModel</code></td> + <td>Qwen</td> + <td> + <ul> + <li><a href="https://huggingface.co/Qwen/Qwen-7B-Chat"><code>Qwen/Qwen-7B-Chat</code></a></li> + <li><a href="https://huggingface.co/Qwen/Qwen-7B-Chat-Int4"><code>Qwen/Qwen-7B-Chat-Int4</code></a></li> + <li><a href="https://huggingface.co/Qwen/Qwen1.5-7B-Chat"><code>Qwen/Qwen1.5-7B-Chat</code></a></li> + <li><a 
href="https://huggingface.co/Qwen/Qwen1.5-7B-Chat-GPTQ-Int4"><code>Qwen/Qwen1.5-7B-Chat-GPTQ-Int4</code></a></li> + </ul> + </td> + </tr> + </tbody> +</table> + + +The pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature. The model is required to have the following inputs after the conversion: +1. `input_ids` contains the tokens. +2. `attention_mask` is filled with `1`. +3. `beam_idx` selects beams. +4. `position_ids` (optional) encodes a position of currently generating token in the sequence and a single `logits` output. + +> [!NOTE] +> Models should belong to the same family and have the same tokenizers. + +## Text 2 image models + +<table> + <tbody style="vertical-align: top;"> + <tr> + <th>Architecture</th> + <th>Example HuggingFace Models</th> + </tr> + <tr> + <td><code>Latent Consistency Model</code></td> + <td> + <ul> + <li><a href="https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7"><code>SimianLuo/LCM_Dreamshaper_v7</code></a></li> + </ul> + </td> + </tr> + <tr> + <td><code>Stable Diffusion</code></td> + <td> + <ul> + <li><a href="https://huggingface.co/botp/stable-diffusion-v1-5"><code>botp/stable-diffusion-v1-5</code></a></li> + <li><a href="https://huggingface.co/dreamlike-art/dreamlike-anime-1.0"><code>dreamlike-art/dreamlike-anime-1.0</code></a></li> + <li><a href="https://huggingface.co/stabilityai/stable-diffusion-2"><code>stabilityai/stable-diffusion-2</code></a></li> + <li><a href="https://huggingface.co/stabilityai/stable-diffusion-2-1"><code>stabilityai/stable-diffusion-2-1</code></a></li> + </ul> + </td> + </tr> + <tr> + <td><code>Stable Diffusion XL</code></td> + <td> + <ul> + <li><a href="https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9"><code>stabilityai/stable-diffusion-xl-base-0.9</code></a></li> + <li><a href="https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0"><code>stabilityai/stable-diffusion-xl-base-1.0</code></a></li> + </ul> + </td> + </tr> + </tbody> +</table> + +## Visual language models + +<table> + <tbody style="vertical-align: top;"> + <tr> + <th>Architecture</th> + <th>Models</th> + <th>Example HuggingFace Models</th> + </tr> + <tr> + <td>LLaVA</td> + <td><code>LLaVA-v1.5</code></td> + <td> + <ul> + <li><a href="https://huggingface.co/llava-hf/llava-1.5-7b-hf"><code>llava-hf/llava-1.5-7b-hf</code></a></li> + </ul> + </td> + </tr> + <tr> + <td>MiniCPMV</td> + <td><code>MiniCPM-V-2_6</code></td> + <td> + <ul> + <li><a href="https://huggingface.co/openbmb/MiniCPM-V-2_6"><code>openbmb/MiniCPM-V-2_6</code></a></li> + </ul> + </td> + </tr> + </tbody> +</table> + +Some models may require access request submission on the Hugging Face page to be downloaded. + +If https://huggingface.co/ is down, the conversion step won't be able to download the models. 
diff --git a/src/docs/beam_idx-drop.gif b/src/docs/beam_idx-drop.gif new file mode 100644 index 0000000000..1c0f596d06 --- /dev/null +++ b/src/docs/beam_idx-drop.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:701839c28ac1e05c1c9e23823c74a10149a343210192e51df36e563ff6e257e4 +size 5700875 diff --git a/src/docs/beam_idx-fork.gif b/src/docs/beam_idx-fork.gif new file mode 100644 index 0000000000..6255595bfd --- /dev/null +++ b/src/docs/beam_idx-fork.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:292753b30a2153c92cedf16672ba182a851ec30c95c309cdaca13173f00fe700 +size 6062552 diff --git a/src/docs/stateful.jpg b/src/docs/stateful.jpg new file mode 100644 index 0000000000..11e7f68e23 --- /dev/null +++ b/src/docs/stateful.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6fb5ab9990c845eef8847bdf76799fcaefe0a9afa10fb9d07f6df4394a9e2ad +size 129471 diff --git a/src/docs/stateless.jpg b/src/docs/stateless.jpg new file mode 100644 index 0000000000..0e8823e77e --- /dev/null +++ b/src/docs/stateless.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20904ff7a8793359b978cfcdc85c482e0764291af17b572936955f586e202ea9 +size 113440 diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt new file mode 100644 index 0000000000..bf76f34f4f --- /dev/null +++ b/src/python/CMakeLists.txt @@ -0,0 +1,80 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +include(FetchContent) + +FetchContent_Declare( + pybind11 + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.13.5.tar.gz + URL_HASH SHA256=b1e209c42b3a9ed74da3e0b25a4f4cd478d89d5efbb48f04b277df427faf6252 +) +FetchContent_GetProperties(pybind11) +# search for FindPython3.cmake instead of legacy modules +set(PYBIND11_FINDPYTHON ON) + +if(NOT pybind11_POPULATED) + FetchContent_Populate(pybind11) + add_subdirectory(${pybind11_SOURCE_DIR} ${pybind11_BINARY_DIR}) +endif() + +pybind11_add_module(py_generate_pipeline py_vlm_pipeline.cpp py_generate_pipeline.cpp py_whisper_pipeline.cpp utils.cpp) +target_link_libraries(py_generate_pipeline PRIVATE openvino::genai) +set_target_properties(py_generate_pipeline PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" +) +file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py" DESTINATION "${CMAKE_BINARY_DIR}/openvino_genai/") + +configure_file("${OpenVINOGenAI_SOURCE_DIR}/cmake/templates/__version__.py.in" + "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" @ONLY) + +if(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME) + # RPATH for wheel is mandatory to find openvino_genai library. It + # must be forced because GenAI may be built with OpenVINO targeting + # archive. Such OpenVINO configurations sets + # CMAKE_SKIP_INSTALL_RPATH to ON because it relyes on setupvars.sh. 
+ set(CMAKE_SKIP_INSTALL_RPATH OFF) +endif() +# setting RPATH / LC_RPATH depending on platform +if(LINUX) + # to find libopenvino_genai.so in the same folder + set(rpaths "$ORIGIN") +elseif(APPLE) + # to find libopenvino_genai.dylib in the same folder + set(rpaths "@loader_path") + if(DEFINED PY_BUILD_CMAKE_PACKAGE_NAME) + # in case we build pip package, we need to refer to libopenvino.dylib from 'openvino' package + list(APPEND rpaths "@loader_path/../openvino/libs") + endif() +endif() + +if(rpaths) + set_target_properties(py_generate_pipeline PROPERTIES INSTALL_RPATH "${rpaths}") +endif() + +install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/__init__.py" + "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" + DESTINATION python/openvino_genai + COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) +install(TARGETS py_generate_pipeline + LIBRARY DESTINATION python/openvino_genai + COMPONENT pygenai_${Python3_VERSION_MAJOR}_${Python3_VERSION_MINOR}) + +install(FILES "${CMAKE_BINARY_DIR}/openvino_genai/__version__.py" + DESTINATION openvino_genai + COMPONENT wheel_genai + EXCLUDE_FROM_ALL) + +install(FILES "${OpenVINOGenAI_SOURCE_DIR}/LICENSE" + "${OpenVINOGenAI_SOURCE_DIR}/third-party-programs.txt" + "${OpenVINOGenAI_SOURCE_DIR}/SECURITY.md" + DESTINATION "${PY_BUILD_CMAKE_PACKAGE_NAME}-${PY_BUILD_CMAKE_PACKAGE_VERSION}.dist-info" + COMPONENT wheel_genai + EXCLUDE_FROM_ALL) + +# wheel_genai component is used for wheel generation in pyproject.toml. +# Exclude wheel_genai from normal packaging because there's pygenai_X_Y component for that. +install(TARGETS openvino_genai py_generate_pipeline + LIBRARY DESTINATION openvino_genai COMPONENT wheel_genai EXCLUDE_FROM_ALL + RUNTIME DESTINATION openvino_genai COMPONENT wheel_genai EXCLUDE_FROM_ALL) diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py new file mode 100644 index 0000000000..879dfc8262 --- /dev/null +++ b/src/python/openvino_genai/__init__.py @@ -0,0 +1,33 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""openvino genai module namespace, exposing pipelines and configs to create these pipelines.""" + +import openvino # add_dll_directory for openvino lib +import os +from .__version__ import __version__ + + +if hasattr(os, "add_dll_directory"): + os.add_dll_directory(os.path.dirname(__file__)) + +from .py_generate_pipeline import ( + ContinuousBatchingPipeline, + DecodedResults, + EncodedResults, + GenerationConfig, + GenerationResult, + LLMPipeline, + VLMPipeline, + PerfMetrics, + RawPerfMetrics, + SchedulerConfig, + StopCriteria, + StreamerBase, + TokenizedInputs, + Tokenizer, + WhisperGenerationConfig, + WhisperPipeline, + CacheEvictionConfig, + AggregationMode, +) diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp new file mode 100644 index 0000000000..b636253e33 --- /dev/null +++ b/src/python/py_generate_pipeline.cpp @@ -0,0 +1,769 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <filesystem> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <pybind11/stl_bind.h> +#include <pybind11/functional.h> +#include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/llm_pipeline.hpp" +#include <openvino/runtime/auto/properties.hpp> +#include "../cpp/src/tokenizers_path.hpp" + +#include "./utils.hpp" + +namespace py = pybind11; +namespace utils = ov::genai::pybind::utils; +using ov::genai::ChatHistory; +using 
ov::genai::ContinuousBatchingPipeline; +using ov::genai::DecodedResults; +using ov::genai::EncodedInputs; +using ov::genai::EncodedResults; +using ov::genai::GenerationConfig; +using ov::genai::GenerationResult; +using ov::genai::LLMPipeline; +using ov::genai::MeanStdPair; +using ov::genai::OptionalGenerationConfig; +using ov::genai::PerfMetrics; +using ov::genai::PipelineMetrics; +using ov::genai::RawPerfMetrics; +using ov::genai::SchedulerConfig; +using ov::genai::CacheEvictionConfig; +using ov::genai::AggregationMode; +using ov::genai::StopCriteria; +using ov::genai::StreamerBase; +using ov::genai::StreamerVariant; +using ov::genai::StringInputs; +using ov::genai::TokenizedInputs; +using ov::genai::Tokenizer; + +template <typename T, typename U> +std::vector<float> get_ms(const T& instance, U T::*member) { + // Converts c++ duration to float so that it can be used in Python. + std::vector<float> res; + const auto& durations = instance.*member; + res.reserve(durations.size()); + std::transform(durations.begin(), durations.end(), std::back_inserter(res), + [](const auto& duration) { return duration.count(); }); + return res; +} + +void init_whisper_pipeline(py::module_& m); +void init_vlm_pipeline(py::module_& m); + +namespace { + +auto generate_docstring = R"( + Generates sequences or tokens for LLMs. If input is a string or list of strings then resulting sequences will be already detokenized. + + :param inputs: inputs in the form of string, list of strings or tokenized input_ids + :type inputs: str, List[str], ov.genai.TokenizedInputs, or ov.Tensor + + :param generation_config: generation_config + :type generation_config: GenerationConfig or a Dict + + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped + :type : Callable[[str], bool], ov.genai.StreamerBase + + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. + :type : Dict + + :return: return results in encoded, or decoded form depending on inputs type + :rtype: DecodedResults, EncodedResults, str +)"; + +auto decoded_results_docstring = R"( + Structure to store resulting batched text outputs and scores for each batch. + The first num_return_sequences elements correspond to the first batch element. + + Parameters: + texts: vector of resulting sequences. + scores: scores for each sequence. + metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. +)"; + +auto encoded_results_docstring = R"( + Structure to store resulting batched tokens and scores for each batch sequence. + The first num_return_sequences elements correspond to the first batch element. + In the case if results decoded with beam search and random sampling scores contain + sum of logarithmic probabilities for each token in the sequence. In the case + of greedy decoding scores are filled with zeros. + + Parameters: + tokens: sequence of resulting tokens. + scores: sum of logarithmic probabilities of all tokens in the sequence. + metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. +)"; + +auto generation_config_docstring = R"( + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group + and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will + be used while greedy and beam search parameters will not affect decoding at all. 
+ + Parameters: + max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. + max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + ignore_eos: if set to true, then generation will not stop even if <eos> token is met. + eos_token_id: token_id of <eos> (end of sentence) + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. Ignored for non continuous batching. + stop_strings: list of strings that will cause pipeline to stop generating further tokens. Ignored for non continuous batching. + include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) + stop_token_ids: list of tokens that will cause pipeline to stop generating further tokens. Ignored for non continuous batching. + + Beam search specific parameters: + num_beams: number of beams for beam search. 1 disables beam search. + num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. + diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time. + length_penalty: exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to + the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log + likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, while + length_penalty < 0.0 encourages shorter sequences. + num_return_sequences: the number of sequences to return for grouped beam search decoding. + no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; + "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). + + Random sampling parameters: + temperature: the value used to modulate token probabilities for random sampling. + top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. + top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. + do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. +)"; + +auto scheduler_config_docstring = R"( + SchedulerConfig to construct ContinuousBatchingPipeline + + Parameters: + max_num_batched_tokens: a maximum number of tokens to batch (in constrast to max_batch_size which combines + independent sequences, we consider total amount of tokens in a batch). + num_kv_blocks: total number of KV blocks available to scheduler logic. + cache_size: total size of KV cache in GB. + block_size: block size for KV cache. 
+ dynamic_split_fuse: whether to split prompt / generate to different scheduling phases. + + vLLM-like settings: + max_num_seqs: max number of scheduled sequences (you can think of it as "max batch size"). + enable_prefix_caching: Enable caching of KV-blocks. + When turned on all previously calculated KV-caches are kept in memory for future usages. + KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released. + This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. + When turned off only the KV-cache required for batch calculation is kept in memory and + when a sequence has finished generation its cache is released. +)"; + +auto generation_result_docstring = R"( + GenerationResult stores resulting batched tokens and scores. + + Parameters: + request_id: obsolete when handle API is approved as handle will connect results with prompts. + generation_ids: in a generic case we have multiple generation results per initial prompt + depending on sampling parameters (e.g. beam search or parallel sampling). + scores: scores. + status: status of generation. The following values are possible: + RUNNING = 0 - Default status for ongoing generation. + FINISHED = 1 - Status set when generation has been finished. + IGNORED = 2 - Status set when generation ran into an out-of-memory condition and could not be continued. + DROPPED_BY_PIPELINE = 3 - Currently not used, TODO: implement abort functionality. + DROPPED_BY_HANDLE = 4 - Status set when generation handle is dropped. + +)"; + +auto stop_criteria_docstring = R"( + StopCriteria controls the stopping condition for grouped beam search. + + The following values are possible: + "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates. + "openvino_genai.StopCriteria.HEURISTIC" stops when it is unlikely to find better candidates. + "openvino_genai.StopCriteria.NEVER" stops when there cannot be better candidates. +)"; + +auto streamer_base_docstring = R"( + Base class for streamers. In order to use, inherit from this class and implement the put and end methods. +)"; + +auto tokenized_inputs_docstring = R"( + Structure to aggregate inputs to the model. + + Parameters: + input_ids: numerical token IDs from the tokenizer + attention_mask: indicates which tokens are attended to +)"; + +auto raw_perf_metrics_docstring = R"( + Structure with raw performance metrics for each generation before any statistics are calculated. + + :param generate_durations: Durations for each generate call in microseconds. + :type generate_durations: List[MicroSeconds] + + :param tokenization_durations: Durations for the tokenization process in microseconds. + :type tokenization_durations: List[MicroSeconds] + + :param detokenization_durations: Durations for the detokenization process in microseconds. + :type detokenization_durations: List[MicroSeconds] + + :param m_times_to_first_token: Times to the first token for each call in microseconds. + :type m_times_to_first_token: List[MicroSeconds] + + :param m_new_token_times: Time points for each new token generated. + :type m_new_token_times: List[TimePoint] + + :param m_batch_sizes: Batch sizes for each generate call. + :type m_batch_sizes: List[int] + + :param m_durations: Total durations for each generate call in microseconds. + :type m_durations: List[MicroSeconds] + + :param num_generated_tokens: Total number of tokens generated. + :type num_generated_tokens: int + + :param num_input_tokens: Total number of tokens in the input prompt.
+ :type num_input_tokens: int +)"; + +auto perf_metrics_docstring = R"( + Holds performance metrics for each generate call. + + PerfMetrics holds fields with mean and standard deviations for the following metrics: + - Time To the First Token (TTFT), ms + - Time per Output Token (TPOT), ms/token + - Generate total duration, ms + - Tokenization duration, ms + - Detokenization duration, ms + - Throughput, tokens/s + + Additional fields include: + - Load time, ms + - Number of generated tokens + - Number of tokens in the input prompt + + Preferable way to access values is via get functions. Getters calculate mean and std values from raw_metrics and return pairs. + If mean and std were already calculated, getters return cached values. + + :param get_load_time: Returns the load time in milliseconds. + :type get_load_time: float + + :param get_num_generated_tokens: Returns the number of generated tokens. + :type get_num_generated_tokens: int + + :param get_num_input_tokens: Returns the number of tokens in the input prompt. + :type get_num_input_tokens: int + + :param get_ttft: Returns the mean and standard deviation of TTFT in milliseconds. + :type get_ttft: MeanStdPair + + :param get_tpot: Returns the mean and standard deviation of TPOT in milliseconds. + :type get_tpot: MeanStdPair + + :param get_throughput: Returns the mean and standard deviation of throughput in tokens per second. + :type get_throughput: MeanStdPair + + :param get_generate_duration: Returns the mean and standard deviation of generate durations in milliseconds. + :type get_generate_duration: MeanStdPair + + :param get_tokenization_duration: Returns the mean and standard deviation of tokenization durations in milliseconds. + :type get_tokenization_duration: MeanStdPair + + :param get_detokenization_duration: Returns the mean and standard deviation of detokenization durations in milliseconds. + :type get_detokenization_duration: MeanStdPair + + :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. + :type raw_metrics: RawPerfMetrics +)"; + +auto pipeline_metrics_docstring = R"( + Contains general pipeline metrics, either aggregated throughout the lifetime of the generation pipeline + or measured at the previous generation step. + + :param requests: Number of requests to be processed by the pipeline. + :type requests: int + + :param scheduled_requests: Number of requests that were scheduled for processing at the previous step of the pipeline. + :type scheduled_requests: int + + :param cache_usage: Percentage of KV cache usage in the last generation step. + :type cache_usage: float + + :param max_cache_usage: Max KV cache usage during the lifetime of the pipeline in % + :type max_cache_usage: float + + + :param avg_cache_usage: Running average of the KV cache usage (in %) during the lifetime of the pipeline, with max window size of 1000 steps + :type avg_cache_usage: float +)"; + +auto cache_eviction_config_docstring = R"( + Configuration struct for the cache eviction algorithm. + :param start_size: Number of tokens in the *beginning* of KV cache that should be retained in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for this pipeline. + :type start_size: int + + :param recent_size: Number of tokens in the *end* of KV cache that should be retained in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for this pipeline. 
+ :type recent_size: int + + :param max_cache_size: Maximum number of tokens that should be kept in the KV cache. The evictable block area will be located between the "start" and "recent" blocks and its size will be calculated as (`max_cache_size` - `start_size` - `recent_size`). Must be non-zero, larger than (`start_size` + `recent_size`), and a multiple of the KV cache block size for this pipeline. Note that since only the completely filled blocks are evicted, the actual maximum per-sequence KV cache size in tokens may be up to (`max_cache_size` + `SchedulerConfig.block_size - 1`). + :type max_cache_size: int + + :param aggregation_mode: The mode used to compute the importance of tokens for eviction + :type aggregation_mode: openvino_genai.AggregationMode +)"; + +py::list handle_utf8_results(const std::vector<std::string>& decoded_res) { + // pybind11 decodes strings similar to Pythons's + // bytes.decode('utf-8'). It raises if the decoding fails. + // generate() may return incomplete Unicode points if max_new_tokens + // was reached. Replace such points with � instead of raising an exception + py::list res; + for (const auto s: decoded_res) { + PyObject* py_s = PyUnicode_DecodeUTF8(s.data(), s.length(), "replace"); + res.append(py::reinterpret_steal<py::object>(py_s)); + } + return res; +} + +py::object call_common_generate( + LLMPipeline& pipe, + const std::variant<ov::Tensor, TokenizedInputs, std::string, std::vector<std::string>>& inputs, + const OptionalGenerationConfig& config, + const utils::PyBindStreamerVariant& py_streamer, + const py::kwargs& kwargs +) { + auto updated_config = ov::genai::pybind::utils::update_config_from_kwargs(config, kwargs); + py::object results; + EncodedInputs tensor_data; + StreamerVariant streamer = ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer); + + // Call suitable generate overload for each type of input. + std::visit(utils::overloaded { + [&](ov::Tensor ov_tensor) { + results = py::cast(pipe.generate(ov_tensor, updated_config, streamer)); + }, + [&](TokenizedInputs tokenized_input) { + results = py::cast(pipe.generate(tokenized_input, updated_config, streamer)); + }, + [&](std::string string_input) { + DecodedResults res = pipe.generate(string_input, updated_config, streamer); + // If input was a string return a single string otherwise return DecodedResults. + if (updated_config.has_value() && (*updated_config).num_return_sequences == 1) { + results = py::cast<py::object>(handle_utf8_results(res.texts)[0]); + } else { + results = py::cast(res); + } + }, + [&](std::vector<std::string> string_input) { + // For DecodedResults texts getter already handles utf8 decoding. 
+ results = py::cast(pipe.generate(string_input, updated_config, streamer)); + }}, + inputs); + + return results; +} + +class ConstructableStreamer: public StreamerBase { + bool put(int64_t token) override { + PYBIND11_OVERRIDE_PURE( + bool, // Return type + StreamerBase, // Parent class + put, // Name of function in C++ (must match Python name) + token // Argument(s) + ); + } + void end() override { + PYBIND11_OVERRIDE_PURE(void, StreamerBase, end); + } +}; + +std::ostream& operator << (std::ostream& stream, const GenerationResult& generation_result) { + stream << generation_result.m_request_id << std::endl; + const bool has_scores = !generation_result.m_scores.empty(); + for (size_t i = 0; i < generation_result.m_generation_ids.size(); ++i) { + stream << "{ "; + if (has_scores) + stream << generation_result.m_scores[i] << ", "; + stream << generation_result.m_generation_ids[i] << " }" << std::endl; + } + return stream << std::endl; +} + +} // namespace + + +PYBIND11_MODULE(py_generate_pipeline, m) { + m.doc() = "Pybind11 binding for LLM Pipeline"; + + py::class_<LLMPipeline>(m, "LLMPipeline", "This class is used for generation with LLMs") + .def(py::init([]( + const std::string& model_path, + const std::string& device, + const std::map<std::string, py::object>& config + ) { + ScopedVar env_manager(utils::ov_tokenizers_module_path()); + return std::make_unique<LLMPipeline>(model_path, device, utils::properties_to_any_map(config)); + }), + py::arg("model_path"), "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", + py::arg("device") = "CPU", "device on which inference will be done", + py::arg("config") = ov::AnyMap({}), "openvino.properties map", + R"( + LLMPipeline class constructor. + model_path (str): Path to the model file. + device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. + Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. + )") + + .def(py::init([]( + const std::string& model_path, + const Tokenizer& tokenizer, + const std::string& device, + const std::map<std::string, py::object>& config + ) { + ScopedVar env_manager(utils::ov_tokenizers_module_path()); + return std::make_unique<LLMPipeline>(model_path, tokenizer, device, utils::properties_to_any_map(config)); + }), + py::arg("model_path"), + py::arg("tokenizer"), + py::arg("device") = "CPU", + py::arg("config") = ov::AnyMap({}), "openvino.properties map", + R"( + LLMPipeline class constructor for manualy created openvino_genai.Tokenizer. + model_path (str): Path to the model file. + tokenizer (openvino_genai.Tokenizer): tokenizer object. + device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. + Add {"scheduler_config": ov_genai.SchedulerConfig} to config properties to create continuous batching pipeline. 
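A hedged sketch of how these two constructors might be used from Python follows; the directory names are placeholders, and routing to the continuous batching backend via the "scheduler_config" property follows the hint in the docstring above.

```python
import openvino_genai as ov_genai

model_dir = "./model_dir"  # placeholder: folder with openvino_model.xml and tokenizer/detokenizer models

# Constructor with an externally created tokenizer (e.g. kept in a different folder).
tokenizer = ov_genai.Tokenizer(model_dir)
pipe = ov_genai.LLMPipeline(model_dir, tokenizer, "CPU")

# Passing a SchedulerConfig through the properties map selects the continuous batching path.
cb_pipe = ov_genai.LLMPipeline(model_dir, "CPU", {"scheduler_config": ov_genai.SchedulerConfig()})

# Chat helpers keep the conversation history between calls.
pipe.start_chat()
print(pipe.generate("Hi, who are you?", max_new_tokens=50))
print(pipe.generate("What can you do?", max_new_tokens=50))
pipe.finish_chat()
```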
+ )") + + .def( + "generate", + [](LLMPipeline& pipe, + const std::variant<ov::Tensor, TokenizedInputs, std::string, std::vector<std::string>>& inputs, + const OptionalGenerationConfig& generation_config, + const utils::PyBindStreamerVariant& streamer, + const py::kwargs& kwargs + ) { + return call_common_generate(pipe, inputs, generation_config, streamer, kwargs); + }, + py::arg("inputs"), "Input string, or list of string or encoded tokens", + py::arg("generation_config") = std::nullopt, "generation_config", + py::arg("streamer") = std::monostate(), "streamer", + (generate_docstring + std::string(" \n ") + generation_config_docstring).c_str() + ) + + .def( + "__call__", + [](LLMPipeline& pipe, + const std::variant<ov::Tensor, TokenizedInputs, std::string, std::vector<std::string>>& inputs, + const OptionalGenerationConfig& generation_config, + const utils::PyBindStreamerVariant& streamer, + const py::kwargs& kwargs + ) { + return call_common_generate(pipe, inputs, generation_config, streamer, kwargs); + }, + py::arg("inputs"), "Input string, or list of string or encoded tokens", + py::arg("generation_config") = std::nullopt, "generation_config", + py::arg("streamer") = std::monostate(), "streamer", + (generate_docstring + std::string(" \n ") + generation_config_docstring).c_str() + ) + + .def("get_tokenizer", &LLMPipeline::get_tokenizer) + .def("start_chat", &LLMPipeline::start_chat, py::arg("system_message") = "") + .def("finish_chat", &LLMPipeline::finish_chat) + .def("get_generation_config", &LLMPipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &LLMPipeline::set_generation_config); + + // Binding for Tokenizer + py::class_<ov::genai::Tokenizer>(m, "Tokenizer", + R"(openvino_genai.Tokenizer object is used to initialize Tokenizer + if it's located in a different path than the main model.)") + + .def(py::init([](const std::string& tokenizer_path, const std::map<std::string, py::object>& plugin_config) { + ScopedVar env_manager(utils::ov_tokenizers_module_path()); + return std::make_unique<ov::genai::Tokenizer>(tokenizer_path, utils::properties_to_any_map(plugin_config)); + }), py::arg("tokenizer_path"), py::arg("plugin_config") = ov::AnyMap({})) + + .def("encode", [](Tokenizer& tok, std::vector<std::string>& prompts, bool add_special_tokens) { + ov::AnyMap tokenization_params; + tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens; + return tok.encode(prompts, tokenization_params); + }, + py::arg("prompts"), + py::arg("add_special_tokens") = true, + R"(Encodes a list of prompts into tokenized inputs.)") + + .def("encode", [](Tokenizer& tok, const std::string prompt, bool add_special_tokens) { + ov::AnyMap tokenization_params; + tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens; + return tok.encode(prompt, tokenization_params); + }, + py::arg("prompt"), py::arg("add_special_tokens") = true, + R"(Encodes a single prompt into tokenized input.)") + + .def( + "decode", + [](Tokenizer& tok, std::vector<int64_t>& tokens) -> py::str { + return handle_utf8_results({tok.decode(tokens)})[0]; + }, + py::arg("tokens"), + R"(Decode a sequence into a string prompt.)" + ) + + .def( + "decode", + [](Tokenizer& tok, ov::Tensor& tokens) -> py::list { + return handle_utf8_results(tok.decode(tokens)); + }, + py::arg("tokens"), + R"(Decode tensor into a list of string prompts.)") + + .def( + "decode", + [](Tokenizer& tok, std::vector<std::vector<int64_t>>& tokens) -> py::list{ + return 
handle_utf8_results(tok.decode(tokens)); + }, + py::arg("tokens"), + R"(Decode a batch of tokens into a list of string prompt.)") + + .def("apply_chat_template", [](Tokenizer& tok, + ChatHistory history, + bool add_generation_prompt, + const std::string& chat_template) { + return tok.apply_chat_template(history, add_generation_prompt, chat_template); + }, + py::arg("history"), + py::arg("add_generation_prompt"), + py::arg("chat_template") = "", + R"(Embeds input prompts with special tags for a chat scenario.)") + + .def( + "set_chat_template", &Tokenizer::set_chat_template, + py::arg("chat_template"), "The new template to override with.", + "Override a chat_template read from tokenizer_config.json." + ) + + .def("get_pad_token_id", &Tokenizer::get_pad_token_id) + .def("get_bos_token_id", &Tokenizer::get_bos_token_id) + .def("get_eos_token_id", &Tokenizer::get_eos_token_id) + .def("get_pad_token", &Tokenizer::get_pad_token) + .def("get_bos_token", &Tokenizer::get_bos_token) + .def("get_eos_token", &Tokenizer::get_eos_token); + + // Binding for StopCriteria + py::enum_<StopCriteria>(m, "StopCriteria", stop_criteria_docstring) + .value("EARLY", StopCriteria::EARLY) + .value("HEURISTIC", StopCriteria::HEURISTIC) + .value("NEVER", StopCriteria::NEVER) + .export_values(); + + // Binding for GenerationConfig + py::class_<GenerationConfig>(m, "GenerationConfig", generation_config_docstring) + .def(py::init<std::string>(), py::arg("json_path"), "path where generation_config.json is stored") + .def(py::init([](py::kwargs kwargs) { return *ov::genai::pybind::utils::update_config_from_kwargs(GenerationConfig(), kwargs); })) + .def_readwrite("max_new_tokens", &GenerationConfig::max_new_tokens) + .def_readwrite("max_length", &GenerationConfig::max_length) + .def_readwrite("ignore_eos", &GenerationConfig::ignore_eos) + .def_readwrite("min_new_tokens", &GenerationConfig::min_new_tokens) + .def_readwrite("num_beam_groups", &GenerationConfig::num_beam_groups) + .def_readwrite("num_beams", &GenerationConfig::num_beams) + .def_readwrite("diversity_penalty", &GenerationConfig::diversity_penalty) + .def_readwrite("length_penalty", &GenerationConfig::length_penalty) + .def_readwrite("num_return_sequences", &GenerationConfig::num_return_sequences) + .def_readwrite("no_repeat_ngram_size", &GenerationConfig::no_repeat_ngram_size) + .def_readwrite("stop_criteria", &GenerationConfig::stop_criteria) + .def_readwrite("temperature", &GenerationConfig::temperature) + .def_readwrite("top_p", &GenerationConfig::top_p) + .def_readwrite("top_k", &GenerationConfig::top_k) + .def_readwrite("do_sample", &GenerationConfig::do_sample) + .def_readwrite("repetition_penalty", &GenerationConfig::repetition_penalty) + .def_readwrite("eos_token_id", &GenerationConfig::eos_token_id) + .def_readwrite("presence_penalty", &GenerationConfig::presence_penalty) + .def_readwrite("frequency_penalty", &GenerationConfig::frequency_penalty) + .def_readwrite("rng_seed", &GenerationConfig::rng_seed) + .def_readwrite("stop_strings", &GenerationConfig::stop_strings) + .def_readwrite("include_stop_str_in_output", &GenerationConfig::include_stop_str_in_output) + .def_readwrite("stop_token_ids", &GenerationConfig::stop_token_ids) + .def("set_eos_token_id", &GenerationConfig::set_eos_token_id) + .def("is_beam_search", &GenerationConfig::is_beam_search); + + py::class_<DecodedResults>(m, "DecodedResults", decoded_results_docstring) + .def(py::init<>()) + .def_property_readonly("texts", [](const DecodedResults &dr) { return handle_utf8_results(dr); }) + 
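The Tokenizer, chat-template and StopCriteria bindings above might be exercised roughly as follows; the tokenizer folder is a placeholder, and the list-of-dicts shape of the chat history is an assumption about how the ChatHistory binding is typically represented.

```python
import openvino_genai as ov_genai

tok = ov_genai.Tokenizer("./model_dir")  # placeholder folder with openvino_tokenizer/detokenizer models

# Round-trip a prompt: encode returns TokenizedInputs, decode accepts the resulting tensor.
encoded = tok.encode("Hello, OpenVINO!", add_special_tokens=True)
print(tok.decode(encoded.input_ids))

# Chat template handling, mirroring what start_chat()/generate() do internally.
history = [{"role": "user", "content": "Why is the sky blue?"}]  # assumed ChatHistory layout
prompt = tok.apply_chat_template(history, add_generation_prompt=True)

# StopCriteria selects the stopping condition for grouped beam search.
config = ov_genai.GenerationConfig(num_beams=4, max_new_tokens=32,
                                   stop_criteria=ov_genai.StopCriteria.HEURISTIC)
```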
.def_readonly("scores", &DecodedResults::scores) + .def_readonly("perf_metrics", &DecodedResults::perf_metrics) + .def("__str__", [](const DecodedResults &dr) -> py::str { + auto valid_utf8_strings = handle_utf8_results(dr); + py::str res; + if (valid_utf8_strings.size() == 1) + return valid_utf8_strings[0]; + + for (size_t i = 0; i < valid_utf8_strings.size() - 1; i++) { + res += py::str(std::to_string(dr.scores[i])) + py::str(": ") + valid_utf8_strings[i] + py::str("\n"); + } + res += py::str(std::to_string(dr.scores.back())) + py::str(": ") + valid_utf8_strings[valid_utf8_strings.size() - 1]; + return res; + }); + + py::class_<RawPerfMetrics>(m, "RawPerfMetrics", raw_perf_metrics_docstring) + .def(py::init<>()) + .def_property_readonly("generate_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::generate_durations); + }) + .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::tokenization_durations); + }) + .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::detokenization_durations); + }) + .def_property_readonly("m_times_to_first_token", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::m_times_to_first_token); + }) + .def_property_readonly("m_durations", [](const RawPerfMetrics &rw) { + return get_ms(rw, &RawPerfMetrics::m_durations); + }) + .def_readonly("m_batch_sizes", &RawPerfMetrics::m_batch_sizes); + + py::class_<MeanStdPair>(m, "MeanStdPair") + .def(py::init<>()) + .def_readonly("mean", &MeanStdPair::mean) + .def_readonly("std", &MeanStdPair::std) + .def("__iter__", [](const MeanStdPair &self) { + return py::make_iterator(&self.mean, &self.std + 1); + }, py::keep_alive<0, 1>()); // Keep object alive while the iterator is used; + + py::class_<PerfMetrics>(m, "PerfMetrics", perf_metrics_docstring) + .def(py::init<>()) + .def("get_load_time", &PerfMetrics::get_load_time) + .def("get_num_generated_tokens", &PerfMetrics::get_num_generated_tokens) + .def("get_num_input_tokens", &PerfMetrics::get_num_input_tokens) + .def("get_ttft", &PerfMetrics::get_ttft) + .def("get_tpot", &PerfMetrics::get_tpot) + .def("get_throughput", &PerfMetrics::get_throughput) + .def("get_generate_duration", &PerfMetrics::get_generate_duration) + .def("get_tokenization_duration", &PerfMetrics::get_tokenization_duration) + .def("get_detokenization_duration", &PerfMetrics::get_detokenization_duration) + .def("__add__", &PerfMetrics::operator+) + .def("__iadd__", &PerfMetrics::operator+=) + .def_readonly("raw_metrics", &PerfMetrics::raw_metrics); + + py::class_<PipelineMetrics>(m, "PipelineMetrics", pipeline_metrics_docstring) + .def(py::init<>()) + .def_readonly("requests", &PipelineMetrics::requests) + .def_readonly("scheduled_requests", &PipelineMetrics::scheduled_requests) + .def_readonly("cache_usage", &PipelineMetrics::cache_usage) + .def_readonly("avg_cache_usage", &PipelineMetrics::avg_cache_usage) + .def_readonly("max_cache_usage", &PipelineMetrics::max_cache_usage); + + py::class_<TokenizedInputs>(m, "TokenizedInputs") + .def(py::init<ov::Tensor, ov::Tensor>()) + .def_readwrite("input_ids", &TokenizedInputs::input_ids) + .def_readwrite("attention_mask", &TokenizedInputs::attention_mask); + + py::class_<EncodedResults>(m, "EncodedResults", encoded_results_docstring) + .def_readonly("tokens", &EncodedResults::tokens) + .def_readonly("scores", &EncodedResults::scores) + .def_readonly("perf_metrics", &EncodedResults::perf_metrics); 
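Since the getters above return MeanStdPair objects (which also unpack thanks to their __iter__ binding), here is a small sketch of reading metrics from a generate call; the path and prompt are placeholders.

```python
import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline("./model_dir", "CPU")  # placeholder path
result = pipe.generate(["What is OpenVINO?"], max_new_tokens=32)  # list input -> DecodedResults

metrics = result.perf_metrics
print(f"load time:  {metrics.get_load_time():.2f} ms")
print(f"ttft:       {metrics.get_ttft().mean:.2f} +/- {metrics.get_ttft().std:.2f} ms")
mean_tpot, std_tpot = metrics.get_tpot()  # MeanStdPair unpacks into (mean, std)
print(f"tpot:       {mean_tpot:.2f} ms/token")
print(f"throughput: {metrics.get_throughput().mean:.2f} tokens/s")

# Raw per-call measurements remain available for custom statistics.
print(metrics.raw_metrics.m_durations[:3])
```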
+ + py::class_<StreamerBase, ConstructableStreamer, std::shared_ptr<StreamerBase>>(m, "StreamerBase", streamer_base_docstring) // Change the holder form unique_ptr to shared_ptr + .def(py::init<>()) + .def("put", &StreamerBase::put, "Put is called every time new token is decoded. Returns a bool flag to indicate whether generation should be stoped, if return true generation stops") + .def("end", &StreamerBase::end, "End is called at the end of generation. It can be used to flush cache if your own streamer has one"); + + py::class_<GenerationResult>(m, "GenerationResult", generation_result_docstring) + .def(py::init<>()) + .def_readonly("m_request_id", &GenerationResult::m_request_id) + .def_property("m_generation_ids", + [](GenerationResult &r) -> py::list { + py::list res; + for (auto s: r.m_generation_ids) { + PyObject* py_s = PyUnicode_DecodeUTF8(s.data(), s.length(), "replace"); + res.append(py_s); + } + return res; + }, + [](GenerationResult &r, std::vector<std::string> &generation_ids) { + r.m_generation_ids = generation_ids; + }) + .def_readwrite("m_scores", &GenerationResult::m_scores) + .def("__repr__", + [](const GenerationResult &r) -> py::str{ + std::stringstream stream; + stream << "<py_continuous_batching.GenerationResult " << r << ">"; + std::string str = stream.str(); + PyObject* py_s = PyUnicode_DecodeUTF8(str.data(), str.length(), "replace"); + return py::reinterpret_steal<py::str>(py_s); + } + ) + .def("get_generation_ids", + [](GenerationResult &r) -> py::list { + py::list res; + for (auto s: r.m_generation_ids) { + PyObject* py_s = PyUnicode_DecodeUTF8(s.data(), s.length(), "replace"); + res.append(py_s); + } + return res; + }); + + py::class_<SchedulerConfig>(m, "SchedulerConfig", scheduler_config_docstring) + .def(py::init<>()) + .def_readwrite("max_num_batched_tokens", &SchedulerConfig::max_num_batched_tokens) + .def_readwrite("num_kv_blocks", &SchedulerConfig::num_kv_blocks) + .def_readwrite("cache_size", &SchedulerConfig::cache_size) + .def_readwrite("block_size", &SchedulerConfig::block_size) + .def_readwrite("dynamic_split_fuse", &SchedulerConfig::dynamic_split_fuse) + .def_readwrite("max_num_seqs", &SchedulerConfig::max_num_seqs) + .def_readwrite("enable_prefix_caching", &SchedulerConfig::enable_prefix_caching) + .def_readwrite("use_cache_eviction", &SchedulerConfig::use_cache_eviction) + .def_readwrite("cache_eviction_config", &SchedulerConfig::cache_eviction_config); + + py::class_<CacheEvictionConfig>(m, "CacheEvictionConfig", cache_eviction_config_docstring) + .def(py::init<>([](const size_t start_size, size_t recent_size, size_t max_cache_size, AggregationMode aggregation_mode) { + return CacheEvictionConfig{start_size, recent_size, max_cache_size, aggregation_mode}; }), + py::arg("start_size"), py::arg("recent_size"), py::arg("max_cache_size"), py::arg("aggregation_mode")) + .def_readwrite("aggregation_mode", &CacheEvictionConfig::aggregation_mode); + + // Binding for StopCriteria + py::enum_<AggregationMode>(m, "AggregationMode", + R"(Represents the mode of per-token score aggregation when determining least important tokens for eviction from cache + :param AggregationMode.SUM: In this mode the importance scores of each token will be summed after each step of generation + :param AggregationMode.NORM_SUM: Same as SUM, but the importance scores are additionally divided by the lifetime (in tokens generated) of a given token in cache)") + .value("SUM", AggregationMode::SUM) + .value("NORM_SUM", AggregationMode::NORM_SUM) + .export_values(); + + 
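Pulling the SchedulerConfig, CacheEvictionConfig and AggregationMode bindings together, here is a hedged configuration sketch; the block and cache sizes are illustrative only (they must respect the multiple-of-block-size rules described above) and the model path is a placeholder.

```python
import openvino_genai as ov_genai

scheduler = ov_genai.SchedulerConfig()
scheduler.cache_size = 2                 # total KV cache size in GB
scheduler.dynamic_split_fuse = True
scheduler.enable_prefix_caching = True

# Optional cache eviction; the sizes below assume a KV cache block size of 32.
scheduler.use_cache_eviction = True
scheduler.cache_eviction_config = ov_genai.CacheEvictionConfig(
    start_size=32, recent_size=128, max_cache_size=672,
    aggregation_mode=ov_genai.AggregationMode.NORM_SUM)

pipe = ov_genai.ContinuousBatchingPipeline("./model_dir", scheduler, "CPU")
configs = [ov_genai.GenerationConfig(max_new_tokens=32) for _ in range(2)]
for result in pipe.generate(["Hello", "What is OpenVINO?"], configs):
    print(result.m_generation_ids[0])
```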
py::class_<ContinuousBatchingPipeline>(m, "ContinuousBatchingPipeline", "This class is used for generation with LLMs with continuous batchig") + .def(py::init([](const std::string& model_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& llm_plugin_config, const std::map<std::string, py::object>& tokenizer_plugin_config) { + ScopedVar env_manager(utils::ov_tokenizers_module_path()); + return std::make_unique<ContinuousBatchingPipeline>(model_path, scheduler_config, device, utils::properties_to_any_map(llm_plugin_config), utils::properties_to_any_map(tokenizer_plugin_config)); + }), py::arg("model_path"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("llm_plugin_config") = ov::AnyMap({}), py::arg("tokenizer_plugin_config") = ov::AnyMap({})) + .def(py::init([](const std::string& model_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& plugin_config) { + ScopedVar env_manager(utils::ov_tokenizers_module_path()); + return std::make_unique<ContinuousBatchingPipeline>(model_path, tokenizer, scheduler_config, device, utils::properties_to_any_map(plugin_config)); + }), py::arg("model_path"), py::arg("tokenizer"), py::arg("scheduler_config"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap({})) + .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) + .def("get_config", &ContinuousBatchingPipeline::get_config) + .def("get_metrics", &ContinuousBatchingPipeline::get_metrics) + .def("add_request", py::overload_cast<uint64_t, const ov::Tensor&, const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request)) + .def("add_request", py::overload_cast<uint64_t, const std::string&, const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request)) + .def("step", &ContinuousBatchingPipeline::step) + .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) + .def( + "generate", + py::overload_cast<const std::vector<ov::Tensor>&, const std::vector<ov::genai::GenerationConfig>&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), + py::arg("input_ids"), + py::arg("sampling_params"), + py::arg("streamer") = std::monostate{} + ) + .def( + "generate", + py::overload_cast<const std::vector<std::string>&, const std::vector<ov::genai::GenerationConfig>&, const ov::genai::StreamerVariant&>(&ContinuousBatchingPipeline::generate), + py::arg("prompts"), + py::arg("sampling_params"), + py::arg("streamer") = std::monostate{} + ); + + // init whisper bindings + init_whisper_pipeline(m); + + // init vlm pipeline + init_vlm_pipeline(m); +} diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp new file mode 100644 index 0000000000..9b60e39705 --- /dev/null +++ b/src/python/py_vlm_pipeline.cpp @@ -0,0 +1,153 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + + +#include <filesystem> +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <pybind11/stl_bind.h> +#include <pybind11/functional.h> +#include "openvino/genai/visual_language/pipeline.hpp" +#include "../cpp/src/tokenizers_path.hpp" +#include "./utils.hpp" + +namespace py = pybind11; +namespace utils = ov::genai::pybind::utils; + + +auto vlm_generate_docstring = R"( + Generates sequences for VLMs. 
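As a usage sketch for the VLM bindings that follow, one plausible call pattern is shown below; the model folder and image file are placeholders, and wrapping the image as a uint8 ov.Tensor is an assumption since the expected layout is not spelled out in this patch.

```python
import numpy as np
import openvino as ov
import openvino_genai as ov_genai
from PIL import Image

# Placeholder image handling: a uint8 HWC array wrapped into an ov.Tensor.
image = ov.Tensor(np.array(Image.open("cat.png").convert("RGB"), dtype=np.uint8))

pipe = ov_genai.VLMPipeline("./vlm_model_dir", "CPU")  # placeholder path
config = ov_genai.GenerationConfig(max_new_tokens=64)

# Positional overload: prompt, list of images, generation config.
print(pipe.generate("Describe the image", [image], config))

# Keyword-only overload: parameters are picked out of **kwargs (image/images/generation_config/streamer).
print(pipe.generate("What colour is the animal?", image=image, generation_config=config))
```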
+ + :param prompt: input prompt + :type prompt: str + + :param images: list of images + :type inputs: List[ov.Tensor] + + :param generation_config: generation_config + :type generation_config: GenerationConfig or a Dict + + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped + :type : Callable[[str], bool], ov.genai.StreamerBase + + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. + :type : Dict + + :return: return results in decoded form + :rtype: DecodedResults +)"; + +auto vlm_generate_kwargs_docstring = R"( + Generates sequences for VLMs. + + :param prompt: input prompt + :type prompt: str + + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. + + Expected parameters list: + image: ov.Tensor - input image, + images: List[ov.Tensor] - input images, + generation_config: GenerationConfig, + streamer: Callable[[str], bool], ov.genai.StreamerBase - streamer either as a lambda with a boolean returning flag whether generation should be stopped + + :return: return results in decoded form + :rtype: DecodedResults +)"; + +py::object call_vlm_generate( + ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const std::vector<ov::Tensor>& images, + const ov::genai::GenerationConfig& generation_config, + const utils::PyBindStreamerVariant& py_streamer, + const py::kwargs& kwargs +) { + auto updated_config = *ov::genai::pybind::utils::update_config_from_kwargs(generation_config, kwargs); + ov::genai::StreamerVariant streamer = ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer); + + return py::cast(pipe.generate(prompt, images, updated_config, streamer)); +} + +py::object call_vlm_generate( + ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const py::kwargs& kwargs +) { + ov::AnyMap params = {}; + + for (const auto& item : kwargs) { + std::string key = py::cast<std::string>(item.first); + py::object value = py::cast<py::object>(item.second); + + if (key == "images") { + params.insert({ov::genai::images(std::move(py::cast<std::vector<ov::Tensor>>(item.second)))}); + } else if (key == "image") { + params.insert({ov::genai::image(std::move(py::cast<ov::Tensor>(item.second)))}); + } else if (key == "generation_config") { + params.insert({ov::genai::generation_config(std::move(py::cast<ov::genai::GenerationConfig>(item.second)))}); + } else if (key == "streamer") { + auto py_streamer = py::cast<utils::PyBindStreamerVariant>(value); + params.insert({ov::genai::streamer(std::move(ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer)))}); + + } else { + throw(std::invalid_argument("'" + key + "' is unexpected parameter name. 
" + "Use help(openvino_genai.VLMPipeline.generate) to get list of acceptable parameters.")); + } + } + + return py::cast(pipe.generate(prompt, params)); +} + +void init_vlm_pipeline(py::module_& m) { + py::class_<ov::genai::VLMPipeline>(m, "VLMPipeline", "This class is used for generation with VLMs") + .def(py::init([]( + const std::string& model_path, + const std::string& device, + const std::map<std::string, py::object>& config + ) { + ScopedVar env_manager(utils::ov_tokenizers_module_path()); + return std::make_unique<ov::genai::VLMPipeline>(model_path, device, utils::properties_to_any_map(config)); + }), + py::arg("model_path"), "folder with exported model files", + py::arg("device") = "CPU", "device on which inference will be done", + py::arg("config") = ov::AnyMap({}), "openvino.properties map" + R"( + VLMPipeline class constructor. + model_path (str): Path to the folder with exported model files. + device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. + )") + + .def("start_chat", &ov::genai::VLMPipeline::start_chat, py::arg("system_message") = "") + .def("finish_chat", &ov::genai::VLMPipeline::finish_chat) + .def("get_generation_config", &ov::genai::VLMPipeline::get_generation_config) + .def("set_generation_config", &ov::genai::VLMPipeline::set_generation_config) + .def( + "generate", + [](ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const std::vector<ov::Tensor>& images, + const ov::genai::GenerationConfig& generation_config, + const utils::PyBindStreamerVariant& streamer, + const py::kwargs& kwargs + ) { + return call_vlm_generate(pipe, prompt, images, generation_config, streamer, kwargs); + }, + py::arg("prompt"), "Input string", + py::arg("images"), "Input images", + py::arg("generation_config") = std::nullopt, "generation_config", + py::arg("streamer") = std::monostate(), "streamer", + (vlm_generate_docstring + std::string(" \n ")).c_str() + ) + .def( + "generate", + [](ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const py::kwargs& kwargs + ) { + return call_vlm_generate(pipe, prompt, kwargs); + }, + py::arg("prompt"), "Input string", + (vlm_generate_kwargs_docstring + std::string(" \n ")).c_str() + ); +} diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp new file mode 100644 index 0000000000..ecb6af9715 --- /dev/null +++ b/src/python/py_whisper_pipeline.cpp @@ -0,0 +1,313 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <pybind11/stl_bind.h> + +#include "../cpp/src/tokenizers_path.hpp" +#include "openvino/genai/whisper_generation_config.hpp" +#include "openvino/genai/whisper_pipeline.hpp" +#include "utils.hpp" + +namespace py = pybind11; +using ov::genai::DecodedResults; +using ov::genai::OptionalWhisperGenerationConfig; +using ov::genai::RawSpeechInput; +using ov::genai::StreamerBase; +using ov::genai::StreamerVariant; +using ov::genai::Tokenizer; +using ov::genai::WhisperDecodedResultChunk; +using ov::genai::WhisperDecodedResults; +using ov::genai::WhisperGenerationConfig; +using ov::genai::WhisperPipeline; + +namespace utils = ov::genai::pybind::utils; + +namespace { + +auto whisper_generate_docstring = R"( + High level generate that receives raw speech as a vector of floats and returns decoded output. + + :param raw_speech_input: inputs in the form of list of floats. Required to be normalized to near [-1, 1] range and have 16k Hz sampling rate. 
+ :type raw_speech_input: List[float] + + :param generation_config: generation_config + :type generation_config: WhisperGenerationConfig or a Dict + + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped + :type : Callable[[str], bool], ov.genai.StreamerBase + + :param kwargs: arbitrary keyword arguments with keys corresponding to WhisperGenerationConfig fields. + :type : Dict + + :return: return results in decoded form + :rtype: DecodedResults +)"; + +auto whisper_decoded_results_docstring = R"( + Structure to store resulting batched text outputs and scores for each batch. + The first num_return_sequences elements correspond to the first batch element. + + Parameters: + texts: vector of resulting sequences. + scores: scores for each sequence. + metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. + chunks: chunks of resulting sequences with timestamps +)"; + +auto whisper_decoded_result_chunk = R"( + Structure to store decoded text with corresponding timestamps + + :param start_ts chunk start time in seconds + :param end_ts chunk end time in seconds + :param text chunk text +)"; + +auto whisper_generation_config_docstring = R"( + WhisperGenerationConfig parameters + max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. + type: int + + max_new_tokens: the maximum number of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + type: int + + eos_token_id: End of stream token id. + type: int + + Whisper specific parameters: + + decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. + type: int + + pad_token_id: Padding token id. + type: int + + translate_token_id: Translate token id. + type: int + + transcribe_token_id: Transcribe token id. + type: int + + no_timestamps_token_id: No timestamps token id. + type: int + + begin_timestamps_token_id: Begin timestamps token id. + type: int + + is_multilingual: + type: bool + + begin_suppress_tokens: A list containing tokens that will be suppressed at the beginning of the sampling process. + type: list[int] + + suppress_tokens: A list containing the non-speech tokens that will be suppressed during generation. + type: list[int] + + language: Language token to use for generation in the form of <|en|>. + You can find all the possible language tokens in the generation_config.json lang_to_id dictionary. + type: Optional[str] + + lang_to_id: Language token to token_id map. Initialized from the generation_config.json lang_to_id dictionary. + type: Dict[str, int] + + task: Task to use for generation, either “translate” or “transcribe” + type: str +)"; + +OptionalWhisperGenerationConfig update_whisper_config_from_kwargs(const OptionalWhisperGenerationConfig& config, + const py::kwargs& kwargs) { + if (!config.has_value() && kwargs.empty()) + return std::nullopt; + + WhisperGenerationConfig res_config; + if (config.has_value()) + res_config = *config; + + for (const auto& item : kwargs) { + std::string key = py::cast<std::string>(item.first); + py::object value = py::cast<py::object>(item.second); + + if (item.second.is_none()) { + // Even if argument key name does not fit GenerationConfig name + // it's not an error if it's not defined.
+ // Some HF configs can have parameters for methods currenly unsupported in ov_genai + // but if their values are not set / None, then this should not block + // us from reading such configs, e.g. {"typical_p": None, 'top_p': 1.0,...} + return res_config; + } + + if (key == "max_new_tokens") { + res_config.max_new_tokens = py::cast<int>(item.second); + } else if (key == "max_length") { + res_config.max_length = py::cast<int>(item.second); + } else if (key == "decoder_start_token_id") { + res_config.decoder_start_token_id = py::cast<int>(item.second); + } else if (key == "pad_token_id") { + res_config.pad_token_id = py::cast<int>(item.second); + } else if (key == "translate_token_id") { + res_config.translate_token_id = py::cast<int>(item.second); + } else if (key == "transcribe_token_id") { + res_config.transcribe_token_id = py::cast<int>(item.second); + } else if (key == "no_timestamps_token_id") { + res_config.no_timestamps_token_id = py::cast<int>(item.second); + } else if (key == "begin_timestamps_token_id") { + res_config.begin_timestamps_token_id = py::cast<int>(item.second); + } else if (key == "max_initial_timestamp_index") { + res_config.max_initial_timestamp_index = py::cast<size_t>(item.second); + } else if (key == "begin_suppress_tokens") { + res_config.begin_suppress_tokens = py::cast<std::vector<int64_t>>(item.second); + } else if (key == "suppress_tokens") { + res_config.suppress_tokens = py::cast<std::vector<int64_t>>(item.second); + } else if (key == "is_multilingual") { + res_config.is_multilingual = py::cast<bool>(item.second); + } else if (key == "language") { + res_config.language = py::cast<std::string>(item.second); + } else if (key == "lang_to_id") { + res_config.lang_to_id = py::cast<std::map<std::string, int64_t>>(item.second); + } else if (key == "task") { + res_config.task = py::cast<std::string>(item.second); + } else if (key == "return_timestamps") { + res_config.return_timestamps = py::cast<bool>(item.second); + } else if (key == "eos_token_id") { + res_config.set_eos_token_id(py::cast<int>(item.second)); + } else { + throw(std::invalid_argument( + "'" + key + + "' is incorrect WhisperGenerationConfig parameter name. " + "Use help(openvino_genai.WhisperGenerationConfig) to get list of acceptable parameters.")); + } + } + + return res_config; +} + +py::object call_whisper_common_generate(WhisperPipeline& pipe, + const RawSpeechInput& raw_speech_input, + const OptionalWhisperGenerationConfig& config, + const utils::PyBindStreamerVariant& py_streamer, + const py::kwargs& kwargs) { + // whisper config should initialized from generation_config.json in case of only kwargs provided + // otherwise it would be initialized with default values which is unexpected for kwargs use case + // if full config was provided then rely on it as a base config + OptionalWhisperGenerationConfig base_config = config.has_value() ? config : pipe.get_generation_config(); + + auto updated_config = update_whisper_config_from_kwargs(base_config, kwargs); + + StreamerVariant streamer = ov::genai::pybind::utils::pystreamer_to_streamer(py_streamer); + + return py::cast(pipe.generate(raw_speech_input, updated_config, streamer)); +} + +py::str handle_utf8_text(const std::string& text) { + // pybind11 decodes strings similar to Pythons's + // bytes.decode('utf-8'). It raises if the decoding fails. + // generate() may return incomplete Unicode points if max_new_tokens + // was reached. 
Replace such points with � instead of raising an exception + PyObject* py_s = PyUnicode_DecodeUTF8(text.data(), text.length(), "replace"); + return py::reinterpret_steal<py::object>(py_s); +} +} // namespace + +void init_whisper_pipeline(py::module_& m) { + m.doc() = "Pybind11 binding for Whisper Pipeline"; + + // Binding for WhisperGenerationConfig + py::class_<WhisperGenerationConfig>(m, "WhisperGenerationConfig", whisper_generation_config_docstring) + .def(py::init<std::string>(), py::arg("json_path"), "path where generation_config.json is stored") + .def(py::init([](py::kwargs kwargs) { + return *update_whisper_config_from_kwargs(WhisperGenerationConfig(), kwargs); + })) + .def_readwrite("max_new_tokens", &WhisperGenerationConfig::max_new_tokens) + .def_readwrite("max_length", &WhisperGenerationConfig::max_length) + .def_readwrite("begin_suppress_tokens", &WhisperGenerationConfig::begin_suppress_tokens) + .def_readwrite("suppress_tokens", &WhisperGenerationConfig::suppress_tokens) + .def_readwrite("decoder_start_token_id", &WhisperGenerationConfig::decoder_start_token_id) + .def_readwrite("eos_token_id", &WhisperGenerationConfig::eos_token_id) + .def_readwrite("pad_token_id", &WhisperGenerationConfig::pad_token_id) + .def_readwrite("translate_token_id", &WhisperGenerationConfig::translate_token_id) + .def_readwrite("transcribe_token_id", &WhisperGenerationConfig::transcribe_token_id) + .def_readwrite("begin_timestamps_token_id", &WhisperGenerationConfig::begin_timestamps_token_id) + .def_readwrite("max_initial_timestamp_index", &WhisperGenerationConfig::max_initial_timestamp_index) + .def_readwrite("no_timestamps_token_id", &WhisperGenerationConfig::no_timestamps_token_id) + .def_readwrite("is_multilingual", &WhisperGenerationConfig::is_multilingual) + .def_readwrite("language", &WhisperGenerationConfig::language) + .def_readwrite("lang_to_id", &WhisperGenerationConfig::lang_to_id) + .def_readwrite("task", &WhisperGenerationConfig::task) + .def_readwrite("return_timestamps", &WhisperGenerationConfig::return_timestamps) + .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id); + + py::class_<WhisperDecodedResultChunk>(m, "WhisperDecodedResultChunk", whisper_decoded_result_chunk) + .def(py::init<>()) + .def_readonly("start_ts", &WhisperDecodedResultChunk::start_ts) + .def_readonly("end_ts", &WhisperDecodedResultChunk::end_ts) + .def_property_readonly("text", [](WhisperDecodedResultChunk& chunk) { + return handle_utf8_text(chunk.text); + }); + + py::class_<WhisperDecodedResults, DecodedResults>(m, "WhisperDecodedResults", whisper_decoded_results_docstring) + .def_readonly("chunks", &WhisperDecodedResults::chunks); + + py::class_<WhisperPipeline>(m, "WhisperPipeline") + .def(py::init([](const std::string& model_path, + const std::string& device, + const std::map<std::string, py::object>& config) { + ScopedVar env_manager(utils::ov_tokenizers_module_path()); + return std::make_unique<WhisperPipeline>(model_path, device, utils::properties_to_any_map(config)); + }), + py::arg("model_path"), + "folder with openvino_model.xml and openvino_tokenizer[detokenizer].xml files", + py::arg("device") = "CPU", + "device on which inference will be done", + py::arg("config") = ov::AnyMap({}), + "openvino.properties map", + R"( + WhisperPipeline class constructor. + model_path (str): Path to the model file. + device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. 
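To round off the Whisper bindings, a hedged end-to-end sketch; librosa is used here only as one convenient way to obtain mono 16 kHz float samples, and the audio and model paths are placeholders.

```python
import librosa
import openvino_genai as ov_genai

# Whisper expects mono float samples at 16 kHz, roughly in the [-1, 1] range.
raw_speech, _ = librosa.load("sample.wav", sr=16000)

pipe = ov_genai.WhisperPipeline("./whisper_model_dir", "CPU")  # placeholder path

# Keyword arguments are mapped onto WhisperGenerationConfig fields.
result = pipe.generate(raw_speech.tolist(), return_timestamps=True, task="transcribe", language="<|en|>")
print(result)

for chunk in result.chunks:
    print(f"[{chunk.start_ts:.2f}s - {chunk.end_ts:.2f}s] {chunk.text}")
```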
+ )") + + .def(py::init([](const std::string& model_path, + const Tokenizer& tokenizer, + const std::string& device, + const std::map<std::string, py::object>& config) { + return std::make_unique<WhisperPipeline>(model_path, + tokenizer, + device, + utils::properties_to_any_map(config)); + }), + py::arg("model_path"), + py::arg("tokenizer"), + py::arg("device") = "CPU", + py::arg("config") = ov::AnyMap({}), + "openvino.properties map", + R"( + WhisperPipeline class constructor for manualy created openvino_genai.Tokenizer. + model_path (str): Path to the model file. + tokenizer (openvino_genai.Tokenizer): tokenizer object. + device (str): Device to run the model on (e.g., CPU, GPU). Default is 'CPU'. + )") + + .def( + "generate", + [](WhisperPipeline& pipe, + const RawSpeechInput& raw_speech_input, + const OptionalWhisperGenerationConfig& generation_config, + const utils::PyBindStreamerVariant& streamer, + const py::kwargs& kwargs) { + return call_whisper_common_generate(pipe, raw_speech_input, generation_config, streamer, kwargs); + }, + py::arg("raw_speech_input"), + "List of floats representing raw speech audio. " + "Required to be normalized to near [-1, 1] range and have 16k Hz sampling rate.", + py::arg("generation_config") = std::nullopt, + "generation_config", + py::arg("streamer") = std::monostate(), + "streamer", + (whisper_generate_docstring + std::string(" \n ") + whisper_generation_config_docstring).c_str()) + + .def("get_tokenizer", &WhisperPipeline::get_tokenizer) + .def("get_generation_config", &WhisperPipeline::get_generation_config, py::return_value_policy::copy) + .def("set_generation_config", &WhisperPipeline::set_generation_config); +} diff --git a/src/python/utils.cpp b/src/python/utils.cpp new file mode 100644 index 0000000000..65033d0866 --- /dev/null +++ b/src/python/utils.cpp @@ -0,0 +1,247 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "./utils.hpp" + +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <pybind11/stl_bind.h> + +#include <openvino/runtime/auto/properties.hpp> + +#include "../cpp/src/tokenizers_path.hpp" +#include "openvino/genai/llm_pipeline.hpp" + +namespace py = pybind11; +namespace ov::genai::pybind::utils { + +bool py_object_is_any_map(const py::object& py_obj) { + if (!py::isinstance<py::dict>(py_obj)) { + return false; + } + auto dict = py::cast<py::dict>(py_obj); + return std::all_of(dict.begin(), dict.end(), [&](const std::pair<py::object::handle, py::object::handle>& elem) { + return py::isinstance<py::str>(elem.first); + }); +} + +ov::Any py_object_to_any(const py::object& py_obj); + +ov::AnyMap py_object_to_any_map(const py::object& py_obj) { + OPENVINO_ASSERT(py_object_is_any_map(py_obj), "Unsupported attribute type."); + ov::AnyMap return_value = {}; + for (auto& item : py::cast<py::dict>(py_obj)) { + std::string key = py::cast<std::string>(item.first); + py::object value = py::cast<py::object>(item.second); + if (py_object_is_any_map(value)) { + return_value[key] = py_object_to_any_map(value); + } else { + return_value[key] = py_object_to_any(value); + } + } + return return_value; +} + +ov::Any py_object_to_any(const py::object& py_obj) { + // Python types + py::object float_32_type = py::module_::import("numpy").attr("float32"); + + if (py::isinstance<py::str>(py_obj)) { + return py_obj.cast<std::string>(); + } else if (py::isinstance<py::bool_>(py_obj)) { + return py_obj.cast<bool>(); + } else if (py::isinstance<py::bytes>(py_obj)) { + return 
py_obj.cast<std::string>(); + } else if (py::isinstance<py::float_>(py_obj)) { + return py_obj.cast<double>(); + } else if (py::isinstance(py_obj, float_32_type)) { + return py_obj.cast<float>(); + } else if (py::isinstance<py::int_>(py_obj)) { + return py_obj.cast<int64_t>(); + } else if (py::isinstance<py::none>(py_obj)) { + return {}; + } else if (py::isinstance<py::list>(py_obj)) { + auto _list = py_obj.cast<py::list>(); + enum class PY_TYPE : int { UNKNOWN = 0, STR, INT, FLOAT, BOOL, PARTIAL_SHAPE }; + PY_TYPE detected_type = PY_TYPE::UNKNOWN; + for (const auto& it : _list) { + auto check_type = [&](PY_TYPE type) { + if (detected_type == PY_TYPE::UNKNOWN || detected_type == type) { + detected_type = type; + return; + } + OPENVINO_THROW("Incorrect attribute. Mixed types in the list are not allowed."); + }; + if (py::isinstance<py::str>(it)) { + check_type(PY_TYPE::STR); + } else if (py::isinstance<py::int_>(it)) { + check_type(PY_TYPE::INT); + } else if (py::isinstance<py::float_>(it)) { + check_type(PY_TYPE::FLOAT); + } else if (py::isinstance<py::bool_>(it)) { + check_type(PY_TYPE::BOOL); + } else if (py::isinstance<ov::PartialShape>(it)) { + check_type(PY_TYPE::PARTIAL_SHAPE); + } + } + + if (_list.empty()) + return ov::Any(); + + switch (detected_type) { + case PY_TYPE::STR: + return _list.cast<std::vector<std::string>>(); + case PY_TYPE::FLOAT: + return _list.cast<std::vector<double>>(); + case PY_TYPE::INT: + return _list.cast<std::vector<int64_t>>(); + case PY_TYPE::BOOL: + return _list.cast<std::vector<bool>>(); + case PY_TYPE::PARTIAL_SHAPE: + return _list.cast<std::vector<ov::PartialShape>>(); + default: + OPENVINO_ASSERT(false, "Unsupported attribute type."); + } + + // OV types + } else if (py_object_is_any_map(py_obj)) { + return py_object_to_any_map(py_obj); + } else if (py::isinstance<ov::Any>(py_obj)) { + return py::cast<ov::Any>(py_obj); + } else if (py::isinstance<ov::element::Type>(py_obj)) { + return py::cast<ov::element::Type>(py_obj); + } else if (py::isinstance<ov::PartialShape>(py_obj)) { + return py::cast<ov::PartialShape>(py_obj); + } else if (py::isinstance<ov::hint::Priority>(py_obj)) { + return py::cast<ov::hint::Priority>(py_obj); + } else if (py::isinstance<ov::hint::PerformanceMode>(py_obj)) { + return py::cast<ov::hint::PerformanceMode>(py_obj); + } else if (py::isinstance<ov::intel_auto::SchedulePolicy>(py_obj)) { + return py::cast<ov::intel_auto::SchedulePolicy>(py_obj); + } else if (py::isinstance<ov::hint::SchedulingCoreType>(py_obj)) { + return py::cast<ov::hint::SchedulingCoreType>(py_obj); + } else if (py::isinstance<std::set<ov::hint::ModelDistributionPolicy>>(py_obj)) { + return py::cast<std::set<ov::hint::ModelDistributionPolicy>>(py_obj); + } else if (py::isinstance<ov::hint::ExecutionMode>(py_obj)) { + return py::cast<ov::hint::ExecutionMode>(py_obj); + } else if (py::isinstance<ov::log::Level>(py_obj)) { + return py::cast<ov::log::Level>(py_obj); + } else if (py::isinstance<ov::device::Type>(py_obj)) { + return py::cast<ov::device::Type>(py_obj); + } else if (py::isinstance<ov::streams::Num>(py_obj)) { + return py::cast<ov::streams::Num>(py_obj); + } else if (py::isinstance<ov::Affinity>(py_obj)) { + return py::cast<ov::Affinity>(py_obj); + } else if (py::isinstance<ov::Tensor>(py_obj)) { + return py::cast<ov::Tensor>(py_obj); + } else if (py::isinstance<ov::Output<ov::Node>>(py_obj)) { + return py::cast<ov::Output<ov::Node>>(py_obj); + } else if (py::isinstance<ov::genai::SchedulerConfig>(py_obj)) { + return 
py::cast<ov::genai::SchedulerConfig>(py_obj);
+    } else if (py::isinstance<py::object>(py_obj)) {
+        return py_obj;
+    }
+    OPENVINO_ASSERT(false, "Unsupported attribute type.");
+}
+
+std::map<std::string, ov::Any> properties_to_any_map(const std::map<std::string, py::object>& properties) {
+    std::map<std::string, ov::Any> properties_to_cpp;
+    for (const auto& property : properties) {
+        properties_to_cpp[property.first] = py_object_to_any(property.second);
+    }
+    return properties_to_cpp;
+}
+
+std::string ov_tokenizers_module_path() {
+    // Try a path relative to the build artifacts folder first.
+    std::filesystem::path from_relative = tokenizers_relative_to_genai();
+    if (std::filesystem::exists(from_relative)) {
+        return from_relative.string();
+    }
+    return py::str(py::module_::import("openvino_tokenizers").attr("_ext_path"));
+}
+
+ov::genai::StreamerVariant pystreamer_to_streamer(const utils::PyBindStreamerVariant& py_streamer) {
+    ov::genai::StreamerVariant streamer = std::monostate();
+
+    std::visit(utils::overloaded {
+        [&streamer](const std::function<bool(py::str)>& py_callback){
+            // Wrap the Python streamer with manual utf-8 decoding. Do not rely
+            // on pybind automatic decoding since it raises exceptions on incomplete strings.
+            auto callback_wrapped = [py_callback](std::string subword) -> bool {
+                auto py_str = PyUnicode_DecodeUTF8(subword.data(), subword.length(), "replace");
+                return py_callback(py::reinterpret_borrow<py::str>(py_str));
+            };
+            streamer = callback_wrapped;
+        },
+        [&streamer](std::shared_ptr<StreamerBase> streamer_cls){
+            streamer = streamer_cls;
+        },
+        [](std::monostate none){ /* streamer is already a monostate */ }
+    }, py_streamer);
+    return streamer;
+}
+
+ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::OptionalGenerationConfig& config, const py::kwargs& kwargs) {
+    if (!config.has_value() && kwargs.empty())
+        return std::nullopt;
+
+    ov::genai::GenerationConfig res_config;
+    if (config.has_value())
+        res_config = *config;
+
+    for (const auto& item : kwargs) {
+        std::string key = py::cast<std::string>(item.first);
+        py::object value = py::cast<py::object>(item.second);
+
+        if (item.second.is_none()) {
+            // Even if the argument key name does not match a GenerationConfig field,
+            // it is not an error when its value is not defined.
+            // Some HF configs have parameters for methods currently unsupported in ov_genai,
+            // but if their values are not set / None, this should not block us from reading
+            // such configs, e.g. {"typical_p": None, 'top_p': 1.0,...}; just skip such entries.
{"typical_p": None, 'top_p': 1.0,...} + return res_config; + } + + if (key == "max_new_tokens") { + res_config.max_new_tokens = py::cast<int>(item.second); + } else if (key == "max_length") { + res_config.max_length = py::cast<int>(item.second); + } else if (key == "ignore_eos") { + res_config.ignore_eos = py::cast<bool>(item.second); + } else if (key == "num_beam_groups") { + res_config.num_beam_groups = py::cast<int>(item.second); + } else if (key == "num_beams") { + res_config.num_beams = py::cast<int>(item.second); + } else if (key == "diversity_penalty") { + res_config.diversity_penalty = py::cast<float>(item.second); + } else if (key == "length_penalty") { + res_config.length_penalty = py::cast<float>(item.second); + } else if (key == "num_return_sequences") { + res_config.num_return_sequences = py::cast<int>(item.second); + } else if (key == "no_repeat_ngram_size") { + res_config.no_repeat_ngram_size = py::cast<int>(item.second); + } else if (key == "stop_criteria") { + res_config.stop_criteria = py::cast<StopCriteria>(item.second); + } else if (key == "temperature") { + res_config.temperature = py::cast<float>(item.second); + } else if (key == "top_p") { + res_config.top_p = py::cast<float>(item.second); + } else if (key == "top_k") { + res_config.top_k = py::cast<int>(item.second); + } else if (key == "do_sample") { + res_config.do_sample = py::cast<bool>(item.second); + } else if (key == "repetition_penalty") { + res_config.repetition_penalty = py::cast<float>(item.second); + } else if (key == "eos_token_id") { + res_config.set_eos_token_id(py::cast<int>(item.second)); + } else { + throw(std::invalid_argument("'" + key + "' is incorrect GenerationConfig parameter name. " + "Use help(openvino_genai.GenerationConfig) to get list of acceptable parameters.")); + } + } + + return res_config; +} + +} // namespace ov::genai::pybind::utils diff --git a/src/python/utils.hpp b/src/python/utils.hpp new file mode 100644 index 0000000000..4047bdcfe7 --- /dev/null +++ b/src/python/utils.hpp @@ -0,0 +1,41 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <pybind11/pybind11.h> +#include <pybind11/stl.h> +#include <pybind11/stl_bind.h> + +#include "openvino/genai/streamer_base.hpp" +#include "openvino/genai/llm_pipeline.hpp" + +namespace py = pybind11; +using ov::genai::StreamerBase; + +namespace ov::genai::pybind::utils { + +// When StreamerVariant is used utf-8 decoding is done by pybind and can lead to exception on incomplete texts. +// Therefore strings decoding should be handled with PyUnicode_DecodeUTF8(..., "replace") to not throw errors. +using PyBindStreamerVariant = std::variant<std::function<bool(py::str)>, std::shared_ptr<StreamerBase>, std::monostate>; + +template <class... Ts> +struct overloaded : Ts... { + using Ts::operator()...; +}; +template <class... Ts> +overloaded(Ts...) 
-> overloaded<Ts...>; + +ov::Any py_object_to_any(const py::object& py_obj); + +bool py_object_is_any_map(const py::object& py_obj); + +ov::AnyMap py_object_to_any_map(const py::object& py_obj); + +std::map<std::string, ov::Any> properties_to_any_map(const std::map<std::string, py::object>& properties); + +std::string ov_tokenizers_module_path(); + +ov::genai::OptionalGenerationConfig update_config_from_kwargs(const ov::genai::OptionalGenerationConfig& config, const py::kwargs& kwargs); + +ov::genai::StreamerVariant pystreamer_to_streamer(const utils::PyBindStreamerVariant& py_streamer); + +} // namespace ov::genai::pybind::utils diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt new file mode 100644 index 0000000000..b3526c873c --- /dev/null +++ b/tests/cpp/CMakeLists.txt @@ -0,0 +1,26 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +if(NOT TARGET gtest) + set(INSTALL_GTEST OFF CACHE BOOL "") + + FetchContent_Declare( + googletest + URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip + URL_HASH SHA256=edd885a1ab32b6999515a880f669efadb80b3f880215f315985fa3f6eca7c4d3 + ) + FetchContent_MakeAvailable(googletest) +endif() + +set(TEST_TARGET_NAME "tests_continuous_batching") +file(GLOB tests_src "*.cpp") +file(GLOB src_files "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sequence_group.cpp" + "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/cache_eviction.cpp" + "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sampler.cpp") + +add_executable(${TEST_TARGET_NAME} ${tests_src} + block_allocator.cpp) +target_link_libraries(${TEST_TARGET_NAME} PRIVATE openvino::genai gtest_main) +target_include_directories(${TEST_TARGET_NAME} PRIVATE "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src") +target_sources(${TEST_TARGET_NAME} PRIVATE ${src_files}) diff --git a/tests/cpp/block_allocator.cpp b/tests/cpp/block_allocator.cpp new file mode 100644 index 0000000000..9069740118 --- /dev/null +++ b/tests/cpp/block_allocator.cpp @@ -0,0 +1,220 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <gtest/gtest.h> +#include "scheduler.hpp" + +using TestBlockAllocatorWithNumLayers = ::testing::TestWithParam<size_t>; + +TEST_P(TestBlockAllocatorWithNumLayers, AllocatesBlocksAccordingToNumLayers) { + size_t num_layers = GetParam(); + size_t initial_num_free_blocks = 10; + auto allocator = ov::genai::BlockAllocator(initial_num_free_blocks, false, num_layers); + for (size_t i = 0; i < num_layers; i++) { + EXPECT_EQ(allocator.num_free_blocks(i), initial_num_free_blocks); + } + + auto blocks = allocator.allocate_block(); + ASSERT_EQ(blocks.size(), num_layers); + + for (size_t i = 0; i < num_layers; i++) { + EXPECT_EQ(allocator.num_free_blocks(i), initial_num_free_blocks - 1); + } +} + +INSTANTIATE_TEST_SUITE_P(VariousNumLayers, TestBlockAllocatorWithNumLayers, ::testing::Values(1, 2, 15, 23, 42)); + +TEST(TestBlockAllocator, AllocatesBlocksIndependentlyToLayers) { + size_t num_layers = 3; + size_t initial_num_free_blocks = 10; + auto allocator = ov::genai::BlockAllocator(initial_num_free_blocks, false, num_layers); + + allocator.allocate_block(0); + allocator.allocate_block(0); + EXPECT_EQ(allocator.num_free_blocks(0), 8); + EXPECT_EQ(allocator.num_free_blocks(1), 10); + EXPECT_EQ(allocator.num_free_blocks(2), 10); + + allocator.allocate_block(2); + + EXPECT_EQ(allocator.num_free_blocks(0), 8); + EXPECT_EQ(allocator.num_free_blocks(1), 10); + EXPECT_EQ(allocator.num_free_blocks(2), 9); + + 
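+    // Layer 1 has not been touched yet; the allocations below should only decrement its free-block count.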
allocator.allocate_block(1); + allocator.allocate_block(1); + allocator.allocate_block(1); + + EXPECT_EQ(allocator.num_free_blocks(0), 8); + EXPECT_EQ(allocator.num_free_blocks(1), 7); + EXPECT_EQ(allocator.num_free_blocks(2), 9); +} + +TEST(TestBlockAllocator, FreesBlocksIndependentlyFromLayers) { + size_t num_layers = 3; + size_t initial_num_free_blocks = 10; + auto allocator = ov::genai::BlockAllocator(initial_num_free_blocks, false, num_layers); + + auto block_01 = allocator.allocate_block(0); + auto block_02 = allocator.allocate_block(0); + auto block_10 = allocator.allocate_block(1); + auto block_11 = allocator.allocate_block(1); + auto block_12 = allocator.allocate_block(1); + auto block_20 = allocator.allocate_block(2); + ASSERT_EQ(allocator.num_free_blocks(0), 8); + ASSERT_EQ(allocator.num_free_blocks(1), 7); + ASSERT_EQ(allocator.num_free_blocks(2), 9); + + allocator.free(block_02, 0); + EXPECT_EQ(allocator.num_free_blocks(0), 9); + EXPECT_EQ(allocator.num_free_blocks(1), 7); + EXPECT_EQ(allocator.num_free_blocks(2), 9); + + allocator.free(block_20, 2); + EXPECT_EQ(allocator.num_free_blocks(0), 9); + EXPECT_EQ(allocator.num_free_blocks(1), 7); + EXPECT_EQ(allocator.num_free_blocks(2), 10); + + allocator.free(block_12, 1); + allocator.free(block_10, 1); + EXPECT_EQ(allocator.num_free_blocks(0), 9); + EXPECT_EQ(allocator.num_free_blocks(1), 9); + EXPECT_EQ(allocator.num_free_blocks(2), 10); +} + +class PrefixCachingBlockAllocatorTest : public testing::Test { +protected: + PrefixCachingBlockAllocatorTest(): allocator(initial_num_free_blocks, true, num_layers) {} + size_t num_layers = 3; + size_t initial_num_free_blocks = 10; + ov::genai::BlockAllocator allocator; + std::map<uint64_t, ov::genai::BlocksPerLayer> cached_blocks_map; +}; + +TEST_F(PrefixCachingBlockAllocatorTest, OnlyAllocatesAndFreesBlocksFromAllLayers) { + auto allocator = ov::genai::BlockAllocator(initial_num_free_blocks, true, num_layers); + EXPECT_THROW(allocator.allocate_block(0), ov::Exception); + + // allocate one block so that there is something to free + auto blocks_per_layer = allocator.allocate_block(0, cached_blocks_map); + + EXPECT_THROW(allocator.free(blocks_per_layer[0], 0), ov::Exception); + EXPECT_NO_THROW(allocator.free(blocks_per_layer)); + + // with prefix caching freed blocks should go into the overwriteable store first, not in the actual free pool + EXPECT_EQ(allocator.num_overwriteable_blocks(), 1); +} + + +TEST_F(PrefixCachingBlockAllocatorTest, HandlesFreesCorrectlyWithMixedHashFrees) { + // allocate one block so that there is something to free + allocator.allocate_block(0, cached_blocks_map); + allocator.allocate_block(1, cached_blocks_map); + allocator.allocate_block(2, cached_blocks_map); + ASSERT_EQ(allocator.num_free_blocks(0), 7); + + ov::genai::BlocksPerLayer mixed_hash_blocks; + mixed_hash_blocks.reserve(num_layers); + auto hash_0_blocks = cached_blocks_map[0]; + auto hash_1_blocks = cached_blocks_map[1]; + std::copy(hash_0_blocks.begin(), hash_0_blocks.begin() + num_layers / 2, std::back_inserter(mixed_hash_blocks)); + std::copy(hash_1_blocks.begin() + num_layers / 2, hash_1_blocks.end(), std::back_inserter(mixed_hash_blocks)); + + EXPECT_NO_THROW(allocator.free(mixed_hash_blocks)); + EXPECT_EQ(allocator.num_free_blocks(0), 8); + EXPECT_EQ(allocator.num_free_blocks(num_layers - 1), 8); + EXPECT_EQ(allocator.num_overwriteable_blocks(), 0); // mixed hash, can't store under blocks across layers under same hash +} + +TEST_F(PrefixCachingBlockAllocatorTest, 
AllocatesFromOverwriteableBlocksWhenFreePoolIsExhausted) { + allocator.allocate_block(0, cached_blocks_map); + allocator.allocate_block(1, cached_blocks_map); + allocator.allocate_block(2, cached_blocks_map); + + allocator.free(cached_blocks_map[0]); + allocator.free(cached_blocks_map[1]); + allocator.free(cached_blocks_map[2]); + + ASSERT_EQ(allocator.num_overwriteable_blocks(), 3); + + for (size_t i = 0; i < initial_num_free_blocks - 3; i++) { + allocator.allocate_block(1337 + i, cached_blocks_map); + EXPECT_EQ(allocator.num_overwriteable_blocks(), 3); + } + + EXPECT_EQ(allocator.num_overwriteable_blocks(), 3); + allocator.allocate_block(31337, cached_blocks_map); + EXPECT_EQ(allocator.num_overwriteable_blocks(), 2); +} + +TEST_F(PrefixCachingBlockAllocatorTest, ThrowsAtAllocationWhenFull) { + for (size_t i = 0; i < initial_num_free_blocks; i++) { + allocator.allocate_block(1337 + i, cached_blocks_map); + } + + ASSERT_EQ(allocator.num_overwriteable_blocks(), 0); + ASSERT_EQ(allocator.num_free_blocks(0), 0); + + EXPECT_THROW(allocator.allocate_block(31337, cached_blocks_map), ov::Exception); +} + +TEST_F(PrefixCachingBlockAllocatorTest, HandlesHashCollisionsAtFreeCorrectly) { + // TODO (vshampor): also handle collisions during allocations (multimap instead of map?) + auto cached_blocks_map = std::map<uint64_t, ov::genai::BlocksPerLayer>{}; + auto first_hash_0_block = allocator.allocate_block(0, cached_blocks_map); + allocator.free(first_hash_0_block); + ASSERT_EQ(allocator.num_overwriteable_blocks(), 1); + + // double free + ASSERT_THROW(allocator.free(first_hash_0_block), ov::Exception); + + allocator.allocate_block(1, cached_blocks_map); + auto second_hash_0_block = allocator.allocate_block(0, cached_blocks_map); + EXPECT_EQ(allocator.num_overwriteable_blocks(), 1); + + // this "free" should replace the old block with the same hash in the overwriteable store + allocator.free(second_hash_0_block); + EXPECT_EQ(allocator.num_overwriteable_blocks(), 1); + std::map<uint64_t, ov::genai::BlocksPerLayer> empty_map{}; // to force allocator to take the block from overwriteable store + auto internal_overwriteable_block = allocator.get_cached_block(0, empty_map); + for (size_t layer_idx = 0; layer_idx < internal_overwriteable_block.size(); layer_idx++) { + EXPECT_EQ(internal_overwriteable_block[layer_idx], second_hash_0_block[layer_idx]); + } +} + +TEST(TestBlockAllocator, CalculatesUsagePercentageCorrectly) { + size_t num_layers = 10; + size_t initial_num_free_blocks = 10; + auto allocator = ov::genai::BlockAllocator(initial_num_free_blocks, false, num_layers); + EXPECT_NEAR(allocator.get_used_percentage(), 0.0, 1e-5); + + auto one_block_from_each_layer = allocator.allocate_block(); + EXPECT_NEAR(allocator.get_used_percentage(), 10.0, 1e-5); + + auto one_block_from_some_layer = allocator.allocate_block(7); + EXPECT_NEAR(allocator.get_used_percentage(), 11.0, 1e-5); + + allocator.free(one_block_from_each_layer); + EXPECT_NEAR(allocator.get_used_percentage(), 1.0, 1e-5); +} + + +TEST(TestBlockAllocator, CalculatesUsagePercentageCorrectlyWithPrefixCaching) { + size_t num_layers = 10; + size_t initial_num_free_blocks = 10; + auto allocator = ov::genai::BlockAllocator(initial_num_free_blocks, true, num_layers); + ASSERT_NEAR(allocator.get_used_percentage(), 0.0, 1e-5); + + std::map<uint64_t, ov::genai::BlocksPerLayer> prefix_hash_map; + + for (uint64_t mock_hash: {13, 42, 1337}) { + auto one_block_from_each_layer = allocator.allocate_block(mock_hash, prefix_hash_map); + } + 
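+    // Three hashed blocks are now allocated in each of the 10 layers (3 of 10 per layer), so 30% usage is expected.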
ASSERT_NEAR(allocator.get_used_percentage(), 30.0, 1e-5); + + allocator.free(prefix_hash_map[13]); + ASSERT_NEAR(allocator.get_used_percentage(), 20.0, 1e-5); + + allocator.allocate_block(13, prefix_hash_map); + ASSERT_NEAR(allocator.get_used_percentage(), 30.0, 1e-5); +} diff --git a/tests/cpp/block_hash_store.cpp b/tests/cpp/block_hash_store.cpp new file mode 100644 index 0000000000..6827a332b8 --- /dev/null +++ b/tests/cpp/block_hash_store.cpp @@ -0,0 +1,54 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include "openvino/runtime/core.hpp" +#include "scheduler.hpp" +#include <chrono> +#include <thread> + +TEST(TestBlockHashStore, general_test) { + ov::genai::OverwritableBlocksHashStore block_hash_store(1); + auto block0 = std::make_shared<ov::genai::KVCacheBlock>(0); + block0->set_hash(77); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + auto block1 = std::make_shared<ov::genai::KVCacheBlock>(1); + block1->set_hash(56); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + auto block2 = std::make_shared<ov::genai::KVCacheBlock>(2); + block2->set_hash(23); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + block_hash_store.add(ov::genai::BlocksPerLayer{block0}); + block_hash_store.add(ov::genai::BlocksPerLayer{block1}); + block_hash_store.add(ov::genai::BlocksPerLayer{block2}); + EXPECT_EQ(block_hash_store.num_blocks(), 3); + + auto block = block_hash_store.get_block_to_restore(56)[0]; + EXPECT_EQ(block->get_index(), 1); + EXPECT_EQ(block->get_hash(), 56); + EXPECT_EQ(block->get_references_count(), 1); + EXPECT_EQ(block_hash_store.num_blocks(), 2); + + EXPECT_TRUE(block_hash_store.get_block_to_restore(44).empty()); + EXPECT_EQ(block_hash_store.num_blocks(), 2); + + EXPECT_EQ(block_hash_store.get_lru_block_to_overwrite()[0]->get_index(), 0); + EXPECT_EQ(block_hash_store.num_blocks(), 1); + + auto block3 = std::make_shared<ov::genai::KVCacheBlock>(7); + block3->set_hash(12); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + auto block4 = std::make_shared<ov::genai::KVCacheBlock>(10); + block4->set_hash(99); + std::this_thread::sleep_until(std::chrono::system_clock::now() + std::chrono::seconds(1)); + block_hash_store.add(ov::genai::BlocksPerLayer{block3}); + block_hash_store.add(ov::genai::BlocksPerLayer{block4}); + block2->set_timestamp(std::chrono::system_clock::now()); + + EXPECT_EQ(block_hash_store.get_lru_block_to_overwrite()[0]->get_index(), 7); + EXPECT_EQ(block_hash_store.get_lru_block_to_overwrite()[0]->get_index(), 10); + EXPECT_EQ(block_hash_store.get_lru_block_to_overwrite()[0]->get_index(), 2); + EXPECT_TRUE(block_hash_store.get_lru_block_to_overwrite().empty()); + EXPECT_EQ(block_hash_store.num_blocks(), 0); +} diff --git a/tests/cpp/block_manager.cpp b/tests/cpp/block_manager.cpp new file mode 100644 index 0000000000..466cc23864 --- /dev/null +++ b/tests/cpp/block_manager.cpp @@ -0,0 +1,111 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include "openvino/runtime/core.hpp" +#include "openvino/genai/generation_config.hpp" +#include "sequence_group.hpp" +#include "scheduler.hpp" + +TEST(TestBlockManager, general_test) { + ov::genai::BlockManager bm = ov::genai::BlockManager(6, false, 4); + ov::genai::TokenIds prompt_ids; + + ov::genai::SequenceGroup::Ptr 
sequence_group = std::make_shared<ov::genai::SequenceGroup>( + 0, + ov::Tensor(ov::element::i64, { + prompt_ids.size()}, prompt_ids.data()), + ov::genai::beam_search(), + 4, + false); + auto sequence = sequence_group->get_not_finished_sequences()[0]; + bm.allocate(sequence, 6); + auto seq_id = sequence->get_id(); + EXPECT_TRUE(bm.has_block_table(seq_id)); + EXPECT_EQ(bm.get_block_table(seq_id, 0).size(), 6); + EXPECT_EQ(bm.num_free_blocks(), 0); + + bm.free_sequence_partially(seq_id, 4); + EXPECT_EQ(bm.get_block_table(seq_id, 0).size(), 2); + EXPECT_EQ(bm.num_free_blocks(), 4); + + bm.free_sequence(seq_id); + EXPECT_FALSE(bm.has_block_table(seq_id)); + EXPECT_EQ(bm.num_free_blocks(), 6); + + bm.allocate(sequence, 2); + bm.fork_sequence(seq_id, 1); + EXPECT_TRUE(bm.has_block_table(1)); + EXPECT_EQ(bm.get_block_table(1, 0).back()->get_references_count(), 2); + +} + +TEST(TestBlockManager, required_blocks_count) { + ov::genai::BlockManager bm = ov::genai::BlockManager(8, false, 4, 3); + + std::vector<uint64_t> tokens = {0,1,2,3,4}; + ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared<ov::genai::SequenceGroup>( + 0, + ov::Tensor(ov::element::i64, { + tokens.size()}, tokens.data()), + ov::genai::beam_search(), + 4, + false); + sequence_group->set_sequence_group_ptr(sequence_group); + sequence_group->schedule_tokens(5); + auto required_blocks = bm.required_blocks_count(sequence_group); + EXPECT_EQ(required_blocks, 2); + EXPECT_TRUE(bm.can_append_slots(sequence_group)); + bm.append_slots(sequence_group); + EXPECT_EQ(bm.num_free_blocks(), 6); + EXPECT_EQ(bm.get_number_of_blocks_occupied_by_sequence(sequence_group), 2); + + sequence_group->finish_iteration(); + auto sequence_to_fork = sequence_group->get_running_sequences()[0]; + for (size_t i = 0; i < 4; ++i) { + const auto forked_sequence = sequence_group->fork_sequence(sequence_to_fork); + bm.fork_sequence(sequence_to_fork->get_id(), forked_sequence->get_id()); + } + EXPECT_EQ(bm.get_number_of_blocks_occupied_by_sequence(sequence_group), 2); + sequence_group->schedule_tokens(1); + required_blocks = bm.required_blocks_count(sequence_group); + // The last block was incomplete before forking, therefore need to allocate an extra block for each new forked + // sequence (excluding the original) + EXPECT_EQ(required_blocks, 4); + EXPECT_TRUE(bm.can_append_slots(sequence_group)); + bm.append_slots(sequence_group); + EXPECT_EQ(bm.get_number_of_blocks_occupied_by_sequence(sequence_group), 6); + EXPECT_EQ(bm.num_free_blocks(), 2); + sequence_group->finish_iteration(); + + sequence_group->schedule_tokens(3); + required_blocks = bm.required_blocks_count(sequence_group); + // Each sequence in group had 3 tokens scheduled in addition to 6 already processed, e.g. 
with block size 4 we + // require 1 extra block for each sequence in group + EXPECT_EQ(required_blocks, 5); + EXPECT_FALSE(bm.can_append_slots(sequence_group)); +} + + +TEST(TestBlockManager, CanFreeBlocksFromSequence) { + const size_t BLOCK_SIZE = 2; + ov::genai::BlockManager bm = ov::genai::BlockManager(8, false, BLOCK_SIZE, 3); + + std::vector<uint64_t> tokens = {0,1,2,3,4}; + ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared<ov::genai::SequenceGroup>( + 0, + ov::Tensor(ov::element::i64, { + tokens.size()}, tokens.data()), + ov::genai::beam_search(), + BLOCK_SIZE, + false); + sequence_group->set_sequence_group_ptr(sequence_group); + sequence_group->schedule_tokens(5); + bm.append_slots(sequence_group); + ASSERT_EQ(bm.num_free_blocks(), 5); + + size_t seq_id = sequence_group->get_sequences()[0]->get_id(); + bm.free_blocks_from_sequence(seq_id, { {0}, {1}, {2} }); + EXPECT_EQ(bm.num_free_blocks(), 6); +} \ No newline at end of file diff --git a/tests/cpp/cache_eviction.cpp b/tests/cpp/cache_eviction.cpp new file mode 100644 index 0000000000..026a7cbe64 --- /dev/null +++ b/tests/cpp/cache_eviction.cpp @@ -0,0 +1,422 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "cache_eviction.hpp" +#include "gtest/gtest.h" + +#include <algorithm> + +const ov::genai::CacheEvictionConfig DEFAULT_CACHE_EVICTION_CONFIG = {32, 32, 192, ov::genai::AggregationMode::NORM_SUM}; +const ov::genai::CacheEvictionConfig SHORT_RECENT_EVICTION_CONFIG = {32, 32, 72, ov::genai::AggregationMode::NORM_SUM}; +constexpr size_t DEFAULT_BLOCK_SIZE = 4; +constexpr size_t DEFAULT_NUM_DECODER_LAYERS = 2; + +class DefaultCacheEvictionAlgoTest : public testing::Test { +protected: + DefaultCacheEvictionAlgoTest() { + algo = ov::genai::CacheEvictionAlgorithm(eviction_config, block_size, num_decoder_layers); + } + size_t block_size = DEFAULT_BLOCK_SIZE; + size_t num_decoder_layers = DEFAULT_NUM_DECODER_LAYERS; + ov::genai::CacheEvictionConfig eviction_config = DEFAULT_CACHE_EVICTION_CONFIG; + ov::genai::CacheEvictionAlgorithm algo; + + void evict_twice_and_expect_no_eviction() { + auto blocks_to_evict = algo.evict_logical_blocks(); + for (const auto& evicted_blocks_for_this_layer : blocks_to_evict) { + EXPECT_TRUE(evicted_blocks_for_this_layer.empty()); + } + + // again + blocks_to_evict = algo.evict_logical_blocks(); + for (const auto& evicted_blocks_for_this_layer : blocks_to_evict) { + EXPECT_TRUE(evicted_blocks_for_this_layer.empty()); + } + } +}; + +AttentionScoresForEachDecoderLayer get_mock_scores(size_t num_layers, size_t num_tokens) { + AttentionScoresForEachDecoderLayer retval; + retval.reserve(num_layers); + for (size_t i = 0; i < num_layers; i++) { + auto tensor = ov::Tensor(ov::element::f32, ov::Shape{num_tokens}); + retval.push_back(tensor); + } + return retval; +} + +TEST_F(DefaultCacheEvictionAlgoTest, NothingToEvictInitially) { + evict_twice_and_expect_no_eviction(); +} + +class CacheEvictionAlgoTokenCountParameterizedTest : public DefaultCacheEvictionAlgoTest, public ::testing::WithParamInterface<size_t> {}; + + +TEST_P(CacheEvictionAlgoTokenCountParameterizedTest, DoesntEvictIfTotalSizeNotReached) { + const size_t num_tokens_to_register = GetParam(); + ASSERT_LT(num_tokens_to_register, eviction_config.get_max_cache_size()); + + algo.register_new_token_scores(get_mock_scores(num_decoder_layers, num_tokens_to_register)); + + evict_twice_and_expect_no_eviction(); +} + +INSTANTIATE_TEST_SUITE_P(VariousTokenCountsLessThanTotalSize, 
CacheEvictionAlgoTokenCountParameterizedTest, + ::testing::Values(8, 49, 190)); + + +struct RangeCalculationTestStruct { + size_t num_tokens; + ov::genai::CacheEvictionAlgorithm::CacheEvictionRange expected_range; +}; + +class CacheEvictionRangeCalculationParameterizedTest : public DefaultCacheEvictionAlgoTest, public ::testing::WithParamInterface<RangeCalculationTestStruct> {}; +const std::vector<RangeCalculationTestStruct> RANGE_CALCULATION_TEST_CASES = { + {192, ov::genai::CacheEvictionAlgorithm::CacheEvictionRange(0, 0)}, + {192 + 1, ov::genai::CacheEvictionAlgorithm::CacheEvictionRange(0, 0)}, + {192 + 4, ov::genai::CacheEvictionAlgorithm::CacheEvictionRange(8, 41)}, + {192 + 4 + 1, ov::genai::CacheEvictionAlgorithm::CacheEvictionRange(8, 41)}, + {192 + 2 * 4, ov::genai::CacheEvictionAlgorithm::CacheEvictionRange(8, 42)}, + {192 + 8 * 4 + 3, ov::genai::CacheEvictionAlgorithm::CacheEvictionRange(8, 48)}, +}; +TEST_P(CacheEvictionRangeCalculationParameterizedTest, EvictableRangeCalculatedCorrectly) { + const size_t num_tokens_to_register = GetParam().num_tokens; + + algo.register_new_token_scores(get_mock_scores(num_decoder_layers, num_tokens_to_register)); + auto range = algo.get_evictable_block_range(); + EXPECT_EQ(range.first, GetParam().expected_range.first); + EXPECT_EQ(range.second, GetParam().expected_range.second); +} +INSTANTIATE_TEST_SUITE_P(VariousTokenCounts, CacheEvictionRangeCalculationParameterizedTest, ::testing::ValuesIn(RANGE_CALCULATION_TEST_CASES)); + + +TEST_F(DefaultCacheEvictionAlgoTest, StartsEvictingOnceMaxSizeExceeded) { + // all eviction areas filled, but no overflow yet + algo.register_new_token_scores(get_mock_scores(num_decoder_layers, eviction_config.get_max_cache_size())); + evict_twice_and_expect_no_eviction(); + + // some tokens overflow the combined eviction area size, but the overflow size is less than 1 block + algo.register_new_token_scores(get_mock_scores(num_decoder_layers, eviction_config.get_max_cache_size() + 1)); + evict_twice_and_expect_no_eviction(); + + // same + algo.register_new_token_scores(get_mock_scores(num_decoder_layers, eviction_config.get_max_cache_size() + DEFAULT_BLOCK_SIZE - 1)); + evict_twice_and_expect_no_eviction(); + + // overflowing tokens now fill 1 extra block, all layers should evict 1 block + algo.register_new_token_scores(get_mock_scores(num_decoder_layers, eviction_config.get_max_cache_size() + DEFAULT_BLOCK_SIZE)); + auto evictable_range = algo.get_evictable_block_range(); + EXPECT_EQ(evictable_range.second - evictable_range.first, eviction_config.get_evictable_size() / block_size + 1); + + auto evicted_blocks = algo.evict_logical_blocks(); + EXPECT_TRUE(std::all_of(evicted_blocks.begin(), evicted_blocks.end(), [](const std::set<size_t>& v) { return (v.size() == 1); })); + EXPECT_TRUE(std::all_of(evicted_blocks.begin(), evicted_blocks.end(), [evictable_range](const std::set<size_t>& v) { + size_t evicted_block_idx = *(v.begin()); + return (evicted_block_idx >= evictable_range.first) && (evicted_block_idx < evictable_range.second) ; })); +} + +using CacheEvictionAlgoConfigurationTest = ::testing::TestWithParam<size_t>; + +TEST_P(CacheEvictionAlgoConfigurationTest, EvictedBlocksAreLayeredAsConfigured) { + size_t ref_num_layers = GetParam(); + auto algo = ov::genai::CacheEvictionAlgorithm(DEFAULT_CACHE_EVICTION_CONFIG, DEFAULT_BLOCK_SIZE, ref_num_layers); + auto blocks_to_evict = algo.evict_logical_blocks(); + ASSERT_EQ(blocks_to_evict.size(), ref_num_layers); +} + +INSTANTIATE_TEST_SUITE_P(VariousLayerCounts, 
CacheEvictionAlgoConfigurationTest, ::testing::Values(1, 4, 13, 23, 42)); + + +void fill_scores(ov::Tensor& scores, size_t start_pos, size_t end_pos, float value) { + ASSERT_LE(start_pos, end_pos); + ASSERT_LE(end_pos, scores.get_size()); + + for (size_t i = start_pos; i < end_pos; i++) { + scores.data<float>()[i] = value; + } +} + +struct LowScoreBlocksTestStruct { + std::string test_id; + size_t tokens_over_max_cache_size; + ov::genai::CacheEvictionConfig eviction_config; + std::vector<std::set<size_t>> zero_filled_blocks; + std::vector<std::set<size_t>> ref_evicted_blocks; +}; + +using CacheEvictionLowScoreBlocksParameterizedTest = ::testing::TestWithParam<LowScoreBlocksTestStruct>; +const std::vector<LowScoreBlocksTestStruct> LOW_SCORE_BLOCK_EVICTION_TEST_CASES = { + // low-scored blocks in evictable area + { + "one_block", + 1, // one overflowing token amounting to one extra block to be evicted + DEFAULT_CACHE_EVICTION_CONFIG, + {{17}, {9}}, + {{17}, {9}} + }, + + // same, but with multiple blocks in evictable area + { + "three_blocks", + 2 * 4 + 2, // 2 blocks worth of overflow + 2 tokens, amounting to 3 blocks to be evicted + DEFAULT_CACHE_EVICTION_CONFIG, + {{28, 10, 11}, {18, 8, 31}}, + {{28, 10, 11}, {18, 8, 31}} + }, + // if there are more blocks with same low score than should be evicted, the lower-indexed ones should take precedence + { + "four_zeroed_two_to_evict", + 1 * 4 + 2, // 2 blocks to be evicted + DEFAULT_CACHE_EVICTION_CONFIG, + {{15, 36, 13, 10}, {9, 39, 31, 11}}, // 4 zeroed blocks + {{10, 13}, {9, 11}} + }, + // will prefer to evict lower-indexed blocks if there are multiple same-scored blocks + { + "less_zeroed_than_to_evict", + 5 * 4 + 2, // 6 blocks to be evicted + DEFAULT_CACHE_EVICTION_CONFIG, + {{}, {30, 22}}, // 1st layer has no zeroed blocks, 2nd has only 2 zeroed blocks + {{8, 9, 10, 11, 12, 13}, {8, 9, 10, 11, 22, 30}} // non-zeroed blocks to evict are taken from the beginning of evictable range + }, + + // low-scored blocks in non-evictable range do not lead to eviction + { + "zeros_also_in_non_evictable_areas", + 5 * 4 + 2, // 6 blocks to be evicted + DEFAULT_CACHE_EVICTION_CONFIG, + {{0, 2, 7, 24, 31, 49}, {5, 19, 27, 39, 50, 52}}, // 1st layer has 0, 2, 7 in start_area, 49 in recent_area; 2nd has 5 in start_area, 50, 54 in recent_area + {{8, 9, 10, 11, 24, 31}, {8, 9, 10, 19, 27, 39}} // eviction padded up to 6 blocks by blocks in the beginning of the evictable_area + }, + // more overflowing blocks than evictable area, recent area shifts accordingly to the end of the overflow + { + "more_overflow_than_eviction_blocks", + 4 * 4 + 1, // 5 blocks to be evicted + SHORT_RECENT_EVICTION_CONFIG, + {{0, 9, 10, 11, 13}, {12, 11, 8, 9, 17}}, + {{8, 9, 10, 11, 13}, {8, 9, 10, 11, 12}} + }, +}; + +TEST_P(CacheEvictionLowScoreBlocksParameterizedTest, EvictsLowestScoredBlocks) { + auto test_struct = GetParam(); + size_t num_decoder_layers = DEFAULT_NUM_DECODER_LAYERS; + auto algo = ov::genai::CacheEvictionAlgorithm(test_struct.eviction_config, DEFAULT_BLOCK_SIZE, num_decoder_layers); + std::vector<std::set<size_t>> ref_lowest_scored_block_indices = test_struct.zero_filled_blocks; + ASSERT_EQ(ref_lowest_scored_block_indices.size(), num_decoder_layers); + + auto scores = get_mock_scores(num_decoder_layers, algo.get_max_cache_size_after_eviction() + test_struct.tokens_over_max_cache_size); + for (size_t layer_idx = 0; layer_idx < num_decoder_layers; layer_idx++) { + auto& scores_per_layer = scores[layer_idx]; + // Fill scores of target blocks with 0, the rest with 1 
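+        // A background score of 1.0 is written first; the target blocks are then overwritten with 0.0,
+        // so they become the lowest-scored blocks. Only those inside the evictable range are expected to be evicted.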
+ fill_scores(scores_per_layer, 0, scores_per_layer.get_size(), 1.0); + for (size_t target_block_idx : test_struct.zero_filled_blocks[layer_idx]) { + fill_scores(scores_per_layer, DEFAULT_BLOCK_SIZE * target_block_idx, + DEFAULT_BLOCK_SIZE * (target_block_idx + 1), 0.0); + } + } + algo.register_new_token_scores(scores); + + auto test_evicted_blocks = algo.evict_logical_blocks(); + auto ref_evicted_blocks = test_struct.ref_evicted_blocks; + for (size_t layer_idx = 0; layer_idx < num_decoder_layers; layer_idx++) { + EXPECT_EQ(test_evicted_blocks[layer_idx], ref_evicted_blocks[layer_idx]); + } +} + + +INSTANTIATE_TEST_SUITE_P(VariousSetsOfLowScoreBlocks, CacheEvictionLowScoreBlocksParameterizedTest, + ::testing::ValuesIn(LOW_SCORE_BLOCK_EVICTION_TEST_CASES), + [](const testing::TestParamInfo<CacheEvictionLowScoreBlocksParameterizedTest::ParamType>& info) { + return info.param.test_id; + }); + + +static constexpr size_t BLOCKS_TO_EVICT = 3; // 3 blocks to evict +struct NormalizationSettingTestStruct { + ov::genai::AggregationMode normalization_mode; + double token_score_power; + bool newer_tokens_with_larger_score; + std::array<size_t, BLOCKS_TO_EVICT> ref_evicted_blocks; // will be cast to std::set so order is irrelevant +}; + +using CacheEvictionNormalizationSettingTest = ::testing::TestWithParam<NormalizationSettingTestStruct>; +const std::vector<NormalizationSettingTestStruct> NORMALIZATION_SETTING_TEST_CASES = { + // power of 1.1 beats the 1 / N in the normalization, low-score blocks are in the end of the evictable area + { ov::genai::AggregationMode::NORM_SUM, 1.1, false, { 40, 41, 42} }, + + // newer tokens have larger score, low-score blocks are now in the beginning of the evictable area + { ov::genai::AggregationMode::NORM_SUM, 1.1, true, { 8, 9, 10} }, + + // power of 0.9 does not beat the 1 / N in the normalization, low-score blocks are in the beginning of the evictable area + { ov::genai::AggregationMode::NORM_SUM, 0.9, false, { 8, 9, 10} }, + + // newer tokens have larger score, low-score blocks are now in the beginning of the evictable area + { ov::genai::AggregationMode::NORM_SUM, 0.9, true, { 8, 9, 10} }, + + // for the SUM aggregation mode, only the score curve determines the evicted blocks + { ov::genai::AggregationMode::SUM, 0.9, false, { 40, 41, 42} }, + { ov::genai::AggregationMode::SUM, 0.9, true, { 8, 9, 10} }, + { ov::genai::AggregationMode::SUM, 1.1, false, { 40, 41, 42} }, + { ov::genai::AggregationMode::SUM, 1.1, true, { 8, 9, 10} }, +}; + +TEST_P(CacheEvictionNormalizationSettingTest, TokenLifetimeNormalizationHasEffect) { + const auto& test_struct = GetParam(); + auto config = DEFAULT_CACHE_EVICTION_CONFIG; + config.aggregation_mode = test_struct.normalization_mode; + + const size_t NUM_DECODER_LAYERS = 1; + auto algo = ov::genai::CacheEvictionAlgorithm(config, DEFAULT_BLOCK_SIZE, NUM_DECODER_LAYERS); + auto scores = get_mock_scores(NUM_DECODER_LAYERS, algo.get_max_cache_size_after_eviction() + BLOCKS_TO_EVICT * DEFAULT_BLOCK_SIZE); + for (auto& scores_per_layer : scores) { + const size_t SCORES_SIZE = scores_per_layer.get_size(); + for (size_t i = 0; i < SCORES_SIZE; i++) { + if (test_struct.newer_tokens_with_larger_score) { + fill_scores(scores_per_layer, i, i + 1, std::pow(i, test_struct.token_score_power)); + } else { + fill_scores(scores_per_layer, SCORES_SIZE - i - 1, SCORES_SIZE - i, std::pow(i, test_struct.token_score_power)); + } + } + } + + algo.register_new_token_scores(scores); + auto blocks_to_evict = algo.evict_logical_blocks(); + std::set<size_t> 
ref_evicted_blocks; + for (auto val : test_struct.ref_evicted_blocks) { + ref_evicted_blocks.insert(val); // same for all decoder layers + } + + for (const auto& test_evicted_blocks : blocks_to_evict) { + EXPECT_EQ(test_evicted_blocks, ref_evicted_blocks); + } + +} + +INSTANTIATE_TEST_SUITE_P(VariousAggregationModesAndScoreDistributions, CacheEvictionNormalizationSettingTest, + ::testing::ValuesIn(NORMALIZATION_SETTING_TEST_CASES), + [](const testing::TestParamInfo<CacheEvictionNormalizationSettingTest::ParamType>& info) { + std::stringstream ss; + if (info.param.normalization_mode == ov::genai::AggregationMode::NORM_SUM) { + ss << "norm_sum"; + } + else { + ss << "sum"; + } + ss << "_" << info.param.token_score_power; + + if (info.param.newer_tokens_with_larger_score) { + ss << "_rising"; + } + else { + ss << "_falling"; + } + + std::string retval = ss.str(); + std::replace(retval.begin(), retval.end(), '.', '_'); + return retval; + }); + + + +using CacheEvictionConfigModeCommonBehaviour = ::testing::TestWithParam<ov::genai::AggregationMode>; +const std::vector<ov::genai::AggregationMode> SCORE_ACCUMULATION_TEST_CASES = {ov::genai::AggregationMode::NORM_SUM, + ov::genai::AggregationMode::SUM}; + +TEST_P(CacheEvictionConfigModeCommonBehaviour, ScoresAreAccumulated) { + const auto& aggregation_mode = GetParam(); + + auto config = DEFAULT_CACHE_EVICTION_CONFIG; + config.aggregation_mode = aggregation_mode; + const size_t NUM_DECODER_LAYERS = 1; + auto algo = ov::genai::CacheEvictionAlgorithm(config, DEFAULT_BLOCK_SIZE, NUM_DECODER_LAYERS); + + auto scores_phase_1 = get_mock_scores(NUM_DECODER_LAYERS, algo.get_max_cache_size_after_eviction() + BLOCKS_TO_EVICT * DEFAULT_BLOCK_SIZE); + for (auto& scores_per_layer : scores_phase_1) { + // ones + fill_scores(scores_per_layer, 0, scores_per_layer.get_size(), 1.0); + } + + algo.register_new_token_scores(scores_phase_1); + auto blocks_to_evict_phase_1 = algo.evict_logical_blocks(); + ASSERT_GT(blocks_to_evict_phase_1.size(), 0); + ASSERT_EQ(blocks_to_evict_phase_1[0].size(), BLOCKS_TO_EVICT); + + const std::set<size_t> zeroed_blocks_in_phase_2{14, 3, 17, 21}; // only 14, 17 and 21 are in evictable range + + auto scores_phase_2 = get_mock_scores(NUM_DECODER_LAYERS, algo.get_max_cache_size_after_eviction() + BLOCKS_TO_EVICT * DEFAULT_BLOCK_SIZE); + for (auto& scores_per_layer : scores_phase_2) { + // zeroes for tokens that are expected to be evicted and large background score for the rest + fill_scores(scores_per_layer, 0, scores_per_layer.get_size(), 1000.0); + for (size_t target_block_idx : zeroed_blocks_in_phase_2) { + fill_scores(scores_per_layer, DEFAULT_BLOCK_SIZE * target_block_idx, + DEFAULT_BLOCK_SIZE * (target_block_idx + 1), 0.0); + } + } + + algo.register_new_token_scores(scores_phase_2); + + const std::set<size_t> ref_evicted_blocks = {14, 17, 21}; + + auto blocks_to_evict_phase_2 = algo.evict_logical_blocks(); + + for (const auto& test_evicted_blocks : blocks_to_evict_phase_2) { + EXPECT_EQ(test_evicted_blocks, ref_evicted_blocks); + } + +} + + +INSTANTIATE_TEST_SUITE_P(VariousAggregationModes, CacheEvictionConfigModeCommonBehaviour, + ::testing::ValuesIn(SCORE_ACCUMULATION_TEST_CASES)); + +struct CacheEvictionConfigInitParamsForTest { + size_t start_size; + size_t recent_size; + size_t max_cache_size; +}; + +using CacheEvictionConfigInitializationTest = ::testing::TestWithParam<CacheEvictionConfigInitParamsForTest>; + +const std::vector<CacheEvictionConfigInitParamsForTest> INVALID_CONFIG_INIT_PARAMS_CASES = { + // zero area sizes + 
{32, 32, 64}, + {0, 13, 39}, + {128, 0, 384}, + + // max_cache_size less than start_size + recent_size + {32, 64, 32}, +}; + +TEST_P(CacheEvictionConfigInitializationTest, ThrowsForInvalidConfigParams) { + auto params = GetParam(); + EXPECT_THROW(ov::genai::CacheEvictionConfig(params.start_size, params.recent_size, params.max_cache_size, ov::genai::AggregationMode::NORM_SUM), ov::Exception); +} + +INSTANTIATE_TEST_SUITE_P(VariousInvalidInitParams, CacheEvictionConfigInitializationTest, + ::testing::ValuesIn(INVALID_CONFIG_INIT_PARAMS_CASES)); + +struct CacheEvictionAlgoInitParamsForTest { + ov::genai::CacheEvictionConfig config; + size_t block_size; + size_t num_decoder_layers; +}; + +using CacheEvictionAlgoInitializationTest = ::testing::TestWithParam<CacheEvictionAlgoInitParamsForTest>; + +const std::vector<CacheEvictionAlgoInitParamsForTest> INVALID_ALGO_INIT_PARAMS_CASES = { + // area sizes not multiple of block size + { {32, 32, 97, ov::genai::AggregationMode::SUM}, 16, 8}, + { {11, 13, 50, ov::genai::AggregationMode::NORM_SUM}, 13, 1}, + { {128, 200, 584, ov::genai::AggregationMode::NORM_SUM}, 128, 19}, + + // zero decoder layers + { {32, 64, 192, ov::genai::AggregationMode::SUM}, 32, 0}, +}; +TEST_P(CacheEvictionAlgoInitializationTest, ThrowsForInvalidConfigs) { + auto params = GetParam(); + EXPECT_THROW(ov::genai::CacheEvictionAlgorithm(params.config, params.block_size, params.num_decoder_layers), ov::Exception); +} + +INSTANTIATE_TEST_SUITE_P(VariousInvalidInitParams, CacheEvictionAlgoInitializationTest, + ::testing::ValuesIn(INVALID_ALGO_INIT_PARAMS_CASES)); diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp new file mode 100644 index 0000000000..be764a121d --- /dev/null +++ b/tests/cpp/cache_manager.cpp @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include "openvino/runtime/core.hpp" +#include "scheduler.hpp" +#include "device_config.hpp" +#include "cache_manager.hpp" + +TEST(TestCacheManager, general_test) { + ov::Core core; + ov::genai::SchedulerConfig scheduler_config; + scheduler_config.max_num_batched_tokens = 32; + scheduler_config.num_kv_blocks = 0; + scheduler_config.cache_size = 2; + scheduler_config.block_size = 32; + scheduler_config.max_num_seqs = 2; + + const std::string device = "CPU"; + ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU"); + size_t num_decoder_layers = 12; + device_config.set_model_params(12, 64, num_decoder_layers); + + auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, core); + + size_t allocated_bytes = 0; + for (size_t i = 0; i < num_decoder_layers; i++) { + auto key_cache = cache_manager->get_key_cache(i); + auto value_cache = cache_manager->get_value_cache(i); + allocated_bytes += key_cache.get_byte_size() + value_cache.get_byte_size(); + } + + ASSERT_EQ(allocated_bytes, 2146959360); +} diff --git a/tests/cpp/device_config.cpp b/tests/cpp/device_config.cpp new file mode 100644 index 0000000000..f60e948852 --- /dev/null +++ b/tests/cpp/device_config.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include "openvino/runtime/core.hpp" +#include "scheduler.hpp" +#include "device_config.hpp" + +TEST(TestDeviceConfig, kv_cache_precision_u8) { + ov::Core core; + ov::genai::SchedulerConfig scheduler_config; + scheduler_config.max_num_batched_tokens = 32; + scheduler_config.num_kv_blocks = 0; + 
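+    // With num_kv_blocks set to 0, the number of KV-cache blocks is derived from cache_size (in GB) instead.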
scheduler_config.cache_size = 2; + scheduler_config.block_size = 32; + scheduler_config.max_num_seqs = 2; + + const std::string device = "CPU"; + size_t num_decoder_layers = 12; + size_t head_size = 64, head_size_u8 = head_size + 8; + size_t num_kv_heads = 12; + + ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU"); + device_config_default.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers); + + ov::genai::DeviceConfig device_config_u8(core, scheduler_config, "CPU", { ov::hint::kv_cache_precision(ov::element::u8) }); + device_config_u8.set_model_params(num_kv_heads, head_size, num_decoder_layers); + + const auto ratio = ov::element::f16.size() / ov::element::u8.size(); + ASSERT_EQ(device_config_default.get_num_kv_blocks() * ratio, device_config_u8.get_num_kv_blocks()); +} diff --git a/tests/cpp/generate_config.cpp b/tests/cpp/generate_config.cpp new file mode 100644 index 0000000000..bf11b33e67 --- /dev/null +++ b/tests/cpp/generate_config.cpp @@ -0,0 +1,103 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <gtest/gtest.h> +#include <openvino/core/except.hpp> +#include "openvino/genai/generation_config.hpp" + + +using namespace ov::genai; + +TEST(GenerationConfigTest, invalid_temperature) { + GenerationConfig config; + config.max_new_tokens = 20; + config.temperature = -0.1; + config.do_sample = true; + EXPECT_THROW(config.validate(), ov::Exception); +} + +TEST(GenerationConfigTest, valid_temperature) { + GenerationConfig config; + config.max_new_tokens = 20; + config.do_sample = true; + config.temperature = 0.1; + EXPECT_NO_THROW(config.validate()); +} + +TEST(GenerationConfigTest, invalid_top_p) { + GenerationConfig config; + config.max_new_tokens = 20; + config.do_sample = true; + config.top_p = -0.5; + EXPECT_THROW(config.validate(), ov::Exception); + config.top_p = 1.1; + EXPECT_THROW(config.validate(), ov::Exception); +} + +TEST(GenerationConfigTest, valid_top_p) { + GenerationConfig config; + config.max_new_tokens = 20; + config.do_sample = true; + config.top_p = 0.1; + EXPECT_NO_THROW(config.validate()); +} + +TEST(GenerationConfigTest, invalid_repeatition_penalty) { + GenerationConfig config; + config.max_new_tokens = 20; + config.do_sample = true; + config.repetition_penalty = -3.0; + EXPECT_THROW(config.validate(), ov::Exception); + config.repetition_penalty = -0.1; + EXPECT_THROW(config.validate(), ov::Exception); +} + +TEST(GenerationConfigTest, valid_repeatition_penalty) { + GenerationConfig config; + config.max_new_tokens = 20; + config.do_sample = true; + config.repetition_penalty = 1.8; + EXPECT_NO_THROW(config.validate()); + config.repetition_penalty = 0.1; + EXPECT_NO_THROW(config.validate()); +} + +TEST(GenerationConfigTest, invalid_presence_penalty) { + GenerationConfig config; + config.max_new_tokens = 20; + config.do_sample = true; + config.presence_penalty = 3.0; + EXPECT_THROW(config.validate(), ov::Exception); + config.presence_penalty = -3.1; + EXPECT_THROW(config.validate(), ov::Exception); +} + +TEST(GenerationConfigTest, valid_presence_penalty) { + GenerationConfig config; + config.max_new_tokens = 20; + config.do_sample = true; + config.presence_penalty = 1.8; + EXPECT_NO_THROW(config.validate()); + config.presence_penalty = -2.0; + EXPECT_NO_THROW(config.validate()); +} + +TEST(GenerationConfigTest, invalid_frequency_penalty) { + GenerationConfig config; + config.max_new_tokens = 20; + config.do_sample = true; + config.frequency_penalty = 3.0; + 
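+    // 3.0 and -3.1 lie outside the accepted frequency_penalty range (the matching 'valid' test below uses 1.8 and -2.0).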
EXPECT_THROW(config.validate(), ov::Exception); + config.frequency_penalty = -3.1; + EXPECT_THROW(config.validate(), ov::Exception); +} + +TEST(GenerationConfigTest, valid_frequency_penalty) { + GenerationConfig config; + config.max_new_tokens = 20; + config.do_sample = true; + config.frequency_penalty = 1.8; + EXPECT_NO_THROW(config.validate()); + config.frequency_penalty = -2.0; + EXPECT_NO_THROW(config.validate()); +} diff --git a/tests/cpp/logit_filtering.cpp b/tests/cpp/logit_filtering.cpp new file mode 100644 index 0000000000..66df086fc6 --- /dev/null +++ b/tests/cpp/logit_filtering.cpp @@ -0,0 +1,345 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <gtest/gtest.h> +#include <openvino/core/except.hpp> + +#include "logit_processor.hpp" + +using namespace LogitTransformers; + +struct TemperatureTransformTestStruct { + static inline const size_t size = 3; + + float temperature; + float input[size]; + float expected_output[size]; +}; + +using TemperatureTransformTest = testing::TestWithParam<TemperatureTransformTestStruct>; + +TEST_P(TemperatureTransformTest, TransformResultEqualToReference) { + auto test_struct = GetParam(); + auto logits = Logits(test_struct.input, TemperatureTransformTestStruct::size); + auto transform = TemperatureLogitTransform(test_struct.temperature); + transform.apply(logits); + ASSERT_FALSE(logits.is_vector_initialized()); + ASSERT_EQ(logits.m_size, TemperatureTransformTestStruct::size); // temperature transfrom should not change buffer size + for (size_t i = 0; i < logits.m_size; i++) { + EXPECT_NEAR(logits.m_data[i], test_struct.expected_output[i], 1e-6); + } +} + + +const std::vector<TemperatureTransformTestStruct> TEMPERATURE_TRANSFORM_TEST_CASES = { + {1.0f, { 1.0f, 2.0f, 3.0f }, { 0.090031, 0.244728, 0.665241 } }, + {2.0f, { 3.0f, 2.0f, 1.0f }, { 0.506480, 0.307195, 0.186323 } }, + {1.0f, { 3.0f, 1.0f, 2.0f }, { 0.665241, 0.090031, 0.244728 } }, +}; + +INSTANTIATE_TEST_SUITE_P(VariousInputs, + TemperatureTransformTest, + testing::ValuesIn(TEMPERATURE_TRANSFORM_TEST_CASES)); + + + +struct TopPTestStruct { + static inline const size_t size = 3; + + float top_p; + float input[size]; + std::vector<Token> expected_output; +}; + +using TopPFilteringTest = testing::TestWithParam<TopPTestStruct>; + +TEST_P(TopPFilteringTest, FilterResultEqualToReference) { + auto test_struct = GetParam(); + auto logits = Logits(test_struct.input, TopPTestStruct::size); + auto transform = TopPFilter(test_struct.top_p); + transform.apply(logits); + ASSERT_TRUE(logits.is_vector_initialized()); + ASSERT_EQ(logits.m_size, logits.m_vector.size()); + ASSERT_EQ(logits.m_size, test_struct.expected_output.size()); + for (size_t i = 0; i < logits.m_vector.size(); i++) { + EXPECT_NEAR(logits.m_vector[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6); + EXPECT_EQ(logits.m_vector[i].m_index, test_struct.expected_output[i].m_index); + } +} + + +const std::vector<TopPTestStruct> TOP_P_TRANSFORM_TEST_CASES = { + {0.2f, { 0.090031, 0.244728, 0.665241 }, { {0.665241, 2} } }, + {0.9f, { 0.090031, 0.244728, 0.665241 }, { {0.665241, 2}, {0.244728, 1} } }, +}; + +INSTANTIATE_TEST_SUITE_P(VariousInputs, + TopPFilteringTest, + testing::ValuesIn(TOP_P_TRANSFORM_TEST_CASES)); + + + +struct TopKTestStruct { + static inline const size_t size = 3; + + size_t top_k; + float input[size]; + std::vector<Token> expected_output; +}; + +using TopKFilteringTest = testing::TestWithParam<TopKTestStruct>; + +TEST_P(TopKFilteringTest, 
FilterResultEqualToReference) { + auto test_struct = GetParam(); + auto logits = Logits(test_struct.input, TopKTestStruct::size); + auto transform = TopKFilter(test_struct.top_k); + transform.apply(logits); + ASSERT_TRUE(logits.is_vector_initialized()); + ASSERT_EQ(logits.m_size, logits.m_vector.size()); + ASSERT_EQ(logits.m_size, test_struct.expected_output.size()); + for (size_t i = 0; i < logits.m_vector.size(); i++) { + EXPECT_NEAR(logits.m_vector[i].m_log_prob, test_struct.expected_output[i].m_log_prob, 1e-6); + EXPECT_EQ(logits.m_vector[i].m_index, test_struct.expected_output[i].m_index); + } +} + + +const std::vector<TopKTestStruct> TOP_K_TRANSFORM_TEST_CASES = { + {1, { 0.090031, 0.244728, 0.665241 }, { {0.665241, 2} } }, + {2, { 0.090031, 0.244728, 0.665241 }, { {0.665241, 2}, {0.244728, 1} } }, +}; + +INSTANTIATE_TEST_SUITE_P(VariousInputs, + TopKFilteringTest, + testing::ValuesIn(TOP_K_TRANSFORM_TEST_CASES)); + +TEST(TopKFilteringTest, FilterNotAppliedTopKGreaterThanInputSize) { + float input[]{0.090031, 0.244728, 0.665241}; + float expected_output[]{0.090031, 0.244728, 0.665241}; // no change expected + size_t top_k = 5; + auto logits = Logits(input, 3); + auto transform = TopKFilter(top_k); + transform.apply(logits); + ASSERT_FALSE(logits.is_vector_initialized()); + ASSERT_EQ(logits.m_size, 3); + for (size_t i = 0; i < logits.m_size; i++) { + EXPECT_EQ(logits.m_data[i], expected_output[i]); + } +} + +struct RepetitionPenaltyTransformTestStruct { + static inline const size_t size = 3; + + float penalty; + float input[size]; + TokenIds input_ids; + float expected_output[size]; +}; + +using RepetitionPenaltyTransformTest = testing::TestWithParam<RepetitionPenaltyTransformTestStruct>; + +TEST_P(RepetitionPenaltyTransformTest, TransformResultEqualToReference) { + auto test_struct = GetParam(); + auto logits = Logits(test_struct.input, RepetitionPenaltyTransformTestStruct::size); + auto transform = RepetitionPenaltyTransform(test_struct.penalty); + transform.apply(logits, test_struct.input_ids); + ASSERT_FALSE(logits.is_vector_initialized()); + ASSERT_EQ(logits.m_size, RepetitionPenaltyTransformTestStruct::size); // penalty transfrom should not change buffer size + for (size_t i = 0; i < logits.m_size; i++) { + EXPECT_NEAR(logits.m_data[i], test_struct.expected_output[i], 1e-6); + } +} + + +const std::vector<RepetitionPenaltyTransformTestStruct> REPETITION_PENALTY_TRANSFORM_TEST_CASES = { + RepetitionPenaltyTransformTestStruct{ // basic case, indices are applied, order is left as-is + 1.2f, + { 1.0f, 2.0f, 3.0f }, + TokenIds{ 2, 0 }, + { 0.8333333f, 2.0f, 2.5f } + }, + RepetitionPenaltyTransformTestStruct{ // negative scores case + 2.0f, + { -1.0f, 2.0f, 3.0f }, + TokenIds{ 0, 1 }, + { -2.0f, 1.0f, 3.0f } + }, + RepetitionPenaltyTransformTestStruct{ // repeated tokens in prompt, check that the penalty is only applied once + 0.5f, + { -1.0f, 2.0f, 3.0f }, + TokenIds{ 1, 1 }, + { -1.0f, 4.0f, 3.0f } + }, +}; + +INSTANTIATE_TEST_SUITE_P(VariousInputs, + RepetitionPenaltyTransformTest, + testing::ValuesIn(REPETITION_PENALTY_TRANSFORM_TEST_CASES)); + +TEST(RepetitionPenaltyTransformInitializationTest, ThrowsForInvalidInputIds) { + auto transform = RepetitionPenaltyTransform(1.5); + float input[]{43.0f}; + Logits logits(input, 1); + EXPECT_THROW(transform.apply(logits, {1337}), ov::Exception); + input[0] = {18.0f}; + EXPECT_THROW(transform.apply(logits, {0, -1}), ov::Exception); +} + + +struct FrequencyPenaltyTransformTestStruct { + static inline const size_t size = 3; + + float penalty; 
+ float input[size]; + TokenIds input_ids; + float expected_output[size]; +}; + +using FrequencyPenaltyTransformTest = testing::TestWithParam<FrequencyPenaltyTransformTestStruct>; + +TEST_P(FrequencyPenaltyTransformTest, TransformResultEqualToReference) { + auto test_struct = GetParam(); + auto logits = Logits(test_struct.input, FrequencyPenaltyTransformTestStruct::size); + auto transform = FrequencyPenaltyTransform(test_struct.penalty); + transform.apply(logits, test_struct.input_ids); + ASSERT_FALSE(logits.is_vector_initialized()); + ASSERT_EQ(logits.m_size, FrequencyPenaltyTransformTestStruct::size); // penalty transfrom should not change buffer size + for (size_t i = 0; i < logits.m_size; i++) { + EXPECT_NEAR(logits.m_data[i], test_struct.expected_output[i], 1e-6); + } +}; + + +const std::vector<FrequencyPenaltyTransformTestStruct> FREQUENCY_PENALTY_TRANSFORM_TEST_CASES = { + FrequencyPenaltyTransformTestStruct{ // basic case, indices are applied, order is left as-is + 0.5f, + { -1.0f, 2.0f, 3.0f }, + TokenIds{ 1, 0 }, + { -0.5f, 1.5f, 3.0f } + }, + FrequencyPenaltyTransformTestStruct{ // negative scores case + -0.6f, + { -1.0f, 2.0f, 3.0f }, + TokenIds{ 0, 1, 1 }, + { -1.6f, 3.2f, 3.0f } + }, + FrequencyPenaltyTransformTestStruct{ // repeated tokens in prompt, check that the penalty is only applied once + 0.2f, + { 1.0f, 2.0f, 3.0f }, + TokenIds{ 2, 0, 2 }, + { 0.8f, 2.0f, 2.6f } + }, +}; + +INSTANTIATE_TEST_SUITE_P(VariousInputs, + FrequencyPenaltyTransformTest, + testing::ValuesIn(FREQUENCY_PENALTY_TRANSFORM_TEST_CASES)); + +TEST(FrequencyPenaltyTransformInitializationTest, ThrowsForInvalidInputIds) { + auto transform = FrequencyPenaltyTransform(1.5); + float input[]{43.0f}; + Logits logits(input, 1); + EXPECT_THROW(transform.apply(logits, {1337}), ov::Exception); + input[0] = {18.0f}; + EXPECT_THROW(transform.apply(logits, {0, -1}), ov::Exception); +} + + +struct PresencePenaltyTransformTestStruct { + static inline const size_t size = 3; + + float penalty; + float input[size]; + TokenIds input_ids; + float expected_output[size]; +}; + +using PresencePenaltyTransformTest = testing::TestWithParam<PresencePenaltyTransformTestStruct>; + +TEST_P(PresencePenaltyTransformTest, TransformResultEqualToReference) { + auto test_struct = GetParam(); + auto logits = Logits(test_struct.input, PresencePenaltyTransformTestStruct::size); + auto transform = PresencePenaltyTransform(test_struct.penalty); + transform.apply(logits, test_struct.input_ids); + ASSERT_FALSE(logits.is_vector_initialized()); + ASSERT_EQ(logits.m_size, PresencePenaltyTransformTestStruct::size); // penalty transfrom should not change buffer size + for (size_t i = 0; i < logits.m_size; i++) { + EXPECT_NEAR(logits.m_data[i], test_struct.expected_output[i], 1e-6); + } +}; + + +const std::vector<PresencePenaltyTransformTestStruct> PRESENCE_PENALTY_TRANSFORM_TEST_CASES = { + PresencePenaltyTransformTestStruct{ // basic case, indices are applied, order is left as-is + 0.5f, + { -1.0f, 2.0f, 3.0f }, + TokenIds{ 1, 0 }, + { -0.5f, 1.5f, 3.0f } + }, + PresencePenaltyTransformTestStruct{ // negative scores case + -0.6f, + { -1.0f, 2.0f, 3.0f }, + TokenIds{ 0, 1, 1 }, + { -1.6f, 2.6f, 3.0f } + }, + PresencePenaltyTransformTestStruct{ // repeated tokens in prompt, check that the penalty is only applied once + 0.2f, + { 1.0f, 2.0f, 3.0f }, + TokenIds{ 2, 0, 2 }, + { 0.8f, 2.0f, 2.8f } + }, +}; + +INSTANTIATE_TEST_SUITE_P(VariousInputs, + PresencePenaltyTransformTest, + testing::ValuesIn(PRESENCE_PENALTY_TRANSFORM_TEST_CASES)); + 
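+// For reference (derived from the expected outputs in the penalty test cases above, not from the
+// transform implementations themselves): with count(t) = occurrences of token t in input_ids,
+//   repetition penalty divides positive logits by `penalty` and multiplies negative ones by it,
+//     applied once per distinct token;
+//   frequency penalty subtracts penalty * count(t) from positive logits and adds it to negative ones;
+//   presence penalty subtracts/adds `penalty` once per distinct token, regardless of count(t).
+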
+TEST(PresencePenaltyTransformInitializationTest, ThrowsForInvalidInputIds) { + auto transform = PresencePenaltyTransform(1.5); + float input[]{43.0f}; + Logits logits(input, 1); + EXPECT_THROW(transform.apply(logits, {1337}), ov::Exception); + input[0] = {18.0f}; + EXPECT_THROW(transform.apply(logits, {0, -1}), ov::Exception); +} + +struct EOSPenaltyTransformTestStruct { + static inline const size_t size = 3; + + std::set<int64_t> stop_token_ids; + float input[size]; + float expected_output[size]; +}; + +using EOSPenaltyTransformTest = testing::TestWithParam<EOSPenaltyTransformTestStruct>; + +TEST_P(EOSPenaltyTransformTest, TransformResultEqualToReference) { + auto test_struct = GetParam(); + auto logits = Logits(test_struct.input, EOSPenaltyTransformTestStruct::size); + auto transform = EOSPenaltyTransform(test_struct.stop_token_ids, std::numeric_limits<size_t>::max()); + transform.apply(logits); + ASSERT_FALSE(logits.is_vector_initialized()); + ASSERT_EQ(logits.m_size, EOSPenaltyTransformTestStruct::size); // penalty transfrom should not change buffer size + for (size_t i = 0; i < logits.m_size; i++) { + EXPECT_NEAR(logits.m_data[i], test_struct.expected_output[i], 1e-6); + } +} + + +const std::vector<EOSPenaltyTransformTestStruct> EOS_PENALTY_TRANSFORM_TEST_CASES = { + EOSPenaltyTransformTestStruct{ // basic case, indices are applied, order is left as-is + { 1 }, + { 1.0f, 2.0f, 3.0f }, + { 1.0f, 0.0f, 3.0f }, + }, + EOSPenaltyTransformTestStruct{ + { 1, 0 }, + { 1.0f, 2.0f, 3.0f }, + { 0.0f, 0.0f, 3.0f }, + }, +}; + +INSTANTIATE_TEST_SUITE_P(VariousInputs, + EOSPenaltyTransformTest, + testing::ValuesIn(EOS_PENALTY_TRANSFORM_TEST_CASES)); + diff --git a/tests/cpp/sampler.cpp b/tests/cpp/sampler.cpp new file mode 100644 index 0000000000..f146ab7426 --- /dev/null +++ b/tests/cpp/sampler.cpp @@ -0,0 +1,302 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include <gtest/gtest.h> +#include "sampler.hpp" +#include "openvino/genai/generation_config.hpp" + + +using namespace ov::genai; + +TEST(SamplerStopTokenIdsTest, single_stop_token_match) { + std::vector<int64_t> generated_tokens = {3, 4, 5, 6, 7, 8, 9}; + std::set<int64_t> stop_token_ids = {9}; + ASSERT_TRUE(is_stop_token_id_hit(generated_tokens.back(), stop_token_ids)); +} + +TEST(SamplerStopTokenIdsTest, multiple_stop_token_match) { + std::vector<int64_t> generated_tokens = {3, 4, 5, 6, 7, 8, 9}; + std::set<int64_t> stop_token_ids = {7, 8, 9}; + ASSERT_TRUE(is_stop_token_id_hit(generated_tokens.back(), stop_token_ids)); +} + +TEST(SamplerStopTokenIdsTest, single_stop_sequence_no_match) { + std::vector<int64_t> generated_tokens = {3, 4, 5, 6, 7, 8, 9}; + std::set<int64_t> stop_token_ids = { 10 }; + ASSERT_FALSE(is_stop_token_id_hit(generated_tokens.back(), stop_token_ids)); +} + +TEST(SamplerStopTokenIdsTest, multiple_stop_sequence_no_match) { + std::vector<int64_t> generated_tokens = {3, 4, 5, 6, 7, 8, 9}; + std::set<int64_t> stop_token_ids = { 10, 10, 11 }; + ASSERT_FALSE(is_stop_token_id_hit(generated_tokens.back(), stop_token_ids)); +} + +TEST(SamplerValidationMode, gen_phase_to_cut_whole_seq) { + auto sampling_config = ov::genai::greedy(); + // create sequence group with prompt [0, 1, 2, 3, 4] + std::vector<int64_t> input_vector{0, 1, 2, 3, 4}; + ov::Tensor input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data()); + std::vector<SequenceGroup::Ptr> sequence_groups{ + SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32, false)), + }; + + // to emulate processed 
prompt and add next token [ 0 ] + sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); + sequence_groups.front()->update_processed_tokens_num(5); + + // append candidates [ 2, 3, 4 ] + size_t num_validated_tokens = 3; + for (size_t i = 1; i <= num_validated_tokens; ++i) { + sequence_groups.front()->get_sequences().front()->append_token(i + 1, 1.f); + } + + // generated sequence [0, 1, 2, 3, 4] -> [0, 2, 3, 4] + sequence_groups.front()->set_num_validated_tokens(num_validated_tokens); + const auto num_scheduled_tokens = sequence_groups.front()->get_num_available_tokens_for_batching(); + ASSERT_EQ(num_scheduled_tokens, num_validated_tokens + 1); + sequence_groups.front()->schedule_tokens(num_scheduled_tokens); + + // create ref tensor : to generate candidates + next token + std::vector<float> logits = { + 0, 1.f, 0, 0, 0, + 0, 0, 1.f, 0, 0, + 0, 0, 0, 1.f, 0, + 0, 0, 0, 0, 1.f, + }; + + // shape 4 tokens + 1 batch + 5 vocab + ov::Tensor gen_input_ids(ov::element::f32, ov::Shape{4, 1, 5}, logits.data()); + + Sampler sampler; + sampler.sample(sequence_groups, gen_input_ids, true); + + TokenIds actual = sequence_groups.front()->get_sequences().front()->get_generated_ids(), + expected{0, 1}; + ASSERT_EQ(sequence_groups.front()->get_sequences().front()->get_generated_ids(), expected); +} + +TEST(SamplerValidationMode, gen_phase_to_cut_part_seq) { + auto sampling_config = ov::genai::greedy(); + // create sequence group with prompt [0, 1, 2, 3, 4] + std::vector<int64_t> input_vector{0, 1, 2, 3, 4}; + ov::Tensor input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data()); + std::vector<SequenceGroup::Ptr> sequence_groups{ + SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32, false)), + }; + + // to emulate processed prompt and add next token [ 0 ] + sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); + sequence_groups.front()->update_processed_tokens_num(5); + + // append candidates [ 1, 2, 2 ] + size_t num_validated_tokens = 3; + for (size_t i = 1; i <= num_validated_tokens; ++i) { + int64_t token_id = i == num_validated_tokens ? 
i - 1 : i; + sequence_groups.front()->get_sequences().front()->append_token(token_id, 1.f); + } + + // generated sequence [0, 1, 2, 3, 4] -> [0, 1, 2, 2] + sequence_groups.front()->set_num_validated_tokens(num_validated_tokens); + const auto num_scheduled_tokens = sequence_groups.front()->get_num_available_tokens_for_batching(); + ASSERT_EQ(num_scheduled_tokens, num_validated_tokens + 1); + sequence_groups.front()->schedule_tokens(num_scheduled_tokens); + + // create ref tensor : to generate candidates + next token + std::vector<float> logits = { + 0, 1.f, 0, 0, 0, + 0, 0, 1.f, 0, 0, + 0, 0, 0, 1.f, 0, + 0, 0, 0, 0, 1.f, + }; + + // shape 4 tokens + 1 batch + 5 vocab + ov::Tensor gen_input_ids(ov::element::f32, ov::Shape{4, 1, 5}, logits.data()); + + Sampler sampler; + sampler.sample(sequence_groups, gen_input_ids, true); + + TokenIds actual = sequence_groups.front()->get_sequences().front()->get_generated_ids(), + expected{0, 1, 2, 3}; + ASSERT_EQ(sequence_groups.front()->get_sequences().front()->get_generated_ids(), expected); +} + +TEST(SamplerValidationMode, gen_phase) { + auto sampling_config = ov::genai::greedy(); + // create sequence group with prompt [0, 1, 2, 3, 4] + std::vector<int64_t> input_vector{0, 1, 2, 3, 4}; + ov::Tensor input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data()); + std::vector<SequenceGroup::Ptr> sequence_groups{ + SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32, false)), + }; + + // to emulate processed prompt and add next token [ 0 ] + sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); + sequence_groups.front()->update_processed_tokens_num(5); + + // append candidates [ 1, 2, 3 ] + size_t num_validated_tokens = 3; + for (size_t i = 1; i <= num_validated_tokens; ++i) { + sequence_groups.front()->get_sequences().front()->append_token(i, 1.f); + } + + // generated sequence [0, 1, 2, 3, 4] -> [0, 1, 2, 3] + sequence_groups.front()->set_num_validated_tokens(num_validated_tokens); + const auto num_scheduled_tokens = sequence_groups.front()->get_num_available_tokens_for_batching(); + ASSERT_EQ(num_scheduled_tokens, num_validated_tokens + 1); + sequence_groups.front()->schedule_tokens(num_scheduled_tokens); + + // create ref tensor : to generate candidates + next token + std::vector<float> logits = { + 0, 1.f, 0, 0, 0, + 0, 0, 1.f, 0, 0, + 0, 0, 0, 1.f, 0, + 0, 0, 0, 0, 1.f, + }; + + // shape 4 tokens + 1 batch + 5 vocab + ov::Tensor gen_input_ids(ov::element::f32, ov::Shape{4, 1, 5}, logits.data()); + + Sampler sampler; + sampler.sample(sequence_groups, gen_input_ids, true); + + TokenIds actual = sequence_groups.front()->get_sequences().front()->get_generated_ids(), + expected{0, 1, 2, 3, 4}; + ASSERT_EQ(sequence_groups.front()->get_sequences().front()->get_generated_ids(), expected); +} + +TEST(SamplerValidationMode, prompt_phase_to_cut_part_seq) { + auto sampling_config = ov::genai::greedy(); + // create sequence group with prompt [0, 1, 2, 3, 4] + std::vector<int64_t> input_vector{0, 1, 2, 3, 4}; + ov::Tensor input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data()); + std::vector<SequenceGroup::Ptr> sequence_groups{ + SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32, false)), + }; + + // append candidates [ 0, 1, 1 ] + size_t num_validated_tokens = 3; + for (size_t i = 0; i < num_validated_tokens; ++i) { + int64_t token_id = i + 1 == num_validated_tokens ? 
i - 1 : i; + sequence_groups.front()->get_sequences().front()->append_token(token_id, 1.f); + } + + // generated sequence [0, 1, 2, 3, 4] -> [0, 1, 1] + sequence_groups.front()->set_num_validated_tokens(num_validated_tokens); + const auto num_scheduled_tokens = sequence_groups.front()->get_num_available_tokens_for_batching(); + // prompt len + validation + ASSERT_EQ(num_scheduled_tokens, num_validated_tokens + input_vector.size()); + sequence_groups.front()->schedule_tokens(num_scheduled_tokens); + + // create ref tensor : to generate candidates + next token + std::vector<float> logits = { + 0, 1.f, 0, 0, 0, + 0, 0, 1.f, 0, 0, + 0, 0, 0, 1.f, 0, + 0, 0, 0, 0, 1.f, + 1.f, 0, 0, 0, 0, + 0, 1.f, 0, 0, 0, + 0, 0, 1.f, 0, 0, + 0, 0, 0, 1.f, 0, + }; + + // shape 4 tokens + 1 batch + 5 vocab + ov::Tensor gen_input_ids(ov::element::f32, ov::Shape{8, 1, 5}, logits.data()); + + Sampler sampler; + sampler.sample(sequence_groups, gen_input_ids, true); + + TokenIds actual = sequence_groups.front()->get_sequences().front()->get_generated_ids(), + expected{0, 1, 2}; + ASSERT_EQ(sequence_groups.front()->get_sequences().front()->get_generated_ids(), expected); +} + +TEST(SamplerValidationMode, prompt_phase_to_cut_whole_seq) { + auto sampling_config = ov::genai::greedy(); + // create sequence group with prompt [0, 1, 2, 3, 4] + std::vector<int64_t> input_vector{0, 1, 2, 3, 4}; + ov::Tensor input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data()); + std::vector<SequenceGroup::Ptr> sequence_groups{ + SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32, false)), + }; + + // append candidates [ 1, 2, 3 ] + size_t num_validated_tokens = 3; + for (size_t i = 0; i < num_validated_tokens; ++i) { + sequence_groups.front()->get_sequences().front()->append_token(i + 1, 1.f); + } + + // generated sequence [0, 1, 2, 3, 4] -> [1, 2, 3] + sequence_groups.front()->set_num_validated_tokens(num_validated_tokens); + const auto num_scheduled_tokens = sequence_groups.front()->get_num_available_tokens_for_batching(); + // prompt len + validation + ASSERT_EQ(num_scheduled_tokens, num_validated_tokens + input_vector.size()); + sequence_groups.front()->schedule_tokens(num_scheduled_tokens); + + // create ref tensor : to generate candidates + next token + std::vector<float> logits = { + 0, 1.f, 0, 0, 0, + 0, 0, 1.f, 0, 0, + 0, 0, 0, 1.f, 0, + 0, 0, 0, 0, 1.f, + 1.f, 0, 0, 0, 0, + 0, 1.f, 0, 0, 0, + 0, 0, 1.f, 0, 0, + 0, 0, 0, 1.f, 0, + }; + + // shape 4 tokens + 1 batch + 5 vocab + ov::Tensor gen_input_ids(ov::element::f32, ov::Shape{8, 1, 5}, logits.data()); + + Sampler sampler; + sampler.sample(sequence_groups, gen_input_ids, true); + + TokenIds actual = sequence_groups.front()->get_sequences().front()->get_generated_ids(), + expected{0}; + ASSERT_EQ(sequence_groups.front()->get_sequences().front()->get_generated_ids(), expected); +} + +TEST(SamplerValidationMode, prompt_phase) { + auto sampling_config = ov::genai::greedy(); + // create sequence group with prompt [0, 1, 2, 3, 4] + std::vector<int64_t> input_vector{0, 1, 2, 3, 4}; + ov::Tensor input_tensor(ov::element::i64, ov::Shape{1, 5}, input_vector.data()); + std::vector<SequenceGroup::Ptr> sequence_groups{ + SequenceGroup::Ptr(new SequenceGroup(0, input_tensor, sampling_config, 32, false)), + }; + + // append candidates [ 0, 1, 2 ] + size_t num_validated_tokens = 3; + for (size_t i = 0; i < num_validated_tokens; ++i) { + sequence_groups.front()->get_sequences().front()->append_token(i, 1.f); + } + + // generated sequence [0, 1, 2, 3, 4] -> 
[0, 1, 2] + sequence_groups.front()->set_num_validated_tokens(num_validated_tokens); + const auto num_scheduled_tokens = sequence_groups.front()->get_num_available_tokens_for_batching(); + // prompt len + validation + ASSERT_EQ(num_scheduled_tokens, num_validated_tokens + input_vector.size()); + sequence_groups.front()->schedule_tokens(num_scheduled_tokens); + + // create ref tensor : to generate candidates + next token + std::vector<float> logits = { + 0, 1.f, 0, 0, 0, + 0, 0, 1.f, 0, 0, + 0, 0, 0, 1.f, 0, + 0, 0, 0, 0, 1.f, + 1.f, 0, 0, 0, 0, + 0, 1.f, 0, 0, 0, + 0, 0, 1.f, 0, 0, + 0, 0, 0, 1.f, 0, + }; + + // shape 4 tokens + 1 batch + 5 vocab + ov::Tensor gen_input_ids(ov::element::f32, ov::Shape{8, 1, 5}, logits.data()); + + Sampler sampler; + sampler.sample(sequence_groups, gen_input_ids, true); + + TokenIds actual = sequence_groups.front()->get_sequences().front()->get_generated_ids(), + expected{0, 1, 2, 3}; + ASSERT_EQ(sequence_groups.front()->get_sequences().front()->get_generated_ids(), expected); +} diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp new file mode 100644 index 0000000000..276c53ba9a --- /dev/null +++ b/tests/cpp/scheduler.cpp @@ -0,0 +1,969 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include <gtest/gtest.h> +#include "openvino/runtime/core.hpp" +#include "openvino/genai/continuous_batching_pipeline.hpp" +#include "openvino/genai/generation_config.hpp" +#include "sequence_group.hpp" +#include "scheduler.hpp" + +using namespace ov::genai; + +void clear_finished_sequences(std::vector<SequenceGroup::Ptr>& requests) { + auto new_end = std::remove_if(requests.begin(), requests.end(), [] (SequenceGroup::CPtr seq_group) -> bool { + return seq_group->has_finished(); + }); + requests.erase(new_end, requests.end()); +} + +TEST(TestScheduler, general_test) { + std::array<SchedulerConfig, 2> configs = {SchedulerConfig(), SchedulerConfig()}; + configs.at(0).max_num_batched_tokens = 32; + configs.at(0).num_kv_blocks = 6; + configs.at(0).block_size = 4; + configs.at(0).dynamic_split_fuse = false; + configs.at(0).max_num_seqs = 5; + configs.at(1).max_num_batched_tokens = 32; + configs.at(1).num_kv_blocks = 6; + configs.at(1).block_size = 4; + configs.at(1).dynamic_split_fuse = true; + configs.at(1).max_num_seqs = 5; + for (auto scheduler_config: configs) { + std::vector<uint64_t> tokens = {0,1,2,3,4,5,6,7}; + SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx0 = (*sequence_group1)[0]->get_id(); + SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx1 = (*sequence_group2)[0]->get_id(); + SequenceGroup::Ptr sequence_group3 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx2 = (*sequence_group3)[0]->get_id(); + std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2, sequence_group3}; + + + // schedule 3 sequence groups that use 6 kv blocks + Scheduler scheduler = Scheduler(scheduler_config); + auto out1 = scheduler.schedule(requests); + + std::vector<uint64_t> ref_ids = {0, 1, 2}; + 
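The SamplerValidationMode expectations above (gen_phase, gen_phase_to_cut_part_seq, gen_phase_to_cut_whole_seq) follow from a simple greedy acceptance rule: each speculative candidate is kept only while it equals the argmax of the logits row produced for the position before it; at the first mismatch the remaining candidates are discarded and the argmax token is taken instead, and when every candidate matches, one extra token is greedily sampled from the last row. A sketch of that rule under the assumptions of these tests (row-major logits with `candidates.size() + 1` rows; `greedy_validate` is an invented helper, not the Sampler API):

```cpp
// Hypothetical helper illustrating greedy validation of speculative candidates;
// not part of ov::genai, name and signature are invented for this sketch.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int64_t> greedy_validate(const std::vector<float>& logits,   // row-major [rows x vocab]
                                     size_t vocab_size,
                                     const std::vector<int64_t>& candidates) {
    auto argmax_of_row = [&](size_t row) {
        size_t best = 0;
        for (size_t v = 1; v < vocab_size; ++v)
            if (logits[row * vocab_size + v] > logits[row * vocab_size + best])
                best = v;
        return static_cast<int64_t>(best);
    };

    std::vector<int64_t> accepted;
    for (size_t i = 0; i < candidates.size(); ++i) {
        int64_t predicted = argmax_of_row(i);
        if (candidates[i] != predicted) {
            accepted.push_back(predicted);   // first mismatch: cut the tail, keep the model's token
            return accepted;
        }
        accepted.push_back(candidates[i]);
    }
    accepted.push_back(argmax_of_row(candidates.size()));  // all candidates accepted: one more token
    return accepted;
}

int main() {
    // One-hot logits with 4 rows over a vocab of 5, as in gen_phase_to_cut_whole_seq.
    std::vector<float> logits = {
        0, 1, 0, 0, 0,
        0, 0, 1, 0, 0,
        0, 0, 0, 1, 0,
        0, 0, 0, 0, 1,
    };
    auto accepted = greedy_validate(logits, 5, /*candidates=*/{2, 3, 4});
    // accepted == {1}: candidate 2 contradicts the model's argmax 1, so the tail is cut;
    // together with the previously accepted token 0 this yields the expected {0, 1}.
    std::printf("accepted %zu token(s), first = %lld\n",
                accepted.size(), static_cast<long long>(accepted.front()));
}
```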
EXPECT_EQ(out1.m_scheduled_sequence_groups_ids, ref_ids); + EXPECT_EQ(out1.m_block_tables[idx0][0].size(), 2); + EXPECT_EQ(out1.m_block_tables[idx1][0].size(), 2); + EXPECT_EQ(out1.m_block_tables[idx2][0].size(), 2); + // tokens.size() * 2 tokens should be scheduled on prompt phase, corresponding to first three sequences + EXPECT_EQ(out1.m_total_num_scheduled_tokens, tokens.size() * 3); + EXPECT_EQ(out1.is_prompt, !scheduler_config.dynamic_split_fuse); + + for (auto seq: requests) { + std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences(); + // prompt phase + seq->finish_iteration(); + } + + // at this point we scheduled all available kv blocks + + // sequence_group3 should be evicted + auto out3 = scheduler.schedule(requests); + + for (auto seq: requests) { + std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences(); + // generate phase, append a token to each sequence + running_sequences[0]->append_token(16, 0.9); + seq->finish_iteration(); + } + + std::vector<uint64_t> ref_ids2 = {0, 1}; + EXPECT_EQ(out3.m_scheduled_sequence_groups_ids, ref_ids2); + EXPECT_EQ(out3.m_block_tables[idx0][0].size(), 3); + EXPECT_EQ(out3.m_block_tables[idx1][0].size(), 3); + // 2 tokens should be scheduled on generate phase for "0" and "1" sequence, "2" sequence should be preempted + EXPECT_EQ(out3.m_total_num_scheduled_tokens, 2); + EXPECT_FALSE(out3.is_prompt); + + // check that scheduler has no block table for sequence_group3 + EXPECT_FALSE(scheduler.has_block_table(idx2)); + + // finish first sequence + requests[0]->get_running_sequences()[0]->set_status(SequenceStatus::FINISHED); + scheduler.free_sequence(idx0); + clear_finished_sequences(requests); + // KV blocks 0,1,5 are free now + + + auto out4 = scheduler.schedule(requests); + + // check that sequence_group3 is fully scehuled + EXPECT_EQ(out4.m_block_tables[idx2][0].size(), 2); + EXPECT_FALSE(out4.m_block_tables[idx2][0][0]->is_free()); + EXPECT_EQ(out4.m_block_tables[idx2][0][0]->get_index(), 0); + EXPECT_FALSE(out4.m_block_tables[idx2][0][1]->is_free()); + EXPECT_EQ(out4.m_block_tables[idx2][0][1]->get_index(), 1); + + // requests1[1] should be fully scheduled plus 1 slot for requests[0] for generate phase + EXPECT_EQ(out4.m_total_num_scheduled_tokens, requests[1]->get_context_len() + 1); + EXPECT_EQ(out4.is_prompt, false); + } + +} + +SchedulerConfig get_scheduler_config(size_t max_num_batched_tokens, + size_t num_kv_blocks, + size_t block_size, + bool dynamic_split_fuse, + size_t max_num_seqs, + std::optional<ov::genai::CacheEvictionConfig> cache_eviction_config = std::nullopt) { + auto retval = SchedulerConfig(); + retval.max_num_batched_tokens = max_num_batched_tokens; + retval.num_kv_blocks = num_kv_blocks; + retval.block_size = block_size; + retval.dynamic_split_fuse = dynamic_split_fuse; + retval.max_num_seqs = max_num_seqs; + retval.use_cache_eviction = false; + if (cache_eviction_config.has_value()) { + retval.cache_eviction_config = cache_eviction_config.value(); + } + return retval; +} + +const ov::genai::CacheEvictionConfig LONG_EVICTION_CONFIG = ov::genai::CacheEvictionConfig(32, 32, 128, ov::genai::AggregationMode::NORM_SUM); + + +using AppendSlotsSchedulerTest = ::testing::TestWithParam<SchedulerConfig>; +const std::vector<SchedulerConfig> APPEND_SLOTS_TEST_CASES = { + get_scheduler_config(32, 5, 4, false, 5), + get_scheduler_config(32, 5, 4, true, 5), +}; + +TEST_P(AppendSlotsSchedulerTest, test_append_slots_considers_all_sequences) { + auto scheduler_config = GetParam(); + 
std::vector<uint64_t> tokens = {0,1,2,3,4,5,6,7}; + SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx0 = (*sequence_group1)[0]->get_id(); + SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx1 = (*sequence_group2)[0]->get_id(); + std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2}; + + Scheduler scheduler = Scheduler(scheduler_config); + auto out1 = scheduler.schedule(requests); + + std::vector<uint64_t> ref_ids = {0, 1}; + EXPECT_EQ(out1.m_scheduled_sequence_groups_ids, ref_ids); + EXPECT_EQ(out1.m_block_tables[idx0][0].size(), 2); + EXPECT_EQ(out1.m_block_tables[idx1][0].size(), 2); + EXPECT_FALSE(out1.m_block_tables[idx0][0][0]->is_free()); + EXPECT_EQ(out1.m_block_tables[idx0][0][0]->get_index(), 0); + EXPECT_FALSE(out1.m_block_tables[idx0][0][1]->is_free()); + EXPECT_EQ(out1.m_block_tables[idx0][0][1]->get_index(), 1); + EXPECT_FALSE(out1.m_block_tables[idx1][0][0]->is_free()); + EXPECT_EQ(out1.m_block_tables[idx1][0][0]->get_index(), 2); + EXPECT_FALSE(out1.m_block_tables[idx1][0][1]->is_free()); + EXPECT_EQ(out1.m_block_tables[idx1][0][1]->get_index(), 3); + EXPECT_EQ(out1.m_total_num_scheduled_tokens, tokens.size() * 2); + EXPECT_EQ(out1.is_prompt, !scheduler_config.dynamic_split_fuse); + for (auto seq: requests) { + std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences(); + // prompt phase + seq->finish_iteration(); + } + + // at this point we used 4/5 KV blocks. Both sequences require new KV block, but we have space for only one. 
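The 4/5 figure in the comment above, and the block-table sizes asserted throughout these scheduler tests, come from ceil-dividing each sequence's context length by `SchedulerConfig::block_size`. A back-of-the-envelope sketch of that accounting (assumed arithmetic only, not the scheduler's internal block management):

```cpp
// Block accounting implied by the assertions in these tests (illustrative only).
#include <cstddef>
#include <cstdio>

size_t blocks_needed(size_t context_len, size_t block_size) {
    return (context_len + block_size - 1) / block_size;  // ceil division
}

int main() {
    const size_t block_size = 4, total_blocks = 5;
    // Prompt phase: two 8-token prompts occupy 2 blocks each -> 4 of the 5 blocks are in use.
    std::printf("used: %zu/%zu\n", 2 * blocks_needed(8, block_size), total_blocks);
    // Generate phase: each sequence grows to 9 tokens and needs a 3rd block,
    // but only one block is free, so only one of the two sequences can be extended.
    std::printf("blocks for 9 tokens: %zu\n", blocks_needed(9, block_size));  // 3
}
```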
+ auto out2 = scheduler.schedule(requests); + + // 1-st sequence now should use 3 kv-blocks + EXPECT_EQ(out2.m_block_tables[idx0][0].size(), 3); + EXPECT_FALSE(out2.m_block_tables[idx0][0][0]->is_free()); + EXPECT_EQ(out2.m_block_tables[idx0][0][0]->get_index(), 0); + EXPECT_FALSE(out2.m_block_tables[idx0][0][1]->is_free()); + EXPECT_EQ(out2.m_block_tables[idx0][0][1]->get_index(), 1); + EXPECT_FALSE(out2.m_block_tables[idx0][0][2]->is_free()); + EXPECT_EQ(out2.m_block_tables[idx0][0][2]->get_index(), 4); + + // 1 token was scheduled for generate phase + EXPECT_EQ(out2.m_total_num_scheduled_tokens, 1); + + EXPECT_FALSE(out2.is_prompt); +} + +INSTANTIATE_TEST_SUITE_P(VariousSchedulerConfigs, AppendSlotsSchedulerTest, + ::testing::ValuesIn(APPEND_SLOTS_TEST_CASES)); + +using PartialPreemptionSchedulerTest = ::testing::TestWithParam<SchedulerConfig>; +const std::vector<SchedulerConfig> PARTIAL_PREEMPTION_TEST_CASES = { + get_scheduler_config(32, 6, 4, false, 5), + get_scheduler_config(32, 6, 4, true, 5), + + // Cache eviction should not impact preemption for cache eviction's max_cache_size larger than the sequence lengths at preemption time + get_scheduler_config(32, 6, 4, false, 5, LONG_EVICTION_CONFIG), + get_scheduler_config(32, 6, 4, true, 5, LONG_EVICTION_CONFIG) +}; + +TEST_P(PartialPreemptionSchedulerTest, test_partial_preemption) { + auto scheduler_config = GetParam(); + std::vector<uint64_t> tokens1 = {0,1,2,3,4,5,6,7,8,9,10}; + SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens1.size()}, tokens1.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + std::vector<uint64_t> tokens2 = {0,1,2,3,4,5,6,7}; + auto idx0 = (*sequence_group1)[0]->get_id(); + SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens2.size()}, tokens2.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx1 = (*sequence_group2)[0]->get_id(); + std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2}; + + + // schedule 2 sequence groups that use 5 kv blocks + Scheduler scheduler = Scheduler(scheduler_config); + auto out0 = scheduler.schedule(requests); + + for (auto seq: requests) { + std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences(); + // prompt phase + seq->finish_iteration(); + } + + + // schedule generate, all 6 kv blocks are used. 
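The partial-preemption checks that follow boil down to how many tokens survive when the lower-priority sequence hands blocks back: only whole trailing blocks are released, so any tokens that lived in them must be recomputed once the sequence is scheduled again (down to recomputing the entire context when no blocks are kept, which is full preemption). A small sketch of that bookkeeping, with an invented helper name:

```cpp
// How many tokens must be recomputed after a sequence is preempted down to `kept_blocks`
// (a sketch of the bookkeeping implied by the assertions in this test, not the real scheduler code).
#include <cstddef>
#include <cstdio>

size_t tokens_to_recompute(size_t context_len, size_t kept_blocks, size_t block_size) {
    size_t kept_tokens = kept_blocks * block_size;
    return context_len > kept_tokens ? context_len - kept_tokens : 0;
}

int main() {
    // sequence_group2 in this test: 8 prompt tokens + 1 generated = 9 tokens in 3 blocks of size 4.
    // Partial preemption releases the last block, keeping 2 blocks (8 tokens),
    // so exactly one token is rescheduled later ("last token should be recomputed").
    std::printf("%zu\n", tokens_to_recompute(9, 2, 4));  // 1
    // Full preemption (kept_blocks == 0) forces the whole context to be recomputed.
    std::printf("%zu\n", tokens_to_recompute(9, 0, 4));  // 9
}
```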
+ auto out1 = scheduler.schedule(requests); + + for (auto seq: requests) { + std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences(); + // generate phase + running_sequences[0]->append_token(16, 0.9); + seq->finish_iteration(); + } + + // sequence_group2 should be partially preempted + auto out2 = scheduler.schedule(requests); + + std::vector<uint64_t> ref_ids = {0}; + EXPECT_EQ(out2.m_scheduled_sequence_groups_ids, ref_ids); + auto block_table1 = scheduler.get_block_tables(*(*sequence_group1)[0])[0]; + auto block_table2 = scheduler.get_block_tables(*(*sequence_group2)[0])[0]; + EXPECT_EQ(block_table1.size(), 4); + EXPECT_EQ(block_table1[0]->get_index(), 0); + EXPECT_EQ(block_table1[1]->get_index(), 1); + EXPECT_EQ(block_table1[2]->get_index(), 2); + EXPECT_EQ(block_table1[3]->get_index(), 5); + EXPECT_EQ(block_table2.size(), 2); + EXPECT_EQ(block_table2[0]->get_index(), 3); + EXPECT_EQ(block_table2[1]->get_index(), 4); + + EXPECT_EQ(out2.m_total_num_scheduled_tokens, 1); + EXPECT_EQ(out2.m_block_tables[idx0][0][0]->get_index(), 0); + EXPECT_EQ(out2.m_block_tables[idx0][0][1]->get_index(), 1); + EXPECT_EQ(out2.m_block_tables[idx0][0][2]->get_index(), 2); + EXPECT_EQ(out2.m_block_tables[idx0][0][3]->get_index(), 5); + + // finish first sequence + requests[0]->get_running_sequences()[0]->set_status(SequenceStatus::FINISHED); + scheduler.free_sequence(idx0); + clear_finished_sequences(requests); + // KV blocks 0,1,2,5 are free now + + // sequence_group2 should be scheduled + auto out3 = scheduler.schedule(requests); + + // last token should be recomputed + EXPECT_EQ(out3.m_total_num_scheduled_tokens, 1); + EXPECT_EQ(out3.m_block_tables[idx1][0][0]->get_index(), 3); + EXPECT_EQ(out3.m_block_tables[idx1][0][1]->get_index(), 4); + EXPECT_EQ(out3.m_block_tables[idx1][0][2]->get_index(), 0); + + block_table2 = scheduler.get_block_tables(*(*sequence_group2)[0])[0]; + EXPECT_EQ(block_table2.size(), 3); + EXPECT_EQ(block_table2[0]->get_index(), 3); + EXPECT_EQ(block_table2[1]->get_index(), 4); + EXPECT_EQ(block_table2[2]->get_index(), 0); + + EXPECT_FALSE(scheduler.has_block_table(idx0)); +} + +INSTANTIATE_TEST_SUITE_P(VariousSchedulerConfigs, PartialPreemptionSchedulerTest , + ::testing::ValuesIn(PARTIAL_PREEMPTION_TEST_CASES)); + +TEST(TestScheduler, test_partial_preemption_beam_search) { + std::array<SchedulerConfig, 2> configs = {SchedulerConfig(), SchedulerConfig()}; + configs.at(0).num_kv_blocks = 10; + configs.at(0).block_size = 4; + configs.at(0).dynamic_split_fuse = false; + configs.at(1).num_kv_blocks = 10; + configs.at(1).block_size = 4; + configs.at(1).dynamic_split_fuse = true; + for (auto scheduler_config: configs) { + std::vector<uint64_t> tokens = {0,1,2,3}; + int64_t token = 4; + + // create beam search group + SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::beam_search(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + sequence_group->set_sequence_group_ptr(sequence_group); + std::vector<SequenceGroup::Ptr> requests = {sequence_group}; + + Scheduler scheduler = Scheduler(scheduler_config); + auto out = scheduler.schedule(requests); + for (auto sequence: sequence_group->get_not_finished_sequences()) { + sequence->append_token(token, 0.7); + } + sequence_group->finish_iteration(); + + // make 2 forked sequence + auto sequence_to_fork = sequence_group->get_running_sequences()[0]; + for (size_t i = 0; i < 2; ++i) { + const auto forked_sequence = 
sequence_group->fork_sequence(sequence_to_fork); + scheduler.fork_sequence(sequence_to_fork->get_id(), forked_sequence->get_id()); + } + size_t num_scheduled_tokens = 4; + + // generate 4 tokens + for (size_t i = 0; i < num_scheduled_tokens; i++) { + scheduler.schedule(requests); + for (auto sequence: sequence_group->get_not_finished_sequences()) { + token += 3; + sequence->append_token(token, 0.5); + } + sequence_group->finish_iteration(); + } + // currently sequence occupies 4 blocks (1 shared, 3 not shared) + + // make another 2 forked sequence + for (size_t i = 0; i < 2; ++i) { + const auto forked_sequence = sequence_group->fork_sequence(sequence_to_fork); + scheduler.fork_sequence(sequence_to_fork->get_id(), forked_sequence->get_id()); + } + + // generate 4 tokens + for (size_t i = 0; i < num_scheduled_tokens; i++) { + scheduler.schedule(requests); + for (auto sequence: sequence_group->get_not_finished_sequences()) { + token += 3; + sequence->append_token(token, 0.5); + } + sequence_group->finish_iteration(); + } + // currently sequence occupies 9 blocks (4 blocks previously created + 5 blocks for each sequence) + + // create group, which requires 1 block + SequenceGroup::Ptr sequence_group_greedy = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + sequence_group_greedy->set_sequence_group_ptr(sequence_group_greedy); + + // set greedy group at the beginning of list to make it higher priority + std::vector<SequenceGroup::Ptr> new_requests = {sequence_group_greedy, sequence_group}; + + // process prompt of greedy group, at this point all blocks are used + scheduler.schedule(new_requests); + sequence_group_greedy->get_sequences()[0]->append_token(token, 0.8); + sequence_group_greedy->finish_iteration(); + + EXPECT_EQ(sequence_group->get_num_processed_tokens(), 12); + EXPECT_EQ(sequence_group->get_context_len(), 12); + + // beam search group should be partially preempted and 5 blocks should be released + out = scheduler.schedule(new_requests); + sequence_group_greedy->get_sequences()[0]->append_token(token, 0.5); + sequence_group_greedy->finish_iteration(); + + EXPECT_EQ(sequence_group->get_num_processed_tokens(), 8); + auto seqs = sequence_group->get_sequences(); + EXPECT_EQ(scheduler.get_block_tables(*seqs[0])[0].size(), 2); + EXPECT_EQ(scheduler.get_block_tables(*seqs[1])[0].size(), 2); + EXPECT_EQ(scheduler.get_block_tables(*seqs[2])[0].size(), 2); + EXPECT_EQ(scheduler.get_block_tables(*seqs[3])[0].size(), 2); + EXPECT_EQ(scheduler.get_block_tables(*seqs[4])[0].size(), 2); + + // append another 20 tokens to greedy group, this should result in usage of all free blocks and + // another partial preemption of beam search group + for (size_t i = 0; i < 20; i++) { + out = scheduler.schedule(new_requests); + sequence_group_greedy->get_sequences()[0]->append_token(token, 0.5); + sequence_group_greedy->finish_iteration(); + } + + EXPECT_EQ(sequence_group->get_num_processed_tokens(), 4); + seqs = sequence_group->get_sequences(); + EXPECT_EQ(scheduler.get_block_tables(*seqs[0])[0].size(), 1); + EXPECT_EQ(scheduler.get_block_tables(*seqs[1])[0].size(), 1); + EXPECT_EQ(scheduler.get_block_tables(*seqs[2])[0].size(), 1); + EXPECT_EQ(scheduler.get_block_tables(*seqs[3])[0].size(), 1); + EXPECT_EQ(scheduler.get_block_tables(*seqs[4])[0].size(), 1); + } +} + +TEST(TestScheduler, test_partially_preempted_prompt) { + std::array<SchedulerConfig, 2> configs = 
{SchedulerConfig(), SchedulerConfig()}; + configs.at(0).max_num_batched_tokens = 32; + configs.at(0).num_kv_blocks = 6; + configs.at(0).block_size = 4; + configs.at(0).dynamic_split_fuse = false; + configs.at(0).max_num_seqs = 5; + configs.at(1).max_num_batched_tokens = 32; + configs.at(1).num_kv_blocks = 6; + configs.at(1).block_size = 4; + configs.at(1).dynamic_split_fuse = true; + configs.at(1).max_num_seqs = 5; + for (auto scheduler_config: configs) { + std::vector<uint64_t> tokens = {0,1,2,3,4,5,6,7,8,9,10,11}; + SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx0 = (*sequence_group1)[0]->get_id(); + SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx1 = (*sequence_group2)[0]->get_id(); + std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2}; + + + // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. + Scheduler scheduler = Scheduler(scheduler_config); + auto out1 = scheduler.schedule(requests); + + for (auto seq: requests) { + std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences(); + // prompt phase + seq->finish_iteration(); + } + + // sequence_group2 should be fully preempted + auto out2 = scheduler.schedule(requests); + + // check that sequence_group1 has one more allocated block + auto block_tables_for_all_layers = scheduler.get_block_tables(*(*sequence_group1)[0]); + auto block_table1 = block_tables_for_all_layers[0]; + EXPECT_EQ(block_table1.size(), 4); + EXPECT_EQ(block_table1[0]->get_index(), 0); + EXPECT_EQ(block_table1[1]->get_index(), 1); + EXPECT_EQ(block_table1[2]->get_index(), 2); + EXPECT_EQ(block_table1[3]->get_index(), 5); + EXPECT_EQ(out2.m_block_tables[idx0][0].size(), 4); + EXPECT_EQ(out2.m_block_tables[idx0][0][0]->get_index(), 0); + EXPECT_EQ(out2.m_block_tables[idx0][0][1]->get_index(), 1); + EXPECT_EQ(out2.m_block_tables[idx0][0][2]->get_index(), 2); + EXPECT_EQ(out2.m_block_tables[idx0][0][3]->get_index(), 5); + + std::vector<uint64_t> ref_ids = {0}; + EXPECT_EQ(out2.m_scheduled_sequence_groups_ids, ref_ids); + EXPECT_EQ(out2.m_total_num_scheduled_tokens, 1); + + if (scheduler_config.dynamic_split_fuse) { + // for dynamic_split_fuse sequence_group2 is preemted partially, part of prompt is left + EXPECT_TRUE(scheduler.has_block_table(idx1)); + auto block_table2 = scheduler.get_block_tables(*(*sequence_group2)[0])[0]; + EXPECT_EQ(block_table2.size(), 2); // full prompt requires 3 blocks, 2 are left in scheduler + + } else { + // for vllm case sequence_group2 is fully preempted + EXPECT_FALSE(scheduler.has_block_table(idx1)); + } + + for (auto seq: requests) { + std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences(); + seq->finish_iteration(); + } + + // finish first sequence + requests[0]->get_running_sequences()[0]->set_status(SequenceStatus::FINISHED); + scheduler.free_sequence(idx0); + clear_finished_sequences(requests); + // KV blocks 0,1,2,5 are free now + + // sequence_group2 should be scheduled + auto out3 = scheduler.schedule(requests); + + if (scheduler_config.dynamic_split_fuse) { + // remaining part of prompt should be scheduled + 
EXPECT_EQ(out3.m_total_num_scheduled_tokens, 4); + } + else { + // prompt should be fully scheduled + EXPECT_EQ(out3.m_total_num_scheduled_tokens, 12); + } + + EXPECT_EQ(out3.m_block_tables[idx1][0][0]->get_index(), 3); + EXPECT_EQ(out3.m_block_tables[idx1][0][1]->get_index(), 4); + EXPECT_EQ(out3.m_block_tables[idx1][0][2]->get_index(), 0); + + auto block_table2 = scheduler.get_block_tables(*(*sequence_group2)[0])[0]; + EXPECT_EQ(block_table2.size(), 3); + EXPECT_EQ(block_table2[0]->get_index(), 3); + EXPECT_EQ(block_table2[1]->get_index(), 4); + EXPECT_EQ(block_table2[2]->get_index(), 0); + + EXPECT_FALSE(scheduler.has_block_table(idx0)); + } +} + +TEST(TestScheduler, prefix_caching_test) { + std::array<SchedulerConfig, 2> configs = {SchedulerConfig(), SchedulerConfig()}; + configs.at(0).max_num_batched_tokens = 32; + configs.at(0).num_kv_blocks = 100; + configs.at(0).block_size = 4; + configs.at(0).dynamic_split_fuse = false; + configs.at(0).max_num_seqs = 5; + configs.at(0).enable_prefix_caching = true; + configs.at(1).max_num_batched_tokens = 32; + configs.at(1).num_kv_blocks = 100; + configs.at(1).block_size = 4; + configs.at(1).dynamic_split_fuse = true; + configs.at(1).max_num_seqs = 5; + configs.at(1).enable_prefix_caching = true; + for (auto scheduler_config: configs) { + std::vector<uint64_t> prompt_tokens = {0,1,2,3,4,5,6,7}; + std::vector<uint64_t> histrory_tokens = {}; + // schedule prompt + Scheduler scheduler = Scheduler(scheduler_config); + + size_t chat_iterations = 10; + + for (size_t chat_iteration = 0; chat_iteration < chat_iterations; chat_iteration++) { + std::vector<uint64_t> tokens = histrory_tokens; + tokens.insert(tokens.end(), prompt_tokens.begin(), prompt_tokens.end()); + SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, + scheduler_config.enable_prefix_caching); + sequence_group->set_sequence_group_ptr(sequence_group); + scheduler.restore_cached_blocks(sequence_group); + std::vector<SequenceGroup::Ptr> requests = {sequence_group}; + + auto out1 = scheduler.schedule(requests); + if (chat_iteration == 0) + EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size()); + else + EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size() + 1); + for (auto seq: requests) { + std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences(); + running_sequences[0]->append_token(23, 0.7); + seq->finish_iteration(); + } + + // schedule generate + size_t num_generate_tokens = 10; + for (size_t i = 0; i < num_generate_tokens; i++) { + auto out2 = scheduler.schedule(requests); + EXPECT_EQ(out2.m_total_num_scheduled_tokens, 1); + for (auto seq: requests) { + std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences(); + running_sequences[0]->append_token(16, 0.9); + seq->finish_iteration(); + } + } + + // finish sequence + auto sequence = requests[0]->get_running_sequences()[0]; + sequence->set_status(SequenceStatus::FINISHED); + auto idx0 = sequence->get_id(); + scheduler.free_sequence(idx0); + auto generated_ids = sequence->get_generated_ids(); + + histrory_tokens.insert(histrory_tokens.end(), prompt_tokens.begin(), prompt_tokens.end()); + histrory_tokens.insert(histrory_tokens.end(), generated_ids.begin(), generated_ids.end()); + } + } + +} + +TEST(TestScheduler, prefix_caching_test_two_identical_sequences) { + std::array<SchedulerConfig, 2> configs = {SchedulerConfig(), SchedulerConfig()}; + 
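The `prompt_tokens.size() + 1` expectation in the prefix-caching test above appears to come from the KV cache of earlier turns being reused: everything already computed is restored, and the one history token whose KV was never written (the last generated token, sampled but never fed back through the model) is recomputed together with the new prompt. A rough sketch of that arithmetic, under those assumptions and with an invented helper:

```cpp
// Rough model of the scheduled-token counts asserted in the prefix-caching tests
// (assumed arithmetic only, not the real block-reuse implementation).
#include <algorithm>
#include <cstddef>
#include <cstdio>

// `cached_len` leading tokens of the context already have KV entries restored from the cache;
// at least one token is always scheduled so there are logits to sample the next token from.
size_t tokens_to_schedule(size_t context_len, size_t cached_len) {
    return std::max<size_t>(1, context_len - cached_len);
}

int main() {
    // First chat turn: 8-token prompt, nothing cached yet.
    std::printf("%zu\n", tokens_to_schedule(8, 0));        // 8
    // Later turn: 19 history tokens of which 18 have computed KV (the very last generated
    // token was sampled but never fed back), plus an 8-token new prompt -> 8 + 1 = 9.
    std::printf("%zu\n", tokens_to_schedule(19 + 8, 18));  // 9
}
```

The same rule yields the single scheduled token expected on the second iteration of the `prefix_caching_with_max_new_tokens_equal_1` test further down, where the whole prompt is already cached but at least one token must still run to produce logits.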
configs.at(0).num_kv_blocks = 100; + configs.at(0).block_size = 4; + configs.at(0).dynamic_split_fuse = false; + configs.at(0).enable_prefix_caching = true; + configs.at(1).num_kv_blocks = 100; + configs.at(1).block_size = 4; + configs.at(1).dynamic_split_fuse = true; + configs.at(1).enable_prefix_caching = true; + for (auto scheduler_config: configs) { + std::vector<uint64_t> prompt_tokens = {0,1,2,3,4,5,6,7}; + std::vector<uint64_t> histrory_tokens = {}; + // schedule prompt + Scheduler scheduler = Scheduler(scheduler_config); + + size_t chat_iterations = 10; + + for (size_t chat_iteration = 0; chat_iteration < chat_iterations; chat_iteration++) { + std::vector<uint64_t> tokens = histrory_tokens; + tokens.insert(tokens.end(), prompt_tokens.begin(), prompt_tokens.end()); + SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, + scheduler_config.enable_prefix_caching); + + SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, + scheduler_config.enable_prefix_caching); + sequence_group1->set_sequence_group_ptr(sequence_group1); + sequence_group2->set_sequence_group_ptr(sequence_group2); + std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2}; + // restore cached blocks + for (auto request: requests) { + scheduler.restore_cached_blocks(request); + } + + // schedule prompt + auto out1 = scheduler.schedule(requests); + if (chat_iteration == 0) + EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size() * 2); + else + EXPECT_EQ(out1.m_total_num_scheduled_tokens, (prompt_tokens.size() + 1) * 2); + for (auto seq: requests) { + std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences(); + running_sequences[0]->append_token(23, 0.7); + seq->finish_iteration(); + } + + // schedule generate + size_t num_generate_tokens = 10; + for (size_t i = 0; i < num_generate_tokens; i++) { + auto out2 = scheduler.schedule(requests); + EXPECT_EQ(out2.m_total_num_scheduled_tokens, 2); + for (auto request: requests) { + std::vector<Sequence::Ptr> running_sequences = request->get_running_sequences(); + running_sequences[0]->append_token(16, 0.9); + request->finish_iteration(); + } + } + + for (auto request: requests) { + // finish sequences + auto sequence = request->get_running_sequences()[0]; + sequence->set_status(SequenceStatus::FINISHED); + auto idx0 = sequence->get_id(); + scheduler.free_sequence(idx0); + } + auto generated_ids = requests[0]->get_sequences()[0]->get_generated_ids(); + + histrory_tokens.insert(histrory_tokens.end(), prompt_tokens.begin(), prompt_tokens.end()); + histrory_tokens.insert(histrory_tokens.end(), generated_ids.begin(), generated_ids.end()); + } + } + +} + + +TEST(TestScheduler, prefix_caching_with_max_new_tokens_equal_1) { + std::array<SchedulerConfig, 2> configs = {SchedulerConfig(), SchedulerConfig()}; + configs.at(0).num_kv_blocks = 10; + configs.at(0).block_size = 32; + configs.at(0).dynamic_split_fuse = false; + configs.at(0).enable_prefix_caching = true; + configs.at(1).num_kv_blocks = 10; + configs.at(1).block_size = 32; + configs.at(1).dynamic_split_fuse = true; + configs.at(1).enable_prefix_caching = true; + for (auto scheduler_config: configs) { + std::vector<uint64_t> prompt_tokens = {0,1,2,3,4,5,6,7}; + // schedule prompt + Scheduler scheduler = 
Scheduler(scheduler_config); + + size_t chat_iterations = 2; + + for (size_t chat_iteration = 0; chat_iteration < chat_iterations; chat_iteration++) { + SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {prompt_tokens.size()}, prompt_tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, + scheduler_config.enable_prefix_caching); + + sequence_group->set_sequence_group_ptr(sequence_group); + std::vector<SequenceGroup::Ptr> requests = {sequence_group}; + // restore cached blocks + for (auto request: requests) { + scheduler.restore_cached_blocks(request); + } + + // schedule prompt + auto out1 = scheduler.schedule(requests); + if (chat_iteration == 0) + EXPECT_EQ(out1.m_total_num_scheduled_tokens, prompt_tokens.size()); + else + EXPECT_EQ(out1.m_total_num_scheduled_tokens, 1); + for (auto seq: requests) { + std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences(); + running_sequences[0]->append_token(23, 0.7); + seq->finish_iteration(); + } + + // In case max_new_tokens == 1 no generate phase happens + + for (auto request: requests) { + // finish sequences + auto sequence = request->get_running_sequences()[0]; + sequence->set_status(SequenceStatus::FINISHED); + auto idx0 = sequence->get_id(); + scheduler.free_sequence(idx0); + } + } + } + +} + +TEST(TestScheduler, test_partially_preempted_prompt_not_allowed) { + SchedulerConfig scheduler_config; + scheduler_config.max_num_batched_tokens = 32; + scheduler_config.num_kv_blocks = 6; + scheduler_config.block_size = 4; + scheduler_config.dynamic_split_fuse = false; + scheduler_config.max_num_seqs = 5; + + std::vector<uint64_t> tokens = {0,1,2,3,4,5,6,7,8,9,10,11}; + SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx0 = (*sequence_group1)[0]->get_id(); + SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx1 = (*sequence_group2)[0]->get_id(); + std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2}; + + + // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. 
+ const bool can_use_partial_preemption = false; + Scheduler scheduler = Scheduler(scheduler_config, 1, can_use_partial_preemption); + auto out1 = scheduler.schedule(requests); + + for (auto req : requests) + req->finish_iteration(); + + // sequence_group2 should be fully preempted + auto out2 = scheduler.schedule(requests); + + // check that sequence_group1 has one more allocated block + auto block_table1 = scheduler.get_block_tables(*(*sequence_group1)[0]); + ASSERT_EQ(block_table1[0].size(), 4); + ASSERT_EQ(block_table1[0][0]->get_index(), 0); + ASSERT_EQ(block_table1[0][1]->get_index(), 1); + ASSERT_EQ(block_table1[0][2]->get_index(), 2); + ASSERT_EQ(block_table1[0][3]->get_index(), 3); + ASSERT_EQ(out2.m_block_tables[idx0][0].size(), 4); + ASSERT_EQ(out2.m_block_tables[idx0][0][0]->get_index(), 0); + ASSERT_EQ(out2.m_block_tables[idx0][0][1]->get_index(), 1); + ASSERT_EQ(out2.m_block_tables[idx0][0][2]->get_index(), 2); + ASSERT_EQ(out2.m_block_tables[idx0][0][3]->get_index(), 3); + + std::vector<uint64_t> ref_ids = {0}; + ASSERT_EQ(out2.m_scheduled_sequence_groups_ids, ref_ids); + ASSERT_EQ(out2.m_total_num_scheduled_tokens, 1); + + // for vllm case sequence_group2 is fully preempted + EXPECT_FALSE(scheduler.has_block_table(idx1)); + + for (auto req : requests) + req->finish_iteration(); + + // finish first sequence + requests[0]->get_running_sequences()[0]->set_status(SequenceStatus::FINISHED); + scheduler.free_sequence(idx0); + clear_finished_sequences(requests); + + // sequence_group2 should be scheduled + auto out3 = scheduler.schedule(requests); + + // prompt should be fully scheduled + ASSERT_EQ(out3.m_total_num_scheduled_tokens, 12); + + ASSERT_EQ(out3.m_block_tables[idx1][0][0]->get_index(), 4); + ASSERT_EQ(out3.m_block_tables[idx1][0][1]->get_index(), 5); + ASSERT_EQ(out3.m_block_tables[idx1][0][2]->get_index(), 0); + + auto block_table2 = scheduler.get_block_tables(*(*sequence_group2)[0]); + ASSERT_EQ(block_table2[0].size(), 3); + ASSERT_EQ(block_table2[0][0]->get_index(), 4); + ASSERT_EQ(block_table2[0][1]->get_index(), 5); + ASSERT_EQ(block_table2[0][2]->get_index(), 0); + + EXPECT_FALSE(scheduler.has_block_table(idx0)); +} + +TEST(TestScheduler, test_partially_preempted_prompt_not_allowed2) { + SchedulerConfig scheduler_config; + scheduler_config.max_num_batched_tokens = 32; + scheduler_config.num_kv_blocks = 6; + scheduler_config.block_size = 4; + scheduler_config.dynamic_split_fuse = false; + scheduler_config.max_num_seqs = 5; + + std::vector<uint64_t> tokens = {0,1,2,3,4,5,6,7,8,9}; + SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx0 = (*sequence_group1)[0]->get_id(); + SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens.size()}, tokens.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx1 = (*sequence_group2)[0]->get_id(); + std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2}; + + // schedule 2 sequence groups that use all available 2*3 kv blocks, we used all available kv-blocks. 
+ const bool can_use_partial_preemption = false; + Scheduler scheduler = Scheduler(scheduler_config, 1, can_use_partial_preemption); + scheduler.schedule(requests); + for (auto req: requests) + req->finish_iteration(); + + scheduler.schedule(requests); + for (auto req: requests) + req->finish_iteration(); + + scheduler.schedule(requests); + for (auto req: requests) + req->finish_iteration(); + + // sequence_group2 should be fully preempted + scheduler.schedule(requests); + for (auto req: requests) + req->finish_iteration(); + + auto out2 = scheduler.schedule(requests); + + // check that sequence_group1 has one more allocated block + auto block_table1 = scheduler.get_block_tables(*(*sequence_group1)[0]); + ASSERT_EQ(block_table1[0].size(), 4); + ASSERT_EQ(block_table1[0][0]->get_index(), 0); + ASSERT_EQ(block_table1[0][1]->get_index(), 1); + ASSERT_EQ(block_table1[0][2]->get_index(), 2); + ASSERT_EQ(block_table1[0][3]->get_index(), 3); + ASSERT_EQ(out2.m_block_tables[idx0][0].size(), 4); + ASSERT_EQ(out2.m_block_tables[idx0][0][0]->get_index(), 0); + ASSERT_EQ(out2.m_block_tables[idx0][0][1]->get_index(), 1); + ASSERT_EQ(out2.m_block_tables[idx0][0][2]->get_index(), 2); + ASSERT_EQ(out2.m_block_tables[idx0][0][3]->get_index(), 3); + + std::vector<uint64_t> ref_ids = {0}; + ASSERT_EQ(out2.m_scheduled_sequence_groups_ids, ref_ids); + ASSERT_EQ(out2.m_total_num_scheduled_tokens, 1); + + // for vllm case sequence_group2 is fully preempted + EXPECT_FALSE(scheduler.has_block_table(idx1)); + + for (auto req: requests) + req->finish_iteration(); + + // finish first sequence + requests[0]->get_running_sequences()[0]->set_status(SequenceStatus::FINISHED); + scheduler.free_sequence(idx0); + clear_finished_sequences(requests); + + // sequence_group2 should be scheduled + auto out3 = scheduler.schedule(requests); + + // prompt should be fully scheduled + generated tokens concatenated to prompt (10 + 2) + ASSERT_EQ(out3.m_total_num_scheduled_tokens, 12); + + ASSERT_EQ(out3.m_block_tables[idx1][0][0]->get_index(), 4); + ASSERT_EQ(out3.m_block_tables[idx1][0][1]->get_index(), 5); + ASSERT_EQ(out3.m_block_tables[idx1][0][2]->get_index(), 0); + + auto block_table2 = scheduler.get_block_tables(*(*sequence_group2)[0]); + ASSERT_EQ(block_table2[0].size(), 3); + ASSERT_EQ(block_table2[0][0]->get_index(), 4); + ASSERT_EQ(block_table2[0][1]->get_index(), 5); + ASSERT_EQ(block_table2[0][2]->get_index(), 0); + + EXPECT_FALSE(scheduler.has_block_table(idx0)); +} + + +std::vector<size_t> _get_indices(const std::vector<KVCacheBlock::Ptr>& block_table_for_layer) { + std::vector<size_t> retval(block_table_for_layer.size()); + for (size_t i = 0; i < block_table_for_layer.size(); i++) { + retval[i] = block_table_for_layer[i]->get_index(); + } + return retval; +} + +Scheduler::Output _schedule_one_mock_generation_token_for_each_sequence_group(Scheduler& scheduler, std::vector<SequenceGroup::Ptr>& requests) { + auto out = scheduler.schedule(requests); + for (auto& req : requests) { + std::vector<Sequence::Ptr> running_sequences = req->get_running_sequences(); + running_sequences[0]->append_token(16, 0.9); + req->finish_iteration(); + } + return out; +} + +TEST(TestScheduler, FullyPreemptsCacheEvictedSequences) { + // NB: only eviction at prompt phase is tested here. 
Eviction during generation would happen only for beam search/parallel sampling cases + // (since greedy sampling doesn't exceed the max cache size at generation phase), but should currently execute the same code path as + // the preemption at prompt stage anyway + SchedulerConfig scheduler_config; + + scheduler_config.max_num_batched_tokens = 32; + scheduler_config.num_kv_blocks = 6; + scheduler_config.block_size = 2; + scheduler_config.dynamic_split_fuse = false; + scheduler_config.max_num_seqs = 5; + scheduler_config.use_cache_eviction = true; + scheduler_config.cache_eviction_config = ov::genai::CacheEvictionConfig(2, 2, 6, ov::genai::AggregationMode::NORM_SUM); + + + std::vector<uint64_t> tokens1 = {0, 1}; // 1 full block + SequenceGroup::Ptr sequence_group1 = std::make_shared<SequenceGroup>(0, + ov::Tensor(ov::element::i64, {tokens1.size()}, + tokens1.data()), + ov::genai::greedy(), + scheduler_config.block_size, + scheduler_config.enable_prefix_caching); + std::vector<uint64_t> tokens2 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; // 5 full blocks, larger than eviction arena size (3 blocks) - will start evicting already at prompt stage + auto idx1 = (*sequence_group1)[0]->get_id(); + SequenceGroup::Ptr sequence_group2 = std::make_shared<SequenceGroup>(1, ov::Tensor(ov::element::i64, {tokens2.size()}, tokens2.data()), + ov::genai::greedy(), scheduler_config.block_size, scheduler_config.enable_prefix_caching); + auto idx2 = (*sequence_group2)[0]->get_id(); + std::vector<SequenceGroup::Ptr> requests = {sequence_group1, sequence_group2}; + + + Scheduler scheduler = Scheduler(scheduler_config); + // prompt phase - schedules 1 block for seq 1, 5 blocks for seq 2 + auto out = scheduler.schedule(requests); + + for (auto seq: requests) { + std::vector<Sequence::Ptr> running_sequences = seq->get_running_sequences(); + seq->finish_iteration(); + } + + // evict 2 blocks from seq 2 immediately to formally satisfy eviction arena size + std::vector<std::set<size_t>> blocks_to_evict(1, {0, 1}); + scheduler.free_blocks_from_sequence(idx2, blocks_to_evict); + sequence_group2->register_token_eviction(2 * 2); + + // 4 blocks are taken up at this stage + + // mock-generate 4 more tokens in the 1-st sequence group so that the remaining 2 blocks are filled up + std::vector<SequenceGroup::Ptr> first_seq_group_only = { requests[0] }; + for (size_t i = 0; i < 4; i++) { + // Since eviction arena size is less than the cache_size - BLOCK_SIZE, no preemption is expected to occur yet + // - tokens are added 1 by 1 and once a new block fills, an older one is evicted automatically + _schedule_one_mock_generation_token_for_each_sequence_group(scheduler, first_seq_group_only); + } + + // ensure we are in expected cache state just before preemption + auto block_table1 = _get_indices(scheduler.get_block_tables(*(*sequence_group1)[0])[0]); + auto block_table2 = _get_indices(scheduler.get_block_tables(*(*sequence_group2)[0])[0]); + + const std::vector<size_t> ref_block_table1{0, 1, 2}; + EXPECT_EQ(block_table1, ref_block_table1); + + const std::vector<size_t> ref_block_table2{3, 4, 5}; + EXPECT_EQ(block_table2, ref_block_table2); + + // Next generation in 1-st sequence group should lead to preemption of 2-nd, but tokens from it were evicted already + // Should ensure that the 2-nd sequence can only be preempted completely + out = _schedule_one_mock_generation_token_for_each_sequence_group(scheduler, requests); + + block_table1 = _get_indices(scheduler.get_block_tables(*(*sequence_group1)[0])[0]); + + const std::vector<size_t> 
ref_block_table1_after_preemption{0, 1, 2, 3}; // 3 was the first to be freed after preemption
+    EXPECT_EQ(block_table1, ref_block_table1_after_preemption);
+    EXPECT_FALSE(scheduler.has_block_table(idx2));
+
+    // finish first sequence
+    requests[0]->get_running_sequences()[0]->set_status(SequenceStatus::FINISHED);
+    scheduler.free_sequence(idx1);
+    clear_finished_sequences(requests);
+
+    // sequence_group2 should be scheduled
+    out = scheduler.schedule(requests);
+
+    // last token should be recomputed
+    EXPECT_FALSE(scheduler.has_block_table(idx1));
+    EXPECT_TRUE(scheduler.has_block_table(idx2));
+    block_table2 = _get_indices(scheduler.get_block_tables(*(*sequence_group2)[0])[0]);
+    const std::vector<size_t> ref_block_table2_after_recompute{4, 5, 0, 1, 2}; // should restore the old state before first eviction in terms of block count
+    EXPECT_EQ(block_table2, ref_block_table2_after_recompute);
+
+}
diff --git a/tests/python_tests/README.md b/tests/python_tests/README.md new file mode 100644 index 0000000000..e5381708de --- /dev/null +++ b/tests/python_tests/README.md @@ -0,0 +1,47 @@
+# OpenVINO™ GenAI Tests
+
+These tests validate the vanilla and continuous batching GenAI APIs.
+
+## Set up the environment
+
+Before running the tests, build or install the OpenVINO GenAI library by following the instructions in the [GenAI Library README](../../src/README.md).
+
+Then install the test requirements:
+```sh
+pip install -r tests/python_tests/requirements.txt
+```
+
+## Run Tests
+
+```sh
+python -m pytest tests/python_tests/ -m precommit
+```
+
+HuggingFace (HF) models downloaded during the tests are saved into the current directory. If you wish to place them somewhere else, set the `GENAI_MODELS_PATH_PREFIX` environment variable, e.g.
+```sh
+GENAI_MODELS_PATH_PREFIX=$HOME/test_models python -m pytest tests/python_tests/ -m precommit
+```
+
+If you have built the GenAI library yourself instead of installing the wheel, set `PYTHONPATH` so that the tests can find the library, e.g.
+```sh
+PYTHONPATH=$PYTHONPATH:.../openvino.genai/build-Release/ python -m pytest tests/python_tests/ -m precommit
+```
+
+## Customise the test run
+
+Tests use `precommit` and `nightly` sets of models. `precommit` contains lightweight models that can be inferred quickly, while `nightly` models are heavier and require more time for inference. To run only specific tests, use the `-k` option; for example, to run only the multibatch and chat tests for nightly models:
+```sh
+python -m pytest tests/python_tests/ -m nightly -k "test_multibatch and test_chat"
+```
+
+To run all tests except beam search:
+```sh
+python -m pytest tests/python_tests/ -m precommit -k "not test_beam_search"
+```
+
+The `--model_ids` argument can be used to run tests only for specific models.
HF model IDs should be separated by spaces, e.g.: +```sh +python -m pytest tests/python_tests/ -m nightly -k "test_multibatch" --model_ids "TinyLlama/TinyLlama-1.1B-Chat-v1.0 Qwen/Qwen2-0.5B-Instruct" +``` + +The list of currently supported `nightly` and `precommit` models can be found in `tests/python_tests/ov_genai_test_utils.py:get_models_list`. diff --git a/tests/python_tests/common.py b/tests/python_tests/common.py new file mode 100644 index 0000000000..897839b454 --- /dev/null +++ b/tests/python_tests/common.py @@ -0,0 +1,440 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import shutil +import pytest + +from optimum.intel import OVModelForCausalLM +from pathlib import Path +from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationResult, GenerationConfig +from transformers import AutoTokenizer, AutoModelForCausalLM +from transformers import GenerationConfig as HFGenerationConfig +from typing import List, Tuple + +TESTS_ROOT = Path(__file__).parent + +def get_greedy() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.max_new_tokens = 30 + return generation_config + +def get_greedy_with_min_and_max_tokens() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.min_new_tokens = 15 + generation_config.max_new_tokens = 30 + return generation_config + +def get_greedy_with_repetition_penalty() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.repetition_penalty = 2.0 + generation_config.max_new_tokens = 30 + return generation_config + +def get_greedy_with_penalties() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.presence_penalty = 2.0 + generation_config.frequency_penalty = 0.2 + generation_config.max_new_tokens = 30 + return generation_config + +def get_greedy_with_single_stop_string() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.min_new_tokens = 15 + generation_config.max_new_tokens = 50 + generation_config.stop_strings = {"anag"} # expected match on "manage" + generation_config.include_stop_str_in_output = True + return generation_config + +def get_greedy_with_multiple_stop_strings() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.min_new_tokens = 1 + generation_config.max_new_tokens = 50 + generation_config.stop_strings = {".", "software", "Intel"} + generation_config.include_stop_str_in_output = True + return generation_config + +def get_greedy_with_multiple_stop_strings_no_match() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.min_new_tokens = 1 + generation_config.max_new_tokens = 50 + generation_config.stop_strings = {"Einstein", "sunny", "geothermal"} + generation_config.include_stop_str_in_output = True + return generation_config + +def get_beam_search() -> GenerationConfig: + generation_config = GenerationConfig() + 
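+ # Group beam search: 6 beams split into 3 diversity groups (2 beams per group); all beams are returned as separate sequences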
generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 + generation_config.max_new_tokens = 30 + generation_config.num_return_sequences = 3 + generation_config.num_return_sequences = generation_config.num_beams + return generation_config + +def get_beam_search_min_and_max_tokens() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 + generation_config.min_new_tokens = 15 + generation_config.max_new_tokens = 30 + generation_config.num_return_sequences = 3 + generation_config.num_return_sequences = generation_config.num_beams + return generation_config + +def get_beam_search_with_single_stop_string() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 + generation_config.max_new_tokens = 50 + generation_config.num_return_sequences = generation_config.num_beams + generation_config.stop_strings = {"open sour"} # expected match on "open source" + generation_config.include_stop_str_in_output = True + return generation_config + +def get_beam_search_with_multiple_stop_strings() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 + generation_config.max_new_tokens = 50 + generation_config.num_return_sequences = generation_config.num_beams + generation_config.stop_strings = {".", "software", "Intel"} + generation_config.include_stop_str_in_output = True + return generation_config + +def get_beam_search_with_multiple_stop_strings_no_match() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 + generation_config.max_new_tokens = 30 + generation_config.num_return_sequences = generation_config.num_beams + generation_config.stop_strings = {"Einstein", "sunny", "geothermal"} + generation_config.include_stop_str_in_output = True + return generation_config + +def get_multinomial_temperature() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.temperature = 0.8 + generation_config.num_return_sequences = 1 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_and_num_return_sequence() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.temperature = 0.7 + generation_config.num_return_sequences = 3 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_and_top_p() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 1 + generation_config.do_sample = True + generation_config.temperature = 0.8 + generation_config.top_p = 0.9 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_and_top_k() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.num_return_sequences = 1 + generation_config.temperature = 0.8 + generation_config.top_k = 2 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_top_p_and_top_k() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.temperature = 0.8 + generation_config.top_p = 0.9 + generation_config.num_return_sequences = 1 + 
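+ # top_k below is applied on top of the nucleus (top_p) filter, so sampling draws from at most 2 candidate tokens per step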
generation_config.top_k = 2 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_and_repetition_penalty() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.num_return_sequences = 1 + generation_config.temperature = 0.8 + generation_config.repetition_penalty = 2.0 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_all_parameters() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.num_return_sequences = 4 + generation_config.temperature = 0.9 + generation_config.top_p = 0.8 + generation_config.top_k = 20 + generation_config.repetition_penalty = 2.0 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_and_frequence_penalty() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.temperature = 0.8 + generation_config.frequency_penalty = 0.5 + generation_config.num_return_sequences = 1 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_temperature_and_presence_penalty() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.do_sample = True + generation_config.temperature = 0.8 + generation_config.presence_penalty = 0.1 + generation_config.num_return_sequences = 1 + generation_config.max_new_tokens = 30 + return generation_config + +def get_multinomial_max_and_min_token() -> GenerationConfig: + multinomial = GenerationConfig() + multinomial.do_sample = True + multinomial.temperature = 0.9 + multinomial.top_p = 0.9 + multinomial.top_k = 20 + multinomial.num_return_sequences = 3 + multinomial.presence_penalty = 0.01 + multinomial.frequency_penalty = 0.1 + multinomial.min_new_tokens = 15 + multinomial.max_new_tokens = 30 + return multinomial + +def get_test_dataset() -> Tuple[List[str], List[GenerationConfig]]: + prompts = [ + "What is OpenVINO?", + "How are you?", + "What is your name?", + "Tell me something about Canada" + ] + generation_configs = [ + get_greedy(), + get_beam_search(), + get_greedy(), + get_beam_search(), + ] + return (prompts, generation_configs) + + +def get_scheduler_config(scheduler_params: dict = None) -> SchedulerConfig: + scheduler_config = SchedulerConfig() + scheduler_config.cache_size = 1 + if scheduler_params is None: + scheduler_config.dynamic_split_fuse = True + # vLLM specific + scheduler_config.max_num_batched_tokens = 256 + scheduler_config.max_num_seqs = 256 + + # Expected number of blocks = text_blocks_n * G * n_prompts, where + # text_blocks_n - number of blocks required for storing prompt and generated text, + # currently it is 1 block for prompt (31 tokens with block_size 32) + 1 block for generated text (max length of generated text - 30 tokens); + # G - number of sequences in a sequence group, for beam search it is 2 (group_size) * 3 (num_groups); + # n_prompts - number of prompts. + # For current parameters in tests the expected number of blocks is approximately 48. 
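+ # Worked example of the estimate above (illustrative only, not computed at runtime): 2 text blocks * 6 sequences (beam search worst case) * 4 prompts = 48 blocks, so num_kv_blocks = 60 below leaves some headroom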
+ scheduler_config.num_kv_blocks = 60 + else: + for param, value in scheduler_params.items(): + setattr(scheduler_config, param, value) + + return scheduler_config + + +def convert_to_hf( + default_generation_config : HFGenerationConfig, + generation_config : GenerationConfig +) -> HFGenerationConfig: + kwargs = {} + + # generic parameters + kwargs['max_length'] = generation_config.max_length + # has higher priority than 'max_length' + kwargs['max_new_tokens'] = generation_config.max_new_tokens + if generation_config.stop_strings: + kwargs['stop_strings'] = generation_config.stop_strings + + # copy default parameters + kwargs['eos_token_id'] = default_generation_config.eos_token_id + kwargs['pad_token_id'] = default_generation_config.pad_token_id + kwargs['repetition_penalty'] = generation_config.repetition_penalty + + if generation_config.num_beams > 1: + # beam search case + kwargs['num_beam_groups'] = generation_config.num_beam_groups + kwargs['num_beams'] = generation_config.num_beams + kwargs['diversity_penalty'] = generation_config.diversity_penalty + kwargs['length_penalty'] = generation_config.length_penalty + kwargs['no_repeat_ngram_size'] = generation_config.no_repeat_ngram_size + kwargs['num_return_sequences'] = generation_config.num_return_sequences + kwargs['output_scores'] = True + elif generation_config.do_sample: + # mulitinomial + kwargs['temperature'] = generation_config.temperature + kwargs['top_k'] = generation_config.top_k + kwargs['top_p'] = generation_config.top_p + kwargs['do_sample'] = generation_config.do_sample + else: + # greedy + pass + + hf_generation_config = HFGenerationConfig(**kwargs) + return hf_generation_config + + +def run_hugging_face( + model, + hf_tokenizer, + prompts: List[str], + generation_configs: List[GenerationConfig], +) -> List[GenerationResult]: + generation_results = [] + for prompt, generation_config in zip(prompts, generation_configs): + inputs = hf_tokenizer(prompt, return_tensors="pt") + prompt_len = inputs['input_ids'].numel() + generate_outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], generation_config=convert_to_hf(model.generation_config, generation_config), + return_dict_in_generate=True, tokenizer=hf_tokenizer) + all_text_batch = hf_tokenizer.batch_decode([generated_ids[prompt_len:] for generated_ids in generate_outputs.sequences], skip_special_tokens=True) + + generation_result = GenerationResult() + generation_result.m_generation_ids = all_text_batch + # sequences_scores are available only for beam search case + if generation_config.is_beam_search(): + generation_result.m_scores = [score for score in generate_outputs.sequences_scores] + generation_results.append(generation_result) + + del hf_tokenizer + del model + + return generation_results + + +def run_continuous_batching( + model_path : Path, + scheduler_config : SchedulerConfig, + prompts: List[str], + generation_configs : List[GenerationConfig] +) -> List[GenerationResult]: + pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {}, {}) + output = pipe.generate(prompts, generation_configs) + del pipe + shutil.rmtree(model_path) + return output + + +def get_models_list(file_name: str): + models = [] + with open(file_name) as f: + for model_name in f: + model_name = model_name.strip() + # skip comment in model scope file + if model_name.startswith('#'): + continue + models.append(model_name) + return models + + +def compare_results(hf_result: GenerationResult, ov_result: GenerationResult, 
generation_config: GenerationConfig): + if generation_config.is_beam_search(): + assert len(hf_result.m_scores) == len(ov_result.m_scores) + for hf_score, ov_score in zip(hf_result.m_scores, ov_result.m_scores): + # Note, that for fp32 / fp16 models scores are different less than 0.001 + assert abs(hf_score - ov_score) < 0.02 + + assert len(hf_result.m_generation_ids) == len(ov_result.m_generation_ids) + for hf_text, ov_text in zip(hf_result.m_generation_ids, ov_result.m_generation_ids): + assert hf_text == ov_text + +def save_ov_model_from_optimum(model, hf_tokenizer, model_path: Path): + model.save_pretrained(model_path) + # convert tokenizers as well + from openvino_tokenizers import convert_tokenizer + from openvino import serialize + tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True, skip_special_tokens=True) + serialize(tokenizer, model_path / "openvino_tokenizer.xml") + serialize(detokenizer, model_path / "openvino_detokenizer.xml") + +def get_model_and_tokenizer(model_id: str, use_optimum = True): + hf_tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) if use_optimum else \ + AutoModelForCausalLM.from_pretrained(model_id) + return model, hf_tokenizer + +def generate_and_compare_with_hf(model_id: str, prompts: List[str], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig, tmp_path: Path): + use_optimum = True + model_path : Path = tmp_path / model_id + model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum) + + if use_optimum: + save_ov_model_from_optimum(model, hf_tokenizer, model_path) + + hf_results = run_hugging_face(model=model, hf_tokenizer=hf_tokenizer, prompts=prompts, generation_configs=generation_configs) + _generate_and_compare_with_reference_results(model_path, prompts, hf_results, generation_configs, scheduler_config) + + +def _generate_and_compare_with_reference_results(model_path: Path, prompts: List[str], reference_results: List[GenerationResult], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): + ov_results : List[GenerationResult] = run_continuous_batching(model_path, scheduler_config, prompts, generation_configs) + + assert len(prompts) == len(reference_results) + assert len(prompts) == len(ov_results) + + for prompt, ref_result, ov_result, generation_config in zip(prompts, reference_results, ov_results, generation_configs): + print(f"Prompt = {prompt}\nref result = {ref_result}\nOV result = {ov_result.m_generation_ids}") + compare_results(ref_result, ov_result, generation_config) + + +def generate_and_compare_with_reference_text(model_path: Path, prompts: List[str], reference_texts_per_prompt: List[List[str]], generation_configs: List[GenerationConfig], scheduler_config: SchedulerConfig): + ov_results : List[GenerationResult] = run_continuous_batching(model_path, scheduler_config, prompts, generation_configs) + + assert len(prompts) == len(reference_texts_per_prompt) + assert len(prompts) == len(ov_results) + + for prompt, ref_texts_for_this_prompt, ov_result, generation_config in zip(prompts, reference_texts_per_prompt, ov_results, generation_configs): + print(f"Prompt = {prompt}\nref text = {ref_texts_for_this_prompt}\nOV result = {ov_result.m_generation_ids}") + + assert len(ref_texts_for_this_prompt) == len(ov_result.m_generation_ids) + for ref_text, ov_text in zip(ref_texts_for_this_prompt, ov_result.m_generation_ids): + assert ref_text == 
ov_text + +def run_test_pipeline(tmp_path: str, model_id: str, scheduler_params: dict = None, generation_config = None): + prompts, generation_configs = get_test_dataset() + scheduler_config = get_scheduler_config(scheduler_params) + + if generation_config is not None: + generation_config.rng_seed = 0 + generation_configs = [generation_config] * len(prompts) + + generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) + + +DEFAULT_SCHEDULER_CONFIG = get_scheduler_config({"num_kv_blocks": 300, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}) + +def get_image_by_link(link): + from PIL import Image + import requests + from openvino import Tensor + import numpy as np + + image = Image.open(requests.get(link, stream=True).raw) + if image.mode != 'RGB': + image = image.convert('RGB') + image_data = np.array((np.array(image.getdata()) - 128).astype(np.byte)).reshape(1, 3, image.size[1], image.size[0]) + return Tensor(image_data) diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py new file mode 100644 index 0000000000..f98f47ecf3 --- /dev/null +++ b/tests/python_tests/conftest.py @@ -0,0 +1,24 @@ +import pytest + + +def pytest_make_parametrize_id(config, val, argname): + if argname in ['prompt', 'prompts', 'batched_prompts']: + return f'{val}' + elif argname == 'model_descr': + return f"{val[0]}" + elif argname == 'chat_config': + return f"{val[0]}" + elif argname in ['stop_criteria', 'generation_config']: + return str(val) + elif isinstance(val, (int, float, str)): + return f'{argname}={val}' + return None + +def pytest_addoption(parser): + parser.addoption("--model_ids", help="Select models to run") + +def pytest_configure(config: pytest.Config): + marker = 'precommit' if config.getoption('-m') == 'precommit' else 'nightly' + pytest.run_marker = marker + pytest.selected_model_ids = config.getoption('--model_ids', default=None) + diff --git a/tests/python_tests/data/long_prompts.txt b/tests/python_tests/data/long_prompts.txt new file mode 100644 index 0000000000..470f22f301 --- /dev/null +++ b/tests/python_tests/data/long_prompts.txt @@ -0,0 +1,15 @@ +As enticing as quantum entanglement is, it also raises philosophical questions regarding the nature of reality. The phenomenon challenges our conventional understanding of causality and the separability of objects. If entangled particles can instantaneously affect each other's states over vast distances, what does this imply about the structure of spacetime? Some interpretations of quantum mechanics, such as the many-worlds interpretation, suggest that every possible outcome occurs in a separate, branching universe, thereby preserving locality while explaining entanglement. +At its core, quantum entanglement occurs when two or more particles become interconnected in such a way that the state of one particle instantaneously influences the state of another, regardless of the distance separating them. This interdependence persists even if the particles are light-years apart, defying classical intuitions about the separability of objects. To understand how this phenomenon arises, we must explore the principles of superposition and measurement in quantum mechanics. +Despite significant advances, there remains much to explore regarding the nature and applications of quantum entanglement. Scientists are actively researching how to harness and manipulate entangled states more effectively. 
As technology improves, we may unlock new ways to utilize this phenomenon for practical applications, such as improved GPS systems, advanced sensors, and even insights into the fabric of spacetime itself. +Entanglement has implications far beyond theoretical physics; it plays a crucial role in the burgeoning field of quantum computing. Classical computers process information in bits that represent either a 0 or a 1. Quantum computers, however, leverage the principles of quantum mechanics to manipulate quantum bits, or qubits, which can represent both 0 and 1 simultaneously due to superposition. Entangled qubits can work together to perform complex calculations at speeds unachievable by classical computers, potentially revolutionizing fields like cryptography, drug discovery, and complex system simulations. +In a small town nestled between rolling hills, there existed a peculiar library known as The Whispering Pages . Its walls were filled with ancient tomes and modern texts alike, each with a story eager to be told . The townsfolk often spoke of the library's magic, claiming that the books would whisper secrets to anyone willing to listen . Many dismissed it as mere folklore, but for twelve-year-old Mira, it was a place of wonder and discovery . Every Saturday, Mira would visit The Whispering Pages after finishing her chores . She had a routine: first, she would greet Mr . Tobias, the elderly librarian with twinkling eyes and a white mustache that danced when he smiled . “Welcome back, Mira,” he would say, as if he had been waiting all week for her return . +Mira would then make her way to her favorite corner, a snug little nook bathed in sunlight, where a massive oak tree outside provided shade and created a cozy atmosphere . On one such Saturday, as Mira settled in with a book about mythical creatures, she began to hear a faint voice . She looked around, but the library was empty except for Mr . Tobias, who was busy sorting through a stack of new arrivals . The voice became clearer, and to her surprise, it was coming from the book itself . "Find the heart that beats beneath the stones," it murmured . Intrigued, Mira leaned closer . She flipped through the pages, and as she did, she noticed something unusual—a map drawn in the margins . It appeared to lead beyond the library, out into the hills that surrounded the town . Her curiosity piqued, she carefully removed the book from the shelf and tucked it under her arm, deciding that after her visit, she would investigate further . +In conclusion, quantum entanglement is a captivating area of study that has reshaped our understanding of the physical universe. It is a testament to the oddities of quantum mechanics, where particles can be deeply connected regardless of the distances that separate them. With ongoing research and technological advancements, the concept of entanglement continues to inspire new theories and applications, offering a glimpse into a future where quantum systems may revolutionize how we process information and interact with the world around us. As we delve deeper into the quantum realm, we uncover not just the intricacies of particles and forces but also fundamental truths about the nature of reality itself. +In quantum mechanics, particles such as electrons or photons exist in a state of superposition. This means they do not have definite properties until measured. For example, an electron can simultaneously have a spin of "up" and "down" until an observation is made. 
When two particles are entangled, their superposed states are linked. If one particle is measured and found to have a specific property, the other particle’s state is determined instantaneously—the spin of the second particle will be opposite that of the first, regardless of the distance between them. +Moreover, quantum entanglement is a critical component of quantum communication. It enables secure transmission of information through techniques like quantum key distribution (QKD). In QKD, two parties can share a secret key using entangled particles. Any attempt by an eavesdropper to intercept or measure the particles will disturb their states, revealing the presence of an unauthorized observer. This technology promises a significant advancement in data security, offering virtually unbreakable encryption. +Once home, Mira spread out her findings across her bedroom floor . The map was rudimentary, marked with simple symbols: a sun, a tree, and an ominous 'X' at the end . It felt like a treasure map, and Mira's imagination began to race . After her parents went to bed, she gathered supplies: a flashlight, a notebook, and a snack for the journey . With her heart racing at the thought of adventure, she headed out into the cool night . The moon illuminated her path as Mira made her way up the hillside, following the map's directions . The night was quiet, with only the sound of rustling leaves and the distant hoot of an owl . As she climbed higher, she felt a growing sense of purpose . “The heart that beats beneath the stones,” she muttered, trying to decipher what the words could mean . After some time, she arrived at a clearing where the ground was carpeted with moss and dotted with smooth stones . The map indicated that she needed to look closely . Mira knelt down to inspect the area and, just as she was about to give up, she heard a soft thump, like the beat of a drum . Surprised, she looked around and found a particularly large stone slightly displaced from the others . +Quantum entanglement is one of the most intriguing phenomena in the realm of quantum mechanics, a branch of physics that describes the behavior of matter and energy at the smallest scales. Developed in the early 20th century, quantum mechanics fundamentally altered our perception of the universe. Unlike classical physics, which dictates that particles have defined positions and velocities, quantum mechanics introduces a level of uncertainty and non-locality. One of the cornerstones of this theory is the concept of entanglement, which Albert Einstein famously referred to as "spooky action at a distance." +The crystal became her talisman, reminding her of her promise and the magic of storytelling—a bridge between the ordinary and the extraordinary, where dreams take flight and every book waited to be opened . ### The Fascinating World of Bioluminescence #### Introduction Bioluminescence is a natural phenomenon that occurs in various organisms, characterized by the ability to emit light . This incredible adaptation can be found in a range of living beings, including certain species of fungi, bacteria, and marine animals . The light produced can serve various purposes such as predation, communication, and camouflage . This article explores the mechanisms, examples, and ecological significance of bioluminescence, shedding light on its role in the natural world . +The process of bioluminescence involves a biochemical reaction between a light-emitting molecule known as luciferin and an enzyme called luciferase . 
This reaction occurs within specialized cells or organelles and typically requires oxygen . The specific structure of luciferin varies among different organisms, leading to a wide range of colors emitted, from blue and green to red and yellow . The basic biochemical reaction can be summarized as follows: 1 . **Formation of Luciferin-Oxygen Complex**: When luciferin reacts with oxygen in the presence of luciferase, it forms an unstable complex . 2 . +The implications of quantum entanglement extend beyond fundamental physics. They intersect with various fields, including thermodynamics, information theory, and even biology. Researchers are exploring the possibility that quantum entanglement plays a role in biological processes, such as photosynthesis and avian navigation. For example, certain birds are thought to navigate using quantum coherence in their eyes. This intriguing intersection of quantum phenomena and biological systems suggests that entanglement may be a universal principle, manifesting in diverse contexts across nature. +The study of entanglement has also led to the exploration of quantum teleportation—the process of transferring quantum states from one location to another without physically moving the particle itself. By creating a pair of entangled particles, where one remains at point A and the other is sent to point B, the state of the particle at point A can be "teleported" to point B through a classical communication channel. This concept is not merely science fiction; researchers have successfully demonstrated teleportation of quantum states in laboratory settings, paving the way for potential advancements in quantum networks. \ No newline at end of file diff --git a/tests/python_tests/data/short_prompts.txt b/tests/python_tests/data/short_prompts.txt new file mode 100644 index 0000000000..d919f62474 --- /dev/null +++ b/tests/python_tests/data/short_prompts.txt @@ -0,0 +1,30 @@ +The Earth revolves around the Sun. +Water is essential for all known forms of life. +The human body is composed of about 60% water. +Photosynthesis allows plants to convert sunlight into energy. +The speed of light is approximately 299,792 kilometers per second. +Ice is less dense than liquid water. +The brain contains around 86 billion neurons. +Honey never spoils due to its low moisture content. +A group of lions is called a pride. +The Great Wall of China is visible from space. +Humans share 99.9% of their DNA with chimpanzees. +The average adult has 206 bones in their body. +Bananas are berries, while strawberries are not. +The Pacific Ocean is the largest ocean on Earth. +Sound travels faster in water than in air. +The Eiffel Tower can be 15 cm taller during the summer. +Cheetahs are the fastest land animals, reaching speeds up to 75 mph. +The longest river in the world is the Nile River. +Penguins are flightless birds that live in the Southern Hemisphere. +Mars has the largest volcano in the solar system, Olympus Mons. +Diamonds are made of carbon atoms arranged in a crystal structure. +A day on Venus is longer than a year on Venus. +The heart beats about 100,000 times a day. +Octopuses have three hearts and blue blood. +Avocados are toxic to some animals, including dogs. +The mitochondria are known as the powerhouse of the cell. +An octet rule states that atoms are most stable when they have eight electrons in their outer shell. +The Sahara Desert is the largest hot desert in the world. +Lightning is hotter than the surface of the sun. 
+Honeybees communicate through a dance known as the waggle dance. diff --git a/tests/python_tests/models/nightly b/tests/python_tests/models/nightly new file mode 100644 index 0000000000..72b707bd63 --- /dev/null +++ b/tests/python_tests/models/nightly @@ -0,0 +1,51 @@ +hf-tiny-model-private/tiny-random-GPTJForCausalLM +hf-tiny-model-private/tiny-random-BartForCausalLM +hf-tiny-model-private/tiny-random-BigBirdForCausalLM +hf-tiny-model-private/tiny-random-BigBirdPegasusForCausalLM +hf-tiny-model-private/tiny-random-BioGptForCausalLM +hf-tiny-model-private/tiny-random-BlenderbotSmallForCausalLM +hf-tiny-model-private/tiny-random-BlenderbotForCausalLM +hf-tiny-model-private/tiny-random-BloomForCausalLM +hf-tiny-model-private/tiny-random-ErnieForCausalLM +hf-tiny-model-private/tiny-random-GPTNeoForCausalLM +hf-tiny-model-private/tiny-random-GPTNeoXForCausalLM +hf-tiny-model-private/tiny-random-GPTNeoXJapaneseForCausalLM +hf-tiny-model-private/tiny-random-MBartForCausalLM +hf-tiny-model-private/tiny-random-MvpForCausalLM +hf-tiny-model-private/tiny-random-PegasusForCausalLM +hf-tiny-model-private/tiny-random-PLBartForCausalLM +hf-tiny-model-private/tiny-random-XGLMForCausalLM +hf-internal-testing/tiny-random-PersimmonForCausalLM +hf-internal-testing/tiny-random-BartForCausalLM +hf-internal-testing/tiny-random-GPTNeoForCausalLM +hf-internal-testing/tiny-random-MptForCausalLM +hf-internal-testing/tiny-random-GPTBigCodeForCausalLM +hf-internal-testing/tiny-random-BloomForCausalLM +hf-internal-testing/tiny-random-GPTJForCausalLM +hf-internal-testing/tiny-random-CohereForCausalLM +hf-internal-testing/tiny-random-FalconForCausalLM +hf-internal-testing/tiny-random-XGLMForCausalLM +hf-internal-testing/tiny-random-PegasusForCausalLM +hf-internal-testing/tiny-random-MBartForCausalLM +hf-internal-testing/tiny-random-BigBirdPegasusForCausalLM +hf-internal-testing/tiny-random-BigBirdForCausalLM +hf-internal-testing/tiny-random-MegaForCausalLM +hf-internal-testing/tiny-random-RobertaPreLayerNormForCausalLM +hf-internal-testing/tiny-random-BioGptForCausalLM +hf-internal-testing/tiny-random-ProphetNetForCausalLM +hf-internal-testing/tiny-random-PLBartForCausalLM +hf-internal-testing/tiny-random-MegatronBertForCausalLM +hf-internal-testing/tiny-random-GPTNeoXJapaneseForCausalLM +hf-internal-testing/tiny-random-ErnieForCausalLM +hf-internal-testing/tiny-random-BlenderbotForCausalLM +hf-internal-testing/tiny-random-BlenderbotSmallForCausalLM +hf-tiny-model-private/tiny-random-CodeGenForCausalLM +hf-tiny-model-private/tiny-random-OPTForCausalLM +hf-internal-testing/tiny-random-MistralForCausalLM +hf-internal-testing/tiny-random-GPTNeoXForCausalLM +hf-internal-testing/tiny-random-LlamaForCausalLM +hf-internal-testing/tiny-random-StableLmForCausalLM +hf-internal-testing/tiny-random-PhiForCausalLM +hf-internal-testing/tiny-random-CodeGenForCausalLM +hf-internal-testing/tiny-random-Starcoder2ForCausalLM +hf-internal-testing/tiny-random-OPTForCausalLM \ No newline at end of file diff --git a/tests/python_tests/models/precommit b/tests/python_tests/models/precommit new file mode 100644 index 0000000000..0b913d3b01 --- /dev/null +++ b/tests/python_tests/models/precommit @@ -0,0 +1,3 @@ +hf-tiny-model-private/tiny-random-CodeGenForCausalLM +hf-tiny-model-private/tiny-random-GPT2LMHeadModel +hf-tiny-model-private/tiny-random-OPTForCausalLM \ No newline at end of file diff --git a/tests/python_tests/models/real_models b/tests/python_tests/models/real_models new file mode 100644 index 0000000000..cbf7424541 --- /dev/null 
+++ b/tests/python_tests/models/real_models @@ -0,0 +1,122 @@ +01-ai/Yi-6B +BAAI/Aquila-7B +BAAI/AquilaChat-7B +BAAI/AquilaChat2-7B +CohereForAI/c4ai-command-r-v01 +EleutherAI/gpt-j-6B +EleutherAI/gpt-j-6b +EleutherAI/gpt-neo-1.3B +EleutherAI/gpt-neo-125m +EleutherAI/gpt-neo-2.7B +EleutherAI/gpt-neox-20b +EleutherAI/pythia-160m +GAIR/Abel-7B-002 +# OrionStarAI/Orion-14B-Base: pip install flash_attn (https://github.com/huggingface/transformers/pull/30954) +PygmalionAI/pygmalion-6b +Qwen/Qwen-7B +Qwen/Qwen-7B-Chat +Qwen/Qwen1.5-0.5B +Qwen/Qwen1.5-1.8B +Qwen/Qwen1.5-7B +Qwen/Qwen1.5-7B-Chat +Qwen/Qwen1.5-MoE-A2.7B +Qwen/Qwen1.5-MoE-A2.7B-Chat +Salesforce/codegen-350M-multi +Salesforce/codegen-350M-nl +Salesforce/codegen2-1b +# Salesforce/xgen-7b-8k-base: Transformers issue - Object of type method is not JSON serializable (https://huggingface.co/Salesforce/xgen-7b-8k-base/discussions/32) +THUDM/chatglm2-6b +THUDM/chatglm3-6b +TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ +TinyLlama/TinyLlama-1.1B-Chat-v0.6 +TinyLlama/TinyLlama-1.1B-Chat-v1.0 +TitanML/tiny-mixtral +WizardLMTeam/WizardCoder-15B-V1.0 +allenai/OLMo-1B-hf +allenai/OLMo-7B-hf +baichuan-inc/Baichuan-7B +baichuan-inc/Baichuan2-7B-Base +baichuan-inc/Baichuan2-7B-Chat +berkeley-nest/Starling-LM-7B-alpha +bigcode/gpt_bigcode-santacoder +bigcode/starcoder2-3b +bigcode/starcoder2-7b +bigcode/starcoderbase-3b +bigscience/bloom-560m +bigscience/bloom-7b1 +bigscience/bloomz-1b7 +bigscience/bloomz-560m +bigscience/bloomz-7b1 +cerebras/Cerebras-GPT-13B +# core42/jais-13b: wrong output with PA +# core42/jais-13b-chat: wrong output with PA +databricks/dolly-v1-6b +databricks/dolly-v2-3b +# deepseek-ai/deepseek-coder-33b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file +# deepseek-ai/deepseek-coder-6.7b-instruct: OpenVINO tokenizers - Cannot convert tokenizer of this type without `.model` file +# deepseek-ai/deepseek-moe-16b-base: optimum - Trying to export a deepseek model, that is a custom or unsupported architecture +# facebook/blenderbot-3B: optimum - IndexError: tuple index out of range +# facebook/incoder-1B: CB - Failed to detect "eos_token_id" in openvino_tokenizer.xml runtime information +facebook/opt-1.3b +facebook/opt-125m +facebook/opt-2.7b +facebook/opt-350m +facebook/opt-6.7b +google/gemma-1.1-7b-it +google/gemma-2b +google/gemma-2b-it +google/gemma-7b +google/pegasus-big_patent +google/pegasus-large +gpt2 +gpt2-xl +internlm/internlm-chat-7b +internlm/internlm2-7b +lmsys/longchat-7b-v1.5-32k +lmsys/vicuna-7b-v1.3 +lmsys/vicuna-7b-v1.5 +meta-llama/CodeLlama-7b-hf +meta-llama/Llama-2-7b-chat-hf +meta-llama/Llama-2-7b-hf +meta-llama/Meta-Llama-3-8B-Instruct +meta-llama/Meta-Llama-3.1-8B +meta-llama/Meta-Llama-3.1-8B-Instruct +microsoft/DialoGPT-large +microsoft/DialoGPT-medium +microsoft/Orca-2-7b +microsoft/Phi-3-mini-128k-instruct +microsoft/Phi-3-mini-4k-instruct +# microsoft/biogpt: OpenVINO Tokenizers - openvino.runtime.exceptions.OVTypeError: Tokenizer type is not supported: <class 'transformers.models.biogpt.tokenization_biogpt.BioGptTokenizer'> +microsoft/phi-1_5 +microsoft/phi-2 +mistralai/Mistral-7B-Instruct-v0.1 +mistralai/Mistral-7B-v0.1 +mistralai/Mixtral-8x7B-Instruct-v0.1 +mistralai/Mixtral-8x7B-v0.1 +mosaicml/mpt-30b +mosaicml/mpt-7b +mosaicml/mpt-7b-chat +nomic-ai/gpt4all-j +nomic-ai/gpt4all-mpt +openai-community/gpt2 +openai-community/gpt2-large +openai-community/gpt2-medium +openai-community/gpt2-xl +openbmb/MiniCPM-2B-dpo-bf16 +openbmb/MiniCPM-2B-sft-bf16 +openchat/openchat_3.5 
+openlm-research/open_llama_13b +# openlm-research/open_llama_3b: CPU - head size must be multiple of 16, current: 100 +# openlm-research/open_llama_3b_v2: CPU - head size must be multiple of 16, current: 100 +# replit/replit-code-v1-3b: OpenVINO Tokenizers - AttributeError: 'ReplitLMTokenizer' object has no attribute 'sp_model' +# rinna/bilingual-gpt-neox-4b: OpenVINO Tokenizers - trash output (https://jira.devtools.intel.com/browse/CVS-142063) +rinna/youri-7b-chat +stabilityai/stable-code-3b +stabilityai/stable-zephyr-3b +stabilityai/stablelm-2-zephyr-1_6b +stabilityai/stablelm-3b-4e1t +tiiuae/falcon-7b +tiiuae/falcon-rw-7b +togethercomputer/RedPajama-INCITE-Chat-3B-v1 +# xverse/XVERSE-7B-Chat: Transfomers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 +# xverse/XVERSE-MoE-A4.2B: Transfomers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 diff --git a/tests/python_tests/ov_genai_test_utils.py b/tests/python_tests/ov_genai_test_utils.py new file mode 100644 index 0000000000..75ff69e967 --- /dev/null +++ b/tests/python_tests/ov_genai_test_utils.py @@ -0,0 +1,251 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pathlib +import os +import pytest +import functools +import openvino +import openvino_tokenizers +import openvino_genai as ov_genai +from typing import List, Tuple +from pathlib import Path +import shutil +import json + + +def get_models_list(): + precommit_models = [ + "katuni4ka/tiny-random-phi3", + ] + + nightly_models = [ + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "facebook/opt-125m", + "microsoft/phi-1_5", + "microsoft/phi-2", + "THUDM/chatglm2-6b", + "Qwen/Qwen2-0.5B-Instruct", + "Qwen/Qwen-7B-Chat", + "Qwen/Qwen1.5-7B-Chat", + "argilla/notus-7b-v1", + "HuggingFaceH4/zephyr-7b-beta", + "ikala/redpajama-3b-chat", + "mistralai/Mistral-7B-v0.1", + + # "meta-llama/Llama-2-7b-chat-hf", # Cannot be downloaded without access token + # "google/gemma-2b-it", # Cannot be downloaded without access token. + # "google/gemma-7b-it", # Cannot be downloaded without access token. 
+ "meta-llama/Llama-2-13b-chat-hf", + "meta-llama/Meta-Llama-3-8B-Instruct", + "openlm-research/open_llama_3b", + "openlm-research/open_llama_3b_v2", + "openlm-research/open_llama_7b", + "databricks/dolly-v2-12b", + "databricks/dolly-v2-3b", + ] + + if pytest.run_marker == "precommit": + model_ids = precommit_models + else: + model_ids = nightly_models + + if pytest.selected_model_ids: + model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + # pytest.set_trace() + prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) + return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] + + +def get_whisper_models_list(tiny_only=False): + precommit_models = [ + "openai/whisper-tiny", + "openai/whisper-small.en", + "openai/whisper-base", + ] + if tiny_only: + precommit_models = ["openai/whisper-tiny"] + + nightly_models = [] + + if pytest.run_marker == "precommit": + model_ids = precommit_models + else: + model_ids = nightly_models + + if pytest.selected_model_ids: + model_ids = [model_id for model_id in model_ids if model_id in pytest.selected_model_ids.split(' ')] + # pytest.set_trace() + prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) + return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] + + +def get_chat_models_list(): + precommit_models = [ + "Qwen/Qwen2-0.5B-Instruct", + ] + + nightly_models = [ + "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Llama-2-7b-chat-hf", + # "google/gemma-2b-it", # Cannot be downloaded without access token + # "google/gemma-7b-it", # Cannot be downloaded without access token + ] + + if pytest.run_marker == "precommit": + model_ids = precommit_models + else: + model_ids = nightly_models + + prefix = pathlib.Path(os.getenv('GENAI_MODELS_PATH_PREFIX', '')) + return [(model_id, prefix / model_id.split('/')[1]) for model_id in model_ids] + + +def get_chat_templates(): + # Returns chat templates saved in tokenizer_configs.py, + # but skips some models that currently are not processed correctly. + + skipped_models = { + # TODO: openchat/openchat_3.5 and berkeley-nest/Starling-LM-7B-alpha have the same template. + # Need to enable and unskip, since it's preset in continious batching and has >100 000 downloads. + "openchat/openchat-3.5-0106", + + # These models fail even on HF so no need to check if applying chat matches. + "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy", + "codellama/CodeLlama-34b-Instruct-hf", + "deepseek-ai/deepseek-math-7b-rl", + "allenai/tulu-2-7b", + "alexsobolev/IcaroLM", + "tokyotech-llm/Swallow-7b-instruct-v0.1", + "bofenghuang/vigogne-2-7b-chat", + "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k", + "AliAbdelrasheed/maqa_llama_4bit", + "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored", + + # TODO: Need to support chat templates in more models: CVS-145963 + # Either ov_genai is unable to parse chat_template or results do not match with HF. 
+ "meta-llama/Meta-Llama-3-8B-Instruct", + "databricks/dbrx-instruct", # Chat template is not supported by Jinja2Cpp + "mosaicml/mpt-30b-chat", + "deepseek-ai/deepseek-coder-6.7b-instruct", # Chat template is not supported by Jinja2Cpp + "maldv/winter-garden-7b-alpha", # Chat template is not supported by Jinja2Cpp + "ishorn5/RTLCoder-Deepseek-v1.1", # Chat template is not supported by Jinja2Cpp + "openchat/openchat-3.5-0106", + "casperhansen/llama-3-70b-instruct-awq", + "TheBloke/deepseek-coder-33B-instruct-GPTQ", + "AI-Sweden-Models/gpt-sw3-356m-instruct", + "google/gemma-7b-it", + "THUDM/cogvlm2-llama3-chat-19B", + "KnutJaegersberg/internlm-20b-llama", + "maywell/Synatra-Mixtral-8x7B", + "MediaTek-Research/Breeze-7B-Instruct-v1_0", + "bofenghuang/vigostral-7b-chat", + "meetkai/functionary-small-v2.5", # Chat template is not supported by Jinja2Cpp + "openchat/openchat-3.6-8b-20240522", + "tenyx/TenyxChat-7B-v1", + "LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2", + "yam-peleg/Hebrew-Gemma-11B-V2", + "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError + "nlpai-lab/KULLM3", + "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1", + "MediaTek-Research/Breeze-7B-Instruct-v0_1", + "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError + "MLP-KTLim/llama-3-Korean-Bllossom-8B", + "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp + "codellama/CodeLlama-70b-Instruct-hf", + "gorilla-llm/gorilla-openfunctions-v2", # Chat template is not supported by Jinja2Cpp + "BramVanroy/Llama-2-13b-chat-dutch" + } + from tokenizer_configs import get_tokenizer_configs + return [(k, v) for k, v in get_tokenizer_configs().items() if k not in skipped_models] + + +@functools.lru_cache(1) +def read_model(params, **tokenizer_kwargs): + model_id, path = params + + from optimum.intel.openvino import OVModelForCausalLM + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + + if (path / "openvino_model.xml").exists(): + opt_model = OVModelForCausalLM.from_pretrained(path, trust_remote_code=True, + compile=False, device='CPU') + else: + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, + with_detokenizer=True, + **tokenizer_kwargs) + openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") + openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") + + # to store tokenizer config jsons with special tokens + tokenizer.save_pretrained(path) + + opt_model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True, + compile=False, device='CPU', load_in_8bit=False) + opt_model.generation_config.save_pretrained(path) + opt_model.config.save_pretrained(path) + opt_model.save_pretrained(path) + + return ( + model_id, + path, + tokenizer, + opt_model, + ov_genai.LLMPipeline(str(path), device='CPU', config={"ENABLE_MMAP": False}), + ) + + +# in OpenVINO GenAI this parameter is called stop_criteria, +# while in HF it's called early_stopping. 
+# HF values True, False and "never" correspond to OV GenAI values "EARLY", "HEURISTIC" and "NEVER" +STOP_CRITERIA_MAP = { + ov_genai.StopCriteria.NEVER: "never", + ov_genai.StopCriteria.EARLY: True, + ov_genai.StopCriteria.HEURISTIC: False +} + + +@pytest.fixture(scope="module") +def model_tmp_path(tmpdir_factory): + model_id, path, _, _, _ = read_model(get_models_list()[0]) + temp_path = tmpdir_factory.mktemp(model_id.replace('/', '_')) + + # copy openvino converted model and tokenizers + for pattern in ['*.xml', '*.bin']: + for src_file in path.glob(pattern): + if src_file.is_file(): + shutil.copy(src_file, temp_path / src_file.name) + yield model_id, Path(temp_path) + + +def load_tok(configs: List[Tuple], temp_path): + # load Tokenizer where all configs are cleared. + # remove existing jsons from previous tests + for json_file in temp_path.glob("*.json"): + json_file.unlink() + + for config_json, config_name in configs: + with (temp_path / config_name).open('w') as f: + json.dump(config_json, f) + return ov_genai.Tokenizer(str(temp_path), {}) + + +def load_pipe(configs: List[Tuple], temp_path): + # Load LLMPipline where all configs are cleared. + # remove existing jsons from previous tests + for json_file in temp_path.glob("*.json"): + json_file.unlink() + + for config_json, config_name in configs: + with (temp_path / config_name).open('w') as f: + json.dump(config_json, f) + return ov_genai.LLMPipeline(str(temp_path)) + + +@functools.lru_cache(1) +def get_continuous_batching(path): + scheduler_config = ov_genai.SchedulerConfig() + scheduler_config.cache_size = 1 + return ov_genai.LLMPipeline(str(path), ov_genai.Tokenizer(str(path)), device='CPU', config={"scheduler_config": scheduler_config}) diff --git a/tests/python_tests/pytest.ini b/tests/python_tests/pytest.ini new file mode 100644 index 0000000000..541e59c7e3 --- /dev/null +++ b/tests/python_tests/pytest.ini @@ -0,0 +1,8 @@ +[pytest] + +markers = + precommit + nightly + real_models + +addopts = -m precommit diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt new file mode 100644 index 0000000000..0e48cc125d --- /dev/null +++ b/tests/python_tests/requirements.txt @@ -0,0 +1,35 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +numpy<2.0.0; sys_platform == 'darwin' +onnx==1.16.1 +pytest +llm_bench/python/who_what_benchmark + +# requirements for specific models +# - hf-tiny-model-private/tiny-random-RoFormerForCausalLM +rjieba +# - baichuan-inc/Baichuan2-7B-Chat +bitsandbytes +# - nomic-ai/gpt4all-falcon +# - Qwen/Qwen-7B +# - Qwen/Qwen-7B-Chat +# - mosaicml/mpt-7b +# - internlm/internlm2-7b +einops +# - Qwen/Qwen-7B +# - Qwen/Qwen-7B-Chat +transformers_stream_generator +# - openbmb/MiniCPM-V-2 +torchvision +# - openbmb/MiniCPM-V-2 +timm +# - Qwen/Qwen-7B +# - Qwen/Qwen-7B-Chat +# - Salesforce/xgen-7b-8k-base +tiktoken +# - microsoft/biogpt +sacremoses +# - openai/whisper-base +librosa +soundfile +datasets \ No newline at end of file diff --git a/tests/python_tests/test_cache_optimizations.py b/tests/python_tests/test_cache_optimizations.py new file mode 100644 index 0000000000..f3125976d2 --- /dev/null +++ b/tests/python_tests/test_cache_optimizations.py @@ -0,0 +1,146 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +from dataclasses import dataclass +from pathlib import Path +import sys +from typing import Dict, List, Optional + +import pytest + +import whowhatbench +from 
optimum.intel.openvino import OVModelForCausalLM + +from openvino_genai import ContinuousBatchingPipeline, SchedulerConfig, GenerationResult, GenerationConfig, CacheEvictionConfig, AggregationMode + +from openvino_tokenizers import convert_tokenizer +from openvino import serialize +from transformers import AutoTokenizer + +from common import TESTS_ROOT + + +def load_prompts_dataset(file_name : str) -> Dict[str, List[str]]: + file_path = TESTS_ROOT / 'data' / file_name + with open(file_path, 'r') as f: + return {"prompts": [s for s in f]} + +def get_scheduler_config(num_kv_blocks: int) -> SchedulerConfig: + scheduler_config = SchedulerConfig() + scheduler_config.num_kv_blocks = num_kv_blocks + scheduler_config.dynamic_split_fuse = True + scheduler_config.max_num_batched_tokens = 256 + scheduler_config.max_num_seqs = 256 + scheduler_config.use_cache_eviction = False + return scheduler_config + +@dataclass +class ConvertedModel: + model: OVModelForCausalLM + tokenizer: AutoTokenizer + model_path: Path + + +@pytest.fixture(scope='module') +def converted_model(tmp_path_factory): + model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" + model = OVModelForCausalLM.from_pretrained(model_id, export=True, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + model_path = tmp_path_factory.mktemp("cacheopt_test_models") / model_id + model.save_pretrained(model_path) + ov_tokenizer, ov_detokenizer = convert_tokenizer(tokenizer, with_detokenizer=True, skip_special_tokens=True) + serialize(ov_tokenizer, model_path / "openvino_tokenizer.xml") + serialize(ov_detokenizer, model_path / "openvino_detokenizer.xml") + converted_model = ConvertedModel(model, tokenizer, model_path) + yield converted_model + del converted_model + del model + + +@dataclass +class CacheOptTestStruct: + prompt_file: str + max_new_tokens: int + num_kv_blocks: int + use_cache_eviction: bool + cache_eviction_config: Optional[CacheEvictionConfig] + similarity_threshold: float + avg_cache_usage_optimization_ratio: float # expecting no less than these optimization ratios + max_cache_usage_optimization_ratio: float + + +SHORT_CACHE_EVICTION_CONFIG = CacheEvictionConfig(start_size=32, recent_size=32, max_cache_size=128, aggregation_mode=AggregationMode.NORM_SUM) + +@pytest.mark.precommit +@pytest.mark.skipif(sys.platform in ("win32", "darwin"), reason="doesn't work on win due to optimum-intel export bug, segfault on mac") +@pytest.mark.parametrize("test_struct", [ + # prompts + generation length are longer than the eviction arena, eviction expected w/ impact to similarity + CacheOptTestStruct(prompt_file="long_prompts.txt", max_new_tokens=128, num_kv_blocks=100, use_cache_eviction=True, + cache_eviction_config=SHORT_CACHE_EVICTION_CONFIG, + similarity_threshold=0.8, + max_cache_usage_optimization_ratio=1.8, + avg_cache_usage_optimization_ratio=1.35), + + # prompts + generation length are shorter than the eviction arena, no eviction expected + CacheOptTestStruct(prompt_file="short_prompts.txt", max_new_tokens=32, num_kv_blocks=100, use_cache_eviction=True, + cache_eviction_config=SHORT_CACHE_EVICTION_CONFIG, + similarity_threshold=0.98, + max_cache_usage_optimization_ratio=0.95, # no improvement expected + avg_cache_usage_optimization_ratio=0.95), + + # short prompts, long generation - eviction expected + CacheOptTestStruct(prompt_file="short_prompts.txt", max_new_tokens=384, num_kv_blocks=100, use_cache_eviction=True, + cache_eviction_config=SHORT_CACHE_EVICTION_CONFIG, + similarity_threshold=0.94, + 
max_cache_usage_optimization_ratio=1.75, + avg_cache_usage_optimization_ratio=1.35), + +]) +@pytest.mark.parametrize("enable_prefix_caching", [True, False]) # prefix caching shouldn't impact similarity +def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, test_struct, enable_prefix_caching): + seqs_per_request = 5 + scheduler_config = get_scheduler_config(test_struct.num_kv_blocks) + + generation_config = GenerationConfig() # expecting default greedy sampling + generation_config.num_return_sequences = 1 + generation_config.max_new_tokens = test_struct.max_new_tokens + + scheduler_config_opt = get_scheduler_config(test_struct.num_kv_blocks) + scheduler_config_opt.use_cache_eviction = test_struct.use_cache_eviction + if scheduler_config_opt.use_cache_eviction: + scheduler_config_opt.cache_eviction_config = test_struct.cache_eviction_config + scheduler_config_opt.enable_prefix_caching = enable_prefix_caching + + model_path = converted_model.model_path + model_cb_noopt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config, "CPU", {}) + model_cb_opt = ContinuousBatchingPipeline(model_path.absolute().as_posix(), scheduler_config_opt, "CPU", {}) + + tokenizer = converted_model.tokenizer + + data_dict = load_prompts_dataset(test_struct.prompt_file) + + evaluator = whowhatbench.TextEvaluator(base_model=model_cb_noopt, tokenizer=tokenizer, test_data=data_dict, + generation_config=generation_config, + generation_config_base=generation_config, + max_new_tokens=test_struct.max_new_tokens, seqs_per_request=seqs_per_request) + + _, all_metrics = evaluator.score(model_cb_opt) + + similarity_metric = float(all_metrics['similarity'][0]) + pipeline_opt_metrics = model_cb_opt.get_metrics() + pipeline_noopt_metrics = model_cb_noopt.get_metrics() + + print(f"Similarity: {similarity_metric}") + print(f"No-opt cache usage: max {pipeline_noopt_metrics.max_cache_usage:.3f}, avg {pipeline_noopt_metrics.avg_cache_usage:.3f}") + print(f"Opt cache usage: max {pipeline_opt_metrics.max_cache_usage:.3f}, avg {pipeline_opt_metrics.avg_cache_usage:.3f}") + max_optimization_ratio = (pipeline_noopt_metrics.max_cache_usage / pipeline_opt_metrics.max_cache_usage) + avg_optimization_ratio = (pipeline_noopt_metrics.avg_cache_usage / pipeline_opt_metrics.avg_cache_usage) + print(f"Optimization ratios: max {max_optimization_ratio:.3f}x, avg {avg_optimization_ratio:.3f}x") + + assert similarity_metric > test_struct.similarity_threshold + assert max_optimization_ratio >= test_struct.max_cache_usage_optimization_ratio + assert avg_optimization_ratio >= test_struct.avg_cache_usage_optimization_ratio + + del model_cb_opt + del model_cb_noopt + + diff --git a/tests/python_tests/test_chat_generate_api.py b/tests/python_tests/test_chat_generate_api.py new file mode 100644 index 0000000000..b68de6372d --- /dev/null +++ b/tests/python_tests/test_chat_generate_api.py @@ -0,0 +1,224 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import math +import openvino +import openvino_tokenizers +import openvino_genai as ov_genai +import pytest +from typing import Dict, Tuple +from ov_genai_test_utils import ( + get_models_list, + get_chat_models_list, + read_model, + load_tok, + model_tmp_path, + get_chat_templates, + get_continuous_batching, +) + + +configs = [ + dict(max_new_tokens=20), + dict(num_beam_groups=3, num_beams=15, num_return_sequences=1, max_new_tokens=10, diversity_penalty=1.0) +] + + +quenstions = [ + '1+1=', + 'What is the previous answer?', + 
'Why is the Sun yellow?', + 'What was my first question?' +] + + +@pytest.mark.parametrize("generation_config", configs) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_chat_compare_with_HF(model_descr, generation_config: Dict): + device = 'CPU' + chat_history_hf = [] + chat_history_ov = [] + chat_prompt = '' + + # Will set add_special_tokens=False inside pipeline when start_chat() is called. + model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + + pipe.start_chat() + for prompt in quenstions: + chat_history_hf.append({'role': 'user', 'content': prompt}) + chat_history_ov.append({'role': 'user', 'content': prompt}) + + chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) + + answer = model_opt.generate(**tokenized, **generation_config, do_sample=False, repetition_penalty = None) + answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) + chat_history_hf.append({'role': 'assistant', 'content': answer_str}) + + answer_ov = pipe.generate(prompt, **generation_config) + chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) + + pipe.finish_chat() + + if chat_history_ov != chat_history_hf: + print(f'hf_output: {chat_history_hf}') + print(f'ov_output: {chat_history_ov}') + assert chat_history_ov == chat_history_hf + + +@pytest.mark.parametrize("generation_config", configs) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_chat_compare_text_history_with_HF(model_descr, generation_config: Dict): + # compares with HF when history in ov_genai is save as a text + device = 'CPU' + chat_history_hf = [] + chat_history_ov = [] + chat_prompt = '' + + # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. + # Need to regenerate openvino_tokenizer/detokenizer. 
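+    # read_model() below is therefore called with add_special_tokens=False, so the
+    # reconverted openvino tokenizer and the HF tokenizer should produce identical
+    # ids for the templated chat prompt (an assumption about the test helper, hedged here).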
+ model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) + + for prompt in quenstions: + chat_history_hf.append({'role': 'user', 'content': prompt}) + chat_history_ov.append({'role': 'user', 'content': prompt}) + + chat_prompt = tokenizer.apply_chat_template(chat_history_hf, tokenize=False, add_generation_prompt=True) + tokenized = tokenizer(chat_prompt, return_tensors='pt', add_special_tokens=False) + + answer = model_opt.generate(**tokenized, **generation_config, do_sample=False, repetition_penalty = None) + answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True) + chat_history_hf.append({'role': 'assistant', 'content': answer_str}) + + chat_prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True) + answer_ov = pipe.generate(chat_prompt, **generation_config) + chat_history_ov.append({'role': 'assistant', 'content': answer_ov}) + + if chat_history_ov != chat_history_hf: + print(f'hf_output: {chat_history_hf}') + print(f'ov_output: {chat_history_ov}') + assert chat_history_ov == chat_history_hf + + +@pytest.mark.parametrize("generation_config", configs) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_chat_compare_statefull_vs_text_history(model_descr, generation_config: Dict): + # Check that when history is stored in KV cache results are the same as when history stored in a text. + device ='CPU' + + chat_history_with_kv_cache = [] + chat_history_ov = [] + + # HF in chat scenario does not add special tokens, but openvino tokenizer by default is converted with add_special_tokens=True. + # Need to regenerate openvino_tokenizer/detokenizer. + model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'), add_special_tokens=False) + pipe_with_kv_cache = ov_genai.LLMPipeline(str(path), device, config={"ENABLE_MMAP": False}) + + pipe_with_kv_cache.start_chat() + for question in quenstions: + chat_history_with_kv_cache.append({'role': 'user', 'content': question}) + answer = pipe_with_kv_cache.generate(question, **generation_config) + chat_history_with_kv_cache.append({'role': 'assistant', 'content': answer}) + + chat_history_ov.append({'role': 'user', 'content': question}) + prompt = pipe.get_tokenizer().apply_chat_template(chat_history_ov, add_generation_prompt=True) + answer = pipe.generate(prompt, **generation_config) + chat_history_ov.append({'role': 'assistant', 'content': answer}) + pipe_with_kv_cache.finish_chat() + + if chat_history_ov != chat_history_with_kv_cache: + print(f'kvcache_hist: {chat_history_with_kv_cache}') + print(f'text_history: {chat_history_ov}') + assert chat_history_ov == chat_history_with_kv_cache + + +conversation = [ + {'role': 'user', 'content': '1+1='}, + {'role': 'assistant', 'content': '1 + 1 = 2'}, + {'role': 'user', 'content': 'What is the previous answer?'}, + {'role': 'assistant', 'content': 'The previous answer was: 1 + 1 = 2. 
Please ask me your next question.'}, + {'role': 'user', 'content': 'Why is the sun yellow?'}, + {'role': 'assistant', 'content': 'Because it emits yeloow light.'}, + {'role': 'user', 'content': 'What was my first question?'}, +] +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize('chat_config', get_chat_templates()) +def test_apply_chat_template(model_tmp_path, chat_config: Tuple[str, Dict]): + tokenizer_config = chat_config[1] + + # Will load openvino_model for tiny-random-phi as a placeholder + # but indeed only Tokenizer and apply_chat_template will be tested. + model_id, path, tokenizer, opt_model, pipe = read_model(get_models_list()[0]) + + full_history_str_hf = tokenizer.apply_chat_template(conversation, + add_generation_prompt=False, + tokenize=False, + **tokenizer_config) + + tok = load_tok([(tokenizer_config, "tokenizer_config.json")], model_tmp_path[1]) + full_history_str = tok.apply_chat_template(conversation, add_generation_prompt=False) + if full_history_str != full_history_str_hf: + print(f'hf reference: {full_history_str_hf}') + print(f'ov_genai out: {full_history_str}') + assert full_history_str == full_history_str_hf + + +@pytest.mark.parametrize("generation_config", configs[1:]) +@pytest.mark.parametrize("model_descr", get_chat_models_list()) +@pytest.mark.precommit +def test_chat_continuous_batching_vs_stateful(model_descr, generation_config: Dict): + model_id, path, tokenizer, model, stateful = read_model((model_descr[0], model_descr[1] / '_test_chat')) + cb = get_continuous_batching(path) + stateful.start_chat() + cb.start_chat() + for question in quenstions: + generated = cb.generate(question, **generation_config) + reference = stateful.generate(question, **generation_config) + assert generated == reference + # Test that finish_chat() doesn't fail just in case. + cb.finish_chat() + +@pytest.mark.precommit +@pytest.mark.nightly +def test_set_chat_template(): + model_descr = get_chat_models_list()[0] + model_id, path, tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + pipe.get_tokenizer().set_chat_template("{% for message in messages %}{{ message['content'] }}{% endfor %}") + pipe.start_chat() + generated = pipe.generate("a", max_new_tokens=1) + pipe.finish_chat() + reference = pipe.generate("a", max_new_tokens=1) + assert generated == reference + +prompts = [ + '1+1=', + 'What is the previous answer?', + 'Why is the Sun yellow?', + 'What was my first question?', + ['Why is the Sun yellow?'], + "若我有一亿美元,在人工智能盛行的今天,我怎样投资才能收益最大化?", + "מחרוזת בדיקה", + "Multiline\nstring!\nWow!", +] + +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("add_special_tokens", [True, False]) +@pytest.mark.parametrize("prompt", prompts) +def test_add_special_tokens(add_special_tokens, prompt): + import numpy as np + model_descr = get_chat_models_list()[0] + model_id, path, hf_tokenizer, model_opt, pipe = read_model((model_descr[0], model_descr[1] / '_test_chat')) + genai_tokenzier = pipe.get_tokenizer() + + # Calling encode with add_special_tokens will set state flag. 
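+    # The resulting ids are compared element-wise with the HF tokenizer called with the
+    # same add_special_tokens value; with add_special_tokens=False any leading special
+    # tokens (e.g. BOS) should be absent from both outputs.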
+ res_genai = genai_tokenzier.encode(prompt, add_special_tokens).input_ids.data + res_hf = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens)["input_ids"] + assert np.all(res_genai == res_hf) diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py new file mode 100644 index 0000000000..f80729d425 --- /dev/null +++ b/tests/python_tests/test_generate_api.py @@ -0,0 +1,809 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import openvino_genai as ov_genai +from openvino_genai import StopCriteria +import pytest +import transformers +from typing import Union, List, Dict, Optional +import numpy as np +import openvino as ov +import sys +from pathlib import Path +import torch +import math +from ov_genai_test_utils import ( + get_models_list, + read_model, + load_pipe, + load_tok, + model_tmp_path, + STOP_CRITERIA_MAP, + get_continuous_batching, +) + + +def run_hf_ov_genai_comparison_batched(model_descr, generation_config: Dict, prompts: Union[str, List[str]]): + model_id, path, tokenizer, model, pipe = model_descr + config = generation_config.copy() # to avoid side effects + num_beams = config['num_beams'] if 'num_beams' in config else 1 + config['num_return_sequences'] = num_beams + + if not isinstance(prompts, list): + prompts = [prompts] + + if 'do_sample' not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config + # it conflicts with `diversity_penalty` and/or `num_beam_groups`. + # Need to set exlicitly to False, but only if test arguments omitted this arg. + # Do not apply 'repetition_penalty' if sampling is not used. + config['do_sample'] = False + config['repetition_penalty'] = None + + generation_config_hf = config.copy() + if generation_config_hf.get('stop_criteria'): + generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] + generation_config_hf.pop('ignore_eos', None) + + # Encode the batch of prompts + tokenizer.padding_side = "left" + encoded_prompts = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, add_special_tokens=True) + prompt_ids, attention_mask = encoded_prompts['input_ids'], encoded_prompts['attention_mask'] + + hf_encoded_outputs = model.generate(prompt_ids, attention_mask=attention_mask, **generation_config_hf) + + hf_outputs = [] + for idx, hf_encoded_out in enumerate(hf_encoded_outputs): + prompt_count = idx // num_beams + hf_outputs.append(tokenizer.decode(hf_encoded_out[prompt_ids[prompt_count].shape[0]:], skip_special_tokens=True)) + + ov_outputs = pipe.generate(prompts, **config).texts + + hf_outputs.sort() + ov_outputs.sort() + for i, (hf_output, ov_output) in enumerate(zip(hf_outputs, ov_outputs)): + if hf_output != ov_output: + print(f'hf_output: {hf_output}') + print(f'ov_output: {ov_output}') + assert hf_output == ov_output + +def run_hf_ov_genai_comparison(model_descr, generation_config: Dict, prompt: str): + model_id, path, tokenizer, model, pipe = model_descr + + config = generation_config.copy() # to avoid side effects + + if 'do_sample' not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config + # it conflicts with `diversity_penalty` and/or `num_beam_groups`. + # Need to set explicitly to False, but only if test arguments omitted this arg. + # Do not apply 'repetition_penalty' if sampling is not used. 
+ config['do_sample'] = False + config['repetition_penalty'] = None + + generation_config_hf = config.copy() + if generation_config_hf.get('stop_criteria'): + generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] + generation_config_hf.pop('ignore_eos', None) + + encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) + hf_encoded_output = model.generate(encoded_prompt, **generation_config_hf) + hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:], skip_special_tokens=True) + + ov_output = pipe.generate(prompt, **config) + if config.get('num_return_sequences', 1) > 1: + assert hf_output in ov_output.texts + else: + if hf_output != ov_output: + print(f'hf_output: {hf_output}') + print(f'ov_output: {ov_output}') + + assert hf_output == ov_output + +def hf_ov_genai_tensors_comparison( + model_descr, + generation_config: Dict, + input_ids: np.ndarray, + attention_mask: Optional[np.array] = None + ): + device = 'CPU' + model_id, path, tokenizer, model, pipe = model_descr + + config = generation_config.copy() # to avoid side effects + + if 'do_sample' not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config + # it conflicts with `diversity_penalty` and/or `num_beam_groups`. + # Need to set exlicitly to False, but only if test arguments omitted this arg. + # Do not apply 'repetition_penalty' if sampling is not used. + config['do_sample'] = False + config['repetition_penalty'] = None + + generation_config_hf = config.copy() + if generation_config_hf.get('stop_criteria'): + generation_config_hf['early_stopping'] = STOP_CRITERIA_MAP[generation_config_hf.pop('stop_criteria')] + generation_config_hf.pop('ignore_eos', None) + + if attention_mask is not None: + inputs_ov = ov_genai.TokenizedInputs(ov.Tensor(input_ids), ov.Tensor(attention_mask)) + inputs_hf = dict(inputs=torch.tensor(input_ids), attention_mask=torch.tensor(attention_mask)) + else: + inputs_hf = dict(inputs=torch.tensor(input_ids)) + inputs_ov = ov.Tensor(input_ids) + + hf_output = model.generate(**inputs_hf, **generation_config_hf) + + pipe = ov_genai.LLMPipeline(str(path), device) + ov_output = pipe.generate(inputs_ov, **config) + + hf_res = hf_output[0, input_ids.shape[1]:].numpy() + ov_res = np.array(ov_output.tokens, dtype=np.int64) + assert np.all(ov_res == hf_res) + + +test_cases = [ + (dict(max_new_tokens=20), 'table is made of'), + (dict(max_new_tokens=20), '你好! 
你好嗎?'), + (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), + (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), + (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'), + (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'), +] +@pytest.mark.parametrize("generation_config,prompt", test_cases) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_decoding(model_descr, generation_config, prompt): + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + +input_tensors_list = [ + # input_ids, attention_mask + (np.array([[1, 4, 42]], dtype=np.int64), None), + (np.array([[1, 4, 42]], dtype=np.int64), np.array([[1, 1, 1]], dtype=np.int64)), +] +@pytest.mark.parametrize("inputs", input_tensors_list) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_ov_tensors(model_descr, inputs): + hf_ov_genai_tensors_comparison(read_model(model_descr), dict(max_new_tokens=20), *inputs) + + +prompts = [ + 'table is made of', + '你好! 你好嗎?', + 'Alan Turing was a', + 'The Sun is yellow because', + ['The Sun is yellow because', 'Alan Turing was a', 'Alan Turing was a'] +] +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.xfail( + raises=TypeError, + reason="pybind was unable to find ov::Tensor from openvino yet", + strict=False, + condition=sys.platform in ["linux", "win32"] +) +def test_genai_tokenizer_encode(model_descr, prompt): + model_id, path, tokenizer, model, pipe = read_model(model_descr) + tok = pipe.get_tokenizer() + + encoded_ov = tok.encode(prompt).input_ids.data + if isinstance(prompt, list): + encoded_hf = tokenizer.batch_encode_plus(prompt)['input_ids'] + for tokens_ov, tokens_hf in zip(encoded_ov, encoded_hf): + assert np.all(tokens_ov == tokens_hf) + else: + encoded_hf = tokenizer.encode(prompt) + assert np.all(encoded_hf == encoded_ov[0]) + +encoded_prompts = [ + [1, 1591, 338, 1754, 310], + [1, 17102, 323, 3864, 471, 263], + + # chineze characters + [1, 29871, 30919, 31076, 30584, 29871, 30919, 31076, 232, 154, 145, 30882], + + # On meta-llama/Meta-Llama-3-8B-Instruct this becomes longer after removing the last token + [3113, 264, 364, 267], + + # batched tokens + [[1, 1591, 338, 1754, 310], [1, 1591, 338, 1754, 310], [1, 17102, 323, 3864, 471, 263]] +] +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.parametrize("encoded_prompt", encoded_prompts) +@pytest.mark.precommit +def test_genai_tokenizer_decode(model_descr, encoded_prompt): + model_id, path, tokenizer, model, pipe = read_model(model_descr) + tok = pipe.get_tokenizer() + decoded_ov = tok.decode(encoded_prompt) + + if isinstance(encoded_prompt[0], list): + decoded_hf = tokenizer.batch_decode(encoded_prompt, skip_special_tokens=True) + for tokens_ov, tokens_hf in zip(decoded_ov, decoded_hf): + assert np.all(tokens_ov == tokens_hf) + else: + decoded_hf = tokenizer.decode(encoded_prompt, skip_special_tokens=True) + assert decoded_hf == decoded_ov + + +test_configs = [ + dict(max_new_tokens=20), + dict(max_new_tokens=200, ignore_eos=True), + 
dict(max_new_tokens=20, num_beam_groups=3, num_beams=15, diversity_penalty=1.0) +] +batched_prompts = [ + ['table is made', 'They sky is blue because', 'Difference between Jupiter and Mars is that'], + ['hello', 'Here is the longest nowel ever: '], + ['Alan Turing was a', 'return 0', '你好! 你好嗎?'], + ['table is made', 'table is made [force left pad tokens]'] +] +@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("prompts", batched_prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_multibatch(model_descr, generation_config, prompts): + run_hf_ov_genai_comparison_batched(read_model(model_descr), generation_config, prompts) + + +prompts = ['The Sun is yellow because', 'Difference between Jupiter and Mars is that', 'table is made of'] +@pytest.mark.parametrize("num_beam_groups", [2, 3, 8]) +@pytest.mark.parametrize("group_size", [5, 3, 10]) +@pytest.mark.parametrize("max_new_tokens", [20, 15]) +@pytest.mark.parametrize("diversity_penalty", [1.0 , 1.5]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_beam_search_decoding(model_descr, num_beam_groups, group_size, + max_new_tokens, diversity_penalty, prompt): + generation_config = dict( + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=diversity_penalty, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, + ) + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + + +@pytest.mark.parametrize("stop_criteria", [StopCriteria.NEVER, StopCriteria.EARLY, StopCriteria.HEURISTIC]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("max_new_tokens", [10, 80]) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.precommit +@pytest.mark.nightly +def test_stop_criteria(model_descr, stop_criteria, prompt, max_new_tokens): + # todo: with EARLY stop_criteria looks like HF return unvalid out with sentence<eos><unk><unk> + # while genai ends sentence with <eos> + if (stop_criteria == StopCriteria.EARLY): + pytest.skip() + generation_config = dict( + num_beam_groups=2, + num_beams=2 * 3, + diversity_penalty=1.0, + num_return_sequences=2 * 3, + max_new_tokens=max_new_tokens, + stop_criteria=stop_criteria, + ) + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + + +# test long sequences +@pytest.mark.parametrize("num_beam_groups", [2]) +@pytest.mark.parametrize("group_size", [5]) +@pytest.mark.parametrize("max_new_tokens", [800, 2000]) +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.parametrize("model_descr", get_models_list()) +@pytest.mark.nightly +def test_beam_search_long_sentences(model_descr, num_beam_groups, group_size, + max_new_tokens, prompt): + generation_config = dict( + num_beam_groups=num_beam_groups, + num_beams=num_beam_groups * group_size, + diversity_penalty=1.0, + num_return_sequences=num_beam_groups * group_size, + max_new_tokens=max_new_tokens, + ) + run_hf_ov_genai_comparison(read_model(model_descr), generation_config, prompt) + + +def user_defined_callback(subword): + print(subword) + + +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +@pytest.mark.precommit +@pytest.mark.nightly +def test_callback_one_string(callback): + pipe = read_model(get_models_list()[0])[4] + generation_config = 
pipe.get_generation_config()
+    generation_config.max_new_tokens = 10
+    pipe.generate('table is made of', generation_config, callback)
+
+
+@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_callback_batch_fail(callback):
+    pipe = read_model(get_models_list()[0])[4]
+    with pytest.raises(RuntimeError):
+        pipe.generate(['1', '2'], ov_genai.GenerationConfig(), callback)
+
+
+@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_callback_kwargs_one_string(callback):
+    pipe = read_model(get_models_list()[0])[4]
+    pipe.generate('table is made of', max_new_tokens=10, streamer=callback)
+
+@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
+@pytest.mark.precommit
+@pytest.mark.nightly
+@pytest.mark.parametrize("model_descr", get_models_list())
+def test_callback_decoding_metallama(model_descr, callback):
+    # On Meta-Llama this prompt generates output which can shorten after new tokens are added.
+    # Test that the streamer correctly handles such cases.
+    prompt = 'I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature'
+    if model_descr[0] != 'meta-llama/Meta-Llama-3-8B-Instruct':
+        pytest.skip()
+    pipe = read_model(model_descr)[4]
+    pipe.generate(prompt, max_new_tokens=300, streamer=callback)
+
+
+@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)])
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_callback_kwargs_batch_fail(callback):
+    pipe = read_model(get_models_list()[0])[4]
+    with pytest.raises(RuntimeError):
+        pipe.generate(['1', '2'], max_new_tokens=10, streamer=callback)
+
+
+class Printer(ov_genai.StreamerBase):
+    def __init__(self, tokenizer):
+        # super() may work, but once you begin mixing Python and C++
+        # multiple inheritance, things will fall apart due to
+        # differences between Python’s MRO and C++’s mechanisms.
+ ov_genai.StreamerBase.__init__(self) + self.tokenizer = tokenizer + def put(self, token_id): + # print(self.tokenizer.decode([token_id])) # Incorrect way to print, but easy to implement + print(token_id) # print only token because self.tokenizer.decode([token_id]) are not implemented yet + def end(self): + print('end') + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_streamer_one_string(): + pipe = read_model(get_models_list()[0])[4] + generation_config = pipe.get_generation_config() + generation_config.max_new_tokens = 10 + printer = Printer(pipe.get_tokenizer()) + pipe.generate('table is made of', generation_config, printer) + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_streamer_batch_fail(): + pipe = read_model(get_models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + with pytest.raises(RuntimeError): + pipe.generate(['1', '2'], ov_genai.GenerationConfig(), printer) + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_streamer_kwargs_one_string(): + pipe = read_model(get_models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + pipe.generate('table is made of', max_new_tokens=10, do_sample=False, streamer=printer) + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_streamer_kwargs_batch_fail(): + pipe = read_model(get_models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + with pytest.raises(RuntimeError): + pipe.generate('', num_beams=2, streamer=printer) + + +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +def test_operator_with_callback_one_string(callback): + pipe = read_model(get_models_list()[0])[4] + ten_tokens = pipe.get_generation_config() + ten_tokens.max_new_tokens = 10 + pipe('talbe is made of', ten_tokens, callback) + + +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("callback", [print, user_defined_callback, lambda subword: print(subword)]) +def test_operator_with_callback_batch_fail(callback): + pipe = read_model(get_models_list()[0])[4] + with pytest.raises(RuntimeError): + pipe(['1', '2'], ov_genai.GenerationConfig(), callback) + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_operator_with_streamer_kwargs_one_string(): + pipe = read_model(get_models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + pipe('hi', max_new_tokens=10, do_sample=True, streamer=printer) + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_operator_with_streamer_kwargs_batch_fail(): + pipe = read_model(get_models_list()[0])[4] + printer = Printer(pipe.get_tokenizer()) + with pytest.raises(RuntimeError): + pipe('', num_beams=2, streamer=printer) + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_load_special_tokens_ids_1(model_tmp_path): + # test when there is an available config.json + config_json = { + "pad_token_id": 422, + "bos_token_id": 42, + "eos_token_id": 37, + } + tok = load_tok([(config_json, "config.json")], model_tmp_path[1]) + assert tok.get_pad_token_id() == config_json['pad_token_id'] + assert tok.get_bos_token_id() == config_json['bos_token_id'] + assert tok.get_eos_token_id() == config_json['eos_token_id'] + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_load_special_tokens_str_2(model_tmp_path): + # test with special_tokens_map + special_tokens_map_json = { + "pad_token": {"content": "<custom_pad>"}, + "bos_token": {"content": "<custom_bos>"}, + "eos_token": {"content": "<custom_eos>"}, + } + tok = load_tok([(special_tokens_map_json, 
"special_tokens_map.json")], model_tmp_path[1]) + assert tok.get_pad_token() == special_tokens_map_json['pad_token']["content"] + assert tok.get_bos_token() == special_tokens_map_json['bos_token']["content"] + assert tok.get_eos_token() == special_tokens_map_json['eos_token']["content"] + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_load_special_tokens_3_(model_tmp_path): + # special_tokens_map is not available + # but tokenize_config.json exists + # will load both string and integer representations + tok_config_json = { + "added_tokens_decoder": { + "422": {"content": "<pad>"}, + "37": {"content": "<s>"}, + "42": {"content": "</s>"}, + }, + "pad_token": "<pad>", + "bos_token": "<s>", + "eos_token": "</s>", + } + + tok = load_tok([(tok_config_json, "tokenizer_config.json")], model_tmp_path[1]) + assert tok.get_pad_token() == tok_config_json['pad_token'] + assert tok.get_bos_token() == tok_config_json['bos_token'] + assert tok.get_eos_token() == tok_config_json['eos_token'] + + assert tok.get_pad_token_id() == 422 + assert tok.get_bos_token_id() == 37 + assert tok.get_eos_token_id() == 42 + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_load_special_tokens_3(model_tmp_path): + # both config.json is availabel and tokenizer_config.json available + # check that it does not read int values from tokenizer_config.json if they are in config.json + tok_config_json = { + "added_tokens_decoder": { + # integers differ from config.json to check they don't override config.json + "777": {"content": "<pad>"}, + "888": {"content": "<s>"}, + "656": {"content": "</s>"}, + }, + "pad_token": "<pad>", + "bos_token": "<s>", + "eos_token": "</s>", + } + config_json = { + "pad_token_id": 422, + "bos_token_id": 42, + "eos_token_id": 37, + } + configs = [ + (tok_config_json, "tokenizer_config.json"), + (config_json, "config.json") + ] + tok = load_tok(configs, model_tmp_path[1]) + assert tok.get_pad_token_id() == config_json['pad_token_id'] + assert tok.get_bos_token_id() == config_json['bos_token_id'] + assert tok.get_eos_token_id() == config_json['eos_token_id'] + + assert tok.get_pad_token() == tok_config_json['pad_token'] + assert tok.get_bos_token() == tok_config_json['bos_token'] + assert tok.get_eos_token() == tok_config_json['eos_token'] + + +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.xfail( + raises=AssertionError, + reason="CVS-143410 ov tokenizer should be aligned with hf", + strict=False, +) +def test_load_special_tokens_4(model_tmp_path): + # only string representation is provided, find token integers by inference + model_id, temp_path = model_tmp_path + tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + + special_tokens_map_json = {} + token_str_int_map = {} + special_token_names = ['pad_token', 'bos_token', 'eos_token'] + for token_str in special_token_names: + if hasattr(tokenizer, token_str): + token_val = getattr(tokenizer, token_str) + special_tokens_map_json.update({token_str: {"content": token_val}}) + token_id = tokenizer(token_val, add_special_tokens=False)['input_ids'][0] + token_str_int_map.update({token_str: token_id}) + + # since only string representations are present in the json will try to get by inference + tok = load_tok([(special_tokens_map_json, "special_tokens_map.json")], temp_path) + + # check ids inferred correctly for special tokens existing if HF tokenizer + if 'pad_token' in token_str_int_map: + assert tok.get_pad_token_id() == token_str_int_map['pad_token'] + if 'bos_token' in token_str_int_map: 
+ assert tok.get_bos_token_id() == token_str_int_map['bos_token'] + if 'eos_token' in token_str_int_map: + assert tok.get_eos_token_id() == token_str_int_map['eos_token'] + + +invalid_configs = [ + dict(num_beam_groups=3, num_beams=15, do_sample=True), + dict(do_sample=True), # no eos_token_id no max_new_tokens, no max_len + dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos + dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty + dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp + dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p + dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k +] +@pytest.mark.parametrize("generation_config", invalid_configs) +@pytest.mark.precommit +@pytest.mark.nightly +def test_invalid_configs(model_tmp_path, generation_config): + model_id, temp_path = model_tmp_path + config_json = {} + pipe = load_pipe([(config_json, "config.json")], temp_path) + with pytest.raises(RuntimeError): + pipe.generate('blah blah', **generation_config) + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_valid_configs(model_tmp_path): + model_id, temp_path = model_tmp_path + pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) + + config = ov_genai.GenerationConfig() + config.do_sample = True # no eos_token_id but it's loaded from config.json + pipe.set_generation_config(config) + +invalid_py_configs = [ + dict(num_beam_groups=3, num_beams=15, do_sample=True), + dict(unexisting_key_name=True), # no eos_token_id no max_new_tokens, no max_len + dict(eos_token_id=42, ignore_eos=True), # no max_new_tokens, no max_len with ignore_eos + dict(repetition_penalty=-1.0, eos_token_id=42, max_new_tokens=20), # invalid penalty + dict(temperature=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid temp + dict(top_p=-1.0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_p + dict(top_k=0, do_sample=True, eos_token_id=42, max_new_tokens=20), # invalid top_k +] +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.parametrize("generation_config", invalid_py_configs) +def test_python_generation_config_validation(model_tmp_path, generation_config): + model_id, temp_path = model_tmp_path + pipe = load_pipe([({"eos_token_id": 37}, "config.json")], temp_path) + + # 'unexisting_key_name' key validity is checked in pybind and ValueError will be returned + # instead of RuntimeError, which is returned when GenerationConfig values are validated + return_exception_type = ValueError if 'unexisting_key_name' in generation_config else RuntimeError + with pytest.raises(return_exception_type): + pipe.set_generation_config(ov_genai.GenerationConfig(**generation_config)) + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_unicode_pybind_decoding_1(): + # On this model this prompt generates unfinished utf string. + # Test that pybind will not fail. + model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') + pipe = read_model((model_id, path))[4] + res_str = pipe.generate(',', max_new_tokens=4) + assert '�' == res_str[-1] + + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_unicode_pybind_decoding_2(): + # On this model this prompt generates unfinished utf string. + # Test that pybind will not fail. 
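+    # '�' is U+FFFD, the Unicode replacement character emitted when an incomplete
+    # UTF-8 sequence is decoded, which is what the assert below checks for.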
+ model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') + pipe = read_model((model_id, path))[4] + res_str = pipe.generate([","], max_new_tokens=4) + assert '�' == res_str.texts[0][-1] + + +@pytest.mark.precommit +@pytest.mark.nightly +def test_unicode_pybind_decoding_3(): + # On this model this prompt generates unfinished utf-8 string + # and streams it. Test that pybind will not fail while we pass string to python. + model_id, path = 'katuni4ka/tiny-random-phi3', Path('tiny-random-phi3') + pipe = read_model((model_id, path))[4] + res_str = [] + pipe.generate(",", max_new_tokens=4, streamer=lambda x: res_str.append(x)) + assert '�' == res_str[-1] + + +@pytest.mark.skip(reason="probably both models ov + hf doesn't fit to memory") +@pytest.mark.precommit +@pytest.mark.nightly +@pytest.mark.skipif(sys.platform.startswith("win"), reason="not enough space for this model on Win") +def test_left_pad(): + # test left pad tokenizer post processing implementation + prompts = [ + "The Sun is yellow because", + "The Sun is yellow because [force left pad tokens]" + ] + models = read_model(("microsoft/phi-1_5", Path("phi-1_5/"))) + + config = { + "max_new_tokens": 20, + "num_beam_groups": 2, + "num_beams": 2, + "num_return_sequences": 2, + "do_sample": False, + "diversity_penalty": 1.0, + # phi 1_5 has no eos_token_id in model configuration + # ov genai will detect eos_token_id from tokenizer config + # hf implementation doesn't fetch it from tokenizer config and defaults to None + # align ov genai and hf by setting eos_token_id explicitly + "eos_token_id": 50256, + } + + models[2].pad_token = models[2].eos_token + run_hf_ov_genai_comparison_batched(models, config, prompts) + + +@pytest.mark.parametrize("generation_config", test_configs) +@pytest.mark.parametrize("prompt", batched_prompts[1:]) # num_beams=15 diverges on the first prompt. +@pytest.mark.precommit +def test_continuous_batching_vs_stateful(prompt, generation_config): + model_id, path, tokenizer, model, stateful = read_model(( + "facebook/opt-125m", + Path("opt-125m") + )) + cb = get_continuous_batching(path) + generated = cb.generate(prompt, **generation_config) + reference = stateful.generate(prompt, **generation_config) + assert generated.texts == reference.texts + if 1 != generation_config.get("num_return_sequences", 1): + # Stateful puts zeroes to generated.scores. Don't compare them. + for gen, ref in zip(generated.scores, reference.scores): + assert math.isclose(gen, ref, abs_tol=0.0003) + +@pytest.mark.parametrize("prompt", prompts) +@pytest.mark.precommit +def test_cb_streamer_vs_return_vs_stateful(prompt): + model_id, path, tokenizer, model, stateful = read_model(( + "facebook/opt-125m", + Path("opt-125m") + )) + cb = get_continuous_batching(path) + streamed = [] + generated = cb.generate(prompt, max_new_tokens=20, streamer=lambda subword: streamed.append(subword)) + reference = stateful.generate(prompt, max_new_tokens=20) + assert generated == "".join(streamed) + assert "".join(streamed) == reference + +def run_perf_metrics_collection(model_descr, generation_config: Dict, prompt: str) -> ov_genai.PerfMetrics: + model_id, path, tokenizer, model, pipe = model_descr + + config = generation_config.copy() # to avoid side effects + + if 'do_sample' not in config: + # Some HF models have default do_sample = True, and if we set beam search generation config + # it conflicts with `diversity_penalty` and/or `num_beam_groups`. + # Need to set explicitly to False, but only if test arguments omitted this arg. 
+        # Do not apply 'repetition_penalty' if sampling is not used.
+        config['do_sample'] = False
+        config['repetition_penalty'] = None
+    return pipe.generate([prompt], **config).perf_metrics
+
+
+test_cases = [
+    (dict(max_new_tokens=20), 'table is made of'),
+]
+@pytest.mark.parametrize("generation_config,prompt", test_cases)
+@pytest.mark.parametrize("model_descr", get_models_list())
+@pytest.mark.precommit
+@pytest.mark.nightly
+def test_perf_metrics(model_descr, generation_config, prompt):
+    import time
+    start_time = time.perf_counter()
+    perf_metrics = run_perf_metrics_collection(read_model(model_descr), generation_config, prompt)
+    total_time = (time.perf_counter() - start_time) * 1000
+
+    # Check that load time is adequate.
+    load_time = perf_metrics.get_load_time()
+    assert load_time > 0 and load_time < 1000.0
+
+    # Check that the numbers of input and generated tokens are adequate.
+    num_generated_tokens = perf_metrics.get_num_generated_tokens()
+    assert num_generated_tokens > 0 and num_generated_tokens <= generation_config['max_new_tokens']
+
+    num_input_tokens = perf_metrics.get_num_input_tokens()
+    assert num_input_tokens > 0 and num_input_tokens <= len(prompt)
+
+    mean_ttft, std_ttft = perf_metrics.get_ttft()
+    assert (mean_ttft, std_ttft) == (perf_metrics.get_ttft().mean, perf_metrics.get_ttft().std)
+    assert mean_ttft > 0 and mean_ttft < 1000.0
+
+    mean_tpot, std_tpot = perf_metrics.get_tpot()
+    assert (mean_tpot, std_tpot) == (perf_metrics.get_tpot().mean, perf_metrics.get_tpot().std)
+    assert mean_tpot > 0 and mean_tpot < 1000.0
+
+    mean_throughput, std_throughput = perf_metrics.get_throughput()
+    assert (mean_throughput, std_throughput) == (perf_metrics.get_throughput().mean, perf_metrics.get_throughput().std)
+    assert mean_throughput > 0 and mean_throughput < 20000.0
+
+    mean_gen_duration, std_gen_duration = perf_metrics.get_generate_duration()
+    assert (mean_gen_duration, std_gen_duration) == (perf_metrics.get_generate_duration().mean, perf_metrics.get_generate_duration().std)
+    assert mean_gen_duration > 0 and load_time + mean_gen_duration < total_time
+    assert std_gen_duration == 0
+
+    mean_tok_duration, std_tok_duration = perf_metrics.get_tokenization_duration()
+    assert (mean_tok_duration, std_tok_duration) == (perf_metrics.get_tokenization_duration().mean, perf_metrics.get_tokenization_duration().std)
+    assert mean_tok_duration > 0 and mean_tok_duration < mean_gen_duration
+    assert std_tok_duration == 0
+
+    mean_detok_duration, std_detok_duration = perf_metrics.get_detokenization_duration()
+    assert (mean_detok_duration, std_detok_duration) == (perf_metrics.get_detokenization_duration().mean, perf_metrics.get_detokenization_duration().std)
+    assert mean_detok_duration > 0 and mean_detok_duration < mean_gen_duration
+    assert std_detok_duration == 0
+
+    # Assert that statistics calculated manually from the raw counters match the results from PerfMetrics.
+    raw_metrics = perf_metrics.raw_metrics
+    raw_dur = np.array(raw_metrics.generate_durations) / 1000
+    assert np.allclose(mean_gen_duration, np.mean(raw_dur))
+    assert np.allclose(std_gen_duration, np.std(raw_dur))
+
+    raw_dur = np.array(raw_metrics.tokenization_durations) / 1000
+    assert np.allclose(mean_tok_duration, np.mean(raw_dur))
+    assert np.allclose(std_tok_duration, np.std(raw_dur))
+
+    raw_dur = np.array(raw_metrics.detokenization_durations) / 1000
+    assert np.allclose(mean_detok_duration, np.mean(raw_dur))
+    assert np.allclose(std_detok_duration, np.std(raw_dur))
+
+    assert
len(raw_metrics.m_times_to_first_token) > 0 + assert len(raw_metrics.m_batch_sizes) > 0 + assert len(raw_metrics.m_durations) > 0 diff --git a/tests/python_tests/test_preemption.py b/tests/python_tests/test_preemption.py new file mode 100644 index 0000000000..239ae6399c --- /dev/null +++ b/tests/python_tests/test_preemption.py @@ -0,0 +1,178 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from openvino_genai import GenerationConfig +from common import get_model_and_tokenizer, save_ov_model_from_optimum, generate_and_compare_with_reference_text, \ + get_scheduler_config, run_test_pipeline, get_beam_search, get_greedy, \ + get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ + get_multinomial_temperature_and_top_k, get_multinomial_temperature, get_multinomial_temperature_and_top_p +from test_sampling import RandomSamplingTestStruct, get_current_plarform_ref_texts + + +def get_greedy_seq_len_300() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_return_sequences = 3 + generation_config.max_new_tokens = 300 + return generation_config + +def get_beam_search_seq_len_300() -> GenerationConfig: + generation_config = GenerationConfig() + generation_config.num_beam_groups = 3 + generation_config.num_beams = 6 + generation_config.max_new_tokens = 300 + generation_config.num_return_sequences = generation_config.num_beams + return generation_config + +scheduler_params_list = [({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), + ({"num_kv_blocks": 2, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_greedy()), + ({"num_kv_blocks": 10, "block_size": 32, "dynamic_split_fuse": True}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 10, "block_size": 32, "dynamic_split_fuse": False}, get_greedy_seq_len_300()), + ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": True, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), + ({"num_kv_blocks": 34, "block_size": 32, "dynamic_split_fuse": False, "max_num_batched_tokens": 256, "max_num_seqs": 256}, get_beam_search()), + ({"num_kv_blocks": 100, "block_size": 32, "dynamic_split_fuse": True}, get_beam_search_seq_len_300()), + ({"num_kv_blocks": 100, "block_size": 32, "dynamic_split_fuse": False}, get_beam_search_seq_len_300())] +@pytest.mark.parametrize("params", scheduler_params_list) +@pytest.mark.precommit +def test_preemption(tmp_path, params): + run_test_pipeline(tmp_path, "facebook/opt-125m", params[0], params[1]) + + +multinomial_params = RandomSamplingTestStruct( + generation_config=[ + get_multinomial_temperature(), + get_multinomial_temperature_and_top_p(), + get_multinomial_temperature_and_top_k(), + ], + prompts=[ + "What is OpenVINO?", + "How are you?", + "Tell me something about Canada?", + ], + ref_texts=get_current_plarform_ref_texts({ + "linux": [ + [ + "\n\nOpenVINO is a live platform that allows users to create and manage a new library for open source applications.\n\nOpenVINO is" + ], + [ + " You're getting much better results from doing this, than you are by not doing this. I have a BH and I was so far" + ], + [ + "\nI'm from Canada, and I'm from the US, so I'm not sure.\nI think you mean the Canadian version." 
+ ], + ], + "win32": [ + [ + "\n\nOpenVINO is a live platform that allows users to create and manage a new library of applications on the Virtuoso server, which can" + ], + [ + " You're getting much better results from doing this, than you are by not doing this. If you are truly trying to do something good," + ], + [ + "\nI'm from Canada, and I'm from the US, so I'm not sure what you're talking about.\nI'm Canadian and I" + ], + ], + }), +) + + +# todo: Anastasiia Pnevskaya: fix the test because it is hanging according max_new_tokens = std::numeric_limits<std::size_t>::max() +@pytest.mark.parametrize("dynamic_split_fuse", [True, False]) +@pytest.mark.precommit +@pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. Test passes on CI but fails locally.") +def test_preemption_with_multinomial(tmp_path, dynamic_split_fuse): + generation_configs = multinomial_params.generation_config + for config in generation_configs: + config.rng_seed = 0 + config.max_new_tokens = 30 + model_id : str = "facebook/opt-125m" + model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + + model_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, model_path) + + scheduler_config = get_scheduler_config({"num_kv_blocks": 3, "block_size": 32, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) + generate_and_compare_with_reference_text(model_path, multinomial_params.prompts, multinomial_params.ref_texts, generation_configs, scheduler_config) + + +multinomial_params_n_seq = RandomSamplingTestStruct( + generation_config=[ + get_multinomial_temperature(), + get_multinomial_temperature_and_num_return_sequence(), + get_multinomial_all_parameters(), + ], + prompts=[ + "Artificial intelligence ", + "What is the current", + "Tell me something about UAE?", + ], + ref_texts=get_current_plarform_ref_texts({ + "linux": [ + [ + "\nI've seen this expression used too many times without making sense.\nAs an AI engineer, and as a scientist, we should make everything easier" + ], + [ + " position of the Z-shaped groove?\n0.41\nWhat is the current position of the Z-shaped groove?\n0.11\n", + " status of all of this? I can't stop thinking about it.\nIt's been a while since I've seen it. I found it a", + " status of your blog? Do you accept feedback?\nYes, I’m happy to accept feedback at this time (I’m a" + ], + [ + "\nIt's in the middle of nowhere if you haven’t seen one yet! It might be more convenient there than anywhere else.. maybe take", + "\nUAE is a country with some great culture that has been living under Islamic oppression for almost 60 years now (including 20 years as part of Arab", + "\nNope, just wanted to say how awesome and beautiful it was when my brother came back from an adventure trip across Asia - our 2nd year", + "\nI don't know anything. I'm not sure what kind this sub wants though... but apparently they are pretty bad at making videos/photos", + ], + ], + "win32": [ + [ + "\nI've had a friend with the capacity to test this in his own words.\nThe big problem with real-world results is the economics of" + ], + [ + " position of the patent application number of the present invention?\n\nIn the present invention, the present invention relates to an improved method for manufacturing a semic", + " status of your town? How many houses do you have?\nThere are about three houses in our town. 
The closest place to us is about 25", + " status of all the other passengers?\nWe're the only ones left, so no...\nI don't think they'll really leave.\nThey" + ], + [ + "\nI don't have any knowledge on them. We are based out near Dubai so hopefully they will take care of us soon enough :) thanks though :", + "\nUAE is not one of the richest countries in Asia but definitely among those most corrupt nations because this corruption (and its own endemic practices) still", + "\nNope, I'm just going through my first semester there right now and it was nice to see some people who were doing well haha - we", + "\nIt's a country where your parents can never give you anything at all! It also has an extremely low education system for many years... You", + ], + ], + "darwin": [ + [ + "\nI've had a friend with the capacity to test this in his own words.\nThe big problem with real-world results is the rigidity" + ], + [ + " position of the patent application number of the present invention?\n\nIn the present invention, the present invention relates to an improved method for manufacturing a semic", + " status of your town? How many houses do you have?\nThere are about three houses in our town. The closest place to us is about 25", + " status of all the other passengers?\nWe're the only ones left, so no...\nI don't think they'll really leave.\nThey" + ], + [ + "\nI don't have any knowledge on them. We are based out near Dubai so hopefully they will take care of us soon enough :) thanks though :", + "\nUAE is not one of the richest countries in Asia but definitely among those most corrupt nations because this corruption (and its own endemic practices) still", + "\nNope, I'm just going through my first semester there right now and it was nice to see some people who were doing well haha - we", + "\nIt's a country where your parents can never give you anything at all! It also has an extremely low education system for many years... You", + ], + ], + }), +) + + +@pytest.mark.parametrize("dynamic_split_fuse", [True, False]) +@pytest.mark.precommit +@pytest.mark.skip(reason="Random sampling results are non deterministic due to: discrete_distribution impl depends on platform, model inference results may depend on CPU. 
Test passes on CI but fails locally.") +def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse): + generation_configs = multinomial_params_n_seq.generation_config + for config in generation_configs: + config.rng_seed = 0 + model_id : str = "facebook/opt-125m" + model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + + model_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, model_path) + + # needed kv_blocks - 16 (2 blocks per sequence (30 tokens to generated text + prompt (> 2 tokens)) * (1 + 3 + 4) seq ) + scheduler_config = get_scheduler_config({"num_kv_blocks": 8, "block_size": 32, "dynamic_split_fuse": dynamic_split_fuse, "max_num_batched_tokens": 256, "max_num_seqs": 256}) + generate_and_compare_with_reference_text(model_path, multinomial_params_n_seq.prompts, multinomial_params_n_seq.ref_texts, generation_configs, scheduler_config) \ No newline at end of file diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py new file mode 100644 index 0000000000..1e7a1b81a5 --- /dev/null +++ b/tests/python_tests/test_sampling.py @@ -0,0 +1,353 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +import os +import sys +import pytest +import shutil +import sys +from dataclasses import dataclass +from pathlib import Path +from openvino_genai import ContinuousBatchingPipeline, GenerationConfig, Tokenizer +from typing import List, TypedDict + +from common import run_test_pipeline, get_models_list, get_model_and_tokenizer, save_ov_model_from_optimum, \ + generate_and_compare_with_reference_text, get_greedy, get_beam_search, get_multinomial_temperature, \ + get_greedy_with_penalties, get_multinomial_temperature, \ + get_multinomial_temperature_and_top_k, get_multinomial_temperature_and_top_p, \ + get_multinomial_temperature_top_p_and_top_k, DEFAULT_SCHEDULER_CONFIG, get_greedy_with_repetition_penalty, \ + get_multinomial_all_parameters, get_multinomial_temperature_and_num_return_sequence, \ + generate_and_compare_with_reference_text, get_greedy, get_greedy_with_min_and_max_tokens, \ + get_greedy_with_single_stop_string, get_greedy_with_multiple_stop_strings, get_greedy_with_multiple_stop_strings_no_match, \ + get_beam_search, get_beam_search_min_and_max_tokens, get_beam_search_with_single_stop_string, \ + get_beam_search_with_multiple_stop_strings, get_beam_search_with_multiple_stop_strings_no_match, get_multinomial_max_and_min_token, \ + get_multinomial_temperature_and_frequence_penalty, get_multinomial_temperature_and_presence_penalty, \ + generate_and_compare_with_hf, get_multinomial_temperature_and_repetition_penalty, get_scheduler_config, \ + run_continuous_batching + + +@pytest.mark.precommit +@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "precommit"))) +@pytest.mark.xfail( + raises=RuntimeError, + reason="Test fails with error: CPU: head size must be multiple of 16, current: X. 
CVS-145986.", + strict=True, +) +def test_sampling_precommit(tmp_path, model_id): + run_test_pipeline(tmp_path, model_id) + + +@pytest.mark.nightly +@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "nightly"))) +def test_sampling_nightly(tmp_path, model_id): + run_test_pipeline(tmp_path, model_id) + +@pytest.mark.real_models +@pytest.mark.parametrize("model_id", get_models_list(os.path.join(os.path.dirname(os.path.realpath(__file__)), "models", "real_models"))) +def test_real_models(tmp_path, model_id): + run_test_pipeline(tmp_path, model_id) + + +@pytest.mark.precommit +def test_eos_beam_search(tmp_path): + ''' + Current test checks that in case of beam search, some generation results + explicitly have EOS token at the end, which is aligned with HF + + Example of current output: + { -1.23264, that I don't know about. + I don't know what you're talking about, but I'm pretty sure it's a Canadian thing.</s> } + ''' + model_id = "facebook/opt-125m" + prompts = ["Tell me something about Canada"] + generation_configs = [get_beam_search()] + scheduler_config = get_scheduler_config() + generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) + + +@pytest.mark.precommit +def test_eos_greedy(tmp_path): + ''' + Current test checks that in case of gready, some generation results + explicitly have EOS token at the end, which is aligned with HF: + + Example of current output: + { a software program</s> } + ''' + model_id = "bigscience/bloomz-560m" + prompts = ["What is OpenVINO?"] + generation_configs = [get_greedy()] + scheduler_config = get_scheduler_config() + generate_and_compare_with_hf(model_id, prompts, generation_configs, scheduler_config, tmp_path) + +@pytest.mark.precommit +@pytest.mark.parametrize("generation_config", [get_greedy(), get_greedy_with_min_and_max_tokens(), get_greedy_with_repetition_penalty(), get_greedy_with_single_stop_string(), + get_greedy_with_multiple_stop_strings(), get_greedy_with_multiple_stop_strings_no_match(), + get_beam_search(), get_beam_search_min_and_max_tokens(), get_beam_search_with_multiple_stop_strings_no_match(), ], + ids=[ + "greedy", + "greedy_with_min_and_max_tokens", + "greedy_with_repetition_penalty", + "greedy_with_single_stop_string", + "greedy_with_multiple_stop_strings", + "greedy_with_multiple_stop_strings_no_match", + "beam", + "beam_search_min_and_max_tokens", + "beam_search_with_multiple_stop_strings_no_match", + ]) +def test_individual_generation_configs_deterministic(tmp_path, generation_config): + prompts = [ + "What is OpenVINO?", + ] + generation_configs = [generation_config] + model_id : str = "facebook/opt-125m" + generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) + +@pytest.mark.precommit +@pytest.mark.xfail( + raises=AssertionError, + reason="Stop strings do not seem to work as expected with beam search in HF, so comparison will fail. 
If it changes, these cases shall be merged to the test above.", + strict=True, +) +@pytest.mark.parametrize("generation_config", [get_beam_search_with_single_stop_string(), get_beam_search_with_multiple_stop_strings(),], + ids=[ + "beam_search_with_single_stop_string", + "beam_search_with_multiple_stop_strings", + ]) +def test_beam_search_with_stop_string(tmp_path, generation_config): + prompts = [ + "What is OpenVINO?", + ] + generation_configs = [generation_config] + model_id : str = "facebook/opt-125m" + generate_and_compare_with_hf(model_id, prompts, generation_configs, DEFAULT_SCHEDULER_CONFIG, tmp_path) + + +class PlatformsRefTexts(TypedDict, total=False): + linux: List[List[str]] + win32: List[List[str]] + darwin: List[List[str]] + + +def get_current_plarform_ref_texts(ref_texts: PlatformsRefTexts) -> List[List[str]]: + # mac and win often have identical results + # to avoid duplication, use win32 ref_text if no mac ref_texts were found + if sys.platform == "darwin": + result = ref_texts.get("darwin") or ref_texts.get("win32") + else: + result = ref_texts.get(sys.platform) + if not result: + raise RuntimeError("No ref_texts were provided") + return result + + +@dataclass +class RandomSamplingTestStruct: + generation_config: GenerationConfig + prompts: List[str] + ref_texts: List[List[str]] + + +RANDOM_SAMPLING_TEST_CASES = [ + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature(), + prompts=["What is OpenVINO?"], + ref_texts=[ + [ + "\n\nOpenVINO is a software development platform developed by OpenVINO, a set of technology companies and startups that enables developers to use the most" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_and_top_p(), + prompts=["What is OpenVINO?"], + ref_texts=get_current_plarform_ref_texts({ + "linux": [ + [ + "\nOpenVINO is an online application that allows users to create, test, and analyze their own software using a collection of software packages. The application" + ] + ], + "win32": [ + [ + "\n\nOpenVINO is a software development platform designed to allow developers to develop and commercialize the most important software products on the web. OpenV" + ] + ], + }) + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_and_top_k(), + prompts=["What is OpenVINO?"], + ref_texts=[ + [ + "\n\nOpenVINO is a software that allows users to create a virtual machine with the ability to create a virtual machine in a virtual environment. Open" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_top_p_and_top_k(), + prompts=["What is OpenVINO?"], + ref_texts=get_current_plarform_ref_texts({ + "linux": [ + [ + "\nOpenVINO is an open source software that allows developers to create, manage, and distribute software. It is an open source project that allows developers" + ] + ], + "win32": [ + [ + "\n\nOpenVINO is a software that allows users to create a virtual machine with the ability to create a virtual machine in a virtual environment. Open" + ] + ], + }), + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_and_repetition_penalty(), + prompts=["What is OpenVINO?"], + ref_texts=[ + [ + "\nOpen Vino's are a new and improved way to find cheap, fast-investment frozen vegetables that have no waste or calories. 
They're" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_and_num_return_sequence(), + prompts=["What is location of"], + ref_texts=[ + [ + " the exact same image?\nI've tried multiple times to find it, but I'm still not sure. I am sure it's the exact same", + " your new house?\nAnywhere that has a GPS. It will be up to you.", + " your cat? He is more likely to be on the floor with him.\nTalduck" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_all_parameters(), + prompts=["Tell me something about UAE"], + ref_texts=get_current_plarform_ref_texts({ + "linux": [ + [ + " and how it's not like we're all in the same boat right now lol (or even close) 😂😁! Just curious :) If", + "? You are my country... so what does our military do here?? What am i missing out on?? And why don't u tell us?", + "?\nThe U.S government has been doing quite well with foreign-made aircraft for many years under US administration....and they have very good reasons", + "? I think that is a bit of an anomaly, but you might want to ask yourself this question: Where can some young people from Dubai or Bahrain", + ] + ], + "win32": [ + [ + "? I think that is a bit of an anomaly, especially since there aren't many Americans living here (like us). What makes you say they've", + "? You are my country... so what does our future have to do with your problems?? \U0001f609\U0001f608\U0001f495 \U0001f5a4\ufffd", + "?\nThe U.S government has been doing quite well for decades now when compared strictly directly or indirectly as regards security issues.. They even made some", + " and how it's not like we're all in the same boat either! We had such fun meeting each other at different times this past summer :) It", + ] + ], + }), + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_and_presence_penalty(), + prompts=["What is OpenVINO?"], + ref_texts=[ + [ + "\n\nOpenVINO is a software development platform developed by OpenVINO, Inc., which uses a RESTful API for server-side web applications" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_temperature_and_frequence_penalty(), + prompts=["What is OpenVINO?"], + ref_texts=[ + [ + "\n\nOpenVINO is a software development platform developed by OpenVINO, Inc., which offers the Linux-based platform. OpenVINO's" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_greedy_with_penalties(), + prompts=["What is OpenVINO?"], + ref_texts=[ + [ + "\nOpenVINO is a software that allows users to create and manage their own virtual machines. It's designed for use with Windows, Mac OS X" + ] + ], + ), + RandomSamplingTestStruct( + generation_config=get_multinomial_max_and_min_token(), + prompts=["What is OpenVINO?"], + ref_texts=get_current_plarform_ref_texts({ + "linux": [ + [ + "\nOpenVINO is a Linux distro. It's not as simple as using the Linux distro itself. OpenVINO is essentially a dist", + "\nOpenVINO is an open-source open-source software that allows anyone to work with a virtual machine, from a smartphone to an iPhone,", + "\n\nOpenVINO is a social networking tool. OpenVINO is a free virtualization service that works at scale. The tool provides the ability", + ] + ], + "win32": [ + [ + "\nOpenVINO is the latest addition to the OpenVINO series of platforms. 
OpenVINO is an open source software development framework for all platforms", + "\nOpenVINO is a browser-based virtual assistant that enables developers and developers to quickly communicate with their own virtual machines. Using this virtual assistant,", + "\n\nOpenVINO is a program designed to help you find the best open source open source software. The program, which is a lightweight package and", + ] + ], + }), + ), +] + + +@pytest.mark.precommit +@pytest.mark.parametrize("test_struct", RANDOM_SAMPLING_TEST_CASES, + ids=["multinomial_temperature", + "multinomial_temperature_and_top_p", + "multinomial_temperature_and_top_k", + "multinomial_temperature_top_p_and_top_k", + "multinomial_temperature_and_repetition_penalty", + "multinomial_temperature_and_num_return_sequence", + "multinomial_all_parameters", + "multinomial_temperature_and_presence_penalty", + "multinomial_temperature_and_frequence_penalty", + "greedy_with_penalties", + "multinomial_max_and_min_token"]) +def test_individual_generation_configs_random(tmp_path, test_struct: RandomSamplingTestStruct): + generation_config = test_struct.generation_config + + prompts = test_struct.prompts + generation_config.rng_seed = 0 + generation_configs = [generation_config] + model_id : str = "facebook/opt-125m" + model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + + model_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, model_path) + + # run multinomial without comparison with reference + _ = run_continuous_batching(model_path, DEFAULT_SCHEDULER_CONFIG, prompts, generation_configs) + + # Reference comparison is not performed as sampling results are non-deterministic. + # Discrete_distribution impl depends on platform, model inference results may depend on CPU. 
+ + + +@pytest.mark.precommit +@pytest.mark.parametrize("sampling_config", [get_greedy(), get_beam_search(), get_multinomial_all_parameters()]) +def test_post_oom_health(tmp_path, sampling_config): + generation_config = sampling_config + generation_config.ignore_eos = True + generation_config.max_new_tokens = 1000000 + + scheduler_config = get_scheduler_config() + # Low cache size to trigger OOM quickly + scheduler_config.num_kv_blocks = 10 + generation_configs = [generation_config] + model_id : str = "facebook/opt-125m" + model, hf_tokenizer = get_model_and_tokenizer(model_id, use_optimum=True) + + model_path : Path = tmp_path / model_id + save_ov_model_from_optimum(model, hf_tokenizer, model_path) + + pipe = ContinuousBatchingPipeline(model_path.absolute().as_posix(), Tokenizer(model_path.absolute().as_posix(), {}), scheduler_config, "CPU", {}) + # The first run should return an incomplete response + output = pipe.generate(["What is OpenVINO?"], generation_configs) + assert len(output) + assert len(output[0].m_generation_ids) + # Same for the second run: make sure the cleanup worked and free cache blocks are available after the recent OOM + output = pipe.generate(["What is OpenVINO?"], generation_configs) + assert len(output) + assert len(output[0].m_generation_ids) + del pipe + shutil.rmtree(model_path) diff --git a/tests/python_tests/test_vlm_api.py b/tests/python_tests/test_vlm_api.py new file mode 100644 index 0000000000..b32b2b5fb6 --- /dev/null +++ b/tests/python_tests/test_vlm_api.py @@ -0,0 +1,111 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import openvino_genai +import pytest +import gc +import os +import numpy as np +from PIL import Image +from multiprocessing import Process + +from openvino_genai import VLMPipeline +from openvino import Tensor +from common import get_greedy, get_image_by_link, get_beam_search, get_multinomial_all_parameters + +def get_ov_model(model_dir): + import sys + from pathlib import Path + #TODO: use optimum-intel + + sys.path.append(str(Path(__file__).resolve().parents[2] / 'samples/cpp/visual_language_chat')) + import importlib + export_MiniCPM = importlib.import_module("export_MiniCPM-V-2_6", "export_MiniCPM") + convert_llm = getattr(export_MiniCPM, "convert_llm") + convert_vision_encoder = getattr(export_MiniCPM, "convert_vision_encoder") + from transformers import AutoModel, AutoTokenizer, AutoProcessor + import os + import openvino_tokenizers + import openvino as ov + import gc + + model_id = "openbmb/MiniCPM-V-2_6" + ckpt = Path(os.path.join(model_dir, "ckpt")) + if not ckpt.exists(): + snapshot_download = getattr(export_MiniCPM, "snapshot_download") + patch_model_code = getattr(export_MiniCPM, "patch_model_code") + snapshot_download(model_id, local_dir=ckpt, force_download=True) + patch_model_code(ckpt) + model = AutoModel.from_pretrained(ckpt, trust_remote_code=True) + model.eval() + model.config.save_pretrained(model_dir) + tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True) + tokenizer.save_pretrained(model_dir) + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(tokenizer, with_detokenizer=True) + ov.save_model(ov_tokenizer, os.path.join(model_dir, "openvino_tokenizer.xml")) + ov.save_model(ov_detokenizer, os.path.join(model_dir, "openvino_detokenizer.xml")) + processor = AutoProcessor.from_pretrained(ckpt, trust_remote_code=True) + processor.save_pretrained(model_dir) + + convert_llm(model, model_dir) + del model.llm + gc.collect() + + 
convert_vision_encoder(model, model_dir) + return model_dir + +sampling_configs = [ + get_beam_search(), + get_greedy(), + get_multinomial_all_parameters() +] + +prompts = [ + "What is on the image?", + "What is special about this image?", + "Tell me more about this image." +] + +image_links = [ + "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11", + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg", + "https://github.com/user-attachments/assets/8c9ae017-7837-4abc-ae92-c1054c9ec350" +] + +image_links_for_testing = [ + [], + [image_links[0]], + [image_links[1], image_links[0]], + [image_links[0], image_links[2], image_links[1]] +] + +@pytest.mark.precommit +def test_vlm_pipeline(tmp_path): + import os + + def streamer(word: str) -> bool: + print(word, end="") + return False + + model_path = get_ov_model(os.path.join(tmp_path, "miniCPM")) + + for generation_config in sampling_configs: + for links in image_links_for_testing: + images = [] + for link in links: + images.append(get_image_by_link(link)) + + pipe = VLMPipeline(model_path, "CPU") + pipe.start_chat() + + pipe.generate(prompts[0], images=images, generation_config=generation_config, streamer=streamer) + + for prompt in prompts[1:]: + pipe.generate(prompt, generation_config=generation_config, streamer=streamer) + + pipe.finish_chat() + gc.collect() + del pipe + gc.collect() + + diff --git a/tests/python_tests/test_whisper_generate_api.py b/tests/python_tests/test_whisper_generate_api.py new file mode 100644 index 0000000000..96648f3620 --- /dev/null +++ b/tests/python_tests/test_whisper_generate_api.py @@ -0,0 +1,525 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import openvino_genai as ov_genai +import functools +import pytest +import openvino_tokenizers +import openvino +from ov_genai_test_utils import get_whisper_models_list +import datasets +from transformers import WhisperProcessor, pipeline, AutoTokenizer +from optimum.intel.openvino import OVModelForSpeechSeq2Seq +import json +import time +import typing + + +@functools.lru_cache(1) +def read_whisper_model(params, **tokenizer_kwargs): + model_id, path = params + + processor = WhisperProcessor.from_pretrained(model_id, trust_remote_code=True) + + if (path / "openvino_encoder_model.xml").exists(): + opt_model = OVModelForSpeechSeq2Seq.from_pretrained( + path, + trust_remote_code=True, + compile=False, + device="CPU", + load_in_8bit=False, + ) + else: + + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer( + tokenizer, with_detokenizer=True, **tokenizer_kwargs + ) + + openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") + openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") + + # to store tokenizer config jsons with special tokens + tokenizer.save_pretrained(path) + + opt_model = OVModelForSpeechSeq2Seq.from_pretrained( + model_id, + export=True, + trust_remote_code=True, + compile=False, + device="CPU", + load_in_8bit=False, + ) + opt_model.generation_config.save_pretrained(path) + opt_model.config.save_pretrained(path) + opt_model.save_pretrained(path) + + opt_pipe = pipeline( + "automatic-speech-recognition", + model=opt_model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + ) + + return ( + model_id, + path, + opt_pipe, + ov_genai.WhisperPipeline( + str(path), 
device="CPU", config={"ENABLE_MMAP": False} + ), + ) + + +def compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id): + ds = datasets.load_dataset(dataset_id, "clean", split="validation") + opt_infer_time = 0 + genai_infer_time = 0 + failed = 0 + for ds_row in ds: + audio_sample = ds_row["audio"] + + start = time.time() + genai_result = genai_pipe.generate(audio_sample["array"].tolist()) + genai_infer_time += time.time() - start + + start = time.time() + result = opt_pipe(audio_sample) + opt_infer_time += time.time() - start + + if genai_result.texts[0] != result["text"]: + print(f'HuggingFace: {result["text"]}\n genai: {genai_result.texts[0]}') + failed += 1 + print(f"Inference time\nOpt: {opt_infer_time}\nGenAI: {genai_infer_time}") + if failed > 0: + print(f"Failed: {failed}") + assert failed == 0 + + +def get_samples_from_dataset( + language: str = "en", length: int = 30, long_form: bool = False +): + if not long_form: + ds = datasets.load_dataset( + "mozilla-foundation/common_voice_11_0", + language, + split="test", + streaming=True, + trust_remote_code=True, + ) + else: + ds = datasets.load_dataset( + "distil-whisper/meanwhile", + split="test", + streaming=True, + trust_remote_code=True, + ) + + ds = typing.cast(datasets.IterableDataset, ds) + ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16000)) + ds = ds.take(length) + + return [x["audio"]["array"] for x in ds] + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list()) +@pytest.mark.parametrize("dataset_id", ["hf-internal-testing/librispeech_asr_dummy"]) +@pytest.mark.precommit +def test_whisper_on_hf_dataset(model_descr, dataset_id): + model_id, path, opt_pipe, genai_pipe = read_whisper_model(model_descr) + + compare_genai_and_opt_pipelines(opt_pipe, genai_pipe, dataset_id) + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.precommit +def test_whisper_config_constructor(model_descr): + model_id, path = model_descr + + config = ov_genai.WhisperGenerationConfig(str(path / "generation_config.json")) + + with open(path / "generation_config.json") as f: + original_config = json.load(f) + + assert original_config["decoder_start_token_id"] == config.decoder_start_token_id + assert original_config["max_length"] == config.max_length + assert original_config["eos_token_id"] == config.eos_token_id + assert original_config["pad_token_id"] == config.pad_token_id + if "task_to_id" in original_config: + assert original_config["task_to_id"]["translate"] == config.translate_token_id + assert original_config["task_to_id"]["transcribe"] == config.transcribe_token_id + assert original_config["no_timestamps_token_id"] == config.no_timestamps_token_id + assert original_config["is_multilingual"] == config.is_multilingual + + assert set(original_config["begin_suppress_tokens"]) == set( + config.begin_suppress_tokens + ) + + assert set(original_config["suppress_tokens"]) == set(config.suppress_tokens) + + config = ov_genai.WhisperGenerationConfig( + suppress_tokens=[1, 2], + begin_suppress_tokens=[3, 4], + max_new_tokens=100, + lang_to_id={"<|_ru|>": 42}, + ) + + assert set(config.suppress_tokens) == set([1, 2]) + assert set(config.begin_suppress_tokens) == set([3, 4]) + assert config.max_new_tokens == 100 + assert config.lang_to_id["<|_ru|>"] == 42 + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) +@pytest.mark.precommit +def 
test_whisper_constructors(model_descr, test_sample): + model_id, path = model_descr + model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + + expected = opt_pipe(test_sample)["text"] + + genai_result = ov_genai.WhisperPipeline( + str(path), device="CPU", config={"ENABLE_MMAP": False} + ).generate(test_sample) + + assert genai_result.texts[0] == expected + + genai_result = ov_genai.WhisperPipeline(str(path)).generate(test_sample) + + assert genai_result.texts[0] == expected + + tokenizer = ov_genai.Tokenizer(str(path)) + + genai_result = ov_genai.WhisperPipeline( + str(path), tokenizer=tokenizer, device="CPU", config={"ENABLE_MMAP": False} + ).generate(test_sample) + + assert genai_result.texts[0] == expected + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) +@pytest.mark.precommit +def test_max_new_tokens(model_descr, test_sample): + model_id, path = model_descr + model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + + expected = opt_pipe(test_sample, max_new_tokens=30)["text"] + + genai_result = ov_genai.WhisperPipeline(str(path)).generate( + test_sample, max_new_tokens=30 + ) + + assert genai_result.texts[0] == expected + + tokenizer = ov_genai.Tokenizer(str(path)) + + genai_pipeline = ov_genai.WhisperPipeline( + str(path), tokenizer=tokenizer, device="CPU", config={"ENABLE_MMAP": False} + ) + config = genai_pipeline.get_generation_config() + config.max_new_tokens = 30 + genai_result = genai_pipeline.generate(test_sample, config) + + assert genai_result.texts[0] == expected + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize( + "test_sample", get_samples_from_dataset(language="fr", length=3) +) +@pytest.mark.precommit +def test_language_mode_fr(model_descr, test_sample): + model_id, path = model_descr + model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + + expected = opt_pipe( + test_sample, max_new_tokens=30, generate_kwargs={"language": "fr"} + ) + + genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|fr|>") + + assert genai_result.texts[0] == expected["text"] + + config = pipe.get_generation_config() + config.max_new_tokens = 30 + config.language = "<|fr|>" + genai_result = pipe.generate(test_sample, config) + + assert genai_result.texts[0] == expected["text"] + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize( + "test_sample", get_samples_from_dataset(language="de", length=3) +) +@pytest.mark.precommit +def test_language_mode_de(model_descr, test_sample): + model_id, path = model_descr + model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + + expected = opt_pipe( + test_sample, max_new_tokens=30, generate_kwargs={"language": "de"} + ) + + genai_result = pipe.generate(test_sample, max_new_tokens=30, language="<|de|>") + + assert genai_result.texts[0] == expected["text"] + + config = pipe.get_generation_config() + config.max_new_tokens = 30 + config.language = "<|de|>" + genai_result = pipe.generate(test_sample, config) + + assert genai_result.texts[0] == expected["text"] + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize( + "test_sample", get_samples_from_dataset(language="fr", length=3) +) +@pytest.mark.precommit +def test_task_mode(model_descr, test_sample): + model_id, path = model_descr + model_id, path, opt_pipe, 
pipe = read_whisper_model(model_descr) + + expected = opt_pipe( + test_sample, + max_new_tokens=30, + generate_kwargs={"language": "fr", "task": "translate"}, + ) + + genai_result = pipe.generate( + test_sample, max_new_tokens=30, language="<|fr|>", task="translate" + ) + + assert genai_result.texts[0] == expected["text"] + + config = pipe.get_generation_config() + config.max_new_tokens = 30 + config.language = "<|fr|>" + config.task = "translate" + genai_result = pipe.generate(test_sample, config) + + assert genai_result.texts[0] == expected["text"] + + expected = opt_pipe( + test_sample, + max_new_tokens=30, + generate_kwargs={"language": "ru", "task": "translate"}, + ) + + genai_result = pipe.generate( + test_sample, max_new_tokens=30, language="<|ru|>", task="translate" + ) + + assert genai_result.texts[0] == expected["text"] + + config = pipe.get_generation_config() + config.max_new_tokens = 30 + config.language = "<|ru|>" + config.task = "translate" + genai_result = pipe.generate(test_sample, config) + + assert genai_result.texts[0] == expected["text"] + + # for English audio, transcribe output seems to be equivalent to the translate task + expected = opt_pipe( + test_sample, + max_new_tokens=30, + generate_kwargs={"language": "en", "task": "transcribe"}, + ) + + genai_result = pipe.generate( + test_sample, max_new_tokens=30, language="<|en|>", task="transcribe" + ) + + assert genai_result.texts[0] == expected["text"] + + config = pipe.get_generation_config() + config.max_new_tokens = 30 + config.language = "<|en|>" + config.task = "transcribe" + genai_result = pipe.generate(test_sample, config) + + assert genai_result.texts[0] == expected["text"] + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize( + "test_sample", + [ + *get_samples_from_dataset(language="fr", length=2), + *get_samples_from_dataset(language="de", length=2), + *get_samples_from_dataset(language="es", length=2), + ], +) +@pytest.mark.precommit +def test_language_autodetect(model_descr, test_sample): + model_id, path = model_descr + model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + + input_features = opt_pipe.feature_extractor(test_sample) + language_id = opt_pipe.model.detect_language(input_features["input_features"])[0] + # ensure the detected language is not English + assert language_id != pipe.get_generation_config().lang_to_id["<|en|>"] + + expected = opt_pipe( + test_sample, + max_new_tokens=30, + ) + + genai_result = pipe.generate(test_sample, max_new_tokens=30) + + assert genai_result.texts[0] == expected["text"] + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize( + "test_sample", + [ + *get_samples_from_dataset(language="en", length=10, long_form=True), + ], +) +@pytest.mark.precommit +def test_return_timestamps_short_form(model_descr, test_sample): + model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + # long form audio not supported yet + test_sample = test_sample[: 16000 * 30] + + expected = opt_pipe( + test_sample, + return_timestamps=True, + ) + + genai_result = pipe.generate( + test_sample.tolist(), + return_timestamps=True, + ) + + assert genai_result.texts[0] == expected["text"] + + assert len(genai_result.chunks) == len(expected["chunks"]) + + for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): + assert opt_chunk["text"] == genai_chunk.text + assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) + assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 
2) + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize( + "test_sample", + [ + *get_samples_from_dataset(language="en", length=10, long_form=True), + ], +) +@pytest.mark.precommit +def test_return_timestamps_max_new_tokens_short_form(model_descr, test_sample): + model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + # long form audio not supported yet + test_sample = test_sample[: 16000 * 30] + + expected = opt_pipe( + test_sample, + return_timestamps=True, + max_new_tokens=15, + generate_kwargs={"language": "en"}, + ) + + genai_result = pipe.generate( + test_sample.tolist(), + max_new_tokens=15, + return_timestamps=True, + language="<|en|>", + ) + + assert genai_result.texts[0] == expected["text"] + + assert len(genai_result.chunks) == len(expected["chunks"]) + + for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): + assert opt_chunk["text"] == genai_chunk.text + assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) + if opt_chunk["timestamp"][1]: + assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) + else: + assert opt_chunk["timestamp"][1] == None + assert round(genai_chunk.end_ts, 2) == -1.0 + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize( + "test_sample", + [ + *get_samples_from_dataset(language="en", length=10, long_form=True), + *get_samples_from_dataset(language="fr", length=10, long_form=True), + ], +) +@pytest.mark.precommit +def test_longform_audio_return_timestamps(model_descr, test_sample): + model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + + expected = opt_pipe( + test_sample, + return_timestamps=True, + ) + + genai_result = pipe.generate( + test_sample, + return_timestamps=True, + ) + + assert genai_result.texts[0] == expected["text"] + + assert len(genai_result.chunks) == len(expected["chunks"]) + + for opt_chunk, genai_chunk in zip(expected["chunks"], genai_result.chunks): + assert opt_chunk["text"] == genai_chunk.text + assert opt_chunk["timestamp"][0] == round(genai_chunk.start_ts, 2) + if opt_chunk["timestamp"][1]: + assert opt_chunk["timestamp"][1] == round(genai_chunk.end_ts, 2) + else: + assert opt_chunk["timestamp"][1] == None + assert round(genai_chunk.end_ts, 2) == -1.0 + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize( + "test_sample", + [ + *get_samples_from_dataset(language="en", length=3, long_form=True), + *get_samples_from_dataset(language="sp", length=3, long_form=True), + ], +) +@pytest.mark.precommit +def test_longform_audio(model_descr, test_sample): + model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + + expected = opt_pipe(test_sample, return_timestamps=True) + + genai_result = pipe.generate(test_sample) + + assert genai_result.texts[0] == expected["text"] + + assert genai_result.chunks == None + + +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize( + "test_sample", + get_samples_from_dataset(language="en", length=1), +) +@pytest.mark.precommit +def test_smoke(model_descr, test_sample): + model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + + expected = opt_pipe(test_sample) + + genai_result = pipe.generate(test_sample) + + assert genai_result.texts[0] == expected["text"] + + assert "chunks" not in expected + assert genai_result.chunks == None diff --git a/tests/python_tests/tokenizer_configs.py 
b/tests/python_tests/tokenizer_configs.py new file mode 100644 index 0000000000..45d60f998d --- /dev/null +++ b/tests/python_tests/tokenizer_configs.py @@ -0,0 +1,1012 @@ + +def get_tokenizer_configs(): + return { + "meta-llama/Meta-Llama-3-8B-Instruct": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "TheBloke/Mistral-7B-OpenOrca-GPTQ": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|im_end|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "TinyLlama/TinyLlama-1.1B-Chat-v1.0": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + }, + "upstage/SOLAR-10.7B-Instruct-v1.0": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'### System:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'### User:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'### Assistant:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ '### Assistant:\n' }}{% endif %}{% endfor %}" + }, + "Nondzu/zephyr-speakleash-010-pl-3072-32-16-0.01": { + "bos_token": "<s>", + "eos_token": "<|im_end|>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' 
%}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + }, + "vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy": { + "bos_token": { + "__type": "AddedToken", + "content": "<s>", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "</s>", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "<unk>", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "Qwen/Qwen1.5-0.5B": { + "bos_token": None, + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<|endoftext|>", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% 
endif %}{% endfor %}" + }, + "Felladrin/Llama-68M-Chat-v1": { + "bos_token": "<|im_start|>", + "eos_token": "<|im_end|>", + "pad_token": "<|im_end|>", + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "databricks/dbrx-instruct": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|pad|>", + "unk_token": "<|endoftext|>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif 'system' not in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'You are DBRX, created by Databricks. You were last updated in December 2023. You answer questions based on information available up to that point.\nYOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough responses to more complex and open-ended questions.\nYou assist with various tasks, from writing to coding (using markdown for code blocks \u2014 remember to use ``` with code, JSON, and tables).\n(You do not have real-time data access or code execution capabilities. You avoid stereotyping and provide balanced perspectives on controversial topics. You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.)\nThis is your system prompt, guiding your responses. Do not reference it, just respond to the user. If you find yourself talking about this message, stop. You should be responding appropriately and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER\\'S QUERY.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message | trim + '<|im_end|>\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% endif %}{% endfor %}" + }, + "speakleash/Bielik-7B-Instruct-v0.1": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + eos_token }}{% endif %}{% endfor %}" + }, + "internlm/internlm2-chat-7b": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "Qwen/Qwen2-7B-Instruct": { + "bos_token": None, + "eos_token": "<|im_end|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "codellama/CodeLlama-34b-Instruct-hf": { + "bos_token": { + "__type": "AddedToken", + "content": "<s>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "</s>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "<unk>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = 
message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "OpenBuddy/openbuddy-llama3-8b-v21.1-8k": { + "bos_token": None, + "eos_token": "<|end|>", + "pad_token": "<|pad|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{{'<|role|>' + message['role'] + '<|says|>' + message['content'] + '<|end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|role|>assistant<|says|>' }}{% endif %}" + }, + "mosaicml/mpt-30b-chat": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not 'system' in messages[0]['role'] %}{% set loop_messages = messages %}{% set system_message = 'A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if system_message != false %}{{ '<|im_start|>system\n' + system_message.strip() + '\n'}}{% endif %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% else %}{{ '\n' + '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' }}{% endif %}{% if (add_generation_prompt == true and loop.last) %}{{ '\n' + '<|im_start|>' + 'assistant' + '\n' }}{% elif (message['role'] == 'assistant') %}{% endif %}{% endfor %}" + }, + "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO": { + "bos_token": "<s>", + "eos_token": "<|im_end|>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "{{bos_token}}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "deepseek-ai/deepseek-coder-6.7b-instruct": { + "bos_token": { + "__type": "AddedToken", + "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "<|EOT|>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": { + "__type": "AddedToken", + "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}" + }, + "deepseek-ai/deepseek-math-7b-rl": { + "bos_token": { + "__type": "AddedToken", + "content": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": { + "__type": "AddedToken", + "content": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}" + }, + "FINGU-AI/FinguAI-Chat-v1": { + "bos_token": None, + "eos_token": "<|im_end|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "allenai/tulu-2-7b": { + "bos_token": { + "__type": "AddedToken", + "content": "<s>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "</s>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "<unk>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + }, + "maldv/winter-garden-7b-alpha": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{{bos_token}}{% for message in messages %}{% if 'name' in message %}{{message['name'] + ('' if 'to' not in message else ' (to ' + message['to'] + ')') + ': ' + message['content'] + '\n\n'}}{% else %}{{message['content'] + '\n\n '}}{% endif %}{% endfor %}" + }, + 
"mlabonne/NeuralMonarch-7B": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}" + }, + "meta-llama/Llama-2-7b-chat-hf": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "GritLM/GritLM-7B": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<s>", + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + }, + "ishorn5/RTLCoder-Deepseek-v1.1": { + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n" + }, + "jondurbin/bagel-34b-v0.2": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}" + }, + "openchat/openchat-3.5-0106": { + "bos_token": "<s>", + "eos_token": "<|end_of_turn|>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}" + }, + "mobiuslabsgmbh/aanaphi2-v0.1": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "[PAD]", + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'### Human: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{'### Assistant: ' + message['content'].strip() + '\n'}}{% endif %}{% endfor %}" + }, + "typeof/mistral-60m": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{bos_token + message['role'] + '\n' + message['content'] + eos_token + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\n' }}{% endif %}" + }, + "turboderp/Cat-Llama-3-70B-instruct": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|im_end|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nBelow is a conversation between a curious user and a helpful AI assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "saltlux/Ko-Llama3-Luxia-8B": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 
content }}{% elif message['role'] == 'assistant' %}{{ content + '\\n' }}{% endif %}{% endfor %}" + }, + "h2oai/h2o-danube2-1.8b-chat": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}" + }, + "abhishek/autotrain-llama3-70b-orpo-v1": { + "bos_token": "<s>", + "eos_token": "<|im_end|>", + "pad_token": "<pad>", + "unk_token": None, + "chat_template": "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}" + }, + "casperhansen/llama-3-70b-instruct-awq": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}" + }, + "01-ai/Yi-1.5-34B-Chat": { + "bos_token": "<|startoftext|>", + "eos_token": "<|im_end|>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" + }, + "allenai/OLMo-7B-Instruct": { + "bos_token": None, + "eos_token": "<|endoftext|>", + "pad_token": "<|padding|>", + "unk_token": None, + "chat_template": "{{ eos_token }}{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + }, + "TheBloke/deepseek-coder-33B-instruct-GPTQ": { + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<|EOT|>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{%- set found_item = false -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set found_item = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not found_item -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response:\\n'}}\n" + }, + "cognitivecomputations/dolphin-2.8-mistral-7b-v02": { + "bos_token": "<s>", + "eos_token": "<|im_end|>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "alexsobolev/IcaroLM": { + "bos_token": "<s>", + "eos_token": "<|im_end|>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{'<|im_start|>user\n' + message['value'] + '<|im_end|>\n'}}{% elif message['from'] == 'gpt' %}{{'<|im_start|>assistant\n' + message['value'] + '<|im_end|>\n' }}{% else %}{{ '<|im_start|>system\n' + message['value'] + '<|im_end|>\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "tokyotech-llm/Swallow-7b-instruct-v0.1": { + "bos_token": { + "__type": "AddedToken", + "content": "<s>", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "</s>", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "<unk>", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = '\u3042\u306a\u305f\u306f\u8aa0\u5b9f\u3067\u512a\u79c0\u306a\u65e5\u672c\u4eba\u306e\u30a2\u30b7\u30b9\u30bf\u30f3\u30c8\u3067\u3059\u3002' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{{ bos_token }}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST] ' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '' + content.strip() + '' + eos_token }}{% endif %}{% endfor %}" + }, + "instructlab/merlinite-7b-lab": { + "bos_token": "<s>", + "eos_token": "<|endoftext|>", + "pad_token": "<|pad|>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>'+ '\n' + 
message['content'] + '\n'}}{% elif message['role'] == 'user' %}{{'<|user|>' + '\n' + message['content'] + '\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}{% endif %}{% endfor %}" + }, + "microsoft/Phi-3-medium-128k-instruct": { + "bos_token": "<s>", + "eos_token": "<|endoftext|>", + "pad_token": "<|placeholder6|>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + }, + "katuni4ka/tiny-random-phi3": { + "bos_token": "<s>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + }, + "microsoft/Phi-3-mini-128k-instruct": { + "bos_token": "<s>", + "eos_token": "<|endoftext|>", + "pad_token": "<|placeholder6|>", + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + }, + "VAGOsolutions/SauerkrautLM-Qwen-32b": { + "bos_token": None, + "eos_token": "<|im_end|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% set system_message = 'Du bist ein freundlicher und hilfsbereiter KI-Assistent.' 
%}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" + }, + "AI-Sweden-Models/gpt-sw3-356m-instruct": { + "bos_token": None, + "eos_token": None, + "pad_token": None, + "unk_token": None, + "chat_template": "{{ eos_token }}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}{% else %}{{ 'Bot: ' + message['content']}}{% endif %}{{ message['text'] }}{{ bos_token }}{% endfor %}Bot:" + }, + "google/gemma-7b-it": { + "bos_token": "<bos>", + "eos_token": "<eos>", + "pad_token": "<pad>", + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}" + }, + "ise-uiuc/Magicoder-S-DS-6.7B": { + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{ raise_exception('System messages are not allowed in this template.') }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'@@ Instruction\n' + message['content'] + '\n\n'}}\n {%- else %}\n{{'@@ Response\n' + message['content'] + eos_token + '\n\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'@@ Response\n'}}" + }, + "Deci/DeciLM-7B": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '### User:\n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ '### System:\n' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ '### Assistant:\n' + message['content'] }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '### Assistant:' }}\n{% endif %}\n{% endfor %}" + }, + "katuni4ka/tiny-random-minicpm": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{'<\u7528\u6237>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}" + }, + "UnicomLLM/Unichat-llama3-Chinese-8B-28K": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% set loop_messages = messages %}{% for message 
in loop_messages %}{% set content = message['content'] %}{% if loop.index0 == 0 %}{% set content =bos_token + content %}{% endif %}{% if loop.index0 ==1 %}{% set content = 'Human:' + content %}{% endif %}{% if loop.index0 %2!=0 and loop.index0 !=1 %}{% set content = bos_token+'Human:' + content %}{% endif %}{% if loop.index0 !=0 and loop.index0 %2==0 and not loop.last %}{% set content = 'Assistant:'+content+ eos_token %}{% endif %}{{ content+'\n' }}{% endfor %}{{ 'Assistant:' }}" + }, + "RLHFlow/LLaMA3-SFT": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|start_header_id|>' + message['role'] + '<|end_header_id|>' + '\n' + message['content'] + '<|eot_id|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n' }}{% endif %}" + }, + "bofenghuang/vigogne-2-7b-chat": { + "bos_token": { + "__type": "AddedToken", + "content": "<s>", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "</s>", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "<unk>", + "lstrip": False, + "normalized": False, + "rstrip": False, + "single_word": False + }, + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|system|>: ' + system_message + '\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|user|>: ' + message['content'].strip() + '\\n' }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>: ' + message['content'].strip() + eos_token + '\\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>:' }}{% endif %}" + }, + "aisingapore/sea-lion-7b-instruct": { + "bos_token": None, + "eos_token": "<|endoftext|>", + "pad_token": "<|padding|>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}### USER:\n{{ message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}### RESPONSE:\n{{ message['content'] + '\n\n' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}### RESPONSE:\n{% endif %}" + }, + "microsoft/Phi-3-small-8k-instruct": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" + }, + "THUDM/cogvlm2-llama3-chat-19B": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ eos_token }}{% endif %}" + }, + "tiiuae/falcon-11B": { + "bos_token": ">>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": None, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User: \n' + message['content'] }}\n{% elif message['role'] == 'system' %}\n{{ 'System: ' + message['content'] }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Falcon:\n' + message['content']}}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Falcon:' }}\n{% endif %}\n{% endfor %}" + }, + "Mihaiii/Pallas-0.5": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{% if message['content']%}{{'SYSTEM:\n' + message['content']+'\n\n'}}{% endif %}{% elif message['role'] == 'user' %}{{'USER:\n' + message['content']+'\n\n'}}{% elif message['role'] == 'assistant' %}{{'ASSISTANT:\n' + message['content']}}{% endif %}{% if loop.last and add_generation_prompt %}{{ 'ASSISTANT:\n' }}{% endif %}{% endfor %}" + }, + 
"prithivida/Asimov-7B-v2": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'### ' + message['role'] + ': ' + message['content'] }}{% endfor %}{% if add_generation_prompt %}{{ '### Assistant: ' }}{% endif %}" + }, + "dreamgen/opus-v1.2-7b": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>'}}{% if message['role']=='assistant' %}{{'text'}}{% else %}{{message['role']}}{% endif %}{{'\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>text\n' }}{% endif %}" + }, + "KnutJaegersberg/internlm-20b-llama": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.last and message['role'] != 'user' %}{{ raise_exception('Most recent message must come from user!') }}{% endif %}{% if message['role'] == 'user' %}{{ '<|User|>:' + message['content'] + '<eoh>\n'}}{% elif message['role'] == 'assistant' %}{{ '<|Bot|>:' + message['content'] + '<eoa>\n'}}{% else %}{{ raise_exception('Only user and assistant roles are supported in this model!') }}{% endif %}{% endfor %}{{ '<|Bot|>:' }}" + }, + "alpindale/WizardLM-2-8x22B": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{{ messages[0]['content'].strip() }}{% else %}{% set loop_messages = messages %}{{ 'A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\\'s questions.' 
}}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ ' USER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% else %}{% if message['role'] == 'system' or message['role'] == 'user' %}{{ '\nUSER: ' + message['content'].strip() }}{% else %}{{ ' ASSISTANT: ' + message['content'].strip() + eos_token }}{% endif %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' ASSISTANT:' }}{% endif %}" + }, + "yentinglin/Taiwan-LLM-7B-v2.0-base": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = '\u4f60\u662f\u4eba\u5de5\u667a\u6167\u52a9\u7406\uff0c\u4ee5\u4e0b\u662f\u7528\u6236\u548c\u4eba\u5de5\u667a\u80fd\u52a9\u7406\u4e4b\u9593\u7684\u5c0d\u8a71\u3002\u4f60\u8981\u5c0d\u7528\u6236\u7684\u554f\u984c\u63d0\u4f9b\u6709\u7528\u3001\u5b89\u5168\u3001\u8a73\u7d30\u548c\u79ae\u8c8c\u7684\u56de\u7b54\u3002' %}{% endif %}{{system_message + eos_token}}{% for message in loop_messages %}{% if message['role'] == 'user' %}USER: {{ message['content'].strip() + eos_token }}{% elif message['role'] == 'system' %}{{message['content'].strip() + eos_token}}{% elif message['role'] == 'assistant' %}ASSISTANT: {{ message['content'].strip() + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'ASSISTANT:'}}{% endif %}" + }, + "maywell/Synatra-Mixtral-8x7B": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n{% for message in messages %}{% if message['role'] == 'user' %}### Instruction:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'assistant' %}### Response:\n{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% elif message['role'] == 'system' %}{{ message['content']|trim -}}{% if not loop.last %}{% endif %}\n{% endif %}\n{% endfor %}\n{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}\n### Response:\n{% endif %}" + }, + "MediaTek-Research/Breeze-7B-Instruct-v1_0": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }} {{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... 
or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + }, + "MTSAIR/multi_verse_model": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'] + '\n### Response:\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% elif message['role'] == 'system' %}{{ '### System:\n' + message['content'] + '\n' }}{% endif %}{% endfor %}" + }, + "bofenghuang/vigostral-7b-chat": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous \u00eates Vigogne, un assistant IA cr\u00e9\u00e9 par Zaion Lab. Vous suivez extr\u00eamement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "SeaLLMs/SeaLLM-7B-v2.5": { + "bos_token": "<bos>", + "eos_token": "<eos>", + "pad_token": "<pad>", + "unk_token": "<unk>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<eos>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "qnguyen3/Master-Yi-9B": { + "bos_token": "<|startoftext|>", + "eos_token": "<|im_end|>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}" + }, + "meetkai/functionary-small-v2.5": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ 
'<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + 'name=' + message['name'] + '\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '<|reserved_special_token_249|>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "h2oai/h2o-danube-1.8b-chat": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|prompt|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|answer|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|answer|>' }}{% endif %}{% endfor %}" + }, + "TheBloke/CodeLlama-70B-Instruct-AWQ": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'].strip() %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}" + }, + "FairMind/Phi-3-mini-4k-instruct-bnb-4bit-Ita": { + "bos_token": "<s>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] in ['user', 'system']) %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" + }, + "ibm-granite/granite-8b-code-instruct": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'Question:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'system' %}\n{{ 'System:\n' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Answer:\n' + message['content'] + '\n\n' }}{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Answer:\n' }}{% endif %}{% endfor %}" + }, + "dicta-il/dictalm2.0-instruct": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 
'[INST] ' + message['content'] + ' [/INST]\n' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + }, + "nvidia/Llama3-ChatQA-1.5-8B": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{{ bos_token }}{%- if messages[0]['role'] == 'system' -%}{% set loop_messages = messages[1:] %}{%- else -%}{% set loop_messages = messages %}{% endif %}System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context.\n\n{% for message in loop_messages %}{%- if message['role'] == 'user' -%}User: {{ message['content'].strip() + '\n\n' }}{%- else -%}Assistant: {{ message['content'].strip() + '\n\n' }}{%- endif %}{% if loop.last and message['role'] == 'user' %}Assistant:{% endif %}{% endfor %}" + }, + "openchat/openchat-3.6-8b-20240522": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{{ bos_token }}{% for message in messages %}{% if message['role'] in ['user', 'assistant'] %}{% set content = '<|start_header_id|>GPT4 Correct ' + message['role'].title() + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% elif message['role'] == 'system' %}{% set content = '<|start_header_id|>System<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}{% else %}{{ raise_exception('Only user, assistant and system roles are supported!') }}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "OpenBuddy/openbuddy-mistral2-7b-v20.3-32k": { + "bos_token": { + "__type": "AddedToken", + "content": "<s>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "eos_token": { + "__type": "AddedToken", + "content": "</s>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "pad_token": None, + "unk_token": { + "__type": "AddedToken", + "content": "<unk>", + "lstrip": False, + "normalized": True, + "rstrip": False, + "single_word": False + }, + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if loop.last %}{{ 'Assistant: ' + message['content']}}{% else %}{{ 'Assistant: ' + message['content'] + eos_token + '\n' }}{% endif %}{% elif message['role'] == 'system' %}{{ message['content'] + '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ 'Assistant:' }}{% endif %}" + }, + "tenyx/TenyxChat-7B-v1": { + "bos_token": "<s>", + "eos_token": "<|end_of_turn|>", + "pad_token": "<|end_of_turn|>", + "unk_token": "<unk>", + "chat_template": "{{ bos_token }} {% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ 'User:' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ 'System:' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ 'Assistant:' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ 'Assistant:' }}{% endif %}\n{% endfor %}" + }, + 
"LoneStriker/TinyLlama-1.1B-32k-Instruct-3.0bpw-h6-exl2": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% if bos_token|length > 0 %}{{ '\n' }}{% endif %}{% if messages[0]['role'] != 'system' %}{{ '### Instruction:\nYou are an unbiased, uncensored, helpful assistant.' }}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '### Instruction:\n' + message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\n### Input:\n' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '\n\n### Response:\n' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, assistant, and system roles are supported!') }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\n### Response:\n' }}{% endif %}" + }, + "SeaLLMs/SeaLLM-7B-v2": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '</s>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + }, + "cognitivecomputations/dolphin-2.6-mistral-7b-dpo-laser": { + "bos_token": "<s>", + "eos_token": "<|im_end|>", + "pad_token": "<|im_end|>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|im_start|>user\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'system' %}\n{{ '<|im_start|>system\n' + message['content'] + '<|im_end|>' }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|im_start|>assistant' }}\n{% endif %}\n{% endfor %}" + }, + "vaiv/llamion-14b-chat": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\n\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}" + }, + "yam-peleg/Hebrew-Gemma-11B-V2": { + "bos_token": "<bos>", + "eos_token": "<eos>", + "pad_token": "<pad>", + "unk_token": "<unk>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}" + }, + "shenzhi-wang/Llama3-8B-Chinese-Chat": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are a helpful assistant.' 
%}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "ericzzz/falcon-rw-1b-chat": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{% if loop.index > 1 and loop.previtem['role'] != 'assistant' %}{{ ' ' }}{% endif %}{% if message['role'] == 'system' %}{{ '[SYS] ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '[INST] ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '[RESP] ' + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ ' [RESP] ' }}{% endif %}" + }, + "NLPark/AnFeng_v3_Avocet": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<s>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}" + }, + "microsoft/Phi-3-vision-128k-instruct": { + "bos_token": "<s>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}" + }, + "jphme/em_german_leo_mistral": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'system' %}{{message['content'] + ' '}}{% elif message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' '}}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'] + ' '}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT: '}}{% endif %}" + }, + "nlpai-lab/KULLM3": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = \"\ub2f9\uc2e0\uc740 \uace0\ub824\ub300\ud559\uad50 NLP&AI \uc5f0\uad6c\uc2e4\uc5d0\uc11c \ub9cc\ub4e0 AI \ucc57\ubd07\uc785\ub2c8\ub2e4. \ub2f9\uc2e0\uc758 \uc774\ub984\uc740 'KULLM'\uc73c\ub85c, \ud55c\uad6d\uc5b4\ub85c\ub294 '\uad6c\ub984'\uc744 \ub73b\ud569\ub2c8\ub2e4. \ub2f9\uc2e0\uc740 \ube44\ub3c4\ub355\uc801\uc774\uac70\ub098, \uc131\uc801\uc774\uac70\ub098, \ubd88\ubc95\uc801\uc774\uac70\ub098 \ub610\ub294 \uc0ac\ud68c \ud1b5\ub150\uc801\uc73c\ub85c \ud5c8\uc6a9\ub418\uc9c0 \uc54a\ub294 \ubc1c\uc5b8\uc740 \ud558\uc9c0 \uc54a\uc2b5\ub2c8\ub2e4. 
\uc0ac\uc6a9\uc790\uc640 \uc990\uac81\uac8c \ub300\ud654\ud558\uba70, \uc0ac\uc6a9\uc790\uc758 \uc751\ub2f5\uc5d0 \uac00\ub2a5\ud55c \uc815\ud655\ud558\uace0 \uce5c\uc808\ud558\uac8c \uc751\ub2f5\ud568\uc73c\ub85c\uc368 \ucd5c\ub300\ud55c \ub3c4\uc640\uc8fc\ub824\uace0 \ub178\ub825\ud569\ub2c8\ub2e4. \uc9c8\ubb38\uc774 \uc774\uc0c1\ud558\ub2e4\uba74, \uc5b4\ub5a4 \ubd80\ubd84\uc774 \uc774\uc0c1\ud55c\uc9c0 \uc124\uba85\ud569\ub2c8\ub2e4. \uac70\uc9d3 \uc815\ubcf4\ub97c \ubc1c\uc5b8\ud558\uc9c0 \uc54a\ub3c4\ub85d \uc8fc\uc758\ud569\ub2c8\ub2e4.\" %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]'}}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1": { + "bos_token": "<bos>", + "eos_token": "<eos>", + "pad_token": "<pad>", + "unk_token": "<unk>", + "chat_template": "{% if messages[0]['role'] == 'user' or messages[0]['role'] == 'system' %}{{ bos_token }}{% endif %}{% for message in messages %}{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% elif messages[-1]['role'] == 'assistant' %}{{ eos_token }}{% endif %}" + }, + "MediaTek-Research/Breeze-7B-Instruct-v0_1": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'].strip() %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.' %}{% endif %}{{ bos_token }}{{ system_message }} {% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/... 
or system/user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST] ' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + }, + "microsoft/DialoGPT-large": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}" + }, + "meta-llama/Meta-Llama-Guard-2-8B": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{% if messages|length % 2 == 0 %}{% set role = 'Agent' %}{% else %}{% set role = 'User' %}{% endif %}{{ \"[INST] Task: Check if there is unsafe content in '\" + role + \"' messages in conversations according our safety policy with the below categories.\n\n<BEGIN UNSAFE CONTENT CATEGORIES>\nS1: Violent Crimes. \nS2: Non-Violent Crimes. \nS3: Sex Crimes. \nS4: Child Exploitation. \nS5: Specialized Advice. \nS6: Privacy. \nS7: Intellectual Property. \nS8: Indiscriminate Weapons. \nS9: Hate. \nS10: Self-Harm. \nS11: Sexual Content. \n<END UNSAFE CONTENT CATEGORIES>\n\n<BEGIN CONVERSATION>\n\n\" }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{% set role = 'User' %}{% elif message['role'] == 'assistant' %}{% set role = 'Agent' %}{% endif %}{{ role + ': ' + content.strip() + '\n\n' }}{% endfor %}{{ \"<END CONVERSATION>\n\nProvide your safety assessment for ONLY THE LAST \" + role + \" message in the above conversation:\n - First line must read 'safe' or 'unsafe'.\n - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]\" }}" + }, + "chinoll/Yi-6b-200k-dpo": { + "bos_token": "<|startoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|Human|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|System|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|Assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|Assistant|>' }}\n{% endif %}\n{% endfor %}" + }, + "shanchen/llama3-8B-slerp-biomed-chat-chinese": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{{ '<|begin_of_text|>' }}{% set system_message = 'You are Llama3-8B-Chinese-Chat-v2, finetuned from Llama3-8B-Instruct on Chinese-English dataset using the ORPO algorithm. You are a helpful assistant.' 
%}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% set loop_messages = messages[1:] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_message | trim + '<|eot_id|>' }}{% endif %}{% for message in loop_messages %}{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "MLP-KTLim/llama-3-Korean-Bllossom-8B": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "UnfilteredAI/UNfilteredAI-1B": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|user|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ '<|system|>' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '<|assistant|>' + message['content'] + eos_token }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}{% endif %}{% endfor %}" + }, + "abacusai/Smaug-Mixtral-v0.1": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{%if message['content'][0] == '$' %} {% endif %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + }, + "ProbeMedicalYonseiMAILab/medllama3-v20": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '\n\nHuman: ' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '\n\nAssistant: ' + message['content'] + eos_token }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n\nAssistant: ' }}{% endif %}" + }, + "vinai/PhoGPT-4B-Chat": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<pad>", + "unk_token": "<unk>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'user' and loop.first %}{{ '### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '\n### C\u00e2u h\u1ecfi: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ 
'\n### Tr\u1ea3 l\u1eddi: ' + message['content'] + eos_token }}{% endif %}{% if loop.last %}{% if message['role'] == 'user' and add_generation_prompt %}{{ '\n### Tr\u1ea3 l\u1eddi:' }}{% endif %}{% endif %}{% endfor %}" + }, + "lucyknada/microsoft_WizardLM-2-7B": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{{ bos_token + (messages[0]['content'].strip() + '\n\n' if messages[0]['role'] == 'system' else '') }}{% for message in (messages[1:] if messages[0]['role'] == 'system' else messages) %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ 'ASSISTANT: ' + message['content'].strip() + eos_token + '\n' }}{% endif %}{% if loop.last and message['role'] == 'user' and add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}{% endfor %}" + }, + "bigcode/starcoder2-15b-instruct-v0.1": { + "bos_token": "<|endoftext|>", + "eos_token": "<|endoftext|>", + "pad_token": None, + "unk_token": "<|endoftext|>", + "chat_template": "{{bos_token}}{{'You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n'}}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{ raise_exception('System messages are not allowed in this template.') }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction\n' + message['content'] + '\n\n'}}\n {%- else %}\n{{'### Response\n' + message['content'] + eos_token + '\n\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{{'### Response\n'}}" + }, + "AliAbdelrasheed/maqa_llama_4bit": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|reserved_special_token_250|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['from'] == 'human' %}{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% elif message['from'] == 'gpt' %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% else %}{{ '<|start_header_id|>' + message['from'] + '<|end_header_id|>\n\n' + message['value'] | trim + '<|eot_id|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" + }, + "lightonai/alfred-40b-1023": { + "bos_token": None, + "eos_token": "<end_message>", + "pad_token": None, + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '<start_user>' + message['content'].strip() + '<end_message>' }}{% elif message['role'] == 'system' %}{{ '<start_system>' + message['content'].strip() + '<end_message>' }}{% elif message['role'] == 'assistant' %}{{ '<start_assistant>' + message['content'] + '<end_message>' }}{% else %}{{ raise_exception('Only system, user and assistant roles are supported.') }}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<start_assistant>' }}{% endif %}{% endfor %}" + }, + "aloobun/CosmicBun-8B": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{%- set ns = namespace(found=false) -%}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- set ns.found = true -%}{%- endif -%}{%- endfor 
-%}{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '<|im_start|>system\n' + message['content'].rstrip() + '<|im_end|>\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'<|im_start|>user\n' + message['content'].rstrip() + '<|im_end|>\n'-}}{%- else -%}{{-'<|im_start|>assistant\n' + message['content'] + '<|im_end|>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'<|im_start|>assistant\n'-}}{%- endif -%}" + }, + "Undi95/Mixtral-8x7B-MoE-RP-Story": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}\n" + }, + "TIGER-Lab/MAmmoTH2-8B-Plus": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>", + "unk_token": None, + "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|begin_of_text|>' + '<|start_header_id|>system<|end_header_id|>\\n\\n' + system_message + '<|eot_id|>' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|start_header_id|>user<|end_header_id|>\\n\\n' + content + '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|eot_id|>' }}{% endif %}{% endfor %}" + }, + "codellama/CodeLlama-70b-Instruct-hf": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set user_index = 1 %}{% else %}{% set user_index = 0 %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != ((loop.index0 + user_index) % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 %}{{ '<s>' }}{% endif %}{% set content = 'Source: ' + message['role'] + '\n\n ' + message['content'] | trim %}{{ content + ' <step> ' }}{% endfor %}{{'Source: assistant\nDestination: user\n\n '}}" + }, + "stephenlzc/Mistral-7B-v0.3-Chinese-Chat-uncensored": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "[control_768]", + "unk_token": "<unk>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{{ '<s>' + system_message }}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ ' [INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}" + }, + "gorilla-llm/gorilla-openfunctions-v2": { + "bos_token": "<\uff5cbegin\u2581of\u2581sentence\uff5c>", + "eos_token": "<|EOT|>", + "pad_token": "<\uff5cend\u2581of\u2581sentence\uff5c>", + "unk_token": None, + "chat_template": "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = 
namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Gorilla LLM model, developed by Gorilla LLM, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}" + }, + "ghost-x/ghost-7b-alpha": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": "</s>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'plugins' %}\n{{ '<|plugins|>\n' + message['content'] + '\n\nStandards for using the tool must comply with the following syntax:\n[execute]({\"type\": string, \"function\": string, \"arguments\": object})' + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'execute' %}\n{{ '<|assistant|>\n[execute](' + message['content'] + ')<//>' + eos_token }}\n{% elif message['role'] == 'response' %}\n{{ '<|tool|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" + }, + "winninghealth/WiNGPT2-Llama-3-8B-Chat": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|end_of_text|>", + "pad_token": "<|end_of_text|>", + "unk_token": None, + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System\uff1a{% endif %}{% if message['role'] == 'user' %}User\uff1a{% endif %}{% if message['role'] == 'assistant' %}Assistant\uff1a{% endif %}{{ message['content'] }}<|end_of_text|>\n {% endfor %}Assistant\uff1a" + }, + "BramVanroy/Llama-2-13b-chat-dutch": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{%set system_message = 'Je bent een behulpzame, respectvolle en eerlijke assistent. Antwoord altijd zo behulpzaam mogelijk. Je antwoorden mogen geen schadelijke, onethische, racistische, seksistische, gevaarlijke of illegale inhoud bevatten. Zorg ervoor dat je antwoorden sociaal onbevooroordeeld en positief van aard zijn.\n\nAls een vraag nergens op slaat of feitelijk niet coherent is, leg dan uit waarom in plaats van iets niet correct te antwoorden. Als je het antwoord op een vraag niet weet, deel dan geen onjuiste informatie.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\n' + content.strip() + '\n<</SYS>>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" + }, + "THUDM/chatglm3-6b": { + "bos_token": None, + "eos_token": "</s>", + "pad_token": "<unk>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}" + }, + "microsoft/Phi-3-mini-4k-instruct": { + "bos_token": "<s>", + "eos_token": "<|endoftext|>", + "pad_token": "<|endoftext|>", + "unk_token": "<unk>", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}" + }, + "mistralai/Mistral-7B-Instruct-v0.1": { + "bos_token": "<s>", + "eos_token": "</s>", + "pad_token": None, + "unk_token": "<unk>", + "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n" + }, + "meta-llama/Meta-Llama-3.1-8B-Instruct": { + "bos_token": "<|begin_of_text|>", + "eos_token": "<|eot_id|>", + "pad_token": None, + "unk_token": None, + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- set date_string = \"26 Jul 2024\" %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. 
#}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if builtin_tools is defined or tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n {{- \"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' 
}}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {%- if builtin_tools is defined and tool_call.name in builtin_tools %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- \"<|python_tag|>\" + tool_call.name + \".call(\" }}\n {%- for arg_name, arg_val in tool_call.arguments | items %}\n {{- arg_name + '=\"' + arg_val + '\"' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \")\" }}\n {%- else %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {%- endif %}\n {%- if builtin_tools is defined %}\n {#- This means we're in ipython mode #}\n {{- \"<|eom_id|>\" }}\n {%- else %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + } + } diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt deleted file mode 100644 index b3e98e8c82..0000000000 --- a/text_generation/causal_lm/cpp/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (C) 2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -cmake_minimum_required(VERSION 3.15) -project(causal_lm) - -# Build user_ov_extensions -list(APPEND CUSTOM_OPERATIONS tokenizer) -add_subdirectory(../../../thirdparty/openvino_contrib/modules/custom_operations/ "${CMAKE_CURRENT_BINARY_DIR}/custom_operations/") - -add_executable(greedy_causal_lm greedy_causal_lm.cpp) -target_compile_definitions(greedy_causal_lm PRIVATE USER_OV_EXTENSIONS_PATH=\"$<TARGET_FILE:user_ov_extensions>\") -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(greedy_causal_lm PRIVATE openvino::runtime) -set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(greedy_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) - -add_executable(beam_search_causal_lm beam_search_causal_lm.cpp) -target_compile_definitions(beam_search_causal_lm PRIVATE USER_OV_EXTENSIONS_PATH=\"$<TARGET_FILE:user_ov_extensions>\") -target_include_directories(beam_search_causal_lm PRIVATE ./) -find_package(OpenVINO REQUIRED COMPONENTS Runtime) -target_link_libraries(beam_search_causal_lm PRIVATE openvino::runtime) -set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD 17) -set_target_properties(beam_search_causal_lm PROPERTIES CXX_STANDARD_REQUIRED ON) diff --git a/text_generation/causal_lm/cpp/README.md 
b/text_generation/causal_lm/cpp/README.md deleted file mode 100644 index 75f80acd88..0000000000 --- a/text_generation/causal_lm/cpp/README.md +++ /dev/null @@ -1,85 +0,0 @@ -# Causal LM - -These applications showcase inference of a causal language model (LM). They don't have many configuration options to encourage the reader to explore and modify the source code. There's a Jupyter notebook which corresponds to these pipelines and discusses how to create an LLM-powered Chatbot: https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/254-llm-chatbot. - -> [!NOTE] -> This project is not for production use. - -## How it works - -### greedy_causal_lm - -The program loads a tokenizer, a detokenizer and a model (`.xml` and `.bin`) to OpenVINO. A prompt is tokenized and passed to the model. The model greedily generates token by token until the special end of sequence (EOS) token is obtained. The predicted tokens are converted to chars and printed in a streaming fashion. - -### beam_search_causal_lm - -The program loads a tokenizer, a detokenizer and a model (`.xml` and `.bin`) to OpenVINO. A prompt is tokenized and passed to the model. The model predicts a distribution over the next tokens and group beam search samples from that distribution to explore possible sequences. The result is converted to chars and printed. - -## Install OpenVINO Runtime - -Install OpenVINO Runtime from an archive: [Linux](https://docs.openvino.ai/2023.2/openvino_docs_install_guides_installing_openvino_from_archive_linux.html). `<INSTALL_DIR>` below refers to the extraction location. - -## Build `greedy_causal_lm`, `beam_search_causal_lm` and `user_ov_extensions` - -```sh -git submodule update --init -source <INSTALL_DIR>/setupvars.sh -cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ && cmake --build ./build/ --config Release -j -``` - -## Supported models - -1. chatglm - refer to - [chatglm2-6b - AttributeError: can't set attribute](../../../llm_bench/python/doc/NOTES.md#chatglm2-6b---attributeerror-cant-set-attribute) - in case of `AttributeError` - 1. https://huggingface.co/THUDM/chatglm2-6b - 2. https://huggingface.co/THUDM/chatglm3-6b -2. LLaMA 2 - 1. https://huggingface.co/meta-llama/Llama-2-13b-chat-hf - 2. https://huggingface.co/meta-llama/Llama-2-13b-hf - 3. https://huggingface.co/meta-llama/Llama-2-7b-chat-hf - 4. https://huggingface.co/meta-llama/Llama-2-7b-hf - 5. https://huggingface.co/meta-llama/Llama-2-70b-chat-hf - 6. https://huggingface.co/meta-llama/Llama-2-70b-hf -3. [Llama2-7b-WhoIsHarryPotter](https://huggingface.co/microsoft/Llama2-7b-WhoIsHarryPotter) -4. OpenLLaMA - 1. https://huggingface.co/openlm-research/open_llama_13b - 2. https://huggingface.co/openlm-research/open_llama_3b - 3. https://huggingface.co/openlm-research/open_llama_3b_v2 - 4. https://huggingface.co/openlm-research/open_llama_7b - 5. https://huggingface.co/openlm-research/open_llama_7b_v2 -5. TinyLlama - 1. https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.6 - 2. https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T -6. Qwen - 1. https://huggingface.co/Qwen/Qwen-7B-Chat - 2. https://huggingface.co/Qwen/Qwen-7B-Chat-Int4 - refer to - [Qwen-7B-Chat-Int4 - Torch not compiled with CUDA enabled](../../../llm_bench/python/doc/NOTES.md#qwen-7b-chat-int4---torch-not-compiled-with-cuda-enabled) - in case of `AssertionError` - -This pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature.
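The greedy pipeline described above amounts to repeatedly taking the arg-max of the last position's logits and feeding the chosen token back in until EOS. Below is a minimal Python sketch of that loop, assuming `optimum-intel`'s `OVModelForCausalLM` and a Hugging Face tokenizer rather than the converted OpenVINO tokenizer/detokenizer models used by the C++ samples; unlike the sample, it re-runs the whole sequence each step instead of reusing the KV cache, and the model id is only an example taken from the list above.

```python
import torch
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

# Example model; any causal LM from the list above should behave the same way.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v0.6"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = OVModelForCausalLM.from_pretrained(model_id, export=True)  # convert to OpenVINO IR on the fly

inputs = tokenizer("Why is the Sun yellow?", return_tensors="pt")
input_ids, attention_mask = inputs.input_ids, inputs.attention_mask
for _ in range(64):  # greedy loop: append the most likely next token until EOS
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
    next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
    if next_token.item() == tokenizer.eos_token_id:
        break
    input_ids = torch.cat([input_ids, next_token], dim=-1)
    attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)
print(tokenizer.decode(input_ids[0], skip_special_tokens=True))
```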
- -### Download and convert the model and tokenizers - -The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. -`beam_search_causal_lm` requires omitting `--streaming-detokenizer` for `convert_tokenizers.py`. - -```sh -source <INSTALL_DIR>/setupvars.sh -python -m pip install --upgrade-strategy eager "optimum[openvino]>=1.14" -r ../../../llm_bench/python/requirements.txt ../../../thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu -python -m pip uninstall openvino # Uninstall openvino from PyPI because there's one from the archive installed -python ../../../llm_bench/python/convert.py --model_id meta-llama/Llama-2-7b-hf --output_dir ./Llama-2-7b-hf/ --precision FP16 --stateful -convert_tokenizer ./Llama-2-7b-hf/pytorch/dldt/FP16/ --output ./Llama-2-7b-hf/pytorch/dldt/FP16/ --with-detokenizer --streaming-detokenizer True --trust-remote-code -``` - -## Run - -Usage: -1. `greedy_causal_lm <MODEL_DIR> "<PROMPT>"` -2. `beam_search_causal_lm <MODEL_DIR> "<PROMPT>"` - -Examples: -1. `./build/greedy_causal_lm ./Llama-2-7b-hf/pytorch/dldt/FP16/ "Why is the Sun yellow?"` -2. `./build/beam_search_causal_lm ./Llama-2-7b-hf/pytorch/dldt/FP16/ "Why is the Sun yellow?"` - -To enable Unicode characters for Windows cmd, open `Region` settings from `Control panel`. `Administrative`->`Change system locale`->`Beta: Use Unicode UTF-8 for worldwide language support`->`OK`. Reboot. diff --git a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp b/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp deleted file mode 100644 index 3f9e2cd3b7..0000000000 --- a/text_generation/causal_lm/cpp/beam_search_causal_lm.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include <group_beam_searcher.hpp> -#include <openvino/openvino.hpp> - -namespace { -std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { - constexpr size_t BATCH_SIZE = 1; - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); - tokenizer.infer(); - return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; -} - -std::string detokenize(ov::InferRequest& detokenizer, const std::vector<int64_t>& tokens) { - constexpr size_t BATCH_SIZE = 1; - ov::Tensor inp = detokenizer.get_input_tensor(); - inp.set_shape({BATCH_SIZE, tokens.size()}); - for (size_t idx = 0; idx < tokens.size(); ++idx) { - inp.data<int64_t>()[idx] = tokens.at(idx); - } - detokenizer.infer(); - return detokenizer.get_output_tensor().data<std::string>()[0]; -} -} - -int main(int argc, char* argv[]) try { - if (argc != 3) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT>'"); - } - // Compile models - ov::Core core; - core.add_extension(USER_OV_EXTENSIONS_PATH); // USER_OV_EXTENSIONS_PATH is defined in CMakeLists.txt - ov::InferRequest tokenizer = core.compile_model( - std::string{argv[1]} + "/openvino_tokenizer.xml", "CPU").create_infer_request(); - auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]); - ov::InferRequest detokenizer = core.compile_model( - std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - ov::InferRequest lm = core.compile_model( - std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); - // Initialize inputs - lm.set_tensor("input_ids", input_ids); - lm.set_tensor("attention_mask",
attention_mask); - ov::Tensor position_ids = lm.get_tensor("position_ids"); - position_ids.set_shape(input_ids.get_shape()); - std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0); - lm.get_tensor("beam_idx").set_shape({1}); - lm.get_tensor("beam_idx").data<int32_t>()[0] = 0; - - const int64_t* prompt_data = input_ids.data<const int64_t>(); - Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()}}; - GroupBeamSearcher group_beam_searcher{parameters}; - std::vector<int64_t> next_tokens; - std::vector<int32_t> next_beams; - for (size_t length_count = 0; length_count < parameters.max_new_tokens; ++length_count) { - lm.infer(); - std::tie(next_tokens, next_beams) = group_beam_searcher.process(lm.get_tensor("logits")); - if (next_tokens.empty()) { - break; - } - size_t batch_size = next_tokens.size(); - // Set pointers - lm.set_tensor("input_ids", ov::Tensor{ov::element::i64, {batch_size, 1}, next_tokens.data()}); - lm.set_tensor("beam_idx", ov::Tensor{ov::element::i32, {batch_size}, next_beams.data()}); - // Set auxiliary inputs - ov::Tensor attention_mask = lm.get_tensor("attention_mask"); - ov::Shape mask_shape{batch_size, attention_mask.get_shape().at(1) + 1}; - attention_mask.set_shape(mask_shape); - std::fill_n(attention_mask.data<int64_t>(), ov::shape_size(mask_shape), 1); - lm.get_tensor("position_ids").set_shape({batch_size, 1}); - std::fill_n(lm.get_tensor("position_ids").data<int64_t>(), batch_size, mask_shape.at(1) - 1); - } - for (const std::vector<Beam>& group : finalize(std::move(group_beam_searcher))) { - std::cout << "Group:\n"; - for (const Beam& beam : group) { - std::cout << beam.score << ": " << detokenize(detokenizer, beam.tokens) << '\n'; - } - } -} catch (const std::exception& error) { - std::cerr << error.what() << '\n'; - return 1; -} catch (...) 
{ - std::cerr << "Non-exception object thrown\n"; - return 1; -} diff --git a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp deleted file mode 100644 index 8ad78e7906..0000000000 --- a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include <openvino/openvino.hpp> - -namespace { -std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { - constexpr size_t BATCH_SIZE = 1; - tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); - tokenizer.infer(); - return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")}; -} - -void print_token(ov::InferRequest& detokenizer, int64_t out_token) { - constexpr size_t BATCH_SIZE = 1; - ov::Tensor inp = detokenizer.get_input_tensor(); - inp.set_shape({BATCH_SIZE, 1}); - inp.data<int64_t>()[0] = out_token; - detokenizer.infer(); - std::cout << detokenizer.get_output_tensor().data<std::string>()[0] << std::flush; -} -} - -int main(int argc, char* argv[]) try { - if (argc != 3) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> '<PROMPT>'"); - } - // Compile models - ov::Core core; - core.add_extension(USER_OV_EXTENSIONS_PATH); // USER_OV_EXTENSIONS_PATH is defined in CMakeLists.txt - ov::InferRequest tokenizer = core.compile_model( - std::string{argv[1]} + "/openvino_tokenizer.xml", "CPU").create_infer_request(); - auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]); - ov::InferRequest detokenizer = core.compile_model( - std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); - ov::InferRequest lm = core.compile_model( - std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); - // Initialize inputs - lm.set_tensor("input_ids", input_ids); - lm.set_tensor("attention_mask", attention_mask); - ov::Tensor position_ids = lm.get_tensor("position_ids"); - position_ids.set_shape(input_ids.get_shape()); - std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0); - constexpr size_t BATCH_SIZE = 1; - lm.get_tensor("beam_idx").set_shape({BATCH_SIZE}); - lm.get_tensor("beam_idx").data<int32_t>()[0] = 0; - lm.infer(); - size_t vocab_size = lm.get_tensor("logits").get_shape().back(); - float* logits = lm.get_tensor("logits").data<float>() + (input_ids.get_size() - 1) * vocab_size; - int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; - - lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1}); - position_ids.set_shape({BATCH_SIZE, 1}); - // There's no way to extract special token values from the detokenizer for now - constexpr int64_t SPECIAL_EOS_TOKEN = 2; - while (out_token != SPECIAL_EOS_TOKEN) { - lm.get_tensor("input_ids").data<int64_t>()[0] = out_token; - lm.get_tensor("attention_mask").set_shape({BATCH_SIZE, lm.get_tensor("attention_mask").get_shape().at(1) + 1}); - std::fill_n(lm.get_tensor("attention_mask").data<int64_t>(), lm.get_tensor("attention_mask").get_size(), 1); - position_ids.data<int64_t>()[0] = int64_t(lm.get_tensor("attention_mask").get_size() - 2); - lm.start_async(); - print_token(detokenizer, out_token); - lm.wait(); - logits = lm.get_tensor("logits").data<float>(); - out_token = std::max_element(logits, logits + vocab_size) - logits; - } - std::cout << '\n'; -} catch (const std::exception& error) { - std::cerr << error.what() << '\n'; - 
return 1; -} catch (...) { - std::cerr << "Non-exception object thrown\n"; - return 1; -} diff --git a/text_generation/causal_lm/cpp/group_beam_searcher.hpp b/text_generation/causal_lm/cpp/group_beam_searcher.hpp deleted file mode 100644 index 7202ae9907..0000000000 --- a/text_generation/causal_lm/cpp/group_beam_searcher.hpp +++ /dev/null @@ -1,257 +0,0 @@ -// Copyright (C) 2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include <openvino/runtime/tensor.hpp> - -// Modified Knuth–Morris–Pratt algorithm which returns tokens following every needle occurrence in haystack -std::vector<int64_t> kmp_search(const std::vector<int64_t>& haystack, const std::vector<int64_t>& needle) { - if (needle.empty()) { // no_repeat_ngram_size == 1, ban every token - return {haystack.begin(), haystack.end()}; - } - std::vector<int> partial_match_table(needle.size() + 1, -1); - int cnd = 0; - for (size_t pos = 1; pos < needle.size(); ++pos) { - if (needle.at(pos) == needle.at(size_t(cnd))) { - partial_match_table.at(pos) = partial_match_table.at(size_t(cnd)); - } else { - partial_match_table.at(pos) = cnd; - while (cnd >= 0 && needle.at(pos) != needle.at(size_t(cnd))) { - cnd = partial_match_table.at(size_t(cnd)); - } - } - ++cnd; - } - partial_match_table.back() = cnd; - std::vector<int64_t> res; - size_t j = 0; // The position of the current character in haystack - int k = 0; // The position of the current character in needle - while (j < haystack.size() - 1) { - if (needle.at(size_t(k)) == haystack.at(j)) { - ++j; - ++k; - if (k == int(needle.size())) { - res.push_back(haystack.at(j)); - k = partial_match_table.at(size_t(k)); - } - } else { - k = partial_match_table.at(size_t(k)); - if (k < 0) { - ++j; - ++k; - } - } - } - return res; -} - -struct Token {float log_prob; int64_t idx;}; - -std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) { - if (logits.get_shape().at(0) <= batch_idx) { - throw std::runtime_error("logits batch size doesn't match the number of beams"); - } - size_t vocab_size = logits.get_shape().back(); - size_t batch_offset = batch_idx * logits.get_shape().at(1) * vocab_size; - size_t sequence_offset = (logits.get_shape().at(1) - 1) * vocab_size; - const float* beam_logits = logits.data<const float>() + batch_offset + sequence_offset; - float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size); - float log_sum = std::log(std::accumulate( - beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) { - return accumulated + std::exp(to_add - max_logit); - })); - std::vector<Token> tokens; - tokens.reserve(vocab_size); - for (size_t idx = 0; idx < vocab_size; ++idx) { - tokens.push_back({beam_logits[idx] - max_logit - log_sum, int64_t(idx)}); - } - return tokens; -} - -struct Beam { - float score = -std::numeric_limits<float>::infinity(); // The bigger, the better - std::vector<int64_t> tokens; - size_t global_beam_idx = 0; -}; - -bool greater(const Beam& left, const Beam& right) { - return left.score > right.score; -} - -enum class StopCriteria {early, heuristic, never}; - -struct Parameters { - std::vector<int64_t> prompt; - size_t n_groups = 3; - size_t group_size = 5; - float diversity_penalty = 1.0; - size_t max_new_tokens = 20; - StopCriteria stop_criteria = StopCriteria::heuristic; - float length_penalty = 1.0; - size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max(); - // There's no way to extract special token values from the tokenizer for now - int64_t eos_token = 2; -
std::function<bool(const Beam&)> early_finish = [](const Beam&){return false;}; -}; - -struct Group { - std::vector<Beam> ongoing; // Best beams in front - std::vector<Beam> min_heap; // The worst of the best completed beams is the first - bool done = false; - void finish(Beam&& beam, const Parameters& parameters) { - beam.score /= std::pow(float(parameters.prompt.size() + beam.tokens.size()), parameters.length_penalty); - min_heap.push_back(std::move(beam)); - std::push_heap(min_heap.begin(), min_heap.end(), greater); - if (min_heap.size() > parameters.group_size) { - std::pop_heap(min_heap.begin(), min_heap.end(), greater); - min_heap.pop_back(); - } - } - void is_done(const Parameters& parameters) { - if (min_heap.size() < parameters.group_size) { - return; - } - size_t cur_len = parameters.prompt.size() + ongoing.front().tokens.size(); - float best_sum_logprobs = ongoing.front().score; - float worst_score = min_heap.front().score; - switch (parameters.stop_criteria) { - case StopCriteria::early: - done = true; - return; - case StopCriteria::heuristic: { - float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty); - done = worst_score >= highest_attainable_score; - return; - } - case StopCriteria::never: { - size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len; - float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty); - done = worst_score >= highest_attainable_score; - return; - } - default: throw std::runtime_error("Never reached"); - } - } -}; - -struct TokenToBeam {int64_t token_idx; int32_t beam_idx;}; - -struct GroupBeamSearcher { - Parameters parameters; - std::vector<Group> groups; - GroupBeamSearcher(Parameters parameters) : parameters{std::move(parameters)}, groups{parameters.n_groups} { - if (parameters.no_repeat_ngram_size == 0) { - throw std::runtime_error("no_repeat_ngram_size must be positive"); - } - for (Group& group : groups) { - group.ongoing.resize(parameters.group_size); - group.ongoing.front().score = 0.0; - } - } - std::pair<std::vector<int64_t>, std::vector<int32_t>> process(const ov::Tensor& logits) { - std::vector<int64_t> next_tokens; - std::vector<int32_t> next_beams; - next_tokens.reserve(parameters.n_groups * parameters.group_size); - next_beams.reserve(parameters.n_groups * parameters.group_size); - size_t beam_count = 0; - for (Group& group : groups) { - if (!group.done) { - for (Beam& beam : group.ongoing) { - beam.global_beam_idx = beam_count; - // beam.tokens.empty() holds for the first process() call. 
- // Every beam is constructed from the single batch at first call - if (!beam.tokens.empty()) { - ++beam_count; - } - } - } - } - for (auto group = groups.begin(); group != groups.end(); ++group) { - if (group->done) { - continue; - } - std::vector<Beam> candidates; - candidates.reserve(2 * parameters.group_size); - for (const Beam& beam : group->ongoing) { - std::vector<Token> tokens = log_softmax(logits, beam.global_beam_idx); - for (auto prev_group = groups.cbegin(); prev_group != group; ++prev_group) { - for (const Beam& prev_beam : prev_group->ongoing) { - if (prev_beam.tokens.size() > beam.tokens.size()) { - tokens.at(size_t(prev_beam.tokens.back())).log_prob -= parameters.diversity_penalty; - } - } - } - std::vector<int64_t> full_text{parameters.prompt}; - full_text.insert(full_text.end(), beam.tokens.begin(), beam.tokens.end()); - if (full_text.size() > 1 && full_text.size() >= parameters.no_repeat_ngram_size) { - auto tail_start = full_text.end() - ptrdiff_t(parameters.no_repeat_ngram_size) + 1; - for (int64_t banned_token : kmp_search(full_text, {tail_start, full_text.end()})) { - tokens.at(size_t(banned_token)).log_prob = -std::numeric_limits<float>::infinity(); - } - } - std::sort(tokens.begin(), tokens.end(), [](Token left, Token right) { - return left.log_prob > right.log_prob; // Most probable tokens in front - }); - size_t add_count = 0; - for (Token token : tokens) { - Beam new_candidate = beam; - new_candidate.score += token.log_prob; - new_candidate.tokens.push_back(token.idx); - if (parameters.early_finish(new_candidate)) { - group->finish(std::move(new_candidate), parameters); - } else { - candidates.push_back(std::move(new_candidate)); - ++add_count; - if (add_count == 2 * parameters.group_size) { - break; - } - } - } - } - // Sample 2 * group_size highest score tokens to get at least 1 non EOS token per beam - if (candidates.size() < 2 * parameters.group_size) { - throw std::runtime_error("No beams left to search"); - } - auto to_sort = candidates.begin() + ptrdiff_t(2 * parameters.group_size); - std::partial_sort(candidates.begin(), to_sort, candidates.end(), greater); - group->ongoing.clear(); - for (size_t cand_idx = 0; cand_idx < candidates.size(); ++cand_idx) { - if (parameters.eos_token == candidates.at(cand_idx).tokens.back()) { - // If beam_token does not belong to top num_beams tokens, it should not be added - if (cand_idx >= parameters.group_size) { - continue; - } - candidates.at(cand_idx).tokens.resize(candidates.at(cand_idx).tokens.size() - 1); - group->finish(std::move(candidates.at(cand_idx)), parameters); - } else { - group->ongoing.push_back(std::move(candidates.at(cand_idx))); - if (group->ongoing.size() == parameters.group_size) { - break; - } - } - } - group->is_done(parameters); - if (!group->done) { - for (const Beam& beam : group->ongoing) { - next_tokens.push_back(beam.tokens.back()); - next_beams.push_back(int32_t(beam.global_beam_idx)); - } - } - } - return {next_tokens, next_beams}; - } -}; - -// Consume group_beam_searcher because beams are consumed -std::vector<std::vector<Beam>> finalize(GroupBeamSearcher&& group_beam_searcher) { - std::vector<std::vector<Beam>> finalized; - finalized.reserve(group_beam_searcher.groups.size()); - for (Group& group : group_beam_searcher.groups) { - if (!group.done) { - for (Beam& beam : group.ongoing) { - group.finish(std::move(beam), group_beam_searcher.parameters); - } - } - finalized.push_back(std::move(group.min_heap)); - } - return finalized; -} diff --git 
a/text_generation/causal_lm/cpp/set_up_and_run.sh b/text_generation/causal_lm/cpp/set_up_and_run.sh deleted file mode 100755 index 54e586e00a..0000000000 --- a/text_generation/causal_lm/cpp/set_up_and_run.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# Copyright (C) 2023 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -e # Exit immediately if a command exits with a non-zero status - -function abs_path() { - script_path=$(eval echo "${BASH_SOURCE[0]}") - directory=$(dirname "$script_path") - builtin cd "$directory" || exit - pwd -P -} -cd "`abs_path`" - -mkdir ./ov/ -curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2023.3.0-13739-294cc6668c4/l_openvino_toolkit_ubuntu20_2023.3.0.dev20231219_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz -sudo ./ov/install_dependencies/install_openvino_dependencies.sh - -source ./ov/setupvars.sh -python -m pip install --upgrade-strategy eager "optimum[openvino]>=1.14" -r ../../../llm_bench/python/requirements.txt ../../../thirdparty/openvino_contrib/modules/custom_operations/[transformers] --extra-index-url https://download.pytorch.org/whl/cpu && python -m pip uninstall --yes openvino && python ../../../llm_bench/python/convert.py --model_id openlm-research/open_llama_3b_v2 --output_dir ./open_llama_3b_v2/ --precision FP16 --stateful & -cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ -cmake --build ./build/ --config Release -j -wait - -convert_tokenizer ./open_llama_3b_v2/pytorch/dldt/FP16/ --output ./open_llama_3b_v2/pytorch/dldt/FP16/ --with-detokenizer --streaming-detokenizer True -./build/greedy_causal_lm ./open_llama_3b_v2/pytorch/dldt/FP16/ "return 0" diff --git a/third-party-programs.txt b/third-party-programs.txt new file mode 100644 index 0000000000..e418d7b5e3 --- /dev/null +++ b/third-party-programs.txt @@ -0,0 +1,417 @@ +OpenVINO GenAI Third Party Programs File + +This file contains the list of third party software ("third party programs") +contained in the Intel software and their required notices and/or license +terms. This third party software, even if included with the distribution of +the Intel software, may be governed by separate license terms, including +without limitation, third party license terms, other Intel software license +terms, and open source software license terms. These separate license terms +govern your use of the third party programs as set forth in the +"third-party-programs.txt" or other similarly-named text file. + +Third party programs and their corresponding required notices and/or license +terms are listed below. + +------------------------------------------------------------- + +Jinja2Cpp + +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. 
"Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. 
+Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. 
Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. 
Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. * +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. 
+ +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. + +------------------------------------------------------------- + +JSON for Modern C++ (https://github.com/nlohmann/json) + +MIT License + +Copyright (c) 2013-2022 Niels Lohmann + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/thirdparty/CMakeLists.txt b/thirdparty/CMakeLists.txt new file mode 100644 index 0000000000..2cc9f1c06b --- /dev/null +++ b/thirdparty/CMakeLists.txt @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +option(BUILD_TOKENIZERS "Build OpenVINO Tokenizers together with OpenVINO GenAI" ON) + +if(BUILD_TOKENIZERS) + add_subdirectory(./openvino_tokenizers/ "${CMAKE_BINARY_DIR}/openvino_tokenizers/") + # Put binaries to a single dir to mimic package structure. + set_target_properties(openvino_tokenizers PROPERTIES + # Generator expressions to disable appending a per-configuration subdirectory (Release, Debug). + # ARCHIVE_OUTPUT is irrelevant. It's here just to keep all the artifacts in one place. 
+ ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + ) + if(TARGET core_tokenizers) + set_target_properties(core_tokenizers PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_BINARY_DIR}/openvino_genai/>" + ) + else() + # Prebuilt dependencies + if(WIN32) + set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/core_tokenizers.dll" + "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/third_party/lib/icudt70.dll" + "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/third_party/lib/icuuc70.dll") + elseif(LINUX) + set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.so") + elseif(APPLE) + set(extra_libs "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.dylib") + endif() + add_custom_command(OUTPUT "${extra_libs}" + COMMAND "${CMAKE_COMMAND}" -E copy "${extra_libs}" "${CMAKE_BINARY_DIR}/openvino_genai/" + DEPENDS openvino_tokenizers) + endif() +endif() diff --git a/thirdparty/openvino_contrib b/thirdparty/openvino_contrib deleted file mode 160000 index 096932eed0..0000000000 --- a/thirdparty/openvino_contrib +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 096932eed0a7459be7a1ddc889c9549fcc62a3c8 diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers new file mode 160000 index 0000000000..e74460f8b7 --- /dev/null +++ b/thirdparty/openvino_tokenizers @@ -0,0 +1 @@ +Subproject commit e74460f8b78c26ad46ccaccc0ee34d7ccccf56f7 diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tools/cacheviz/__init__.py b/tools/cacheviz/__init__.py new file mode 100644 index 0000000000..88b5a71df7 --- /dev/null +++ b/tools/cacheviz/__init__.py @@ -0,0 +1,3 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + diff --git a/tools/cacheviz/cacheviz.py b/tools/cacheviz/cacheviz.py new file mode 100644 index 0000000000..f242a10c96 --- /dev/null +++ b/tools/cacheviz/cacheviz.py @@ -0,0 +1,321 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +""" +Usage: +After running a continuous batching workload with debug dumps enabled, observe a "debug" folder in the working directory; +the subdirectories `cache_dump/*N*` correspond to the state of the N-th layer cache at each generation step, and can be +visualized by running: +cacheviz.py --dump_folder ./debug/cache_dump/0 + +Use "A" and "D" (or "left arrow" and "right arrow") keys to move to the previous or next steps correspondingly, +with "Alt" modifier to move 10 steps at a time, and "Shift" modifier to move 100 steps at a time.
+""" + +import argparse +import hashlib +import pathlib +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict +from typing import List +from typing import Tuple +import matplotlib.pyplot as plt +import numpy as np +import tqdm +from matplotlib import patches +plt.switch_backend('TkAgg') + +BLOCK_SIZE = 32 +EVICTION_START_SIZE = 32 +EVICTION_EVICTABLE_SIZE = 64 +EVICTION_RECENT_SIZE = 32 + + +def is_evictable(logical_block_idx: int, total_occupied_logical_blocks: int): + assert(logical_block_idx < total_occupied_logical_blocks) + if total_occupied_logical_blocks <= (EVICTION_START_SIZE + EVICTION_EVICTABLE_SIZE + EVICTION_RECENT_SIZE) / BLOCK_SIZE: + return False + logical_block_idx_in_tokens = logical_block_idx * BLOCK_SIZE + return EVICTION_START_SIZE <= logical_block_idx_in_tokens < EVICTION_START_SIZE + EVICTION_EVICTABLE_SIZE + + +def get_hashed_rgb_color(idx: int) -> str: + return '#' + hashlib.sha1(str(idx).encode()).hexdigest()[0:6] # nosec + + +@dataclass +class StepDumpData: + dump_file_name: str = None + num_blocks: int = None + occupied_blocks: Dict[int, List[Tuple[int, int]]] = field(default_factory=lambda: defaultdict(list)) + occupied_blocks_per_sequence: Dict[int, List[int]] = field(default_factory=lambda: defaultdict(list)) + sequence_groups: Dict[int, List[int]] = field(default_factory=dict) + + +def load_data(dump_dir: pathlib.Path) -> List[StepDumpData]: + retval = [] + num_step_files = 0 + step_file_names_dict: Dict[int, List[pathlib.Path]] = defaultdict(list) + + for f in dump_dir.iterdir(): + if f.is_file() and f.suffix == '.txt' and 'usage' not in f.name: + file_name = f.stem + step_number = int(file_name.split("_")[-1]) + step_file_names_dict[step_number].append(f) + num_step_files += 1 + + if num_step_files == 0: + print(f"No step files found") + exit(-1) + + print(f"Step files found: {num_step_files}") + step_file_names_in_order = [name_lex_sorted for _, names_for_step in sorted(step_file_names_dict.items()) for + name_lex_sorted in sorted(names_for_step)] + + for dump_file_name in tqdm.tqdm(step_file_names_in_order): + collected_data = StepDumpData() + collected_data.dump_file_name = dump_file_name.name + with open(dump_file_name, "r") as f: + num_blocks_line = f.readline() + collected_data.num_blocks = int(num_blocks_line) + num_sequence_groups_line = f.readline() + num_sequence_groups = int(num_sequence_groups_line) + for i in range(num_sequence_groups): + sequence_group_line = f.readline() + sequence_group_tokens = sequence_group_line.split() + sequence_group_id = int(sequence_group_tokens[0]) + sequence_group_seq_ids = [int(s) for s in sequence_group_tokens[1:]] + collected_data.sequence_groups[sequence_group_id] = sequence_group_seq_ids + + for (i, line) in enumerate(f): + tokens = line.split() + seq_id, block_idx, ref_count = int(tokens[0]), int(tokens[1]), int(tokens[2]) + if block_idx not in collected_data.occupied_blocks: + collected_data.occupied_blocks[block_idx] = [(seq_id, ref_count)] + else: + collected_data.occupied_blocks[block_idx].append((seq_id, ref_count)) + collected_data.occupied_blocks_per_sequence[seq_id].append(block_idx) + retval.append(collected_data) + return retval + + +def get_allocated_usage_series(step_data: List[StepDumpData]) -> List[float]: + return [len(sd.occupied_blocks) / sd.num_blocks * 100 for sd in step_data] + + +def draw_from_step_data(plot_axes: plt.Axes, step_data: StepDumpData) -> plt.Axes: + num_blocks = step_data.num_blocks + occupied_blocks = 
step_data.occupied_blocks + occupied_blocks_per_sequence = step_data.occupied_blocks_per_sequence + sequence_groups = step_data.sequence_groups + + seq_id_to_sequence_group_id: Dict[int, int] = { seq_id: seq_group_id for seq_group_id, seq_id_list in sequence_groups.items() for seq_id in seq_id_list } + + nrows = 1 + ncols = num_blocks // nrows + + width = 1 + height = width + + # Positions of the square patches are shifted half-unit to the right so that the ticks on the X axis end up + # centered at the squares middle points + patch_x_positions = np.arange(0.0, ncols, width) + patch_x_positions -= 0.5 + + # Shade the areas occupied by at least one sequence for a visual representation of the cache usage + for occupied_block_idx in occupied_blocks: + vspan_from = patch_x_positions[occupied_block_idx] + vspan_to = vspan_from + 1 + plot_axes.axvspan(vspan_from, vspan_to, alpha=0.5, color='gray') + + max_ylim = 1 + + # Set up the squares for individual sequences and for the overall block table usage + for block_idx, patch_xpos in enumerate(patch_x_positions): + # Block table usage indicator (occupying position -1 on the Y axis) + base_pos = (patch_xpos, -1.5) + base_face_color = '1' + num_occupying_sequences = 0 + base_text_color = 'black' + if block_idx in occupied_blocks: + num_occupying_sequences = occupied_blocks[block_idx][0][1] + base_face_color = str(1 / (2 * num_occupying_sequences)) + base_text_color = 'white' + sq = patches.Rectangle(base_pos, width, height, fill=True, facecolor=base_face_color, edgecolor='black') + plot_axes.add_patch(sq) + + # Mark the block with the number of occupying sequences + text = str(num_occupying_sequences) + center = (base_pos[0] + 0.5, base_pos[1] + 0.5) + plot_axes.annotate(text, center, ha='center', va='center', color=base_text_color) + + if block_idx in occupied_blocks: + for seq_idx, ref_count in occupied_blocks[block_idx]: + # Draw the blocks representing the occupied sequence - a block at each occupied position on the X axis, + # with Y position equal to the sequence ID. 
+                sequence_local_text = str(seq_idx)
+                sequence_local_center = (center[0], center[1] + (seq_idx + 1) * height)
+                seq_sq_pos = (base_pos[0], base_pos[1] + (seq_idx + 1))
+                max_ylim = max(max_ylim, seq_idx + 1)
+                seq_color = get_hashed_rgb_color(seq_idx)
+                seq_group_color = get_hashed_rgb_color(-seq_id_to_sequence_group_id[seq_idx] - 1)
+                # Evictable blocks are drawn with a dotted outline
+                linestyle = 'solid'
+                logical_idx_in_seq = occupied_blocks_per_sequence[seq_idx].index(block_idx)
+                if is_evictable(logical_idx_in_seq, len(occupied_blocks_per_sequence[seq_idx])):
+                    linestyle = 'dotted'
+                seq_sq = patches.Rectangle(seq_sq_pos, width, height, fill=True, facecolor=seq_color, edgecolor=seq_group_color, lw=3,
+                                           linestyle=linestyle)
+                plot_axes.add_patch(seq_sq)
+                plot_axes.annotate(sequence_local_text, sequence_local_center, ha='center', va='center')
+
+                # Display total blocks used on the right side of the plot
+                pos_on_right_of_plot_at_sequence_idx = (num_blocks, sequence_local_center[1])
+                plot_axes.annotate(str(len(occupied_blocks_per_sequence[seq_idx])), pos_on_right_of_plot_at_sequence_idx,
+                                   ha='center', va='center',
+                                   color=seq_color, weight='bold')
+
+    # Set limits and ticks so that only integer ticks are visible and the whole range is shown
+    plot_axes.set_yticks(np.arange(max_ylim))
+    plot_axes.set_ylim(-1.5, max_ylim)
+    plot_axes.set_xticks(np.arange(num_blocks))
+    plot_axes.set_xlim(-0.5, num_blocks + 0.5)
+
+    # Labels
+    plot_axes.set_xlabel('Block index')
+    plot_axes.set_ylabel('Sequence index')
+    plot_axes.set_title(step_data.dump_file_name)
+
+    # Legend for sequence group colors
+    plot_axes.legend(handles=[patches.Patch(facecolor=get_hashed_rgb_color(-seq_group_idx - 1),
+                                            label=f'Sequence group {seq_group_idx}') for seq_group_idx in
+                              sequence_groups], loc='center left', bbox_to_anchor=(1, 0.5))
+
+    return plot_axes
+
+
+def load_and_draw_usage(plot_axes: plt.Axes, usage_dump_file: pathlib.Path, current_step: int, allocated_usage_series: List[float], eviction_relation: str = 'before') -> None:
+    # The usage dump file is expected to contain two lines per step - the reported cache usage before eviction and
+    # after eviction - each formatted as "<step_number> <cache_usage_percent>"
+    usage_values: Dict[int, Tuple[float, float]] = {}
+    with open(usage_dump_file, "r") as f:
+        while True:
+            before_eviction_line = f.readline()
+            after_eviction_line = f.readline()
+            if not before_eviction_line or not after_eviction_line:  # readline() returns '' at EOF
+                break
+            before_step_num, before_cache_usage = before_eviction_line.split()
+            after_step_num, after_cache_usage = after_eviction_line.split()
+            assert before_step_num == after_step_num
+            step_num = int(before_step_num)
+            usage_values[step_num] = (float(before_cache_usage), float(after_cache_usage))
+
+    step_numbers = list(usage_values.keys())
+    before_series = [v[0] for v in usage_values.values()]
+    after_series = [v[1] for v in usage_values.values()]
+
+    # plot "after" first so that it ends up under the "before" plot for better visibility of eviction
+    plot_axes.plot(step_numbers, after_series, color='blue')
+    plot_axes.plot(step_numbers, before_series, color='green')
+
+    # The allocated usage series interleaves the "before eviction" and "after eviction" dumps of each step
+    allocated_usage_before_series = allocated_usage_series[0::2]
+    allocated_usage_after_series = allocated_usage_series[1::2]
+
+    leaked_before_series = [r - a if (r - a) > 0 else 0 for r, a in zip(before_series, allocated_usage_before_series)]
+    leaked_after_series = [r - a if (r - a) > 0 else 0 for r, a in zip(after_series, allocated_usage_after_series)]
+    plot_axes.plot(step_numbers, leaked_after_series, color='orange')
+    plot_axes.plot(step_numbers, leaked_before_series, color='red')
+
+    plot_axes.set_yticks(np.arange(0, 100, 10))
+    plot_axes.set_ylim(0, 100)
+    plot_axes.grid(visible=True, which='major', axis='y')
+
+    plot_axes.set_xticks(np.arange(0, step_numbers[-1], 100))
+    plot_axes.set_xlim(0, step_numbers[-1])
+
+    # Labels
+    plot_axes.set_xlabel('Step')
+    plot_axes.set_ylabel('Cache usage, %')
+
+    plot_axes.vlines(current_step, ymin=0, ymax=100, colors='red')
+
+    plot_axes.legend(['after eviction', 'before eviction', 'leaked (after eviction)', 'leaked (before eviction)'])
+
+    if eviction_relation == 'before':
+        reported_cache_usage = usage_values[current_step][0]
+        allocated_usage_at_step = allocated_usage_before_series[current_step]
+    else:
+        reported_cache_usage = usage_values[current_step][1]
+        allocated_usage_at_step = allocated_usage_after_series[current_step]
+
+    plot_axes.annotate(
+        f'Block table usage: {allocated_usage_at_step:.2f}% (occupied), {reported_cache_usage:.2f}% (reported)',
+        xy=(0.5, 0), xytext=(0, 10),
+        xycoords=('axes fraction', 'figure fraction'),
+        textcoords='offset points',
+        size=14, ha='center', va='bottom')
+
+
+def get_eviction_relation(dump_file_name: str) -> str:
+    return 'before' if 'before' in str(dump_file_name) else 'after'
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dump_folder", help="Cache info dump folder", required=True)
+    parser.add_argument("--step", help="Step ID to show at startup", required=False, default=0, type=int)
+    args = parser.parse_args()
+    dump_folder = args.dump_folder
+
+    dump_folder_path = pathlib.Path(dump_folder)
+    step_data = load_data(dump_folder_path)
+    allocated_usage_series = get_allocated_usage_series(step_data)
+
+    fig = plt.figure(figsize=(10, 10))
+    fig.tight_layout()
+    plot_axes = fig.add_subplot(211, aspect='equal')
+
+    current_file_idx_displayed: int = args.step * 2  # 2 files per step - before and after eviction
+
+    usage_dump_file = dump_folder_path / "cache_usage.txt"
+
+    def on_press(event):
+        nonlocal current_file_idx_displayed
+        # a/d or left/right - step through the dump files one by one; with Alt - jump 10 steps, with Shift - 100 steps
+        if event.key == 'd' or event.key == 'right':
+            current_file_idx_displayed += 1
+        elif event.key == 'a' or event.key == 'left':
+            current_file_idx_displayed -= 1
+        if event.key == 'alt+d' or event.key == 'alt+right':
+            current_file_idx_displayed += 10 * 2
+        elif event.key == 'alt+a' or event.key == 'alt+left':
+            current_file_idx_displayed -= 10 * 2
+        if event.key == 'D' or event.key == 'shift+right':
+            current_file_idx_displayed += 100 * 2
+        elif event.key == 'A' or event.key == 'shift+left':
+            current_file_idx_displayed -= 100 * 2
+        current_file_idx_displayed %= len(step_data)
+
+        mode = get_eviction_relation(step_data[current_file_idx_displayed].dump_file_name)
+
+        plot_axes.clear()
+        draw_from_step_data(plot_axes, step_data[current_file_idx_displayed])
+
+        usage_plot_axes.clear()
+        load_and_draw_usage(usage_plot_axes, usage_dump_file, current_file_idx_displayed // 2, allocated_usage_series=allocated_usage_series, eviction_relation=mode)
+        fig.canvas.draw_idle()
+
+    fig.canvas.mpl_connect('key_press_event', on_press)
+    usage_plot_axes = fig.add_subplot(212, aspect='auto')
+
+    curr_step_file_data = step_data[current_file_idx_displayed]
+    mode = get_eviction_relation(curr_step_file_data.dump_file_name)
+
+    draw_from_step_data(plot_axes, curr_step_file_data)
+    load_and_draw_usage(usage_plot_axes, usage_dump_file, args.step, allocated_usage_series=allocated_usage_series, eviction_relation=mode)
+
+    plt.show()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/cacheviz/requirements.txt b/tools/cacheviz/requirements.txt
new file mode 100644
index 0000000000..9af70e35fa
--- /dev/null
+++ b/tools/cacheviz/requirements.txt
@@ -0,0 +1,4 @@
+argparse
+matplotlib
+numpy
+tqdm
\ No newline at end of file
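For orientation, the snippet below is a minimal, hypothetical sketch of the dump layout that the visualizer added above parses, inferred purely from the parsing logic in load_data() and load_and_draw_usage(). The file names, the script path in the usage comment, and the numeric values are illustrative assumptions, not the documented output format of any runtime component.

# Sketch only: writes a tiny dump folder in the layout the visualizer above reads back.
# Assumptions: dump file names ("cache_dump_before_eviction_<step>.txt" etc.) and the
# script path "tools/cacheviz/cacheviz.py" are illustrative.
import pathlib


def write_example_dump(folder: pathlib.Path) -> None:
    folder.mkdir(parents=True, exist_ok=True)
    # Per-step block table dump (one file before and one after eviction per step; the step index is taken
    # from the last "_"-separated token of the file name, "before"/"after" is detected by substring):
    #   <total number of blocks>
    #   <number of sequence groups>
    #   <group_id> <seq_id> [<seq_id> ...]   (one line per sequence group)
    #   <seq_id> <block_idx> <ref_count>     (one line per occupied block entry)
    (folder / "cache_dump_before_eviction_0.txt").write_text("8\n1\n0 0\n0 0 1\n0 1 1\n")
    (folder / "cache_dump_after_eviction_0.txt").write_text("8\n1\n0 0\n0 0 1\n")
    # cache_usage.txt: two lines per step ("before" then "after"), each "<step> <reported usage percent>"
    (folder / "cache_usage.txt").write_text("0 25.0\n0 12.5\n")


if __name__ == "__main__":
    write_example_dump(pathlib.Path("example_dump"))
    # Then visualize with, e.g.: python tools/cacheviz/cacheviz.py --dump_folder example_dump --step 0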