diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 9ab4587c2a..789167949f 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -1,5 +1,9 @@ version: 2 updates: + - package-ecosystem: "pip" + directory: "./" + schedule: + interval: "weekly" - package-ecosystem: "pip" directory: "image_generation/stable_diffusion_1_5/cpp/scripts/" schedule: diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 23d9006d07..a07dacac30 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -194,8 +194,8 @@ jobs: shell: cmd run: | call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat - - .\build\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt + .\build\text_generation\causal_lm\cpp\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt + echo import transformers > ref.py echo predictions = open('pred.txt', 'r').read() >> ref.py echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') >> ref.py diff --git a/.github/workflows/genai_package.yml b/.github/workflows/genai_package.yml index b6f1647c7a..e618586d26 100644 --- a/.github/workflows/genai_package.yml +++ b/.github/workflows/genai_package.yml @@ -2,7 +2,9 @@ name: genai_package on: pull_request jobs: ubuntu_genai_package: - if: false + strategy: + matrix: + build-type: [Release, Debug] runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -12,18 +14,27 @@ jobs: with: python-version: 3.8 - run: mkdir ./ov/ - - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.1/linux/l_openvino_toolkit_ubuntu20_2024.1.0.15008.f4afc983258_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz + - run: curl https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.2.0-15454-0d95325972f/l_openvino_toolkit_centos7_2024.2.0.dev20240522_x86_64.tgz | tar --directory ./ov/ --strip-components 1 -xz - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh - - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release --target package -j - - run: source ./ov/setupvars.sh && cmake --install ./build/ --config Release --prefix ov - - run: ov/samples/cpp/build_samples.sh -b "${{ github.workspace }}/s pace" + - run: sudo apt-get install libtbb-dev + - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ + - run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j + - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov + - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build - run: source ./ov/setupvars.sh && python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + if: ${{ 'Release' == matrix.build-type }} - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + if: ${{ 'Release' == matrix.build-type }} - run: source ./ov/setupvars.sh && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - - run: source ./ov/setupvars.sh && timeout 50s "${{ github.workspace 
}}/s pace/intel64/Release/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ "" + if: ${{ 'Release' == matrix.build-type }} + - run: source ./ov/setupvars.sh && timeout 50s ${{ github.workspace }}/s\ pace/samples_bin/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "" + if: ${{ 'Release' == matrix.build-type }} windows_genai_package: + strategy: + matrix: + build-type: [Release, Debug] runs-on: windows-latest defaults: run: @@ -37,11 +48,16 @@ jobs: python-version: 3.8 - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.2.0-15349-765302e0de1/w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64.zip - run: unzip ov.zip - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config Release --target package -j - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --install ./build/ --config Release --prefix w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64 - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\samples\cpp\build_samples_msvc.bat -b "${{ github.workspace }}/samples_build" + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/ + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64 + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install" + if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install --upgrade-strategy eager -r text_generation/causal_lm/cpp/requirements.txt + if: ${{ 'Release' == matrix.build-type }} - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] + if: ${{ 'Release' == matrix.build-type }} - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0 - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && "${{ github.workspace }}/samples_build/intel64/Release/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ "" + if: ${{ 'Release' == matrix.build-type }} + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && "${{ github.workspace }}/samples_install/samples_bin/greedy_causal_lm" .\TinyLlama-1.1B-Chat-v1.0\ "" + if: ${{ 'Release' == matrix.build-type }} diff --git a/.github/workflows/genai_python_lib.yml b/.github/workflows/genai_python_lib.yml index e9cfefff31..f00ce286aa 100644 --- a/.github/workflows/genai_python_lib.yml +++ b/.github/workflows/genai_python_lib.yml @@ -16,9 +16,22 @@ jobs: - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: source ./ov/setupvars.sh && cmake --build ./build/ 
--config Release -j - run: python -m pip install --pre openvino --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly # Can't load CentOS libraries from the archive - - run: PYTHONPATH=./src/python/ python -c "from openvino_genai.py_generate_pipeline import LLMPipeline" - - run: source ./ov/setupvars.sh && python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - - run: python -c "from openvino_genai.py_generate_pipeline import LLMPipeline" + # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal + # build system doesn't. Install ./requirements-build.txt to detect possible conflicts. + - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt + - run: PYTHONPATH=./src/python/ python -c "from openvino_genai import LLMPipeline" + - run: source ./ov/setupvars.sh && CMAKE_BUILD_PARALLEL_LEVEL="" python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + - run: python -c "from openvino_genai import LLMPipeline" + - name: GenAI Python API tests + run: | + source ./ov/setupvars.sh + cd ./tests/python_tests/ + python -m pip install -r requirements.txt + models=$(python list_test_models.py) + echo "$models" | while read -r model_name model_path; do + optimum-cli export openvino --trust-remote-code --weight-format fp16 --model "$model_name" "$model_path" + done + python -m pytest test_generate_api.py windows_genai_python_lib: runs-on: windows-latest @@ -37,6 +50,9 @@ jobs: - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j - run: python -m pip install "numpy<1.27" - - run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -c "from openvino_genai.py_generate_pipeline import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. - - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install . - - run: python -c "from openvino_genai.py_generate_pipeline import LLMPipeline" + # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal + # build system doesn't. Install ./requirements-build.txt to detect possible conflicts. + - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt + - run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -c "from openvino_genai import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that. + - run: set CMAKE_BUILD_PARALLEL_LEVEL="" && call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install . 
+ - run: python -c "from openvino_genai import LLMPipeline" diff --git a/.gitmodules b/.gitmodules index 937468fb64..f72fd83489 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,3 @@ [submodule "thirdparty/openvino_tokenizers"] path = thirdparty/openvino_tokenizers url = https://github.com/openvinotoolkit/openvino_tokenizers.git -[submodule "thirdparty/nlohmann_json"] - path = thirdparty/nlohmann_json - url = https://github.com/nlohmann/json.git -[submodule "thirdparty/Jinja2Cpp"] - path = thirdparty/Jinja2Cpp - url = https://github.com/jinja2cpp/Jinja2Cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 809327095c..6c01b378c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,16 +4,23 @@ cmake_minimum_required(VERSION 3.15) -set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake build type") -set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Release" "Debug" "RelWithDebInfo" "MinSizeRel") +# Multi config generators such as Visual Studio ignore CMAKE_BUILD_TYPE. Multi config generators are configured with +# CMAKE_CONFIGURATION_TYPES, but limiting options in it completely removes such build options +get_property(GENERATOR_IS_MULTI_CONFIG_VAR GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) +if(NOT GENERATOR_IS_MULTI_CONFIG_VAR AND NOT DEFINED CMAKE_BUILD_TYPE) + message(STATUS "CMAKE_BUILD_TYPE is not defined, 'Release' will be used") + # Setting CMAKE_BUILD_TYPE as CACHE must go before project(). Otherwise project() sets its value and set() doesn't take an effect + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel ...") +endif() -project(openvino_genai VERSION 2024.2.0.0) +project(OpenVINOGenAI VERSION 2024.2.0.0) add_subdirectory(./thirdparty/openvino_tokenizers/ "${CMAKE_CURRENT_BINARY_DIR}/openvino_tokenizers/") add_subdirectory(src) add_subdirectory(text_generation/causal_lm/cpp) install(DIRECTORY text_generation/causal_lm/cpp/ DESTINATION samples/cpp/causal_lm COMPONENT cpp_samples_genai) -install(FILES LICENSE third-party-programs.txt DESTINATION licensing_genai COMPONENT licensing_genai) # TODO: how to merge with OPenvino +install(FILES LICENSE DESTINATION licensing COMPONENT licensing_genai RENAME LICENSE-GENAI) +install(FILES third-party-programs.txt DESTINATION licensing COMPONENT licensing_genai RENAME third-party-programs-genai.txt) set(CPACK_GENERATOR "ZIP") include(CPack) diff --git a/pyproject.toml b/pyproject.toml index cb373e12c8..f9707988bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ cmake.source-dir = "./" cmake.build-type = "Release" cmake.targets = ["py_generate_pipeline", "genai"] -install.components = ["core_genai", "pygenai"] +install.components = ["wheel_genai"] sdist.cmake = true wheel.packages = ["src/python/openvino_genai"] wheel.install-dir = "openvino_genai" diff --git a/requirements-build.txt b/requirements-build.txt new file mode 100644 index 0000000000..aaaf7148ec --- /dev/null +++ b/requirements-build.txt @@ -0,0 +1,2 @@ +cmake~=3.23 +build~=1.2.1 diff --git a/text_generation/causal_lm/cpp/generate_pipeline/README.md b/src/README.md similarity index 69% rename from text_generation/causal_lm/cpp/generate_pipeline/README.md rename to src/README.md index 0a0f6010e6..ad21250989 100644 --- a/text_generation/causal_lm/cpp/generate_pipeline/README.md +++ b/src/README.md @@ -2,7 +2,7 @@ ## Usage -Firs of all you need to convert your model with optimum-cli +First of all you need to convert your model with optimum-cli ``` sh optimum-cli export 
openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format fp16 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" pip install openvino-genai @@ -10,19 +10,33 @@ pip install openvino-genai LLMPipeline is the main object used for decoding. You can initialize it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and default generation configuration. -### In Python +### Python A minimalist example: ```python -import py_generate_pipeline as genai # set more friendly module name -pipe = genai.LLMPipeline(model_path, "CPU") +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") print(pipe.generate("The Sun is yellow because")) ``` +Calling generate with custom generation config parameters, e.g. config for grouped beam search +```python +import openvino_genai as ov_genai +pipe = ov_genai.LLMPipeline(model_path, "CPU") + +res = pipe.generate("The Sun is yellow because", max_new_tokens=30, num_groups=3, group_size=5) +print(res) +``` + +output: +``` +'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color. The arrangement of carbon atoms in' +``` + A simple chat in Python: ```python import openvino_genai as ov_genai -pipe = ov_genai.LLMPipeline(model_path) +pipe = ov_genai.LLMPipeline(model_path) config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.1} pipe.set_generation_config(config) @@ -39,60 +53,45 @@ pipe.finish_chat() ``` Test to compare with Huggingface outputs -```python -from transformers import AutoTokenizer, AutoModelForCausalLM - -tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") -model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") - -max_new_tokens = 32 -prompt = 'table is made of' - -encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False) -hf_encoded_output = model.generate(encoded_prompt, max_new_tokens=max_new_tokens, do_sample=False) -hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) -print(f'hf_output: {hf_output}') - -import sys -sys.path.append('build-Debug/') -import py_generate_pipeline as genai # set more friendly module name - -pipe = genai.LLMPipeline('text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/') -ov_output = pipe(prompt, max_new_tokens=max_new_tokens) -print(f'ov_output: {ov_output}') -assert hf_output == ov_output - -``` - -### In C++ +### C++ Minimalistic example ```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + int main(int argc, char* argv[]) { std::string model_path = argv[1]; ov::LLMPipeline pipe(model_path, "CPU"); - cout << pipe.generate("The Sun is yellow bacause"); + std::cout << pipe.generate("The Sun is yellow because"); } ``` Using Group Beam Search Decoding ```cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + int main(int argc, char* argv[]) { std::string model_path = argv[1]; ov::LLMPipeline pipe(model_path, "CPU"); + ov::GenerationConfig config = pipe.get_generation_config(); config.max_new_tokens = 256; config.num_groups = 3; config.group_size = 5; config.diversity_penalty = 1.0f; - cout << pipe.generate("The Sun is yellow bacause", config); + std::cout << pipe.generate("The Sun is yellow because", config); } ``` A simple chat in C++ ``` cpp +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + int main(int argc, char* argv[]) { std::string prompt; @@ -142,24 +141,38 @@ int 
main(int argc, char* argv[]) { Streaming example with lambda function ``` cpp -int main(int argc, char* argv[]) { - auto streamer = [](std::string word) { std::cout << word << std::flush; }; +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> + +int main(int argc, char* argv[]) { std::string model_path = argv[1]; ov::LLMPipeline pipe(model_path, "CPU"); - cout << pipe.generate("The Sun is yellow bacause", streamer); + + auto streamer = [](std::string word) { std::cout << word << std::flush; }; + std::cout << pipe.generate("The Sun is yellow because", streamer); } ``` Streaming with custom class ``` cpp #include <streamer_base.hpp> +#include "openvino/genai/llm_pipeline.hpp" +#include <iostream> class CustomStreamer: public StreamerBase { public: - void put(int64_t token) {/* decode tokens and do process them*/}; - - void end() {/* decode tokens and do process them*/}; + void put(int64_t token) { + /* custom decoding/tokens processing code + tokens_cache.push_back(token); + std::string text = m_tokenizer.decode(tokens_cache); + ... + */ + }; + + void end() { + /* custom finalization */ + }; }; int main(int argc, char* argv[]) { @@ -170,4 +183,3 @@ int main(int argc, char* argv[]) { cout << pipe.generate("The Sun is yellow because", custom_streamer); } ``` - diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index ffe28a81df..30d95d3553 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -13,8 +13,8 @@ FetchContent_MakeAvailable(nlohmann_json) function(ov_genai_build_jinja2cpp) FetchContent_Declare(jinja2cpp - URL https://github.com/ilya-lavrenov/Jinja2Cpp/archive/a5d002cbf44469775556daea14ba3ccdba1e365a.tar.gz - URL_HASH SHA256=5aa5378d9acf3c44dfb607fd7f16f48b17ffa6495c219957901e9191ffe28900) + URL https://github.com/ilya-lavrenov/Jinja2Cpp/archive/5433af6b225cd35df700023cf60df4acdd6cbcf3.tar.gz + URL_HASH SHA256=b90f6c44908beaacae8eeb2690d11a6ebb183b4560434698ac00017e7bc07d11) FetchContent_GetProperties(jinja2cpp) if(NOT jinja2cpp_POPULATED) @@ -49,8 +49,6 @@ add_library(${TARGET_NAME} SHARED ${SOURCE_FILES}) add_library(openvino::${TARGET_NAME} ALIAS ${TARGET_NAME}) target_include_directories(${TARGET_NAME} - # TODO: remove it, because beam_search algo should not be exposed to end users - PRIVATE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../text_generation/causal_lm/cpp/>" PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" "$<INSTALL_INTERFACE:runtime/include>") target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json jinja2cpp) @@ -59,6 +57,11 @@ target_compile_definitions(${TARGET_NAME} PRIVATE OPENVINO_TOKENIZERS_PATH=\"$<T target_compile_features(${TARGET_NAME} PUBLIC cxx_std_17) +set_target_properties(${TARGET_NAME} PROPERTIES + VERSION ${CMAKE_PROJECT_VERSION} + SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR} +) + # Copy the library to python to allow skipping wheel installation add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND "${CMAKE_COMMAND}" -E copy @@ -66,7 +69,19 @@ add_custom_command(TARGET ${TARGET_NAME} POST_BUILD "${CMAKE_CURRENT_SOURCE_DIR}/../python/openvino_genai/$<TARGET_FILE_NAME:${TARGET_NAME}>" COMMENT "Copy ${TARGET_NAME} to src/python/openvino_genai") -install(TARGETS ${TARGET_NAME} LIBRARY DESTINATION . COMPONENT core_genai RUNTIME DESTINATION . 
COMPONENT core_genai) +find_package(Python3 REQUIRED COMPONENTS Interpreter Development) +install(TARGETS ${TARGET_NAME} + LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR} + RUNTIME DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) + +# Copy libcore_tokenizers.so to build_dir/openvino_tokenizers/src/ +if(NOT MSVC) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND "${CMAKE_COMMAND}" -E copy + "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.so" + "${CMAKE_BINARY_DIR}/openvino_tokenizers/src/" + COMMENT "Copy libcore_tokenizers.so to build_dir/openvino_tokenizers/src/") +endif() # - Windows: `<openvino_dir>\runtime\bin\intel64\Release\` # - MacOS_x86: `<openvino_dir>/runtime/lib/intel64/Release` @@ -89,17 +104,17 @@ endif() if(MSVC OR APPLE) set(ARCH_DIR ${ARCH_DIR}/${CMAKE_BUILD_TYPE}) endif() -install(TARGETS ${TARGET_NAME} EXPORT openvino_genaiTargets - LIBRARY DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai_dev +install(TARGETS ${TARGET_NAME} EXPORT OpenVINOGenAITargets + LIBRARY DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai + NAMELINK_COMPONENT core_genai_dev ARCHIVE DESTINATION runtime/lib/${ARCH_DIR} COMPONENT core_genai_dev - RUNTIME DESTINATION runtime/bin/${ARCH_DIR} COMPONENT core_genai_dev + RUNTIME DESTINATION runtime/bin/${ARCH_DIR} COMPONENT core_genai INCLUDES DESTINATION runtime/include) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION runtime/include COMPONENT core_genai_dev) -install(EXPORT openvino_genaiTargets FILE openvino_genaiTargets.cmake NAMESPACE openvino:: DESTINATION runtime/cmake) +install(EXPORT OpenVINOGenAITargets FILE OpenVINOGenAITargets.cmake NAMESPACE openvino:: DESTINATION runtime/cmake) include(CMakePackageConfigHelpers) -configure_package_config_file(openvino_genaiConfig.cmake.in "${CMAKE_BINARY_DIR}/openvino_genaiConfig.cmake" INSTALL_DESTINATION runtime/cmake) -install(FILES "${CMAKE_BINARY_DIR}/openvino_genaiConfig.cmake" "${CMAKE_BINARY_DIR}/openvino_genaiConfigVersion.cmake" DESTINATION runtime/cmake COMPONENT core_genai_dev) +configure_package_config_file(OpenVINOGenAIConfig.cmake.in "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" INSTALL_DESTINATION runtime/cmake) +install(FILES "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" "${CMAKE_BINARY_DIR}/OpenVINOGenAIConfig.cmake" DESTINATION runtime/cmake COMPONENT core_genai_dev) include(CMakePackageConfigHelpers) -write_basic_package_version_file("${CMAKE_BINARY_DIR}/openvino_genaiConfigVersion.cmake" VERSION ${CMAKE_PROJECT_VERSION} COMPATIBILITY AnyNewerVersion) -export(EXPORT openvino_genaiTargets FILE "${CMAKE_BINARY_DIR}/openvino_genaiTargets.cmake" NAMESPACE openvino::) -# export(TARGETS ${TARGET_NAME} NAMESPACE openvino:: FILE "${CMAKE_BINARY_DIR}/openvino_genaiConfig.cmake") TODO +write_basic_package_version_file("${CMAKE_BINARY_DIR}/OpenVINOGenAIConfigVersion.cmake" VERSION ${CMAKE_PROJECT_VERSION} COMPATIBILITY AnyNewerVersion) +export(EXPORT OpenVINOGenAITargets FILE "${CMAKE_BINARY_DIR}/OpenVINOGenAITargets.cmake" NAMESPACE openvino::) diff --git a/src/cpp/openvino_genaiConfig.cmake.in b/src/cpp/OpenVINOGenAIConfig.cmake.in similarity index 70% rename from src/cpp/openvino_genaiConfig.cmake.in rename to src/cpp/OpenVINOGenAIConfig.cmake.in index abfd33b524..18c0bb4e48 100644 --- a/src/cpp/openvino_genaiConfig.cmake.in +++ b/src/cpp/OpenVINOGenAIConfig.cmake.in @@ -4,7 +4,7 @@ 
include(CMakeFindDependencyMacro) find_dependency(OpenVINO COMPONENTS Runtime) if(NOT TARGET genai) - include("${CMAKE_CURRENT_LIST_DIR}/openvino_genaiTargets.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/OpenVINOGenAITargets.cmake") endif() check_required_components(openvino_genai) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index e1f2151d49..837fae21ad 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -14,9 +14,10 @@ namespace ov { /** - * @brief controls the stopping condition for grouped beam search. The following values are possible: - * "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an - * heuristic is applied and the generation stops when is it very unlikely to find better candidates; + * @brief controls the stopping condition for grouped beam search. The following values are possible: + * "early" stops as soon as there are `num_beams` complete candidates. + "heuristic" stops when it is unlikely to find better candidates. + "never" stops when there cannot be better candidates. */ enum class StopCriteria { early, heuristic, never }; @@ -25,11 +26,11 @@ enum class StopCriteria { early, heuristic, never }; * * @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt + * `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. - * @param max_new_tokens the maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. + * @param max_new_tokens the maximum number of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. * @param ignore_eos if set to true, then generation will not stop even if <eos> token is met. - * @param num_beams number of beams for beam search. 1 means no beam search. + * @param num_beams number of beams for beam search. 1 disables beam search. * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. - * @param diversity_penalty this value is subtracted from a beam's score if it generates a token same as any beam from other group at a + * @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from another group at a * particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled. * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to * the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log @@ -42,11 +43,11 @@ enum class StopCriteria { early, heuristic, never }; * heuristic is applied and the generation stops when it is very unlikely to find better candidates; * "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). * @param temperature the value used to modulate token probabilities for random sampling - * @param top_p if set to float < 1, only the smallest set of most probable tokens with probabilities + * @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. 
* @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering. * @param do_sample whether or not to use multinomial random sampling * that add up to `top_p` or higher are kept. - * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. + * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. See https://arxiv.org/pdf/1909.05858. * @param pad_token_id id of padding token * @param bos_token_id id of <bos> token * @param eos_token_id id of <eos> token diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp index b25d11ecd4..3bc8453d4e 100644 --- a/src/cpp/include/openvino/genai/llm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp @@ -39,6 +39,24 @@ class DecodedResults { public: std::vector<std::string> texts; std::vector<float> scores; + + // @brief Convert DecodedResults to a vector of strings. + // @return A std::vector<std::string> containing the texts from the DecodedResults object. + operator std::vector<std::string>() const { + return texts; + } + + // @brief Overloads operator<< to enhance output the contents of DecodedResults. + // @return A reference to the output stream with the concatenated texts. + friend std::ostream& operator<<(std::ostream& os, const DecodedResults& dr) { + for (size_t i = 0; i < dr.texts.size(); ++i) { + os << dr.texts[i]; + if (i != dr.texts.size() - 1) { + os << std::endl; + } + } + return os; + } }; /** @@ -47,13 +65,15 @@ class DecodedResults { class OPENVINO_GENAI_EXPORTS LLMPipeline { public: /** - * @brief Constructs a LLMPipeline when convert model xml/bin files, tokenizers and configuration and in the same dir. + * @brief Constructs an LLMPipeline from xml/bin files, tokenizers and configuration in the same dir. * * @param model_path Path to the dir model xml/bin files, tokenizers and generation_configs.json * @param device optional device * @param plugin_config optional plugin_config */ - LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& plugin_config={}); + LLMPipeline(std::string& path, std::string device="CPU", + const ov::AnyMap& plugin_config={}, + const std::string& ov_tokenizers_path=""); /** * @brief Constructs a LLMPipeline when ov::Tokenizer is initialized manually using file from the different dirs. @@ -67,7 +87,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { const std::string model_path, const ov::Tokenizer& tokenizer, const std::string device="CPU", - const ov::AnyMap& plugin_config = {} + const ov::AnyMap& plugin_config = {}, + const std::string& ov_tokenizers_path="" ); ~LLMPipeline(); @@ -84,8 +105,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline { template <typename... Properties> util::EnableIfAllStringAny<std::string, Properties...> generate( - std::string text, - Properties&&... properties) { + std::string text, + Properties&&... properties) { return generate(text, AnyMap{std::forward<Properties>(properties)...}); } std::string generate(std::string text, const ov::AnyMap& config); diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 0d55d9b0fe..03c0cd64f7 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -21,7 +21,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path * @param device device. 
Currently only 'CPU' is supported */ - Tokenizer(const std::string& tokenizers_path, const std::string& device="CPU"); + Tokenizer(const std::string& tokenizers_path, const std::string& device="CPU", const std::string& ov_tokenizers_path=""); /** * @brief encode a single prompt diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index b392e44b3b..14fc370c59 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -10,40 +10,43 @@ #include "openvino/genai/generation_config.hpp" #include "generation_config_helper.hpp" +#include "utils.hpp" + +namespace { + + +} // namespace + namespace ov { GenerationConfig::GenerationConfig(std::string json_path) { + using ov::generate_utils::read_json_param; + std::ifstream f(json_path); OPENVINO_ASSERT(f.is_open(), "Failed to open '" + json_path + "' with generation config"); nlohmann::json data = nlohmann::json::parse(f); - - if (data.contains("max_new_tokens")) max_new_tokens = data["max_new_tokens"]; - if (data.contains("max_length")) max_length = data["max_length"]; - // note that ignore_eos is not present in HF GenerationConfig - if (data.contains("num_beam_groups")) num_beam_groups = data["num_beam_groups"]; - if (data.contains("num_beams")) num_beams = data["num_beams"]; - if (data.contains("diversity_penalty")) diversity_penalty = data["diversity_penalty"]; - if (data.contains("length_penalty")) length_penalty = data["length_penalty"]; - if (data.contains("num_return_sequences")) num_return_sequences = data["num_return_sequences"]; - if (data.contains("no_repeat_ngram_size")) no_repeat_ngram_size = data["no_repeat_ngram_size"]; - // stop_criteria will be processed below - if (data.contains("temperature")) temperature = data["temperature"]; - if (data.contains("top_p")) top_p = data["top_p"]; - if (data.contains("top_k")) top_k = data["top_k"]; - if (data.contains("do_sample")) do_sample = data["do_sample"]; - if (data.contains("repetition_penalty")) repetition_penalty = data["repetition_penalty"]; - if (data.contains("pad_token_id")) pad_token_id = data["pad_token_id"]; - if (data.contains("bos_token_id")) bos_token_id = data["bos_token_id"]; - if (data.contains("eos_token_id") && data["eos_token_id"].type() == nlohmann::json::value_t::number_integer) { - // todo: qwen contains several eos_token_id - eos_token_id = data["eos_token_id"]; - } - - if (data.contains("bos_token")) bos_token = data["bos_token"]; - if (data.contains("eos_token")) eos_token = data["eos_token"]; + read_json_param(data, "max_new_tokens", max_new_tokens); + read_json_param(data, "max_length", max_length); + // note that ignore_eos is not present in HF GenerationConfig + read_json_param(data, "num_beam_groups", num_beam_groups); + read_json_param(data, "num_beams", num_beams); + read_json_param(data, "diversity_penalty", diversity_penalty); + read_json_param(data, "length_penalty", length_penalty); + read_json_param(data, "num_return_sequences", num_return_sequences); + read_json_param(data, "no_repeat_ngram_size", no_repeat_ngram_size); + read_json_param(data, "temperature", temperature); + read_json_param(data, "top_p", top_p); + read_json_param(data, "top_k", top_k); + read_json_param(data, "do_sample", do_sample); + read_json_param(data, "repetition_penalty", repetition_penalty); + read_json_param(data, "pad_token_id", pad_token_id); + read_json_param(data, "bos_token_id", bos_token_id); + read_json_param(data, "eos_token_id", eos_token_id); + read_json_param(data, "bos_token", bos_token); + 
read_json_param(data, "eos_token", eos_token); if (data.contains("early_stopping")) { auto field_type = data["early_stopping"].type(); @@ -55,32 +58,35 @@ GenerationConfig::GenerationConfig(std::string json_path) { stop_criteria = StopCriteria::heuristic; } } + + } GenerationConfig GenerationConfigHelper::anymap_to_generation_config(const ov::AnyMap& config_map) { + using ov::generate_utils::read_anymap_param; + GenerationConfig config = m_config; - - if (config_map.count("max_new_tokens")) config.max_new_tokens = config_map.at("max_new_tokens").as<size_t>(); - if (config_map.count("max_length")) config.max_length = config_map.at("max_length").as<size_t>(); - if (config_map.count("ignore_eos")) config.ignore_eos = config_map.at("ignore_eos").as<bool>(); - if (config_map.count("num_beam_groups")) config.num_beam_groups = config_map.at("num_beam_groups").as<size_t>(); - if (config_map.count("num_beams")) config.num_beams = config_map.at("num_beams").as<size_t>(); - if (config_map.count("diversity_penalty")) config.diversity_penalty = config_map.at("diversity_penalty").as<float>(); - if (config_map.count("length_penalty")) config.length_penalty = config_map.at("length_penalty").as<float>(); - if (config_map.count("num_return_sequences")) config.num_return_sequences = config_map.at("num_return_sequences").as<size_t>(); - if (config_map.count("no_repeat_ngram_size")) config.no_repeat_ngram_size = config_map.at("no_repeat_ngram_size").as<size_t>(); - if (config_map.count("stop_criteria")) config.stop_criteria = config_map.at("stop_criteria").as<StopCriteria>(); - if (config_map.count("temperature")) config.temperature = config_map.at("temperature").as<float>(); - if (config_map.count("top_p")) config.top_p = config_map.at("top_p").as<float>(); - if (config_map.count("top_k")) config.top_k = config_map.at("top_k").as<int>(); - if (config_map.count("do_sample")) config.do_sample = config_map.at("do_sample").as<bool>(); - if (config_map.count("repetition_penalty")) config.repetition_penalty = config_map.at("repetition_penalty").as<float>(); - if (config_map.count("pad_token_id")) config.pad_token_id = config_map.at("pad_token_id").as<int64_t>(); - if (config_map.count("bos_token_id")) config.bos_token_id = config_map.at("bos_token_id").as<int64_t>(); - if (config_map.count("eos_token_id")) config.eos_token_id = config_map.at("eos_token_id").as<int64_t>(); - if (config_map.count("bos_token")) config.bos_token = config_map.at("bos_token").as<std::string>(); - if (config_map.count("eos_token")) config.eos_token = config_map.at("eos_token").as<std::string>(); - + read_anymap_param(config_map, "max_new_tokens", config.max_new_tokens); + read_anymap_param(config_map, "max_length", config.max_length); + read_anymap_param(config_map, "ignore_eos", config.ignore_eos); + read_anymap_param(config_map, "num_beam_groups", config.num_beam_groups); + read_anymap_param(config_map, "num_beams", config.num_beams); + read_anymap_param(config_map, "diversity_penalty", config.diversity_penalty); + read_anymap_param(config_map, "length_penalty", config.length_penalty); + read_anymap_param(config_map, "num_return_sequences", config.num_return_sequences); + read_anymap_param(config_map, "no_repeat_ngram_size", config.no_repeat_ngram_size); + read_anymap_param(config_map, "stop_criteria", config.stop_criteria); + read_anymap_param(config_map, "temperature", config.temperature); + read_anymap_param(config_map, "top_p", config.top_p); + read_anymap_param(config_map, "top_k", config.top_k); + read_anymap_param(config_map, 
"do_sample", config.do_sample); + read_anymap_param(config_map, "repetition_penalty", config.repetition_penalty); + read_anymap_param(config_map, "pad_token_id", config.pad_token_id); + read_anymap_param(config_map, "bos_token_id", config.bos_token_id); + read_anymap_param(config_map, "eos_token_id", config.eos_token_id); + read_anymap_param(config_map, "bos_token", config.bos_token); + read_anymap_param(config_map, "eos_token", config.eos_token); + return config; } diff --git a/src/cpp/src/group_beam_searcher.cpp b/src/cpp/src/group_beam_searcher.cpp index 1e27f36a0a..312671c8f0 100644 --- a/src/cpp/src/group_beam_searcher.cpp +++ b/src/cpp/src/group_beam_searcher.cpp @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 #include <openvino/runtime/tensor.hpp> -#include "group_beam_searcher.hpp" #include "generation_config_helper.hpp" +#include "openvino/genai/llm_pipeline.hpp" #include "utils.hpp" namespace { diff --git a/src/cpp/src/group_beam_searcher.hpp b/src/cpp/src/group_beam_searcher.hpp deleted file mode 100644 index 91f3ef4096..0000000000 --- a/src/cpp/src/group_beam_searcher.hpp +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright (C) 2023-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include <openvino/runtime/tensor.hpp> -#include "openvino/genai/generation_config.hpp" -#include "openvino/genai/llm_pipeline.hpp" - -namespace ov { - EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig sampling_params); -} diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 9d4161f859..9ea685e583 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -14,7 +14,6 @@ #include "openvino/genai/llm_pipeline.hpp" #include "utils.hpp" #include "generation_config_helper.hpp" -#include "group_beam_searcher.hpp" #include "text_callback_streamer.hpp" @@ -29,6 +28,8 @@ ov::EncodedResults greedy_decoding( bool is_chat_conversation = false ); +EncodedResults beam_search(ov::InferRequest& lm, ov::Tensor prompts, ov::Tensor attentin_mask, GenerationConfig config); + class LLMPipeline::LLMPipelineImpl { public: @@ -44,10 +45,11 @@ class LLMPipeline::LLMPipelineImpl { const std::string model_path, const ov::Tokenizer& tokenizer, const std::string device, - const ov::AnyMap& plugin_config + const ov::AnyMap& plugin_config, + const std::string& ov_tokenizers_path="" ); - LLMPipelineImpl(std::string& path, std::string device, const ov::AnyMap& config); + LLMPipelineImpl(std::string& path, std::string device, const ov::AnyMap& config, const std::string& ov_tokenizers_path=""); GenerationConfig generation_config() const; @@ -68,16 +70,18 @@ ov::LLMPipeline::LLMPipeline( const std::string model_path, const ov::Tokenizer& tokenizer, const std::string device, - const ov::AnyMap& plugin_config + const ov::AnyMap& plugin_config, + const std::string& ov_tokenizers_path ) { - m_pimpl = make_unique<LLMPipelineImpl>(model_path, tokenizer, device, plugin_config); + m_pimpl = make_unique<LLMPipelineImpl>(model_path, tokenizer, device, plugin_config, ov_tokenizers_path); } ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( const std::string model_path, const ov::Tokenizer& tokenizer, std::string device, - const ov::AnyMap& plugin_config + const ov::AnyMap& plugin_config, + const std::string& ov_tokenizers_path ): m_tokenizer(tokenizer), m_device(device), m_plugin_config(plugin_config) { ov::Core core; @@ -91,30 +95,42 @@ ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl( } } 
-ov::LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::AnyMap& config) { - m_pimpl = make_unique<LLMPipelineImpl>(path, device, config); +ov::LLMPipeline::LLMPipeline(std::string& path, std::string device, const ov::AnyMap& config, const std::string& ov_tokenizers_path) { + m_pimpl = make_unique<LLMPipelineImpl>(path, device, config, ov_tokenizers_path); } -ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(std::string& path, std::string device, const ov::AnyMap& config) { - std::string tokenizer_config_fname = "tokenizer_config.json"; - std::string generation_config_fname = "generation_config.json"; +ov::LLMPipeline::LLMPipelineImpl::LLMPipelineImpl(std::string& path, std::string device, + const ov::AnyMap& config, const std::string& ov_tokenizers_path) { + std::string config_path = path + "/" + "config.json"; + std::string tokenizer_config_path = path + "/" +"tokenizer_config.json"; + std::string generation_config_path = path + "/" +"generation_config.json"; + + if (std::filesystem::exists(generation_config_path)) { + m_generation_config = GenerationConfig(generation_config_path); + } else if (std::filesystem::exists(config_path)) { + // some models (e.g. google/gemma-*) do not have generation_config.json, but have config.json + // and special tokens are stored there. + + std::ifstream f(config_path); + OPENVINO_ASSERT(f.is_open(), "Failed to open '" + config_path + "' with config.json"); - if (std::filesystem::exists(path + "/" + generation_config_fname)) { - m_generation_config = GenerationConfig(path + "/" + generation_config_fname); - } - if (std::filesystem::exists(path + "/" + tokenizer_config_fname)) { - std::ifstream f(path + "/" + tokenizer_config_fname); nlohmann::json data = nlohmann::json::parse(f); - m_chat_template = data.value("chat_template", ""); + using ov::generate_utils::read_json_param; + read_json_param(data, "pad_token_id", m_generation_config.pad_token_id); + read_json_param(data, "bos_token_id", m_generation_config.bos_token_id); + read_json_param(data, "eos_token_id", m_generation_config.eos_token_id); + } + + if (std::filesystem::exists(tokenizer_config_path)) { + std::ifstream f(tokenizer_config_path); + ov::generate_utils::read_json_param(nlohmann::json::parse(f), "chat_template", m_chat_template); } - - m_device = device; ov::Core core; m_model_runner = core.compile_model(path + "/openvino_model.xml", device, config).create_infer_request(); - m_tokenizer = Tokenizer(path); + m_tokenizer = Tokenizer(path, device, ov_tokenizers_path); } ov::GenerationConfig ov::LLMPipeline::LLMPipelineImpl::generation_config() const { diff --git a/src/cpp/src/tokenizer.cpp b/src/cpp/src/tokenizer.cpp index 75c18734d3..778778faec 100644 --- a/src/cpp/src/tokenizer.cpp +++ b/src/cpp/src/tokenizer.cpp @@ -53,15 +53,18 @@ class Tokenizer::TokenizerImpl { int64_t m_eos_token_id = 2; TokenizerImpl() = default; - TokenizerImpl(std::string tokenizers_path, const std::string device) { + TokenizerImpl(std::string tokenizers_path, const std::string device, const std::string& ov_tokenizers_path) { ov::Core core; if (ov::generate_utils::is_xml(tokenizers_path)) OPENVINO_THROW("tokenizers_path should be a path to a dir not a xml file"); - // todo:: OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - core.add_extension(OPENVINO_TOKENIZERS_PATH); - + if (ov_tokenizers_path.empty()) { + // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt + core.add_extension(OPENVINO_TOKENIZERS_PATH); + } else { + core.add_extension(ov_tokenizers_path + 
"/libopenvino_tokenizers.so"); + } std::shared_ptr<ov::Model> tokenizer_model, detokenizer_model; try { tokenizer_model = core.read_model(tokenizers_path + "/openvino_tokenizer.xml"); @@ -141,8 +144,8 @@ class Tokenizer::TokenizerImpl { } }; -Tokenizer::Tokenizer(const std::string& tokenizers_path, const std::string& device) { - m_pimpl = std::make_shared<TokenizerImpl>(tokenizers_path, device); +Tokenizer::Tokenizer(const std::string& tokenizers_path, const std::string& device, const std::string& ov_tokenizers_path) { + m_pimpl = std::make_shared<TokenizerImpl>(tokenizers_path, device, ov_tokenizers_path); } std::pair<ov::Tensor, ov::Tensor> Tokenizer::encode(const std::string prompt) { diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 92df3d7067..dbd18cf3f3 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -18,11 +18,15 @@ void print_tensor(const ov::Tensor& tensor) { auto t_shape = tensor.get_shape(); std::cout << "["; - for (size_t i = 0; i < t_shape[1]; ++i) { - if (tensor.get_element_type() == ov::element::i64) { - res.emplace_back(tensor.data<int64_t>()[i]); - std::cout << tensor.data<int64_t>()[i] << " "; + for (size_t i = 0; i < t_shape[0]; ++i) { + std::cout << "|"; + for (size_t j = 0; j < t_shape[1]; ++j) { + if (tensor.get_element_type() == ov::element::i64) { + res.emplace_back(tensor.data<int64_t>()[t_shape[1] * i + j]); + std::cout << tensor.data<int64_t>()[t_shape[1] * i + j] << " "; + } } + std::cout << "|"; } std::cout << "]" << std::endl; } @@ -132,4 +136,4 @@ ov::Tensor extend_attention(ov::Tensor attention_mask) { } } // namespace generate_utils -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 7510c59e46..d7998a9594 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -4,6 +4,7 @@ #pragma once #include <openvino/openvino.hpp> +#include <nlohmann/json.hpp> namespace ov { namespace generate_utils { @@ -22,5 +23,41 @@ void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention bool is_xml(const std::string& path); +template <typename> +struct json_type_traits {}; + +template <> +struct json_type_traits<int> { static constexpr auto json_value_t = nlohmann::json::value_t::number_integer; }; + +template <> +struct json_type_traits<int64_t> { static constexpr auto json_value_t = nlohmann::json::value_t::number_integer; }; + +template <> +struct json_type_traits<size_t> { static constexpr auto json_value_t = nlohmann::json::value_t::number_unsigned; }; + +template <> +struct json_type_traits<float> { static constexpr auto json_value_t = nlohmann::json::value_t::number_float; }; + +template <> +struct json_type_traits<std::string> { static constexpr auto json_value_t = nlohmann::json::value_t::string; }; + +template <> +struct json_type_traits<bool> { static constexpr auto json_value_t = nlohmann::json::value_t::boolean; }; + +template <typename T> +void read_json_param(const nlohmann::json& data, const std::string& name, T& param) { + if (data.contains(name) && data[name].type() == json_type_traits<T>::json_value_t) { + param = data[name]; + } +} + +template <typename T> +void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T& param) { + if (config_map.count(name)) { + param = config_map.at(name).as<T>(); + } +} + } // namespace generate_utils -} // namespace ov \ No newline at end of file +} // namespace ov + diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 62f26f3215..00722b6fff 
100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -5,8 +5,8 @@ include(FetchContent) FetchContent_Declare( pybind11 - GIT_REPOSITORY https://github.com/pybind/pybind11 - GIT_TAG v2.12.0 + URL https://github.com/pybind/pybind11/archive/3e9dfa2866941655c56877882565e7577de6fc7b.tar.gz + URL_HASH SHA256=9a7d245f405f470798b9d2a48912cc97230658024775299eac203f7c9c9ae37c ) set(CMAKE_POSITION_INDEPENDENT_CODE ON) FetchContent_GetProperties(pybind11) @@ -16,9 +16,7 @@ if(NOT pybind11_POPULATED) endif() pybind11_add_module(py_generate_pipeline py_generate_pipeline.cpp) -target_link_libraries(py_generate_pipeline PRIVATE genai) - -install(TARGETS py_generate_pipeline LIBRARY DESTINATION . COMPONENT pygenai) +target_link_libraries(py_generate_pipeline PRIVATE openvino::genai) # setting RPATH / LC_RPATH depending on platform if(LINUX) @@ -46,3 +44,8 @@ add_custom_command(TARGET py_generate_pipeline POST_BUILD find_package(Python3 REQUIRED COMPONENTS Interpreter Development) install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/openvino_genai/ DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) +install(TARGETS py_generate_pipeline LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}) + +# wheel_genai component is used for wheel generation in pyproject.toml. +# Exclude wheel_genai from normal packaging process. +install(TARGETS genai py_generate_pipeline LIBRARY DESTINATION . COMPONENT wheel_genai RUNTIME DESTINATION . COMPONENT wheel_genai EXCLUDE_FROM_ALL) diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index f604e03e84..e069157fa7 100644 --- a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -8,3 +8,7 @@ if hasattr(os, "add_dll_directory"): os.add_dll_directory(os.path.dirname(__file__)) + +from .py_generate_pipeline import LLMPipeline, Tokenizer, GenerationConfig, DecodedResults, EncodedResults + +__all__ = ['LLMPipeline', 'Tokenizer', 'GenerationConfig', 'DecodedResults', 'EncodedResults'] diff --git a/src/python/py_generate_pipeline.cpp b/src/python/py_generate_pipeline.cpp index 74cbe7e27d..2aee67593c 100644 --- a/src/python/py_generate_pipeline.cpp +++ b/src/python/py_generate_pipeline.cpp @@ -62,15 +62,21 @@ std::string call_with_config(ov::LLMPipeline& pipe, const std::string& text, con return pipe(text, config); } +std::string ov_tokenizers_module_path() { + py::module_ m = py::module_::import("openvino_tokenizers"); + py::list path_list = m.attr("__path__"); + return std::string(py::str(path_list[0])) + "/lib"; +} + PYBIND11_MODULE(py_generate_pipeline, m) { m.doc() = "Pybind11 binding for LLM Pipeline"; - py::class_<LLMPipeline>(m, "LLMPipeline") - .def(py::init<const std::string, const ov::Tokenizer&, const std::string, const ov::AnyMap&>(), - py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) - .def(py::init<std::string&, std::string, const ov::AnyMap&>(), - py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}) + .def(py::init<const std::string, const ov::Tokenizer&, const std::string, const ov::AnyMap&, const std::string&>(), + py::arg("model_path"), py::arg("tokenizer"), py::arg("device") = "CPU", + py::arg("plugin_config") = ov::AnyMap{}, py::arg("ov_tokenizers_path") = ov_tokenizers_module_path()) + .def(py::init<std::string&, std::string, const ov::AnyMap&, const std::string>(), + 
py::arg("path"), py::arg("device") = "CPU", py::arg("plugin_config") = ov::AnyMap{}, py::arg("ov_tokenizers_path") = ov_tokenizers_module_path()) .def("__call__", py::overload_cast<ov::LLMPipeline&, const std::string&, const py::kwargs&>(&call_with_kwargs)) .def("__call__", py::overload_cast<ov::LLMPipeline&, const std::string&, const ov::GenerationConfig&>(&call_with_config)) .def("generate", py::overload_cast<ov::LLMPipeline&, const std::string&, const py::kwargs&>(&call_with_kwargs)) @@ -96,7 +102,10 @@ PYBIND11_MODULE(py_generate_pipeline, m) { // Binding for Tokenizer py::class_<ov::Tokenizer>(m, "Tokenizer") .def(py::init<>()) - .def(py::init<std::string&, std::string>(), py::arg("tokenizers_path"), py::arg("device") = "CPU") + .def(py::init<std::string&, const std::string&, const std::string&>(), + py::arg("tokenizers_path"), + py::arg("device") = "CPU", + py::arg("ov_tokenizers_path") = py::str(ov_tokenizers_module_path())) // todo: implement encode/decode when for numpy inputs and outputs .def("encode", py::overload_cast<const std::string>(&ov::Tokenizer::encode), "Encode a single prompt") diff --git a/src/tests/python_tests/test_greedy.py b/src/tests/python_tests/test_greedy.py deleted file mode 100644 index f33909721b..0000000000 --- a/src/tests/python_tests/test_greedy.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (C) 2023-2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -def test_tiny_llama(): - from transformers import AutoTokenizer, AutoModelForCausalLM - - tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") - model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") - - max_new_tokens = 32 - prompt = 'table is made of' - - encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) - hf_encoded_output = model.generate(encoded_prompt, max_new_tokens=max_new_tokens, do_sample=False) - hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) - print(f'hf_output: {hf_output}') - - import sys - sys.path.append('src/python/openvino_genai/') - import py_generate_pipeline as genai - - pipe = genai.LLMPipeline('text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/') - ov_output = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False) - print(f'ov_output: {ov_output}') - - assert hf_output == ov_output - -if __name__ == '__main__': - test_tiny_llama() diff --git a/tests/python_tests/list_test_models.py b/tests/python_tests/list_test_models.py new file mode 100644 index 0000000000..09addcfaba --- /dev/null +++ b/tests/python_tests/list_test_models.py @@ -0,0 +1,21 @@ +def models_list(): + model_ids = [ + ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", "TinyLlama-1.1B-Chat-v1.0"), + # ("google/gemma-2b-it", "gemma-2b-it"), + # ("google/gemma-7b-it", "gemma-7b-it"), + # ("meta-llama/Llama-2-7b-chat-hf", "Llama-2-7b-chat-hf"), + # ("meta-llama/Llama-2-13b-chat-hf", "Llama-2-13b-chat-hf"), + # ("openlm-research/open_llama_3b", "open_llama_3b"), + # ("openlm-research/open_llama_7b", "open_llama_7b"), + # ("databricks/dolly-v2-3b", "dolly-v2-3b"), + # ("databricks/dolly-v2-12b", "dolly-v2-12b"), + # ("mistralai/Mistral-7B-v0.1", "Mistral-7B-v0.1"), + # ("ikala/redpajama-3b-chat", "redpajama-3b-chat"), + # ("microsoft/phi-1_5", "phi-1_5/"), + # ("Qwen/Qwen1.5-7B-Chat", "Qwen1.5-7B-Chat"), + ] + return model_ids + +if __name__ == "__main__": + for model_id, model_path in models_list(): + print(model_id, model_path) diff --git a/tests/python_tests/requirements.txt 
b/tests/python_tests/requirements.txt new file mode 100644 index 0000000000..e536fd531e --- /dev/null +++ b/tests/python_tests/requirements.txt @@ -0,0 +1,4 @@ +pytest +transformers +torch +optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel.git@fb1b35bef23242d65b2fb057c4a7ac78a7cfd4c3 \ No newline at end of file diff --git a/tests/python_tests/test_generate_api.py b/tests/python_tests/test_generate_api.py new file mode 100644 index 0000000000..1d46e227c9 --- /dev/null +++ b/tests/python_tests/test_generate_api.py @@ -0,0 +1,115 @@ +# Copyright (C) 2023-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from list_test_models import models_list + + +@pytest.fixture(scope="module", params=models_list()) +def model_fixture(request): + model_id, path = request.param + from transformers import AutoTokenizer, AutoModelForCausalLM + tokenizer = AutoTokenizer.from_pretrained(model_id) + model = AutoModelForCausalLM.from_pretrained(model_id) + return model_id, path, tokenizer, model + +def run_hf_ov_genai_comparison(model_fixture, generation_config, prompt): + model_id, path, tokenizer, model = model_fixture + + generation_config_hf = generation_config.copy() + # in OpenVINO GenAI this parameter is called stop_criteria, + # while in HF it's called early_stopping. + # HF values True, False and "never" correspond to OV GenAI values "early", "heuristic" and "never" + if generation_config_hf.get('stop_criteria'): + generation_config_hf['early_stopping'] = stop_criteria_map()[generation_config_hf.pop('stop_criteria')] + + encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=True) + hf_encoded_output = model.generate(encoded_prompt, **generation_config_hf) + hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:]) + + device = 'CPU' + ov_tokenizers_path = '../../build/openvino_tokenizers/src/' + import openvino_genai as ov_genai + + pipe = ov_genai.LLMPipeline(path, device, {}, ov_tokenizers_path) + ov_output = pipe.generate(prompt, **generation_config) + + if hf_output != ov_output: + print(f'hf_output: {hf_output}') + print(f'ov_output: {ov_output}') + + assert hf_output == ov_output + + +def stop_criteria_map(): + return {"never": "never", "early": True, "heuristic": False} + +test_cases = [ + (dict(max_new_tokens=20, do_sample=False), 'table is made of'), # generation_config, prompt + # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), + # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=20, diversity_penalty=1.0), 'Alan Turing was a'), + # (dict(num_beam_groups=3, num_beams=15, num_return_sequences=15, max_new_tokens=30, diversity_penalty=1.0), 'Alan Turing was a'), + # (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'table is made of'), + # (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.0), 'The Sun is yellow because'), + # (dict(num_beam_groups=2, num_beams=8, num_return_sequences=8, max_new_tokens=20, diversity_penalty=1.5), 'The Sun is yellow because'), +] +@pytest.mark.parametrize("generation_config,prompt", test_cases) +def test_greedy_decoding(model_fixture, generation_config, prompt): + run_hf_ov_genai_comparison(model_fixture, generation_config, prompt) + + +prompts = ['The Sun is yellow because', 'Alan Turing was a', 'table is made of'] 
+@pytest.mark.parametrize("num_beam_groups", [2, 3, 8])
+@pytest.mark.parametrize("group_size", [5, 3, 10])
+@pytest.mark.parametrize("max_new_tokens", [20, 15])
+@pytest.mark.parametrize("diversity_penalty", [1.0, 1.5])
+@pytest.mark.parametrize("prompt", prompts)
+@pytest.mark.skip  # temporarily
+def test_beam_search_decoding(model_fixture, num_beam_groups, group_size,
+                              max_new_tokens, diversity_penalty, prompt):
+    generation_config = dict(
+        num_beam_groups=num_beam_groups,
+        num_beams=num_beam_groups * group_size,
+        diversity_penalty=diversity_penalty,
+        num_return_sequences=num_beam_groups * group_size,
+        max_new_tokens=max_new_tokens,
+    )
+    run_hf_ov_genai_comparison(model_fixture, generation_config, prompt)
+
+
+@pytest.mark.parametrize("stop_criteria", ["never", "early", "heuristic"])
+@pytest.mark.parametrize("prompt", prompts)
+@pytest.mark.parametrize("max_new_tokens", [20, 40, 300])
+@pytest.mark.skip  # temporarily
+def test_stop_criteria(model_fixture, stop_criteria, prompt, max_new_tokens):
+    # todo: for long sentences early stop_criteria fails
+    if (stop_criteria == 'early' and max_new_tokens >= 300):
+        pytest.skip()
+    generation_config = dict(
+        num_beam_groups=2,
+        num_beams=2 * 3,
+        diversity_penalty=1.0,
+        num_return_sequences=2 * 3,
+        max_new_tokens=max_new_tokens,
+        stop_criteria=stop_criteria,
+    )
+    run_hf_ov_genai_comparison(model_fixture, generation_config, prompt)
+
+
+# test long sequences
+@pytest.mark.parametrize("num_beam_groups", [2])
+@pytest.mark.parametrize("group_size", [5])
+@pytest.mark.parametrize("max_new_tokens", [800, 2000])
+@pytest.mark.parametrize("diversity_penalty", [1.0])
+@pytest.mark.parametrize("prompt", prompts)
+@pytest.mark.skip  # will be enabled in nightly since these tests are computationally expensive
+def test_beam_search_long_sentences(model_fixture, num_beam_groups, group_size,
+                                    max_new_tokens, diversity_penalty, prompt):
+    generation_config = dict(
+        num_beam_groups=num_beam_groups,
+        num_beams=num_beam_groups * group_size,
+        diversity_penalty=1.0,
+        num_return_sequences=num_beam_groups * group_size,
+        max_new_tokens=max_new_tokens,
+    )
+    run_hf_ov_genai_comparison(model_fixture, generation_config, prompt)
diff --git a/text_generation/causal_lm/cpp/CMakeLists.txt b/text_generation/causal_lm/cpp/CMakeLists.txt
index 07d91e6d3b..7e3ec23fde 100644
--- a/text_generation/causal_lm/cpp/CMakeLists.txt
+++ b/text_generation/causal_lm/cpp/CMakeLists.txt
@@ -10,7 +10,7 @@ else()
     set(OPENVINO_TOKENIZERS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../bin/openvino_tokenizers.dll)  # TODO: I'll go away after the generate() gets a way to find openvino_tokenizers
 endif()

-find_package(openvino_genai REQUIRED PATHS
+find_package(OpenVINOGenAI REQUIRED PATHS
     "${CMAKE_BINARY_DIR}"  # Reuse the package from the build.
     ${OpenVINO_DIR}  # GenAI may be installed alogside OpenVINO.
 )
@@ -46,14 +46,13 @@ set_target_properties(prompt_lookup_decoding_lm PROPERTIES CXX_STANDARD_REQUIRED
 find_package(TBB REQUIRED COMPONENTS tbb)
 target_link_libraries(prompt_lookup_decoding_lm PRIVATE TBB::tbb)

-add_executable(generate_sample generate_pipeline/generate_sample.cpp)
-target_link_libraries(generate_sample PRIVATE openvino::genai)
-target_include_directories(generate_sample PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
-set_target_properties(generate_sample PROPERTIES CXX_STANDARD 17)
-set_target_properties(generate_sample PROPERTIES CXX_STANDARD_REQUIRED ON)
-
-add_executable(chat_sample generate_pipeline/chat_sample.cpp)
+add_executable(chat_sample chat_sample.cpp)
 target_link_libraries(chat_sample PRIVATE openvino::genai)
 target_include_directories(chat_sample PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
 set_target_properties(chat_sample PROPERTIES CXX_STANDARD 17)
 set_target_properties(chat_sample PROPERTIES CXX_STANDARD_REQUIRED ON)
+
+install(TARGETS greedy_causal_lm beam_search_causal_lm speculative_decoding_lm prompt_lookup_decoding_lm chat_sample
+    RUNTIME DESTINATION samples_bin/
+    COMPONENT samples_bin
+    EXCLUDE_FROM_ALL)
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp b/text_generation/causal_lm/cpp/chat_sample.cpp
similarity index 100%
rename from text_generation/causal_lm/cpp/generate_pipeline/chat_sample.cpp
rename to text_generation/causal_lm/cpp/chat_sample.cpp
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
deleted file mode 100644
index 84e07c394b..0000000000
--- a/text_generation/causal_lm/cpp/generate_pipeline/generate_sample.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-// Copyright (C) 2023-2024 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-
-#include "openvino/genai/llm_pipeline.hpp"
-
-using std::cout;
-using std::endl;
-
-int main(int argc, char* argv[]) {
-    if (2 > argc && argc > 4)
-        throw std::runtime_error(std::string{"Usage: "} + argv[0] + " <MODEL_DIR> \"<PROMPT>\" <DEVICE>");
-    std::string model_path = argv[1];
-
-    std::string prompt = "table is made of ";
-    std::string device = "CPU"; // can be replaced with GPU
-
-    if (argc > 2)
-        prompt = argv[2];
-    if (argc > 3)
-        device = argv[3];
-
-    // Example 1: Simplest example with greedy search
-    // Model, tokenizer and generation_config.json will be loaded from the model_path.
-    // If generation_config.json is not found default velues for gready search will be used
-
-    // ov::streamer_lambda([](std::string subword){std::cout << subword << std::flush;})
-    ov::LLMPipeline pipe(model_path, device);
-    // cout << prompt << pipe(prompt, ov::max_new_tokens(1000)) << endl;
-
-    // todo: syntactic sugar to specify generation configs in place
-    // cout << prompt << pipe(prompt, ov::max_new_tokens(100)) << endl;
-
-
-    auto tokenizer = ov::Tokenizer(model_path);
-    auto [input_ids, attention_mask] = tokenizer.encode("table is made of ");
-    auto resuling_tokens = pipe.generate(input_ids, ov::max_new_tokens(1000));
-    cout << tokenizer.decode(resuling_tokens.tokens[0]) << endl;
-
-    // Example 2: Modifying generation_cofnig to use grouped beam search
-    ov::GenerationConfig config = pipe.get_generation_config();
-    config.max_new_tokens = 100;
-    config.num_beams = 15;
-    config.num_beam_groups = 3;
-    // cout << prompt << pipe(prompt, config) << endl;
-
-    // cout << endl << "grouped beam search generated candidates:" << endl;
-    // for (int i = 0; i < num_return_sequences; ++i)
-    //     will return vector with num_return_sequences strings
-    // auto num_return_sequences = 3;
-
-    // // Example 3: Greedy Decoding with multiple batch
-    // pipe = ov::LLMPipeline(model_path, device);
-    // config = pipe.generation_config();
-
-    // cout << endl << "greedy decoding with multiple batches:" << endl;
-    // std::vector<std::string> prompts = {"table is made of", "Alan Turing was a", "1 + 1 = ", "Why is the Sun yellow?"};
-    // auto results = pipe(prompts, config.max_new_tokens(20));
-    // for (const auto& res: results)
-    //     std::cout << res.text << std::endl;
-
-    // // Example 4: Calling tokenizer/detokenizer manually and getting beam scores for all candidates
-    // pipe = ov::LLMPipeline(model_path);
-    // auto [input_ids, attention_mask] = pipe.get_tokenizer().tokenize({prompt});
-    // config = GenerationConfig::beam_search();
-    // // config for grouped beam search
-    // config.max_new_tokens(30).num_groups(3).group_size(5).num_return_sequences(15);
-
-    // cout << endl << "beam search with printing of all candidates:" << endl;
-    // auto beams = pipe.generate(input_ids, attention_mask, config);
-    // for (size_t i = 0; i < beams.scores.size(); i++) {
-    //     std::cout << beams.scores[i] << ": " << pipe.get_tokenizer().detokenize(beams.tokens[i]) << std::endl;
-    // }
-
-    // // for (const auto& beam : beams.second)
-    // //     std::cout << beam.first << ": " << pipe.detokenize(beam.second) << std::endl;
-
-    // {
-    //     // Example 5: Speculative sampling
-    //     std::string assitive_model_path = "text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16";
-    //     pipe = ov::LLMPipeline(model_path);
-    //     auto [input_ids, attention_mask] = pipe.get_tokenizer().tokenize({prompt});
-    //     // config = GenerationConfig::assistive_decoding(assitive_model_path).num_assistant_tokens(5).max_new_tokens(20);
-    //     pipe.generation_config().assistant_model(assitive_model_path);
-
-    //     cout << endl << "Speculative sampling with TinyLlama assistance:" << endl;
-    //     auto results = pipe.generate(input_ids, attention_mask, config);
-    //     for (size_t i = 0; i < beams.scores.size(); i++) {
-    //     for (const auto& result : results)
-    //         std::cout << pipe.get_tokenizer().detokenize(result.tokens) << std::endl;
-    //     }
-    // }
-
-    return 0;
-}
diff --git a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp
index 7b1dde4dc8..e410d170ca 100644
--- a/text_generation/causal_lm/cpp/greedy_causal_lm.cpp
+++ b/text_generation/causal_lm/cpp/greedy_causal_lm.cpp
@@ -17,6 +17,7 @@ int main(int argc, char* argv[]) try {
     ov::LLMPipeline pipe(model_path, device);
     ov::GenerationConfig config = pipe.get_generation_config();
     config.max_new_tokens = 100;
+    config.do_sample = false;
     auto streamer = [](std::string subword){std::cout << subword << std::flush;};

     // since streamer is set results will be printed each time a new token is generated
diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers
index c754503462..b45b752edf 160000
--- a/thirdparty/openvino_tokenizers
+++ b/thirdparty/openvino_tokenizers
@@ -1 +1 @@
-Subproject commit c754503462f569b648b598d57ff91ea57bb8deb1
+Subproject commit b45b752edf0245f65bcc0c2c6925b771fe55c4b5
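
Reviewer note: a minimal usage sketch of the Python API exercised by the new tests above. The model directory, device, and ov_tokenizers_path values are assumptions copied from the defaults in tests/python_tests/test_generate_api.py (a model exported beforehand, e.g. via optimum-cli, plus a local openvino_tokenizers build), not requirements of the API.

    # Assumed paths: an exported TinyLlama model and the openvino_tokenizers build output.
    import openvino_genai as ov_genai

    model_path = "TinyLlama-1.1B-Chat-v1.0"                       # assumed local export directory
    device = "CPU"
    ov_tokenizers_path = "../../build/openvino_tokenizers/src/"   # assumed build layout from the tests

    # LLMPipeline(path, device, plugin_config, ov_tokenizers_path), as bound in py_generate_pipeline above.
    pipe = ov_genai.LLMPipeline(model_path, device, {}, ov_tokenizers_path)
    print(pipe.generate("table is made of", max_new_tokens=20, do_sample=False))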