
Commit 7953c0f
Merge branch 'generate_pipeline' into fix-archive
2 parents: 2486e53 + 75b7c37
26 files changed: +432 -302 lines

.github/workflows/causal_lm_cpp.yml

+2-2
@@ -194,8 +194,8 @@ jobs:
       shell: cmd
       run: |
         call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat
-        .\build\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt
+        .\build\text_generation\causal_lm\cpp\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt
         echo import transformers > ref.py
         echo predictions = open('pred.txt', 'r').read() >> ref.py
         echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') >> ref.py

.github/workflows/genai_python_lib.yml

+18-6
@@ -2,7 +2,7 @@ name: genai_python_lib
 on: pull_request
 jobs:
   ubuntu_genai_python_lib:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-20.04-16-cores
     steps:
     - uses: actions/checkout@v4
       with:
@@ -16,9 +16,20 @@ jobs:
     - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
     - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j
     - run: python -m pip install --pre openvino --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly # Can't load CentOS libraries from the archive
-    - run: PYTHONPATH=./src/python/ python -c "from openvino_genai.py_generate_pipeline import LLMPipeline"
+    - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+    - run: PYTHONPATH=./src/python/ python -c "from openvino_genai import LLMPipeline"
     - run: source ./ov/setupvars.sh && python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-    - run: python -c "from openvino_genai.py_generate_pipeline import LLMPipeline"
+    - run: python -c "from openvino_genai import LLMPipeline"
+    - name: GenAI Python API tests
+      run: |
+        source ./ov/setupvars.sh
+        cd ./tests/python_tests/
+        python -m pip install -r requirements.txt
+        models=$(python list_test_models.py)
+        echo "$models" | while read -r model_name model_path; do
+          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model "$model_name" "$model_path"
+        done
+        python -m pytest test_generate_api.py

   windows_genai_python_lib:
     runs-on: windows-latest
@@ -37,6 +48,7 @@ jobs:
     - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
     - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j
     - run: python -m pip install "numpy<1.27"
-    - run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -c "from openvino_genai.py_generate_pipeline import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
-    - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install .
-    - run: python -c "from openvino_genai.py_generate_pipeline import LLMPipeline"
+    - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+    - run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -c "from openvino_genai import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
+    - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install .
+    - run: python -c "from openvino_genai import LLMPipeline"

.gitmodules

-6
@@ -1,9 +1,3 @@
 [submodule "thirdparty/openvino_tokenizers"]
     path = thirdparty/openvino_tokenizers
     url = https://github.com/openvinotoolkit/openvino_tokenizers.git
-[submodule "thirdparty/nlohmann_json"]
-    path = thirdparty/nlohmann_json
-    url = https://github.com/nlohmann/json.git
-[submodule "thirdparty/Jinja2Cpp"]
-    path = thirdparty/Jinja2Cpp
-    url = https://github.com/jinja2cpp/Jinja2Cpp

CMakeLists.txt

+8-2
@@ -4,8 +4,14 @@

 cmake_minimum_required(VERSION 3.15)

-set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake build type")
-set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Release" "Debug" "RelWithDebInfo" "MinSizeRel")
+# Multi config generators such as Visual Studio ignore CMAKE_BUILD_TYPE. Multi config generators are configured with
+# CMAKE_CONFIGURATION_TYPES, but limiting options in it completely removes such build options
+get_property(GENERATOR_IS_MULTI_CONFIG_VAR GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+if(NOT GENERATOR_IS_MULTI_CONFIG_VAR AND NOT DEFINED CMAKE_BUILD_TYPE)
+    message(STATUS "CMAKE_BUILD_TYPE is not defined, 'Release' will be used")
+    # Setting CMAKE_BUILD_TYPE as CACHE must go before project(). Otherwise project() sets its value and set() doesn't take an effect
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel ...")
+endif()

 project(openvino_genai VERSION 2024.2.0.0)

text_generation/causal_lm/cpp/generate_pipeline/README.md → src/README.md

+52-40
@@ -2,27 +2,41 @@

 ## Usage

-Firs of all you need to convert your model with optimum-cli
+First of all you need to convert your model with optimum-cli
 ``` sh
 optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format fp16 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0"
 pip install openvino-genai
 ```

 LLMPipeline is the main object used for decoding. You can initialize it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and the default generation configuration.

-### In Python
+### Python

 A minimalist example:
 ```python
-import py_generate_pipeline as genai # set more friendly module name
-pipe = genai.LLMPipeline(model_path, "CPU")
+import openvino_genai as ov_genai
+pipe = ov_genai.LLMPipeline(model_path, "CPU")
 print(pipe.generate("The Sun is yellow because"))
 ```

+Calling generate with custom generation config parameters, e.g. a config for grouped beam search:
+```python
+import openvino_genai as ov_genai
+pipe = ov_genai.LLMPipeline(model_path, "CPU")
+
+res = pipe.generate("The Sun is yellow because", max_new_tokens=30, num_groups=3, group_size=5)
+print(res)
+```
+
+output:
+```
+'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color. The arrangement of carbon atoms in'
+```
+
 A simple chat in Python:
 ```python
 import openvino_genai as ov_genai
 pipe = ov_genai.LLMPipeline(model_path)

 config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.1}
 pipe.set_generation_config(config)
@@ -39,60 +53,45 @@ pipe.finish_chat()
 ```

 Test to compare with Huggingface outputs
-```python
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-
-max_new_tokens = 32
-prompt = 'table is made of'
-
-encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False)
-hf_encoded_output = model.generate(encoded_prompt, max_new_tokens=max_new_tokens, do_sample=False)
-hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:])
-print(f'hf_output: {hf_output}')
-
-import sys
-sys.path.append('build-Debug/')
-import py_generate_pipeline as genai # set more friendly module name
-
-pipe = genai.LLMPipeline('text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/')
-ov_output = pipe(prompt, max_new_tokens=max_new_tokens)
-print(f'ov_output: {ov_output}')

-assert hf_output == ov_output
-
-```
-
-### In C++
+### C++

 Minimalistic example
 ```cpp
+#include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>
+
 int main(int argc, char* argv[]) {
     std::string model_path = argv[1];
     ov::LLMPipeline pipe(model_path, "CPU");
-    cout << pipe.generate("The Sun is yellow bacause");
+    std::cout << pipe.generate("The Sun is yellow because");
 }
 ```

 Using Group Beam Search Decoding
 ```cpp
+#include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>
+
 int main(int argc, char* argv[]) {
     std::string model_path = argv[1];
     ov::LLMPipeline pipe(model_path, "CPU");
+
     ov::GenerationConfig config = pipe.get_generation_config();
     config.max_new_tokens = 256;
     config.num_groups = 3;
     config.group_size = 5;
     config.diversity_penalty = 1.0f;

-    cout << pipe.generate("The Sun is yellow bacause", config);
+    std::cout << pipe.generate("The Sun is yellow because", config);
 }
 ```

 A simple chat in C++
 ``` cpp
+#include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>
+
 int main(int argc, char* argv[]) {
     std::string prompt;

@@ -142,24 +141,38 @@ int main(int argc, char* argv[]) {
 Streaming example with a lambda function

 ``` cpp
-int main(int argc, char* argv[]) {
-    auto streamer = [](std::string word) { std::cout << word << std::flush; };

+#include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>
+
+int main(int argc, char* argv[]) {
     std::string model_path = argv[1];
     ov::LLMPipeline pipe(model_path, "CPU");
-    cout << pipe.generate("The Sun is yellow bacause", streamer);
+
+    auto streamer = [](std::string word) { std::cout << word << std::flush; };
+    std::cout << pipe.generate("The Sun is yellow because", streamer);
 }
 ```

 Streaming with a custom class
 ``` cpp
 #include <streamer_base.hpp>
+#include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>

 class CustomStreamer: public StreamerBase {
 public:
-    void put(int64_t token) {/* decode tokens and do process them*/};
-
-    void end() {/* decode tokens and do process them*/};
+    void put(int64_t token) {
+        /* custom decoding/token-processing code, e.g.:
+        tokens_cache.push_back(token);
+        std::string text = m_tokenizer.decode(tokens_cache);
+        ...
+        */
+    };
+
+    void end() {
+        /* custom finalization */
+    };
 };

 int main(int argc, char* argv[]) {
@@ -170,4 +183,3 @@ int main(int argc, char* argv[]) {
     std::cout << pipe.generate("The Sun is yellow because", custom_streamer);
 }
 ```
-

src/cpp/CMakeLists.txt

+9-4
@@ -13,8 +13,8 @@ FetchContent_MakeAvailable(nlohmann_json)

 function(ov_genai_build_jinja2cpp)
     FetchContent_Declare(jinja2cpp
-        URL https://github.com/ilya-lavrenov/Jinja2Cpp/archive/a5d002cbf44469775556daea14ba3ccdba1e365a.tar.gz
-        URL_HASH SHA256=5aa5378d9acf3c44dfb607fd7f16f48b17ffa6495c219957901e9191ffe28900)
+        URL https://github.com/ilya-lavrenov/Jinja2Cpp/archive/5433af6b225cd35df700023cf60df4acdd6cbcf3.tar.gz
+        URL_HASH SHA256=b90f6c44908beaacae8eeb2690d11a6ebb183b4560434698ac00017e7bc07d11)

     FetchContent_GetProperties(jinja2cpp)
     if(NOT jinja2cpp_POPULATED)
@@ -49,8 +49,6 @@ add_library(${TARGET_NAME} SHARED ${SOURCE_FILES})
 add_library(openvino::${TARGET_NAME} ALIAS ${TARGET_NAME})

 target_include_directories(${TARGET_NAME}
-    # TODO: remove it, because beam_search algo should not be exposed to end users
-    PRIVATE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../text_generation/causal_lm/cpp/>"
     PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" "$<INSTALL_INTERFACE:runtime/include>")

 target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json jinja2cpp)
@@ -76,6 +74,13 @@ install(TARGETS ${TARGET_NAME}
     LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}
     RUNTIME DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR})

+# Copy libcore_tokenizers.so to build_dir/openvino_tokenizers/src/
+add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    COMMAND "${CMAKE_COMMAND}" -E copy
+        "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.so"
+        "${CMAKE_BINARY_DIR}/openvino_tokenizers/src/"
+    COMMENT "Copy libcore_tokenizers.so to build_dir/openvino_tokenizers/src/")
+
 # - Windows: `<openvino_dir>\runtime\bin\intel64\Release\`
 # - MacOS_x86: `<openvino_dir>/runtime/lib/intel64/Release`
 # - MacOS_arm64: `<openvino_dir>/runtime/lib/arm64/Release/`

src/cpp/include/openvino/genai/generation_config.hpp

+9-8
@@ -14,9 +14,10 @@
 namespace ov {

 /**
- * @brief controls the stopping condition for grouped beam search. The following values are possible:
- * "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an
- * heuristic is applied and the generation stops when is it very unlikely to find better candidates;
+ * @brief controls the stopping condition for grouped beam search. The following values are possible:
+ * "early" stops as soon as there are `num_beams` complete candidates.
+ * "heuristic" stops when it is unlikely to find better candidates.
+ * "never" stops when there cannot be better candidates.
  */
 enum class StopCriteria { early, heuristic, never };

@@ -25,11 +26,11 @@ enum class StopCriteria { early, heuristic, never };
  *
  * @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
  * `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set.
- * @param max_new_tokens the maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.
+ * @param max_new_tokens the maximum number of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
  * @param ignore_eos if set to true, then generation will not stop even if <eos> token is met.
- * @param num_beams number of beams for beam search. 1 means no beam search.
+ * @param num_beams number of beams for beam search. 1 disables beam search.
  * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
- * @param diversity_penalty this value is subtracted from a beam's score if it generates a token same as any beam from other group at a
+ * @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from another group at a
  * particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled.
  * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
  * the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
@@ -42,11 +43,11 @@ enum class StopCriteria { early, heuristic, never };
  * heuristic is applied and the generation stops when it is very unlikely to find better candidates;
  * "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
  * @param temperature the value used to modulate token probabilities for random sampling
- * @param top_p if set to float < 1, only the smallest set of most probable tokens with probabilities
+ * @param top_p if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
  * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
  * @param do_sample whether or not to use multinomial random sampling
  * that add up to `top_p` or higher are kept.
- * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
+ * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. See https://arxiv.org/pdf/1909.05858.
  * @param pad_token_id id of padding token
  * @param bos_token_id id of <bos> token
  * @param eos_token_id id of <eos> token
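
For orientation, these are the fields a caller tunes through ov::GenerationConfig before passing it to generate(). A minimal C++ sketch of sampling-oriented settings follows; get_generation_config() and generate(prompt, config) come from the README examples in this same commit, while the member names (do_sample, temperature, top_k, top_p, repetition_penalty) are assumed to mirror the @param tags above rather than being shown in this diff.

```cpp
// Sketch only: member names below mirror the @param tags in generation_config.hpp
// and are assumed, not confirmed by this diff.
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <string>

int main(int argc, char* argv[]) {
    std::string model_path = argv[1];
    ov::LLMPipeline pipe(model_path, "CPU");

    ov::GenerationConfig config = pipe.get_generation_config();
    config.max_new_tokens = 64;        // counted on top of the prompt; takes priority over max_length
    config.do_sample = true;           // switch from greedy decoding to multinomial sampling
    config.temperature = 0.7f;         // soften the token distribution
    config.top_k = 50;                 // keep only the 50 most probable tokens
    config.top_p = 0.9f;               // nucleus sampling: smallest set with cumulative probability >= 0.9
    config.repetition_penalty = 1.1f;  // values > 1.0 discourage repetition

    std::cout << pipe.generate("The Sun is yellow because", config);
}
```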

src/cpp/include/openvino/genai/llm_pipeline.hpp

+26-5
@@ -39,6 +39,24 @@ class DecodedResults {
 public:
     std::vector<std::string> texts;
     std::vector<float> scores;
+
+    // @brief Convert DecodedResults to a vector of strings.
+    // @return A std::vector<std::string> containing the texts from the DecodedResults object.
+    operator std::vector<std::string>() const {
+        return texts;
+    }
+
+    // @brief Overloads operator<< to print the contents of DecodedResults.
+    // @return A reference to the output stream with the concatenated texts.
+    friend std::ostream& operator<<(std::ostream& os, const DecodedResults& dr) {
+        for (size_t i = 0; i < dr.texts.size(); ++i) {
+            os << dr.texts[i];
+            if (i != dr.texts.size() - 1) {
+                os << std::endl;
+            }
+        }
+        return os;
+    }
 };

 /**
@@ -47,13 +65,15 @@ class DecodedResults {
 class OPENVINO_GENAI_EXPORTS LLMPipeline {
 public:
     /**
-    * @brief Constructs a LLMPipeline when convert model xml/bin files, tokenizers and configuration and in the same dir.
+    * @brief Constructs an LLMPipeline from xml/bin files, tokenizers and configuration in the same dir.
     *
     * @param model_path Path to the dir with model xml/bin files, tokenizers and generation_configs.json
     * @param device optional device
     * @param plugin_config optional plugin_config
     */
-    LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& plugin_config={});
+    LLMPipeline(std::string& path, std::string device="CPU",
+                const ov::AnyMap& plugin_config={},
+                const std::string& ov_tokenizers_path="");

     /**
     * @brief Constructs an LLMPipeline when ov::Tokenizer is initialized manually using files from different dirs.
@@ -67,7 +87,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
         const std::string model_path,
         const ov::Tokenizer& tokenizer,
         const std::string device="CPU",
-        const ov::AnyMap& plugin_config = {}
+        const ov::AnyMap& plugin_config = {},
+        const std::string& ov_tokenizers_path=""
     );

     ~LLMPipeline();
@@ -84,8 +105,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {

     template <typename... Properties>
     util::EnableIfAllStringAny<std::string, Properties...> generate(
-        std::string text,
-        Properties&&... properties) {
+        std::string text,
+        Properties&&... properties) {
         return generate(text, AnyMap{std::forward<Properties>(properties)...});
     }
     std::string generate(std::string text, const ov::AnyMap& config);
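
The DecodedResults additions above give two ways to consume a multi-candidate result: an implicit conversion to std::vector<std::string> and a streaming operator<< that joins the texts with newlines. A small usage sketch, assuming DecodedResults sits in the ov namespace alongside LLMPipeline (the diff does not show the enclosing namespace):

```cpp
// Sketch: exercises only the DecodedResults helpers added in this commit.
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <string>
#include <vector>

int main() {
    ov::DecodedResults results;
    results.texts = {"first candidate", "second candidate"};
    results.scores = {-1.2f, -1.5f};

    // operator<< prints the texts separated by newlines.
    std::cout << results << "\n";

    // The conversion operator yields just the texts.
    std::vector<std::string> texts = results;
    std::cout << "candidates: " << texts.size() << "\n";
}
```

Note also that both LLMPipeline constructors gain a trailing ov_tokenizers_path argument with an empty-string default, so existing call sites that omit it keep compiling.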
