
Commit 7953c0f
Merge branch 'generate_pipeline' into fix-archive
2 parents: 2486e53 + 75b7c37
26 files changed: +432 -302 lines

.github/workflows/causal_lm_cpp.yml

+2-2
@@ -194,8 +194,8 @@ jobs:
       shell: cmd
       run: |
         call w_openvino_toolkit_windows_2024.1.0.15008.f4afc983258_x86_64\setupvars.bat
-        .\build\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt
+        .\build\text_generation\causal_lm\cpp\Release\beam_search_causal_lm.exe .\TinyLlama-1.1B-Chat-v1.0\ "69" > .\pred.txt
         echo import transformers > ref.py
         echo predictions = open('pred.txt', 'r').read() >> ref.py
         echo tokenizer = transformers.LlamaTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0') >> ref.py

.github/workflows/genai_python_lib.yml

+18-6
@@ -2,7 +2,7 @@ name: genai_python_lib
 on: pull_request
 jobs:
   ubuntu_genai_python_lib:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-20.04-16-cores
     steps:
     - uses: actions/checkout@v4
       with:
@@ -16,9 +16,20 @@ jobs:
     - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
     - run: source ./ov/setupvars.sh && cmake --build ./build/ --config Release -j
     - run: python -m pip install --pre openvino --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly # Can't load CentOS libraries from the archive
-    - run: PYTHONPATH=./src/python/ python -c "from openvino_genai.py_generate_pipeline import LLMPipeline"
+    - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+    - run: PYTHONPATH=./src/python/ python -c "from openvino_genai import LLMPipeline"
     - run: source ./ov/setupvars.sh && python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
-    - run: python -c "from openvino_genai.py_generate_pipeline import LLMPipeline"
+    - run: python -c "from openvino_genai import LLMPipeline"
+    - name: GenAI Python API tests
+      run: |
+        source ./ov/setupvars.sh
+        cd ./tests/python_tests/
+        python -m pip install -r requirements.txt
+        models=$(python list_test_models.py)
+        echo "$models" | while read -r model_name model_path; do
+          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model "$model_name" "$model_path"
+        done
+        python -m pytest test_generate_api.py

   windows_genai_python_lib:
     runs-on: windows-latest
@@ -37,6 +48,7 @@ jobs:
     - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
     - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j
     - run: python -m pip install "numpy<1.27"
-    - run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -c "from openvino_genai.py_generate_pipeline import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
-    - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install .
-    - run: python -c "from openvino_genai.py_generate_pipeline import LLMPipeline"
+    - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers]
+    - run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -c "from openvino_genai import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
+    - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install .
+    - run: python -c "from openvino_genai import LLMPipeline"

.gitmodules

-6
@@ -1,9 +1,3 @@
 [submodule "thirdparty/openvino_tokenizers"]
     path = thirdparty/openvino_tokenizers
     url = https://github.com/openvinotoolkit/openvino_tokenizers.git
-[submodule "thirdparty/nlohmann_json"]
-    path = thirdparty/nlohmann_json
-    url = https://github.com/nlohmann/json.git
-[submodule "thirdparty/Jinja2Cpp"]
-    path = thirdparty/Jinja2Cpp
-    url = https://github.com/jinja2cpp/Jinja2Cpp

CMakeLists.txt

+8-2
@@ -4,8 +4,14 @@

 cmake_minimum_required(VERSION 3.15)

-set(CMAKE_BUILD_TYPE "Release" CACHE STRING "CMake build type")
-set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Release" "Debug" "RelWithDebInfo" "MinSizeRel")
+# Multi config generators such as Visual Studio ignore CMAKE_BUILD_TYPE. Multi config generators are configured with
+# CMAKE_CONFIGURATION_TYPES, but limiting options in it completely removes such build options
+get_property(GENERATOR_IS_MULTI_CONFIG_VAR GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+if(NOT GENERATOR_IS_MULTI_CONFIG_VAR AND NOT DEFINED CMAKE_BUILD_TYPE)
+    message(STATUS "CMAKE_BUILD_TYPE is not defined, 'Release' will be used")
+    # Setting CMAKE_BUILD_TYPE as CACHE must go before project(). Otherwise project() sets its value and set() doesn't take an effect
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel ...")
+endif()

 project(openvino_genai VERSION 2024.2.0.0)

text_generation/causal_lm/cpp/generate_pipeline/README.md → src/README.md

+52-40
@@ -2,27 +2,41 @@

 ## Usage

-Firs of all you need to convert your model with optimum-cli
+First of all you need to convert your model with optimum-cli
 ``` sh
 optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format fp16 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0"
 pip install openvino-genai
 ```

 LLMPipeline is the main object used for decoding. You can initialize it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and the default generation configuration.

-### In Python
+### Python

 A minimalist example:
 ```python
-import py_generate_pipeline as genai # set more friendly module name
-pipe = genai.LLMPipeline(model_path, "CPU")
+import openvino_genai as ov_genai
+pipe = ov_genai.LLMPipeline(model_path, "CPU")
 print(pipe.generate("The Sun is yellow because"))
 ```

+Calling generate with custom generation config parameters, e.g. a config for grouped beam search:
+```python
+import openvino_genai as ov_genai
+pipe = ov_genai.LLMPipeline(model_path, "CPU")
+
+res = pipe.generate("The Sun is yellow because", max_new_tokens=30, num_groups=3, group_size=5)
+print(res)
+```
+
+output:
+```
+'it is made up of carbon atoms. The carbon atoms are arranged in a linear pattern, which gives the yellow color. The arrangement of carbon atoms in'
+```
+
 A simple chat in Python:
 ```python
 import openvino_genai as ov_genai
 pipe = ov_genai.LLMPipeline(model_path)

 config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.1}
 pipe.set_generation_config(config)
@@ -39,60 +53,45 @@ pipe.finish_chat()
 ```

 Test to compare with Huggingface outputs
-```python
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-
-max_new_tokens = 32
-prompt = 'table is made of'
-
-encoded_prompt = tokenizer.encode(prompt, return_tensors='pt', add_special_tokens=False)
-hf_encoded_output = model.generate(encoded_prompt, max_new_tokens=max_new_tokens, do_sample=False)
-hf_output = tokenizer.decode(hf_encoded_output[0, encoded_prompt.shape[1]:])
-print(f'hf_output: {hf_output}')
-
-import sys
-sys.path.append('build-Debug/')
-import py_generate_pipeline as genai # set more friendly module name
-
-pipe = genai.LLMPipeline('text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/')
-ov_output = pipe(prompt, max_new_tokens=max_new_tokens)
-print(f'ov_output: {ov_output}')

-assert hf_output == ov_output
-
-```
-
-### In C++
+### C++

 Minimalistic example
 ```cpp
+#include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>
+
 int main(int argc, char* argv[]) {
     std::string model_path = argv[1];
     ov::LLMPipeline pipe(model_path, "CPU");
-    cout << pipe.generate("The Sun is yellow bacause");
+    std::cout << pipe.generate("The Sun is yellow because");
 }
 ```

 Using Group Beam Search Decoding
 ```cpp
+#include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>
+
 int main(int argc, char* argv[]) {
     std::string model_path = argv[1];
     ov::LLMPipeline pipe(model_path, "CPU");
+
     ov::GenerationConfig config = pipe.get_generation_config();
     config.max_new_tokens = 256;
     config.num_groups = 3;
     config.group_size = 5;
     config.diversity_penalty = 1.0f;

-    cout << pipe.generate("The Sun is yellow bacause", config);
+    std::cout << pipe.generate("The Sun is yellow because", config);
 }
 ```

 A simple chat in C++
 ``` cpp
+#include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>
+
 int main(int argc, char* argv[]) {
     std::string prompt;

@@ -142,24 +141,38 @@ int main(int argc, char* argv[]) {
 Streaming example with a lambda function

 ``` cpp
-int main(int argc, char* argv[]) {
-    auto streamer = [](std::string word) { std::cout << word << std::flush; };

+#include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>
+
+int main(int argc, char* argv[]) {
     std::string model_path = argv[1];
     ov::LLMPipeline pipe(model_path, "CPU");
-    cout << pipe.generate("The Sun is yellow bacause", streamer);
+
+    auto streamer = [](std::string word) { std::cout << word << std::flush; };
+    std::cout << pipe.generate("The Sun is yellow because", streamer);
 }
 ```

 Streaming with a custom class
 ``` cpp
 #include <streamer_base.hpp>
+#include "openvino/genai/llm_pipeline.hpp"
+#include <iostream>

 class CustomStreamer: public StreamerBase {
 public:
-    void put(int64_t token) {/* decode tokens and do process them*/};
-
-    void end() {/* decode tokens and do process them*/};
+    void put(int64_t token) {
+        /* custom decoding/token-processing code, e.g.:
+        tokens_cache.push_back(token);
+        std::string text = m_tokenizer.decode(tokens_cache);
+        ...
+        */
+    };
+
+    void end() {
+        /* custom finalization */
+    };
 };

 int main(int argc, char* argv[]) {
@@ -170,4 +183,3 @@ int main(int argc, char* argv[]) {
     std::cout << pipe.generate("The Sun is yellow because", custom_streamer);
 }
 ```
-

src/cpp/CMakeLists.txt

+9-4
@@ -13,8 +13,8 @@ FetchContent_MakeAvailable(nlohmann_json)

 function(ov_genai_build_jinja2cpp)
     FetchContent_Declare(jinja2cpp
-        URL https://github.com/ilya-lavrenov/Jinja2Cpp/archive/a5d002cbf44469775556daea14ba3ccdba1e365a.tar.gz
-        URL_HASH SHA256=5aa5378d9acf3c44dfb607fd7f16f48b17ffa6495c219957901e9191ffe28900)
+        URL https://github.com/ilya-lavrenov/Jinja2Cpp/archive/5433af6b225cd35df700023cf60df4acdd6cbcf3.tar.gz
+        URL_HASH SHA256=b90f6c44908beaacae8eeb2690d11a6ebb183b4560434698ac00017e7bc07d11)

     FetchContent_GetProperties(jinja2cpp)
     if(NOT jinja2cpp_POPULATED)
@@ -49,8 +49,6 @@ add_library(${TARGET_NAME} SHARED ${SOURCE_FILES})
 add_library(openvino::${TARGET_NAME} ALIAS ${TARGET_NAME})

 target_include_directories(${TARGET_NAME}
-    # TODO: remove it, because beam_search algo should not be exposed to end users
-    PRIVATE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../text_generation/causal_lm/cpp/>"
     PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>" "$<INSTALL_INTERFACE:runtime/include>")

 target_link_libraries(${TARGET_NAME} PUBLIC openvino::runtime PRIVATE nlohmann_json::nlohmann_json jinja2cpp)
@@ -76,6 +74,13 @@ install(TARGETS ${TARGET_NAME}
     LIBRARY DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR}
     RUNTIME DESTINATION python/openvino_genai/ COMPONENT pygenai_${Python_VERSION_MAJOR}_${Python_VERSION_MINOR})

+# Copy libcore_tokenizers.so to build_dir/openvino_tokenizers/src/
+add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    COMMAND "${CMAKE_COMMAND}" -E copy
+        "${CMAKE_BINARY_DIR}/_deps/fast_tokenizer-src/lib/libcore_tokenizers.so"
+        "${CMAKE_BINARY_DIR}/openvino_tokenizers/src/"
+    COMMENT "Copy libcore_tokenizers.so to build_dir/openvino_tokenizers/src/")
+
 # - Windows: `<openvino_dir>\runtime\bin\intel64\Release\`
 # - MacOS_x86: `<openvino_dir>/runtime/lib/intel64/Release`
 # - MacOS_arm64: `<openvino_dir>/runtime/lib/arm64/Release/`

src/cpp/include/openvino/genai/generation_config.hpp

+9-8
@@ -14,9 +14,10 @@
 namespace ov {

 /**
- * @brief controls the stopping condition for grouped beam search. The following values are possible:
- * "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an
- * heuristic is applied and the generation stops when is it very unlikely to find better candidates;
+ * @brief controls the stopping condition for grouped beam search. The following values are possible:
+ * "early" stops as soon as there are `num_beams` complete candidates.
+ * "heuristic" stops when it is unlikely to find better candidates.
+ * "never" stops when there cannot be better candidates.
  */
 enum class StopCriteria { early, heuristic, never };

@@ -25,11 +26,11 @@ enum class StopCriteria { early, heuristic, never };
  *
  * @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
  * `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set.
- * @param max_new_tokens the maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.
+ * @param max_new_tokens the maximum number of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
  * @param ignore_eos if set to true, then generation will not stop even if <eos> token is met.
- * @param num_beams number of beams for beam search. 1 means no beam search.
+ * @param num_beams number of beams for beam search. 1 disables beam search.
  * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
- * @param diversity_penalty this value is subtracted from a beam's score if it generates a token same as any beam from other group at a
+ * @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from another group at a
  * particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled.
  * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
  * the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
@@ -42,11 +43,11 @@ enum class StopCriteria { early, heuristic, never };
  * heuristic is applied and the generation stops when it is very unlikely to find better candidates;
  * "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
  * @param temperature the value used to modulate token probabilities for random sampling
- * @param top_p if set to float < 1, only the smallest set of most probable tokens with probabilities
+ * @param top_p if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
  * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
  * @param do_sample whether or not to use multinomial random sampling
  * that add up to `top_p` or higher are kept.
- * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
+ * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. See https://arxiv.org/pdf/1909.05858.
  * @param pad_token_id id of padding token
  * @param bos_token_id id of <bos> token
  * @param eos_token_id id of <eos> token
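
For orientation, these are the fields a caller tunes through ov::GenerationConfig before passing it to generate(). A minimal C++ sketch of sampling-oriented settings follows; get_generation_config() and generate(prompt, config) come from the README examples in this same commit, while the member names (do_sample, temperature, top_k, top_p, repetition_penalty) are assumed to mirror the @param tags above rather than being shown in this diff.

```cpp
// Sketch only: member names below mirror the @param tags in generation_config.hpp
// and are assumed, not confirmed by this diff.
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <string>

int main(int argc, char* argv[]) {
    std::string model_path = argv[1];
    ov::LLMPipeline pipe(model_path, "CPU");

    ov::GenerationConfig config = pipe.get_generation_config();
    config.max_new_tokens = 64;        // counted on top of the prompt; takes priority over max_length
    config.do_sample = true;           // switch from greedy decoding to multinomial sampling
    config.temperature = 0.7f;         // soften the token distribution
    config.top_k = 50;                 // keep only the 50 most probable tokens
    config.top_p = 0.9f;               // nucleus sampling: smallest set with cumulative probability >= 0.9
    config.repetition_penalty = 1.1f;  // values > 1.0 discourage repetition

    std::cout << pipe.generate("The Sun is yellow because", config);
}
```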

src/cpp/include/openvino/genai/llm_pipeline.hpp

+26-5
@@ -39,6 +39,24 @@ class DecodedResults {
 public:
     std::vector<std::string> texts;
     std::vector<float> scores;
+
+    // @brief Convert DecodedResults to a vector of strings.
+    // @return A std::vector<std::string> containing the texts from the DecodedResults object.
+    operator std::vector<std::string>() const {
+        return texts;
+    }
+
+    // @brief Overloads operator<< to print the contents of DecodedResults.
+    // @return A reference to the output stream with the concatenated texts.
+    friend std::ostream& operator<<(std::ostream& os, const DecodedResults& dr) {
+        for (size_t i = 0; i < dr.texts.size(); ++i) {
+            os << dr.texts[i];
+            if (i != dr.texts.size() - 1) {
+                os << std::endl;
+            }
+        }
+        return os;
+    }
 };

 /**
@@ -47,13 +65,15 @@ class DecodedResults {
 class OPENVINO_GENAI_EXPORTS LLMPipeline {
 public:
     /**
-    * @brief Constructs a LLMPipeline when convert model xml/bin files, tokenizers and configuration and in the same dir.
+    * @brief Constructs an LLMPipeline from xml/bin files, tokenizers and configuration in the same dir.
     *
     * @param model_path Path to the dir with model xml/bin files, tokenizers and generation_configs.json
     * @param device optional device
     * @param plugin_config optional plugin_config
     */
-    LLMPipeline(std::string& path, std::string device="CPU", const ov::AnyMap& plugin_config={});
+    LLMPipeline(std::string& path, std::string device="CPU",
+                const ov::AnyMap& plugin_config={},
+                const std::string& ov_tokenizers_path="");

     /**
     * @brief Constructs an LLMPipeline when ov::Tokenizer is initialized manually using files from different dirs.
@@ -67,7 +87,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
         const std::string model_path,
         const ov::Tokenizer& tokenizer,
         const std::string device="CPU",
-        const ov::AnyMap& plugin_config = {}
+        const ov::AnyMap& plugin_config = {},
+        const std::string& ov_tokenizers_path=""
     );

     ~LLMPipeline();
@@ -84,8 +105,8 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {

     template <typename... Properties>
     util::EnableIfAllStringAny<std::string, Properties...> generate(
-        std::string text,
-        Properties&&... properties) {
+        std::string text,
+        Properties&&... properties) {
         return generate(text, AnyMap{std::forward<Properties>(properties)...});
     }
     std::string generate(std::string text, const ov::AnyMap& config);
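
The DecodedResults additions above give two ways to consume a multi-candidate result: an implicit conversion to std::vector<std::string> and a streaming operator<< that joins the texts with newlines. A small usage sketch, assuming DecodedResults sits in the ov namespace alongside LLMPipeline (the diff does not show the enclosing namespace):

```cpp
// Sketch: exercises only the DecodedResults helpers added in this commit.
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <string>
#include <vector>

int main() {
    ov::DecodedResults results;
    results.texts = {"first candidate", "second candidate"};
    results.scores = {-1.2f, -1.5f};

    // operator<< prints the texts separated by newlines.
    std::cout << results << "\n";

    // The conversion operator yields just the texts.
    std::vector<std::string> texts = results;
    std::cout << "candidates: " << texts.size() << "\n";
}
```

Note also that both LLMPipeline constructors gain a trailing ov_tokenizers_path argument with an empty-string default, so existing call sites that omit it keep compiling.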
