
Commit 140b59c

Merge branch 'generate_pipeline' into fix-abi

Committed May 27, 2024 · 2 parents 78666da + bbc8c25

23 files changed: +353 −353 lines

.github/workflows/genai_package.yml (+2 −2)
@@ -18,7 +18,7 @@ jobs:
 - run: sudo ./ov/install_dependencies/install_openvino_dependencies.sh
 - run: sudo apt-get install libtbb-dev
 - run: source ./ov/setupvars.sh && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/
-- run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package
+- run: source ./ov/setupvars.sh && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j
 - run: source ./ov/setupvars.sh && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix ov
 - run: ov/samples/cpp/build_samples.sh -i ${{ github.workspace }}/s\ pace
   if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build
@@ -49,7 +49,7 @@ jobs:
 - run: curl --output ov.zip https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2024.2.0-15349-765302e0de1/w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64.zip
 - run: unzip ov.zip
 - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} -S ./ -B ./build/
-- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package
+- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config ${{ matrix.build-type }} --target package -j
 - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --install ./build/ --config ${{ matrix.build-type }} --prefix w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64
 - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\samples\cpp\build_samples_msvc.bat -i "${{ github.workspace }}/samples_install"
   if: ${{ 'Release' == matrix.build-type }} # build_samples enforces Release build

.github/workflows/genai_python_lib.yml (+8 −1)
@@ -22,7 +22,7 @@ jobs:
 # build system doesn't. Install ./requirements-build.txt to detect possible conflicts.
 - run: source ./ov/setupvars.sh && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 - run: PYTHONPATH=./src/python/ python -c "from openvino_genai import LLMPipeline"
-- run: source ./ov/setupvars.sh && CMAKE_BUILD_PARALLEL_LEVEL="" python -m pip install .
+- run: source ./ov/setupvars.sh && CMAKE_BUILD_PARALLEL_LEVEL="" python -m pip install --pre . --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
 - run: python -c "from openvino_genai import LLMPipeline"
 - name: GenAI Python API tests
   run: |
@@ -51,9 +51,16 @@ jobs:
 - run: unzip ov.zip
 - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
 - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && cmake --build ./build/ --config Release -j
+<<<<<<< HEAD
 # GitHub Actions already provides what is listed in ./requirements-build.txt but the internal
 # build system doesn't. Install ./requirements-build.txt to detect possible conflicts.
 - run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+=======
+- run: python -m pip install "numpy<1.27"
+# GitHub Actions already provides what is listed in ./requirements-build.txt but the internal
+# build system doesn't. Install ./requirements-build.txt to detect possible conflicts.
+- run: call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./requirements-build.txt
+>>>>>>> generate_pipeline
 - run: set "PYTHONPATH=./src/python;" && call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -c "from openvino_genai import LLMPipeline" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
 - run: set CMAKE_BUILD_PARALLEL_LEVEL=&& call w_openvino_toolkit_windows_2024.2.0.dev20240515_x86_64\setupvars.bat && python -m pip install .
 - run: python -c "from openvino_genai import LLMPipeline"

src/README.md (+24 −46)
@@ -8,7 +8,7 @@ optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weigh
 pip install openvino-genai
 ```
 
-LLMPipeline is the main object used for decoding. You can initiliza it straigh away from the folder with the converted model. It will automanically load the main model, tokenizer, detokenizer and default generation configuration.
+`LLMPipeline` is the main object used for decoding. You can construct it straight away from the folder with the converted model. It will automatically load the main model, tokenizer, detokenizer and default generation configuration.
 
 ### Python
 
@@ -24,8 +24,8 @@ Calling generate with custom generation config parameters, e.g. config for group
 import openvino_genai as ov_genai
 pipe = ov_genai.LLMPipeline(model_path, "CPU")
 
-res = pipe.generate("The Sun is yellow bacause", max_new_tokens=30, num_groups=3, group_size=5)
-print(res)
+result = pipe.generate("The Sun is yellow bacause", max_new_tokens=30, num_groups=3, group_size=5, diversity_penalty=1.5)
+print(result)
 ```
 
 output:
@@ -38,7 +38,7 @@ A simples chat in python:
 import openvino_genai as ov_genai
 pipe = ov_ov_genai.LLMPipeline(model_path)
 
-config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.1}
+config = {'num_groups': 3, 'group_size': 5, 'diversity_penalty': 1.5}
 pipe.set_generation_cofnig(config)
 
 pipe.start_chat()
@@ -49,7 +49,6 @@ while True:
         break
     print(pipe(prompt))
 pipe.finish_chat()
-
 ```
 
 Test to compare with Huggingface outputs
@@ -63,7 +62,7 @@ Minimalistc example
 
 int main(int argc, char* argv[]) {
     std::string model_path = argv[1];
-    ov::LLMPipeline pipe(model_path, "CPU");
+    ov::genai::LLMPipeline pipe(model_path, "CPU");
     std::cout << pipe.generate("The Sun is yellow bacause");
 }
 ```
@@ -75,9 +74,9 @@ Using Group Beam Search Decoding
 
 int main(int argc, char* argv[]) {
     std::string model_path = argv[1];
-    ov::LLMPipeline pipe(model_path, "CPU");
+    ov::genai::LLMPipeline pipe(model_path, "CPU");
 
-    ov::GenerationConfig config = pipe.get_generation_config();
+    ov::genai::GenerationConfig config = pipe.get_generation_config();
     config.max_new_tokens = 256;
     config.num_groups = 3;
     config.group_size = 5;
@@ -87,7 +86,7 @@ int main(int argc, char* argv[]) {
 }
 ```
 
-A simplest chat in C++
+A simple chat in C++ using grouped beam search decoding
 ``` cpp
 #include "openvino/genai/llm_pipeline.hpp"
 #include <iostream>
@@ -96,71 +95,50 @@ int main(int argc, char* argv[]) {
     std::string prompt;
 
     std::string model_path = argv[1];
-    ov::LLMPipeline pipe(model_path, "CPU");
-
-    pipe.start_chat();
-    for (size_t i = 0; i < questions.size(); i++) {
-        std::cout << "question:\n";
-        std::getline(std::cin, prompt);
-
-        std::cout << pipe(prompt) << std::endl;
-    }
-    pipe.finish_chat();
-}
-```
-
-Specifying generation_config to use grouped beam search
-``` cpp
-int main(int argc, char* argv[]) {
-    std::string prompt;
-
-    std::string model_path = argv[1];
-    ov::LLMPipeline pipe(model_path, "CPU");
+    ov::genai::LLMPipeline pipe(model_path, "CPU");
 
-    ov::GenerationConfig config = pipe.get_generation_config();
+    ov::genai::GenerationConfig config = pipe.get_generation_config();
     config.max_new_tokens = 256;
     config.num_groups = 3;
     config.group_size = 5;
     config.diversity_penalty = 1.0f;
 
-    auto streamer = [](std::string word) { std::cout << word << std::flush; };
-
     pipe.start_chat();
-    for (size_t i = 0; i < questions.size(); i++) {
+    for (;;;) {
         std::cout << "question:\n";
-        cout << prompt << endl;
+        std::getline(std::cin, prompt);
+        if (prompt == "Stop!")
+            break;
 
-        auto answer = pipe(prompt, config, streamer);
-        // no need to print answer, streamer will do that
+        std::cout << "answer:\n";
+        auto answer = pipe(prompt, config);
+        std::cout << answer << std::endl;
     }
     pipe.finish_chat();
 }
 ```
 
-Streaming exapmle with lambda function
-
+Streaming example with lambda function
 ``` cpp
-
 #include "openvino/genai/llm_pipeline.hpp"
 #include <iostream>
 
 int main(int argc, char* argv[]) {
     std::string model_path = argv[1];
-    ov::LLMPipeline pipe(model_path, "CPU");
+    ov::genai::LLMPipeline pipe(model_path, "CPU");
 
     auto streamer = [](std::string word) { std::cout << word << std::flush; };
     std::cout << pipe.generate("The Sun is yellow bacause", streamer);
 }
 ```
 
-Streaming with custom class
+Streaming with a custom class
 ``` cpp
-#include <streamer_base.hpp>
+#include "openvino/genai/streamer_base.hpp"
 #include "openvino/genai/llm_pipeline.hpp"
 #include <iostream>
 
-class CustomStreamer: publict StreamerBase {
+class CustomStreamer: public ov::genai::StreamerBase {
 public:
     void put(int64_t token) {
         /* custom decoding/tokens processing code
@@ -179,7 +157,7 @@ int main(int argc, char* argv[]) {
     CustomStreamer custom_streamer;
 
     std::string model_path = argv[1];
-    ov::LLMPipeline pipe(model_path, "CPU");
-    cout << pipe.generate("The Sun is yellow bacause", custom_streamer);
+    ov::genai::LLMPipeline pipe(model_path, "CPU");
+    std::cout << pipe.generate("The Sun is yellow bacause", custom_streamer);
 }
 ```
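
The last hunk cuts off inside the `CustomStreamer` body. For reference, here is a minimal self-contained sketch of how the truncated example could be completed against the post-merge headers; the `end()` override, the `override` keywords, and routing the streamer through a `std::shared_ptr` are assumptions inferred from `StreamerBase` and the `StreamerVariant` typedef in `llm_pipeline.hpp`, not text from this commit.

``` cpp
#include "openvino/genai/streamer_base.hpp"
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>
#include <memory>

// Hypothetical completion of the truncated example above; assumes
// StreamerBase declares virtual put(int64_t) and end().
class CustomStreamer: public ov::genai::StreamerBase {
public:
    void put(int64_t token) override {
        // Custom decoding/token processing; printing raw token ids keeps
        // this sketch independent of the Tokenizer API.
        std::cout << token << ' ' << std::flush;
    }
    void end() override {
        // Called once generation finishes; flush anything buffered.
        std::cout << std::endl;
    }
};

int main(int argc, char* argv[]) {
    std::string model_path = argv[1];
    ov::genai::LLMPipeline pipe(model_path, "CPU");
    // StreamerVariant stores a std::shared_ptr<StreamerBase>, so the custom
    // streamer is routed through make_shared here (assumption).
    std::shared_ptr<ov::genai::StreamerBase> streamer = std::make_shared<CustomStreamer>();
    pipe.generate("The Sun is yellow bacause", std::nullopt, streamer);
}
```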

src/cpp/include/openvino/genai/generation_config.hpp (+28 −15)
@@ -12,6 +12,7 @@
 #include "openvino/genai/tokenizer.hpp"
 
 namespace ov {
+namespace genai {
 
 /**
  * @brief controls the stopping condition for grouped beam search. The following values are possible:
@@ -22,43 +23,48 @@ namespace ov {
 enum class StopCriteria { early, heuristic, never };
 
 /**
- * @brief structure to keep generation config parameters.
+ * @brief Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group
+ * and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will
+ * be used while greedy and beam search parameters will not affect decoding at all.
  *
+ * Generic parameters:
 * @param max_length the maximum length the generated tokens can have. Corresponds to the length of the input prompt +
 * `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set.
 * @param max_new_tokens the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length.
 * @param ignore_eos if set to true, then generation will not stop even if <eos> token is met.
+ * @param pad_token_id token_id of <pad> (padding)
+ * @param bos_token_id token_id of <bos> (beggining of sentence)
+ * @param eos_token_id token_id of <eos> (end of sentence)
+ * @param bos_token <bos> token string representation
+ * @param eos_token <eos> token string representation
+ *
+ * Beam search specific parameters:
 * @param num_beams number of beams for beam search. 1 disables beam search.
 * @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
 * @param diversity_penalty this value is subtracted from a beam's score if it generates the same token as any beam from other group at a
- * particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled.
+ * particular time. See https://arxiv.org/pdf/1909.05858.
 * @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
 * the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
 * likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
 * `length_penalty` < 0.0 encourages shorter sequences.
- * @param num_return_sequences the number of sequences to return for grouped beam search decoding
+ * @param num_return_sequences the number of sequences to return for grouped beam search decoding.
 * @param no_repeat_ngram_size if set to int > 0, all ngrams of that size can only occur once.
 * @param stop_criteria controls the stopping condition for grouped beam search. It accepts the following values:
 * "early", where the generation stops as soon as there are `num_beams` complete candidates; "heuristic", where an
 * heuristic is applied and the generation stops when is it very unlikely to find better candidates;
 * "never", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm).
- * @param temperature the value used to modulate token probabilities for random sampling
+ *
+ * Random sampling parameters:
+ * @param temperature the value used to modulate token probabilities for random sampling.
 * @param top_p - if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
 * @param top_k the number of highest probability vocabulary tokens to keep for top-k-filtering.
- * @param do_sample whether or not to use multinomial random sampling
- * that add up to `top_p` or higher are kept.
- * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty. See https://arxiv.org/pdf/1909.05858.
- * @param pad_token_id id of padding token
- * @param bos_token_id id of <bos> token
- * @param eos_token_id id of <eos> token
- * @param bos_token <bos> token string representation
- * @param eos_token <eos> token string representation
- * @param draft_model draft model for assitive decoding
+ * @param do_sample whether or not to use multinomial random sampling that add up to `top_p` or higher are kept.
+ * @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
 */
 class OPENVINO_GENAI_EXPORTS GenerationConfig {
 public:
     GenerationConfig() = default;
-    GenerationConfig(std::string json_path);
+    explicit GenerationConfig(std::string json_path);
 
     // Generic
     size_t max_new_tokens = SIZE_MAX;
@@ -89,6 +95,13 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
     // used for chat scenario
     std::string bos_token = "<s>";
     std::string eos_token = "</s>";
+
+    size_t get_max_new_tokens(size_t prompt_length = 0) const;
+    bool is_greedy_decoding() const;
+    bool is_beam_search() const;
+    bool is_multimomial() const;
+    static GenerationConfig anymap_to_generation_config(const ov::AnyMap& config_map = {});
 };
 
-} // namespace ov
+} // namespace genai
+} // namespace ov
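
The helpers added at the bottom of the class expose which decoding branch a config selects. A short usage sketch follows; it is a sketch only: the `num_groups`/`group_size` fields come from the README examples above, the predicate and `get_max_new_tokens` semantics are assumed, and `is_multimomial` is spelled exactly as in the header.

``` cpp
#include "openvino/genai/generation_config.hpp"
#include <iostream>

int main() {
    ov::genai::GenerationConfig config;
    config.max_new_tokens = 256;
    // Beam search parameters, as set in the README examples above.
    config.num_groups = 3;
    config.group_size = 5;
    config.diversity_penalty = 1.0f;

    std::cout << std::boolalpha
              << "beam search: " << config.is_beam_search() << '\n'
              << "greedy:      " << config.is_greedy_decoding() << '\n'
              << "multinomial: " << config.is_multimomial() << '\n'
              // Presumably resolves max_new_tokens against max_length
              // for a prompt of the given length.
              << "max new tokens: " << config.get_max_new_tokens(16) << '\n';
}
```

`anymap_to_generation_config` looks like the bridge for the `ov::Property` shortcuts declared in `llm_pipeline.hpp` below.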

src/cpp/include/openvino/genai/llm_pipeline.hpp (+23 −19)
@@ -6,12 +6,13 @@
 #include <optional>
 #include <variant>
 
-#include <openvino/core/any.hpp>
+#include "openvino/core/any.hpp"
 #include "openvino/genai/generation_config.hpp"
 #include "openvino/genai/tokenizer.hpp"
 #include "openvino/genai/streamer_base.hpp"
 
 namespace ov {
+namespace genai {
 
 using StreamerVariant = std::variant<std::function<void (std::string)>, std::shared_ptr<StreamerBase>>;
 using OptionalGenerationConfig = std::optional<GenerationConfig>;
@@ -71,7 +72,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     * @param device optional device
     * @param plugin_config optional plugin_config
     */
-    LLMPipeline(std::string& path, std::string device="CPU",
+    LLMPipeline(const std::string& path, const std::string& device="CPU",
                 const ov::AnyMap& plugin_config={},
                 const std::string& ov_tokenizers_path="");
 
@@ -84,11 +85,10 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     * @param plugin_config optional plugin_config
     */
    LLMPipeline(
-        const std::string model_path,
-        const ov::Tokenizer& tokenizer,
-        const std::string device="CPU",
-        const ov::AnyMap& plugin_config = {},
-        const std::string& ov_tokenizers_path=""
+        const std::string& model_path,
+        const ov::genai::Tokenizer& tokenizer,
+        const std::string& device="CPU",
+        const ov::AnyMap& plugin_config = {}
    );
 
    ~LLMPipeline();
@@ -127,8 +127,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
     * @param generation_config optional GenerationConfig
     * @return DecodedResults a structure with resulting texts & scores
     */
-    DecodedResults generate(std::vector<std::string> texts, OptionalGenerationConfig generation_config);
-    DecodedResults generate(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config);
+    DecodedResults generate(const std::vector<std::string>& texts, OptionalGenerationConfig generation_config);
 
    /**
    * @brief Low level generate to be called with already encoded input_ids tokens.
@@ -153,14 +152,19 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
         return generate(text, AnyMap{std::forward<Properties>(properties)...});
     }
 
-    DecodedResults operator()(std::vector<std::string> text, OptionalGenerationConfig generation_config=std::nullopt);
-    DecodedResults operator()(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config=std::nullopt);
+    DecodedResults operator()(const std::vector<std::string>& text, OptionalGenerationConfig generation_config=std::nullopt) {
+        return generate(text, generation_config);
+    }
 
-    // generate with streamers
-    std::string operator()(std::string text, OptionalGenerationConfig generation_config=std::nullopt, OptionalStreamerVariant streamer=std::nullopt);
-    std::string operator()(std::string text, OptionalStreamerVariant streamer);
+    std::string operator()(
+        std::string text,
+        OptionalGenerationConfig generation_config=std::nullopt,
+        OptionalStreamerVariant streamer=std::nullopt
+    ) {
+        return generate(text, generation_config, streamer);
+    }
 
-    ov::Tokenizer get_tokenizer();
+    ov::genai::Tokenizer get_tokenizer();
     GenerationConfig get_generation_config() const;
     void set_generation_config(const GenerationConfig& generation_config);
 
@@ -174,10 +178,9 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
 };
 
 /*
- * utils that allow to use generate and operarator() in the folllowing way:
+ * utils that allow to use generate and operator() in the following way:
 * pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...)
 * pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...)
-* All names match to names in cofnig except streamer.
 */
static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
static constexpr ov::Property<size_t> max_length{"max_length"};
@@ -207,6 +210,7 @@ static constexpr ov::Property<std::string> eos_token{"eos_token"};
 
 // only lambda streamer can be set via ov::streamer(),... syntaxic sugar,
 // because std::variant<StremaerBase, std::function<>> can not be stored in AnyMap
-static constexpr ov::Property<std::function<void (std::string)>> streamer_lambda{"streamer"};
+static constexpr ov::Property<std::function<void (std::string)>> streamer{"streamer"};
 
-} // namespace ov
+} // namespace genai
+} // namespace ov
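
With the rename, property-style streaming goes through `streamer` instead of `streamer_lambda`. Below is a minimal sketch of the call style the comment block above describes, assuming these properties and the variadic `generate` resolve under `ov::genai` once the namespace move lands.

``` cpp
#include "openvino/genai/llm_pipeline.hpp"
#include <iostream>

int main(int argc, char* argv[]) {
    std::string model_path = argv[1];
    ov::genai::LLMPipeline pipe(model_path, "CPU");

    // Per the comment above, only a lambda can be passed via the streamer
    // property, because a StreamerBase cannot be stored in an AnyMap.
    auto print_word = [](std::string word) { std::cout << word << std::flush; };
    pipe.generate("The Sun is yellow bacause",
                  ov::genai::max_new_tokens(200),
                  ov::genai::temperature(1.0f),
                  ov::genai::streamer(print_word));
}
```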
