Skip to content

Commit 74fc107

Browse files
committed
Merge branch 'generate_pipeline' into package
2 parents 6227b65 + a111a3f commit 74fc107

17 files changed

+335
-563
lines changed

src/cpp/include/openvino/genai/generation_config.hpp

-6
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ enum class StopCriteria { early, heuristic, never };
3131
* @param num_beam_groups number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
3232
* @param diversity_penalty this value is subtracted from a beam's score if it generates a token same as any beam from other group at a
3333
* particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled.
34-
* [more details in this paper](https://arxiv.org/pdf/1610.02424.pdf).
3534
* @param length_penalty exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
3635
* the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log
3736
* likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while
@@ -48,7 +47,6 @@ enum class StopCriteria { early, heuristic, never };
4847
* @param do_sample whether or not to use multinomial random sampling
4948
* that add up to `top_p` or higher are kept.
5049
* @param repetition_penalty the parameter for repetition penalty. 1.0 means no penalty.
51-
* [more details in this paper](https://arxiv.org/pdf/1909.05858.pdf).
5250
* @param pad_token_id id of padding token
5351
* @param bos_token_id id of <bos> token
5452
* @param eos_token_id id of <eos> token
@@ -90,10 +88,6 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
9088
// used for chat scenario
9189
std::string bos_token = "<s>";
9290
std::string eos_token = "</s>";
93-
94-
// speculative sampling
95-
std::variant<std::string, ov::CompiledModel, ov::InferRequest> draft_model; // todo: remove or try to add ov::Model const ov::Model&,
9691
};
9792

98-
9993
} // namespace ov

src/cpp/include/openvino/genai/llm_pipeline.hpp

+57-16
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,10 @@
33

44
#pragma once
55

6-
#include <filesystem>
76
#include <optional>
7+
#include <variant>
88

9-
#include <openvino/runtime/infer_request.hpp>
109
#include <openvino/core/any.hpp>
11-
1210
#include "openvino/genai/generation_config.hpp"
1311
#include "openvino/genai/tokenizer.hpp"
1412
#include "openvino/genai/streamer_base.hpp"
@@ -84,18 +82,24 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
8482
* @param streamer optional streamer
8583
* @return std::string decoded resulting text
8684
*/
87-
std::string generate(std::string text, OptionalGenerationConfig generation_config, OptionalStreamerVariant streamer);
85+
std::string generate(std::string text, OptionalGenerationConfig generation_config=nullopt, OptionalStreamerVariant streamer=nullopt);
8886

89-
9087
template <typename... Properties>
9188
util::EnableIfAllStringAny<std::string, Properties...> generate(
9289
std::string text,
9390
Properties&&... properties) {
9491
return generate(text, AnyMap{std::forward<Properties>(properties)...});
9592
}
96-
9793
std::string generate(std::string text, const ov::AnyMap& config);
9894

95+
template <typename... Properties>
96+
util::EnableIfAllStringAny<EncodedResults, Properties...> generate(
97+
ov::Tensor input_ids,
98+
Properties&&... properties) {
99+
return generate(input_ids, AnyMap{std::forward<Properties>(properties)...});
100+
}
101+
EncodedResults generate(ov::Tensor input_ids, const ov::AnyMap& config);
102+
99103
/**
100104
* @brief High level generate for batched prompts which encodes inputs and returns decoded outputs.
101105
* Streamer cannot be used for multibatch inputs.
@@ -119,16 +123,22 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
119123
*/
120124
EncodedResults generate(ov::Tensor input_ids,
121125
std::optional<ov::Tensor> attention_mask,
122-
OptionalGenerationConfig generation_config,
123-
OptionalStreamerVariant streamer);
124-
125-
std::string operator()(std::string text, OptionalGenerationConfig generation_config);
126-
DecodedResults operator()(std::vector<std::string> text, OptionalGenerationConfig generation_config);
127-
DecodedResults operator()(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config);
126+
OptionalGenerationConfig generation_config=nullopt,
127+
OptionalStreamerVariant streamer=nullopt);
128+
129+
template <typename InputsType, typename... Properties>
130+
util::EnableIfAllStringAny<std::string, Properties...> operator()(
131+
InputsType text,
132+
Properties&&... properties) {
133+
return generate(text, AnyMap{std::forward<Properties>(properties)...});
134+
}
135+
136+
DecodedResults operator()(std::vector<std::string> text, OptionalGenerationConfig generation_config=nullopt);
137+
DecodedResults operator()(std::initializer_list<std::string> text, OptionalGenerationConfig generation_config=nullopt);
128138

129139
// generate with streamers
130-
std::string operator()(std::string text, OptionalGenerationConfig generation_config, StreamerVariant streamer);
131-
std::string operator()(std::string text, StreamerVariant streamer);
140+
std::string operator()(std::string text, OptionalGenerationConfig generation_config=nullopt, OptionalStreamerVariant streamer=nullopt);
141+
std::string operator()(std::string text, OptionalStreamerVariant streamer);
132142

133143
ov::Tokenizer get_tokenizer();
134144
GenerationConfig get_generation_config() const;
@@ -143,9 +153,40 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
143153
std::unique_ptr<LLMPipelineImpl> m_pimpl;
144154
};
145155

156+
/*
157+
* utils that allow to use generate and operator() in the following way:
158+
* pipe.generate(input_ids, ov::max_new_tokens(200), ov::temperature(1.0f),...)
159+
* pipe(text, ov::max_new_tokens(200), ov::temperature(1.0f),...)
160+
* All names match the names in the config except streamer.
161+
*/
146162
static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
163+
static constexpr ov::Property<size_t> max_length{"max_length"};
164+
static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};
165+
166+
static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
167+
static constexpr ov::Property<size_t> num_beams{"num_beams"};
168+
static constexpr ov::Property<float> diversity_penalty{"diversity_penalty"};
169+
static constexpr ov::Property<float> length_penalty{"length_penalty"};
170+
static constexpr ov::Property<size_t> num_return_sequences{"num_return_sequences"};
171+
static constexpr ov::Property<size_t> no_repeat_ngram_size{"no_repeat_ngram_size"};
172+
static constexpr ov::Property<StopCriteria> stop_criteria{"stop_criteria"};
173+
147174
static constexpr ov::Property<float> temperature{"temperature"};
148-
static constexpr ov::Property<std::function<void (std::string)>> streamer_lambda{"streamer_lambda"};
149-
static constexpr ov::Property<std::shared_ptr<StreamerBase>> streamer{"streamer"};
175+
static constexpr ov::Property<float> top_p{"top_p"};
176+
static constexpr ov::Property<int> top_k{"top_k"};
177+
static constexpr ov::Property<bool> do_sample{"do_sample"};
178+
static constexpr ov::Property<float> repetition_penalty{"repetition_penalty"};
179+
180+
181+
static constexpr ov::Property<int64_t> pad_token_id{"pad_token_id"};
182+
static constexpr ov::Property<int64_t> bos_token_id{"bos_token_id"};
183+
static constexpr ov::Property<int64_t> eos_token_id{"eos_token_id"};
184+
185+
static constexpr ov::Property<std::string> bos_token{"bos_token"};
186+
static constexpr ov::Property<std::string> eos_token{"eos_token"};
187+
188+
// only a lambda streamer can be set via ov::streamer(),... syntactic sugar,
189+
// because std::variant<StreamerBase, std::function<>> cannot be stored in AnyMap
190+
static constexpr ov::Property<std::function<void (std::string)>> streamer_lambda{"streamer"};
150191

151192
} // namespace ov

src/cpp/include/openvino/genai/streamer_base.hpp

+12-1
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,21 @@
77

88
namespace ov {
99

10+
/**
11+
* @brief base class for streamers. To use it, inherit from this class and implement the put and end methods
12+
*
13+
* @param m_tokenizer tokenizer
14+
*/
1015
class StreamerBase {
1116
public:
17+
Tokenizer m_tokenizer;
18+
StreamerBase(Tokenizer tokenizer): m_tokenizer(tokenizer) {};
19+
StreamerBase() = default;
20+
21+
/// @brief put is called every time new token is decoded
1222
virtual void put(int64_t token) = 0;
13-
23+
24+
/// @brief end is called at the end of generation. It can be used to flush cache if your own streamer has one
1425
virtual void end() = 0;
1526
};
1627

src/cpp/include/openvino/genai/tokenizer.hpp

+17-11
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,16 @@
33

44
#pragma once
55

6-
#include <filesystem>
76
#include <string>
87
#include <vector>
9-
#include <memory>
108
#include <initializer_list>
11-
129
#include <openvino/runtime/tensor.hpp>
13-
1410
#include "openvino/genai/visibility.hpp"
1511

1612
namespace ov {
1713

1814
/**
19-
* @brief class used to encode prompts and decode resulting tokens
15+
* @brief class is used to encode prompts and decode resulting tokens
2016
*/
2117
class OPENVINO_GENAI_EXPORTS Tokenizer {
2218
public:
@@ -25,21 +21,22 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
2521
* @param tokenizer_path openvino_tokenizer.xml and openvino_detokenizer.xml should be located in the tokenizer_path
2622
* @param device device. Currently only 'CPU' is supported
2723
*/
28-
Tokenizer(const std::string tokenizers_path, const std::string device="CPU");
24+
Tokenizer(const std::string& tokenizers_path, const std::string& device="CPU");
2925

3026
/**
3127
* @brief encode a single prompt
3228
* @return pair of [input_ids, attention_mask]
3329
*/
34-
std::pair<ov::Tensor, ov::Tensor> encode(std::string prompt);
30+
std::pair<ov::Tensor, ov::Tensor> encode(const std::string prompt);
3531

3632
/**
3733
* @brief encode batch of prompts. Left padding will be applied by default
3834
* @param prompts vector storing batch of prompts
3935
* @return pair of [input_ids, attention_mask]
4036
*/
41-
std::pair<ov::Tensor, ov::Tensor> encode(std::vector<std::string> prompts);
42-
std::pair<ov::Tensor, ov::Tensor> encode(std::initializer_list<std::string> prompts);
37+
std::pair<ov::Tensor, ov::Tensor> encode(std::vector<std::string>& prompts);
38+
std::pair<ov::Tensor, ov::Tensor> encode(std::vector<std::string>&& prompts);
39+
std::pair<ov::Tensor, ov::Tensor> encode(std::initializer_list<std::string>& prompts);
4340

4441
/**
4542
* @brief decode sequence of tokens
@@ -62,8 +59,17 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
6259
*/
6360
std::vector<std::string> decode(std::vector<std::vector<int64_t>> tokens);
6461

65-
int64_t m_bos_token_id = 1; // todo: read from rt_info
66-
int64_t m_eos_token_id = 2; // todo: read from rt_info
62+
// information about <bos>, <eos> tokens should be public,
63+
// they are used at least in StreamerBase descendants
64+
int64_t get_bos_token_id() const;
65+
int64_t get_eos_token_id() const;
66+
int64_t get_pad_token_id() const;
67+
68+
// Also need write access to set these tokens when they are not successfully read from xml rt_info.
69+
// In the latter case values can be read from config.json in LLMPipeline
70+
void set_bos_token_id(int64_t);
71+
void set_eos_token_id(int64_t);
72+
void set_pad_token_id(int64_t);
6773

6874
Tokenizer() = default;
6975
~Tokenizer();

0 commit comments

Comments
 (0)