
Commit 3b5ab0d

Replace complex loops with index access, reuse pack_strings() and unpack_strings()
1 parent f301445 commit 3b5ab0d

File tree

3 files changed: +17 -47 lines changed


CMakeLists.txt

+1

@@ -12,3 +12,4 @@ endif()
 
 add_subdirectory(llm/cpp)
 target_compile_definitions(llm PRIVATE USER_OV_EXTENSIONS_PATH=\"$<TARGET_FILE:user_ov_extensions>\")
+target_link_libraries(llm PRIVATE user_ov_extensions)

llm/cpp/llm.cpp

+15 -46

@@ -2,41 +2,24 @@
 // SPDX-License-Identifier: Apache-2.0
 
 #include <openvino/openvino.hpp>
+#include <utils.hpp>
 
 namespace {
 std::pair<ov::Tensor, ov::Tensor> tokenize(ov::InferRequest&& tokenizer, std::string_view prompt) {
     constexpr size_t BATCH_SIZE = 1;
-    constexpr size_t INDEXES_SIZE = (2 + BATCH_SIZE) * sizeof(int32_t);
     ov::Tensor destination = tokenizer.get_input_tensor();
-    destination.set_shape({INDEXES_SIZE + prompt.length()});
-    // N - batch size, E - end offset (and start for the next string). Tensor layout in bytes:
-    // Nnnn0000EeeeEeeeChars1Chars2
-    int32_t* int_ptr = reinterpret_cast<int32_t*>(destination.data<uint8_t>());
-    int_ptr[0] = BATCH_SIZE;
-    int_ptr[1] = 0;
-    int_ptr[2] = int32_t(prompt.length());
-    std::copy(prompt.cbegin(), prompt.cend(), reinterpret_cast<char*>(int_ptr + 3));
+    pack_strings(std::array<std::string_view, BATCH_SIZE>{prompt}, destination);
     tokenizer.infer();
     return {tokenizer.get_tensor("input_ids"), tokenizer.get_tensor("attention_mask")};
 }
 
 void print_token(ov::InferRequest& detokenizer, int32_t out_token) {
     constexpr size_t BATCH_SIZE = 1;
-    constexpr size_t INDEXES_SIZE = (2 + BATCH_SIZE) * sizeof(int32_t);
     ov::Tensor inp = detokenizer.get_input_tensor();
     inp.set_shape({BATCH_SIZE, 1});
     inp.data<int32_t>()[0] = out_token;
     detokenizer.infer();
-    const ov::Tensor& detokenized = detokenizer.get_output_tensor();
-    size_t tensor_size = detokenized.get_size();
-    if (tensor_size <= INDEXES_SIZE) {
-        throw std::runtime_error("The detokenized tensor must contain batch size, first string offset and end indices");
-    }
-    const char* char_ptr = reinterpret_cast<const char*>(detokenized.data<const uint8_t>());
-    if (reinterpret_cast<const int32_t*>(char_ptr)[0] != BATCH_SIZE) {
-        throw std::runtime_error("Expected batch 1 in the detokenized tensor");
-    }
-    std::cout.write(char_ptr + INDEXES_SIZE, std::streamsize(tensor_size - INDEXES_SIZE)).flush();
+    std::cout << unpack_strings(detokenizer.get_output_tensor()).front() << std::flush;
 }
 }
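Note: the removed inline code documented the layout these helpers implement: one int32 batch size N, one int32 start offset for the first string (always 0), one int32 end offset per string, then the concatenated characters ("Nnnn0000EeeeEeeeChars1Chars2"). Reconstructed from that code, pack_strings() and unpack_strings() could look roughly like the sketch below. This is a sketch only; the actual definitions live in the utils.hpp shipped with the user_ov_extensions target and may differ.

// Sketch reconstructed from the removed packing code; not the shipped helpers.
#include <openvino/openvino.hpp>

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <string>
#include <string_view>
#include <vector>

// Pack a batch of strings into a 1D u8 tensor using the layout above.
template <typename Strings>  // any range of std::string_view
void pack_strings(const Strings& strings, ov::Tensor& destination) {
    size_t batch = std::size(strings);
    size_t chars = 0;
    for (std::string_view s : strings) {
        chars += s.length();
    }
    destination.set_shape({(2 + batch) * sizeof(int32_t) + chars});
    int32_t* int_ptr = reinterpret_cast<int32_t*>(destination.data<uint8_t>());
    int_ptr[0] = int32_t(batch);
    int_ptr[1] = 0;  // start offset of the first string
    char* char_ptr = reinterpret_cast<char*>(int_ptr + 2 + batch);
    int32_t end = 0;
    size_t i = 2;
    for (std::string_view s : strings) {
        end += int32_t(s.length());
        int_ptr[i++] = end;  // end offset, which is also the next string's start
        char_ptr = std::copy(s.begin(), s.end(), char_ptr);
    }
}

// Recover the strings from a tensor with the same layout.
std::vector<std::string> unpack_strings(const ov::Tensor& source) {
    const int32_t* int_ptr = reinterpret_cast<const int32_t*>(source.data<const uint8_t>());
    int32_t batch = int_ptr[0];
    const char* chars = reinterpret_cast<const char*>(int_ptr + 2 + batch);
    std::vector<std::string> result;
    for (int32_t i = 0; i < batch; ++i) {
        result.emplace_back(chars + int_ptr[1 + i], chars + int_ptr[2 + i]);
    }
    return result;
}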

@@ -50,37 +33,28 @@ int main(int argc, char* argv[]) try {
     ov::InferRequest detokenizer = core.compile_model(argv[3], "CPU").create_infer_request();
     std::shared_ptr<ov::Model> model = core.read_model(argv[1]);
     constexpr size_t BATCH_SIZE = 1;
-    std::map<std::string, ov::PartialShape> shapes = {
-        {"input_ids", ov::PartialShape{
+    std::map<size_t, ov::PartialShape> shapes = {
+        {0, ov::PartialShape{
             BATCH_SIZE, {1, std::numeric_limits<ov::Dimension::value_type>::max()}
         }},
-        {"attention_mask", ov::PartialShape{
+        {1, ov::PartialShape{
             BATCH_SIZE, {1, std::numeric_limits<ov::Dimension::value_type>::max()}
         }}
     };
-    for (const ov::Output<ov::Node>& input : model->inputs()) {
-        for (const std::string& name : input.get_names()) {
-            if (name.rfind("past_key_values", 0) == 0) {
-                ov::PartialShape shape = input.get_partial_shape();
-                shape[0] = BATCH_SIZE;
-                shapes.emplace(name, shape);
-                break;
-            }
-        }
+    std::vector<ov::Output<ov::Node>> inputs = model->inputs();
+    for (size_t idx = 2; idx < inputs.size(); ++idx) {
+        ov::PartialShape shape = inputs.at(idx).get_partial_shape();
+        shape.at(0) = BATCH_SIZE;
+        shapes.emplace(idx, shape);
     }
     model->reshape(shapes);
     ov::preprocess::PrePostProcessor p3(model);
     p3.input("input_ids").tensor().set_element_type(ov::element::i32);  // cast to the type of the tokenizer's output
     p3.input("attention_mask").tensor().set_element_type(ov::element::i32);
     model = p3.build();
     ov::InferRequest ireq = core.compile_model(model, "CPU", {ov::cache_dir("llm-cache")}).create_infer_request();
-    for (const ov::Output<ov::Node>& input : model->inputs()) {
-        for (const std::string& name : input.get_names()) {
-            if (name.rfind("past_key_values", 0) == 0) {
-                ireq.get_tensor(input).set_shape(input.get_partial_shape().get_min_shape());
-                break;
-            }
-        }
+    for (size_t idx = 2; idx < inputs.size(); ++idx) {
+        ireq.get_input_tensor(idx).set_shape(inputs.at(idx).get_partial_shape().get_min_shape());
     }
     ireq.get_tensor("input_ids").set_shape(input_ids.get_shape());  // TODO: replace with ireq.set_tensor("input_ids", input_ids); after it's fixed
     ireq.get_tensor("attention_mask").set_shape(input_ids.get_shape());
@@ -96,13 +70,8 @@ int main(int argc, char* argv[]) try {
     ireq.get_tensor("attention_mask").data<int32_t>()[0] = 1;
     constexpr int32_t SPECIAL_EOS_TOKEN = 2;  // There's no way to extract the value from the tokenizer for now
     while (out_token != SPECIAL_EOS_TOKEN) {
-        for (const ov::Output<ov::Node>& input : model->inputs()) {
-            for (const std::string& name : input.get_names()) {
-                if (name.rfind("past_key_values", 0) == 0) {
-                    ireq.set_tensor(input, ireq.get_tensor("present" + name.substr(15)));
-                    break;
-                }
-            }
+        for (size_t idx = 2; idx < inputs.size(); ++idx) {
+            ireq.set_input_tensor(idx, ireq.get_output_tensor(idx - 1));
         }
         ireq.get_tensor("input_ids").data<int32_t>()[0] = out_token;
         ireq.start_async();
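Note: likewise, ireq.set_input_tensor(idx, ireq.get_output_tensor(idx - 1)) assumes the outputs mirror the inputs shifted by one, with output 0 presumably holding the logits and one present.* tensor per past_key_values.* input. The removed code paired them by name instead, via "present" + name.substr(15), where 15 is the length of the "past_key_values" prefix. A hypothetical check of the same correspondence:

#include <openvino/openvino.hpp>

#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical: confirm output idx - 1 is the present.* tensor matching
// past_key_values.* input idx (output 0 is assumed to hold the logits).
void check_kv_pairing(const std::shared_ptr<ov::Model>& model) {
    const std::vector<ov::Output<ov::Node>>& inputs = model->inputs();
    const std::vector<ov::Output<ov::Node>>& outputs = model->outputs();
    for (size_t idx = 2; idx < inputs.size(); ++idx) {
        // "past_key_values" is 15 chars, so substr(15) keeps the per-layer suffix
        std::string expected = "present" + inputs.at(idx).get_any_name().substr(15);
        if (outputs.at(idx - 1).get_names().count(expected) == 0) {
            throw std::runtime_error("Output order does not mirror past_key_values inputs");
        }
    }
}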
