// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "infer_request.hpp"

#include <algorithm>
#include <memory>
#include <openvino/runtime/ivariable_state.hpp>

#include "llama.h"
#include "openvino/runtime/make_tensor.hpp"
#include "openvino/util/log.hpp"
#include "state.hpp"

namespace ov {
namespace llama_cpp_plugin {

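// (Re)allocates `tensor` when it is empty or its element type does not match the port's type;
// otherwise the existing allocation is kept and only reshaped in place.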
void allocate_tensor_impl(ov::SoPtr<ov::ITensor>& tensor,
                          const ov::element::Type& element_type,
                          const ov::Shape& shape) {
    if (!tensor || tensor->get_element_type() != element_type) {
        tensor = ov::make_tensor(element_type, shape);
    } else {
        tensor->set_shape(shape);
    }
}

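// Pre-allocate host tensors for every input and output port of the compiled model;
// dynamically-shaped ports get an empty {0} placeholder shape until actual data is set.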
LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr<const LlamaCppModel>& compiled_model)
    : ov::ISyncInferRequest(compiled_model) {
    OPENVINO_DEBUG << "llama_cpp_plugin: infer request ctor called\n";
    m_compiled_model_ptr = compiled_model;
    for (const auto& input : get_inputs()) {
        allocate_tensor(input, [input](ov::SoPtr<ov::ITensor>& tensor) {
            allocate_tensor_impl(tensor,
                                 input.get_element_type(),
                                 input.get_partial_shape().is_dynamic() ? ov::Shape{0} : input.get_shape());
        });
    }
    for (const auto& output : get_outputs()) {
        allocate_tensor(output, [output](ov::SoPtr<ov::ITensor>& tensor) {
            allocate_tensor_impl(tensor,
                                 output.get_element_type(),
                                 output.get_partial_shape().is_dynamic() ? ov::Shape{0} : output.get_shape());
        });
    }
}

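// Setting multiple tensors per port is not supported by this plugin; the call is currently a no-op
// apart from the debug log.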
void LlamaCppSyncInferRequest::set_tensors_impl(const ov::Output<const ov::Node> port,
                                                const std::vector<ov::SoPtr<ov::ITensor>>& tensors) {
    OPENVINO_DEBUG << "llama_cpp_plugin: set_tensors_impl called\n";
}

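// Appends a single token (id, position, sequence ids, logits flag) to `batch`.
// Local equivalent of the `llama_batch_add` helper from llama.cpp's common code.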
void llama_batch_add_reimpl(struct llama_batch& batch,
                            llama_token id,
                            llama_pos pos,
                            const std::vector<llama_seq_id>& seq_ids,
                            bool logits) {
    batch.token[batch.n_tokens] = id;
    batch.pos[batch.n_tokens] = pos;
    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
    for (size_t i = 0; i < seq_ids.size(); ++i) {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
    }
    batch.logits[batch.n_tokens] = logits;

    batch.n_tokens++;
}

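// Runs a single llama.cpp decode step: builds a one-sequence llama_batch from the input_ids and
// position_ids tensors, calls llama_decode() and exposes the per-token logits via the first output port.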
void LlamaCppSyncInferRequest::infer() {
    // TODO (vshampor): correctly identify input_ids and position_ids among all inputs without hardcoded indices
    auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]);
    auto position_ids_tensor_ptr = get_tensor(get_inputs()[2]);
    OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64);
    OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2);
    size_t sequence_length = input_ids_tensor_ptr->get_shape()[1];

    // The llama_batch built here holds a single sequence (sequence id 0), one entry per input token
    llama_batch batch = llama_batch_init(sequence_length, /* embd = */ 0, /* n_seq_max = */ 1);
    const int64_t* data_ptr = input_ids_tensor_ptr->data<int64_t>();

    const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */;

    const int64_t* position_idx_ptr = position_ids_tensor_ptr->data<int64_t>();

    for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) {
        const int64_t token_id = sequence_start_ptr[tok_idx];
        const int64_t position_id = position_idx_ptr[tok_idx];
        llama_batch_add_reimpl(batch,
                               token_id,
                               position_id,
                               {0},
                               true);  // the last `true` marks that the logits for this token
                                       // should be computed and returned
    }

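    // Run the forward pass; llama.cpp keeps the logits for every token flagged above inside the llama context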
    llama_context* ctx = m_compiled_model_ptr->m_llama_ctx;
    int32_t sts = llama_decode(ctx, batch);
    llama_batch_free(batch);  // the logits live in the context, so the batch can be released right away

    if (sts != 0) {
        OPENVINO_THROW("llama_decode failed with code ", sts);
    }

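    // Copy the per-token logits out of the llama context into a [1, sequence_length, n_vocab] f32 tensor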
    size_t n_vocab = llama_n_vocab(m_compiled_model_ptr->m_llama_model_ptr);

    ov::Tensor output_tensor{ov::element::Type_t::f32, {1, sequence_length, n_vocab}};
    float* output_tensor_data_ptr = output_tensor.data<float>();

    for (size_t pos = 0; pos < sequence_length; pos++) {
        float* logits_from_llama = llama_get_logits_ith(ctx, pos);
        std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab);
    }

    auto& logit_output = get_outputs()[0];
    allocate_tensor(logit_output, [&output_tensor](ov::SoPtr<ov::ITensor>& tensor) {
        allocate_tensor_impl(tensor, output_tensor.get_element_type(), output_tensor.get_shape());
        output_tensor.copy_to(ov::make_tensor(tensor));
    });
}

std::vector<ov::ProfilingInfo> LlamaCppSyncInferRequest::get_profiling_info() const {
    OPENVINO_DEBUG << "llama_cpp_plugin: get_profiling_info() called\n";
    return std::vector<ov::ProfilingInfo>{};
}

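// Expose the llama.cpp-internal state (KV cache) as a single ov::IVariableState so that callers can reset it
// between unrelated prompts; see LlamaCppState in state.hpp for what reset() actually does.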
std::vector<ov::SoPtr<ov::IVariableState>> LlamaCppSyncInferRequest::query_state() const {
    OPENVINO_DEBUG << "llama_cpp_plugin: query_state() called\n";
    return {std::static_pointer_cast<ov::IVariableState>(std::make_shared<LlamaCppState>(m_compiled_model_ptr))};
}
}  // namespace llama_cpp_plugin
}  // namespace ov