
Commit 7c9ccbe

Improve with comments

1 parent c3da5c5 commit 7c9ccbe

8 files changed (+30 −46 lines)


.github/workflows/llama_cpp_plugin_build_and_test.yml

+1
@@ -29,6 +29,7 @@ jobs:
       with:
         submodules: recursive
         repository: vshampor/openvino
+        branch: llama_cpp_mod
         path: openvino
 
     - name: CMake - configure

modules/llama_cpp_plugin/CMakeLists.txt

+7-5
@@ -6,13 +6,15 @@ find_package(OpenVINODeveloperPackage REQUIRED)
 
 ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" ON)
 
-if(CMAKE_COMPILER_IS_GNUCXX)
-    ov_add_compiler_flags(-Wall)
-endif()
-
 add_subdirectory(src)
 
-add_subdirectory(third_party/llama.cpp)
+FetchContent_Declare(
+    llama_cpp
+    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+    GIT_TAG b2417
+)
+
+FetchContent_MakeAvailable(llama_cpp)
 
 if(ENABLE_TESTS)
     include(CTest)
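
A note on the FetchContent switch above: FetchContent_Declare and FetchContent_MakeAvailable are only defined once the FetchContent module has been included. A minimal sketch of the full pattern, assuming the include is not already provided by OpenVINODeveloperPackage or a parent scope:

    # Sketch only: the include(FetchContent) line is an assumption,
    # not shown in this diff.
    include(FetchContent)

    FetchContent_Declare(
        llama_cpp
        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
        GIT_TAG b2417  # pinning a fixed tag keeps the dependency reproducible
    )

    # Downloads the sources at configure time and adds llama.cpp's
    # CMake targets (e.g. the `llama` library) to this build.
    FetchContent_MakeAvailable(llama_cpp)

Pinning GIT_TAG to a release tag rather than a branch means re-running CMake will not silently pick up upstream changes, unlike the deleted third_party submodule, which tracked whatever commit was checked out.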

modules/llama_cpp_plugin/build.sh

-19
This file was deleted.

modules/llama_cpp_plugin/include/compiled_model.hpp

+1-1
@@ -70,7 +70,7 @@ namespace ov {
 
     llama_model* m_llama_model_ptr = nullptr;
     llama_context* m_llama_ctx = nullptr;
-    std::shared_ptr<ov::Model> m_model;
+    std::shared_ptr<ov::Model> m_fake_model;
     size_t* num_tokens_processed_ptr = nullptr;  // TODO: (vshampor) find a better place for this kind of storage
 
     std::vector<ov::Output<const ov::Node>> m_fake_inputs;

modules/llama_cpp_plugin/src/CMakeLists.txt

-5
@@ -35,11 +35,6 @@ target_include_directories(${TARGET_NAME} PRIVATE
     "${CMAKE_CURRENT_SOURCE_DIR}"
     "${LlamaCppPlugin_SOURCE_DIR}/include")
 
-# link common OpenVINO Runtime libraries
-target_link_libraries(${TARGET_NAME} PRIVATE
-    openvino::interpreter_backend
-    openvino::reference)
-
 set( LLAMA_TARGET_NAME CACHE STRING "Exact target exposed by llama.cpp to link against as the main llama.cpp library")
 if(NOT LLAMA_TARGET_NAME)
     set( LLAMA_TARGET_NAME "llama" )
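
With the hardcoded OpenVINO backend libraries dropped, the plugin presumably links against whichever llama.cpp target LLAMA_TARGET_NAME selects. A hypothetical sketch of the consuming line (not shown in this hunk):

    # Assumption: this is how ${LLAMA_TARGET_NAME} is consumed further down
    # in src/CMakeLists.txt; `llama` is the default target exposed by llama.cpp.
    target_link_libraries(${TARGET_NAME} PRIVATE ${LLAMA_TARGET_NAME})

The cache variable lets a packager point the plugin at a differently named llama.cpp target without editing the build files.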

modules/llama_cpp_plugin/src/compiled_model.cpp

+14-13
@@ -54,28 +54,29 @@ LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_pt
 
     ov::ParameterVector inputs{input_ids};
 
-    std::vector<std::pair<std::string, ov::element::Type_t>> unused_names_in_order = {
-        {"attention_mask", ov::element::Type_t::i64},
-        {"position_ids", ov::element::Type_t::i64},
-        {"beam_idx", ov::element::Type_t::i32}};
-    for (const auto& descr : unused_names_in_order) {
-        auto unused_inp = std::make_shared<ov::opset13::Parameter>(descr.second, ov::PartialShape({-1, -1}));
+    std::vector<std::tuple<std::string, ov::element::Type_t, ov::PartialShape>> additional_inputs_in_order = {
+        {"attention_mask", ov::element::Type_t::i64, {-1, -1}},
+        {"position_ids", ov::element::Type_t::i64, {-1, -1}},
+        {"beam_idx", ov::element::Type_t::i32, {-1, -1}}};
+
+    for (const auto& descr : additional_inputs_in_order) {
+        auto unused_inp = std::make_shared<ov::opset13::Parameter>(std::get<1>(descr), std::get<2>(descr));
         inputs.push_back(unused_inp);
     }
 
-    m_model = std::make_shared<ov::Model>(logits, inputs, "fake_ov_model_for_io_specification");
+    m_fake_model = std::make_shared<ov::Model>(logits, inputs, "fake_ov_model_for_io_specification");
 
-    m_model->inputs()[0].set_names({"input_ids"});
-    for (size_t i = 0; i < unused_names_in_order.size(); i++) {
-        m_model->inputs()[i + 1].set_names({unused_names_in_order[i].first});
+    m_fake_model->inputs()[0].set_names({"input_ids"});
+    for (size_t i = 0; i < additional_inputs_in_order.size(); i++) {
+        m_fake_model->inputs()[i + 1].set_names({std::get<0>(additional_inputs_in_order[i])});
     }
 
-    m_model->outputs()[0].set_names({"logits"});
+    m_fake_model->outputs()[0].set_names({"logits"});
 
-    for (auto input : m_model->inputs()) {
+    for (auto input : m_fake_model->inputs()) {
         m_fake_inputs.emplace_back(input);
     }
-    for (auto output : m_model->outputs()) {
+    for (auto output : m_fake_model->outputs()) {
         m_fake_outputs.emplace_back(output);
     }
 }
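
The pair-to-tuple change above lets each dummy input carry its own partial shape instead of a hardcoded {-1, -1}. A self-contained sketch of the same descriptor pattern (helper name hypothetical), using C++17 structured bindings in place of the std::get calls from the diff:

    #include <memory>
    #include <string>
    #include <tuple>
    #include <vector>

    #include "openvino/opsets/opset13.hpp"

    using InputDescr = std::tuple<std::string, ov::element::Type_t, ov::PartialShape>;

    // Hypothetical helper: build placeholder Parameters from
    // (name, element type, partial shape) descriptors.
    ov::ParameterVector make_placeholder_inputs(const std::vector<InputDescr>& descrs) {
        ov::ParameterVector params;
        for (const auto& [name, type, shape] : descrs) {
            auto param = std::make_shared<ov::opset13::Parameter>(type, shape);
            param->set_friendly_name(name);  // the diff attaches names via set_names() instead
            params.push_back(param);
        }
        return params;
    }

The resulting "fake" ov::Model computes nothing; it exists so the plugin can report OpenVINO-conformant input/output metadata (input_ids, attention_mask, position_ids, beam_idx, logits) while inference itself is delegated to llama.cpp.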

modules/llama_cpp_plugin/src/infer_request.cpp

+7-2
@@ -62,6 +62,9 @@ void llama_batch_add_reimpl(struct llama_batch& batch,
 void LlamaCppSyncInferRequest::infer() {
     auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]);  // TODO (vshampor) correctly identify input_ids among
                                                               // all inputs without hardcode
+
+    auto position_ids_tensor_ptr = get_tensor(get_inputs()[2]);  // TODO (vshampor) correctly identify position_ids
+                                                                 // among all inputs without hardcode
     OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64);
     OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2);
     size_t sequence_length = input_ids_tensor_ptr->get_shape()[1];
@@ -72,15 +75,17 @@ void LlamaCppSyncInferRequest::infer() {
 
     const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */;
 
+    const int64_t* position_idx_ptr = position_ids_tensor_ptr->data<int64_t>();
+
     for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) {
         const int64_t token_id = sequence_start_ptr[tok_idx];
+        const int64_t position_id = position_idx_ptr[tok_idx];
         llama_batch_add_reimpl(batch,
                                token_id,
-                               *(m_compiled_model_ptr->num_tokens_processed_ptr),
+                               position_id,
                                {0},
                                true);  // the last `true` here is a marker that the logits for this
                                        // token should be computed and returned
-        *(m_compiled_model_ptr->num_tokens_processed_ptr) += 1;
     }
 
     llama_context* ctx = m_compiled_model_ptr->m_llama_ctx;
Submodule llama.cpp deleted from c8b02d3
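
Taken together, the infer_request.cpp changes switch the llama.cpp batch from an internally tracked token counter to the positions supplied in the position_ids input tensor. A sketch of what the (not shown) llama_batch_add_reimpl presumably does with an explicit position, based on the llama_batch layout around tag b2417; treat the field accesses as an assumption:

    #include <vector>
    #include "llama.h"

    // Sketch: append one token with an explicit position to a llama.cpp batch.
    // Mirrors common.cpp's llama_batch_add, which the plugin reimplements.
    static void batch_add_with_pos(llama_batch& batch,
                                   llama_token token_id,
                                   llama_pos position_id,
                                   const std::vector<llama_seq_id>& seq_ids,
                                   bool compute_logits) {
        const int i = batch.n_tokens;
        batch.token[i] = token_id;
        batch.pos[i] = position_id;  // taken from position_ids, not a running counter
        batch.n_seq_id[i] = static_cast<int32_t>(seq_ids.size());
        for (size_t s = 0; s < seq_ids.size(); ++s) {
            batch.seq_id[i][s] = seq_ids[s];
        }
        batch.logits[i] = compute_logits;
        batch.n_tokens++;
    }

Sourcing positions from the tensor also removes the shared num_tokens_processed_ptr state from the decoding loop, which is what the two deleted lines accomplish.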
