From 8cb5fd5e025dcd799f8dbe419696c7a25c4a3c1a Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 8 Mar 2024 16:52:02 +0100 Subject: [PATCH 01/27] Initial commit --- .gitmodules | 3 + modules/llama_cpp_plugin/CMakeLists.txt | 32 + modules/llama_cpp_plugin/build.sh | 19 + .../include/compiled_model.hpp | 84 ++ .../include/infer_request.hpp | 31 + modules/llama_cpp_plugin/include/plugin.hpp | 112 +++ modules/llama_cpp_plugin/src/CMakeLists.txt | 58 ++ .../llama_cpp_plugin/src/compiled_model.cpp | 732 ++++++++++++++++++ .../llama_cpp_plugin/src/infer_request.cpp | 111 +++ modules/llama_cpp_plugin/src/plugin.cpp | 152 ++++ modules/llama_cpp_plugin/tests/CMakeLists.txt | 37 + .../llama_cpp_plugin/third_party/llama.cpp | 1 + modules/llama_cpp_plugin/tools/CMakeLists.txt | 22 + .../llama_cpp_plugin/tools/cache_embedder.cpp | 53 ++ modules/llama_cpp_plugin/tools/runner.cpp | 73 ++ .../tools/tensor_comparator.cpp | 95 +++ 16 files changed, 1615 insertions(+) create mode 100644 .gitmodules create mode 100644 modules/llama_cpp_plugin/CMakeLists.txt create mode 100755 modules/llama_cpp_plugin/build.sh create mode 100644 modules/llama_cpp_plugin/include/compiled_model.hpp create mode 100644 modules/llama_cpp_plugin/include/infer_request.hpp create mode 100644 modules/llama_cpp_plugin/include/plugin.hpp create mode 100644 modules/llama_cpp_plugin/src/CMakeLists.txt create mode 100644 modules/llama_cpp_plugin/src/compiled_model.cpp create mode 100644 modules/llama_cpp_plugin/src/infer_request.cpp create mode 100644 modules/llama_cpp_plugin/src/plugin.cpp create mode 100644 modules/llama_cpp_plugin/tests/CMakeLists.txt create mode 160000 modules/llama_cpp_plugin/third_party/llama.cpp create mode 100644 modules/llama_cpp_plugin/tools/CMakeLists.txt create mode 100644 modules/llama_cpp_plugin/tools/cache_embedder.cpp create mode 100644 modules/llama_cpp_plugin/tools/runner.cpp create mode 100644 modules/llama_cpp_plugin/tools/tensor_comparator.cpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..29da379f7 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "modules/llama_cpp_plugin/third_party/llama.cpp"] + path = modules/llama_cpp_plugin/third_party/llama.cpp + url = https://github.com/vshampor/llama.cpp diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt new file mode 100644 index 000000000..f5d3284b2 --- /dev/null +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -0,0 +1,32 @@ +cmake_minimum_required(VERSION 3.13) + +project(LlamaCppPlugin) + +find_package(OpenVINODeveloperPackage REQUIRED) + +ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" OFF) + +if(CMAKE_COMPILER_IS_GNUCXX) + ov_add_compiler_flags(-Wall) +endif() + +add_subdirectory(src) +add_subdirectory(tools) + +add_subdirectory(third_party/llama.cpp) + +if(ENABLE_TESTS) + include(CTest) + enable_testing() + + if(ENABLE_FUNCTIONAL_TESTS) + add_subdirectory(tests/functional) + endif() +endif() + + +# install + +if(OpenVINODeveloperPackage_FOUND) + ov_cpack(LlamaCppPlugin) +endif() diff --git a/modules/llama_cpp_plugin/build.sh b/modules/llama_cpp_plugin/build.sh new file mode 100755 index 000000000..fa36b9e03 --- /dev/null +++ b/modules/llama_cpp_plugin/build.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e +# What we want to do is build the llama.cpp dependency for different backends and have a separate plugin for each such build type. 
+# Sadly, CMake does not reliably allow to add_subdirectory multiple times in the same build tree, let alone with different options, +# since this would lead to "duplicate targets". There doesn't seem to be a solution to this problem even still. Thus, will have to +# invoke the cmake configure and build stage separately for each llama.cpp backend type. + +BUILD_TYPE=$1 +COMMON_OPTS="-DOpenVINODeveloperPackage_DIR=/home/vshampor/work/openvino/build -DCMAKE_EXPORT_COMPILE_COMMANDS=1" + +# Regular CPU build of llama.cpp +cmake -S ./ -B ./build/cpu/ ${COMMON_OPTS} "$@" +cmake --build ./build/cpu/ -j --target llama --target llama_cpp_plugin + + +# CUDA build +cmake -S ./ -B ./build/cuda/ -DLLAMA_CUBLAS=1 -DPLUGIN_DEVICE_NAME="LLAMA_CPP_CUDA" -DPLUGIN_LIBRARY_NAME="llama_cpp_cuda_plugin" -DLLAMA_TARGET_NAME="llama_cuda" ${COMMON_OPTS} "$@" +cmake --build ./build/cuda/ -j --target llama_cuda --target llama_cpp_cuda_plugin diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp new file mode 100644 index 000000000..eb785e252 --- /dev/null +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -0,0 +1,84 @@ +#ifndef LLAMA_CPP_COMPILED_MODEL_HPP +#define LLAMA_CPP_COMPILED_MODEL_HPP + +#include "openvino/runtime/icompiled_model.hpp" +#include "openvino/runtime/isync_infer_request.hpp" +#include "llama.h" + +namespace ov { + namespace llama_cpp_plugin { + class LlamaCppSyncInferRequest; + class LlamaCppPlugin; + class LlamaCppModel: public ICompiledModel { + public: + LlamaCppModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const ov::SoPtr& context, + const std::shared_ptr& task_executor + ); + + LlamaCppModel(const std::shared_ptr& ov_model, + std::istream& input_file, + const std::shared_ptr& plugin); + + LlamaCppModel(const std::string& gguf_fname, + const std::shared_ptr& plugin); + /** + * @brief Export compiled model to stream + * + * @param model output stream + */ + virtual void export_model(std::ostream& model) const override; + + /** + * @brief Returns runtime model + * + * @return OpenVINO Model which represents runtime graph + */ + virtual std::shared_ptr get_runtime_model() const override; + + /** + * @brief Allows to set property + * + * @param properties new plugin properties + */ + virtual void set_property(const ov::AnyMap& properties) override; + + /** + * @brief Returns property + * + * @param name Property name + * + * @return Property value + * virtual std::shared_ptr create_sync_infer_request() const override; + **/ + virtual ov::Any get_property(const std::string& name) const override; + virtual const std::vector>& inputs() const override; + virtual const std::vector>& outputs() const override; + protected: + /** + * @brief Method creates infer request implementation + * + * @return Sync infer request + */ + virtual std::shared_ptr create_sync_infer_request() const override; + + private: + std::string get_current_gguf_file_path() const; + gguf_context* m_gguf_ctx = nullptr; + std::string m_converted_gguf_file_name; + + llama_model* m_llama_model_ptr = nullptr; + llama_context* m_llama_ctx = nullptr; + size_t* num_tokens_processed_ptr = nullptr; // TODO: (vshampor) find a better place for this kind of storage + std::shared_ptr m_model; + + std::vector> m_fake_inputs; + std::vector> m_fake_outputs; + + friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest; + }; + } +} // namespace ov + +#endif // LLAMA_CPP_COMPILED_MODEL_HPP diff --git 
a/modules/llama_cpp_plugin/include/infer_request.hpp b/modules/llama_cpp_plugin/include/infer_request.hpp new file mode 100644 index 000000000..b6314010b --- /dev/null +++ b/modules/llama_cpp_plugin/include/infer_request.hpp @@ -0,0 +1,31 @@ +#ifndef LLAMA_CPP_INFER_REQUEST_HPP +#define LLAMA_CPP_INFER_REQUEST_HPP + +#include "openvino/openvino.hpp" +#include "compiled_model.hpp" + +namespace ov { +namespace llama_cpp_plugin { + +class LlamaCppSyncInferRequest : public ISyncInferRequest { +public: + explicit LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model); + // explicit LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model): ov::ISyncInferRequest(compiled_model) { + // std::cout << "VSHAMPOR: infer request ctor called\n"; + // } + virtual ~LlamaCppSyncInferRequest() {}; + + virtual void set_tensors_impl(const ov::Output port, + const std::vector>& tensors) override; + + virtual void infer() override; + virtual std::vector get_profiling_info() const override; + virtual std::vector> query_state() const override; +private: + std::shared_ptr m_compiled_model_ptr; +}; + +} // namespace LlamaCppPlugin +}; // namespace ov + +#endif /* LLAMA_CPP_INFER_REQUEST_HPP */ diff --git a/modules/llama_cpp_plugin/include/plugin.hpp b/modules/llama_cpp_plugin/include/plugin.hpp new file mode 100644 index 000000000..aea32ea1f --- /dev/null +++ b/modules/llama_cpp_plugin/include/plugin.hpp @@ -0,0 +1,112 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef LLAMA_CPP_PLUGIN_HPP +#define LLAMA_CPP_PLUGIN_HPP + +#include "openvino/runtime/iplugin.hpp" + +namespace ov { + namespace llama_cpp_plugin { + class LlamaCppPlugin : public IPlugin { + public: + LlamaCppPlugin(); + /** + * @brief Compiles model from ov::Model object + * @param model A model object acquired from ov::Core::read_model or source construction + * @param properties A ov::AnyMap of properties relevant only for this load operation + * @return Created Compiled Model object + */ + virtual std::shared_ptr compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const override; + + + /** + * @brief Compiles model from ov::Model object, on specified remote context + * @param model A model object acquired from ov::Core::read_model or source construction + * @param properties A ov::AnyMap of properties relevant only for this load operation + * @param context A pointer to plugin context derived from RemoteContext class used to + * execute the model + * @return Created Compiled Model object + */ + virtual std::shared_ptr compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties, + const ov::SoPtr& context) const override; + + /** + * @brief Sets properties for plugin, acceptable keys can be found in openvino/runtime/properties.hpp + * @param properties ov::AnyMap of properties + */ + virtual void set_property(const ov::AnyMap& properties) override; + + /** + * @brief Gets properties related to plugin behaviour. + * + * @param name Property name. + * @param arguments Additional arguments to get a property. + * + * @return Value of a property corresponding to the property name. + */ + virtual ov::Any get_property(const std::string& name, const ov::AnyMap& arguments) const override; + + /** + * @brief Creates a remote context instance based on a map of properties + * @param remote_properties Map of device-specific shared context remote properties. 
+ * + * @return A remote context object + */ + virtual ov::SoPtr create_context(const ov::AnyMap& remote_properties) const override; + + /** + * @brief Provides a default remote context instance if supported by a plugin + * @param remote_properties Map of device-specific shared context remote properties. + * + * @return The default context. + */ + virtual ov::SoPtr get_default_context(const ov::AnyMap& remote_properties) const override; + + /** + * @brief Creates an compiled model from an previously exported model using plugin implementation + * and removes OpenVINO Runtime magic and plugin name + * @param model Reference to model output stream + * @param properties A ov::AnyMap of properties + * @return An Compiled model + */ + virtual std::shared_ptr import_model(std::istream& model, + const ov::AnyMap& properties) const override; + + + virtual std::shared_ptr compile_model(const std::string& fname, + const ov::AnyMap& properties) const override; + + /** + * @brief Creates an compiled model from an previously exported model using plugin implementation + * and removes OpenVINO Runtime magic and plugin name + * @param model Reference to model output stream + * @param context A pointer to plugin context derived from RemoteContext class used to + * execute the network + * @param properties A ov::AnyMap of properties + * @return An Compiled model + */ + virtual std::shared_ptr import_model(std::istream& model, + const ov::SoPtr& context, + const ov::AnyMap& properties) const override; + + /** + * @brief Queries a plugin about supported layers in model + * @param model Model object to query. + * @param properties Optional map of pairs: (property name, property value). + * @return An object containing a map of pairs an operation name -> a device name supporting this operation. 
+ */ + virtual ov::SupportedOpsMap query_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const override; + + std::string get_current_gguf_file_path() const; + private: + std::string m_cache_dir = "./"; + }; + } // namespace llama_cpp_plugin +} // namespace ov + +#endif // LLAMA_CPP_PLUGIN_HPP diff --git a/modules/llama_cpp_plugin/src/CMakeLists.txt b/modules/llama_cpp_plugin/src/CMakeLists.txt new file mode 100644 index 000000000..5ec2caee7 --- /dev/null +++ b/modules/llama_cpp_plugin/src/CMakeLists.txt @@ -0,0 +1,58 @@ +set( PLUGIN_LIBRARY_NAME CACHE STRING "Library name for the generated plugin" ${TARGET_NAME}) +if(NOT PLUGIN_LIBRARY_NAME) + set( PLUGIN_LIBRARY_NAME "llama_cpp_plugin" ) +endif() + +set( PLUGIN_DEVICE_NAME CACHE STRING "Device name for the resulting plugin") +if(NOT PLUGIN_DEVICE_NAME) + set( PLUGIN_DEVICE_NAME "LLAMA_CPP" ) +endif() + +set(TARGET_NAME ${PLUGIN_LIBRARY_NAME}) + +file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +file(GLOB_RECURSE HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) + +if (NOT ENABLE_TEMPLATE_REGISTRATION) + # Skip install and registration of template component + set(skip_plugin SKIP_INSTALL SKIP_REGISTRATION) +endif() + + + +# adds a shared library with plugin +ov_add_plugin(NAME ${TARGET_NAME} + DEVICE_NAME ${PLUGIN_DEVICE_NAME} + SOURCES ${SOURCES} ${HEADERS} + ${skip_plugin} + VERSION_DEFINES_FOR plugin.cpp + ADD_CLANG_FORMAT) + +# Enable support of CC for the plugin +ov_mark_target_as_cc(${TARGET_NAME}) + +target_include_directories(${TARGET_NAME} PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}" + "${LlamaCppPlugin_SOURCE_DIR}/include") + +# link common OpenVINO Runtime libraries +target_link_libraries(${TARGET_NAME} PRIVATE + openvino::interpreter_backend + openvino::reference) + +set( LLAMA_TARGET_NAME CACHE STRING "Exact target exposed by llama.cpp to link against as the main llama.cpp library") +if(NOT LLAMA_TARGET_NAME) + set( LLAMA_TARGET_NAME "llama" ) +endif() + +# include and link llama.cpp and ggml code +target_link_libraries(${TARGET_NAME} PRIVATE ${LLAMA_TARGET_NAME}) +target_link_libraries(${TARGET_NAME} PRIVATE ggml) + + +set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) + +if (ENABLE_TEMPLATE_REGISTRATION) + # Update the plugins.xml file + ov_register_plugins(MAIN_TARGET ${TARGET_NAME}) +endif() diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp new file mode 100644 index 000000000..932c0def4 --- /dev/null +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -0,0 +1,732 @@ +#include "compiled_model.hpp" +#include "plugin.hpp" +#include "infer_request.hpp" +#include +#include +#include +#include +#include + +namespace ov { + namespace llama_cpp_plugin { + class TensorWeightMatcher { + public: + // TODO (vshampor) implement this for faster weight node matching. 
+ // Use std::list, two passes - first for full name match, second for prefix-match; remove entries from list on match + using RTInfoTensorName = std::string; + using OvNodeName = std::string; + using LlamaTensorName = std::string; + + TensorWeightMatcher(const std::shared_ptr& model, std::map tensor_names_with_shapes_to_match) { + std::multimap> intermediate_matches_map; + + const auto node_vector = model->get_ops(); + std::list> const_nodes_in_model; + for (const auto& node_ptr : node_vector) { + if (ov::is_type(node_ptr)) const_nodes_in_model.push_back(ov::as_type_ptr(node_ptr)); + } + + // full substring match pass + std::map unmatched_rt_info_names_on_first_pass = extract_matches(intermediate_matches_map, tensor_names_with_shapes_to_match, const_nodes_in_model, + [](const std::string& substring, const std::string& source) { return source.find(substring) != std::string::npos; }); + + // prefix substring match pass + std::map unmatched_rt_info_names_on_second_pass = extract_matches(intermediate_matches_map, unmatched_rt_info_names_on_first_pass, const_nodes_in_model, + [](const std::string& substring, const std::string& source) { + return source.find(get_weight_name_without_torch_postfix(substring)) != std::string::npos; }); + + for (auto it = intermediate_matches_map.begin(); it != intermediate_matches_map.end(); it = intermediate_matches_map.upper_bound(it->first)) { + // TODO: perf improvement by iterating with ++; + RTInfoTensorName rt_info_name = it->first; + if (intermediate_matches_map.count(rt_info_name) != 1) { + std::cout << "VSHAMPOR: multiple matches for weight name " << rt_info_name << " and shape " << it->second->get_shape().to_string() << ", found "; + auto range_it_pair = intermediate_matches_map.equal_range(rt_info_name); + for (auto multimatch_it = range_it_pair.first; multimatch_it != range_it_pair.second; multimatch_it++) { + auto node_ptr = multimatch_it->second; + std::cout << node_ptr->get_friendly_name() << "(shape " << node_ptr->get_shape().to_string() << "),"; + } + std::cout << "will take the first match" << std::endl; + } + const auto& match = intermediate_matches_map.find(rt_info_name)->second; + m_rtinfo_name_to_weight_node_map[rt_info_name] = match; + } + if (!unmatched_rt_info_names_on_second_pass.empty()) { + std::cout << "VSHAMPOR: did not find the weight node for " << unmatched_rt_info_names_on_second_pass.size() << " weights:" << std::endl; + } + for (const auto& unmatched_entry: unmatched_rt_info_names_on_second_pass) { + std::cout << '\t' << unmatched_entry.first << std::endl; + } + } + + std::unordered_map> get_matches() { return m_rtinfo_name_to_weight_node_map; } + + private: + std::map extract_matches(std::multimap>& output_matches_map, + const std::map& names_with_shapes_to_match, + const std::list>& search_list, + std::function name_match_predicate) { + std::map unmatched_rt_info_names; + for (const auto& pair: names_with_shapes_to_match) { + RTInfoTensorName rt_info_name = pair.first; + const ov::Shape& wanted_shape = pair.second; + bool matched = false; + for (auto it = search_list.begin(); it != search_list.end(); it++) { + auto node_ptr = *it; + const std::string& friendly_name = node_ptr->get_friendly_name(); + if (name_match_predicate(rt_info_name, friendly_name) && + node_ptr->get_shape() == wanted_shape) { + output_matches_map.insert(std::make_pair(rt_info_name, node_ptr)); + matched = true; + break; + } + } + if (!matched) unmatched_rt_info_names.insert(pair); + } + return unmatched_rt_info_names; + } + + static std::string 
get_weight_name_without_torch_postfix(std::string torch_weight_name) { + size_t idx = torch_weight_name.rfind("."); + if (idx == std::string::npos) return torch_weight_name; + return std::string(torch_weight_name, 0, idx); + } + + size_t num_exact_matches = 0; + size_t num_partial_matches = 0; + std::unordered_map> m_rtinfo_name_to_weight_node_map; + }; + + + std::vector> get_nodes_containing_name_with_shape(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { + auto ops = model->get_ops(); + std::vector> found_weight_nodes; + std::copy_if(ops.begin(), ops.end(), std::back_inserter(found_weight_nodes), + [&weight_name, &shape](const std::shared_ptr& val) { + if (!ov::is_type(val)) return false; + std::shared_ptr node_ptr = ov::as_type_ptr(val); + return val->get_friendly_name().find(weight_name) != std::string::npos && + val->get_shape() == shape; + }); + return found_weight_nodes; + } + + bool has_weight_matches(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { + std::vector> found_weight_nodes; + found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); + return !found_weight_nodes.empty(); + } + + std::string get_weight_name_without_torch_postfix(std::string torch_weight_name) { + size_t idx = torch_weight_name.rfind("."); + if (idx == std::string::npos) return torch_weight_name; + return std::string(torch_weight_name, 0, idx); + } + + bool has_partial_weight_matches(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { + std::vector> found_weight_nodes; + found_weight_nodes = get_nodes_containing_name_with_shape(model, get_weight_name_without_torch_postfix(weight_name), shape); + return !found_weight_nodes.empty(); + } + + std::shared_ptr get_weight_by_name_and_shape(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { + OPENVINO_ASSERT(has_weight_matches(model, weight_name, shape)); + std::vector> found_weight_nodes; + found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); + + if (found_weight_nodes.size() > 1) { + std::cout << "VSHAMPOR: multiple matches for weight name " << weight_name << " and shape " << shape.to_string() << ", found "; + for (const auto& node_ptr : found_weight_nodes) { + std::cout << node_ptr->get_friendly_name() << "(shape " << shape.to_string() << "),"; + } + std::cout << "will take the first match" << std::endl; + } + std::shared_ptr node_with_tensor = found_weight_nodes.front(); + OPENVINO_ASSERT(ov::is_type(node_with_tensor)); + std::shared_ptr const_node_ptr = ov::as_type_ptr(node_with_tensor); + return const_node_ptr; + } + + using TransposePermutation = std::pair; + + std::vector expand_front(const std::vector& vec, size_t val) { + OPENVINO_ASSERT(vec.size() < GGML_MAX_DIMS); + std::vector retval(GGML_MAX_DIMS, val); + std::copy(vec.rbegin(), vec.rend(), retval.rbegin()); + return retval; + } + + void write_float_plus_one(std::ofstream& out, const float* src) { + float elt = *src; + elt += 1; + out.write((const char*) &elt, sizeof(float)); + } + + void append_tensor_data_with_transpositions(const std::string& fname, const std::vector& tensor_infos, const std::vector& tensor_data_ptrs, + const std::map& transpositions, const std::set increment_by_one_tensor_names) { + // assuming contiguous data underneath each pointer from tensor_data_ptrs + OPENVINO_ASSERT(tensor_infos.size() == tensor_data_ptrs.size()); + std::ofstream out(fname, std::ios::app | 
std::ios::out); + for (size_t i = 0; i < tensor_infos.size(); i++) { + const auto& tensor_info = tensor_infos[i]; + OPENVINO_ASSERT(tensor_info.type == GGML_TYPE_F32); // TODO (vshampor): writing transposed tensor data for other data types, especially lower-bitwidth; maybe use OV inference for that + + const char* ir_tensor_data = reinterpret_cast(tensor_data_ptrs[i]); + + std::string tensor_llama_name = std::string(tensor_info.name.data); + auto it = transpositions.find(tensor_llama_name); + if (it == transpositions.end()) { + // original IR tensor should not be transposed to conform to GGUF expectations, can write as-is + if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case + size_t elt_size = sizeof(float); // FP32 only for now + OPENVINO_ASSERT(!(tensor_info.size % elt_size)); + size_t num_elts = tensor_info.size / elt_size; + for (size_t elt_idx = 0; elt_idx < num_elts; elt_idx++) { + write_float_plus_one(out, ((float*) ir_tensor_data) + elt_idx); + } + } + else { + out.write(ir_tensor_data, tensor_info.size); + } + continue; + } + + if (it != transpositions.end()) { + std::vector gguf_layout_shape; + + // the shape in .ne is inverted w.r.t original export (~= IR) weight layout + for (size_t dim_idx = 0; dim_idx < tensor_info.n_dims; dim_idx++) { + gguf_layout_shape.push_back(tensor_info.ne[GGML_MAX_DIMS - 1 - tensor_info.n_dims - dim_idx]); + } + + TransposePermutation permutation = it->second; + std::vector ir_layout_shape(gguf_layout_shape); + std::swap(ir_layout_shape[permutation.first], ir_layout_shape[permutation.second]); + + std::vector ir_layout_strides(tensor_info.n_dims, 1); + + for (size_t idx = 0; idx < tensor_info.n_dims - 1 ; idx++) { + auto previous_stride_it = ir_layout_strides.rbegin() + idx; + auto stride_it = ir_layout_strides.rbegin() + idx + 1; + auto shape_it = ir_layout_shape.rbegin() + idx; + *stride_it = *shape_it * *previous_stride_it; + } + + + std::vector permuted_strides(ir_layout_strides); + std::swap(permuted_strides[permutation.first], permuted_strides[permutation.second]); + + // expand up to GGML_MAX_DIMS + std::vector gguf_layout_shape_ex = expand_front(gguf_layout_shape, 1); + // stride for unused dims will be 0, has no effect on loop because dimension idx for that dim is always 0 + permuted_strides = expand_front(permuted_strides, 0); + + + + std::cout << "VSHAMPOR: writing tensor " << tensor_info.name.data << " with size " << tensor_info.size; + std::cout << " shape (GGUF layout) "; + for (auto dim: gguf_layout_shape) std::cout << dim << ","; + std::cout << " shape (IR layout) "; + for (auto dim : ir_layout_shape) std::cout << dim << ","; + std::cout << " stride (IR layout) "; + for (auto stride : ir_layout_strides) std::cout << stride << ","; + std::cout << " stride (IR layout, transposing) "; + for (auto stride : permuted_strides) std::cout << stride << ","; + std::cout << std::endl; + + // TODO (vshampor): rewrite the loop below using recurrent templates? 
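+                    // The four nested loops below iterate over the destination tensor in its GGUF (reversed)
+                    // dimension order and map every index back into the source IR buffer through the permuted
+                    // strides computed above, so the weight data is written out transposed one element at a time.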
+ // This relies on GGUF_MAX_DIMS == 4 and unused dims being equal to 1 + size_t current_offset = 0; + size_t element_size = sizeof(float); + size_t num_bytes_written = 0; + for (size_t dim_0 = 0; dim_0 < gguf_layout_shape_ex[0]; dim_0++) + for (size_t dim_1 = 0; dim_1 < gguf_layout_shape_ex[1]; dim_1++) + for (size_t dim_2 = 0; dim_2 < gguf_layout_shape_ex[2]; dim_2++) + for (size_t dim_3 = 0; dim_3 < gguf_layout_shape_ex[3]; dim_3++) { + current_offset = element_size * (dim_0 * permuted_strides[0] + dim_1 * permuted_strides[1] + dim_2 * permuted_strides[2] + dim_3 * permuted_strides[3]); + if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case + write_float_plus_one(out, (float*) ir_tensor_data + current_offset); + } + else { + out.write(ir_tensor_data + current_offset, element_size); + } + num_bytes_written += element_size; + } + std::cout << "VSHAMPOR: wrote " << num_bytes_written << std::endl; + OPENVINO_ASSERT(num_bytes_written == tensor_info.size); + } + } + } + + struct ValueStorageForLifetimeExtension { + std::list kv_key_string_storage; + std::list kv_value_string_storage; + std::list> str_arr_storage; + void* store_gguf_value_vector(const std::vector& vec, gguf_type g_type) { + size_t elt_size; + switch (g_type) { + case GGUF_TYPE_UINT8: elt_size = sizeof(uint8_t); break; + case GGUF_TYPE_INT8: elt_size = sizeof(int8_t); break; + case GGUF_TYPE_UINT16: elt_size = sizeof(uint16_t); break; + case GGUF_TYPE_INT16: elt_size = sizeof(int16_t); break; + case GGUF_TYPE_UINT32: elt_size = sizeof(uint32_t); break; + case GGUF_TYPE_INT32: elt_size = sizeof(int32_t); break; + case GGUF_TYPE_FLOAT32: elt_size = sizeof(float); break; + case GGUF_TYPE_UINT64: elt_size = sizeof(uint64_t); break; + case GGUF_TYPE_INT64: elt_size = sizeof(int64_t); break; + case GGUF_TYPE_FLOAT64: elt_size = sizeof(double); break; + case GGUF_TYPE_BOOL: elt_size = sizeof(bool); break; + default: + OPENVINO_THROW("Unknown array type"); + } + size_t size_in_bytes = vec.size() * elt_size; + void* mem_ptr = new char[size_in_bytes]; + for (size_t i = 0; i < vec.size(); i++) { + switch (g_type) { + case GGUF_TYPE_UINT8: ((uint8_t*) mem_ptr)[i] = vec[i].uint8; break; + case GGUF_TYPE_INT8: ((int8_t*) mem_ptr)[i] = vec[i].int8; break; + case GGUF_TYPE_UINT16: ((uint16_t*) mem_ptr)[i] = vec[i].uint16; break; + case GGUF_TYPE_INT16: ((int16_t*) mem_ptr)[i] = vec[i].int16; break; + case GGUF_TYPE_UINT32: ((uint32_t*) mem_ptr)[i] = vec[i].uint32; break; + case GGUF_TYPE_INT32: ((int32_t*) mem_ptr)[i] = vec[i].int32; break; + case GGUF_TYPE_FLOAT32: ((float*) mem_ptr)[i] = vec[i].float32; break; + case GGUF_TYPE_UINT64: ((uint64_t*) mem_ptr)[i] = vec[i].uint64; break; + case GGUF_TYPE_INT64: ((int64_t*) mem_ptr)[i] = vec[i].int64; break; + case GGUF_TYPE_FLOAT64: ((double*) mem_ptr)[i] = vec[i].float64; break; + case GGUF_TYPE_BOOL: ((bool*) mem_ptr)[i] = vec[i].bool_; break; + default: + OPENVINO_THROW("Unknown array type"); + } + } + return mem_ptr; + } + + ValueStorageForLifetimeExtension() = default; + ~ValueStorageForLifetimeExtension() { + for (void* ptr: non_str_raw_storage) { + delete[] (char*) ptr; + } + } + private: + std::list non_str_raw_storage; + }; + + bool maybe_parse_single_element(gguf_type g_type, ov::Any rtmap_value, gguf_value& dst, ValueStorageForLifetimeExtension& store) { + switch (g_type) { + case GGUF_TYPE_UINT8: dst.uint8 = rtmap_value.as(); break; + case GGUF_TYPE_INT8: dst.int8 = rtmap_value.as(); ; break; + case GGUF_TYPE_UINT16: dst.uint16 = rtmap_value.as(); break; + 
case GGUF_TYPE_INT16: dst.int16 = rtmap_value.as(); break; + case GGUF_TYPE_UINT32: dst.uint32 = rtmap_value.as(); break; + case GGUF_TYPE_INT32: dst.int32 = rtmap_value.as(); break; + case GGUF_TYPE_FLOAT32: dst.float32 = rtmap_value.as(); break; + case GGUF_TYPE_UINT64: dst.uint64 = rtmap_value.as(); break; + case GGUF_TYPE_INT64: dst.int64 = rtmap_value.as(); break; + case GGUF_TYPE_FLOAT64: dst.float64 = rtmap_value.as(); break; + case GGUF_TYPE_BOOL: dst.bool_ = rtmap_value.as(); break; + case GGUF_TYPE_STRING: { + std::string string_value = rtmap_value.as(); + store.kv_value_string_storage.push_back(string_value); + dst.str.n = string_value.length(); + dst.str.data = (char*) store.kv_value_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below + break; + } + default: + return false; // did not parse + } + return true; // parsed successfully + } + + ov::Any get_any_associated_with_gguf_type(gguf_type g_type) { + switch (g_type) { + case GGUF_TYPE_UINT8: return ov::Any(uint8_t()); + case GGUF_TYPE_INT8: return ov::Any(int8_t()); + case GGUF_TYPE_UINT16: return ov::Any(uint16_t()); + case GGUF_TYPE_INT16: return ov::Any(int16_t()); + case GGUF_TYPE_UINT32: return ov::Any(uint32_t()); + case GGUF_TYPE_INT32: return ov::Any(int32_t()); + case GGUF_TYPE_FLOAT32: return ov::Any(float()); + case GGUF_TYPE_UINT64: return ov::Any(uint64_t()); + case GGUF_TYPE_INT64: return ov::Any(int64_t()); + case GGUF_TYPE_FLOAT64: return ov::Any(double()); + case GGUF_TYPE_BOOL: return ov::Any(bool()); + case GGUF_TYPE_STRING: return ov::Any(std::string()); + default: + OPENVINO_THROW("Unknown gguf_type to turn into ov::Any"); + } + } + + + LlamaCppModel::LlamaCppModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const ov::SoPtr& context, + const std::shared_ptr& task_executor + ) : ICompiledModel(model, plugin, context, task_executor) { + m_model = model; + num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove + *num_tokens_processed_ptr = 0; + auto rt_info = model->get_rt_info(); + OPENVINO_ASSERT(rt_info.count("lcp_kv_params") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_kv_types") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_kv_array_types") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_tensor_name_map") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_tensor_shape_map") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_expected_tensor_shapes") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_transpose_permutations") != 0); + + RTMap& kv_params = model->get_rt_info("lcp_kv_params"); + RTMap& kv_types = model->get_rt_info("lcp_kv_types"); + RTMap& kv_array_types = model->get_rt_info("lcp_kv_array_types"); + RTMap& tensor_name_map = model->get_rt_info("lcp_tensor_name_map"); + RTMap& tensor_shape_map = model->get_rt_info("lcp_tensor_shape_map"); + RTMap& expected_tensor_shapes_map = model->get_rt_info("lcp_expected_tensor_shapes"); + RTMap& transpose_permutations_rtmap = model->get_rt_info("lcp_transpose_permutations"); + + size_t gguf_version = model->get_rt_info("lcp_gguf_version"); + std::cout << "VSHAMPOR: parsed gguf_version " << gguf_version << std::endl; + + // kv params + OPENVINO_ASSERT(kv_params.size() == kv_types.size()); + size_t n_kv = kv_params.size(); + std::vector kv_vector; + ValueStorageForLifetimeExtension store; + + for (const auto& kv_pair: kv_params) { + gguf_kv kv; + + const auto& key = kv_pair.first; + kv.key.n = key.length(); + store.kv_key_string_storage.push_back(key); + kv.key.data = (char*) store.kv_key_string_storage.back().c_str(); // 
TODO (vshampor) see equivalent case below + + uint32_t value_type = kv_types[key].as(); + gguf_type gguf_value_type = (gguf_type) value_type; + kv.type = gguf_value_type; + if (gguf_value_type != GGUF_TYPE_ARRAY) { + bool is_parsed = maybe_parse_single_element(kv.type, kv_pair.second, kv.value, store); + OPENVINO_ASSERT(is_parsed, "Invalid type of a GGUF kv-value"); + } + else { // array case + gguf_type element_type = (gguf_type) kv_array_types[key].as(); + kv.value.arr.type = element_type; + std::string serialized_array = kv_pair.second.as(); + std::stringstream ss{serialized_array}; + std::vector parsed_array; + while (!ss.eof()) { + gguf_value array_elt; + ov::Any ov_any = get_any_associated_with_gguf_type(element_type); + std::string token; ss >> token; + if (std::string(kv.key.data) == "tokenizer.ggml.merges") { + // tokenizer merges are pairs of tokens separated by whitespace, so need to read another to get a proper merge + // TODO (vshampor): think of another delimiting strategy in the rt_info and use that strategy here for more robust code + std::string another_token; ss >> another_token; + token += std::string(" ") + another_token; + ov_any = ov::Any::make(token); + } + else { + std::stringstream tok_ss{token}; + ov_any.read(tok_ss); + } + bool is_parsed = maybe_parse_single_element(element_type, ov_any, array_elt, store); + OPENVINO_ASSERT(is_parsed); + parsed_array.push_back(array_elt); + } + kv.value.arr.n = parsed_array.size(); + if (element_type == GGUF_TYPE_STRING) { + // string element has already been lifetime-extended during parsing + std::vector cstr_vector(parsed_array.size()); + for (size_t cstr_idx = 0; cstr_idx < parsed_array.size(); cstr_idx++) { + cstr_vector[cstr_idx] = parsed_array[cstr_idx].str.data; + } + store.str_arr_storage.push_back(cstr_vector); + kv.value.arr.data = store.str_arr_storage.back().data(); + } + else { + void* data_ptr = store.store_gguf_value_vector(parsed_array, element_type); + kv.value.arr.data = data_ptr; + } + } + kv_vector.push_back(kv); + } + + auto token_types_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { return std::string(val.key.data) == "tokenizer.ggml.token_type"; }); + if (token_types_kv_it != kv_vector.end()) { + auto tokens_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { return std::string(val.key.data) == "tokenizer.ggml.tokens"; }); + if (tokens_kv_it != kv_vector.end()) { + size_t expected_num_tokens = token_types_kv_it->value.arr.n; + size_t actual_num_tokens = tokens_kv_it->value.arr.n; + if (actual_num_tokens < expected_num_tokens) { + std::cout << "VSHAMPOR: detected wrong vocab serialization/deserialization (expected " << expected_num_tokens << " tokens, parsed " << actual_num_tokens << " from vocab), filling tokens with bogus values" << std::endl; + std::vector new_vocab; + // char** old_vocab_data_ptr = (char**) tokens_kv_it->value.arr.data; + // std::copy(old_vocab_data_ptr, old_vocab_data_ptr + actual_num_tokens, new_vocab.begin()); + // size_t extra_tokens_needed = expected_num_tokens - actual_num_tokens; + size_t extra_tokens_needed = expected_num_tokens; + for (size_t tok_idx = 0; tok_idx < extra_tokens_needed; tok_idx++) { + std::stringstream ss; + ss << "invalid_token_" << tok_idx; + std::string new_token = ss.str(); + store.kv_value_string_storage.push_back(new_token); + char* str_data_ptr = (char*) store.kv_value_string_storage.back().c_str(); + new_vocab.push_back(str_data_ptr); + } + OPENVINO_ASSERT(new_vocab.size() == 
expected_num_tokens); + store.str_arr_storage.push_back(new_vocab); + tokens_kv_it->value.arr.data = (void*) store.str_arr_storage.back().data(); + tokens_kv_it->value.arr.n = expected_num_tokens; + } + } + } + + // tensors + OPENVINO_ASSERT(tensor_name_map.size() == tensor_shape_map.size()); + size_t n_tensors_in_rtinfo = tensor_name_map.size(); + std::cout << "VSHAMPOR: got request for " << n_tensors_in_rtinfo << " tensors from rt_info\n"; + + std::vector tensor_infos; + std::vector tensor_data_ptrs; + + std::map parsed_weights_to_search_for; + for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { + const std::string& llama_name = llama_name_and_rtinfo_name.first; + const std::string& rtinfo_name = llama_name_and_rtinfo_name.second.as(); + ov::Shape expected_shape = tensor_shape_map[llama_name].as(); + parsed_weights_to_search_for[rtinfo_name] = expected_shape; + } + + TensorWeightMatcher matcher{model, parsed_weights_to_search_for}; + std::unordered_map> matches = matcher.get_matches(); + std::unordered_map> llama_name_to_constant_node_map; + for (const auto& entry : tensor_name_map) { + const auto& llama_name = entry.first; + const auto& rtinfo_name = entry.second.as(); + llama_name_to_constant_node_map[llama_name] = matches[rtinfo_name]; + } + std::cout << "VSHAMPOR: requested tensors map to " << llama_name_to_constant_node_map.size() << " tensors to search in model (shared tensors considered)\n"; + + + std::list llama_name_storage; + + size_t n_tensors = 0; + + size_t offset = 0; // each tensor_info has to have a correct offset including padding, checked for in gguf_write_to_buf + for (const auto& matched_weight_pair : llama_name_to_constant_node_map) { + // Need to store the names in the list so that the passed c_str() pointers in tensor_infos to the llama names stay valid + // until they get deepcopied in gguf/llama functions + llama_name_storage.push_back(matched_weight_pair.first); + const std::string& llama_name = llama_name_storage.back(); + + auto weight_const_node_ptr = matched_weight_pair.second; + auto weight_shape = weight_const_node_ptr->get_shape(); + + // does hf-to-gguf invert all tensor dimensions with shapes > 1? + auto expected_weight_shape = ov::Shape(expected_tensor_shapes_map[llama_name].as()); + OPENVINO_ASSERT(expected_weight_shape.size() < GGML_MAX_DIMS); + + gguf_tensor_info info; + + info.type = GGML_TYPE_F32; // TODO (vshampor): better type assignment based on actual element type of the Constant node + + info.name.n = llama_name.length(); + info.name.data = (char*) llama_name.c_str(); // TODO (vshampor): either do this via const_cast, or will have to implement own structures for + // read-only data passing to llama_load_model_from_data + info.n_dims = weight_shape.size(); + std::fill(std::begin(info.ne), std::begin(info.ne) + GGML_MAX_DIMS, (uint64_t) 1); + + // looks like GGUF expects inverse order of dimensions when compared to e.g. 
torch and actual row-major layout, see gguf.gguf_writer.GGUFWriter.add_tensor_info + // in gguf python package + std::copy(expected_weight_shape.rbegin(), expected_weight_shape.rend(), info.ne); + + void* data_ptr = (void*)(weight_const_node_ptr->get_data_ptr()); // TODO (vshampor): danger - casts `const` away + // also - the expected_weight_shape is in general different from actual ov::Tensor shape, + // in particular it may be transposed, so we actually need to set the pointers to shape-corrected + // tensor storage, which we don't do here - we are only preparing this data to get a convenient + // gguf_context object to reuse metadata (header) writing code, tensor data transpositions will be done during + // actual file write + + info.size = weight_const_node_ptr->get_byte_size(); + info.offset = offset; + + const size_t size_pad = GGML_PAD(info.size, GGUF_DEFAULT_ALIGNMENT); + offset += size_pad; + + info.data = data_ptr; + + tensor_infos.push_back(info); + tensor_data_ptrs.push_back(data_ptr); + n_tensors++; + } + + std::cout << "VSHAMPOR: found " << matches.size() << "/" << parsed_weights_to_search_for.size() << " tensors" << std::endl; + + gguf_init_params gguf_params; + gguf_params.no_alloc = false; + gguf_params.ctx = nullptr; + + m_gguf_ctx = gguf_init_from_data(n_tensors, tensor_infos.data(), n_kv, kv_vector.data(), tensor_data_ptrs.data(), gguf_params); + + std::shared_ptr llama_plugin_ptr = std::dynamic_pointer_cast(plugin); + m_converted_gguf_file_name = llama_plugin_ptr->get_current_gguf_file_path(); + + std::cout << "VSHAMPOR: output filename is " << m_converted_gguf_file_name << std::endl; + std::cout << "VSHAMPOR: writing metadata (GGUF header) " << std::endl; + gguf_write_to_file(m_gguf_ctx, m_converted_gguf_file_name.c_str(), /* only_meta = */ true); + + std::map transpose_permutations; + + for (const auto& llama_name_and_permutation : transpose_permutations_rtmap) { + std::string permutation_str = llama_name_and_permutation.second.as(); + std::stringstream ss(permutation_str); + TransposePermutation permutation; + bool is_ok = true; + is_ok &= static_cast(ss >> permutation.first); + is_ok &= static_cast(ss >> permutation.second); + OPENVINO_ASSERT(is_ok, "failed to read permutation"); + transpose_permutations[llama_name_and_permutation.first] = permutation; + } + + std::set gemma_tensor_names_to_increment; + // FIXME (vshampor): tried setting up commands for incrementing *_norm.weight values by 1 like it is done + // during llama.cpp HF-to-GGUF export, but it seems that it isn't necessary and IR stores the incremented weights already + // Is this due to constant folding? + + // for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { + // const std::string& llama_name = llama_name_and_rtinfo_name.first; + // const std::string& rtinfo_name = llama_name_and_rtinfo_name.second.as(); + // std::string gemma_norm_suffix = "norm.weight"; + // if (rtinfo_name.size() < gemma_norm_suffix.size()) continue; + // if (rtinfo_name.substr(rtinfo_name.size() - gemma_norm_suffix.size()) == gemma_norm_suffix) gemma_tensor_names_to_increment.insert(llama_name); + // } + + std::cout << "VSHAMPOR: writing tensor data (blob with transpositions) " << std::endl; + append_tensor_data_with_transpositions(m_converted_gguf_file_name, tensor_infos, tensor_data_ptrs, transpose_permutations, gemma_tensor_names_to_increment); + std::cout << "VSHAMPOR: write finished." << m_converted_gguf_file_name << std::endl; + + std::cout << "VSHAMPOR: loading llama model from written file..." 
<< std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(m_converted_gguf_file_name.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + + std::cout << "VSHAMPOR: llama model loaded successfully..." << std::endl; + } + + + LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model, std::istream& input_stream, const std::shared_ptr& plugin) : + ICompiledModel(ov_model, plugin) { + num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove + *num_tokens_processed_ptr = 0; + std::shared_ptr llama_plugin = std::dynamic_pointer_cast(plugin); + std::string current_file_path = llama_plugin->get_current_gguf_file_path(); + std::ofstream output_stream(current_file_path, std::ios::binary); + output_stream << input_stream.rdbuf(); + + + std::cout << "VSHAMPOR: loading llama model from imported and re-written file..." << std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(current_file_path.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + std::cout << "VSHAMPOR: llama model loaded successfully from cache..." << std::endl; + } + + LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin) : + ICompiledModel(nullptr, plugin) { + num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove + *num_tokens_processed_ptr = 0; + std::cout << "VSHAMPOR: loading llama model directly from GGUF... " << std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(gguf_fname.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + std::cout << "VSHAMPOR: llama model loaded successfully from GGUF..." 
<< std::endl; + + auto input_ids = std::make_shared(ov::element::Type_t::i64, ov::PartialShape({-1, -1})); + auto fake_convert = std::make_shared(input_ids->output(0), ov::element::Type_t::f32); + auto logits = std::make_shared(fake_convert->output(0)); + + ov::ParameterVector inputs{input_ids}; + + std::vector> unused_names_in_order = { { "attention_mask", ov::element::Type_t::i64 }, + { "position_ids", ov::element::Type_t::i64 }, + { "beam_idx", ov::element::Type_t::i32 } }; + for (const auto& descr : unused_names_in_order) { + auto unused_inp = std::make_shared(descr.second, ov::PartialShape({-1, -1})); + inputs.push_back(unused_inp); + } + + m_model = std::make_shared(logits, inputs, "fake_ov_model_for_io_specification"); + + m_model->inputs()[0].set_names({"input_ids"}); + for (size_t i = 0; i < unused_names_in_order.size(); i++) { + m_model->inputs()[i + 1].set_names({unused_names_in_order[i].first}); + } + + m_model->outputs()[0].set_names({"logits"}); + + for (auto input : m_model->inputs()) { + m_fake_inputs.emplace_back(input); + } + for (auto output : m_model->outputs()) { + m_fake_outputs.emplace_back(output); + } + } + + + void LlamaCppModel::export_model(std::ostream& output_stream) const { + std::cout << "VSHAMPOR: exporting model" << std::endl; + + // FIXME (vshampor): it's a shame that loading a model from cache does not have an option to + // actually keep the already loaded model from xml and not be forced to deserialize an ov::Model + // representation from cache as well. As it stands, will need to write the whole IR into the cache entry + // along with the GGUF file. + // + std::stringstream xmlFile, binFile; + ov::pass::Serialize serializer(xmlFile, binFile); + serializer.run_on_model(m_model); + + auto m_constants = binFile.str(); + auto m_model = xmlFile.str(); + + auto dataSize = static_cast(m_model.size()); + output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); + output_stream.write(m_model.c_str(), dataSize); + + dataSize = static_cast(m_constants.size()); + output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); + output_stream.write(reinterpret_cast(&m_constants[0]), dataSize); + + + std::ifstream in(m_converted_gguf_file_name, std::ios::binary); + output_stream << in.rdbuf(); + } + + std::shared_ptr LlamaCppModel::get_runtime_model() const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + + void LlamaCppModel::set_property(const ov::AnyMap& properties) { + std::cout << "VSHAMPOR: attempted to set_property (did nothing)"; + } + + ov::Any LlamaCppModel::get_property(const std::string& name) const { + if (ov::supported_properties == name) { + return decltype(ov::supported_properties)::value_type(std::vector()); + } + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + + std::shared_ptr LlamaCppModel::create_sync_infer_request() const { + return std::make_shared(std::static_pointer_cast(shared_from_this())); + } + + const std::vector>& LlamaCppModel::inputs() const { + return m_fake_inputs; + }; + const std::vector>& LlamaCppModel::outputs() const { + return m_fake_outputs; + }; + } +} // namespace ov diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp new file mode 100644 index 000000000..0993422f6 --- /dev/null +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -0,0 +1,111 @@ +#include "infer_request.hpp" +#include "openvino/runtime/make_tensor.hpp" +#include "llama.h" + +namespace ov { + namespace llama_cpp_plugin { + + void 
allocate_tensor_impl(ov::SoPtr& tensor, + const ov::element::Type& element_type, + const ov::Shape& shape) { + if (!tensor || tensor->get_element_type() != element_type) { + tensor = ov::make_tensor(element_type, shape); + } else { + tensor->set_shape(shape); + } +} + + LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model): ov::ISyncInferRequest(compiled_model) { + std::cout << "VSHAMPOR: infer request ctor called\n"; + m_compiled_model_ptr = compiled_model; + // Allocate input/output tensors + for (const auto& input : get_inputs()) { + allocate_tensor(input, [input](ov::SoPtr& tensor) { + // Can add a check to avoid double work in case of shared tensors + allocate_tensor_impl(tensor, + input.get_element_type(), + input.get_partial_shape().is_dynamic() ? ov::Shape{0} : input.get_shape()); + }); + } + for (const auto& output : get_outputs()) { + allocate_tensor(output, [output](ov::SoPtr& tensor) { + // Can add a check to avoid double work in case of shared tensors + allocate_tensor_impl(tensor, + output.get_element_type(), + output.get_partial_shape().is_dynamic() ? ov::Shape{0} : output.get_shape()); + }); + } + } + void LlamaCppSyncInferRequest::set_tensors_impl(const ov::Output port, const std::vector>& tensors) { + std::cout << "VSHAMPOR: set_tensors_impl called\n"; + } + + void llama_batch_add_reimpl( + struct llama_batch & batch, + llama_token id, + llama_pos pos, + const std::vector & seq_ids, + bool logits) { + batch.token [batch.n_tokens] = id; + batch.pos [batch.n_tokens] = pos; + batch.n_seq_id[batch.n_tokens] = seq_ids.size(); + for (size_t i = 0; i < seq_ids.size(); ++i) { + batch.seq_id[batch.n_tokens][i] = seq_ids[i]; + } + batch.logits [batch.n_tokens] = logits; + + batch.n_tokens++; + } + + void LlamaCppSyncInferRequest::infer() { + auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among all inputs without hardcode + OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); + OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); + size_t batch_size = input_ids_tensor_ptr->get_shape()[0]; + size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; + + // llama_batch actually contains one sequence + llama_batch batch = llama_batch_init(sequence_length, /* embd = */ 0, /* n_seq_max = */ 1); + const int64_t* data_ptr = input_ids_tensor_ptr->data(); + + const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */; + + for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) { + const int64_t token_id = sequence_start_ptr[tok_idx]; + llama_batch_add_reimpl(batch, token_id, *(m_compiled_model_ptr->num_tokens_processed_ptr), { 0 }, true); // the last `true` here is a marker that the logits for this token should be computed and returned + size_t* ptr = m_compiled_model_ptr->num_tokens_processed_ptr; + (*ptr)++; + } + + + llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; + int32_t sts = llama_decode(ctx, batch); + + if (sts != 0) { + OPENVINO_THROW("llama_decode failed with code ", sts); + } + + size_t n_vocab = llama_n_vocab(m_compiled_model_ptr->m_llama_model_ptr); + + ov::Tensor output_tensor{ov::element::Type_t::f32, {1, sequence_length, n_vocab}}; + float* output_tensor_data_ptr = output_tensor.data(); + + for (size_t pos = 0; pos < sequence_length; pos++) { + float* logits_from_llama = llama_get_logits_ith(ctx, pos); + std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab); + } + + auto& 
logit_output = get_outputs()[0]; + allocate_tensor(logit_output, [&output_tensor](ov::SoPtr& tensor) { allocate_tensor_impl(tensor, output_tensor.get_element_type(), output_tensor.get_shape()); + output_tensor.copy_to(ov::make_tensor(tensor)); }); + }; + std::vector LlamaCppSyncInferRequest::get_profiling_info() const { + std::cout << "VSHAMPOR: get_profiling_info() called\n"; + return std::vector{}; + }; + std::vector> LlamaCppSyncInferRequest::query_state() const { + std::cout << "VSHAMPOR: get_profiling_info() called\n"; + return std::vector>{}; + } + } +} // namespace ov diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp new file mode 100644 index 000000000..9f633426f --- /dev/null +++ b/modules/llama_cpp_plugin/src/plugin.cpp @@ -0,0 +1,152 @@ +#include "plugin.hpp" +#include "compiled_model.hpp" +#include "openvino/op/constant.hpp" +#include +#include "openvino/runtime/internal_properties.hpp" + + +namespace { +static constexpr const char* wait_executor_name = "LlamaCppWaitExecutor"; +static constexpr const char* stream_executor_name = "LlamaCppStreamsExecutor"; +static constexpr const char* template_exclusive_executor = "LlamaCppExecutor"; +} // namespace + + +namespace ov { + namespace llama_cpp_plugin { + LlamaCppPlugin::LlamaCppPlugin() : IPlugin() { + set_device_name("LLAMA_CPP"); + } + std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const { + std::cout << "VSHAMPOR: LlamaCppPlugin::compile_model" << std::endl; + + //std::string gpt2_node_name = "transformer.h.9.attn.c_proj.weight"; + //std::cout << "VSHAMPOR: sanity check - looking for node containing " << gpt2_node_name << std::endl; + //auto ops = model->get_ops(); + //auto iter = std::find_if(ops.begin(), ops.end(), [gpt2_node_name](const std::shared_ptr& val) { + // return val->get_friendly_name().find(gpt2_node_name) != std::string::npos; }); + //if (iter == ops.end()) { + // std::cout << "VSHAMPOR: did not find the node\n"; + //} else { + // std::shared_ptr node_with_tensor = *iter; + // std::cout << "VSHAMPOR: node type is " << node_with_tensor->get_type_name() << std::endl; + // std::shared_ptr const_node_ptr = ov::as_type_ptr(node_with_tensor); + // const float* data_ptr = const_node_ptr->get_data_ptr(); + // // ov::descriptor::Tensor& tensor_descr = node_with_tensor->get_output_tensor(0); + // // std::cout << "VSHAMPOR: node output tensor shape is " << tensor_descr.get_shape().to_string() << std::endl; + // // ov::TensorVector in, out; + // // node_with_tensor->evaluate(out, in); + // // std::cout << "VSHAMPOR: evaluated " << out.size() << " output tensors\n"; + // // if (!out.empty()) { + // // const ov::Tensor& tensor = out[0]; + // // const float* vals = tensor.data(); + // // std::cout << "VSHAMPOR: first elements of the weight tensor are "; + // // for (size_t i = 0; i < 10; i++) { + // // std::cout << vals[i] << " "; + // // } + // // std::cout << std::endl; + // // } + // std::cout << "VSHAMPOR: first elements of the weight tensor are "; + // for (size_t i = 0; i < 10; i++) { + // std::cout << data_ptr[i] << " "; + // } + // std::cout << std::endl; + //} + return compile_model(model, properties, {}); + } + + std::shared_ptr LlamaCppPlugin::compile_model(const std::string& fname, const ov::AnyMap& properties) const { + return std::make_shared(fname, shared_from_this()); + } + std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties, + const ov::SoPtr& context) 
const { + std::cout << "VSHAMPOR: compile_model called in C++" << std::endl; + return std::make_shared(model->clone(), shared_from_this(), context, get_executor_manager()->get_executor(template_exclusive_executor)); + } + + void LlamaCppPlugin::set_property(const ov::AnyMap& properties) { + for (const auto& map_entry : properties) { + if (map_entry.first == ov::cache_dir.name()) { + m_cache_dir = map_entry.second.as(); + } + else { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: setting property ", map_entry.first, "not implemented"); + } + } + } + + ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap& arguments) const { + if (ov::supported_properties == name) { + return decltype(ov::supported_properties)::value_type(std::vector({ov::cache_dir, ov::device::capabilities, ov::device::full_name})); + } + if (ov::device::capabilities == name) { + return decltype(ov::device::capabilities)::value_type(std::vector({ov::device::capability::EXPORT_IMPORT})); + } + if (ov::internal::supported_properties == name) { + return decltype(ov::internal::supported_properties)::value_type(std::vector({ov::internal::caching_properties})); + } + + if (ov::cache_dir == name) { + return m_cache_dir; + } + if (ov::internal::caching_properties == name) { + return std::vector{ov::device::full_name}; + } + + if (ov::device::full_name == name) { + return std::string("LLAMA_CPP"); + } + + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + + ov::SoPtr LlamaCppPlugin::create_context(const ov::AnyMap& remote_properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + ov::SoPtr LlamaCppPlugin::get_default_context(const ov::AnyMap& remote_properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + std::shared_ptr LlamaCppPlugin::import_model(std::istream& model_file_stream, + const ov::AnyMap& properties) const { + std::cout << "VSHAMPOR: importing model" << '\n'; + std::cout << "VSHAMPOR: deserializing ov::Model first" << '\n'; + // read XML content + std::string xmlString; + std::uint64_t dataSize = 0; + model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); + xmlString.resize(dataSize); + model_file_stream.read(const_cast(xmlString.c_str()), dataSize); + + // read blob content + ov::Tensor weights; + model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); + if (0 != dataSize) { + weights = ov::Tensor(ov::element::from(), ov::Shape{static_cast(dataSize)}); + model_file_stream.read(weights.data(), dataSize); + } + + auto ov_model = get_core()->read_model(xmlString, weights); + std::cout << "VSHAMPOR: ov::Model deserialized, passing the rest of the stream to LlamaCppModel ctor" << '\n'; + return std::make_shared(ov_model, model_file_stream, shared_from_this()); + } + + const std::string CURRENT_GGUF_FILE_NAME = "current.gguf"; + std::string LlamaCppPlugin::get_current_gguf_file_path() const { return m_cache_dir + "/" + CURRENT_GGUF_FILE_NAME; } + + std::shared_ptr LlamaCppPlugin::import_model(std::istream& model, + const ov::SoPtr& context, + const ov::AnyMap& properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + + ov::SupportedOpsMap LlamaCppPlugin::query_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + } +} // namespace ov + +static const ov::Version version = {CI_BUILD_NUMBER, "llama_cpp_plugin"}; 
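+// OV_DEFINE_PLUGIN_CREATE_FUNCTION exports the standard plugin-creation entry point so that ov::Core can
+// load this shared library as the "LLAMA_CPP" device; a client then uses it roughly as in tools/runner.cpp,
+// i.e. core.compile_model(model, "LLAMA_CPP") followed by create_infer_request().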
+OV_DEFINE_PLUGIN_CREATE_FUNCTION(ov::llama_cpp_plugin::LlamaCppPlugin, version) diff --git a/modules/llama_cpp_plugin/tests/CMakeLists.txt b/modules/llama_cpp_plugin/tests/CMakeLists.txt new file mode 100644 index 000000000..11648c2bd --- /dev/null +++ b/modules/llama_cpp_plugin/tests/CMakeLists.txt @@ -0,0 +1,37 @@ +set(TARGET_NAME llama_cpp_plugin_func_tests) + +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + ov_add_compiler_flags(/wd4305) +endif() + +ov_add_test_target( + NAME ${TARGET_NAME} + ROOT ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDENCIES + openvino_template_plugin + LINK_LIBRARIES + openvino::funcSharedTests + openvino::runtime::dev + INCLUDES + "${OpenVINOTemplatePlugin_SOURCE_DIR}/include" + "${CMAKE_CURRENT_SOURCE_DIR}/op_reference" + ADD_CLANG_FORMAT + LABELS + OV UNIT TEMPLATE +) + +find_package(OpenCV QUIET COMPONENTS core imgproc) + +if(OpenCV_FOUND AND OpenCV_VERSION VERSION_GREATER_EQUAL 3.4) + message(STATUS "Reference preprocessing: OpenCV tests are enabled") + target_compile_definitions(${TARGET_NAME} PRIVATE OPENCV_TEMPLATE_TESTS) + target_link_libraries(${TARGET_NAME} PRIVATE opencv_imgproc opencv_core) +else() + message(WARNING "Reference preprocessing: OpenCV tests are disabled, because OpenCV ver. 3.4+ is not found") +endif() + +if (ENABLE_INTEL_CPU) + set_source_files_properties( + "${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/behavior/executable_network/get_metric.cpp" + PROPERTIES COMPILE_DEFINITIONS ENABLE_INTEL_CPU=1) +endif() diff --git a/modules/llama_cpp_plugin/third_party/llama.cpp b/modules/llama_cpp_plugin/third_party/llama.cpp new file mode 160000 index 000000000..c8b02d38d --- /dev/null +++ b/modules/llama_cpp_plugin/third_party/llama.cpp @@ -0,0 +1 @@ +Subproject commit c8b02d38d98db8dab774f6f7655d7e9aede882f5 diff --git a/modules/llama_cpp_plugin/tools/CMakeLists.txt b/modules/llama_cpp_plugin/tools/CMakeLists.txt new file mode 100644 index 000000000..4a37341b8 --- /dev/null +++ b/modules/llama_cpp_plugin/tools/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required(VERSION 3.10) +set(CMAKE_CXX_STANDARD 11) + +find_package(OpenVINO REQUIRED) + + +add_executable(llama_cpp_runner + "${CMAKE_CURRENT_SOURCE_DIR}/runner.cpp" + ) +target_link_libraries(llama_cpp_runner PRIVATE openvino::runtime) + + +add_executable(tensor_comparator + "${CMAKE_CURRENT_SOURCE_DIR}/tensor_comparator.cpp" + ) +target_link_libraries(tensor_comparator PRIVATE ggml) + +add_executable(cache_embedder + "${CMAKE_CURRENT_SOURCE_DIR}/cache_embedder.cpp" + ) + +target_compile_options(cache_embedder PUBLIC "--std=c++17") diff --git a/modules/llama_cpp_plugin/tools/cache_embedder.cpp b/modules/llama_cpp_plugin/tools/cache_embedder.cpp new file mode 100644 index 000000000..bbfbf229c --- /dev/null +++ b/modules/llama_cpp_plugin/tools/cache_embedder.cpp @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) { + assert(argc == 3); + std::string cache_blob_name = argv[1]; + std::string gguf_file_name = argv[2]; + + std::uintmax_t original_file_size = std::filesystem::file_size(cache_blob_name); + std::fstream cache_io_stream(cache_blob_name, std::ios::binary | std::ios::in | std::ios::out); + + { + std::string tmp; + std::getline(cache_io_stream, tmp); // skip the blob header + std::cout << "skipped header line" << std::endl; + } + + std::uint64_t data_size = 0; + cache_io_stream.read(reinterpret_cast(&data_size), sizeof(data_size)); + std::cout << "skipping IR XML content, size " << data_size << std::endl; + 
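    // Editorial note: layout of the cache blob being patched here, as implied by
    // LlamaCppPlugin::import_model() in plugin.cpp:
    //   [cache header line]                        -- skipped via std::getline() above
    //   [uint64_t XML size][IR XML string]         -- skipped here
    //   [uint64_t weights size][IR weights blob]   -- skipped next
    //   [raw GGUF file contents]                   -- overwritten in place with the supplied .gguf file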
cache_io_stream.seekp(data_size, std::ios::cur); // skip IR xml content + + cache_io_stream.read(reinterpret_cast(&data_size), sizeof(data_size)); + std::cout << "skipping IR weight content, size " << data_size << std::endl; + cache_io_stream.seekp(data_size, std::ios::cur); // skip IR weight content + + std::streampos pos = cache_io_stream.tellp(); + char magic[4]; + for (size_t i = 0; i < 4; i++) { + cache_io_stream >> magic[i]; + } + + std::string curr_magic(magic); + std::cout << "magic at current position is " << curr_magic << std::endl; + assert(curr_magic == "GGUF"); + cache_io_stream.seekp(pos); + + std::ifstream gguf_input_stream(gguf_file_name, std::ios::binary); + cache_io_stream << gguf_input_stream.rdbuf(); + std::cout << "gguf content write successful" << std::endl; + std::uintmax_t final_size = cache_io_stream.tellp(); + cache_io_stream.close(); + if (final_size < original_file_size) { + std::cout << "cache entry is now smaller (" << final_size << " vs original " << original_file_size << "), truncating" << std::endl; + std::filesystem::resize_file(cache_blob_name, final_size); + } + + return 0; +} diff --git a/modules/llama_cpp_plugin/tools/runner.cpp b/modules/llama_cpp_plugin/tools/runner.cpp new file mode 100644 index 000000000..390301cdb --- /dev/null +++ b/modules/llama_cpp_plugin/tools/runner.cpp @@ -0,0 +1,73 @@ +#include "openvino/openvino.hpp" +#include + +int main(int argc, char* argv[]) { + ov::Core core; + core.set_property(ov::cache_dir("/tmp/my_cache_dir")); + std::string model_path = "/home/vshampor/work/optimum-intel/ov_model/openvino_model.xml"; + + std::cout << "VSHAMPOR: reading model\n"; + std::shared_ptr model = core.read_model(model_path); + + std::cout << "VSHAMPOR: compiling model\n"; + ov::CompiledModel compiled_model = core.compile_model(model, "LLAMA_CPP"); + + std::cout << "VSHAMPOR: compiled successfully\n"; + + std::cout << "VSHAMPOR: creating infer request\n"; + ov::InferRequest infer_request = compiled_model.create_infer_request(); + std::cout << "VSHAMPOR: infer request created\n"; + + // const ov::Output& input = compiled_model.input(); + // std::cout << "VSHAMPOR: got input\n"; + auto inputs = compiled_model.inputs(); + std::cout << "VSHAMPOR: model has " << inputs.size() << " inputs\n"; + for (const auto& input: inputs) { + std::cout << input.get_node()->get_friendly_name() << std::endl; + } + + for (size_t i = 0; i < inputs.size(); i++) { + const auto& curr_input = inputs[i]; + auto shape = curr_input.get_partial_shape(); + if (shape.is_dynamic()) { + std::cout << "VSHAMPOR: processing input " << i << " with a dynamic shape of " << shape.to_string() << std::endl; + ov::Rank r = shape.rank(); + if (r.get_length() == 2) { + ov::Tensor input_tensor{curr_input.get_element_type(), ov::Shape({1, 128})}; + int64_t* data_ptr = input_tensor.data(); + // fill with something + for (size_t elt_idx = 0; elt_idx < input_tensor.get_size(); elt_idx++) { + data_ptr[elt_idx] = 42; + } + infer_request.set_input_tensor(i, input_tensor); + } + else { // past_key_values + ov::Tensor input_tensor{curr_input.get_element_type(), ov::Shape({1, 12, 128, 64})}; + infer_request.set_input_tensor(i, input_tensor); + } + } + else { + std::cout << "VSHAMPOR: processing input " << i << " with a non-dynamic shape of " << shape.to_string() << std::endl; + ov::Tensor input_tensor{curr_input.get_element_type(), curr_input.get_shape()}; + infer_request.set_input_tensor(i, input_tensor); + } + } + std::cout << "VSHAMPOR: successfully set input tensor\n"; + + 
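    // Editorial note: the dummy shapes above are hard-coded -- {1, 128} for the 2D token inputs and
    // {1, 12, 128, 64} for the 4D past_key_values inputs -- which appears to assume a GPT-2-sized model
    // (12 attention heads, 64-dim heads, 128-token past length); other exported models would need
    // different dimensions here.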
infer_request.infer(); + std::cout << "VSHAMPOR: inferred successfully\n"; + + ov::Tensor output = infer_request.get_tensor("logits"); + std::cout << "VSHAMPOR: got output tensor, shape " << output.get_shape().to_string() << std::endl; + + size_t n_output_elts = 10; + std::cout << "VSHAMPOR: first " << n_output_elts << " elements are:" << std::endl; + + float* output_data_ptr = output.data(); + for (size_t elt_idx = 0; elt_idx < n_output_elts; elt_idx++) { + std::cout << output_data_ptr[elt_idx] << " "; + } + + std::cout << std::endl; + return 0; +} diff --git a/modules/llama_cpp_plugin/tools/tensor_comparator.cpp b/modules/llama_cpp_plugin/tools/tensor_comparator.cpp new file mode 100644 index 000000000..83de96215 --- /dev/null +++ b/modules/llama_cpp_plugin/tools/tensor_comparator.cpp @@ -0,0 +1,95 @@ +#include "ggml.h" +#include +#include +#include +#include +#include +#include + + + +int main(int argc, char* argv[]) { + assert(argc == 3 || argc == 4); + std::string left_name(argv[1]); + std::string right_name(argv[2]); + + gguf_init_params left_params; left_params.no_alloc = false; left_params.ctx = nullptr; + gguf_init_params right_params; left_params.no_alloc = false; right_params.ctx = nullptr; + gguf_context* left_ctx = gguf_init_from_file(left_name.c_str(), left_params); + gguf_context* right_ctx = gguf_init_from_file(right_name.c_str(), right_params); + + std::vector tensor_names; + if (argc == 4) tensor_names.push_back(std::string(argv[3])); + else { + for (size_t idx = 0; idx < left_ctx->header.n_tensors; idx++) { + gguf_tensor_info left_tensor_info = left_ctx->infos[idx]; + tensor_names.push_back(left_tensor_info.name.data); + } + } + + for (const auto& tensor_name : tensor_names) { + + + int left_tensor_idx = gguf_find_tensor(left_ctx, tensor_name.c_str()); + int right_tensor_idx = gguf_find_tensor(right_ctx, tensor_name.c_str()); + + size_t left_tensor_offset = gguf_get_tensor_offset(left_ctx, left_tensor_idx) + left_ctx->offset; + size_t right_tensor_offset = gguf_get_tensor_offset(right_ctx, right_tensor_idx) + right_ctx->offset; + + gguf_tensor_info left_tensor_info = left_ctx->infos[left_tensor_idx]; + gguf_tensor_info right_tensor_info = right_ctx->infos[right_tensor_idx]; + + std::cout << "tensor name " << tensor_name << ", byte offsets: " << left_tensor_offset << " (left), " << right_tensor_offset << " (right)" << std::endl; + std::cout << "tensor name " << tensor_name << ", shape: "; + for (size_t i = 0; i < left_tensor_info.n_dims; i++) { + std::cout << left_tensor_info.ne[i] << ","; + } + std::cout << " (left), "; + + for (size_t i = 0; i < right_tensor_info.n_dims; i++) { + std::cout << right_tensor_info.ne[i] << ","; + } + std::cout << " (right) " << std::endl; + + size_t left_tensor_size = std::accumulate(std::begin(left_tensor_info.ne), std::begin(left_tensor_info.ne) + GGML_MAX_DIMS, (size_t) sizeof(float), std::multiplies()); + size_t right_tensor_size = std::accumulate(std::begin(right_tensor_info.ne), std::begin(right_tensor_info.ne) + GGML_MAX_DIMS, (size_t) sizeof(float), std::multiplies()); + + std::cout << "tensor name " << tensor_name << ", size (calculated): " << left_tensor_size << " (left), " << right_tensor_size << " (right)" << std::endl; + + if (left_tensor_size != right_tensor_size) { + std::cout << "size mismatch (" << left_tensor_size << " left, " << right_tensor_size << "right), exiting" << std::endl; + exit(-1); + } + + size_t bytes_compared = 0; + + std::ifstream left_file(left_name, std::ios::binary); + std::ifstream right_file(right_name, 
std::ios::binary); + + left_file.seekg(left_tensor_offset); + right_file.seekg(right_tensor_offset); + + std::cout << "first 10 float values:" << std::endl; + for (size_t i = 0; i < 10; i++) { + float left_value; left_file.read((char*) &left_value, sizeof(float)); + float right_value; right_file.read((char*) &right_value, sizeof(float)); + + std::cout << left_value << " left, " << right_value << " right" << std::endl; + } + + left_file.seekg(left_tensor_offset); + right_file.seekg(right_tensor_offset); + for (size_t i = 0; i < left_tensor_size; i++) { + char left_byte; left_file.read((char*) &left_byte, sizeof(char)); + char right_byte; right_file.read((char*) &right_byte, sizeof(char)); + + if (left_byte != right_byte) { + std::cout << "byte " << bytes_compared << " mismatch (" << std::hex << +((uint8_t) left_byte) << " left, " << +((uint8_t) right_byte) << " right)" << std::endl; + std::cout << "offset left " << std::hex << left_tensor_offset + bytes_compared << ", right " << right_tensor_offset + bytes_compared << std::endl; + exit(-1); + } + bytes_compared++; + } + std::cout << "tensor contents are identical, bytes compared: " << bytes_compared << std::endl; + } +} From f55badc9d2418e98304960934ffdc72af940a009 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Mon, 11 Mar 2024 18:03:08 +0100 Subject: [PATCH 02/27] Basic test and test build --- modules/llama_cpp_plugin/CMakeLists.txt | 5 +- .../llama_cpp_plugin/src/compiled_model.cpp | 1519 +++++++++-------- .../llama_cpp_plugin/src/infer_request.cpp | 187 +- modules/llama_cpp_plugin/src/plugin.cpp | 278 +-- modules/llama_cpp_plugin/tests/CMakeLists.txt | 37 - .../llama_cpp_plugin/tests/e2e/CMakeLists.txt | 18 + .../tests/e2e/prompt_response.cpp | 63 + .../tests/e2e/set_device_name.cpp | 13 + modules/llama_cpp_plugin/tools/CMakeLists.txt | 3 +- 9 files changed, 1183 insertions(+), 940 deletions(-) delete mode 100644 modules/llama_cpp_plugin/tests/CMakeLists.txt create mode 100644 modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt create mode 100644 modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp create mode 100644 modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt index f5d3284b2..1385eea5d 100644 --- a/modules/llama_cpp_plugin/CMakeLists.txt +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -18,10 +18,7 @@ add_subdirectory(third_party/llama.cpp) if(ENABLE_TESTS) include(CTest) enable_testing() - - if(ENABLE_FUNCTIONAL_TESTS) - add_subdirectory(tests/functional) - endif() + add_subdirectory(tests/e2e) endif() diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 932c0def4..85a65d7e6 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -1,732 +1,895 @@ #include "compiled_model.hpp" -#include "plugin.hpp" -#include "infer_request.hpp" + +#include #include #include #include -#include #include -namespace ov { - namespace llama_cpp_plugin { - class TensorWeightMatcher { - public: - // TODO (vshampor) implement this for faster weight node matching. 
- // Use std::list, two passes - first for full name match, second for prefix-match; remove entries from list on match - using RTInfoTensorName = std::string; - using OvNodeName = std::string; - using LlamaTensorName = std::string; - - TensorWeightMatcher(const std::shared_ptr& model, std::map tensor_names_with_shapes_to_match) { - std::multimap> intermediate_matches_map; - - const auto node_vector = model->get_ops(); - std::list> const_nodes_in_model; - for (const auto& node_ptr : node_vector) { - if (ov::is_type(node_ptr)) const_nodes_in_model.push_back(ov::as_type_ptr(node_ptr)); - } - - // full substring match pass - std::map unmatched_rt_info_names_on_first_pass = extract_matches(intermediate_matches_map, tensor_names_with_shapes_to_match, const_nodes_in_model, - [](const std::string& substring, const std::string& source) { return source.find(substring) != std::string::npos; }); - - // prefix substring match pass - std::map unmatched_rt_info_names_on_second_pass = extract_matches(intermediate_matches_map, unmatched_rt_info_names_on_first_pass, const_nodes_in_model, - [](const std::string& substring, const std::string& source) { - return source.find(get_weight_name_without_torch_postfix(substring)) != std::string::npos; }); - - for (auto it = intermediate_matches_map.begin(); it != intermediate_matches_map.end(); it = intermediate_matches_map.upper_bound(it->first)) { - // TODO: perf improvement by iterating with ++; - RTInfoTensorName rt_info_name = it->first; - if (intermediate_matches_map.count(rt_info_name) != 1) { - std::cout << "VSHAMPOR: multiple matches for weight name " << rt_info_name << " and shape " << it->second->get_shape().to_string() << ", found "; - auto range_it_pair = intermediate_matches_map.equal_range(rt_info_name); - for (auto multimatch_it = range_it_pair.first; multimatch_it != range_it_pair.second; multimatch_it++) { - auto node_ptr = multimatch_it->second; - std::cout << node_ptr->get_friendly_name() << "(shape " << node_ptr->get_shape().to_string() << "),"; - } - std::cout << "will take the first match" << std::endl; - } - const auto& match = intermediate_matches_map.find(rt_info_name)->second; - m_rtinfo_name_to_weight_node_map[rt_info_name] = match; - } - if (!unmatched_rt_info_names_on_second_pass.empty()) { - std::cout << "VSHAMPOR: did not find the weight node for " << unmatched_rt_info_names_on_second_pass.size() << " weights:" << std::endl; - } - for (const auto& unmatched_entry: unmatched_rt_info_names_on_second_pass) { - std::cout << '\t' << unmatched_entry.first << std::endl; - } - } - - std::unordered_map> get_matches() { return m_rtinfo_name_to_weight_node_map; } - - private: - std::map extract_matches(std::multimap>& output_matches_map, - const std::map& names_with_shapes_to_match, - const std::list>& search_list, - std::function name_match_predicate) { - std::map unmatched_rt_info_names; - for (const auto& pair: names_with_shapes_to_match) { - RTInfoTensorName rt_info_name = pair.first; - const ov::Shape& wanted_shape = pair.second; - bool matched = false; - for (auto it = search_list.begin(); it != search_list.end(); it++) { - auto node_ptr = *it; - const std::string& friendly_name = node_ptr->get_friendly_name(); - if (name_match_predicate(rt_info_name, friendly_name) && - node_ptr->get_shape() == wanted_shape) { - output_matches_map.insert(std::make_pair(rt_info_name, node_ptr)); - matched = true; - break; - } - } - if (!matched) unmatched_rt_info_names.insert(pair); - } - return unmatched_rt_info_names; - } - - static std::string 
get_weight_name_without_torch_postfix(std::string torch_weight_name) { - size_t idx = torch_weight_name.rfind("."); - if (idx == std::string::npos) return torch_weight_name; - return std::string(torch_weight_name, 0, idx); - } - - size_t num_exact_matches = 0; - size_t num_partial_matches = 0; - std::unordered_map> m_rtinfo_name_to_weight_node_map; - }; - - - std::vector> get_nodes_containing_name_with_shape(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { - auto ops = model->get_ops(); - std::vector> found_weight_nodes; - std::copy_if(ops.begin(), ops.end(), std::back_inserter(found_weight_nodes), - [&weight_name, &shape](const std::shared_ptr& val) { - if (!ov::is_type(val)) return false; - std::shared_ptr node_ptr = ov::as_type_ptr(val); - return val->get_friendly_name().find(weight_name) != std::string::npos && - val->get_shape() == shape; - }); - return found_weight_nodes; - } - - bool has_weight_matches(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { - std::vector> found_weight_nodes; - found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); - return !found_weight_nodes.empty(); - } - - std::string get_weight_name_without_torch_postfix(std::string torch_weight_name) { - size_t idx = torch_weight_name.rfind("."); - if (idx == std::string::npos) return torch_weight_name; - return std::string(torch_weight_name, 0, idx); - } +#include "infer_request.hpp" +#include "plugin.hpp" - bool has_partial_weight_matches(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { - std::vector> found_weight_nodes; - found_weight_nodes = get_nodes_containing_name_with_shape(model, get_weight_name_without_torch_postfix(weight_name), shape); - return !found_weight_nodes.empty(); +namespace ov { +namespace llama_cpp_plugin { +class TensorWeightMatcher { +public: + // TODO (vshampor) implement this for faster weight node matching. 
+ // Use std::list, two passes - first for full name match, second for + // prefix-match; remove entries from list on match + using RTInfoTensorName = std::string; + using OvNodeName = std::string; + using LlamaTensorName = std::string; + + TensorWeightMatcher(const std::shared_ptr& model, + std::map tensor_names_with_shapes_to_match) { + std::multimap> intermediate_matches_map; + + const auto node_vector = model->get_ops(); + std::list> const_nodes_in_model; + for (const auto& node_ptr : node_vector) { + if (ov::is_type(node_ptr)) + const_nodes_in_model.push_back(ov::as_type_ptr(node_ptr)); } - std::shared_ptr get_weight_by_name_and_shape(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { - OPENVINO_ASSERT(has_weight_matches(model, weight_name, shape)); - std::vector> found_weight_nodes; - found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); - - if (found_weight_nodes.size() > 1) { - std::cout << "VSHAMPOR: multiple matches for weight name " << weight_name << " and shape " << shape.to_string() << ", found "; - for (const auto& node_ptr : found_weight_nodes) { - std::cout << node_ptr->get_friendly_name() << "(shape " << shape.to_string() << "),"; + // full substring match pass + std::map unmatched_rt_info_names_on_first_pass = + extract_matches(intermediate_matches_map, + tensor_names_with_shapes_to_match, + const_nodes_in_model, + [](const std::string& substring, const std::string& source) { + return source.find(substring) != std::string::npos; + }); + + // prefix substring match pass + std::map unmatched_rt_info_names_on_second_pass = extract_matches( + intermediate_matches_map, + unmatched_rt_info_names_on_first_pass, + const_nodes_in_model, + [](const std::string& substring, const std::string& source) { + return source.find(get_weight_name_without_torch_postfix(substring)) != std::string::npos; + }); + + for (auto it = intermediate_matches_map.begin(); it != intermediate_matches_map.end(); + it = intermediate_matches_map.upper_bound(it->first)) { + // TODO: perf improvement by iterating with ++; + RTInfoTensorName rt_info_name = it->first; + if (intermediate_matches_map.count(rt_info_name) != 1) { + std::cout << "VSHAMPOR: multiple matches for weight name " << rt_info_name << " and shape " + << it->second->get_shape().to_string() << ", found "; + auto range_it_pair = intermediate_matches_map.equal_range(rt_info_name); + for (auto multimatch_it = range_it_pair.first; multimatch_it != range_it_pair.second; multimatch_it++) { + auto node_ptr = multimatch_it->second; + std::cout << node_ptr->get_friendly_name() << "(shape " << node_ptr->get_shape().to_string() + << "),"; } std::cout << "will take the first match" << std::endl; } - std::shared_ptr node_with_tensor = found_weight_nodes.front(); - OPENVINO_ASSERT(ov::is_type(node_with_tensor)); - std::shared_ptr const_node_ptr = ov::as_type_ptr(node_with_tensor); - return const_node_ptr; - } - - using TransposePermutation = std::pair; - - std::vector expand_front(const std::vector& vec, size_t val) { - OPENVINO_ASSERT(vec.size() < GGML_MAX_DIMS); - std::vector retval(GGML_MAX_DIMS, val); - std::copy(vec.rbegin(), vec.rend(), retval.rbegin()); - return retval; + const auto& match = intermediate_matches_map.find(rt_info_name)->second; + m_rtinfo_name_to_weight_node_map[rt_info_name] = match; } - - void write_float_plus_one(std::ofstream& out, const float* src) { - float elt = *src; - elt += 1; - out.write((const char*) &elt, sizeof(float)); + if 
(!unmatched_rt_info_names_on_second_pass.empty()) { + std::cout << "VSHAMPOR: did not find the weight node for " << unmatched_rt_info_names_on_second_pass.size() + << " weights:" << std::endl; } - - void append_tensor_data_with_transpositions(const std::string& fname, const std::vector& tensor_infos, const std::vector& tensor_data_ptrs, - const std::map& transpositions, const std::set increment_by_one_tensor_names) { - // assuming contiguous data underneath each pointer from tensor_data_ptrs - OPENVINO_ASSERT(tensor_infos.size() == tensor_data_ptrs.size()); - std::ofstream out(fname, std::ios::app | std::ios::out); - for (size_t i = 0; i < tensor_infos.size(); i++) { - const auto& tensor_info = tensor_infos[i]; - OPENVINO_ASSERT(tensor_info.type == GGML_TYPE_F32); // TODO (vshampor): writing transposed tensor data for other data types, especially lower-bitwidth; maybe use OV inference for that - - const char* ir_tensor_data = reinterpret_cast(tensor_data_ptrs[i]); - - std::string tensor_llama_name = std::string(tensor_info.name.data); - auto it = transpositions.find(tensor_llama_name); - if (it == transpositions.end()) { - // original IR tensor should not be transposed to conform to GGUF expectations, can write as-is - if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case - size_t elt_size = sizeof(float); // FP32 only for now - OPENVINO_ASSERT(!(tensor_info.size % elt_size)); - size_t num_elts = tensor_info.size / elt_size; - for (size_t elt_idx = 0; elt_idx < num_elts; elt_idx++) { - write_float_plus_one(out, ((float*) ir_tensor_data) + elt_idx); - } - } - else { - out.write(ir_tensor_data, tensor_info.size); - } - continue; - } - - if (it != transpositions.end()) { - std::vector gguf_layout_shape; - - // the shape in .ne is inverted w.r.t original export (~= IR) weight layout - for (size_t dim_idx = 0; dim_idx < tensor_info.n_dims; dim_idx++) { - gguf_layout_shape.push_back(tensor_info.ne[GGML_MAX_DIMS - 1 - tensor_info.n_dims - dim_idx]); - } - - TransposePermutation permutation = it->second; - std::vector ir_layout_shape(gguf_layout_shape); - std::swap(ir_layout_shape[permutation.first], ir_layout_shape[permutation.second]); - - std::vector ir_layout_strides(tensor_info.n_dims, 1); - - for (size_t idx = 0; idx < tensor_info.n_dims - 1 ; idx++) { - auto previous_stride_it = ir_layout_strides.rbegin() + idx; - auto stride_it = ir_layout_strides.rbegin() + idx + 1; - auto shape_it = ir_layout_shape.rbegin() + idx; - *stride_it = *shape_it * *previous_stride_it; - } - - - std::vector permuted_strides(ir_layout_strides); - std::swap(permuted_strides[permutation.first], permuted_strides[permutation.second]); - - // expand up to GGML_MAX_DIMS - std::vector gguf_layout_shape_ex = expand_front(gguf_layout_shape, 1); - // stride for unused dims will be 0, has no effect on loop because dimension idx for that dim is always 0 - permuted_strides = expand_front(permuted_strides, 0); - - - - std::cout << "VSHAMPOR: writing tensor " << tensor_info.name.data << " with size " << tensor_info.size; - std::cout << " shape (GGUF layout) "; - for (auto dim: gguf_layout_shape) std::cout << dim << ","; - std::cout << " shape (IR layout) "; - for (auto dim : ir_layout_shape) std::cout << dim << ","; - std::cout << " stride (IR layout) "; - for (auto stride : ir_layout_strides) std::cout << stride << ","; - std::cout << " stride (IR layout, transposing) "; - for (auto stride : permuted_strides) std::cout << stride << ","; - std::cout << std::endl; - - // TODO (vshampor): rewrite the 
loop below using recurrent templates? - // This relies on GGUF_MAX_DIMS == 4 and unused dims being equal to 1 - size_t current_offset = 0; - size_t element_size = sizeof(float); - size_t num_bytes_written = 0; - for (size_t dim_0 = 0; dim_0 < gguf_layout_shape_ex[0]; dim_0++) - for (size_t dim_1 = 0; dim_1 < gguf_layout_shape_ex[1]; dim_1++) - for (size_t dim_2 = 0; dim_2 < gguf_layout_shape_ex[2]; dim_2++) - for (size_t dim_3 = 0; dim_3 < gguf_layout_shape_ex[3]; dim_3++) { - current_offset = element_size * (dim_0 * permuted_strides[0] + dim_1 * permuted_strides[1] + dim_2 * permuted_strides[2] + dim_3 * permuted_strides[3]); - if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case - write_float_plus_one(out, (float*) ir_tensor_data + current_offset); - } - else { - out.write(ir_tensor_data + current_offset, element_size); - } - num_bytes_written += element_size; - } - std::cout << "VSHAMPOR: wrote " << num_bytes_written << std::endl; - OPENVINO_ASSERT(num_bytes_written == tensor_info.size); - } - } + for (const auto& unmatched_entry : unmatched_rt_info_names_on_second_pass) { + std::cout << '\t' << unmatched_entry.first << std::endl; } + } - struct ValueStorageForLifetimeExtension { - std::list kv_key_string_storage; - std::list kv_value_string_storage; - std::list> str_arr_storage; - void* store_gguf_value_vector(const std::vector& vec, gguf_type g_type) { - size_t elt_size; - switch (g_type) { - case GGUF_TYPE_UINT8: elt_size = sizeof(uint8_t); break; - case GGUF_TYPE_INT8: elt_size = sizeof(int8_t); break; - case GGUF_TYPE_UINT16: elt_size = sizeof(uint16_t); break; - case GGUF_TYPE_INT16: elt_size = sizeof(int16_t); break; - case GGUF_TYPE_UINT32: elt_size = sizeof(uint32_t); break; - case GGUF_TYPE_INT32: elt_size = sizeof(int32_t); break; - case GGUF_TYPE_FLOAT32: elt_size = sizeof(float); break; - case GGUF_TYPE_UINT64: elt_size = sizeof(uint64_t); break; - case GGUF_TYPE_INT64: elt_size = sizeof(int64_t); break; - case GGUF_TYPE_FLOAT64: elt_size = sizeof(double); break; - case GGUF_TYPE_BOOL: elt_size = sizeof(bool); break; - default: - OPENVINO_THROW("Unknown array type"); - } - size_t size_in_bytes = vec.size() * elt_size; - void* mem_ptr = new char[size_in_bytes]; - for (size_t i = 0; i < vec.size(); i++) { - switch (g_type) { - case GGUF_TYPE_UINT8: ((uint8_t*) mem_ptr)[i] = vec[i].uint8; break; - case GGUF_TYPE_INT8: ((int8_t*) mem_ptr)[i] = vec[i].int8; break; - case GGUF_TYPE_UINT16: ((uint16_t*) mem_ptr)[i] = vec[i].uint16; break; - case GGUF_TYPE_INT16: ((int16_t*) mem_ptr)[i] = vec[i].int16; break; - case GGUF_TYPE_UINT32: ((uint32_t*) mem_ptr)[i] = vec[i].uint32; break; - case GGUF_TYPE_INT32: ((int32_t*) mem_ptr)[i] = vec[i].int32; break; - case GGUF_TYPE_FLOAT32: ((float*) mem_ptr)[i] = vec[i].float32; break; - case GGUF_TYPE_UINT64: ((uint64_t*) mem_ptr)[i] = vec[i].uint64; break; - case GGUF_TYPE_INT64: ((int64_t*) mem_ptr)[i] = vec[i].int64; break; - case GGUF_TYPE_FLOAT64: ((double*) mem_ptr)[i] = vec[i].float64; break; - case GGUF_TYPE_BOOL: ((bool*) mem_ptr)[i] = vec[i].bool_; break; - default: - OPENVINO_THROW("Unknown array type"); - } - } - return mem_ptr; - } + std::unordered_map> get_matches() { + return m_rtinfo_name_to_weight_node_map; + } - ValueStorageForLifetimeExtension() = default; - ~ValueStorageForLifetimeExtension() { - for (void* ptr: non_str_raw_storage) { - delete[] (char*) ptr; +private: + std::map extract_matches( + std::multimap>& output_matches_map, + const std::map& names_with_shapes_to_match, + const 
std::list>& search_list, + std::function name_match_predicate) { + std::map unmatched_rt_info_names; + for (const auto& pair : names_with_shapes_to_match) { + RTInfoTensorName rt_info_name = pair.first; + const ov::Shape& wanted_shape = pair.second; + bool matched = false; + for (auto it = search_list.begin(); it != search_list.end(); it++) { + auto node_ptr = *it; + const std::string& friendly_name = node_ptr->get_friendly_name(); + if (name_match_predicate(rt_info_name, friendly_name) && node_ptr->get_shape() == wanted_shape) { + output_matches_map.insert(std::make_pair(rt_info_name, node_ptr)); + matched = true; + break; } } - private: - std::list non_str_raw_storage; - }; - - bool maybe_parse_single_element(gguf_type g_type, ov::Any rtmap_value, gguf_value& dst, ValueStorageForLifetimeExtension& store) { - switch (g_type) { - case GGUF_TYPE_UINT8: dst.uint8 = rtmap_value.as(); break; - case GGUF_TYPE_INT8: dst.int8 = rtmap_value.as(); ; break; - case GGUF_TYPE_UINT16: dst.uint16 = rtmap_value.as(); break; - case GGUF_TYPE_INT16: dst.int16 = rtmap_value.as(); break; - case GGUF_TYPE_UINT32: dst.uint32 = rtmap_value.as(); break; - case GGUF_TYPE_INT32: dst.int32 = rtmap_value.as(); break; - case GGUF_TYPE_FLOAT32: dst.float32 = rtmap_value.as(); break; - case GGUF_TYPE_UINT64: dst.uint64 = rtmap_value.as(); break; - case GGUF_TYPE_INT64: dst.int64 = rtmap_value.as(); break; - case GGUF_TYPE_FLOAT64: dst.float64 = rtmap_value.as(); break; - case GGUF_TYPE_BOOL: dst.bool_ = rtmap_value.as(); break; - case GGUF_TYPE_STRING: { - std::string string_value = rtmap_value.as(); - store.kv_value_string_storage.push_back(string_value); - dst.str.n = string_value.length(); - dst.str.data = (char*) store.kv_value_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below - break; - } - default: - return false; // did not parse - } - return true; // parsed successfully + if (!matched) + unmatched_rt_info_names.insert(pair); } + return unmatched_rt_info_names; + } - ov::Any get_any_associated_with_gguf_type(gguf_type g_type) { - switch (g_type) { - case GGUF_TYPE_UINT8: return ov::Any(uint8_t()); - case GGUF_TYPE_INT8: return ov::Any(int8_t()); - case GGUF_TYPE_UINT16: return ov::Any(uint16_t()); - case GGUF_TYPE_INT16: return ov::Any(int16_t()); - case GGUF_TYPE_UINT32: return ov::Any(uint32_t()); - case GGUF_TYPE_INT32: return ov::Any(int32_t()); - case GGUF_TYPE_FLOAT32: return ov::Any(float()); - case GGUF_TYPE_UINT64: return ov::Any(uint64_t()); - case GGUF_TYPE_INT64: return ov::Any(int64_t()); - case GGUF_TYPE_FLOAT64: return ov::Any(double()); - case GGUF_TYPE_BOOL: return ov::Any(bool()); - case GGUF_TYPE_STRING: return ov::Any(std::string()); - default: - OPENVINO_THROW("Unknown gguf_type to turn into ov::Any"); - } - } - - - LlamaCppModel::LlamaCppModel(const std::shared_ptr& model, - const std::shared_ptr& plugin, - const ov::SoPtr& context, - const std::shared_ptr& task_executor - ) : ICompiledModel(model, plugin, context, task_executor) { - m_model = model; - num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove - *num_tokens_processed_ptr = 0; - auto rt_info = model->get_rt_info(); - OPENVINO_ASSERT(rt_info.count("lcp_kv_params") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_kv_types") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_kv_array_types") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_tensor_name_map") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_tensor_shape_map") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_expected_tensor_shapes") != 0); - 
OPENVINO_ASSERT(rt_info.count("lcp_transpose_permutations") != 0); - - RTMap& kv_params = model->get_rt_info("lcp_kv_params"); - RTMap& kv_types = model->get_rt_info("lcp_kv_types"); - RTMap& kv_array_types = model->get_rt_info("lcp_kv_array_types"); - RTMap& tensor_name_map = model->get_rt_info("lcp_tensor_name_map"); - RTMap& tensor_shape_map = model->get_rt_info("lcp_tensor_shape_map"); - RTMap& expected_tensor_shapes_map = model->get_rt_info("lcp_expected_tensor_shapes"); - RTMap& transpose_permutations_rtmap = model->get_rt_info("lcp_transpose_permutations"); - - size_t gguf_version = model->get_rt_info("lcp_gguf_version"); - std::cout << "VSHAMPOR: parsed gguf_version " << gguf_version << std::endl; - - // kv params - OPENVINO_ASSERT(kv_params.size() == kv_types.size()); - size_t n_kv = kv_params.size(); - std::vector kv_vector; - ValueStorageForLifetimeExtension store; - - for (const auto& kv_pair: kv_params) { - gguf_kv kv; - - const auto& key = kv_pair.first; - kv.key.n = key.length(); - store.kv_key_string_storage.push_back(key); - kv.key.data = (char*) store.kv_key_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below - - uint32_t value_type = kv_types[key].as(); - gguf_type gguf_value_type = (gguf_type) value_type; - kv.type = gguf_value_type; - if (gguf_value_type != GGUF_TYPE_ARRAY) { - bool is_parsed = maybe_parse_single_element(kv.type, kv_pair.second, kv.value, store); - OPENVINO_ASSERT(is_parsed, "Invalid type of a GGUF kv-value"); - } - else { // array case - gguf_type element_type = (gguf_type) kv_array_types[key].as(); - kv.value.arr.type = element_type; - std::string serialized_array = kv_pair.second.as(); - std::stringstream ss{serialized_array}; - std::vector parsed_array; - while (!ss.eof()) { - gguf_value array_elt; - ov::Any ov_any = get_any_associated_with_gguf_type(element_type); - std::string token; ss >> token; - if (std::string(kv.key.data) == "tokenizer.ggml.merges") { - // tokenizer merges are pairs of tokens separated by whitespace, so need to read another to get a proper merge - // TODO (vshampor): think of another delimiting strategy in the rt_info and use that strategy here for more robust code - std::string another_token; ss >> another_token; - token += std::string(" ") + another_token; - ov_any = ov::Any::make(token); - } - else { - std::stringstream tok_ss{token}; - ov_any.read(tok_ss); - } - bool is_parsed = maybe_parse_single_element(element_type, ov_any, array_elt, store); - OPENVINO_ASSERT(is_parsed); - parsed_array.push_back(array_elt); - } - kv.value.arr.n = parsed_array.size(); - if (element_type == GGUF_TYPE_STRING) { - // string element has already been lifetime-extended during parsing - std::vector cstr_vector(parsed_array.size()); - for (size_t cstr_idx = 0; cstr_idx < parsed_array.size(); cstr_idx++) { - cstr_vector[cstr_idx] = parsed_array[cstr_idx].str.data; - } - store.str_arr_storage.push_back(cstr_vector); - kv.value.arr.data = store.str_arr_storage.back().data(); - } - else { - void* data_ptr = store.store_gguf_value_vector(parsed_array, element_type); - kv.value.arr.data = data_ptr; - } - } - kv_vector.push_back(kv); - } + static std::string get_weight_name_without_torch_postfix(std::string torch_weight_name) { + size_t idx = torch_weight_name.rfind("."); + if (idx == std::string::npos) + return torch_weight_name; + return std::string(torch_weight_name, 0, idx); + } - auto token_types_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { return std::string(val.key.data) == 
"tokenizer.ggml.token_type"; }); - if (token_types_kv_it != kv_vector.end()) { - auto tokens_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { return std::string(val.key.data) == "tokenizer.ggml.tokens"; }); - if (tokens_kv_it != kv_vector.end()) { - size_t expected_num_tokens = token_types_kv_it->value.arr.n; - size_t actual_num_tokens = tokens_kv_it->value.arr.n; - if (actual_num_tokens < expected_num_tokens) { - std::cout << "VSHAMPOR: detected wrong vocab serialization/deserialization (expected " << expected_num_tokens << " tokens, parsed " << actual_num_tokens << " from vocab), filling tokens with bogus values" << std::endl; - std::vector new_vocab; - // char** old_vocab_data_ptr = (char**) tokens_kv_it->value.arr.data; - // std::copy(old_vocab_data_ptr, old_vocab_data_ptr + actual_num_tokens, new_vocab.begin()); - // size_t extra_tokens_needed = expected_num_tokens - actual_num_tokens; - size_t extra_tokens_needed = expected_num_tokens; - for (size_t tok_idx = 0; tok_idx < extra_tokens_needed; tok_idx++) { - std::stringstream ss; - ss << "invalid_token_" << tok_idx; - std::string new_token = ss.str(); - store.kv_value_string_storage.push_back(new_token); - char* str_data_ptr = (char*) store.kv_value_string_storage.back().c_str(); - new_vocab.push_back(str_data_ptr); - } - OPENVINO_ASSERT(new_vocab.size() == expected_num_tokens); - store.str_arr_storage.push_back(new_vocab); - tokens_kv_it->value.arr.data = (void*) store.str_arr_storage.back().data(); - tokens_kv_it->value.arr.n = expected_num_tokens; - } + size_t num_exact_matches = 0; + size_t num_partial_matches = 0; + std::unordered_map> m_rtinfo_name_to_weight_node_map; +}; + +std::vector> get_nodes_containing_name_with_shape(const std::shared_ptr& model, + const std::string& weight_name, + const ov::Shape& shape) { + auto ops = model->get_ops(); + std::vector> found_weight_nodes; + std::copy_if(ops.begin(), + ops.end(), + std::back_inserter(found_weight_nodes), + [&weight_name, &shape](const std::shared_ptr& val) { + if (!ov::is_type(val)) + return false; + std::shared_ptr node_ptr = ov::as_type_ptr(val); + return val->get_friendly_name().find(weight_name) != std::string::npos && + val->get_shape() == shape; + }); + return found_weight_nodes; +} + +bool has_weight_matches(const std::shared_ptr& model, + const std::string& weight_name, + const ov::Shape& shape) { + std::vector> found_weight_nodes; + found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); + return !found_weight_nodes.empty(); +} + +std::string get_weight_name_without_torch_postfix(std::string torch_weight_name) { + size_t idx = torch_weight_name.rfind("."); + if (idx == std::string::npos) + return torch_weight_name; + return std::string(torch_weight_name, 0, idx); +} + +bool has_partial_weight_matches(const std::shared_ptr& model, + const std::string& weight_name, + const ov::Shape& shape) { + std::vector> found_weight_nodes; + found_weight_nodes = + get_nodes_containing_name_with_shape(model, get_weight_name_without_torch_postfix(weight_name), shape); + return !found_weight_nodes.empty(); +} + +std::shared_ptr get_weight_by_name_and_shape(const std::shared_ptr& model, + const std::string& weight_name, + const ov::Shape& shape) { + OPENVINO_ASSERT(has_weight_matches(model, weight_name, shape)); + std::vector> found_weight_nodes; + found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); + + if (found_weight_nodes.size() > 1) { + std::cout << "VSHAMPOR: multiple matches for 
weight name " << weight_name << " and shape " << shape.to_string() + << ", found "; + for (const auto& node_ptr : found_weight_nodes) { + std::cout << node_ptr->get_friendly_name() << "(shape " << shape.to_string() << "),"; + } + std::cout << "will take the first match" << std::endl; + } + std::shared_ptr node_with_tensor = found_weight_nodes.front(); + OPENVINO_ASSERT(ov::is_type(node_with_tensor)); + std::shared_ptr const_node_ptr = ov::as_type_ptr(node_with_tensor); + return const_node_ptr; +} + +using TransposePermutation = std::pair; + +std::vector expand_front(const std::vector& vec, size_t val) { + OPENVINO_ASSERT(vec.size() < GGML_MAX_DIMS); + std::vector retval(GGML_MAX_DIMS, val); + std::copy(vec.rbegin(), vec.rend(), retval.rbegin()); + return retval; +} + +void write_float_plus_one(std::ofstream& out, const float* src) { + float elt = *src; + elt += 1; + out.write((const char*)&elt, sizeof(float)); +} + +void append_tensor_data_with_transpositions(const std::string& fname, + const std::vector& tensor_infos, + const std::vector& tensor_data_ptrs, + const std::map& transpositions, + const std::set increment_by_one_tensor_names) { + // assuming contiguous data underneath each pointer from tensor_data_ptrs + OPENVINO_ASSERT(tensor_infos.size() == tensor_data_ptrs.size()); + std::ofstream out(fname, std::ios::app | std::ios::out); + for (size_t i = 0; i < tensor_infos.size(); i++) { + const auto& tensor_info = tensor_infos[i]; + OPENVINO_ASSERT(tensor_info.type == GGML_TYPE_F32); // TODO (vshampor): writing transposed tensor data for + // other data types, especially lower-bitwidth; maybe + // use OV inference for that + + const char* ir_tensor_data = reinterpret_cast(tensor_data_ptrs[i]); + + std::string tensor_llama_name = std::string(tensor_info.name.data); + auto it = transpositions.find(tensor_llama_name); + if (it == transpositions.end()) { + // original IR tensor should not be transposed to conform to GGUF + // expectations, can write as-is + if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case + size_t elt_size = sizeof(float); // FP32 only for now + OPENVINO_ASSERT(!(tensor_info.size % elt_size)); + size_t num_elts = tensor_info.size / elt_size; + for (size_t elt_idx = 0; elt_idx < num_elts; elt_idx++) { + write_float_plus_one(out, ((float*)ir_tensor_data) + elt_idx); } + } else { + out.write(ir_tensor_data, tensor_info.size); } + continue; + } - // tensors - OPENVINO_ASSERT(tensor_name_map.size() == tensor_shape_map.size()); - size_t n_tensors_in_rtinfo = tensor_name_map.size(); - std::cout << "VSHAMPOR: got request for " << n_tensors_in_rtinfo << " tensors from rt_info\n"; - - std::vector tensor_infos; - std::vector tensor_data_ptrs; - - std::map parsed_weights_to_search_for; - for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { - const std::string& llama_name = llama_name_and_rtinfo_name.first; - const std::string& rtinfo_name = llama_name_and_rtinfo_name.second.as(); - ov::Shape expected_shape = tensor_shape_map[llama_name].as(); - parsed_weights_to_search_for[rtinfo_name] = expected_shape; - } + if (it != transpositions.end()) { + std::vector gguf_layout_shape; - TensorWeightMatcher matcher{model, parsed_weights_to_search_for}; - std::unordered_map> matches = matcher.get_matches(); - std::unordered_map> llama_name_to_constant_node_map; - for (const auto& entry : tensor_name_map) { - const auto& llama_name = entry.first; - const auto& rtinfo_name = entry.second.as(); - llama_name_to_constant_node_map[llama_name] = 
matches[rtinfo_name]; + // the shape in .ne is inverted w.r.t original export (~= IR) weight + // layout + for (size_t dim_idx = 0; dim_idx < tensor_info.n_dims; dim_idx++) { + gguf_layout_shape.push_back(tensor_info.ne[GGML_MAX_DIMS - 1 - tensor_info.n_dims - dim_idx]); } - std::cout << "VSHAMPOR: requested tensors map to " << llama_name_to_constant_node_map.size() << " tensors to search in model (shared tensors considered)\n"; - - - std::list llama_name_storage; - - size_t n_tensors = 0; - - size_t offset = 0; // each tensor_info has to have a correct offset including padding, checked for in gguf_write_to_buf - for (const auto& matched_weight_pair : llama_name_to_constant_node_map) { - // Need to store the names in the list so that the passed c_str() pointers in tensor_infos to the llama names stay valid - // until they get deepcopied in gguf/llama functions - llama_name_storage.push_back(matched_weight_pair.first); - const std::string& llama_name = llama_name_storage.back(); - - auto weight_const_node_ptr = matched_weight_pair.second; - auto weight_shape = weight_const_node_ptr->get_shape(); - - // does hf-to-gguf invert all tensor dimensions with shapes > 1? - auto expected_weight_shape = ov::Shape(expected_tensor_shapes_map[llama_name].as()); - OPENVINO_ASSERT(expected_weight_shape.size() < GGML_MAX_DIMS); - gguf_tensor_info info; + TransposePermutation permutation = it->second; + std::vector ir_layout_shape(gguf_layout_shape); + std::swap(ir_layout_shape[permutation.first], ir_layout_shape[permutation.second]); - info.type = GGML_TYPE_F32; // TODO (vshampor): better type assignment based on actual element type of the Constant node + std::vector ir_layout_strides(tensor_info.n_dims, 1); - info.name.n = llama_name.length(); - info.name.data = (char*) llama_name.c_str(); // TODO (vshampor): either do this via const_cast, or will have to implement own structures for - // read-only data passing to llama_load_model_from_data - info.n_dims = weight_shape.size(); - std::fill(std::begin(info.ne), std::begin(info.ne) + GGML_MAX_DIMS, (uint64_t) 1); - - // looks like GGUF expects inverse order of dimensions when compared to e.g. 
torch and actual row-major layout, see gguf.gguf_writer.GGUFWriter.add_tensor_info - // in gguf python package - std::copy(expected_weight_shape.rbegin(), expected_weight_shape.rend(), info.ne); - - void* data_ptr = (void*)(weight_const_node_ptr->get_data_ptr()); // TODO (vshampor): danger - casts `const` away - // also - the expected_weight_shape is in general different from actual ov::Tensor shape, - // in particular it may be transposed, so we actually need to set the pointers to shape-corrected - // tensor storage, which we don't do here - we are only preparing this data to get a convenient - // gguf_context object to reuse metadata (header) writing code, tensor data transpositions will be done during - // actual file write - - info.size = weight_const_node_ptr->get_byte_size(); - info.offset = offset; - - const size_t size_pad = GGML_PAD(info.size, GGUF_DEFAULT_ALIGNMENT); - offset += size_pad; - - info.data = data_ptr; - - tensor_infos.push_back(info); - tensor_data_ptrs.push_back(data_ptr); - n_tensors++; + for (size_t idx = 0; idx < tensor_info.n_dims - 1; idx++) { + auto previous_stride_it = ir_layout_strides.rbegin() + idx; + auto stride_it = ir_layout_strides.rbegin() + idx + 1; + auto shape_it = ir_layout_shape.rbegin() + idx; + *stride_it = *shape_it * *previous_stride_it; } - std::cout << "VSHAMPOR: found " << matches.size() << "/" << parsed_weights_to_search_for.size() << " tensors" << std::endl; - - gguf_init_params gguf_params; - gguf_params.no_alloc = false; - gguf_params.ctx = nullptr; - - m_gguf_ctx = gguf_init_from_data(n_tensors, tensor_infos.data(), n_kv, kv_vector.data(), tensor_data_ptrs.data(), gguf_params); - - std::shared_ptr llama_plugin_ptr = std::dynamic_pointer_cast(plugin); - m_converted_gguf_file_name = llama_plugin_ptr->get_current_gguf_file_path(); - - std::cout << "VSHAMPOR: output filename is " << m_converted_gguf_file_name << std::endl; - std::cout << "VSHAMPOR: writing metadata (GGUF header) " << std::endl; - gguf_write_to_file(m_gguf_ctx, m_converted_gguf_file_name.c_str(), /* only_meta = */ true); - - std::map transpose_permutations; - - for (const auto& llama_name_and_permutation : transpose_permutations_rtmap) { - std::string permutation_str = llama_name_and_permutation.second.as(); - std::stringstream ss(permutation_str); - TransposePermutation permutation; - bool is_ok = true; - is_ok &= static_cast(ss >> permutation.first); - is_ok &= static_cast(ss >> permutation.second); - OPENVINO_ASSERT(is_ok, "failed to read permutation"); - transpose_permutations[llama_name_and_permutation.first] = permutation; + std::vector permuted_strides(ir_layout_strides); + std::swap(permuted_strides[permutation.first], permuted_strides[permutation.second]); + + // expand up to GGML_MAX_DIMS + std::vector gguf_layout_shape_ex = expand_front(gguf_layout_shape, 1); + // stride for unused dims will be 0, has no effect on loop because + // dimension idx for that dim is always 0 + permuted_strides = expand_front(permuted_strides, 0); + + std::cout << "VSHAMPOR: writing tensor " << tensor_info.name.data << " with size " << tensor_info.size; + std::cout << " shape (GGUF layout) "; + for (auto dim : gguf_layout_shape) + std::cout << dim << ","; + std::cout << " shape (IR layout) "; + for (auto dim : ir_layout_shape) + std::cout << dim << ","; + std::cout << " stride (IR layout) "; + for (auto stride : ir_layout_strides) + std::cout << stride << ","; + std::cout << " stride (IR layout, transposing) "; + for (auto stride : permuted_strides) + std::cout << stride << ","; + 
std::cout << std::endl; + + // TODO (vshampor): rewrite the loop below using recurrent templates? + // This relies on GGUF_MAX_DIMS == 4 and unused dims being equal to 1 + size_t current_offset = 0; + size_t element_size = sizeof(float); + size_t num_bytes_written = 0; + for (size_t dim_0 = 0; dim_0 < gguf_layout_shape_ex[0]; dim_0++) + for (size_t dim_1 = 0; dim_1 < gguf_layout_shape_ex[1]; dim_1++) + for (size_t dim_2 = 0; dim_2 < gguf_layout_shape_ex[2]; dim_2++) + for (size_t dim_3 = 0; dim_3 < gguf_layout_shape_ex[3]; dim_3++) { + current_offset = element_size * (dim_0 * permuted_strides[0] + dim_1 * permuted_strides[1] + + dim_2 * permuted_strides[2] + dim_3 * permuted_strides[3]); + if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case + write_float_plus_one(out, (float*)ir_tensor_data + current_offset); + } else { + out.write(ir_tensor_data + current_offset, element_size); + } + num_bytes_written += element_size; + } + std::cout << "VSHAMPOR: wrote " << num_bytes_written << std::endl; + OPENVINO_ASSERT(num_bytes_written == tensor_info.size); + } + } +} + +struct ValueStorageForLifetimeExtension { + std::list kv_key_string_storage; + std::list kv_value_string_storage; + std::list> str_arr_storage; + void* store_gguf_value_vector(const std::vector& vec, gguf_type g_type) { + size_t elt_size; + switch (g_type) { + case GGUF_TYPE_UINT8: + elt_size = sizeof(uint8_t); + break; + case GGUF_TYPE_INT8: + elt_size = sizeof(int8_t); + break; + case GGUF_TYPE_UINT16: + elt_size = sizeof(uint16_t); + break; + case GGUF_TYPE_INT16: + elt_size = sizeof(int16_t); + break; + case GGUF_TYPE_UINT32: + elt_size = sizeof(uint32_t); + break; + case GGUF_TYPE_INT32: + elt_size = sizeof(int32_t); + break; + case GGUF_TYPE_FLOAT32: + elt_size = sizeof(float); + break; + case GGUF_TYPE_UINT64: + elt_size = sizeof(uint64_t); + break; + case GGUF_TYPE_INT64: + elt_size = sizeof(int64_t); + break; + case GGUF_TYPE_FLOAT64: + elt_size = sizeof(double); + break; + case GGUF_TYPE_BOOL: + elt_size = sizeof(bool); + break; + default: + OPENVINO_THROW("Unknown array type"); + } + size_t size_in_bytes = vec.size() * elt_size; + void* mem_ptr = new char[size_in_bytes]; + for (size_t i = 0; i < vec.size(); i++) { + switch (g_type) { + case GGUF_TYPE_UINT8: + ((uint8_t*)mem_ptr)[i] = vec[i].uint8; + break; + case GGUF_TYPE_INT8: + ((int8_t*)mem_ptr)[i] = vec[i].int8; + break; + case GGUF_TYPE_UINT16: + ((uint16_t*)mem_ptr)[i] = vec[i].uint16; + break; + case GGUF_TYPE_INT16: + ((int16_t*)mem_ptr)[i] = vec[i].int16; + break; + case GGUF_TYPE_UINT32: + ((uint32_t*)mem_ptr)[i] = vec[i].uint32; + break; + case GGUF_TYPE_INT32: + ((int32_t*)mem_ptr)[i] = vec[i].int32; + break; + case GGUF_TYPE_FLOAT32: + ((float*)mem_ptr)[i] = vec[i].float32; + break; + case GGUF_TYPE_UINT64: + ((uint64_t*)mem_ptr)[i] = vec[i].uint64; + break; + case GGUF_TYPE_INT64: + ((int64_t*)mem_ptr)[i] = vec[i].int64; + break; + case GGUF_TYPE_FLOAT64: + ((double*)mem_ptr)[i] = vec[i].float64; + break; + case GGUF_TYPE_BOOL: + ((bool*)mem_ptr)[i] = vec[i].bool_; + break; + default: + OPENVINO_THROW("Unknown array type"); } - - std::set gemma_tensor_names_to_increment; - // FIXME (vshampor): tried setting up commands for incrementing *_norm.weight values by 1 like it is done - // during llama.cpp HF-to-GGUF export, but it seems that it isn't necessary and IR stores the incremented weights already - // Is this due to constant folding? 
- - // for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { - // const std::string& llama_name = llama_name_and_rtinfo_name.first; - // const std::string& rtinfo_name = llama_name_and_rtinfo_name.second.as(); - // std::string gemma_norm_suffix = "norm.weight"; - // if (rtinfo_name.size() < gemma_norm_suffix.size()) continue; - // if (rtinfo_name.substr(rtinfo_name.size() - gemma_norm_suffix.size()) == gemma_norm_suffix) gemma_tensor_names_to_increment.insert(llama_name); - // } - - std::cout << "VSHAMPOR: writing tensor data (blob with transpositions) " << std::endl; - append_tensor_data_with_transpositions(m_converted_gguf_file_name, tensor_infos, tensor_data_ptrs, transpose_permutations, gemma_tensor_names_to_increment); - std::cout << "VSHAMPOR: write finished." << m_converted_gguf_file_name << std::endl; - - std::cout << "VSHAMPOR: loading llama model from written file..." << std::endl; - llama_model_params mparams = llama_model_default_params(); - mparams.n_gpu_layers = 99; - m_llama_model_ptr = llama_load_model_from_file(m_converted_gguf_file_name.c_str(), mparams); - llama_context_params cparams = llama_context_default_params(); - m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); - - std::cout << "VSHAMPOR: llama model loaded successfully..." << std::endl; } + return mem_ptr; + } - - LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model, std::istream& input_stream, const std::shared_ptr& plugin) : - ICompiledModel(ov_model, plugin) { - num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove - *num_tokens_processed_ptr = 0; - std::shared_ptr llama_plugin = std::dynamic_pointer_cast(plugin); - std::string current_file_path = llama_plugin->get_current_gguf_file_path(); - std::ofstream output_stream(current_file_path, std::ios::binary); - output_stream << input_stream.rdbuf(); - - - std::cout << "VSHAMPOR: loading llama model from imported and re-written file..." << std::endl; - llama_model_params mparams = llama_model_default_params(); - mparams.n_gpu_layers = 99; - m_llama_model_ptr = llama_load_model_from_file(current_file_path.c_str(), mparams); - llama_context_params cparams = llama_context_default_params(); - m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); - std::cout << "VSHAMPOR: llama model loaded successfully from cache..." << std::endl; + ValueStorageForLifetimeExtension() = default; + ~ValueStorageForLifetimeExtension() { + for (void* ptr : non_str_raw_storage) { + delete[](char*) ptr; } + } - LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin) : - ICompiledModel(nullptr, plugin) { - num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove - *num_tokens_processed_ptr = 0; - std::cout << "VSHAMPOR: loading llama model directly from GGUF... " << std::endl; - llama_model_params mparams = llama_model_default_params(); - mparams.n_gpu_layers = 99; - m_llama_model_ptr = llama_load_model_from_file(gguf_fname.c_str(), mparams); - llama_context_params cparams = llama_context_default_params(); - m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); - std::cout << "VSHAMPOR: llama model loaded successfully from GGUF..." 
<< std::endl; - - auto input_ids = std::make_shared(ov::element::Type_t::i64, ov::PartialShape({-1, -1})); - auto fake_convert = std::make_shared(input_ids->output(0), ov::element::Type_t::f32); - auto logits = std::make_shared(fake_convert->output(0)); - - ov::ParameterVector inputs{input_ids}; - - std::vector> unused_names_in_order = { { "attention_mask", ov::element::Type_t::i64 }, - { "position_ids", ov::element::Type_t::i64 }, - { "beam_idx", ov::element::Type_t::i32 } }; - for (const auto& descr : unused_names_in_order) { - auto unused_inp = std::make_shared(descr.second, ov::PartialShape({-1, -1})); - inputs.push_back(unused_inp); +private: + std::list non_str_raw_storage; +}; + +bool maybe_parse_single_element(gguf_type g_type, + ov::Any rtmap_value, + gguf_value& dst, + ValueStorageForLifetimeExtension& store) { + switch (g_type) { + case GGUF_TYPE_UINT8: + dst.uint8 = rtmap_value.as(); + break; + case GGUF_TYPE_INT8: + dst.int8 = rtmap_value.as(); + ; + break; + case GGUF_TYPE_UINT16: + dst.uint16 = rtmap_value.as(); + break; + case GGUF_TYPE_INT16: + dst.int16 = rtmap_value.as(); + break; + case GGUF_TYPE_UINT32: + dst.uint32 = rtmap_value.as(); + break; + case GGUF_TYPE_INT32: + dst.int32 = rtmap_value.as(); + break; + case GGUF_TYPE_FLOAT32: + dst.float32 = rtmap_value.as(); + break; + case GGUF_TYPE_UINT64: + dst.uint64 = rtmap_value.as(); + break; + case GGUF_TYPE_INT64: + dst.int64 = rtmap_value.as(); + break; + case GGUF_TYPE_FLOAT64: + dst.float64 = rtmap_value.as(); + break; + case GGUF_TYPE_BOOL: + dst.bool_ = rtmap_value.as(); + break; + case GGUF_TYPE_STRING: { + std::string string_value = rtmap_value.as(); + store.kv_value_string_storage.push_back(string_value); + dst.str.n = string_value.length(); + dst.str.data = + (char*)store.kv_value_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below + break; + } + default: + return false; // did not parse + } + return true; // parsed successfully +} + +ov::Any get_any_associated_with_gguf_type(gguf_type g_type) { + switch (g_type) { + case GGUF_TYPE_UINT8: + return ov::Any(uint8_t()); + case GGUF_TYPE_INT8: + return ov::Any(int8_t()); + case GGUF_TYPE_UINT16: + return ov::Any(uint16_t()); + case GGUF_TYPE_INT16: + return ov::Any(int16_t()); + case GGUF_TYPE_UINT32: + return ov::Any(uint32_t()); + case GGUF_TYPE_INT32: + return ov::Any(int32_t()); + case GGUF_TYPE_FLOAT32: + return ov::Any(float()); + case GGUF_TYPE_UINT64: + return ov::Any(uint64_t()); + case GGUF_TYPE_INT64: + return ov::Any(int64_t()); + case GGUF_TYPE_FLOAT64: + return ov::Any(double()); + case GGUF_TYPE_BOOL: + return ov::Any(bool()); + case GGUF_TYPE_STRING: + return ov::Any(std::string()); + default: + OPENVINO_THROW("Unknown gguf_type to turn into ov::Any"); + } +} + +LlamaCppModel::LlamaCppModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const ov::SoPtr& context, + const std::shared_ptr& task_executor) + : ICompiledModel(model, plugin, context, task_executor) { + m_model = model; + num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove + *num_tokens_processed_ptr = 0; + auto rt_info = model->get_rt_info(); + OPENVINO_ASSERT(rt_info.count("lcp_kv_params") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_kv_types") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_kv_array_types") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_tensor_name_map") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_tensor_shape_map") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_expected_tensor_shapes") != 0); + 
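    // Editorial summary of the rt_info contract consumed below (semantics inferred from this constructor):
    //   lcp_kv_params              - GGUF metadata key/value pairs (array values as whitespace-joined strings)
    //   lcp_kv_types               - gguf_type code of each key's value
    //   lcp_kv_array_types         - element gguf_type for array-valued keys
    //   lcp_tensor_name_map        - llama tensor name -> IR weight (rt_info) name to search for
    //   lcp_tensor_shape_map       - llama tensor name -> shape used to match Constant nodes in the ov::Model
    //   lcp_expected_tensor_shapes - llama tensor name -> shape to declare in the generated GGUF header
    //   lcp_transpose_permutations - llama tensor name -> pair of dims to swap when dumping tensor data
    //   lcp_gguf_version           - GGUF version recorded by the exporter (currently only logged)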
OPENVINO_ASSERT(rt_info.count("lcp_transpose_permutations") != 0); + + RTMap& kv_params = model->get_rt_info("lcp_kv_params"); + RTMap& kv_types = model->get_rt_info("lcp_kv_types"); + RTMap& kv_array_types = model->get_rt_info("lcp_kv_array_types"); + RTMap& tensor_name_map = model->get_rt_info("lcp_tensor_name_map"); + RTMap& tensor_shape_map = model->get_rt_info("lcp_tensor_shape_map"); + RTMap& expected_tensor_shapes_map = model->get_rt_info("lcp_expected_tensor_shapes"); + RTMap& transpose_permutations_rtmap = model->get_rt_info("lcp_transpose_permutations"); + + size_t gguf_version = model->get_rt_info("lcp_gguf_version"); + std::cout << "VSHAMPOR: parsed gguf_version " << gguf_version << std::endl; + + // kv params + OPENVINO_ASSERT(kv_params.size() == kv_types.size()); + size_t n_kv = kv_params.size(); + std::vector kv_vector; + ValueStorageForLifetimeExtension store; + + for (const auto& kv_pair : kv_params) { + gguf_kv kv; + + const auto& key = kv_pair.first; + kv.key.n = key.length(); + store.kv_key_string_storage.push_back(key); + kv.key.data = (char*)store.kv_key_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below + + uint32_t value_type = kv_types[key].as(); + gguf_type gguf_value_type = (gguf_type)value_type; + kv.type = gguf_value_type; + if (gguf_value_type != GGUF_TYPE_ARRAY) { + bool is_parsed = maybe_parse_single_element(kv.type, kv_pair.second, kv.value, store); + OPENVINO_ASSERT(is_parsed, "Invalid type of a GGUF kv-value"); + } else { // array case + gguf_type element_type = (gguf_type)kv_array_types[key].as(); + kv.value.arr.type = element_type; + std::string serialized_array = kv_pair.second.as(); + std::stringstream ss{serialized_array}; + std::vector parsed_array; + while (!ss.eof()) { + gguf_value array_elt; + ov::Any ov_any = get_any_associated_with_gguf_type(element_type); + std::string token; + ss >> token; + if (std::string(kv.key.data) == "tokenizer.ggml.merges") { + // tokenizer merges are pairs of tokens separated by whitespace, so + // need to read another to get a proper merge + // TODO (vshampor): think of another delimiting strategy in the + // rt_info and use that strategy here for more robust code + std::string another_token; + ss >> another_token; + token += std::string(" ") + another_token; + ov_any = ov::Any::make(token); + } else { + std::stringstream tok_ss{token}; + ov_any.read(tok_ss); + } + bool is_parsed = maybe_parse_single_element(element_type, ov_any, array_elt, store); + OPENVINO_ASSERT(is_parsed); + parsed_array.push_back(array_elt); } - - m_model = std::make_shared(logits, inputs, "fake_ov_model_for_io_specification"); - - m_model->inputs()[0].set_names({"input_ids"}); - for (size_t i = 0; i < unused_names_in_order.size(); i++) { - m_model->inputs()[i + 1].set_names({unused_names_in_order[i].first}); + kv.value.arr.n = parsed_array.size(); + if (element_type == GGUF_TYPE_STRING) { + // string element has already been lifetime-extended during parsing + std::vector cstr_vector(parsed_array.size()); + for (size_t cstr_idx = 0; cstr_idx < parsed_array.size(); cstr_idx++) { + cstr_vector[cstr_idx] = parsed_array[cstr_idx].str.data; + } + store.str_arr_storage.push_back(cstr_vector); + kv.value.arr.data = store.str_arr_storage.back().data(); + } else { + void* data_ptr = store.store_gguf_value_vector(parsed_array, element_type); + kv.value.arr.data = data_ptr; } + } + kv_vector.push_back(kv); + } - m_model->outputs()[0].set_names({"logits"}); - - for (auto input : m_model->inputs()) { - 
m_fake_inputs.emplace_back(input); - } - for (auto output : m_model->outputs()) { - m_fake_outputs.emplace_back(output); + auto token_types_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { + return std::string(val.key.data) == "tokenizer.ggml.token_type"; + }); + if (token_types_kv_it != kv_vector.end()) { + auto tokens_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { + return std::string(val.key.data) == "tokenizer.ggml.tokens"; + }); + if (tokens_kv_it != kv_vector.end()) { + size_t expected_num_tokens = token_types_kv_it->value.arr.n; + size_t actual_num_tokens = tokens_kv_it->value.arr.n; + if (actual_num_tokens < expected_num_tokens) { + std::cout << "VSHAMPOR: detected wrong vocab " + "serialization/deserialization (expected " + << expected_num_tokens << " tokens, parsed " << actual_num_tokens + << " from vocab), filling tokens with bogus values" << std::endl; + std::vector new_vocab; + // char** old_vocab_data_ptr = (char**) tokens_kv_it->value.arr.data; + // std::copy(old_vocab_data_ptr, old_vocab_data_ptr + actual_num_tokens, + // new_vocab.begin()); size_t extra_tokens_needed = expected_num_tokens + // - actual_num_tokens; + size_t extra_tokens_needed = expected_num_tokens; + for (size_t tok_idx = 0; tok_idx < extra_tokens_needed; tok_idx++) { + std::stringstream ss; + ss << "invalid_token_" << tok_idx; + std::string new_token = ss.str(); + store.kv_value_string_storage.push_back(new_token); + char* str_data_ptr = (char*)store.kv_value_string_storage.back().c_str(); + new_vocab.push_back(str_data_ptr); + } + OPENVINO_ASSERT(new_vocab.size() == expected_num_tokens); + store.str_arr_storage.push_back(new_vocab); + tokens_kv_it->value.arr.data = (void*)store.str_arr_storage.back().data(); + tokens_kv_it->value.arr.n = expected_num_tokens; } } + } + // tensors + OPENVINO_ASSERT(tensor_name_map.size() == tensor_shape_map.size()); + size_t n_tensors_in_rtinfo = tensor_name_map.size(); + std::cout << "VSHAMPOR: got request for " << n_tensors_in_rtinfo << " tensors from rt_info\n"; - void LlamaCppModel::export_model(std::ostream& output_stream) const { - std::cout << "VSHAMPOR: exporting model" << std::endl; - - // FIXME (vshampor): it's a shame that loading a model from cache does not have an option to - // actually keep the already loaded model from xml and not be forced to deserialize an ov::Model - // representation from cache as well. As it stands, will need to write the whole IR into the cache entry - // along with the GGUF file. 
- // - std::stringstream xmlFile, binFile; - ov::pass::Serialize serializer(xmlFile, binFile); - serializer.run_on_model(m_model); - - auto m_constants = binFile.str(); - auto m_model = xmlFile.str(); - - auto dataSize = static_cast(m_model.size()); - output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); - output_stream.write(m_model.c_str(), dataSize); + std::vector tensor_infos; + std::vector tensor_data_ptrs; - dataSize = static_cast(m_constants.size()); - output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); - output_stream.write(reinterpret_cast(&m_constants[0]), dataSize); + std::map parsed_weights_to_search_for; + for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { + const std::string& llama_name = llama_name_and_rtinfo_name.first; + const std::string& rtinfo_name = llama_name_and_rtinfo_name.second.as(); + ov::Shape expected_shape = tensor_shape_map[llama_name].as(); + parsed_weights_to_search_for[rtinfo_name] = expected_shape; + } + TensorWeightMatcher matcher{model, parsed_weights_to_search_for}; + std::unordered_map> matches = matcher.get_matches(); + std::unordered_map> llama_name_to_constant_node_map; + for (const auto& entry : tensor_name_map) { + const auto& llama_name = entry.first; + const auto& rtinfo_name = entry.second.as(); + llama_name_to_constant_node_map[llama_name] = matches[rtinfo_name]; + } + std::cout << "VSHAMPOR: requested tensors map to " << llama_name_to_constant_node_map.size() + << " tensors to search in model (shared tensors considered)\n"; + + std::list llama_name_storage; + + size_t n_tensors = 0; + + size_t offset = 0; // each tensor_info has to have a correct offset including + // padding, checked for in gguf_write_to_buf + for (const auto& matched_weight_pair : llama_name_to_constant_node_map) { + // Need to store the names in the list so that the passed c_str() pointers + // in tensor_infos to the llama names stay valid until they get deepcopied + // in gguf/llama functions + llama_name_storage.push_back(matched_weight_pair.first); + const std::string& llama_name = llama_name_storage.back(); + + auto weight_const_node_ptr = matched_weight_pair.second; + auto weight_shape = weight_const_node_ptr->get_shape(); + + // does hf-to-gguf invert all tensor dimensions with shapes > 1? + auto expected_weight_shape = ov::Shape(expected_tensor_shapes_map[llama_name].as()); + OPENVINO_ASSERT(expected_weight_shape.size() < GGML_MAX_DIMS); + + gguf_tensor_info info; + + info.type = GGML_TYPE_F32; // TODO (vshampor): better type assignment based + // on actual element type of the Constant node + + info.name.n = llama_name.length(); + info.name.data = (char*)llama_name.c_str(); // TODO (vshampor): either do this via const_cast, or will + // have to implement own structures for read-only data + // passing to llama_load_model_from_data + info.n_dims = weight_shape.size(); + std::fill(std::begin(info.ne), std::begin(info.ne) + GGML_MAX_DIMS, (uint64_t)1); + + // looks like GGUF expects inverse order of dimensions when compared to e.g. 
+ // torch and actual row-major layout, see + // gguf.gguf_writer.GGUFWriter.add_tensor_info in gguf python package + std::copy(expected_weight_shape.rbegin(), expected_weight_shape.rend(), info.ne); + + void* data_ptr = (void*)(weight_const_node_ptr->get_data_ptr()); // TODO (vshampor): danger - casts + // `const` away also - the + // expected_weight_shape is in general + // different from actual ov::Tensor + // shape, in particular it may be + // transposed, so we actually need to set + // the pointers to shape-corrected tensor + // storage, which we don't do here - we + // are only preparing this data to get a + // convenient gguf_context object to + // reuse metadata (header) writing code, + // tensor data transpositions will be + // done during actual file write + + info.size = weight_const_node_ptr->get_byte_size(); + info.offset = offset; + + const size_t size_pad = GGML_PAD(info.size, GGUF_DEFAULT_ALIGNMENT); + offset += size_pad; + + info.data = data_ptr; + + tensor_infos.push_back(info); + tensor_data_ptrs.push_back(data_ptr); + n_tensors++; + } - std::ifstream in(m_converted_gguf_file_name, std::ios::binary); - output_stream << in.rdbuf(); - } + std::cout << "VSHAMPOR: found " << matches.size() << "/" << parsed_weights_to_search_for.size() << " tensors" + << std::endl; + + gguf_init_params gguf_params; + gguf_params.no_alloc = false; + gguf_params.ctx = nullptr; + + m_gguf_ctx = gguf_init_from_data(n_tensors, + tensor_infos.data(), + n_kv, + kv_vector.data(), + tensor_data_ptrs.data(), + gguf_params); + + std::shared_ptr llama_plugin_ptr = std::dynamic_pointer_cast(plugin); + m_converted_gguf_file_name = llama_plugin_ptr->get_current_gguf_file_path(); + + std::cout << "VSHAMPOR: output filename is " << m_converted_gguf_file_name << std::endl; + std::cout << "VSHAMPOR: writing metadata (GGUF header) " << std::endl; + gguf_write_to_file(m_gguf_ctx, + m_converted_gguf_file_name.c_str(), + /* only_meta = */ true); + + std::map transpose_permutations; + + for (const auto& llama_name_and_permutation : transpose_permutations_rtmap) { + std::string permutation_str = llama_name_and_permutation.second.as(); + std::stringstream ss(permutation_str); + TransposePermutation permutation; + bool is_ok = true; + is_ok &= static_cast(ss >> permutation.first); + is_ok &= static_cast(ss >> permutation.second); + OPENVINO_ASSERT(is_ok, "failed to read permutation"); + transpose_permutations[llama_name_and_permutation.first] = permutation; + } - std::shared_ptr LlamaCppModel::get_runtime_model() const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); - } + std::set gemma_tensor_names_to_increment; + // FIXME (vshampor): tried setting up commands for incrementing *_norm.weight + // values by 1 like it is done during llama.cpp HF-to-GGUF export, but it + // seems that it isn't necessary and IR stores the incremented weights already + // Is this due to constant folding? 
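To make the tensor_info bookkeeping above concrete: GGUF stores the dimensions in `ne` innermost-first, i.e. the IR shape reversed, and each tensor's data offset is the running total of the preceding tensor sizes rounded up to the GGUF alignment. A minimal self-contained sketch of just that bookkeeping, assuming GGML_MAX_DIMS == 4 and a 32-byte GGUF_DEFAULT_ALIGNMENT as in ggml, with FP32 tensors and purely illustrative shapes (not part of the patch):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    constexpr size_t kMaxDims = 4;  // GGML_MAX_DIMS
    constexpr size_t kAlign = 32;   // GGUF_DEFAULT_ALIGNMENT
    auto pad = [](size_t x, size_t n) { return ((x + n - 1) / n) * n; };  // same rounding as GGML_PAD

    // Hypothetical IR-layout shapes of three FP32 weight tensors.
    std::vector<std::vector<uint64_t>> ir_shapes = {{50257, 768}, {768}, {3072, 768}};

    size_t offset = 0;
    for (const auto& shape : ir_shapes) {
        uint64_t ne[kMaxDims] = {1, 1, 1, 1};
        // GGUF layout: innermost dimension first, i.e. the IR shape reversed.
        std::copy(shape.rbegin(), shape.rend(), ne);
        size_t byte_size = sizeof(float);
        for (uint64_t d : shape) byte_size *= d;
        std::printf("ne = [%llu %llu %llu %llu]  offset = %zu\n",
                    (unsigned long long)ne[0], (unsigned long long)ne[1],
                    (unsigned long long)ne[2], (unsigned long long)ne[3], offset);
        // The next tensor starts at the current size padded up to the alignment,
        // matching the GGML_PAD-based offset accumulation in the loop above.
        offset += pad(byte_size, kAlign);
    }
    return 0;
}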
+ + // for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { + // const std::string& llama_name = llama_name_and_rtinfo_name.first; + // const std::string& rtinfo_name = + // llama_name_and_rtinfo_name.second.as(); std::string + // gemma_norm_suffix = "norm.weight"; if (rtinfo_name.size() < + // gemma_norm_suffix.size()) continue; if + // (rtinfo_name.substr(rtinfo_name.size() - gemma_norm_suffix.size()) == + // gemma_norm_suffix) gemma_tensor_names_to_increment.insert(llama_name); + // } + + std::cout << "VSHAMPOR: writing tensor data (blob with transpositions) " << std::endl; + append_tensor_data_with_transpositions(m_converted_gguf_file_name, + tensor_infos, + tensor_data_ptrs, + transpose_permutations, + gemma_tensor_names_to_increment); + std::cout << "VSHAMPOR: write finished." << m_converted_gguf_file_name << std::endl; + + std::cout << "VSHAMPOR: loading llama model from written file..." << std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(m_converted_gguf_file_name.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + + std::cout << "VSHAMPOR: llama model loaded successfully..." << std::endl; +} + +LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model, + std::istream& input_stream, + const std::shared_ptr& plugin) + : ICompiledModel(ov_model, plugin) { + num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove + *num_tokens_processed_ptr = 0; + std::shared_ptr llama_plugin = std::dynamic_pointer_cast(plugin); + std::string current_file_path = llama_plugin->get_current_gguf_file_path(); + std::ofstream output_stream(current_file_path, std::ios::binary); + output_stream << input_stream.rdbuf(); + + std::cout << "VSHAMPOR: loading llama model from imported and re-written file..." << std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(current_file_path.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + std::cout << "VSHAMPOR: llama model loaded successfully from cache..." << std::endl; +} + +LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin) + : ICompiledModel(nullptr, plugin) { + num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove + *num_tokens_processed_ptr = 0; + std::cout << "VSHAMPOR: loading llama model directly from GGUF... " << std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(gguf_fname.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + std::cout << "VSHAMPOR: llama model loaded successfully from GGUF..." 
<< std::endl; + + auto input_ids = std::make_shared(ov::element::Type_t::i64, ov::PartialShape({-1, -1})); + auto fake_convert = std::make_shared(input_ids->output(0), ov::element::Type_t::f32); + auto logits = std::make_shared(fake_convert->output(0)); + + ov::ParameterVector inputs{input_ids}; + + std::vector> unused_names_in_order = { + {"attention_mask", ov::element::Type_t::i64}, + {"position_ids", ov::element::Type_t::i64}, + {"beam_idx", ov::element::Type_t::i32}}; + for (const auto& descr : unused_names_in_order) { + auto unused_inp = std::make_shared(descr.second, ov::PartialShape({-1, -1})); + inputs.push_back(unused_inp); + } - void LlamaCppModel::set_property(const ov::AnyMap& properties) { - std::cout << "VSHAMPOR: attempted to set_property (did nothing)"; - } + m_model = std::make_shared(logits, inputs, "fake_ov_model_for_io_specification"); - ov::Any LlamaCppModel::get_property(const std::string& name) const { - if (ov::supported_properties == name) { - return decltype(ov::supported_properties)::value_type(std::vector()); - } - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); - } + m_model->inputs()[0].set_names({"input_ids"}); + for (size_t i = 0; i < unused_names_in_order.size(); i++) { + m_model->inputs()[i + 1].set_names({unused_names_in_order[i].first}); + } - std::shared_ptr LlamaCppModel::create_sync_infer_request() const { - return std::make_shared(std::static_pointer_cast(shared_from_this())); - } + m_model->outputs()[0].set_names({"logits"}); - const std::vector>& LlamaCppModel::inputs() const { - return m_fake_inputs; - }; - const std::vector>& LlamaCppModel::outputs() const { - return m_fake_outputs; - }; + for (auto input : m_model->inputs()) { + m_fake_inputs.emplace_back(input); + } + for (auto output : m_model->outputs()) { + m_fake_outputs.emplace_back(output); + } +} + +void LlamaCppModel::export_model(std::ostream& output_stream) const { + std::cout << "VSHAMPOR: exporting model" << std::endl; + + // FIXME (vshampor): it's a shame that loading a model from cache does not + // have an option to actually keep the already loaded model from xml and not + // be forced to deserialize an ov::Model representation from cache as well. As + // it stands, will need to write the whole IR into the cache entry along with + // the GGUF file. 
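The cache entry that export_model below produces, and that LlamaCppPlugin::import_model later in this patch consumes, is a plain concatenation: a length-prefixed IR XML blob, a length-prefixed IR constants blob, then the raw GGUF bytes. A minimal reader sketch under that assumption; the struct and function names here are illustrative only and are not part of the plugin:

#include <cstdint>
#include <istream>
#include <iterator>
#include <string>
#include <vector>

struct LlamaCppCacheEntry {
    std::string xml;               // serialized ov::Model (IR XML)
    std::vector<char> weights;     // IR constants blob
    std::vector<char> gguf_bytes;  // converted GGUF file, appended verbatim
};

inline LlamaCppCacheEntry read_cache_entry(std::istream& in) {
    LlamaCppCacheEntry entry;
    std::uint64_t size = 0;

    in.read(reinterpret_cast<char*>(&size), sizeof(size));  // XML size prefix
    entry.xml.resize(size);
    in.read(&entry.xml[0], static_cast<std::streamsize>(size));

    in.read(reinterpret_cast<char*>(&size), sizeof(size));  // constants size prefix
    entry.weights.resize(size);
    in.read(entry.weights.data(), static_cast<std::streamsize>(size));

    // Everything after the IR is the GGUF file; the plugin writes it out to
    // <cache_dir>/current.gguf and loads it with llama_load_model_from_file.
    entry.gguf_bytes.assign(std::istreambuf_iterator<char>(in), std::istreambuf_iterator<char>());
    return entry;
}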
+ // + std::stringstream xmlFile, binFile; + ov::pass::Serialize serializer(xmlFile, binFile); + serializer.run_on_model(m_model); + + auto m_constants = binFile.str(); + auto m_model = xmlFile.str(); + + auto dataSize = static_cast(m_model.size()); + output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); + output_stream.write(m_model.c_str(), dataSize); + + dataSize = static_cast(m_constants.size()); + output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); + output_stream.write(reinterpret_cast(&m_constants[0]), dataSize); + + std::ifstream in(m_converted_gguf_file_name, std::ios::binary); + output_stream << in.rdbuf(); +} + +std::shared_ptr LlamaCppModel::get_runtime_model() const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} + +void LlamaCppModel::set_property(const ov::AnyMap& properties) { + std::cout << "VSHAMPOR: attempted to set_property (did nothing)"; +} + +ov::Any LlamaCppModel::get_property(const std::string& name) const { + if (ov::supported_properties == name) { + return decltype(ov::supported_properties)::value_type(std::vector()); } + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} + +std::shared_ptr LlamaCppModel::create_sync_infer_request() const { + return std::make_shared( + std::static_pointer_cast(shared_from_this())); +} + +const std::vector>& LlamaCppModel::inputs() const { + return m_fake_inputs; +}; +const std::vector>& LlamaCppModel::outputs() const { + return m_fake_outputs; +}; +} // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index 0993422f6..6b5e8ba1e 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -1,11 +1,12 @@ #include "infer_request.hpp" -#include "openvino/runtime/make_tensor.hpp" + #include "llama.h" +#include "openvino/runtime/make_tensor.hpp" namespace ov { - namespace llama_cpp_plugin { +namespace llama_cpp_plugin { - void allocate_tensor_impl(ov::SoPtr& tensor, +void allocate_tensor_impl(ov::SoPtr& tensor, const ov::element::Type& element_type, const ov::Shape& shape) { if (!tensor || tensor->get_element_type() != element_type) { @@ -15,97 +16,105 @@ namespace ov { } } - LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model): ov::ISyncInferRequest(compiled_model) { - std::cout << "VSHAMPOR: infer request ctor called\n"; - m_compiled_model_ptr = compiled_model; - // Allocate input/output tensors - for (const auto& input : get_inputs()) { - allocate_tensor(input, [input](ov::SoPtr& tensor) { - // Can add a check to avoid double work in case of shared tensors - allocate_tensor_impl(tensor, - input.get_element_type(), - input.get_partial_shape().is_dynamic() ? ov::Shape{0} : input.get_shape()); - }); - } - for (const auto& output : get_outputs()) { - allocate_tensor(output, [output](ov::SoPtr& tensor) { - // Can add a check to avoid double work in case of shared tensors - allocate_tensor_impl(tensor, - output.get_element_type(), - output.get_partial_shape().is_dynamic() ? 
ov::Shape{0} : output.get_shape()); - }); +LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model) + : ov::ISyncInferRequest(compiled_model) { + std::cout << "VSHAMPOR: infer request ctor called\n"; + m_compiled_model_ptr = compiled_model; + // Allocate input/output tensors + for (const auto& input : get_inputs()) { + allocate_tensor(input, [input](ov::SoPtr& tensor) { + // Can add a check to avoid double work in case of shared tensors + allocate_tensor_impl(tensor, + input.get_element_type(), + input.get_partial_shape().is_dynamic() ? ov::Shape{0} : input.get_shape()); + }); } - } - void LlamaCppSyncInferRequest::set_tensors_impl(const ov::Output port, const std::vector>& tensors) { - std::cout << "VSHAMPOR: set_tensors_impl called\n"; + for (const auto& output : get_outputs()) { + allocate_tensor(output, [output](ov::SoPtr& tensor) { + // Can add a check to avoid double work in case of shared tensors + allocate_tensor_impl(tensor, + output.get_element_type(), + output.get_partial_shape().is_dynamic() ? ov::Shape{0} : output.get_shape()); + }); } +} +void LlamaCppSyncInferRequest::set_tensors_impl(const ov::Output port, + const std::vector>& tensors) { + std::cout << "VSHAMPOR: set_tensors_impl called\n"; +} - void llama_batch_add_reimpl( - struct llama_batch & batch, - llama_token id, - llama_pos pos, - const std::vector & seq_ids, - bool logits) { - batch.token [batch.n_tokens] = id; - batch.pos [batch.n_tokens] = pos; - batch.n_seq_id[batch.n_tokens] = seq_ids.size(); - for (size_t i = 0; i < seq_ids.size(); ++i) { - batch.seq_id[batch.n_tokens][i] = seq_ids[i]; - } - batch.logits [batch.n_tokens] = logits; - - batch.n_tokens++; +void llama_batch_add_reimpl(struct llama_batch& batch, + llama_token id, + llama_pos pos, + const std::vector& seq_ids, + bool logits) { + batch.token[batch.n_tokens] = id; + batch.pos[batch.n_tokens] = pos; + batch.n_seq_id[batch.n_tokens] = seq_ids.size(); + for (size_t i = 0; i < seq_ids.size(); ++i) { + batch.seq_id[batch.n_tokens][i] = seq_ids[i]; } + batch.logits[batch.n_tokens] = logits; + + batch.n_tokens++; +} - void LlamaCppSyncInferRequest::infer() { - auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among all inputs without hardcode - OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); - OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); - size_t batch_size = input_ids_tensor_ptr->get_shape()[0]; - size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; - - // llama_batch actually contains one sequence - llama_batch batch = llama_batch_init(sequence_length, /* embd = */ 0, /* n_seq_max = */ 1); - const int64_t* data_ptr = input_ids_tensor_ptr->data(); - - const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */; - - for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) { - const int64_t token_id = sequence_start_ptr[tok_idx]; - llama_batch_add_reimpl(batch, token_id, *(m_compiled_model_ptr->num_tokens_processed_ptr), { 0 }, true); // the last `true` here is a marker that the logits for this token should be computed and returned - size_t* ptr = m_compiled_model_ptr->num_tokens_processed_ptr; - (*ptr)++; - } - - - llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; - int32_t sts = llama_decode(ctx, batch); - - if (sts != 0) { - OPENVINO_THROW("llama_decode failed with code ", sts); - } - - size_t n_vocab = llama_n_vocab(m_compiled_model_ptr->m_llama_model_ptr); - - ov::Tensor 
output_tensor{ov::element::Type_t::f32, {1, sequence_length, n_vocab}}; - float* output_tensor_data_ptr = output_tensor.data(); - - for (size_t pos = 0; pos < sequence_length; pos++) { - float* logits_from_llama = llama_get_logits_ith(ctx, pos); - std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab); - } - - auto& logit_output = get_outputs()[0]; - allocate_tensor(logit_output, [&output_tensor](ov::SoPtr& tensor) { allocate_tensor_impl(tensor, output_tensor.get_element_type(), output_tensor.get_shape()); - output_tensor.copy_to(ov::make_tensor(tensor)); }); - }; - std::vector LlamaCppSyncInferRequest::get_profiling_info() const { - std::cout << "VSHAMPOR: get_profiling_info() called\n"; - return std::vector{}; - }; - std::vector> LlamaCppSyncInferRequest::query_state() const { - std::cout << "VSHAMPOR: get_profiling_info() called\n"; - return std::vector>{}; +void LlamaCppSyncInferRequest::infer() { + auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among + // all inputs without hardcode + OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); + OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); + size_t batch_size = input_ids_tensor_ptr->get_shape()[0]; + size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; + + // llama_batch actually contains one sequence + llama_batch batch = llama_batch_init(sequence_length, /* embd = */ 0, /* n_seq_max = */ 1); + const int64_t* data_ptr = input_ids_tensor_ptr->data(); + + const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */; + + for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) { + const int64_t token_id = sequence_start_ptr[tok_idx]; + llama_batch_add_reimpl(batch, + token_id, + *(m_compiled_model_ptr->num_tokens_processed_ptr), + {0}, + true); // the last `true` here is a marker that the logits for this + // token should be computed and returned + size_t* ptr = m_compiled_model_ptr->num_tokens_processed_ptr; + (*ptr)++; } + + llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; + int32_t sts = llama_decode(ctx, batch); + + if (sts != 0) { + OPENVINO_THROW("llama_decode failed with code ", sts); } + + size_t n_vocab = llama_n_vocab(m_compiled_model_ptr->m_llama_model_ptr); + + ov::Tensor output_tensor{ov::element::Type_t::f32, {1, sequence_length, n_vocab}}; + float* output_tensor_data_ptr = output_tensor.data(); + + for (size_t pos = 0; pos < sequence_length; pos++) { + float* logits_from_llama = llama_get_logits_ith(ctx, pos); + std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab); + } + + auto& logit_output = get_outputs()[0]; + allocate_tensor(logit_output, [&output_tensor](ov::SoPtr& tensor) { + allocate_tensor_impl(tensor, output_tensor.get_element_type(), output_tensor.get_shape()); + output_tensor.copy_to(ov::make_tensor(tensor)); + }); +}; +std::vector LlamaCppSyncInferRequest::get_profiling_info() const { + std::cout << "VSHAMPOR: get_profiling_info() called\n"; + return std::vector{}; +}; +std::vector> LlamaCppSyncInferRequest::query_state() const { + std::cout << "VSHAMPOR: get_profiling_info() called\n"; + return std::vector>{}; +} +} // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp index 9f633426f..3e23c568f 100644 --- a/modules/llama_cpp_plugin/src/plugin.cpp +++ b/modules/llama_cpp_plugin/src/plugin.cpp @@ -1,151 +1,169 
@@ #include "plugin.hpp" + +#include + #include "compiled_model.hpp" #include "openvino/op/constant.hpp" -#include #include "openvino/runtime/internal_properties.hpp" - namespace { static constexpr const char* wait_executor_name = "LlamaCppWaitExecutor"; static constexpr const char* stream_executor_name = "LlamaCppStreamsExecutor"; static constexpr const char* template_exclusive_executor = "LlamaCppExecutor"; } // namespace - namespace ov { - namespace llama_cpp_plugin { - LlamaCppPlugin::LlamaCppPlugin() : IPlugin() { - set_device_name("LLAMA_CPP"); - } - std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, - const ov::AnyMap& properties) const { - std::cout << "VSHAMPOR: LlamaCppPlugin::compile_model" << std::endl; - - //std::string gpt2_node_name = "transformer.h.9.attn.c_proj.weight"; - //std::cout << "VSHAMPOR: sanity check - looking for node containing " << gpt2_node_name << std::endl; - //auto ops = model->get_ops(); - //auto iter = std::find_if(ops.begin(), ops.end(), [gpt2_node_name](const std::shared_ptr& val) { - // return val->get_friendly_name().find(gpt2_node_name) != std::string::npos; }); - //if (iter == ops.end()) { - // std::cout << "VSHAMPOR: did not find the node\n"; - //} else { - // std::shared_ptr node_with_tensor = *iter; - // std::cout << "VSHAMPOR: node type is " << node_with_tensor->get_type_name() << std::endl; - // std::shared_ptr const_node_ptr = ov::as_type_ptr(node_with_tensor); - // const float* data_ptr = const_node_ptr->get_data_ptr(); - // // ov::descriptor::Tensor& tensor_descr = node_with_tensor->get_output_tensor(0); - // // std::cout << "VSHAMPOR: node output tensor shape is " << tensor_descr.get_shape().to_string() << std::endl; - // // ov::TensorVector in, out; - // // node_with_tensor->evaluate(out, in); - // // std::cout << "VSHAMPOR: evaluated " << out.size() << " output tensors\n"; - // // if (!out.empty()) { - // // const ov::Tensor& tensor = out[0]; - // // const float* vals = tensor.data(); - // // std::cout << "VSHAMPOR: first elements of the weight tensor are "; - // // for (size_t i = 0; i < 10; i++) { - // // std::cout << vals[i] << " "; - // // } - // // std::cout << std::endl; - // // } - // std::cout << "VSHAMPOR: first elements of the weight tensor are "; - // for (size_t i = 0; i < 10; i++) { - // std::cout << data_ptr[i] << " "; - // } - // std::cout << std::endl; - //} - return compile_model(model, properties, {}); - } - - std::shared_ptr LlamaCppPlugin::compile_model(const std::string& fname, const ov::AnyMap& properties) const { - return std::make_shared(fname, shared_from_this()); - } - std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, - const ov::AnyMap& properties, - const ov::SoPtr& context) const { - std::cout << "VSHAMPOR: compile_model called in C++" << std::endl; - return std::make_shared(model->clone(), shared_from_this(), context, get_executor_manager()->get_executor(template_exclusive_executor)); - } - - void LlamaCppPlugin::set_property(const ov::AnyMap& properties) { - for (const auto& map_entry : properties) { - if (map_entry.first == ov::cache_dir.name()) { - m_cache_dir = map_entry.second.as(); - } - else { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: setting property ", map_entry.first, "not implemented"); - } - } - } - - ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap& arguments) const { - if (ov::supported_properties == name) { - return decltype(ov::supported_properties)::value_type(std::vector({ov::cache_dir, 
ov::device::capabilities, ov::device::full_name})); - } - if (ov::device::capabilities == name) { - return decltype(ov::device::capabilities)::value_type(std::vector({ov::device::capability::EXPORT_IMPORT})); - } - if (ov::internal::supported_properties == name) { - return decltype(ov::internal::supported_properties)::value_type(std::vector({ov::internal::caching_properties})); - } - - if (ov::cache_dir == name) { - return m_cache_dir; - } - if (ov::internal::caching_properties == name) { - return std::vector{ov::device::full_name}; - } - - if (ov::device::full_name == name) { - return std::string("LLAMA_CPP"); - } - - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +namespace llama_cpp_plugin { +LlamaCppPlugin::LlamaCppPlugin() : IPlugin() { + set_device_name("LLAMA_CPP"); +} +std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const { + std::cout << "VSHAMPOR: LlamaCppPlugin::compile_model" << std::endl; + + // std::string gpt2_node_name = "transformer.h.9.attn.c_proj.weight"; + // std::cout << "VSHAMPOR: sanity check - looking for node containing " << + // gpt2_node_name << std::endl; auto ops = model->get_ops(); auto iter = + // std::find_if(ops.begin(), ops.end(), [gpt2_node_name](const + // std::shared_ptr& val) { + // return val->get_friendly_name().find(gpt2_node_name) != + // std::string::npos; }); + // if (iter == ops.end()) { + // std::cout << "VSHAMPOR: did not find the node\n"; + //} else { + // std::shared_ptr node_with_tensor = *iter; + // std::cout << "VSHAMPOR: node type is " << + // node_with_tensor->get_type_name() << std::endl; + // std::shared_ptr const_node_ptr = + // ov::as_type_ptr(node_with_tensor); const float* + // data_ptr = const_node_ptr->get_data_ptr(); + // // ov::descriptor::Tensor& tensor_descr = + // node_with_tensor->get_output_tensor(0); + // // std::cout << "VSHAMPOR: node output tensor shape is " << + // tensor_descr.get_shape().to_string() << std::endl; + // // ov::TensorVector in, out; + // // node_with_tensor->evaluate(out, in); + // // std::cout << "VSHAMPOR: evaluated " << out.size() << " output + // tensors\n"; + // // if (!out.empty()) { + // // const ov::Tensor& tensor = out[0]; + // // const float* vals = tensor.data(); + // // std::cout << "VSHAMPOR: first elements of the weight tensor are + // "; + // // for (size_t i = 0; i < 10; i++) { + // // std::cout << vals[i] << " "; + // // } + // // std::cout << std::endl; + // // } + // std::cout << "VSHAMPOR: first elements of the weight tensor are "; + // for (size_t i = 0; i < 10; i++) { + // std::cout << data_ptr[i] << " "; + // } + // std::cout << std::endl; + //} + return compile_model(model, properties, {}); +} + +std::shared_ptr LlamaCppPlugin::compile_model(const std::string& fname, + const ov::AnyMap& properties) const { + return std::make_shared(fname, shared_from_this()); +} +std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties, + const ov::SoPtr& context) const { + std::cout << "VSHAMPOR: compile_model called in C++" << std::endl; + return std::make_shared(model->clone(), + shared_from_this(), + context, + get_executor_manager()->get_executor(template_exclusive_executor)); +} + +void LlamaCppPlugin::set_property(const ov::AnyMap& properties) { + for (const auto& map_entry : properties) { + if (map_entry.first == ov::cache_dir.name()) { + m_cache_dir = map_entry.second.as(); + } else { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: setting property ", map_entry.first, 
"not implemented"); } + } +} - ov::SoPtr LlamaCppPlugin::create_context(const ov::AnyMap& remote_properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); - } - ov::SoPtr LlamaCppPlugin::get_default_context(const ov::AnyMap& remote_properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); - } - std::shared_ptr LlamaCppPlugin::import_model(std::istream& model_file_stream, - const ov::AnyMap& properties) const { - std::cout << "VSHAMPOR: importing model" << '\n'; - std::cout << "VSHAMPOR: deserializing ov::Model first" << '\n'; - // read XML content - std::string xmlString; - std::uint64_t dataSize = 0; - model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); - xmlString.resize(dataSize); - model_file_stream.read(const_cast(xmlString.c_str()), dataSize); - - // read blob content - ov::Tensor weights; - model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); - if (0 != dataSize) { - weights = ov::Tensor(ov::element::from(), ov::Shape{static_cast(dataSize)}); - model_file_stream.read(weights.data(), dataSize); - } - - auto ov_model = get_core()->read_model(xmlString, weights); - std::cout << "VSHAMPOR: ov::Model deserialized, passing the rest of the stream to LlamaCppModel ctor" << '\n'; - return std::make_shared(ov_model, model_file_stream, shared_from_this()); - } +ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap& arguments) const { + if (ov::supported_properties == name) { + return decltype(ov::supported_properties)::value_type( + std::vector({ov::cache_dir, ov::device::capabilities, ov::device::full_name})); + } + if (ov::device::capabilities == name) { + return decltype(ov::device::capabilities)::value_type( + std::vector({ov::device::capability::EXPORT_IMPORT})); + } + if (ov::internal::supported_properties == name) { + return decltype(ov::internal::supported_properties)::value_type( + std::vector({ov::internal::caching_properties})); + } - const std::string CURRENT_GGUF_FILE_NAME = "current.gguf"; - std::string LlamaCppPlugin::get_current_gguf_file_path() const { return m_cache_dir + "/" + CURRENT_GGUF_FILE_NAME; } + if (ov::cache_dir == name) { + return m_cache_dir; + } + if (ov::internal::caching_properties == name) { + return std::vector{ov::device::full_name}; + } - std::shared_ptr LlamaCppPlugin::import_model(std::istream& model, - const ov::SoPtr& context, - const ov::AnyMap& properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); - } + if (ov::device::full_name == name) { + return std::string("LLAMA_CPP"); + } - ov::SupportedOpsMap LlamaCppPlugin::query_model(const std::shared_ptr& model, - const ov::AnyMap& properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); - } + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} + +ov::SoPtr LlamaCppPlugin::create_context(const ov::AnyMap& remote_properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} +ov::SoPtr LlamaCppPlugin::get_default_context(const ov::AnyMap& remote_properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} +std::shared_ptr LlamaCppPlugin::import_model(std::istream& model_file_stream, + const ov::AnyMap& properties) const { + std::cout << "VSHAMPOR: importing model" << '\n'; + std::cout << "VSHAMPOR: deserializing ov::Model first" << '\n'; + // read XML content + std::string xmlString; + std::uint64_t dataSize = 0; + model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); 
+ xmlString.resize(dataSize); + model_file_stream.read(const_cast(xmlString.c_str()), dataSize); + + // read blob content + ov::Tensor weights; + model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); + if (0 != dataSize) { + weights = ov::Tensor(ov::element::from(), ov::Shape{static_cast(dataSize)}); + model_file_stream.read(weights.data(), dataSize); } + + auto ov_model = get_core()->read_model(xmlString, weights); + std::cout << "VSHAMPOR: ov::Model deserialized, passing the rest of the " + "stream to LlamaCppModel ctor" + << '\n'; + return std::make_shared(ov_model, model_file_stream, shared_from_this()); +} + +const std::string CURRENT_GGUF_FILE_NAME = "current.gguf"; +std::string LlamaCppPlugin::get_current_gguf_file_path() const { + return m_cache_dir + "/" + CURRENT_GGUF_FILE_NAME; +} + +std::shared_ptr LlamaCppPlugin::import_model(std::istream& model, + const ov::SoPtr& context, + const ov::AnyMap& properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} + +ov::SupportedOpsMap LlamaCppPlugin::query_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} +} // namespace llama_cpp_plugin } // namespace ov static const ov::Version version = {CI_BUILD_NUMBER, "llama_cpp_plugin"}; diff --git a/modules/llama_cpp_plugin/tests/CMakeLists.txt b/modules/llama_cpp_plugin/tests/CMakeLists.txt deleted file mode 100644 index 11648c2bd..000000000 --- a/modules/llama_cpp_plugin/tests/CMakeLists.txt +++ /dev/null @@ -1,37 +0,0 @@ -set(TARGET_NAME llama_cpp_plugin_func_tests) - -if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - ov_add_compiler_flags(/wd4305) -endif() - -ov_add_test_target( - NAME ${TARGET_NAME} - ROOT ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDENCIES - openvino_template_plugin - LINK_LIBRARIES - openvino::funcSharedTests - openvino::runtime::dev - INCLUDES - "${OpenVINOTemplatePlugin_SOURCE_DIR}/include" - "${CMAKE_CURRENT_SOURCE_DIR}/op_reference" - ADD_CLANG_FORMAT - LABELS - OV UNIT TEMPLATE -) - -find_package(OpenCV QUIET COMPONENTS core imgproc) - -if(OpenCV_FOUND AND OpenCV_VERSION VERSION_GREATER_EQUAL 3.4) - message(STATUS "Reference preprocessing: OpenCV tests are enabled") - target_compile_definitions(${TARGET_NAME} PRIVATE OPENCV_TEMPLATE_TESTS) - target_link_libraries(${TARGET_NAME} PRIVATE opencv_imgproc opencv_core) -else() - message(WARNING "Reference preprocessing: OpenCV tests are disabled, because OpenCV ver. 
3.4+ is not found") -endif() - -if (ENABLE_INTEL_CPU) - set_source_files_properties( - "${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/behavior/executable_network/get_metric.cpp" - PROPERTIES COMPILE_DEFINITIONS ENABLE_INTEL_CPU=1) -endif() diff --git a/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt new file mode 100644 index 000000000..4c16f3484 --- /dev/null +++ b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt @@ -0,0 +1,18 @@ + +set(TARGET_NAME llama_cpp_e2e_tests) + +ov_add_test_target( + NAME ${TARGET_NAME} + ROOT ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDENCIES + llama_cpp_plugin + LINK_LIBRARIES + openvino::runtime::dev + openvino::funcSharedTests + INCLUDES + "${OpenVINOTemplatePlugin_SOURCE_DIR}/include" + ADD_CLANG_FORMAT + LABELS + OV UNIT TEMPLATE +) + diff --git a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp new file mode 100644 index 000000000..f4e0369c5 --- /dev/null +++ b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp @@ -0,0 +1,63 @@ +#include +#include "openvino/openvino.hpp" +#include "common_test_utils/file_utils.hpp" + +const std::string TEST_FILES_DIR = "test_data"; + +// "Why is the Sun yellow?" +const std::vector GPT2_PROMPT_TOKEN_IDS = {5195, 318, 262, 3825, 7872, 30}; +// "The Sun is a bright red, which means it is a bright red. The Sun is a bright red because it is a bright red." +const std::vector GPT2_REFERENCE_RESPONSE_TOKEN_IDS = {198, 464, 3825, 318, 257, 6016, 2266, 11, 543, 1724, 340, 318, 257, 6016, 2266, 13, 383, 3825, 318, 257, 6016, 2266, 780, 340, 318, 257, 6016, 2266, 13, 198, 198, 464}; + +const auto SEP = ov::util::FileTraits::file_separator; + +TEST(PromptResponseTest, TestGPT2) { + const std::string plugin_name = "LLAMA_CPP"; + ov::Core core; + + const std::string model_file_name = "gpt2.gguf"; + const std::string model_file = ov::test::utils::getCurrentWorkingDir() + SEP + TEST_FILES_DIR + SEP + model_file_name; + ov::InferRequest lm = core.compile_model(model_file, plugin_name).create_infer_request(); + auto input_ids_tensor = ov::Tensor(ov::element::Type_t::i64, {1, GPT2_PROMPT_TOKEN_IDS.size()}); + std::copy(GPT2_PROMPT_TOKEN_IDS.begin(), GPT2_PROMPT_TOKEN_IDS.end(), input_ids_tensor.data()); + lm.set_tensor("input_ids", input_ids_tensor); + lm.set_tensor("attention_mask", ov::Tensor(ov::element::Type_t::i64, {1, GPT2_PROMPT_TOKEN_IDS.size()})); + ov::Tensor position_ids = lm.get_tensor("position_ids"); + position_ids.set_shape(input_ids_tensor.get_shape()); + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + + constexpr size_t BATCH_SIZE = 1; + lm.get_tensor("beam_idx").set_shape({BATCH_SIZE}); + lm.get_tensor("beam_idx").data()[0] = 0; + + lm.infer(); + + size_t vocab_size = lm.get_tensor("logits").get_shape().back(); + float* logits = lm.get_tensor("logits").data() + (input_ids_tensor.get_size() - 1) * vocab_size; + int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; + + lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1}); + position_ids.set_shape({BATCH_SIZE, 1}); + + size_t cnt = 0; + std::vector out_token_ids; + + while (cnt < GPT2_REFERENCE_RESPONSE_TOKEN_IDS.size()) { + lm.get_tensor("input_ids").data()[0] = out_token; + lm.get_tensor("attention_mask").set_shape({BATCH_SIZE, lm.get_tensor("attention_mask").get_shape().at(1) + 1}); + std::fill_n(lm.get_tensor("attention_mask").data(), lm.get_tensor("attention_mask").get_size(), 1); + 
position_ids.data()[0] = int64_t(lm.get_tensor("attention_mask").get_size() - 2); + lm.start_async(); + lm.wait(); + logits = lm.get_tensor("logits").data(); + out_token = std::max_element(logits, logits + vocab_size) - logits; + out_token_ids.push_back(out_token); + cnt++; + } + + lm.reset_state(); + + ASSERT_EQ(out_token_ids, GPT2_REFERENCE_RESPONSE_TOKEN_IDS); +} + + diff --git a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp new file mode 100644 index 000000000..df796aacb --- /dev/null +++ b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp @@ -0,0 +1,13 @@ +#include +#include + +namespace ov { +namespace test { +void set_device_suffix(const std::string& suffix) { + if (!suffix.empty()) { + throw std::runtime_error("The suffix can't be used for TEMPLATE device!"); + } +} +} // namespace test +} // namespace ov + diff --git a/modules/llama_cpp_plugin/tools/CMakeLists.txt b/modules/llama_cpp_plugin/tools/CMakeLists.txt index 4a37341b8..5209d5ca9 100644 --- a/modules/llama_cpp_plugin/tools/CMakeLists.txt +++ b/modules/llama_cpp_plugin/tools/CMakeLists.txt @@ -18,5 +18,4 @@ target_link_libraries(tensor_comparator PRIVATE ggml) add_executable(cache_embedder "${CMAKE_CURRENT_SOURCE_DIR}/cache_embedder.cpp" ) - -target_compile_options(cache_embedder PUBLIC "--std=c++17") +set_target_properties(cache_embedder PROPERTIES CXX_STANDARD 17) From 609eed9fc0d4d23da90bf1491bf58d1ad30f3735 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Tue, 12 Mar 2024 18:30:03 +0100 Subject: [PATCH 03/27] Remove unused code --- modules/llama_cpp_plugin/CMakeLists.txt | 1 - .../include/compiled_model.hpp | 6 +- .../include/infer_request.hpp | 4 +- modules/llama_cpp_plugin/include/plugin.hpp | 65 +- .../llama_cpp_plugin/src/compiled_model.cpp | 822 +----------------- .../llama_cpp_plugin/src/infer_request.cpp | 21 +- modules/llama_cpp_plugin/src/plugin.cpp | 85 +- .../tests/e2e/set_device_name.cpp | 2 +- modules/llama_cpp_plugin/tools/CMakeLists.txt | 21 - .../llama_cpp_plugin/tools/cache_embedder.cpp | 53 -- modules/llama_cpp_plugin/tools/runner.cpp | 73 -- .../tools/tensor_comparator.cpp | 95 -- 12 files changed, 50 insertions(+), 1198 deletions(-) delete mode 100644 modules/llama_cpp_plugin/tools/CMakeLists.txt delete mode 100644 modules/llama_cpp_plugin/tools/cache_embedder.cpp delete mode 100644 modules/llama_cpp_plugin/tools/runner.cpp delete mode 100644 modules/llama_cpp_plugin/tools/tensor_comparator.cpp diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt index 1385eea5d..89c5d4e0e 100644 --- a/modules/llama_cpp_plugin/CMakeLists.txt +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -11,7 +11,6 @@ if(CMAKE_COMPILER_IS_GNUCXX) endif() add_subdirectory(src) -add_subdirectory(tools) add_subdirectory(third_party/llama.cpp) diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp index eb785e252..1ae79f12e 100644 --- a/modules/llama_cpp_plugin/include/compiled_model.hpp +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -55,6 +55,7 @@ namespace ov { virtual ov::Any get_property(const std::string& name) const override; virtual const std::vector>& inputs() const override; virtual const std::vector>& outputs() const override; + virtual ~LlamaCppModel(); protected: /** * @brief Method creates infer request implementation @@ -64,14 +65,13 @@ namespace ov { virtual std::shared_ptr create_sync_infer_request() const override; private: - 
std::string get_current_gguf_file_path() const; gguf_context* m_gguf_ctx = nullptr; - std::string m_converted_gguf_file_name; + std::string m_gguf_fname; llama_model* m_llama_model_ptr = nullptr; llama_context* m_llama_ctx = nullptr; - size_t* num_tokens_processed_ptr = nullptr; // TODO: (vshampor) find a better place for this kind of storage std::shared_ptr m_model; + size_t* num_tokens_processed_ptr = nullptr; // TODO: (vshampor) find a better place for this kind of storage std::vector> m_fake_inputs; std::vector> m_fake_outputs; diff --git a/modules/llama_cpp_plugin/include/infer_request.hpp b/modules/llama_cpp_plugin/include/infer_request.hpp index b6314010b..8954a180b 100644 --- a/modules/llama_cpp_plugin/include/infer_request.hpp +++ b/modules/llama_cpp_plugin/include/infer_request.hpp @@ -7,12 +7,10 @@ namespace ov { namespace llama_cpp_plugin { + class LlamaCppSyncInferRequest : public ISyncInferRequest { public: explicit LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model); - // explicit LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model): ov::ISyncInferRequest(compiled_model) { - // std::cout << "VSHAMPOR: infer request ctor called\n"; - // } virtual ~LlamaCppSyncInferRequest() {}; virtual void set_tensors_impl(const ov::Output port, diff --git a/modules/llama_cpp_plugin/include/plugin.hpp b/modules/llama_cpp_plugin/include/plugin.hpp index aea32ea1f..1d6fdf1e4 100644 --- a/modules/llama_cpp_plugin/include/plugin.hpp +++ b/modules/llama_cpp_plugin/include/plugin.hpp @@ -12,99 +12,36 @@ namespace ov { class LlamaCppPlugin : public IPlugin { public: LlamaCppPlugin(); - /** - * @brief Compiles model from ov::Model object - * @param model A model object acquired from ov::Core::read_model or source construction - * @param properties A ov::AnyMap of properties relevant only for this load operation - * @return Created Compiled Model object - */ virtual std::shared_ptr compile_model(const std::shared_ptr& model, const ov::AnyMap& properties) const override; - - /** - * @brief Compiles model from ov::Model object, on specified remote context - * @param model A model object acquired from ov::Core::read_model or source construction - * @param properties A ov::AnyMap of properties relevant only for this load operation - * @param context A pointer to plugin context derived from RemoteContext class used to - * execute the model - * @return Created Compiled Model object - */ virtual std::shared_ptr compile_model(const std::shared_ptr& model, const ov::AnyMap& properties, const ov::SoPtr& context) const override; - /** - * @brief Sets properties for plugin, acceptable keys can be found in openvino/runtime/properties.hpp - * @param properties ov::AnyMap of properties - */ virtual void set_property(const ov::AnyMap& properties) override; - /** - * @brief Gets properties related to plugin behaviour. - * - * @param name Property name. - * @param arguments Additional arguments to get a property. - * - * @return Value of a property corresponding to the property name. - */ virtual ov::Any get_property(const std::string& name, const ov::AnyMap& arguments) const override; - /** - * @brief Creates a remote context instance based on a map of properties - * @param remote_properties Map of device-specific shared context remote properties. 
- * - * @return A remote context object - */ virtual ov::SoPtr create_context(const ov::AnyMap& remote_properties) const override; - /** - * @brief Provides a default remote context instance if supported by a plugin - * @param remote_properties Map of device-specific shared context remote properties. - * - * @return The default context. - */ virtual ov::SoPtr get_default_context(const ov::AnyMap& remote_properties) const override; - /** - * @brief Creates an compiled model from an previously exported model using plugin implementation - * and removes OpenVINO Runtime magic and plugin name - * @param model Reference to model output stream - * @param properties A ov::AnyMap of properties - * @return An Compiled model - */ virtual std::shared_ptr import_model(std::istream& model, const ov::AnyMap& properties) const override; - virtual std::shared_ptr compile_model(const std::string& fname, const ov::AnyMap& properties) const override; - /** - * @brief Creates an compiled model from an previously exported model using plugin implementation - * and removes OpenVINO Runtime magic and plugin name - * @param model Reference to model output stream - * @param context A pointer to plugin context derived from RemoteContext class used to - * execute the network - * @param properties A ov::AnyMap of properties - * @return An Compiled model - */ virtual std::shared_ptr import_model(std::istream& model, const ov::SoPtr& context, const ov::AnyMap& properties) const override; - /** - * @brief Queries a plugin about supported layers in model - * @param model Model object to query. - * @param properties Optional map of pairs: (property name, property value). - * @return An object containing a map of pairs an operation name -> a device name supporting this operation. - */ virtual ov::SupportedOpsMap query_model(const std::shared_ptr& model, const ov::AnyMap& properties) const override; - std::string get_current_gguf_file_path() const; private: - std::string m_cache_dir = "./"; + std::string m_cache_dir = ""; }; } // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 85a65d7e6..a1498f708 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -9,801 +9,46 @@ #include "infer_request.hpp" #include "plugin.hpp" +#include + namespace ov { namespace llama_cpp_plugin { -class TensorWeightMatcher { -public: - // TODO (vshampor) implement this for faster weight node matching. 
- // Use std::list, two passes - first for full name match, second for - // prefix-match; remove entries from list on match - using RTInfoTensorName = std::string; - using OvNodeName = std::string; - using LlamaTensorName = std::string; - - TensorWeightMatcher(const std::shared_ptr& model, - std::map tensor_names_with_shapes_to_match) { - std::multimap> intermediate_matches_map; - - const auto node_vector = model->get_ops(); - std::list> const_nodes_in_model; - for (const auto& node_ptr : node_vector) { - if (ov::is_type(node_ptr)) - const_nodes_in_model.push_back(ov::as_type_ptr(node_ptr)); - } - - // full substring match pass - std::map unmatched_rt_info_names_on_first_pass = - extract_matches(intermediate_matches_map, - tensor_names_with_shapes_to_match, - const_nodes_in_model, - [](const std::string& substring, const std::string& source) { - return source.find(substring) != std::string::npos; - }); - - // prefix substring match pass - std::map unmatched_rt_info_names_on_second_pass = extract_matches( - intermediate_matches_map, - unmatched_rt_info_names_on_first_pass, - const_nodes_in_model, - [](const std::string& substring, const std::string& source) { - return source.find(get_weight_name_without_torch_postfix(substring)) != std::string::npos; - }); - - for (auto it = intermediate_matches_map.begin(); it != intermediate_matches_map.end(); - it = intermediate_matches_map.upper_bound(it->first)) { - // TODO: perf improvement by iterating with ++; - RTInfoTensorName rt_info_name = it->first; - if (intermediate_matches_map.count(rt_info_name) != 1) { - std::cout << "VSHAMPOR: multiple matches for weight name " << rt_info_name << " and shape " - << it->second->get_shape().to_string() << ", found "; - auto range_it_pair = intermediate_matches_map.equal_range(rt_info_name); - for (auto multimatch_it = range_it_pair.first; multimatch_it != range_it_pair.second; multimatch_it++) { - auto node_ptr = multimatch_it->second; - std::cout << node_ptr->get_friendly_name() << "(shape " << node_ptr->get_shape().to_string() - << "),"; - } - std::cout << "will take the first match" << std::endl; - } - const auto& match = intermediate_matches_map.find(rt_info_name)->second; - m_rtinfo_name_to_weight_node_map[rt_info_name] = match; - } - if (!unmatched_rt_info_names_on_second_pass.empty()) { - std::cout << "VSHAMPOR: did not find the weight node for " << unmatched_rt_info_names_on_second_pass.size() - << " weights:" << std::endl; - } - for (const auto& unmatched_entry : unmatched_rt_info_names_on_second_pass) { - std::cout << '\t' << unmatched_entry.first << std::endl; - } - } - - std::unordered_map> get_matches() { - return m_rtinfo_name_to_weight_node_map; - } - -private: - std::map extract_matches( - std::multimap>& output_matches_map, - const std::map& names_with_shapes_to_match, - const std::list>& search_list, - std::function name_match_predicate) { - std::map unmatched_rt_info_names; - for (const auto& pair : names_with_shapes_to_match) { - RTInfoTensorName rt_info_name = pair.first; - const ov::Shape& wanted_shape = pair.second; - bool matched = false; - for (auto it = search_list.begin(); it != search_list.end(); it++) { - auto node_ptr = *it; - const std::string& friendly_name = node_ptr->get_friendly_name(); - if (name_match_predicate(rt_info_name, friendly_name) && node_ptr->get_shape() == wanted_shape) { - output_matches_map.insert(std::make_pair(rt_info_name, node_ptr)); - matched = true; - break; - } - } - if (!matched) - unmatched_rt_info_names.insert(pair); - } - return 
unmatched_rt_info_names; - } - - static std::string get_weight_name_without_torch_postfix(std::string torch_weight_name) { - size_t idx = torch_weight_name.rfind("."); - if (idx == std::string::npos) - return torch_weight_name; - return std::string(torch_weight_name, 0, idx); - } - - size_t num_exact_matches = 0; - size_t num_partial_matches = 0; - std::unordered_map> m_rtinfo_name_to_weight_node_map; -}; - -std::vector> get_nodes_containing_name_with_shape(const std::shared_ptr& model, - const std::string& weight_name, - const ov::Shape& shape) { - auto ops = model->get_ops(); - std::vector> found_weight_nodes; - std::copy_if(ops.begin(), - ops.end(), - std::back_inserter(found_weight_nodes), - [&weight_name, &shape](const std::shared_ptr& val) { - if (!ov::is_type(val)) - return false; - std::shared_ptr node_ptr = ov::as_type_ptr(val); - return val->get_friendly_name().find(weight_name) != std::string::npos && - val->get_shape() == shape; - }); - return found_weight_nodes; -} - -bool has_weight_matches(const std::shared_ptr& model, - const std::string& weight_name, - const ov::Shape& shape) { - std::vector> found_weight_nodes; - found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); - return !found_weight_nodes.empty(); -} - -std::string get_weight_name_without_torch_postfix(std::string torch_weight_name) { - size_t idx = torch_weight_name.rfind("."); - if (idx == std::string::npos) - return torch_weight_name; - return std::string(torch_weight_name, 0, idx); -} - -bool has_partial_weight_matches(const std::shared_ptr& model, - const std::string& weight_name, - const ov::Shape& shape) { - std::vector> found_weight_nodes; - found_weight_nodes = - get_nodes_containing_name_with_shape(model, get_weight_name_without_torch_postfix(weight_name), shape); - return !found_weight_nodes.empty(); -} - -std::shared_ptr get_weight_by_name_and_shape(const std::shared_ptr& model, - const std::string& weight_name, - const ov::Shape& shape) { - OPENVINO_ASSERT(has_weight_matches(model, weight_name, shape)); - std::vector> found_weight_nodes; - found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); - - if (found_weight_nodes.size() > 1) { - std::cout << "VSHAMPOR: multiple matches for weight name " << weight_name << " and shape " << shape.to_string() - << ", found "; - for (const auto& node_ptr : found_weight_nodes) { - std::cout << node_ptr->get_friendly_name() << "(shape " << shape.to_string() << "),"; - } - std::cout << "will take the first match" << std::endl; - } - std::shared_ptr node_with_tensor = found_weight_nodes.front(); - OPENVINO_ASSERT(ov::is_type(node_with_tensor)); - std::shared_ptr const_node_ptr = ov::as_type_ptr(node_with_tensor); - return const_node_ptr; -} - -using TransposePermutation = std::pair; - -std::vector expand_front(const std::vector& vec, size_t val) { - OPENVINO_ASSERT(vec.size() < GGML_MAX_DIMS); - std::vector retval(GGML_MAX_DIMS, val); - std::copy(vec.rbegin(), vec.rend(), retval.rbegin()); - return retval; -} - -void write_float_plus_one(std::ofstream& out, const float* src) { - float elt = *src; - elt += 1; - out.write((const char*)&elt, sizeof(float)); -} -void append_tensor_data_with_transpositions(const std::string& fname, - const std::vector& tensor_infos, - const std::vector& tensor_data_ptrs, - const std::map& transpositions, - const std::set increment_by_one_tensor_names) { - // assuming contiguous data underneath each pointer from tensor_data_ptrs - OPENVINO_ASSERT(tensor_infos.size() == 
tensor_data_ptrs.size()); - std::ofstream out(fname, std::ios::app | std::ios::out); - for (size_t i = 0; i < tensor_infos.size(); i++) { - const auto& tensor_info = tensor_infos[i]; - OPENVINO_ASSERT(tensor_info.type == GGML_TYPE_F32); // TODO (vshampor): writing transposed tensor data for - // other data types, especially lower-bitwidth; maybe - // use OV inference for that - - const char* ir_tensor_data = reinterpret_cast(tensor_data_ptrs[i]); - - std::string tensor_llama_name = std::string(tensor_info.name.data); - auto it = transpositions.find(tensor_llama_name); - if (it == transpositions.end()) { - // original IR tensor should not be transposed to conform to GGUF - // expectations, can write as-is - if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case - size_t elt_size = sizeof(float); // FP32 only for now - OPENVINO_ASSERT(!(tensor_info.size % elt_size)); - size_t num_elts = tensor_info.size / elt_size; - for (size_t elt_idx = 0; elt_idx < num_elts; elt_idx++) { - write_float_plus_one(out, ((float*)ir_tensor_data) + elt_idx); - } - } else { - out.write(ir_tensor_data, tensor_info.size); - } - continue; - } - - if (it != transpositions.end()) { - std::vector gguf_layout_shape; - - // the shape in .ne is inverted w.r.t original export (~= IR) weight - // layout - for (size_t dim_idx = 0; dim_idx < tensor_info.n_dims; dim_idx++) { - gguf_layout_shape.push_back(tensor_info.ne[GGML_MAX_DIMS - 1 - tensor_info.n_dims - dim_idx]); - } - - TransposePermutation permutation = it->second; - std::vector ir_layout_shape(gguf_layout_shape); - std::swap(ir_layout_shape[permutation.first], ir_layout_shape[permutation.second]); - - std::vector ir_layout_strides(tensor_info.n_dims, 1); - - for (size_t idx = 0; idx < tensor_info.n_dims - 1; idx++) { - auto previous_stride_it = ir_layout_strides.rbegin() + idx; - auto stride_it = ir_layout_strides.rbegin() + idx + 1; - auto shape_it = ir_layout_shape.rbegin() + idx; - *stride_it = *shape_it * *previous_stride_it; - } - - std::vector permuted_strides(ir_layout_strides); - std::swap(permuted_strides[permutation.first], permuted_strides[permutation.second]); - - // expand up to GGML_MAX_DIMS - std::vector gguf_layout_shape_ex = expand_front(gguf_layout_shape, 1); - // stride for unused dims will be 0, has no effect on loop because - // dimension idx for that dim is always 0 - permuted_strides = expand_front(permuted_strides, 0); - - std::cout << "VSHAMPOR: writing tensor " << tensor_info.name.data << " with size " << tensor_info.size; - std::cout << " shape (GGUF layout) "; - for (auto dim : gguf_layout_shape) - std::cout << dim << ","; - std::cout << " shape (IR layout) "; - for (auto dim : ir_layout_shape) - std::cout << dim << ","; - std::cout << " stride (IR layout) "; - for (auto stride : ir_layout_strides) - std::cout << stride << ","; - std::cout << " stride (IR layout, transposing) "; - for (auto stride : permuted_strides) - std::cout << stride << ","; - std::cout << std::endl; - - // TODO (vshampor): rewrite the loop below using recurrent templates? 
- // This relies on GGUF_MAX_DIMS == 4 and unused dims being equal to 1 - size_t current_offset = 0; - size_t element_size = sizeof(float); - size_t num_bytes_written = 0; - for (size_t dim_0 = 0; dim_0 < gguf_layout_shape_ex[0]; dim_0++) - for (size_t dim_1 = 0; dim_1 < gguf_layout_shape_ex[1]; dim_1++) - for (size_t dim_2 = 0; dim_2 < gguf_layout_shape_ex[2]; dim_2++) - for (size_t dim_3 = 0; dim_3 < gguf_layout_shape_ex[3]; dim_3++) { - current_offset = element_size * (dim_0 * permuted_strides[0] + dim_1 * permuted_strides[1] + - dim_2 * permuted_strides[2] + dim_3 * permuted_strides[3]); - if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case - write_float_plus_one(out, (float*)ir_tensor_data + current_offset); - } else { - out.write(ir_tensor_data + current_offset, element_size); - } - num_bytes_written += element_size; - } - std::cout << "VSHAMPOR: wrote " << num_bytes_written << std::endl; - OPENVINO_ASSERT(num_bytes_written == tensor_info.size); - } - } -} - -struct ValueStorageForLifetimeExtension { - std::list kv_key_string_storage; - std::list kv_value_string_storage; - std::list> str_arr_storage; - void* store_gguf_value_vector(const std::vector& vec, gguf_type g_type) { - size_t elt_size; - switch (g_type) { - case GGUF_TYPE_UINT8: - elt_size = sizeof(uint8_t); - break; - case GGUF_TYPE_INT8: - elt_size = sizeof(int8_t); - break; - case GGUF_TYPE_UINT16: - elt_size = sizeof(uint16_t); - break; - case GGUF_TYPE_INT16: - elt_size = sizeof(int16_t); - break; - case GGUF_TYPE_UINT32: - elt_size = sizeof(uint32_t); - break; - case GGUF_TYPE_INT32: - elt_size = sizeof(int32_t); - break; - case GGUF_TYPE_FLOAT32: - elt_size = sizeof(float); - break; - case GGUF_TYPE_UINT64: - elt_size = sizeof(uint64_t); - break; - case GGUF_TYPE_INT64: - elt_size = sizeof(int64_t); - break; - case GGUF_TYPE_FLOAT64: - elt_size = sizeof(double); - break; - case GGUF_TYPE_BOOL: - elt_size = sizeof(bool); - break; - default: - OPENVINO_THROW("Unknown array type"); - } - size_t size_in_bytes = vec.size() * elt_size; - void* mem_ptr = new char[size_in_bytes]; - for (size_t i = 0; i < vec.size(); i++) { - switch (g_type) { - case GGUF_TYPE_UINT8: - ((uint8_t*)mem_ptr)[i] = vec[i].uint8; - break; - case GGUF_TYPE_INT8: - ((int8_t*)mem_ptr)[i] = vec[i].int8; - break; - case GGUF_TYPE_UINT16: - ((uint16_t*)mem_ptr)[i] = vec[i].uint16; - break; - case GGUF_TYPE_INT16: - ((int16_t*)mem_ptr)[i] = vec[i].int16; - break; - case GGUF_TYPE_UINT32: - ((uint32_t*)mem_ptr)[i] = vec[i].uint32; - break; - case GGUF_TYPE_INT32: - ((int32_t*)mem_ptr)[i] = vec[i].int32; - break; - case GGUF_TYPE_FLOAT32: - ((float*)mem_ptr)[i] = vec[i].float32; - break; - case GGUF_TYPE_UINT64: - ((uint64_t*)mem_ptr)[i] = vec[i].uint64; - break; - case GGUF_TYPE_INT64: - ((int64_t*)mem_ptr)[i] = vec[i].int64; - break; - case GGUF_TYPE_FLOAT64: - ((double*)mem_ptr)[i] = vec[i].float64; - break; - case GGUF_TYPE_BOOL: - ((bool*)mem_ptr)[i] = vec[i].bool_; - break; - default: - OPENVINO_THROW("Unknown array type"); - } - } - return mem_ptr; - } - - ValueStorageForLifetimeExtension() = default; - ~ValueStorageForLifetimeExtension() { - for (void* ptr : non_str_raw_storage) { - delete[](char*) ptr; - } - } - -private: - std::list non_str_raw_storage; -}; - -bool maybe_parse_single_element(gguf_type g_type, - ov::Any rtmap_value, - gguf_value& dst, - ValueStorageForLifetimeExtension& store) { - switch (g_type) { - case GGUF_TYPE_UINT8: - dst.uint8 = rtmap_value.as(); - break; - case GGUF_TYPE_INT8: - dst.int8 = 
rtmap_value.as(); - ; - break; - case GGUF_TYPE_UINT16: - dst.uint16 = rtmap_value.as(); - break; - case GGUF_TYPE_INT16: - dst.int16 = rtmap_value.as(); - break; - case GGUF_TYPE_UINT32: - dst.uint32 = rtmap_value.as(); - break; - case GGUF_TYPE_INT32: - dst.int32 = rtmap_value.as(); - break; - case GGUF_TYPE_FLOAT32: - dst.float32 = rtmap_value.as(); - break; - case GGUF_TYPE_UINT64: - dst.uint64 = rtmap_value.as(); - break; - case GGUF_TYPE_INT64: - dst.int64 = rtmap_value.as(); - break; - case GGUF_TYPE_FLOAT64: - dst.float64 = rtmap_value.as(); - break; - case GGUF_TYPE_BOOL: - dst.bool_ = rtmap_value.as(); - break; - case GGUF_TYPE_STRING: { - std::string string_value = rtmap_value.as(); - store.kv_value_string_storage.push_back(string_value); - dst.str.n = string_value.length(); - dst.str.data = - (char*)store.kv_value_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below - break; - } - default: - return false; // did not parse - } - return true; // parsed successfully -} - -ov::Any get_any_associated_with_gguf_type(gguf_type g_type) { - switch (g_type) { - case GGUF_TYPE_UINT8: - return ov::Any(uint8_t()); - case GGUF_TYPE_INT8: - return ov::Any(int8_t()); - case GGUF_TYPE_UINT16: - return ov::Any(uint16_t()); - case GGUF_TYPE_INT16: - return ov::Any(int16_t()); - case GGUF_TYPE_UINT32: - return ov::Any(uint32_t()); - case GGUF_TYPE_INT32: - return ov::Any(int32_t()); - case GGUF_TYPE_FLOAT32: - return ov::Any(float()); - case GGUF_TYPE_UINT64: - return ov::Any(uint64_t()); - case GGUF_TYPE_INT64: - return ov::Any(int64_t()); - case GGUF_TYPE_FLOAT64: - return ov::Any(double()); - case GGUF_TYPE_BOOL: - return ov::Any(bool()); - case GGUF_TYPE_STRING: - return ov::Any(std::string()); - default: - OPENVINO_THROW("Unknown gguf_type to turn into ov::Any"); - } -} LlamaCppModel::LlamaCppModel(const std::shared_ptr& model, const std::shared_ptr& plugin, const ov::SoPtr& context, const std::shared_ptr& task_executor) : ICompiledModel(model, plugin, context, task_executor) { - m_model = model; - num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove - *num_tokens_processed_ptr = 0; - auto rt_info = model->get_rt_info(); - OPENVINO_ASSERT(rt_info.count("lcp_kv_params") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_kv_types") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_kv_array_types") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_tensor_name_map") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_tensor_shape_map") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_expected_tensor_shapes") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_transpose_permutations") != 0); - - RTMap& kv_params = model->get_rt_info("lcp_kv_params"); - RTMap& kv_types = model->get_rt_info("lcp_kv_types"); - RTMap& kv_array_types = model->get_rt_info("lcp_kv_array_types"); - RTMap& tensor_name_map = model->get_rt_info("lcp_tensor_name_map"); - RTMap& tensor_shape_map = model->get_rt_info("lcp_tensor_shape_map"); - RTMap& expected_tensor_shapes_map = model->get_rt_info("lcp_expected_tensor_shapes"); - RTMap& transpose_permutations_rtmap = model->get_rt_info("lcp_transpose_permutations"); - - size_t gguf_version = model->get_rt_info("lcp_gguf_version"); - std::cout << "VSHAMPOR: parsed gguf_version " << gguf_version << std::endl; - - // kv params - OPENVINO_ASSERT(kv_params.size() == kv_types.size()); - size_t n_kv = kv_params.size(); - std::vector kv_vector; - ValueStorageForLifetimeExtension store; - - for (const auto& kv_pair : kv_params) { - gguf_kv kv; - - const auto& key = kv_pair.first; - 
kv.key.n = key.length(); - store.kv_key_string_storage.push_back(key); - kv.key.data = (char*)store.kv_key_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below - - uint32_t value_type = kv_types[key].as(); - gguf_type gguf_value_type = (gguf_type)value_type; - kv.type = gguf_value_type; - if (gguf_value_type != GGUF_TYPE_ARRAY) { - bool is_parsed = maybe_parse_single_element(kv.type, kv_pair.second, kv.value, store); - OPENVINO_ASSERT(is_parsed, "Invalid type of a GGUF kv-value"); - } else { // array case - gguf_type element_type = (gguf_type)kv_array_types[key].as(); - kv.value.arr.type = element_type; - std::string serialized_array = kv_pair.second.as(); - std::stringstream ss{serialized_array}; - std::vector parsed_array; - while (!ss.eof()) { - gguf_value array_elt; - ov::Any ov_any = get_any_associated_with_gguf_type(element_type); - std::string token; - ss >> token; - if (std::string(kv.key.data) == "tokenizer.ggml.merges") { - // tokenizer merges are pairs of tokens separated by whitespace, so - // need to read another to get a proper merge - // TODO (vshampor): think of another delimiting strategy in the - // rt_info and use that strategy here for more robust code - std::string another_token; - ss >> another_token; - token += std::string(" ") + another_token; - ov_any = ov::Any::make(token); - } else { - std::stringstream tok_ss{token}; - ov_any.read(tok_ss); - } - bool is_parsed = maybe_parse_single_element(element_type, ov_any, array_elt, store); - OPENVINO_ASSERT(is_parsed); - parsed_array.push_back(array_elt); - } - kv.value.arr.n = parsed_array.size(); - if (element_type == GGUF_TYPE_STRING) { - // string element has already been lifetime-extended during parsing - std::vector cstr_vector(parsed_array.size()); - for (size_t cstr_idx = 0; cstr_idx < parsed_array.size(); cstr_idx++) { - cstr_vector[cstr_idx] = parsed_array[cstr_idx].str.data; - } - store.str_arr_storage.push_back(cstr_vector); - kv.value.arr.data = store.str_arr_storage.back().data(); - } else { - void* data_ptr = store.store_gguf_value_vector(parsed_array, element_type); - kv.value.arr.data = data_ptr; - } - } - kv_vector.push_back(kv); - } - - auto token_types_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { - return std::string(val.key.data) == "tokenizer.ggml.token_type"; - }); - if (token_types_kv_it != kv_vector.end()) { - auto tokens_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { - return std::string(val.key.data) == "tokenizer.ggml.tokens"; - }); - if (tokens_kv_it != kv_vector.end()) { - size_t expected_num_tokens = token_types_kv_it->value.arr.n; - size_t actual_num_tokens = tokens_kv_it->value.arr.n; - if (actual_num_tokens < expected_num_tokens) { - std::cout << "VSHAMPOR: detected wrong vocab " - "serialization/deserialization (expected " - << expected_num_tokens << " tokens, parsed " << actual_num_tokens - << " from vocab), filling tokens with bogus values" << std::endl; - std::vector new_vocab; - // char** old_vocab_data_ptr = (char**) tokens_kv_it->value.arr.data; - // std::copy(old_vocab_data_ptr, old_vocab_data_ptr + actual_num_tokens, - // new_vocab.begin()); size_t extra_tokens_needed = expected_num_tokens - // - actual_num_tokens; - size_t extra_tokens_needed = expected_num_tokens; - for (size_t tok_idx = 0; tok_idx < extra_tokens_needed; tok_idx++) { - std::stringstream ss; - ss << "invalid_token_" << tok_idx; - std::string new_token = ss.str(); - store.kv_value_string_storage.push_back(new_token); - 
char* str_data_ptr = (char*)store.kv_value_string_storage.back().c_str(); - new_vocab.push_back(str_data_ptr); - } - OPENVINO_ASSERT(new_vocab.size() == expected_num_tokens); - store.str_arr_storage.push_back(new_vocab); - tokens_kv_it->value.arr.data = (void*)store.str_arr_storage.back().data(); - tokens_kv_it->value.arr.n = expected_num_tokens; - } - } - } - - // tensors - OPENVINO_ASSERT(tensor_name_map.size() == tensor_shape_map.size()); - size_t n_tensors_in_rtinfo = tensor_name_map.size(); - std::cout << "VSHAMPOR: got request for " << n_tensors_in_rtinfo << " tensors from rt_info\n"; - - std::vector tensor_infos; - std::vector tensor_data_ptrs; - - std::map parsed_weights_to_search_for; - for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { - const std::string& llama_name = llama_name_and_rtinfo_name.first; - const std::string& rtinfo_name = llama_name_and_rtinfo_name.second.as(); - ov::Shape expected_shape = tensor_shape_map[llama_name].as(); - parsed_weights_to_search_for[rtinfo_name] = expected_shape; - } - - TensorWeightMatcher matcher{model, parsed_weights_to_search_for}; - std::unordered_map> matches = matcher.get_matches(); - std::unordered_map> llama_name_to_constant_node_map; - for (const auto& entry : tensor_name_map) { - const auto& llama_name = entry.first; - const auto& rtinfo_name = entry.second.as(); - llama_name_to_constant_node_map[llama_name] = matches[rtinfo_name]; - } - std::cout << "VSHAMPOR: requested tensors map to " << llama_name_to_constant_node_map.size() - << " tensors to search in model (shared tensors considered)\n"; - - std::list llama_name_storage; - - size_t n_tensors = 0; - - size_t offset = 0; // each tensor_info has to have a correct offset including - // padding, checked for in gguf_write_to_buf - for (const auto& matched_weight_pair : llama_name_to_constant_node_map) { - // Need to store the names in the list so that the passed c_str() pointers - // in tensor_infos to the llama names stay valid until they get deepcopied - // in gguf/llama functions - llama_name_storage.push_back(matched_weight_pair.first); - const std::string& llama_name = llama_name_storage.back(); - - auto weight_const_node_ptr = matched_weight_pair.second; - auto weight_shape = weight_const_node_ptr->get_shape(); - - // does hf-to-gguf invert all tensor dimensions with shapes > 1? - auto expected_weight_shape = ov::Shape(expected_tensor_shapes_map[llama_name].as()); - OPENVINO_ASSERT(expected_weight_shape.size() < GGML_MAX_DIMS); - - gguf_tensor_info info; - - info.type = GGML_TYPE_F32; // TODO (vshampor): better type assignment based - // on actual element type of the Constant node - - info.name.n = llama_name.length(); - info.name.data = (char*)llama_name.c_str(); // TODO (vshampor): either do this via const_cast, or will - // have to implement own structures for read-only data - // passing to llama_load_model_from_data - info.n_dims = weight_shape.size(); - std::fill(std::begin(info.ne), std::begin(info.ne) + GGML_MAX_DIMS, (uint64_t)1); - - // looks like GGUF expects inverse order of dimensions when compared to e.g. 
- // torch and actual row-major layout, see - // gguf.gguf_writer.GGUFWriter.add_tensor_info in gguf python package - std::copy(expected_weight_shape.rbegin(), expected_weight_shape.rend(), info.ne); - - void* data_ptr = (void*)(weight_const_node_ptr->get_data_ptr()); // TODO (vshampor): danger - casts - // `const` away also - the - // expected_weight_shape is in general - // different from actual ov::Tensor - // shape, in particular it may be - // transposed, so we actually need to set - // the pointers to shape-corrected tensor - // storage, which we don't do here - we - // are only preparing this data to get a - // convenient gguf_context object to - // reuse metadata (header) writing code, - // tensor data transpositions will be - // done during actual file write - - info.size = weight_const_node_ptr->get_byte_size(); - info.offset = offset; - - const size_t size_pad = GGML_PAD(info.size, GGUF_DEFAULT_ALIGNMENT); - offset += size_pad; - - info.data = data_ptr; - - tensor_infos.push_back(info); - tensor_data_ptrs.push_back(data_ptr); - n_tensors++; - } - - std::cout << "VSHAMPOR: found " << matches.size() << "/" << parsed_weights_to_search_for.size() << " tensors" - << std::endl; - - gguf_init_params gguf_params; - gguf_params.no_alloc = false; - gguf_params.ctx = nullptr; - - m_gguf_ctx = gguf_init_from_data(n_tensors, - tensor_infos.data(), - n_kv, - kv_vector.data(), - tensor_data_ptrs.data(), - gguf_params); - - std::shared_ptr llama_plugin_ptr = std::dynamic_pointer_cast(plugin); - m_converted_gguf_file_name = llama_plugin_ptr->get_current_gguf_file_path(); - - std::cout << "VSHAMPOR: output filename is " << m_converted_gguf_file_name << std::endl; - std::cout << "VSHAMPOR: writing metadata (GGUF header) " << std::endl; - gguf_write_to_file(m_gguf_ctx, - m_converted_gguf_file_name.c_str(), - /* only_meta = */ true); - - std::map transpose_permutations; - - for (const auto& llama_name_and_permutation : transpose_permutations_rtmap) { - std::string permutation_str = llama_name_and_permutation.second.as(); - std::stringstream ss(permutation_str); - TransposePermutation permutation; - bool is_ok = true; - is_ok &= static_cast(ss >> permutation.first); - is_ok &= static_cast(ss >> permutation.second); - OPENVINO_ASSERT(is_ok, "failed to read permutation"); - transpose_permutations[llama_name_and_permutation.first] = permutation; + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); } - std::set gemma_tensor_names_to_increment; - // FIXME (vshampor): tried setting up commands for incrementing *_norm.weight - // values by 1 like it is done during llama.cpp HF-to-GGUF export, but it - // seems that it isn't necessary and IR stores the incremented weights already - // Is this due to constant folding? 
- - // for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { - // const std::string& llama_name = llama_name_and_rtinfo_name.first; - // const std::string& rtinfo_name = - // llama_name_and_rtinfo_name.second.as(); std::string - // gemma_norm_suffix = "norm.weight"; if (rtinfo_name.size() < - // gemma_norm_suffix.size()) continue; if - // (rtinfo_name.substr(rtinfo_name.size() - gemma_norm_suffix.size()) == - // gemma_norm_suffix) gemma_tensor_names_to_increment.insert(llama_name); - // } - - std::cout << "VSHAMPOR: writing tensor data (blob with transpositions) " << std::endl; - append_tensor_data_with_transpositions(m_converted_gguf_file_name, - tensor_infos, - tensor_data_ptrs, - transpose_permutations, - gemma_tensor_names_to_increment); - std::cout << "VSHAMPOR: write finished." << m_converted_gguf_file_name << std::endl; - - std::cout << "VSHAMPOR: loading llama model from written file..." << std::endl; - llama_model_params mparams = llama_model_default_params(); - mparams.n_gpu_layers = 99; - m_llama_model_ptr = llama_load_model_from_file(m_converted_gguf_file_name.c_str(), mparams); - llama_context_params cparams = llama_context_default_params(); - m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); - - std::cout << "VSHAMPOR: llama model loaded successfully..." << std::endl; -} LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model, std::istream& input_stream, const std::shared_ptr& plugin) : ICompiledModel(ov_model, plugin) { - num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove - *num_tokens_processed_ptr = 0; - std::shared_ptr llama_plugin = std::dynamic_pointer_cast(plugin); - std::string current_file_path = llama_plugin->get_current_gguf_file_path(); - std::ofstream output_stream(current_file_path, std::ios::binary); - output_stream << input_stream.rdbuf(); + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); +} - std::cout << "VSHAMPOR: loading llama model from imported and re-written file..." << std::endl; - llama_model_params mparams = llama_model_default_params(); - mparams.n_gpu_layers = 99; - m_llama_model_ptr = llama_load_model_from_file(current_file_path.c_str(), mparams); - llama_context_params cparams = llama_context_default_params(); - m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); - std::cout << "VSHAMPOR: llama model loaded successfully from cache..." << std::endl; +LlamaCppModel::~LlamaCppModel() { + llama_free(m_llama_ctx); + llama_free_model(m_llama_model_ptr); + llama_backend_free(); + delete num_tokens_processed_ptr; } LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin) - : ICompiledModel(nullptr, plugin) { + : ICompiledModel(nullptr, plugin), m_gguf_fname(gguf_fname) { num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove *num_tokens_processed_ptr = 0; - std::cout << "VSHAMPOR: loading llama model directly from GGUF... " << std::endl; + OPENVINO_DEBUG << "llama_cpp_plugin: loading llama model directly from GGUF... " << std::endl; llama_model_params mparams = llama_model_default_params(); mparams.n_gpu_layers = 99; m_llama_model_ptr = llama_load_model_from_file(gguf_fname.c_str(), mparams); llama_context_params cparams = llama_context_default_params(); m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); - std::cout << "VSHAMPOR: llama model loaded successfully from GGUF..." 
<< std::endl; + OPENVINO_DEBUG << "llama_cpp_plugin: llama model loaded successfully from GGUF..." << std::endl; auto input_ids = std::make_shared(ov::element::Type_t::i64, ov::PartialShape({-1, -1})); auto fake_convert = std::make_shared(input_ids->output(0), ov::element::Type_t::f32); @@ -837,47 +82,20 @@ LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_pt } } -void LlamaCppModel::export_model(std::ostream& output_stream) const { - std::cout << "VSHAMPOR: exporting model" << std::endl; - - // FIXME (vshampor): it's a shame that loading a model from cache does not - // have an option to actually keep the already loaded model from xml and not - // be forced to deserialize an ov::Model representation from cache as well. As - // it stands, will need to write the whole IR into the cache entry along with - // the GGUF file. - // - std::stringstream xmlFile, binFile; - ov::pass::Serialize serializer(xmlFile, binFile); - serializer.run_on_model(m_model); - - auto m_constants = binFile.str(); - auto m_model = xmlFile.str(); - - auto dataSize = static_cast(m_model.size()); - output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); - output_stream.write(m_model.c_str(), dataSize); - - dataSize = static_cast(m_constants.size()); - output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); - output_stream.write(reinterpret_cast(&m_constants[0]), dataSize); - - std::ifstream in(m_converted_gguf_file_name, std::ios::binary); - output_stream << in.rdbuf(); -} std::shared_ptr LlamaCppModel::get_runtime_model() const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); } void LlamaCppModel::set_property(const ov::AnyMap& properties) { - std::cout << "VSHAMPOR: attempted to set_property (did nothing)"; + OPENVINO_DEBUG << "llama_cpp_plugin: attempted to set_property (did nothing)"; } ov::Any LlamaCppModel::get_property(const std::string& name) const { if (ov::supported_properties == name) { return decltype(ov::supported_properties)::value_type(std::vector()); } - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); } std::shared_ptr LlamaCppModel::create_sync_infer_request() const { @@ -891,5 +109,13 @@ const std::vector>& LlamaCppModel::inputs() const { const std::vector>& LlamaCppModel::outputs() const { return m_fake_outputs; }; + +void LlamaCppModel::export_model(std::ostream& output_stream) const { + std::ifstream in(m_gguf_fname, std::ios::binary); + output_stream << in.rdbuf(); +} + + + } // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index 6b5e8ba1e..40307c573 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -1,7 +1,9 @@ #include "infer_request.hpp" +#include #include "llama.h" #include "openvino/runtime/make_tensor.hpp" +#include "openvino/util/log.hpp" namespace ov { namespace llama_cpp_plugin { @@ -18,12 +20,10 @@ void allocate_tensor_impl(ov::SoPtr& tensor, LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model) : ov::ISyncInferRequest(compiled_model) { - std::cout << "VSHAMPOR: infer request ctor called\n"; + OPENVINO_DEBUG << "llama_cpp_plugin: infer request ctor called\n"; m_compiled_model_ptr = compiled_model; - // Allocate input/output tensors for (const auto& input : 
get_inputs()) { allocate_tensor(input, [input](ov::SoPtr& tensor) { - // Can add a check to avoid double work in case of shared tensors allocate_tensor_impl(tensor, input.get_element_type(), input.get_partial_shape().is_dynamic() ? ov::Shape{0} : input.get_shape()); @@ -31,7 +31,6 @@ LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr& tensor) { - // Can add a check to avoid double work in case of shared tensors allocate_tensor_impl(tensor, output.get_element_type(), output.get_partial_shape().is_dynamic() ? ov::Shape{0} : output.get_shape()); @@ -40,7 +39,7 @@ LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr port, const std::vector>& tensors) { - std::cout << "VSHAMPOR: set_tensors_impl called\n"; + OPENVINO_DEBUG << "llama_cpp_plugin: set_tensors_impl called\n"; } void llama_batch_add_reimpl(struct llama_batch& batch, @@ -64,7 +63,6 @@ void LlamaCppSyncInferRequest::infer() { // all inputs without hardcode OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); - size_t batch_size = input_ids_tensor_ptr->get_shape()[0]; size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; // llama_batch actually contains one sequence @@ -81,8 +79,7 @@ void LlamaCppSyncInferRequest::infer() { {0}, true); // the last `true` here is a marker that the logits for this // token should be computed and returned - size_t* ptr = m_compiled_model_ptr->num_tokens_processed_ptr; - (*ptr)++; + *(m_compiled_model_ptr->num_tokens_processed_ptr) += 1; } llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; @@ -109,12 +106,14 @@ void LlamaCppSyncInferRequest::infer() { }); }; std::vector LlamaCppSyncInferRequest::get_profiling_info() const { - std::cout << "VSHAMPOR: get_profiling_info() called\n"; + OPENVINO_DEBUG << "llama_cpp_plugin: get_profiling_info() called\n"; return std::vector{}; }; + + std::vector> LlamaCppSyncInferRequest::query_state() const { - std::cout << "VSHAMPOR: get_profiling_info() called\n"; - return std::vector>{}; + OPENVINO_DEBUG << "llama_cpp_plugin: query_state() called\n"; + return {}; } } // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp index 3e23c568f..ec456cc45 100644 --- a/modules/llama_cpp_plugin/src/plugin.cpp +++ b/modules/llama_cpp_plugin/src/plugin.cpp @@ -4,6 +4,7 @@ #include "compiled_model.hpp" #include "openvino/op/constant.hpp" +#include "openvino/util/log.hpp" #include "openvino/runtime/internal_properties.hpp" namespace { @@ -19,48 +20,7 @@ LlamaCppPlugin::LlamaCppPlugin() : IPlugin() { } std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, const ov::AnyMap& properties) const { - std::cout << "VSHAMPOR: LlamaCppPlugin::compile_model" << std::endl; - - // std::string gpt2_node_name = "transformer.h.9.attn.c_proj.weight"; - // std::cout << "VSHAMPOR: sanity check - looking for node containing " << - // gpt2_node_name << std::endl; auto ops = model->get_ops(); auto iter = - // std::find_if(ops.begin(), ops.end(), [gpt2_node_name](const - // std::shared_ptr& val) { - // return val->get_friendly_name().find(gpt2_node_name) != - // std::string::npos; }); - // if (iter == ops.end()) { - // std::cout << "VSHAMPOR: did not find the node\n"; - //} else { - // std::shared_ptr node_with_tensor = *iter; - // std::cout << "VSHAMPOR: node type is " << - // node_with_tensor->get_type_name() << std::endl; - // std::shared_ptr 
const_node_ptr = - // ov::as_type_ptr(node_with_tensor); const float* - // data_ptr = const_node_ptr->get_data_ptr(); - // // ov::descriptor::Tensor& tensor_descr = - // node_with_tensor->get_output_tensor(0); - // // std::cout << "VSHAMPOR: node output tensor shape is " << - // tensor_descr.get_shape().to_string() << std::endl; - // // ov::TensorVector in, out; - // // node_with_tensor->evaluate(out, in); - // // std::cout << "VSHAMPOR: evaluated " << out.size() << " output - // tensors\n"; - // // if (!out.empty()) { - // // const ov::Tensor& tensor = out[0]; - // // const float* vals = tensor.data(); - // // std::cout << "VSHAMPOR: first elements of the weight tensor are - // "; - // // for (size_t i = 0; i < 10; i++) { - // // std::cout << vals[i] << " "; - // // } - // // std::cout << std::endl; - // // } - // std::cout << "VSHAMPOR: first elements of the weight tensor are "; - // for (size_t i = 0; i < 10; i++) { - // std::cout << data_ptr[i] << " "; - // } - // std::cout << std::endl; - //} + OPENVINO_DEBUG << "llama_cpp_plugin: LlamaCppPlugin::compile_model" << std::endl; return compile_model(model, properties, {}); } @@ -71,7 +31,7 @@ std::shared_ptr LlamaCppPlugin::compile_model(const std::str std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, const ov::AnyMap& properties, const ov::SoPtr& context) const { - std::cout << "VSHAMPOR: compile_model called in C++" << std::endl; + OPENVINO_DEBUG << "llama_cpp_plugin: compile_model called in C++" << std::endl; return std::make_shared(model->clone(), shared_from_this(), context, @@ -83,7 +43,7 @@ void LlamaCppPlugin::set_property(const ov::AnyMap& properties) { if (map_entry.first == ov::cache_dir.name()) { m_cache_dir = map_entry.second.as(); } else { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: setting property ", map_entry.first, "not implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: setting property ", map_entry.first, "not implemented"); } } } @@ -113,55 +73,30 @@ ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap& return std::string("LLAMA_CPP"); } - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: getting property ", name, "not implemented"); } ov::SoPtr LlamaCppPlugin::create_context(const ov::AnyMap& remote_properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); } ov::SoPtr LlamaCppPlugin::get_default_context(const ov::AnyMap& remote_properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); } std::shared_ptr LlamaCppPlugin::import_model(std::istream& model_file_stream, const ov::AnyMap& properties) const { - std::cout << "VSHAMPOR: importing model" << '\n'; - std::cout << "VSHAMPOR: deserializing ov::Model first" << '\n'; - // read XML content - std::string xmlString; - std::uint64_t dataSize = 0; - model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); - xmlString.resize(dataSize); - model_file_stream.read(const_cast(xmlString.c_str()), dataSize); - - // read blob content - ov::Tensor weights; - model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); - if (0 != dataSize) { - weights = ov::Tensor(ov::element::from(), ov::Shape{static_cast(dataSize)}); - model_file_stream.read(weights.data(), dataSize); - } - - auto ov_model = get_core()->read_model(xmlString, weights); - 
std::cout << "VSHAMPOR: ov::Model deserialized, passing the rest of the " - "stream to LlamaCppModel ctor" - << '\n'; - return std::make_shared(ov_model, model_file_stream, shared_from_this()); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: model importing not implemented"); } -const std::string CURRENT_GGUF_FILE_NAME = "current.gguf"; -std::string LlamaCppPlugin::get_current_gguf_file_path() const { - return m_cache_dir + "/" + CURRENT_GGUF_FILE_NAME; -} std::shared_ptr LlamaCppPlugin::import_model(std::istream& model, const ov::SoPtr& context, const ov::AnyMap& properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: model importing not implemented"); } ov::SupportedOpsMap LlamaCppPlugin::query_model(const std::shared_ptr& model, const ov::AnyMap& properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: model importing not implemented"); } } // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp index df796aacb..8fb1fac80 100644 --- a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp +++ b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp @@ -5,7 +5,7 @@ namespace ov { namespace test { void set_device_suffix(const std::string& suffix) { if (!suffix.empty()) { - throw std::runtime_error("The suffix can't be used for TEMPLATE device!"); + throw std::runtime_error("The suffix can't be used for LLAMA_CPP device!"); } } } // namespace test diff --git a/modules/llama_cpp_plugin/tools/CMakeLists.txt b/modules/llama_cpp_plugin/tools/CMakeLists.txt deleted file mode 100644 index 5209d5ca9..000000000 --- a/modules/llama_cpp_plugin/tools/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -cmake_minimum_required(VERSION 3.10) -set(CMAKE_CXX_STANDARD 11) - -find_package(OpenVINO REQUIRED) - - -add_executable(llama_cpp_runner - "${CMAKE_CURRENT_SOURCE_DIR}/runner.cpp" - ) -target_link_libraries(llama_cpp_runner PRIVATE openvino::runtime) - - -add_executable(tensor_comparator - "${CMAKE_CURRENT_SOURCE_DIR}/tensor_comparator.cpp" - ) -target_link_libraries(tensor_comparator PRIVATE ggml) - -add_executable(cache_embedder - "${CMAKE_CURRENT_SOURCE_DIR}/cache_embedder.cpp" - ) -set_target_properties(cache_embedder PROPERTIES CXX_STANDARD 17) diff --git a/modules/llama_cpp_plugin/tools/cache_embedder.cpp b/modules/llama_cpp_plugin/tools/cache_embedder.cpp deleted file mode 100644 index bbfbf229c..000000000 --- a/modules/llama_cpp_plugin/tools/cache_embedder.cpp +++ /dev/null @@ -1,53 +0,0 @@ -#include -#include -#include -#include -#include -#include - -int main(int argc, char* argv[]) { - assert(argc == 3); - std::string cache_blob_name = argv[1]; - std::string gguf_file_name = argv[2]; - - std::uintmax_t original_file_size = std::filesystem::file_size(cache_blob_name); - std::fstream cache_io_stream(cache_blob_name, std::ios::binary | std::ios::in | std::ios::out); - - { - std::string tmp; - std::getline(cache_io_stream, tmp); // skip the blob header - std::cout << "skipped header line" << std::endl; - } - - std::uint64_t data_size = 0; - cache_io_stream.read(reinterpret_cast(&data_size), sizeof(data_size)); - std::cout << "skipping IR XML content, size " << data_size << std::endl; - cache_io_stream.seekp(data_size, std::ios::cur); // skip IR xml content - - cache_io_stream.read(reinterpret_cast(&data_size), 
sizeof(data_size)); - std::cout << "skipping IR weight content, size " << data_size << std::endl; - cache_io_stream.seekp(data_size, std::ios::cur); // skip IR weight content - - std::streampos pos = cache_io_stream.tellp(); - char magic[4]; - for (size_t i = 0; i < 4; i++) { - cache_io_stream >> magic[i]; - } - - std::string curr_magic(magic); - std::cout << "magic at current position is " << curr_magic << std::endl; - assert(curr_magic == "GGUF"); - cache_io_stream.seekp(pos); - - std::ifstream gguf_input_stream(gguf_file_name, std::ios::binary); - cache_io_stream << gguf_input_stream.rdbuf(); - std::cout << "gguf content write successful" << std::endl; - std::uintmax_t final_size = cache_io_stream.tellp(); - cache_io_stream.close(); - if (final_size < original_file_size) { - std::cout << "cache entry is now smaller (" << final_size << " vs original " << original_file_size << "), truncating" << std::endl; - std::filesystem::resize_file(cache_blob_name, final_size); - } - - return 0; -} diff --git a/modules/llama_cpp_plugin/tools/runner.cpp b/modules/llama_cpp_plugin/tools/runner.cpp deleted file mode 100644 index 390301cdb..000000000 --- a/modules/llama_cpp_plugin/tools/runner.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include "openvino/openvino.hpp" -#include - -int main(int argc, char* argv[]) { - ov::Core core; - core.set_property(ov::cache_dir("/tmp/my_cache_dir")); - std::string model_path = "/home/vshampor/work/optimum-intel/ov_model/openvino_model.xml"; - - std::cout << "VSHAMPOR: reading model\n"; - std::shared_ptr model = core.read_model(model_path); - - std::cout << "VSHAMPOR: compiling model\n"; - ov::CompiledModel compiled_model = core.compile_model(model, "LLAMA_CPP"); - - std::cout << "VSHAMPOR: compiled successfully\n"; - - std::cout << "VSHAMPOR: creating infer request\n"; - ov::InferRequest infer_request = compiled_model.create_infer_request(); - std::cout << "VSHAMPOR: infer request created\n"; - - // const ov::Output& input = compiled_model.input(); - // std::cout << "VSHAMPOR: got input\n"; - auto inputs = compiled_model.inputs(); - std::cout << "VSHAMPOR: model has " << inputs.size() << " inputs\n"; - for (const auto& input: inputs) { - std::cout << input.get_node()->get_friendly_name() << std::endl; - } - - for (size_t i = 0; i < inputs.size(); i++) { - const auto& curr_input = inputs[i]; - auto shape = curr_input.get_partial_shape(); - if (shape.is_dynamic()) { - std::cout << "VSHAMPOR: processing input " << i << " with a dynamic shape of " << shape.to_string() << std::endl; - ov::Rank r = shape.rank(); - if (r.get_length() == 2) { - ov::Tensor input_tensor{curr_input.get_element_type(), ov::Shape({1, 128})}; - int64_t* data_ptr = input_tensor.data(); - // fill with something - for (size_t elt_idx = 0; elt_idx < input_tensor.get_size(); elt_idx++) { - data_ptr[elt_idx] = 42; - } - infer_request.set_input_tensor(i, input_tensor); - } - else { // past_key_values - ov::Tensor input_tensor{curr_input.get_element_type(), ov::Shape({1, 12, 128, 64})}; - infer_request.set_input_tensor(i, input_tensor); - } - } - else { - std::cout << "VSHAMPOR: processing input " << i << " with a non-dynamic shape of " << shape.to_string() << std::endl; - ov::Tensor input_tensor{curr_input.get_element_type(), curr_input.get_shape()}; - infer_request.set_input_tensor(i, input_tensor); - } - } - std::cout << "VSHAMPOR: successfully set input tensor\n"; - - infer_request.infer(); - std::cout << "VSHAMPOR: inferred successfully\n"; - - ov::Tensor output = infer_request.get_tensor("logits"); - 
std::cout << "VSHAMPOR: got output tensor, shape " << output.get_shape().to_string() << std::endl; - - size_t n_output_elts = 10; - std::cout << "VSHAMPOR: first " << n_output_elts << " elements are:" << std::endl; - - float* output_data_ptr = output.data(); - for (size_t elt_idx = 0; elt_idx < n_output_elts; elt_idx++) { - std::cout << output_data_ptr[elt_idx] << " "; - } - - std::cout << std::endl; - return 0; -} diff --git a/modules/llama_cpp_plugin/tools/tensor_comparator.cpp b/modules/llama_cpp_plugin/tools/tensor_comparator.cpp deleted file mode 100644 index 83de96215..000000000 --- a/modules/llama_cpp_plugin/tools/tensor_comparator.cpp +++ /dev/null @@ -1,95 +0,0 @@ -#include "ggml.h" -#include -#include -#include -#include -#include -#include - - - -int main(int argc, char* argv[]) { - assert(argc == 3 || argc == 4); - std::string left_name(argv[1]); - std::string right_name(argv[2]); - - gguf_init_params left_params; left_params.no_alloc = false; left_params.ctx = nullptr; - gguf_init_params right_params; left_params.no_alloc = false; right_params.ctx = nullptr; - gguf_context* left_ctx = gguf_init_from_file(left_name.c_str(), left_params); - gguf_context* right_ctx = gguf_init_from_file(right_name.c_str(), right_params); - - std::vector tensor_names; - if (argc == 4) tensor_names.push_back(std::string(argv[3])); - else { - for (size_t idx = 0; idx < left_ctx->header.n_tensors; idx++) { - gguf_tensor_info left_tensor_info = left_ctx->infos[idx]; - tensor_names.push_back(left_tensor_info.name.data); - } - } - - for (const auto& tensor_name : tensor_names) { - - - int left_tensor_idx = gguf_find_tensor(left_ctx, tensor_name.c_str()); - int right_tensor_idx = gguf_find_tensor(right_ctx, tensor_name.c_str()); - - size_t left_tensor_offset = gguf_get_tensor_offset(left_ctx, left_tensor_idx) + left_ctx->offset; - size_t right_tensor_offset = gguf_get_tensor_offset(right_ctx, right_tensor_idx) + right_ctx->offset; - - gguf_tensor_info left_tensor_info = left_ctx->infos[left_tensor_idx]; - gguf_tensor_info right_tensor_info = right_ctx->infos[right_tensor_idx]; - - std::cout << "tensor name " << tensor_name << ", byte offsets: " << left_tensor_offset << " (left), " << right_tensor_offset << " (right)" << std::endl; - std::cout << "tensor name " << tensor_name << ", shape: "; - for (size_t i = 0; i < left_tensor_info.n_dims; i++) { - std::cout << left_tensor_info.ne[i] << ","; - } - std::cout << " (left), "; - - for (size_t i = 0; i < right_tensor_info.n_dims; i++) { - std::cout << right_tensor_info.ne[i] << ","; - } - std::cout << " (right) " << std::endl; - - size_t left_tensor_size = std::accumulate(std::begin(left_tensor_info.ne), std::begin(left_tensor_info.ne) + GGML_MAX_DIMS, (size_t) sizeof(float), std::multiplies()); - size_t right_tensor_size = std::accumulate(std::begin(right_tensor_info.ne), std::begin(right_tensor_info.ne) + GGML_MAX_DIMS, (size_t) sizeof(float), std::multiplies()); - - std::cout << "tensor name " << tensor_name << ", size (calculated): " << left_tensor_size << " (left), " << right_tensor_size << " (right)" << std::endl; - - if (left_tensor_size != right_tensor_size) { - std::cout << "size mismatch (" << left_tensor_size << " left, " << right_tensor_size << "right), exiting" << std::endl; - exit(-1); - } - - size_t bytes_compared = 0; - - std::ifstream left_file(left_name, std::ios::binary); - std::ifstream right_file(right_name, std::ios::binary); - - left_file.seekg(left_tensor_offset); - right_file.seekg(right_tensor_offset); - - std::cout << "first 10 
float values:" << std::endl; - for (size_t i = 0; i < 10; i++) { - float left_value; left_file.read((char*) &left_value, sizeof(float)); - float right_value; right_file.read((char*) &right_value, sizeof(float)); - - std::cout << left_value << " left, " << right_value << " right" << std::endl; - } - - left_file.seekg(left_tensor_offset); - right_file.seekg(right_tensor_offset); - for (size_t i = 0; i < left_tensor_size; i++) { - char left_byte; left_file.read((char*) &left_byte, sizeof(char)); - char right_byte; right_file.read((char*) &right_byte, sizeof(char)); - - if (left_byte != right_byte) { - std::cout << "byte " << bytes_compared << " mismatch (" << std::hex << +((uint8_t) left_byte) << " left, " << +((uint8_t) right_byte) << " right)" << std::endl; - std::cout << "offset left " << std::hex << left_tensor_offset + bytes_compared << ", right " << right_tensor_offset + bytes_compared << std::endl; - exit(-1); - } - bytes_compared++; - } - std::cout << "tensor contents are identical, bytes compared: " << bytes_compared << std::endl; - } -} From cd825d96badb104d949668bbde3dc8a97e19cf17 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Wed, 13 Mar 2024 16:21:45 +0100 Subject: [PATCH 04/27] Properly register the plugin in .xml if requested to do so --- modules/llama_cpp_plugin/CMakeLists.txt | 3 +-- modules/llama_cpp_plugin/src/CMakeLists.txt | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt index 89c5d4e0e..7e857e890 100644 --- a/modules/llama_cpp_plugin/CMakeLists.txt +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -4,7 +4,7 @@ project(LlamaCppPlugin) find_package(OpenVINODeveloperPackage REQUIRED) -ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" OFF) +ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" ON) if(CMAKE_COMPILER_IS_GNUCXX) ov_add_compiler_flags(-Wall) @@ -20,7 +20,6 @@ if(ENABLE_TESTS) add_subdirectory(tests/e2e) endif() - # install if(OpenVINODeveloperPackage_FOUND) diff --git a/modules/llama_cpp_plugin/src/CMakeLists.txt b/modules/llama_cpp_plugin/src/CMakeLists.txt index 5ec2caee7..3a3f32990 100644 --- a/modules/llama_cpp_plugin/src/CMakeLists.txt +++ b/modules/llama_cpp_plugin/src/CMakeLists.txt @@ -13,7 +13,7 @@ set(TARGET_NAME ${PLUGIN_LIBRARY_NAME}) file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) file(GLOB_RECURSE HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) -if (NOT ENABLE_TEMPLATE_REGISTRATION) +if (NOT ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION) # Skip install and registration of template component set(skip_plugin SKIP_INSTALL SKIP_REGISTRATION) endif() @@ -52,7 +52,7 @@ target_link_libraries(${TARGET_NAME} PRIVATE ggml) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) -if (ENABLE_TEMPLATE_REGISTRATION) +if (ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION) # Update the plugins.xml file ov_register_plugins(MAIN_TARGET ${TARGET_NAME}) endif() From 16ebbaad713c43b3bf139bcc39b9bb55e7473a6f Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Wed, 13 Mar 2024 16:52:40 +0100 Subject: [PATCH 05/27] Add workflow for llama_cpp build and test --- .../llama_cpp_plugin_build_and_test.yml | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 .github/workflows/llama_cpp_plugin_build_and_test.yml diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml new file 
mode 100644 index 000000000..35e6f2366 --- /dev/null +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -0,0 +1,53 @@ +name: precommit + +on: + pull_request: + types: + - opened + - reopened + - synchronize + paths: + - 'modules/llama_cpp_plugin/**' + +jobs: + build_ubuntu20: + runs-on: ubuntu-20.04 + steps: + - name: Setup cmake + uses: jwlawson/actions-setup-cmake@v1.14 + with: + cmake-version: '3.24.x' + + - name: Checkout openvino_contrib + uses: actions/checkout@v3 + submodules: recursive + + - name: Checkout openvino + uses: actions/checkout@v3 + submodules: recursive + repository: https://github.com/vshampor/openvino + + - name: CMake - configure + run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON . + + - name: CMake - build + run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests + + - name: Upload build artifacts + uses: alehechka/upload-tartifact@v2 + with: + name: build_artifacts + path: ${{ github.workspace }}/bin/intel64/Release/ + + test_ubuntu20: + needs: build_ubuntu20 + runs-on: ubuntu-20.04 + steps: + - name: Download build artifacts + uses: alehechka/download-tartifact@v2 + with: + name: build_artifacts + path: binaries + + - name: Run E2E tests + run: ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From aab9d855fee981f4b19c87f76805a8cb4f1320c5 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Wed, 13 Mar 2024 17:00:06 +0100 Subject: [PATCH 06/27] Code style --- modules/llama_cpp_plugin/src/compiled_model.cpp | 17 ++++++----------- modules/llama_cpp_plugin/src/infer_request.cpp | 4 ++-- modules/llama_cpp_plugin/src/plugin.cpp | 3 +-- .../tests/e2e/prompt_response.cpp | 12 +++++++----- .../tests/e2e/set_device_name.cpp | 1 - 5 files changed, 16 insertions(+), 21 deletions(-) diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index a1498f708..17430353b 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -5,30 +5,27 @@ #include #include #include +#include #include "infer_request.hpp" #include "plugin.hpp" -#include - namespace ov { namespace llama_cpp_plugin { - LlamaCppModel::LlamaCppModel(const std::shared_ptr& model, const std::shared_ptr& plugin, const ov::SoPtr& context, const std::shared_ptr& task_executor) : ICompiledModel(model, plugin, context, task_executor) { - OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); - } - + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); +} LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model, std::istream& input_stream, const std::shared_ptr& plugin) : ICompiledModel(ov_model, plugin) { - OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); } LlamaCppModel::~LlamaCppModel() { @@ -39,7 +36,8 @@ LlamaCppModel::~LlamaCppModel() { } LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin) - : ICompiledModel(nullptr, plugin), m_gguf_fname(gguf_fname) { + : 
ICompiledModel(nullptr, plugin), + m_gguf_fname(gguf_fname) { num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove *num_tokens_processed_ptr = 0; OPENVINO_DEBUG << "llama_cpp_plugin: loading llama model directly from GGUF... " << std::endl; @@ -82,7 +80,6 @@ LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_pt } } - std::shared_ptr LlamaCppModel::get_runtime_model() const { OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); } @@ -115,7 +112,5 @@ void LlamaCppModel::export_model(std::ostream& output_stream) const { output_stream << in.rdbuf(); } - - } // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index 40307c573..e41fe5a03 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -1,4 +1,5 @@ #include "infer_request.hpp" + #include #include "llama.h" @@ -79,7 +80,7 @@ void LlamaCppSyncInferRequest::infer() { {0}, true); // the last `true` here is a marker that the logits for this // token should be computed and returned - *(m_compiled_model_ptr->num_tokens_processed_ptr) += 1; + *(m_compiled_model_ptr->num_tokens_processed_ptr) += 1; } llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; @@ -110,7 +111,6 @@ std::vector LlamaCppSyncInferRequest::get_profiling_info() co return std::vector{}; }; - std::vector> LlamaCppSyncInferRequest::query_state() const { OPENVINO_DEBUG << "llama_cpp_plugin: query_state() called\n"; return {}; diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp index ec456cc45..22c90e439 100644 --- a/modules/llama_cpp_plugin/src/plugin.cpp +++ b/modules/llama_cpp_plugin/src/plugin.cpp @@ -4,8 +4,8 @@ #include "compiled_model.hpp" #include "openvino/op/constant.hpp" -#include "openvino/util/log.hpp" #include "openvino/runtime/internal_properties.hpp" +#include "openvino/util/log.hpp" namespace { static constexpr const char* wait_executor_name = "LlamaCppWaitExecutor"; @@ -87,7 +87,6 @@ std::shared_ptr LlamaCppPlugin::import_model(std::istream& m OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: model importing not implemented"); } - std::shared_ptr LlamaCppPlugin::import_model(std::istream& model, const ov::SoPtr& context, const ov::AnyMap& properties) const { diff --git a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp index f4e0369c5..60d1f8881 100644 --- a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp +++ b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp @@ -1,13 +1,16 @@ #include -#include "openvino/openvino.hpp" + #include "common_test_utils/file_utils.hpp" +#include "openvino/openvino.hpp" const std::string TEST_FILES_DIR = "test_data"; // "Why is the Sun yellow?" const std::vector GPT2_PROMPT_TOKEN_IDS = {5195, 318, 262, 3825, 7872, 30}; // "The Sun is a bright red, which means it is a bright red. The Sun is a bright red because it is a bright red." 
-const std::vector GPT2_REFERENCE_RESPONSE_TOKEN_IDS = {198, 464, 3825, 318, 257, 6016, 2266, 11, 543, 1724, 340, 318, 257, 6016, 2266, 13, 383, 3825, 318, 257, 6016, 2266, 780, 340, 318, 257, 6016, 2266, 13, 198, 198, 464}; +const std::vector GPT2_REFERENCE_RESPONSE_TOKEN_IDS = { + 198, 464, 3825, 318, 257, 6016, 2266, 11, 543, 1724, 340, 318, 257, 6016, 2266, 13, + 383, 3825, 318, 257, 6016, 2266, 780, 340, 318, 257, 6016, 2266, 13, 198, 198, 464}; const auto SEP = ov::util::FileTraits::file_separator; @@ -16,7 +19,8 @@ TEST(PromptResponseTest, TestGPT2) { ov::Core core; const std::string model_file_name = "gpt2.gguf"; - const std::string model_file = ov::test::utils::getCurrentWorkingDir() + SEP + TEST_FILES_DIR + SEP + model_file_name; + const std::string model_file = + ov::test::utils::getCurrentWorkingDir() + SEP + TEST_FILES_DIR + SEP + model_file_name; ov::InferRequest lm = core.compile_model(model_file, plugin_name).create_infer_request(); auto input_ids_tensor = ov::Tensor(ov::element::Type_t::i64, {1, GPT2_PROMPT_TOKEN_IDS.size()}); std::copy(GPT2_PROMPT_TOKEN_IDS.begin(), GPT2_PROMPT_TOKEN_IDS.end(), input_ids_tensor.data()); @@ -59,5 +63,3 @@ TEST(PromptResponseTest, TestGPT2) { ASSERT_EQ(out_token_ids, GPT2_REFERENCE_RESPONSE_TOKEN_IDS); } - - diff --git a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp index 8fb1fac80..aa06bc96f 100644 --- a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp +++ b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp @@ -10,4 +10,3 @@ void set_device_suffix(const std::string& suffix) { } } // namespace test } // namespace ov - From 106754ed13e711b335e0f6aa8e860edd8e7b5a98 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Wed, 13 Mar 2024 17:04:33 +0100 Subject: [PATCH 07/27] Adjust workflow.yml --- .../llama_cpp_plugin_build_and_test.yml | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 35e6f2366..324c06536 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -1,4 +1,4 @@ -name: precommit +name: llama_cpp_plugin_build_and_test on: pull_request: @@ -6,29 +6,33 @@ on: - opened - reopened - synchronize - paths: - - 'modules/llama_cpp_plugin/**' + paths: + - 'modules/llama_cpp_plugin/**' jobs: build_ubuntu20: runs-on: ubuntu-20.04 steps: - - name: Setup cmake - uses: jwlawson/actions-setup-cmake@v1.14 - with: - cmake-version: '3.24.x' + - name: Setup cmake + uses: jwlawson/actions-setup-cmake@v1.14 + with: + cmake-version: '3.24.x' - name: Checkout openvino_contrib uses: actions/checkout@v3 - submodules: recursive + with: + submodules: recursive + path: openvino_contrib - name: Checkout openvino uses: actions/checkout@v3 - submodules: recursive - repository: https://github.com/vshampor/openvino + with: + submodules: recursive + repository: vshampor/openvino + path: openvino - name: CMake - configure - run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON . 
+ run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON openvino - name: CMake - build run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests From 4724f0f417b920a10ab51ae147c4626dc09214fc Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 14 Mar 2024 12:18:43 +0100 Subject: [PATCH 08/27] Improve with comments --- .../llama_cpp_plugin_build_and_test.yml | 1 + modules/llama_cpp_plugin/CMakeLists.txt | 12 +++++---- modules/llama_cpp_plugin/build.sh | 19 ------------- .../include/compiled_model.hpp | 2 +- modules/llama_cpp_plugin/src/CMakeLists.txt | 5 ---- .../llama_cpp_plugin/src/compiled_model.cpp | 27 ++++++++++--------- .../llama_cpp_plugin/src/infer_request.cpp | 9 +++++-- .../llama_cpp_plugin/third_party/llama.cpp | 1 - 8 files changed, 30 insertions(+), 46 deletions(-) delete mode 100755 modules/llama_cpp_plugin/build.sh delete mode 160000 modules/llama_cpp_plugin/third_party/llama.cpp diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 324c06536..127aaf524 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -29,6 +29,7 @@ jobs: with: submodules: recursive repository: vshampor/openvino + branch: llama_cpp_mod path: openvino - name: CMake - configure diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt index 7e857e890..d909dc88e 100644 --- a/modules/llama_cpp_plugin/CMakeLists.txt +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -6,13 +6,15 @@ find_package(OpenVINODeveloperPackage REQUIRED) ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" ON) -if(CMAKE_COMPILER_IS_GNUCXX) - ov_add_compiler_flags(-Wall) -endif() - add_subdirectory(src) -add_subdirectory(third_party/llama.cpp) +FetchContent_Declare( + llama_cpp + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp + GIT_TAG b2417 +) + +FetchContent_MakeAvailable(llama_cpp) if(ENABLE_TESTS) include(CTest) diff --git a/modules/llama_cpp_plugin/build.sh b/modules/llama_cpp_plugin/build.sh deleted file mode 100755 index fa36b9e03..000000000 --- a/modules/llama_cpp_plugin/build.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -set -e -# What we want to do is build the llama.cpp dependency for different backends and have a separate plugin for each such build type. -# Sadly, CMake does not reliably allow to add_subdirectory multiple times in the same build tree, let alone with different options, -# since this would lead to "duplicate targets". There doesn't seem to be a solution to this problem even still. Thus, will have to -# invoke the cmake configure and build stage separately for each llama.cpp backend type. 
- -BUILD_TYPE=$1 -COMMON_OPTS="-DOpenVINODeveloperPackage_DIR=/home/vshampor/work/openvino/build -DCMAKE_EXPORT_COMPILE_COMMANDS=1" - -# Regular CPU build of llama.cpp -cmake -S ./ -B ./build/cpu/ ${COMMON_OPTS} "$@" -cmake --build ./build/cpu/ -j --target llama --target llama_cpp_plugin - - -# CUDA build -cmake -S ./ -B ./build/cuda/ -DLLAMA_CUBLAS=1 -DPLUGIN_DEVICE_NAME="LLAMA_CPP_CUDA" -DPLUGIN_LIBRARY_NAME="llama_cpp_cuda_plugin" -DLLAMA_TARGET_NAME="llama_cuda" ${COMMON_OPTS} "$@" -cmake --build ./build/cuda/ -j --target llama_cuda --target llama_cpp_cuda_plugin diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp index 1ae79f12e..a99d96061 100644 --- a/modules/llama_cpp_plugin/include/compiled_model.hpp +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -70,7 +70,7 @@ namespace ov { llama_model* m_llama_model_ptr = nullptr; llama_context* m_llama_ctx = nullptr; - std::shared_ptr m_model; + std::shared_ptr m_fake_model; size_t* num_tokens_processed_ptr = nullptr; // TODO: (vshampor) find a better place for this kind of storage std::vector> m_fake_inputs; diff --git a/modules/llama_cpp_plugin/src/CMakeLists.txt b/modules/llama_cpp_plugin/src/CMakeLists.txt index 3a3f32990..0ff3189c6 100644 --- a/modules/llama_cpp_plugin/src/CMakeLists.txt +++ b/modules/llama_cpp_plugin/src/CMakeLists.txt @@ -35,11 +35,6 @@ target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${LlamaCppPlugin_SOURCE_DIR}/include") -# link common OpenVINO Runtime libraries -target_link_libraries(${TARGET_NAME} PRIVATE - openvino::interpreter_backend - openvino::reference) - set( LLAMA_TARGET_NAME CACHE STRING "Exact target exposed by llama.cpp to link against as the main llama.cpp library") if(NOT LLAMA_TARGET_NAME) set( LLAMA_TARGET_NAME "llama" ) diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 17430353b..63349c854 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -54,28 +54,29 @@ LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_pt ov::ParameterVector inputs{input_ids}; - std::vector> unused_names_in_order = { - {"attention_mask", ov::element::Type_t::i64}, - {"position_ids", ov::element::Type_t::i64}, - {"beam_idx", ov::element::Type_t::i32}}; - for (const auto& descr : unused_names_in_order) { - auto unused_inp = std::make_shared(descr.second, ov::PartialShape({-1, -1})); + std::vector> additional_inputs_in_order = { + {"attention_mask", ov::element::Type_t::i64, {-1, -1}}, + {"position_ids", ov::element::Type_t::i64, {-1, -1}}, + {"beam_idx", ov::element::Type_t::i32, {-1, -1}}}; + + for (const auto& descr : additional_inputs_in_order) { + auto unused_inp = std::make_shared(std::get<1>(descr), std::get<2>(descr)); inputs.push_back(unused_inp); } - m_model = std::make_shared(logits, inputs, "fake_ov_model_for_io_specification"); + m_fake_model = std::make_shared(logits, inputs, "fake_ov_model_for_io_specification"); - m_model->inputs()[0].set_names({"input_ids"}); - for (size_t i = 0; i < unused_names_in_order.size(); i++) { - m_model->inputs()[i + 1].set_names({unused_names_in_order[i].first}); + m_fake_model->inputs()[0].set_names({"input_ids"}); + for (size_t i = 0; i < additional_inputs_in_order.size(); i++) { + m_fake_model->inputs()[i + 1].set_names({std::get<0>(additional_inputs_in_order[i])}); } - m_model->outputs()[0].set_names({"logits"}); + 
m_fake_model->outputs()[0].set_names({"logits"}); - for (auto input : m_model->inputs()) { + for (auto input : m_fake_model->inputs()) { m_fake_inputs.emplace_back(input); } - for (auto output : m_model->outputs()) { + for (auto output : m_fake_model->outputs()) { m_fake_outputs.emplace_back(output); } } diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index e41fe5a03..d745d0075 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -62,6 +62,9 @@ void llama_batch_add_reimpl(struct llama_batch& batch, void LlamaCppSyncInferRequest::infer() { auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among // all inputs without hardcode + // + auto position_ids_tensor_ptr = get_tensor(get_inputs()[2]); // TODO (vshampor) correctly identify input_ids among + // all inputs without hardcode OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; @@ -72,15 +75,17 @@ void LlamaCppSyncInferRequest::infer() { const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */; + const int64_t* position_idx_ptr = position_ids_tensor_ptr->data(); + for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) { const int64_t token_id = sequence_start_ptr[tok_idx]; + const int64_t position_id = position_idx_ptr[tok_idx]; llama_batch_add_reimpl(batch, token_id, - *(m_compiled_model_ptr->num_tokens_processed_ptr), + position_id, {0}, true); // the last `true` here is a marker that the logits for this // token should be computed and returned - *(m_compiled_model_ptr->num_tokens_processed_ptr) += 1; } llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; diff --git a/modules/llama_cpp_plugin/third_party/llama.cpp b/modules/llama_cpp_plugin/third_party/llama.cpp deleted file mode 160000 index c8b02d38d..000000000 --- a/modules/llama_cpp_plugin/third_party/llama.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c8b02d38d98db8dab774f6f7655d7e9aede882f5 From 6fdf37626d2f07a5347cb850919d201ecf493f85 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 14 Mar 2024 12:28:47 +0100 Subject: [PATCH 09/27] Fix formatting and workflow --- .../llama_cpp_plugin_build_and_test.yml | 2 +- modules/llama_cpp_plugin/.clang-format | 28 +++++++++++++++++++ .../llama_cpp_plugin/src/compiled_model.cpp | 6 ++-- .../llama_cpp_plugin/src/infer_request.cpp | 8 +++--- .../tests/e2e/prompt_response.cpp | 3 +- 5 files changed, 39 insertions(+), 8 deletions(-) create mode 100644 modules/llama_cpp_plugin/.clang-format diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 127aaf524..829f24c67 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -29,7 +29,7 @@ jobs: with: submodules: recursive repository: vshampor/openvino - branch: llama_cpp_mod + ref: llama_cpp_mod path: openvino - name: CMake - configure diff --git a/modules/llama_cpp_plugin/.clang-format b/modules/llama_cpp_plugin/.clang-format new file mode 100644 index 000000000..ebe747b78 --- /dev/null +++ b/modules/llama_cpp_plugin/.clang-format @@ -0,0 +1,28 @@ +BasedOnStyle: Google +IndentWidth: 4 +UseTab: Never +ColumnLimit: 120 + +Language: Cpp +Standard: Cpp11 + +AccessModifierOffset: -4 
+AlignConsecutiveMacros: true +AllowAllArgumentsOnNextLine: false +AllowAllConstructorInitializersOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: Empty +AllowShortLoopsOnASingleLine: false +AlwaysBreakBeforeMultilineStrings: false +BinPackArguments: false +BinPackParameters: false +CommentPragmas: '^#' +DerivePointerAlignment: false +FixNamespaceComments: true +IndentCaseLabels: false +IndentPPDirectives: AfterHash +ForEachMacros: + - foreach + - FOREACH_CHILD diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 63349c854..ae4422c02 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -18,14 +18,16 @@ LlamaCppModel::LlamaCppModel(const std::shared_ptr& model, const ov::SoPtr& context, const std::shared_ptr& task_executor) : ICompiledModel(model, plugin, context, task_executor) { - OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is " + "supported for the LLAMA_CPP* plugins"); } LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model, std::istream& input_stream, const std::shared_ptr& plugin) : ICompiledModel(ov_model, plugin) { - OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is " + "supported for the LLAMA_CPP* plugins"); } LlamaCppModel::~LlamaCppModel() { diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index d745d0075..9567b1922 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -60,11 +60,11 @@ void llama_batch_add_reimpl(struct llama_batch& batch, } void LlamaCppSyncInferRequest::infer() { - auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among - // all inputs without hardcode - // + auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among + // all inputs without hardcode + // auto position_ids_tensor_ptr = get_tensor(get_inputs()[2]); // TODO (vshampor) correctly identify input_ids among - // all inputs without hardcode + // all inputs without hardcode OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; diff --git a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp index 60d1f8881..1101f5cb0 100644 --- a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp +++ b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp @@ -7,7 +7,8 @@ const std::string TEST_FILES_DIR = "test_data"; // "Why is the Sun yellow?" const std::vector GPT2_PROMPT_TOKEN_IDS = {5195, 318, 262, 3825, 7872, 30}; -// "The Sun is a bright red, which means it is a bright red. The Sun is a bright red because it is a bright red." +// "The Sun is a bright red, which means it is a bright red. The Sun is a bright +// red because it is a bright red." 
const std::vector GPT2_REFERENCE_RESPONSE_TOKEN_IDS = { 198, 464, 3825, 318, 257, 6016, 2266, 11, 543, 1724, 340, 318, 257, 6016, 2266, 13, 383, 3825, 318, 257, 6016, 2266, 780, 340, 318, 257, 6016, 2266, 13, 198, 198, 464}; From bd7e96817a51874905ef75f3de0a7ee6a157ea5e Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 14 Mar 2024 13:44:23 +0100 Subject: [PATCH 10/27] Add copyrights --- modules/llama_cpp_plugin/CMakeLists.txt | 3 +++ modules/llama_cpp_plugin/include/compiled_model.hpp | 3 +++ modules/llama_cpp_plugin/include/infer_request.hpp | 3 +++ modules/llama_cpp_plugin/include/plugin.hpp | 1 - modules/llama_cpp_plugin/src/CMakeLists.txt | 3 +++ modules/llama_cpp_plugin/src/compiled_model.cpp | 3 +++ modules/llama_cpp_plugin/src/infer_request.cpp | 3 +++ modules/llama_cpp_plugin/src/plugin.cpp | 3 +++ modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt | 2 ++ modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp | 3 +++ modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp | 3 +++ 11 files changed, 29 insertions(+), 1 deletion(-) diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt index d909dc88e..393f4f219 100644 --- a/modules/llama_cpp_plugin/CMakeLists.txt +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + cmake_minimum_required(VERSION 3.13) project(LlamaCppPlugin) diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp index a99d96061..9306ca437 100644 --- a/modules/llama_cpp_plugin/include/compiled_model.hpp +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #ifndef LLAMA_CPP_COMPILED_MODEL_HPP #define LLAMA_CPP_COMPILED_MODEL_HPP diff --git a/modules/llama_cpp_plugin/include/infer_request.hpp b/modules/llama_cpp_plugin/include/infer_request.hpp index 8954a180b..e8a0da65d 100644 --- a/modules/llama_cpp_plugin/include/infer_request.hpp +++ b/modules/llama_cpp_plugin/include/infer_request.hpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #ifndef LLAMA_CPP_INFER_REQUEST_HPP #define LLAMA_CPP_INFER_REQUEST_HPP diff --git a/modules/llama_cpp_plugin/include/plugin.hpp b/modules/llama_cpp_plugin/include/plugin.hpp index 1d6fdf1e4..f68ebd3d6 100644 --- a/modules/llama_cpp_plugin/include/plugin.hpp +++ b/modules/llama_cpp_plugin/include/plugin.hpp @@ -1,6 +1,5 @@ // Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -// #ifndef LLAMA_CPP_PLUGIN_HPP #define LLAMA_CPP_PLUGIN_HPP diff --git a/modules/llama_cpp_plugin/src/CMakeLists.txt b/modules/llama_cpp_plugin/src/CMakeLists.txt index 0ff3189c6..d99a44795 100644 --- a/modules/llama_cpp_plugin/src/CMakeLists.txt +++ b/modules/llama_cpp_plugin/src/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + set( PLUGIN_LIBRARY_NAME CACHE STRING "Library name for the generated plugin" ${TARGET_NAME}) if(NOT PLUGIN_LIBRARY_NAME) set( PLUGIN_LIBRARY_NAME "llama_cpp_plugin" ) diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index ae4422c02..7bafa658e 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 + #include "compiled_model.hpp" #include diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index 9567b1922..ee2bdbc45 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #include "infer_request.hpp" #include diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp index 22c90e439..77287555b 100644 --- a/modules/llama_cpp_plugin/src/plugin.cpp +++ b/modules/llama_cpp_plugin/src/plugin.cpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #include "plugin.hpp" #include diff --git a/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt index 4c16f3484..ea96e9d3b 100644 --- a/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt +++ b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt @@ -1,3 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 set(TARGET_NAME llama_cpp_e2e_tests) diff --git a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp index 1101f5cb0..351104bf1 100644 --- a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp +++ b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #include #include "common_test_utils/file_utils.hpp" diff --git a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp index aa06bc96f..7577f1673 100644 --- a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp +++ b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #include #include From 8b96020aa480e0838247fde3862459f9437e1a65 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 14 Mar 2024 13:46:33 +0100 Subject: [PATCH 11/27] Remove state --- modules/llama_cpp_plugin/include/compiled_model.hpp | 1 - modules/llama_cpp_plugin/src/compiled_model.cpp | 3 --- 2 files changed, 4 deletions(-) diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp index 9306ca437..292e373fd 100644 --- a/modules/llama_cpp_plugin/include/compiled_model.hpp +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -74,7 +74,6 @@ namespace ov { llama_model* m_llama_model_ptr = nullptr; llama_context* m_llama_ctx = nullptr; std::shared_ptr m_fake_model; - size_t* num_tokens_processed_ptr = nullptr; // TODO: (vshampor) find a better place for this kind of storage std::vector> m_fake_inputs; std::vector> m_fake_outputs; diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 7bafa658e..5fed08758 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -37,14 +37,11 @@ LlamaCppModel::~LlamaCppModel() { llama_free(m_llama_ctx); llama_free_model(m_llama_model_ptr); llama_backend_free(); - delete num_tokens_processed_ptr; } LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin) : ICompiledModel(nullptr, plugin), m_gguf_fname(gguf_fname) { - num_tokens_processed_ptr = new 
size_t; // TODO (vshampor): hack, remove - *num_tokens_processed_ptr = 0; OPENVINO_DEBUG << "llama_cpp_plugin: loading llama model directly from GGUF... " << std::endl; llama_model_params mparams = llama_model_default_params(); mparams.n_gpu_layers = 99; From 689492a59283e3775dc9056282294e3911c459ba Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 14 Mar 2024 14:02:32 +0100 Subject: [PATCH 12/27] Add test data preparation step to workflow --- .../workflows/llama_cpp_plugin_build_and_test.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 829f24c67..82d27420b 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -42,7 +42,7 @@ jobs: uses: alehechka/upload-tartifact@v2 with: name: build_artifacts - path: ${{ github.workspace }}/bin/intel64/Release/ + path: ${{ github.workspace }}/openvino/bin/intel64/Release/ test_ubuntu20: needs: build_ubuntu20 @@ -54,5 +54,15 @@ jobs: name: build_artifacts path: binaries + - name: Prepare test data + uses: actions/checkout@v3 + with: + repository: ggerganov/llama.cpp + path: llama.cpp + run: pip install llama.cpp/requirements/requirements-convert-hf-to-gguf.txt + run: huggingface-cli huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2 + run: mkdir -p ${{ github.workspace }}/test_data + run: python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf + - name: Run E2E tests run: ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From dff65c7b66df84ac88083836d8190e9f5a0c9743 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 14 Mar 2024 14:35:38 +0100 Subject: [PATCH 13/27] Fix workflow --- .../llama_cpp_plugin_build_and_test.yml | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 82d27420b..cbef0e2b7 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -38,8 +38,9 @@ jobs: - name: CMake - build run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests + - name: Upload build artifacts - uses: alehechka/upload-tartifact@v2 + uses: actions/upload-artifact@v4 with: name: build_artifacts path: ${{ github.workspace }}/openvino/bin/intel64/Release/ @@ -49,20 +50,23 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Download build artifacts - uses: alehechka/download-tartifact@v2 + uses: actions/download-artifact@v4 with: name: build_artifacts - path: binaries + path: ${{ github.workspace }}/binaries - - name: Prepare test data + - name: Prepare test data - checkout llama.cpp repo uses: actions/checkout@v3 with: repository: ggerganov/llama.cpp path: llama.cpp - run: pip install llama.cpp/requirements/requirements-convert-hf-to-gguf.txt - run: huggingface-cli huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2 - run: mkdir -p ${{ github.workspace }}/test_data - run: python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf + + - name: Prepare test data - convert test model files + run: | + pip install -r 
llama.cpp/requirements/requirements-convert-hf-to-gguf.txt + huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2 + mkdir -p ${{ github.workspace }}/test_data + python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf - name: Run E2E tests run: ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From d43fa2fd27bb223aceca1694789945660066815d Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 13:46:47 +0100 Subject: [PATCH 14/27] Allow resetting llama kv cache with .reset_state --- .../include/compiled_model.hpp | 2 ++ modules/llama_cpp_plugin/include/state.hpp | 25 +++++++++++++++++++ .../llama_cpp_plugin/src/infer_request.cpp | 4 ++- 3 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 modules/llama_cpp_plugin/include/state.hpp diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp index 292e373fd..38f3696e2 100644 --- a/modules/llama_cpp_plugin/include/compiled_model.hpp +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -12,6 +12,7 @@ namespace ov { namespace llama_cpp_plugin { class LlamaCppSyncInferRequest; class LlamaCppPlugin; + class LlamaCppState; class LlamaCppModel: public ICompiledModel { public: LlamaCppModel(const std::shared_ptr& model, @@ -79,6 +80,7 @@ namespace ov { std::vector> m_fake_outputs; friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest; + friend class ov::llama_cpp_plugin::LlamaCppState; }; } } // namespace ov diff --git a/modules/llama_cpp_plugin/include/state.hpp b/modules/llama_cpp_plugin/include/state.hpp new file mode 100644 index 000000000..18e615888 --- /dev/null +++ b/modules/llama_cpp_plugin/include/state.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#ifndef LLAMA_CPP_PLUGIN_HPP +#define LLAMA_CPP_PLUGIN_HPP + +#include "openvino/runtime/ivariable_state.hpp" +#include "compiled_model.hpp" + +namespace ov { + namespace llama_cpp_plugin { + class LlamaCppState : public IVariableState { + public: + LlamaCppState() = delete; + LlamaCppState(const std::shared_ptr& model_ptr) : m_model_ptr(model_ptr), IVariableState("llama_cpp_state") {} + void reset() override { + std::cout << "VSHAMPOR: resetting state" << std::endl; + llama_kv_cache_clear(m_model_ptr->m_llama_ctx); + } + private: + const std::shared_ptr& m_model_ptr; + }; + } +} +#endif // LLAMA_CPP_STATE_HPP diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index ee2bdbc45..5efd868d8 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -3,11 +3,13 @@ #include "infer_request.hpp" +#include #include #include "llama.h" #include "openvino/runtime/make_tensor.hpp" #include "openvino/util/log.hpp" +#include "state.hpp" namespace ov { namespace llama_cpp_plugin { @@ -121,7 +123,7 @@ std::vector LlamaCppSyncInferRequest::get_profiling_info() co std::vector> LlamaCppSyncInferRequest::query_state() const { OPENVINO_DEBUG << "llama_cpp_plugin: query_state() called\n"; - return {}; + return {std::static_pointer_cast(std::make_shared(m_compiled_model_ptr))}; } } // namespace llama_cpp_plugin } // namespace ov From cf25e3e757ddafb9b0e70a5b66f8938bee215d40 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 14:38:46 +0100 Subject: [PATCH 15/27] Set executable 
mode on test binary --- .github/workflows/llama_cpp_plugin_build_and_test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index cbef0e2b7..98b543df9 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -69,4 +69,6 @@ jobs: python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf - name: Run E2E tests - run: ${{ github.workspace }}/binaries/llama_cpp_e2e_tests + run: | + chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests + ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From 795273c928153d9c8daa112894acbb9584631815 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 15:18:42 +0100 Subject: [PATCH 16/27] Align thread setting with llama's main executable --- modules/llama_cpp_plugin/src/compiled_model.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 5fed08758..0525956ad 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include "infer_request.hpp" #include "plugin.hpp" @@ -47,6 +48,8 @@ LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_pt mparams.n_gpu_layers = 99; m_llama_model_ptr = llama_load_model_from_file(gguf_fname.c_str(), mparams); llama_context_params cparams = llama_context_default_params(); + cparams.n_threads = + std::thread::hardware_concurrency(); // TODO (vshampor): reuse equivalent setting defined by OV API m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); OPENVINO_DEBUG << "llama_cpp_plugin: llama model loaded successfully from GGUF..." 
<< std::endl; From 9990329d0ce1272f31c114ec3769b944332ed9a5 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 15:23:26 +0100 Subject: [PATCH 17/27] Set library path in workflow --- .github/workflows/llama_cpp_plugin_build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 98b543df9..8fd31a97a 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -71,4 +71,5 @@ jobs: - name: Run E2E tests run: | chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests + export LD_LIBRARY_PATH=${{ github.workspace }}/binaries ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From d2504407c7af736763902834da67f23f3e91284e Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 16:06:32 +0100 Subject: [PATCH 18/27] Add step to install libtbb --- .github/workflows/llama_cpp_plugin_build_and_test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 8fd31a97a..db99f7109 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -68,6 +68,8 @@ jobs: mkdir -p ${{ github.workspace }}/test_data python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf + - name: Install TBB + run: sudo apt install -y libtbb2 - name: Run E2E tests run: | chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From 6f938ce9f7239827eb62077b1dfb760f25de4285 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 16:21:24 +0100 Subject: [PATCH 19/27] Take n_ctx from model --- modules/llama_cpp_plugin/src/compiled_model.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 0525956ad..5af82100d 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -50,6 +50,7 @@ LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_pt llama_context_params cparams = llama_context_default_params(); cparams.n_threads = std::thread::hardware_concurrency(); // TODO (vshampor): reuse equivalent setting defined by OV API + cparams.n_ctx = 0; // this means that the actual n_ctx will be taken equal to the model's train-time value m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); OPENVINO_DEBUG << "llama_cpp_plugin: llama model loaded successfully from GGUF..." 
<< std::endl; From 520bf774c6132d8e8eaade62206d8a8f70511625 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 16:32:09 +0100 Subject: [PATCH 20/27] Remove debug print --- modules/llama_cpp_plugin/include/state.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/llama_cpp_plugin/include/state.hpp b/modules/llama_cpp_plugin/include/state.hpp index 18e615888..efd612bc3 100644 --- a/modules/llama_cpp_plugin/include/state.hpp +++ b/modules/llama_cpp_plugin/include/state.hpp @@ -14,7 +14,6 @@ namespace ov { LlamaCppState() = delete; LlamaCppState(const std::shared_ptr& model_ptr) : m_model_ptr(model_ptr), IVariableState("llama_cpp_state") {} void reset() override { - std::cout << "VSHAMPOR: resetting state" << std::endl; llama_kv_cache_clear(m_model_ptr->m_llama_ctx); } private: From 628375874a2d5e1580cf0f2622da56e23501fb54 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 16:56:26 +0100 Subject: [PATCH 21/27] Add README.md --- modules/llama_cpp_plugin/README.md | 52 ++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 modules/llama_cpp_plugin/README.md diff --git a/modules/llama_cpp_plugin/README.md b/modules/llama_cpp_plugin/README.md new file mode 100644 index 000000000..bd0ce6dd8 --- /dev/null +++ b/modules/llama_cpp_plugin/README.md @@ -0,0 +1,52 @@ +### Build instructions + +This plugin should be built in the same fashion as the rest of the modules: + +1. Check out the OpenVINO repository proper (https://github.com/openvinotoolkit/openvino) +2. Configure the CMake build of the OpenVINO repository, making sure to point the corresponding CMake option to the location of the `openvino_contrib` repository. The command below, executed in the `openvino` repo root, will configure the build so that the modules other `llama_cpp_plugin` module will not be built to save build time - adjust the `-DBUILD_*` options if you need the other modules as well. + +```bash +cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_PLUGINS_XML=ON . +``` + +3. Build the plugin either as part of the complete openvino build by executing: + +```bash +cmake --build build -j`nproc` +``` + +or separately by specifying only the `llama_cpp_plugin` target: + +```bash +cmake --build build -j`nproc` -- llama_cpp_plugin +``` + +4. Now you can utilize the built `libllama_cpp_plugin.so` as a regular OV plugin with the device name `"LLAMA_CPP"` to directly load GGUF files and infer them using OV API with llama.cpp execution under the hood. Make sure that the plugin is discoverable by the OV runtime (e.g. by putting the built `libllama_cpp_plugin.so`, `libllama.so` and the autogenerated `plugins.xml` from the built location to your OV binaries location, or by setting `LD_LIBRARY_PATH` appropriately). 
+ +#### Example of LLM inference code + +```C++ + +ov::Core core; +auto model = core.compile_model("model.gguf", "LLAMA_CPP") +auto input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128}); +auto position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128}); +std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + +auto infer_request == model.create_infer_request(); +infer_request.set_tensor("input_ids", input_ids); +infer_request.set_tensor("position_ids", position_ids); +infer_request.infer(); + +size_t vocab_size = lm.get_tensor("logits").get_shape().back(); +float* logits = lm.get_tensor("logits").data() + (input_ids_tensor.get_size() - 1) * vocab_size; +int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; +``` + +The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with equivalent meaning to the corresponding arguments in the LLM model representations in the huggingface `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution. + +Only batch size of 1 is currently supported. + + + + From 43e17410d96c2b57474a462f56b21ec8a176968e Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 20:23:48 +0100 Subject: [PATCH 22/27] Install correct libtbb --- .github/workflows/llama_cpp_plugin_build_and_test.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index db99f7109..6b01806d1 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -68,10 +68,14 @@ jobs: mkdir -p ${{ github.workspace }}/test_data python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf - - name: Install TBB - run: sudo apt install -y libtbb2 + - name: Install libtbb2 + run: | + wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz + mkdir -p tbb + tar xvzf oneapi-tbb-2021.2.4-lin.tgz + - name: Run E2E tests run: | chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests - export LD_LIBRARY_PATH=${{ github.workspace }}/binaries + export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From 1c6c51e8fa09af455344baeeb1528f8b6ae6fa56 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Mon, 18 Mar 2024 23:32:47 +0100 Subject: [PATCH 23/27] Use OV from master --- .github/workflows/llama_cpp_plugin_build_and_test.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 6b01806d1..d73757ad5 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -28,8 +28,7 @@ jobs: uses: actions/checkout@v3 with: submodules: recursive - repository: vshampor/openvino - ref: llama_cpp_mod + repository: openvinotoolkit/openvino path: openvino - name: CMake - configure From 53fe441740e80c631aac1951d8ddd1eae0633b59 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Tue, 19 Mar 2024 15:00:29 +0100 Subject: [PATCH 24/27] Remove gitmodules --- .gitmodules | 3 --- 1 file changed, 3 deletions(-) delete mode 
100644 .gitmodules diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 29da379f7..000000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "modules/llama_cpp_plugin/third_party/llama.cpp"] - path = modules/llama_cpp_plugin/third_party/llama.cpp - url = https://github.com/vshampor/llama.cpp From d5447c9683426a7f7d3309473e57a52600afc014 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 22 Mar 2024 14:19:37 +0100 Subject: [PATCH 25/27] Apply comments --- .../llama_cpp_plugin_build_and_test.yml | 70 +++++----- modules/llama_cpp_plugin/CMakeLists.txt | 10 +- modules/llama_cpp_plugin/README.md | 10 +- .../include/compiled_model.hpp | 124 ++++++++---------- .../include/infer_request.hpp | 8 +- modules/llama_cpp_plugin/include/plugin.hpp | 54 ++++---- modules/llama_cpp_plugin/include/state.hpp | 31 +++-- modules/llama_cpp_plugin/src/CMakeLists.txt | 13 +- .../llama_cpp_plugin/src/compiled_model.cpp | 17 --- modules/llama_cpp_plugin/src/plugin.cpp | 30 ++--- .../llama_cpp_plugin/tests/e2e/CMakeLists.txt | 26 ++-- 11 files changed, 177 insertions(+), 216 deletions(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index d73757ad5..4f9ecb4d9 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -1,38 +1,38 @@ name: llama_cpp_plugin_build_and_test on: - pull_request: - types: - - opened + pull_request: + types: + - opened - reopened - synchronize paths: - - 'modules/llama_cpp_plugin/**' + - 'modules/llama_cpp_plugin/**' jobs: - build_ubuntu20: - runs-on: ubuntu-20.04 + build_ubuntu20: + runs-on: ubuntu-20.04 steps: - - name: Setup cmake - uses: jwlawson/actions-setup-cmake@v1.14 - with: - cmake-version: '3.24.x' + - name: Setup cmake + uses: jwlawson/actions-setup-cmake@v1.14 + with: + cmake-version: '3.24.x' - name: Checkout openvino_contrib uses: actions/checkout@v3 with: - submodules: recursive - path: openvino_contrib + submodules: recursive + path: openvino_contrib - name: Checkout openvino uses: actions/checkout@v3 with: - submodules: recursive - repository: openvinotoolkit/openvino - path: openvino + submodules: recursive + repository: openvinotoolkit/openvino + path: openvino - name: CMake - configure - run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON openvino + run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules/llama_cpp_plugin -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON -DENABLE_PLUGIN_REGISTRATION=ON openvino - name: CMake - build run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests @@ -41,40 +41,40 @@ jobs: - name: Upload build artifacts uses: actions/upload-artifact@v4 with: - name: build_artifacts - path: ${{ github.workspace }}/openvino/bin/intel64/Release/ + name: build_artifacts + path: ${{ github.workspace }}/openvino/bin/intel64/Release/ - test_ubuntu20: +test_ubuntu20: needs: build_ubuntu20 runs-on: ubuntu-20.04 steps: - - name: Download build artifacts - uses: actions/download-artifact@v4 - with: - name: build_artifacts + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + name: 
build_artifacts path: ${{ github.workspace }}/binaries - name: Prepare test data - checkout llama.cpp repo uses: actions/checkout@v3 with: - repository: ggerganov/llama.cpp - path: llama.cpp + repository: ggerganov/llama.cpp + path: llama.cpp - name: Prepare test data - convert test model files run: | - pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt - huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2 - mkdir -p ${{ github.workspace }}/test_data - python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf + pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt + huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2 + mkdir -p ${{ github.workspace }}/test_data + python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf - name: Install libtbb2 run: | - wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz - mkdir -p tbb - tar xvzf oneapi-tbb-2021.2.4-lin.tgz + wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz + mkdir -p tbb + tar xvzf oneapi-tbb-2021.2.4-lin.tgz - name: Run E2E tests run: | - chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests - export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib - ${{ github.workspace }}/binaries/llama_cpp_e2e_tests + chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests + export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib + ${{ github.workspace }}/binaries/llama_cpp_e2e_tests diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt index 393f4f219..8c9939eab 100644 --- a/modules/llama_cpp_plugin/CMakeLists.txt +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -7,15 +7,15 @@ project(LlamaCppPlugin) find_package(OpenVINODeveloperPackage REQUIRED) -ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" ON) +ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" OFF) add_subdirectory(src) FetchContent_Declare( - llama_cpp - GIT_REPOSITORY https://github.com/ggerganov/llama.cpp - GIT_TAG b2417 -) + llama_cpp + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp + GIT_TAG b2417 + ) FetchContent_MakeAvailable(llama_cpp) diff --git a/modules/llama_cpp_plugin/README.md b/modules/llama_cpp_plugin/README.md index bd0ce6dd8..df20db7d3 100644 --- a/modules/llama_cpp_plugin/README.md +++ b/modules/llama_cpp_plugin/README.md @@ -6,19 +6,19 @@ This plugin should be built in the same fashion as the rest of the modules: 2. Configure the CMake build of the OpenVINO repository, making sure to point the corresponding CMake option to the location of the `openvino_contrib` repository. The command below, executed in the `openvino` repo root, will configure the build so that the modules other `llama_cpp_plugin` module will not be built to save build time - adjust the `-DBUILD_*` options if you need the other modules as well. ```bash -cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_PLUGINS_XML=ON . 
+cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=/modules/llama_cpp_plugin -DENABLE_PLUGINS_XML=ON -DENABLE_PLUGIN_REGISTRATION=ON . ``` 3. Build the plugin either as part of the complete openvino build by executing: ```bash -cmake --build build -j`nproc` +cmake --build build --parallel ``` or separately by specifying only the `llama_cpp_plugin` target: ```bash -cmake --build build -j`nproc` -- llama_cpp_plugin +cmake --build build --parallel -- llama_cpp_plugin ``` 4. Now you can utilize the built `libllama_cpp_plugin.so` as a regular OV plugin with the device name `"LLAMA_CPP"` to directly load GGUF files and infer them using OV API with llama.cpp execution under the hood. Make sure that the plugin is discoverable by the OV runtime (e.g. by putting the built `libllama_cpp_plugin.so`, `libllama.so` and the autogenerated `plugins.xml` from the built location to your OV binaries location, or by setting `LD_LIBRARY_PATH` appropriately). @@ -28,7 +28,7 @@ cmake --build build -j`nproc` -- llama_cpp_plugin ```C++ ov::Core core; -auto model = core.compile_model("model.gguf", "LLAMA_CPP") +auto model = core.compile_model("model.gguf", "LLAMA_CPP") auto input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128}); auto position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128}); std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); @@ -43,7 +43,7 @@ float* logits = lm.get_tensor("logits").data() + (input_ids_tensor.get_si int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; ``` -The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with equivalent meaning to the corresponding arguments in the LLM model representations in the huggingface `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution. +The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with equivalent meaning to the corresponding arguments in the LLM model representations in the huggingface `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution. Only batch size of 1 is currently supported. 
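To make the input/output contract described above more concrete, here is a minimal greedy-decoding sketch built only on the interface the README documents (`input_ids`, `position_ids`, `logits`, batch size 1) and on the variable state introduced earlier in this series, whose `reset()` clears the llama.cpp KV cache. The prompt length of 128, the 32-token generation budget, the placeholder token ids and the `model.gguf` file name are illustrative assumptions, not values mandated by the plugin:

```C++
// A sketch only: the token ids below are placeholders for real tokenizer output,
// and the prompt length (128) / generation budget (32) are arbitrary choices.
#include <openvino/openvino.hpp>

#include <algorithm>
#include <numeric>

int main() {
    ov::Core core;
    ov::CompiledModel model = core.compile_model("model.gguf", "LLAMA_CPP");
    ov::InferRequest infer_request = model.create_infer_request();

    // Full-prompt pass: positions 0..127 for a 128-token prompt.
    auto input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
    auto position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
    std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), 0);  // placeholder token ids
    std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);
    infer_request.set_tensor("input_ids", input_ids);
    infer_request.set_tensor("position_ids", position_ids);
    infer_request.infer();

    // Greedy pick of the next token from the logits of the last prompt position.
    auto logits_tensor = infer_request.get_tensor("logits");
    const size_t vocab_size = logits_tensor.get_shape().back();
    const float* last_row = logits_tensor.data<float>() + (input_ids.get_size() - 1) * vocab_size;
    int64_t next_token = std::max_element(last_row, last_row + vocab_size) - last_row;

    // Per-token generation: one {1, 1} input per step, the position id keeps advancing,
    // and the plugin-side KV cache retains the already processed context.
    for (int64_t position = 128; position < 128 + 32; ++position) {
        auto step_input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 1});
        auto step_position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 1});
        step_input_ids.data<int64_t>()[0] = next_token;
        step_position_ids.data<int64_t>()[0] = position;
        infer_request.set_tensor("input_ids", step_input_ids);
        infer_request.set_tensor("position_ids", step_position_ids);
        infer_request.infer();

        logits_tensor = infer_request.get_tensor("logits");
        const float* logits = logits_tensor.data<float>();
        next_token = std::max_element(logits, logits + vocab_size) - logits;
    }

    // Before starting an unrelated prompt, clear the llama.cpp KV cache.
    for (auto&& state : infer_request.query_state()) {
        state.reset();
    }
    return 0;
}
```

Each step feeds a single `{1, 1}` `input_ids`/`position_ids` pair while the KV cache inside the plugin keeps the past context; calling `reset()` on the queried state corresponds to the `llama_kv_cache_clear` call made by `LlamaCppState::reset()`.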
diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp index 38f3696e2..4dce17819 100644 --- a/modules/llama_cpp_plugin/include/compiled_model.hpp +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -4,85 +4,75 @@ #ifndef LLAMA_CPP_COMPILED_MODEL_HPP #define LLAMA_CPP_COMPILED_MODEL_HPP +#include "llama.h" #include "openvino/runtime/icompiled_model.hpp" #include "openvino/runtime/isync_infer_request.hpp" -#include "llama.h" namespace ov { - namespace llama_cpp_plugin { - class LlamaCppSyncInferRequest; - class LlamaCppPlugin; - class LlamaCppState; - class LlamaCppModel: public ICompiledModel { - public: - LlamaCppModel(const std::shared_ptr& model, - const std::shared_ptr& plugin, - const ov::SoPtr& context, - const std::shared_ptr& task_executor - ); - - LlamaCppModel(const std::shared_ptr& ov_model, - std::istream& input_file, - const std::shared_ptr& plugin); +namespace llama_cpp_plugin { +class LlamaCppSyncInferRequest; +class LlamaCppPlugin; +class LlamaCppState; +class LlamaCppModel : public ICompiledModel { +public: + LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin); + /** + * @brief Export compiled model to stream + * + * @param model output stream + */ + virtual void export_model(std::ostream& model) const override; - LlamaCppModel(const std::string& gguf_fname, - const std::shared_ptr& plugin); - /** - * @brief Export compiled model to stream - * - * @param model output stream - */ - virtual void export_model(std::ostream& model) const override; + /** + * @brief Returns runtime model + * + * @return OpenVINO Model which represents runtime graph + */ + virtual std::shared_ptr get_runtime_model() const override; - /** - * @brief Returns runtime model - * - * @return OpenVINO Model which represents runtime graph - */ - virtual std::shared_ptr get_runtime_model() const override; + /** + * @brief Allows to set property + * + * @param properties new plugin properties + */ + virtual void set_property(const ov::AnyMap& properties) override; - /** - * @brief Allows to set property - * - * @param properties new plugin properties - */ - virtual void set_property(const ov::AnyMap& properties) override; + /** + * @brief Returns property + * + * @param name Property name + * + * @return Property value + * virtual std::shared_ptr create_sync_infer_request() const override; + **/ + virtual ov::Any get_property(const std::string& name) const override; + virtual const std::vector>& inputs() const override; + virtual const std::vector>& outputs() const override; + virtual ~LlamaCppModel(); - /** - * @brief Returns property - * - * @param name Property name - * - * @return Property value - * virtual std::shared_ptr create_sync_infer_request() const override; - **/ - virtual ov::Any get_property(const std::string& name) const override; - virtual const std::vector>& inputs() const override; - virtual const std::vector>& outputs() const override; - virtual ~LlamaCppModel(); - protected: - /** - * @brief Method creates infer request implementation - * - * @return Sync infer request - */ - virtual std::shared_ptr create_sync_infer_request() const override; +protected: + /** + * @brief Method creates infer request implementation + * + * @return Sync infer request + */ + virtual std::shared_ptr create_sync_infer_request() const override; - private: - gguf_context* m_gguf_ctx = nullptr; - std::string m_gguf_fname; +private: + gguf_context* m_gguf_ctx = nullptr; + std::string m_gguf_fname; - llama_model* 
m_llama_model_ptr = nullptr;
- llama_context* m_llama_ctx = nullptr;
- std::shared_ptr m_fake_model;
+ llama_model* m_llama_model_ptr = nullptr;
+ llama_context* m_llama_ctx = nullptr;
+ std::shared_ptr m_fake_model;
- std::vector> m_fake_inputs;
- std::vector> m_fake_outputs;
+ std::vector> m_fake_inputs;
+ std::vector> m_fake_outputs;
- friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest;
- friend class ov::llama_cpp_plugin::LlamaCppState;
- };
- }
+ friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest;
+ friend class ov::llama_cpp_plugin::LlamaCppState;
+};
+} // namespace llama_cpp_plugin
} // namespace ov
#endif // LLAMA_CPP_COMPILED_MODEL_HPP
diff --git a/modules/llama_cpp_plugin/include/infer_request.hpp b/modules/llama_cpp_plugin/include/infer_request.hpp
index e8a0da65d..8f298ab57 100644
--- a/modules/llama_cpp_plugin/include/infer_request.hpp
+++ b/modules/llama_cpp_plugin/include/infer_request.hpp
@@ -4,17 +4,16 @@
#ifndef LLAMA_CPP_INFER_REQUEST_HPP
#define LLAMA_CPP_INFER_REQUEST_HPP
-#include "openvino/openvino.hpp"
#include "compiled_model.hpp"
+#include "openvino/openvino.hpp"
namespace ov {
namespace llama_cpp_plugin {
-
class LlamaCppSyncInferRequest : public ISyncInferRequest {
public:
explicit LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model);
- virtual ~LlamaCppSyncInferRequest() {};
+ virtual ~LlamaCppSyncInferRequest(){};
virtual void set_tensors_impl(const ov::Output port,
const std::vector>& tensors) override;
@@ -22,11 +21,12 @@ class LlamaCppSyncInferRequest : public ISyncInferRequest {
virtual void infer() override;
virtual std::vector get_profiling_info() const override;
virtual std::vector> query_state() const override;
+
private:
std::shared_ptr m_compiled_model_ptr;
};
-} // namespace LlamaCppPlugin
+} // namespace llama_cpp_plugin
}; // namespace ov
#endif /* LLAMA_CPP_INFER_REQUEST_HPP */
diff --git a/modules/llama_cpp_plugin/include/plugin.hpp b/modules/llama_cpp_plugin/include/plugin.hpp
index f68ebd3d6..1bcb6abbd 100644
--- a/modules/llama_cpp_plugin/include/plugin.hpp
+++ b/modules/llama_cpp_plugin/include/plugin.hpp
@@ -7,42 +7,40 @@
#include "openvino/runtime/iplugin.hpp"
namespace ov {
- namespace llama_cpp_plugin {
- class LlamaCppPlugin : public IPlugin {
- public:
- LlamaCppPlugin();
- virtual std::shared_ptr compile_model(const std::shared_ptr& model,
- const ov::AnyMap& properties) const override;
+namespace llama_cpp_plugin {
+class LlamaCppPlugin : public IPlugin {
+public:
+ LlamaCppPlugin();
+ virtual std::shared_ptr compile_model(const std::shared_ptr& model,
+ const ov::AnyMap& properties) const override;
- virtual std::shared_ptr compile_model(const std::shared_ptr& model,
- const ov::AnyMap& properties,
- const ov::SoPtr& context) const override;
+ virtual std::shared_ptr compile_model(
+ const std::shared_ptr& model,
+ const ov::AnyMap& properties,
+ const ov::SoPtr& context) const override;
- virtual void set_property(const ov::AnyMap& properties) override;
+ virtual void set_property(const ov::AnyMap& properties) override;
- virtual ov::Any get_property(const std::string& name, const ov::AnyMap& arguments) const override;
+ virtual ov::Any get_property(const std::string& name, const ov::AnyMap& arguments) const override;
- virtual ov::SoPtr create_context(const ov::AnyMap& remote_properties) const override;
+ virtual ov::SoPtr create_context(const ov::AnyMap& remote_properties) const override;
- virtual ov::SoPtr get_default_context(const ov::AnyMap& remote_properties) const override;
+ virtual ov::SoPtr get_default_context(const ov::AnyMap& remote_properties) const override;
- virtual std::shared_ptr import_model(std::istream& model,
- const ov::AnyMap& properties) const override;
+ virtual std::shared_ptr import_model(std::istream& model,
+ const ov::AnyMap& properties) const override;
- virtual std::shared_ptr compile_model(const std::string& fname,
- const ov::AnyMap& properties) const override;
+ virtual std::shared_ptr compile_model(const std::string& fname,
+ const ov::AnyMap& properties) const override;
- virtual std::shared_ptr import_model(std::istream& model,
- const ov::SoPtr& context,
- const ov::AnyMap& properties) const override;
+ virtual std::shared_ptr import_model(std::istream& model,
+ const ov::SoPtr& context,
+ const ov::AnyMap& properties) const override;
- virtual ov::SupportedOpsMap query_model(const std::shared_ptr& model,
- const ov::AnyMap& properties) const override;
-
- private:
- std::string m_cache_dir = "";
- };
- } // namespace llama_cpp_plugin
+ virtual ov::SupportedOpsMap query_model(const std::shared_ptr& model,
+ const ov::AnyMap& properties) const override;
+};
+} // namespace llama_cpp_plugin
} // namespace ov
-#endif // LLAMA_CPP_PLUGIN_HPP
+#endif // LLAMA_CPP_PLUGIN_HPP
diff --git a/modules/llama_cpp_plugin/include/state.hpp b/modules/llama_cpp_plugin/include/state.hpp
index efd612bc3..229970894 100644
--- a/modules/llama_cpp_plugin/include/state.hpp
+++ b/modules/llama_cpp_plugin/include/state.hpp
@@ -4,21 +4,24 @@
#ifndef LLAMA_CPP_PLUGIN_HPP
#define LLAMA_CPP_PLUGIN_HPP
-#include "openvino/runtime/ivariable_state.hpp"
#include "compiled_model.hpp"
+#include "openvino/runtime/ivariable_state.hpp"
namespace ov {
- namespace llama_cpp_plugin {
- class LlamaCppState : public IVariableState {
- public:
- LlamaCppState() = delete;
- LlamaCppState(const std::shared_ptr& model_ptr) : m_model_ptr(model_ptr), IVariableState("llama_cpp_state") {}
- void reset() override {
- llama_kv_cache_clear(m_model_ptr->m_llama_ctx);
- }
- private:
- const std::shared_ptr& m_model_ptr;
- };
+namespace llama_cpp_plugin {
+class LlamaCppState : public IVariableState {
+public:
+ LlamaCppState() = delete;
+ LlamaCppState(const std::shared_ptr& model_ptr)
+ : m_model_ptr(model_ptr),
+ IVariableState("llama_cpp_state") {}
+ void reset() override {
+ llama_kv_cache_clear(m_model_ptr->m_llama_ctx);
}
-}
-#endif // LLAMA_CPP_STATE_HPP
+
+private:
+ const std::shared_ptr& m_model_ptr;
+};
+} // namespace llama_cpp_plugin
+} // namespace ov
+#endif // LLAMA_CPP_STATE_HPP
diff --git a/modules/llama_cpp_plugin/src/CMakeLists.txt b/modules/llama_cpp_plugin/src/CMakeLists.txt
index d99a44795..258df852f 100644
--- a/modules/llama_cpp_plugin/src/CMakeLists.txt
+++ b/modules/llama_cpp_plugin/src/CMakeLists.txt
@@ -25,14 +25,11 @@ endif()
# adds a shared library with plugin
ov_add_plugin(NAME ${TARGET_NAME}
- DEVICE_NAME ${PLUGIN_DEVICE_NAME}
- SOURCES ${SOURCES} ${HEADERS}
- ${skip_plugin}
- VERSION_DEFINES_FOR plugin.cpp
- ADD_CLANG_FORMAT)
-
-# Enable support of CC for the plugin
-ov_mark_target_as_cc(${TARGET_NAME})
+ DEVICE_NAME ${PLUGIN_DEVICE_NAME}
+ SOURCES ${SOURCES} ${HEADERS}
+ ${skip_plugin}
+ VERSION_DEFINES_FOR plugin.cpp
+ ADD_CLANG_FORMAT)
target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}"
diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp
index 5af82100d..adf9e17cf 100644
--- a/modules/llama_cpp_plugin/src/compiled_model.cpp
+++ b/modules/llama_cpp_plugin/src/compiled_model.cpp
@@ -17,23 +17,6 @@ namespace ov {
namespace llama_cpp_plugin {
-LlamaCppModel::LlamaCppModel(const std::shared_ptr& model,
- const std::shared_ptr& plugin,
- const ov::SoPtr& context,
- const std::shared_ptr& task_executor)
- : ICompiledModel(model, plugin, context, task_executor) {
- OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is "
- "supported for the LLAMA_CPP* plugins");
-}
-
-LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model,
- std::istream& input_stream,
- const std::shared_ptr& plugin)
- : ICompiledModel(ov_model, plugin) {
- OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is "
- "supported for the LLAMA_CPP* plugins");
-}
-
LlamaCppModel::~LlamaCppModel() {
llama_free(m_llama_ctx);
llama_free_model(m_llama_model_ptr);
diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp
index 77287555b..52536130c 100644
--- a/modules/llama_cpp_plugin/src/plugin.cpp
+++ b/modules/llama_cpp_plugin/src/plugin.cpp
@@ -23,38 +23,31 @@ LlamaCppPlugin::LlamaCppPlugin() : IPlugin() {
}
std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model,
const ov::AnyMap& properties) const {
- OPENVINO_DEBUG << "llama_cpp_plugin: LlamaCppPlugin::compile_model" << std::endl;
- return compile_model(model, properties, {});
+ OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is "
+ "supported for the LLAMA_CPP* plugins");
}
-std::shared_ptr LlamaCppPlugin::compile_model(const std::string& fname,
- const ov::AnyMap& properties) const {
- return std::make_shared(fname, shared_from_this());
-}
std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model,
const ov::AnyMap& properties,
const ov::SoPtr& context) const {
- OPENVINO_DEBUG << "llama_cpp_plugin: compile_model called in C++" << std::endl;
- return std::make_shared(model->clone(),
- shared_from_this(),
- context,
- get_executor_manager()->get_executor(template_exclusive_executor));
+ OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is "
+ "supported for the LLAMA_CPP* plugins");
+}
+std::shared_ptr LlamaCppPlugin::compile_model(const std::string& fname,
+ const ov::AnyMap& properties) const {
+ return std::make_shared(fname, shared_from_this());
}
void LlamaCppPlugin::set_property(const ov::AnyMap& properties) {
for (const auto& map_entry : properties) {
- if (map_entry.first == ov::cache_dir.name()) {
- m_cache_dir = map_entry.second.as();
- } else {
- OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: setting property ", map_entry.first, "not implemented");
- }
+ OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: setting property ", map_entry.first, "not implemented");
}
}
ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap& arguments) const {
if (ov::supported_properties == name) {
return decltype(ov::supported_properties)::value_type(
- std::vector({ov::cache_dir, ov::device::capabilities, ov::device::full_name}));
+ std::vector({ov::device::capabilities, ov::device::full_name}));
}
if (ov::device::capabilities == name) {
return decltype(ov::device::capabilities)::value_type(
@@ -65,9 +58,6 @@ ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap&
std::vector({ov::internal::caching_properties}));
}
- if (ov::cache_dir == name) {
- return m_cache_dir;
- }
if (ov::internal::caching_properties == name) {
return std::vector{ov::device::full_name};
}
diff --git a/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt
index ea96e9d3b..096ad46ad 100644
--- a/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt
+++ b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt
@@ -4,17 +4,17 @@ set(TARGET_NAME llama_cpp_e2e_tests)
ov_add_test_target(
- NAME ${TARGET_NAME}
- ROOT ${CMAKE_CURRENT_SOURCE_DIR}
- DEPENDENCIES
- llama_cpp_plugin
- LINK_LIBRARIES
- openvino::runtime::dev
- openvino::funcSharedTests
- INCLUDES
- "${OpenVINOTemplatePlugin_SOURCE_DIR}/include"
- ADD_CLANG_FORMAT
- LABELS
- OV UNIT TEMPLATE
-)
+ NAME ${TARGET_NAME}
+ ROOT ${CMAKE_CURRENT_SOURCE_DIR}
+ DEPENDENCIES
+ llama_cpp_plugin
+ LINK_LIBRARIES
+ openvino::runtime::dev
+ openvino::funcSharedTests
+ INCLUDES
+ "${LlamaCppPlugin_SOURCE_DIR}/include"
+ ADD_CLANG_FORMAT
+ LABELS
+ OV UNIT TEMPLATE
+ )

From de225a507574882956dfa48f3edf9fdcbd0088cf Mon Sep 17 00:00:00 2001
From: Vasily Shamporov
Date: Fri, 22 Mar 2024 20:29:53 +0100
Subject: [PATCH 26/27] Fix workflow indents

---
 .../llama_cpp_plugin_build_and_test.yml | 70 +++++++++----------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml
index 4f9ecb4d9..50ec92dc1 100644
--- a/.github/workflows/llama_cpp_plugin_build_and_test.yml
+++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml
@@ -1,38 +1,38 @@
name: llama_cpp_plugin_build_and_test
on:
- pull_request:
- types:
- - opened
+ pull_request:
+ types:
+ - opened
- reopened
- synchronize
paths:
- - 'modules/llama_cpp_plugin/**'
+ - 'modules/llama_cpp_plugin/**'
jobs:
- build_ubuntu20:
- runs-on: ubuntu-20.04
+ build_ubuntu20:
+ runs-on: ubuntu-20.04
steps:
- - name: Setup cmake
- uses: jwlawson/actions-setup-cmake@v1.14
- with:
- cmake-version: '3.24.x'
+ - name: Setup cmake
+ uses: jwlawson/actions-setup-cmake@v1.14
+ with:
+ cmake-version: '3.24.x'
- name: Checkout openvino_contrib
uses: actions/checkout@v3
with:
- submodules: recursive
- path: openvino_contrib
+ submodules: recursive
+ path: openvino_contrib
- name: Checkout openvino
uses: actions/checkout@v3
with:
- submodules: recursive
- repository: openvinotoolkit/openvino
- path: openvino
+ submodules: recursive
+ repository: openvinotoolkit/openvino
+ path: openvino
- name: CMake - configure
- run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules/llama_cpp_plugin -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON -DENABLE_PLUGIN_REGISTRATION=ON openvino
+ run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules/llama_cpp_plugin -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON -DENABLE_LLAMA_CPP_PLUGIN_REGISTRATION=ON openvino
- name: CMake - build
run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests
@@ -41,40 +41,40 @@ jobs:
- name: Upload build artifacts
uses: actions/upload-artifact@v4
with:
- name: build_artifacts
- path: ${{ github.workspace }}/openvino/bin/intel64/Release/
+ name: build_artifacts
+ path: ${{ github.workspace }}/openvino/bin/intel64/Release/
-test_ubuntu20:
+ test_ubuntu20:
needs: build_ubuntu20
runs-on: ubuntu-20.04
steps:
- - name: Download build artifacts
- uses: actions/download-artifact@v4
- with:
- name: build_artifacts
+ - name: Download build artifacts
+ uses: actions/download-artifact@v4
+ with:
+ name: build_artifacts
path: ${{ github.workspace }}/binaries
- name: Prepare test data - checkout llama.cpp repo
uses: actions/checkout@v3
with:
- repository: ggerganov/llama.cpp
- path: llama.cpp
+ repository: ggerganov/llama.cpp
+ path: llama.cpp
- name: Prepare test data - convert test model files
run: |
- pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt
- huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2
- mkdir -p ${{ github.workspace }}/test_data
- python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf
+ pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt
+ huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2
+ mkdir -p ${{ github.workspace }}/test_data
+ python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf
- name: Install libtbb2
run: |
- wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz
- mkdir -p tbb
- tar xvzf oneapi-tbb-2021.2.4-lin.tgz
+ wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz
+ mkdir -p tbb
+ tar xvzf oneapi-tbb-2021.2.4-lin.tgz
- name: Run E2E tests
run: |
- chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
- export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib
- ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
+ chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
+ export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib
+ ${{ github.workspace }}/binaries/llama_cpp_e2e_tests

From aef994892ef3c37b427225818d5188f3a9503825 Mon Sep 17 00:00:00 2001
From: Vasily Shamporov
Date: Mon, 25 Mar 2024 17:27:25 +0100
Subject: [PATCH 27/27] Improve workflow

---
 .../workflows/llama_cpp_plugin_build_and_test.yml | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml
index 50ec92dc1..4d0af3bdf 100644
--- a/.github/workflows/llama_cpp_plugin_build_and_test.yml
+++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml
@@ -2,16 +2,12 @@ name: llama_cpp_plugin_build_and_test
on:
pull_request:
- types:
- - opened
- - reopened
- - synchronize
paths:
- 'modules/llama_cpp_plugin/**'
jobs:
build_ubuntu20:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-20.04-8-cores
steps:
- name: Setup cmake
uses: jwlawson/actions-setup-cmake@v1.14
@@ -19,13 +15,13 @@ jobs:
cmake-version: '3.24.x'
- name: Checkout openvino_contrib
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
submodules: recursive
path: openvino_contrib
- name: Checkout openvino
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
submodules: recursive
repository: openvinotoolkit/openvino
path: openvino
@@ -55,7 +51,7 @@ jobs:
path: ${{ github.workspace }}/binaries
- name: Prepare test data - checkout llama.cpp repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: ggerganov/llama.cpp
path: llama.cpp
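
For reference, a minimal usage sketch of the GGUF-only loading path that the series above converges on. This snippet is not part of the patches; the device name "LLAMA_CPP" and the model path are assumptions (the CUDA build registers a separate "LLAMA_CPP_CUDA" device), and it only lists the compiled model's ports rather than running generation.

// Minimal sketch, assuming the plugin is registered under the device name
// "LLAMA_CPP" and that /path/to/gpt2.gguf is a GGUF file such as the one
// produced by the e2e workflow above.
#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;

    // The std::string overload of compile_model() is the only supported entry
    // point after these patches; compiling an ov::Model throws NOT_IMPLEMENTED.
    ov::CompiledModel compiled = core.compile_model("/path/to/gpt2.gguf", "LLAMA_CPP");

    // Inspect the input ports the compiled model exposes instead of
    // hard-coding tensor names.
    for (const auto& input : compiled.inputs()) {
        std::cout << "input shape: " << input.get_partial_shape() << std::endl;
    }

    ov::InferRequest request = compiled.create_infer_request();
    return 0;
}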