Commit d5447c9
Apply comments
1 parent 53fe441 commit d5447c9
11 files changed (+177 -216 lines)
@@ -1,38 +1,38 @@
 name: llama_cpp_plugin_build_and_test
 
 on:
-  pull_request:
-    types:
-      - opened
+  pull_request:
+    types:
+      - opened
       - reopened
       - synchronize
     paths:
-      - 'modules/llama_cpp_plugin/**'
+      - 'modules/llama_cpp_plugin/**'
 
 jobs:
-  build_ubuntu20:
-    runs-on: ubuntu-20.04
+  build_ubuntu20:
+    runs-on: ubuntu-20.04
     steps:
-      - name: Setup cmake
-        uses: jwlawson/actions-setup-cmake@v1.14
-        with:
-          cmake-version: '3.24.x'
+      - name: Setup cmake
+        uses: jwlawson/actions-setup-cmake@v1.14
+        with:
+          cmake-version: '3.24.x'
 
       - name: Checkout openvino_contrib
         uses: actions/checkout@v3
         with:
-          submodules: recursive
-          path: openvino_contrib
+          submodules: recursive
+          path: openvino_contrib
 
       - name: Checkout openvino
         uses: actions/checkout@v3
         with:
-          submodules: recursive
-          repository: openvinotoolkit/openvino
-          path: openvino
+          submodules: recursive
+          repository: openvinotoolkit/openvino
+          path: openvino
 
       - name: CMake - configure
-        run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON openvino
+        run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules/llama_cpp_plugin -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON -DENABLE_PLUGIN_REGISTRATION=ON openvino
 
       - name: CMake - build
         run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests
@@ -41,40 +41,40 @@ jobs:
       - name: Upload build artifacts
         uses: actions/upload-artifact@v4
         with:
-          name: build_artifacts
-          path: ${{ github.workspace }}/openvino/bin/intel64/Release/
+          name: build_artifacts
+          path: ${{ github.workspace }}/openvino/bin/intel64/Release/
 
-  test_ubuntu20:
+  test_ubuntu20:
     needs: build_ubuntu20
     runs-on: ubuntu-20.04
     steps:
-      - name: Download build artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: build_artifacts
+      - name: Download build artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: build_artifacts
           path: ${{ github.workspace }}/binaries
 
       - name: Prepare test data - checkout llama.cpp repo
         uses: actions/checkout@v3
         with:
-          repository: ggerganov/llama.cpp
-          path: llama.cpp
+          repository: ggerganov/llama.cpp
+          path: llama.cpp
 
       - name: Prepare test data - convert test model files
         run: |
-          pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt
-          huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2
-          mkdir -p ${{ github.workspace }}/test_data
-          python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf
+          pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt
+          huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2
+          mkdir -p ${{ github.workspace }}/test_data
+          python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf
 
       - name: Install libtbb2
         run: |
-          wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz
-          mkdir -p tbb
-          tar xvzf oneapi-tbb-2021.2.4-lin.tgz
+          wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz
+          mkdir -p tbb
+          tar xvzf oneapi-tbb-2021.2.4-lin.tgz
 
       - name: Run E2E tests
         run: |
-          chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
-          export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib
-          ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
+          chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
+          export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib
+          ${{ github.workspace }}/binaries/llama_cpp_e2e_tests

modules/llama_cpp_plugin/CMakeLists.txt (+5 -5)

@@ -7,15 +7,15 @@ project(LlamaCppPlugin)
 
 find_package(OpenVINODeveloperPackage REQUIRED)
 
-ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" ON)
+ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" OFF)
 
 add_subdirectory(src)
 
 FetchContent_Declare(
-  llama_cpp
-  GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-  GIT_TAG b2417
-)
+    llama_cpp
+    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+    GIT_TAG b2417
+    )
 
 FetchContent_MakeAvailable(llama_cpp)
modules/llama_cpp_plugin/README.md (+5 -5)

@@ -6,19 +6,19 @@ This plugin should be built in the same fashion as the rest of the modules:
 2. Configure the CMake build of the OpenVINO repository, making sure to point the corresponding CMake option to the location of the `openvino_contrib` repository. The command below, executed in the `openvino` repo root, will configure the build so that the modules other `llama_cpp_plugin` module will not be built to save build time - adjust the `-DBUILD_*` options if you need the other modules as well.
 
 ```bash
-cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=<PATH_TO_YOUR_CHECKED_OUT_OPENVINO_CONTRIB>/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_PLUGINS_XML=ON .
+cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=<PATH_TO_YOUR_CHECKED_OUT_OPENVINO_CONTRIB>/modules/llama_cpp_plugin -DENABLE_PLUGINS_XML=ON -DENABLE_PLUGIN_REGISTRATION=ON .
 ```
 
 3. Build the plugin either as part of the complete openvino build by executing:
 
 ```bash
-cmake --build build -j`nproc`
+cmake --build build --parallel
 ```
 
 or separately by specifying only the `llama_cpp_plugin` target:
 
 ```bash
-cmake --build build -j`nproc` -- llama_cpp_plugin
+cmake --build build --parallel -- llama_cpp_plugin
 ```
 
 4. Now you can utilize the built `libllama_cpp_plugin.so` as a regular OV plugin with the device name `"LLAMA_CPP"` to directly load GGUF files and infer them using OV API with llama.cpp execution under the hood. Make sure that the plugin is discoverable by the OV runtime (e.g. by putting the built `libllama_cpp_plugin.so`, `libllama.so` and the autogenerated `plugins.xml` from the built location to your OV binaries location, or by setting `LD_LIBRARY_PATH` appropriately).

@@ -28,7 +28,7 @@ cmake --build build -j`nproc` -- llama_cpp_plugin
 ```C++
 
 ov::Core core;
-auto model = core.compile_model("model.gguf", "LLAMA_CPP")
+auto model = core.compile_model("model.gguf", "LLAMA_CPP")
 auto input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
 auto position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
 std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);

@@ -43,7 +43,7 @@ float* logits = lm.get_tensor("logits").data<float>() + (input_ids_tensor.get_si
 int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
 ```
 
-The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with equivalent meaning to the corresponding arguments in the LLM model representations in the huggingface `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution.
+The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with equivalent meaning to the corresponding arguments in the LLM model representations in the huggingface `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution.
 
 Only batch size of 1 is currently supported.
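The input/output contract described in that README paragraph can be exercised with a short greedy-decoding loop against the public OpenVINO 2.0 C++ API. The sketch below is illustrative only: the prompt token ids and `vocab_size` are placeholder assumptions, tokenization is out of scope, and it conservatively re-feeds the whole sequence on every step because the diff does not state whether the plugin keeps KV-cache state across `infer()` calls.

```C++
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto compiled = core.compile_model("model.gguf", "LLAMA_CPP");
    ov::InferRequest lm = compiled.create_infer_request();

    std::vector<int64_t> tokens = {464, 2068, 7586};  // placeholder prompt token ids
    const size_t vocab_size = 50257;                  // placeholder value, e.g. GPT-2

    for (size_t step = 0; step < 16; ++step) {
        const size_t seq_len = tokens.size();
        ov::Tensor input_ids(ov::element::i64, {1, seq_len});
        ov::Tensor position_ids(ov::element::i64, {1, seq_len});
        std::copy(tokens.begin(), tokens.end(), input_ids.data<int64_t>());
        std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + seq_len, 0);

        lm.set_tensor("input_ids", input_ids);
        lm.set_tensor("position_ids", position_ids);
        lm.infer();

        // Greedy argmax over the logits of the last position only.
        const float* logits = lm.get_tensor("logits").data<float>() + (seq_len - 1) * vocab_size;
        int64_t next_token = std::max_element(logits, logits + vocab_size) - logits;
        tokens.push_back(next_token);  // batch size 1, as stated in the README
    }
    return 0;
}
```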

modules/llama_cpp_plugin/include/compiled_model.hpp (+57 -67)

@@ -4,85 +4,75 @@
 #ifndef LLAMA_CPP_COMPILED_MODEL_HPP
 #define LLAMA_CPP_COMPILED_MODEL_HPP
 
+#include "llama.h"
 #include "openvino/runtime/icompiled_model.hpp"
 #include "openvino/runtime/isync_infer_request.hpp"
-#include "llama.h"
 
 namespace ov {
-namespace llama_cpp_plugin {
-    class LlamaCppSyncInferRequest;
-    class LlamaCppPlugin;
-    class LlamaCppState;
-    class LlamaCppModel: public ICompiledModel {
-    public:
-        LlamaCppModel(const std::shared_ptr<ov::Model>& model,
-                      const std::shared_ptr<const ov::IPlugin>& plugin,
-                      const ov::SoPtr<ov::IRemoteContext>& context,
-                      const std::shared_ptr<ov::threading::ITaskExecutor>& task_executor
-        );
-
-        LlamaCppModel(const std::shared_ptr<ov::Model>& ov_model,
-                      std::istream& input_file,
-                      const std::shared_ptr<const IPlugin>& plugin);
+namespace llama_cpp_plugin {
+class LlamaCppSyncInferRequest;
+class LlamaCppPlugin;
+class LlamaCppState;
+class LlamaCppModel : public ICompiledModel {
+public:
+    LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr<const IPlugin>& plugin);
+    /**
+     * @brief Export compiled model to stream
+     *
+     * @param model output stream
+     */
+    virtual void export_model(std::ostream& model) const override;
 
-        LlamaCppModel(const std::string& gguf_fname,
-                      const std::shared_ptr<const IPlugin>& plugin);
-        /**
-         * @brief Export compiled model to stream
-         *
-         * @param model output stream
-         */
-        virtual void export_model(std::ostream& model) const override;
+    /**
+     * @brief Returns runtime model
+     *
+     * @return OpenVINO Model which represents runtime graph
+     */
+    virtual std::shared_ptr<const ov::Model> get_runtime_model() const override;
 
-        /**
-         * @brief Returns runtime model
-         *
-         * @return OpenVINO Model which represents runtime graph
-         */
-        virtual std::shared_ptr<const ov::Model> get_runtime_model() const override;
+    /**
+     * @brief Allows to set property
+     *
+     * @param properties new plugin properties
+     */
+    virtual void set_property(const ov::AnyMap& properties) override;
 
-        /**
-         * @brief Allows to set property
-         *
-         * @param properties new plugin properties
-         */
-        virtual void set_property(const ov::AnyMap& properties) override;
+    /**
+     * @brief Returns property
+     *
+     * @param name Property name
+     *
+     * @return Property value
+     * virtual std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
+     **/
+    virtual ov::Any get_property(const std::string& name) const override;
+    virtual const std::vector<ov::Output<const ov::Node>>& inputs() const override;
+    virtual const std::vector<ov::Output<const ov::Node>>& outputs() const override;
+    virtual ~LlamaCppModel();
 
-        /**
-         * @brief Returns property
-         *
-         * @param name Property name
-         *
-         * @return Property value
-         * virtual std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
-         **/
-        virtual ov::Any get_property(const std::string& name) const override;
-        virtual const std::vector<ov::Output<const ov::Node>>& inputs() const override;
-        virtual const std::vector<ov::Output<const ov::Node>>& outputs() const override;
-        virtual ~LlamaCppModel();
-    protected:
-        /**
-         * @brief Method creates infer request implementation
-         *
-         * @return Sync infer request
-         */
-        virtual std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
+protected:
+    /**
+     * @brief Method creates infer request implementation
+     *
+     * @return Sync infer request
+     */
+    virtual std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
 
-    private:
-        gguf_context* m_gguf_ctx = nullptr;
-        std::string m_gguf_fname;
+private:
+    gguf_context* m_gguf_ctx = nullptr;
+    std::string m_gguf_fname;
 
-        llama_model* m_llama_model_ptr = nullptr;
-        llama_context* m_llama_ctx = nullptr;
-        std::shared_ptr<ov::Model> m_fake_model;
+    llama_model* m_llama_model_ptr = nullptr;
+    llama_context* m_llama_ctx = nullptr;
+    std::shared_ptr<ov::Model> m_fake_model;
 
-        std::vector<ov::Output<const ov::Node>> m_fake_inputs;
-        std::vector<ov::Output<const ov::Node>> m_fake_outputs;
+    std::vector<ov::Output<const ov::Node>> m_fake_inputs;
+    std::vector<ov::Output<const ov::Node>> m_fake_outputs;
 
-        friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest;
-        friend class ov::llama_cpp_plugin::LlamaCppState;
-    };
-}
+    friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest;
+    friend class ov::llama_cpp_plugin::LlamaCppState;
+};
+}  // namespace llama_cpp_plugin
 }  // namespace ov
 
 #endif  // LLAMA_CPP_COMPILED_MODEL_HPP

modules/llama_cpp_plugin/include/infer_request.hpp (+4 -4)

@@ -4,29 +4,29 @@
 #ifndef LLAMA_CPP_INFER_REQUEST_HPP
 #define LLAMA_CPP_INFER_REQUEST_HPP
 
-#include "openvino/openvino.hpp"
 #include "compiled_model.hpp"
+#include "openvino/openvino.hpp"
 
 namespace ov {
 namespace llama_cpp_plugin {
 
-
 class LlamaCppSyncInferRequest : public ISyncInferRequest {
 public:
     explicit LlamaCppSyncInferRequest(const std::shared_ptr<const LlamaCppModel>& compiled_model);
-    virtual ~LlamaCppSyncInferRequest() {};
+    virtual ~LlamaCppSyncInferRequest(){};
 
     virtual void set_tensors_impl(const ov::Output<const ov::Node> port,
                                   const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;
 
    virtual void infer() override;
    virtual std::vector<ov::ProfilingInfo> get_profiling_info() const override;
    virtual std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
+
 private:
    std::shared_ptr<const LlamaCppModel> m_compiled_model_ptr;
 };
 
-}  // namespace LlamaCppPlugin
+}  // namespace llama_cpp_plugin
 };  // namespace ov
 
 #endif /* LLAMA_CPP_INFER_REQUEST_HPP */
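The `query_state()` override above, together with the `LlamaCppState` forward declaration in `compiled_model.hpp`, indicates that the infer request can expose internal state (presumably the llama.cpp KV cache) through OpenVINO's standard variable-state mechanism. Under that assumption, which this diff does not spell out, dropping accumulated context between unrelated prompts would follow the usual stateful-model pattern:

```C++
#include <openvino/openvino.hpp>

// Hypothetical helper: reset whatever state the LLAMA_CPP infer request reports,
// e.g. before starting generation for a new, unrelated prompt.
void reset_llm_state(ov::InferRequest& lm) {
    for (auto&& state : lm.query_state()) {
        state.reset();  // clears the variable state exposed by the plugin
    }
}
```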
