Commit d5447c9
Apply comments
1 parent 53fe441 commit d5447c9
11 files changed (+177 -216 lines)
@@ -1,38 +1,38 @@
 name: llama_cpp_plugin_build_and_test
 
 on:
-  pull_request:
-    types:
-      - opened
+  pull_request:
+    types:
+      - opened
       - reopened
       - synchronize
     paths:
-      - 'modules/llama_cpp_plugin/**'
+      - 'modules/llama_cpp_plugin/**'
 
 jobs:
-  build_ubuntu20:
-    runs-on: ubuntu-20.04
+  build_ubuntu20:
+    runs-on: ubuntu-20.04
     steps:
-      - name: Setup cmake
-        uses: jwlawson/actions-setup-cmake@v1.14
-        with:
-          cmake-version: '3.24.x'
+      - name: Setup cmake
+        uses: jwlawson/actions-setup-cmake@v1.14
+        with:
+          cmake-version: '3.24.x'
 
       - name: Checkout openvino_contrib
         uses: actions/checkout@v3
         with:
-          submodules: recursive
-          path: openvino_contrib
+          submodules: recursive
+          path: openvino_contrib
 
       - name: Checkout openvino
         uses: actions/checkout@v3
         with:
-          submodules: recursive
-          repository: openvinotoolkit/openvino
-          path: openvino
+          submodules: recursive
+          repository: openvinotoolkit/openvino
+          path: openvino
 
       - name: CMake - configure
-        run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON openvino
+        run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules/llama_cpp_plugin -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON -DENABLE_PLUGIN_REGISTRATION=ON openvino
 
       - name: CMake - build
         run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests
@@ -41,40 +41,40 @@ jobs:
       - name: Upload build artifacts
         uses: actions/upload-artifact@v4
         with:
-          name: build_artifacts
-          path: ${{ github.workspace }}/openvino/bin/intel64/Release/
+          name: build_artifacts
+          path: ${{ github.workspace }}/openvino/bin/intel64/Release/
 
-  test_ubuntu20:
+  test_ubuntu20:
     needs: build_ubuntu20
     runs-on: ubuntu-20.04
     steps:
-      - name: Download build artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: build_artifacts
+      - name: Download build artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: build_artifacts
           path: ${{ github.workspace }}/binaries
 
       - name: Prepare test data - checkout llama.cpp repo
         uses: actions/checkout@v3
         with:
-          repository: ggerganov/llama.cpp
-          path: llama.cpp
+          repository: ggerganov/llama.cpp
+          path: llama.cpp
 
       - name: Prepare test data - convert test model files
         run: |
-          pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt
-          huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2
-          mkdir -p ${{ github.workspace }}/test_data
-          python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf
+          pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt
+          huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2
+          mkdir -p ${{ github.workspace }}/test_data
+          python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf
 
       - name: Install libtbb2
         run: |
-          wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz
-          mkdir -p tbb
-          tar xvzf oneapi-tbb-2021.2.4-lin.tgz
+          wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz
+          mkdir -p tbb
+          tar xvzf oneapi-tbb-2021.2.4-lin.tgz
 
       - name: Run E2E tests
         run: |
-          chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
-          export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib
-          ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
+          chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
+          export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib
+          ${{ github.workspace }}/binaries/llama_cpp_e2e_tests

modules/llama_cpp_plugin/CMakeLists.txt (+5 -5)

@@ -7,15 +7,15 @@ project(LlamaCppPlugin)
 
 find_package(OpenVINODeveloperPackage REQUIRED)
 
-ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" ON)
+ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" OFF)
 
 add_subdirectory(src)
 
 FetchContent_Declare(
-  llama_cpp
-  GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-  GIT_TAG b2417
-)
+    llama_cpp
+    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+    GIT_TAG b2417
+    )
 
 FetchContent_MakeAvailable(llama_cpp)
modules/llama_cpp_plugin/README.md (+5 -5)

@@ -6,19 +6,19 @@ This plugin should be built in the same fashion as the rest of the modules:
 2. Configure the CMake build of the OpenVINO repository, making sure to point the corresponding CMake option to the location of the `openvino_contrib` repository. The command below, executed in the `openvino` repo root, will configure the build so that the modules other `llama_cpp_plugin` module will not be built to save build time - adjust the `-DBUILD_*` options if you need the other modules as well.
 
 ```bash
-cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=<PATH_TO_YOUR_CHECKED_OUT_OPENVINO_CONTRIB>/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_PLUGINS_XML=ON .
+cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=<PATH_TO_YOUR_CHECKED_OUT_OPENVINO_CONTRIB>/modules/llama_cpp_plugin -DENABLE_PLUGINS_XML=ON -DENABLE_PLUGIN_REGISTRATION=ON .
 ```
 
 3. Build the plugin either as part of the complete openvino build by executing:
 
 ```bash
-cmake --build build -j`nproc`
+cmake --build build --parallel
 ```
 
 or separately by specifying only the `llama_cpp_plugin` target:
 
 ```bash
-cmake --build build -j`nproc` -- llama_cpp_plugin
+cmake --build build --parallel -- llama_cpp_plugin
 ```
 
 4. Now you can utilize the built `libllama_cpp_plugin.so` as a regular OV plugin with the device name `"LLAMA_CPP"` to directly load GGUF files and infer them using OV API with llama.cpp execution under the hood. Make sure that the plugin is discoverable by the OV runtime (e.g. by putting the built `libllama_cpp_plugin.so`, `libllama.so` and the autogenerated `plugins.xml` from the built location to your OV binaries location, or by setting `LD_LIBRARY_PATH` appropriately).

@@ -28,7 +28,7 @@ cmake --build build -j`nproc` -- llama_cpp_plugin
 ```C++
 
 ov::Core core;
-auto model = core.compile_model("model.gguf", "LLAMA_CPP")
+auto model = core.compile_model("model.gguf", "LLAMA_CPP")
 auto input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
 auto position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
 std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);

@@ -43,7 +43,7 @@ float* logits = lm.get_tensor("logits").data<float>() + (input_ids_tensor.get_si
 int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
 ```
 
-The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with equivalent meaning to the corresponding arguments in the LLM model representations in the huggingface `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution.
+The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with equivalent meaning to the corresponding arguments in the LLM model representations in the huggingface `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution.
 
 Only batch size of 1 is currently supported.
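The input/output contract described in that README paragraph can be exercised with a short greedy-decoding loop against the public OpenVINO 2.0 C++ API. The sketch below is illustrative only: the prompt token ids and `vocab_size` are placeholder assumptions, tokenization is out of scope, and it conservatively re-feeds the whole sequence on every step because the diff does not state whether the plugin keeps KV-cache state across `infer()` calls.

```C++
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto compiled = core.compile_model("model.gguf", "LLAMA_CPP");
    ov::InferRequest lm = compiled.create_infer_request();

    std::vector<int64_t> tokens = {464, 2068, 7586};  // placeholder prompt token ids
    const size_t vocab_size = 50257;                  // placeholder value, e.g. GPT-2

    for (size_t step = 0; step < 16; ++step) {
        const size_t seq_len = tokens.size();
        ov::Tensor input_ids(ov::element::i64, {1, seq_len});
        ov::Tensor position_ids(ov::element::i64, {1, seq_len});
        std::copy(tokens.begin(), tokens.end(), input_ids.data<int64_t>());
        std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + seq_len, 0);

        lm.set_tensor("input_ids", input_ids);
        lm.set_tensor("position_ids", position_ids);
        lm.infer();

        // Greedy argmax over the logits of the last position only.
        const float* logits = lm.get_tensor("logits").data<float>() + (seq_len - 1) * vocab_size;
        int64_t next_token = std::max_element(logits, logits + vocab_size) - logits;
        tokens.push_back(next_token);  // batch size 1, as stated in the README
    }
    return 0;
}
```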

modules/llama_cpp_plugin/include/compiled_model.hpp (+57 -67)

@@ -4,85 +4,75 @@
 #ifndef LLAMA_CPP_COMPILED_MODEL_HPP
 #define LLAMA_CPP_COMPILED_MODEL_HPP
 
+#include "llama.h"
 #include "openvino/runtime/icompiled_model.hpp"
 #include "openvino/runtime/isync_infer_request.hpp"
-#include "llama.h"
 
 namespace ov {
-namespace llama_cpp_plugin {
-    class LlamaCppSyncInferRequest;
-    class LlamaCppPlugin;
-    class LlamaCppState;
-    class LlamaCppModel: public ICompiledModel {
-    public:
-        LlamaCppModel(const std::shared_ptr<ov::Model>& model,
-                      const std::shared_ptr<const ov::IPlugin>& plugin,
-                      const ov::SoPtr<ov::IRemoteContext>& context,
-                      const std::shared_ptr<ov::threading::ITaskExecutor>& task_executor
-        );
-
-        LlamaCppModel(const std::shared_ptr<ov::Model>& ov_model,
-                      std::istream& input_file,
-                      const std::shared_ptr<const IPlugin>& plugin);
+namespace llama_cpp_plugin {
+class LlamaCppSyncInferRequest;
+class LlamaCppPlugin;
+class LlamaCppState;
+class LlamaCppModel : public ICompiledModel {
+public:
+    LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr<const IPlugin>& plugin);
+    /**
+     * @brief Export compiled model to stream
+     *
+     * @param model output stream
+     */
+    virtual void export_model(std::ostream& model) const override;
 
-        LlamaCppModel(const std::string& gguf_fname,
-                      const std::shared_ptr<const IPlugin>& plugin);
-        /**
-         * @brief Export compiled model to stream
-         *
-         * @param model output stream
-         */
-        virtual void export_model(std::ostream& model) const override;
+    /**
+     * @brief Returns runtime model
+     *
+     * @return OpenVINO Model which represents runtime graph
+     */
+    virtual std::shared_ptr<const ov::Model> get_runtime_model() const override;
 
-        /**
-         * @brief Returns runtime model
-         *
-         * @return OpenVINO Model which represents runtime graph
-         */
-        virtual std::shared_ptr<const ov::Model> get_runtime_model() const override;
+    /**
+     * @brief Allows to set property
+     *
+     * @param properties new plugin properties
+     */
+    virtual void set_property(const ov::AnyMap& properties) override;
 
-        /**
-         * @brief Allows to set property
-         *
-         * @param properties new plugin properties
-         */
-        virtual void set_property(const ov::AnyMap& properties) override;
+    /**
+     * @brief Returns property
+     *
+     * @param name Property name
+     *
+     * @return Property value
+     * virtual std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
+     **/
+    virtual ov::Any get_property(const std::string& name) const override;
+    virtual const std::vector<ov::Output<const ov::Node>>& inputs() const override;
+    virtual const std::vector<ov::Output<const ov::Node>>& outputs() const override;
+    virtual ~LlamaCppModel();
 
-        /**
-         * @brief Returns property
-         *
-         * @param name Property name
-         *
-         * @return Property value
-         * virtual std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
-         **/
-        virtual ov::Any get_property(const std::string& name) const override;
-        virtual const std::vector<ov::Output<const ov::Node>>& inputs() const override;
-        virtual const std::vector<ov::Output<const ov::Node>>& outputs() const override;
-        virtual ~LlamaCppModel();
-    protected:
-        /**
-         * @brief Method creates infer request implementation
-         *
-         * @return Sync infer request
-         */
-        virtual std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
+protected:
+    /**
+     * @brief Method creates infer request implementation
+     *
+     * @return Sync infer request
+     */
+    virtual std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;
 
-    private:
-        gguf_context* m_gguf_ctx = nullptr;
-        std::string m_gguf_fname;
+private:
+    gguf_context* m_gguf_ctx = nullptr;
+    std::string m_gguf_fname;
 
-        llama_model* m_llama_model_ptr = nullptr;
-        llama_context* m_llama_ctx = nullptr;
-        std::shared_ptr<ov::Model> m_fake_model;
+    llama_model* m_llama_model_ptr = nullptr;
+    llama_context* m_llama_ctx = nullptr;
+    std::shared_ptr<ov::Model> m_fake_model;
 
-        std::vector<ov::Output<const ov::Node>> m_fake_inputs;
-        std::vector<ov::Output<const ov::Node>> m_fake_outputs;
+    std::vector<ov::Output<const ov::Node>> m_fake_inputs;
+    std::vector<ov::Output<const ov::Node>> m_fake_outputs;
 
-        friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest;
-        friend class ov::llama_cpp_plugin::LlamaCppState;
-    };
-}
+    friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest;
+    friend class ov::llama_cpp_plugin::LlamaCppState;
+};
+}  // namespace llama_cpp_plugin
 }  // namespace ov
 
 #endif  // LLAMA_CPP_COMPILED_MODEL_HPP

modules/llama_cpp_plugin/include/infer_request.hpp (+4 -4)

@@ -4,29 +4,29 @@
 #ifndef LLAMA_CPP_INFER_REQUEST_HPP
 #define LLAMA_CPP_INFER_REQUEST_HPP
 
-#include "openvino/openvino.hpp"
 #include "compiled_model.hpp"
+#include "openvino/openvino.hpp"
 
 namespace ov {
 namespace llama_cpp_plugin {
 
-
 class LlamaCppSyncInferRequest : public ISyncInferRequest {
 public:
     explicit LlamaCppSyncInferRequest(const std::shared_ptr<const LlamaCppModel>& compiled_model);
-    virtual ~LlamaCppSyncInferRequest() {};
+    virtual ~LlamaCppSyncInferRequest(){};
 
     virtual void set_tensors_impl(const ov::Output<const ov::Node> port,
                                   const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;
 
    virtual void infer() override;
    virtual std::vector<ov::ProfilingInfo> get_profiling_info() const override;
    virtual std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;
+
 private:
    std::shared_ptr<const LlamaCppModel> m_compiled_model_ptr;
 };
 
-}  // namespace LlamaCppPlugin
+}  // namespace llama_cpp_plugin
 };  // namespace ov
 
 #endif /* LLAMA_CPP_INFER_REQUEST_HPP */
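The `query_state()` override above, together with the `LlamaCppState` forward declaration in `compiled_model.hpp`, indicates that the infer request can expose internal state (presumably the llama.cpp KV cache) through OpenVINO's standard variable-state mechanism. Under that assumption, which this diff does not spell out, dropping accumulated context between unrelated prompts would follow the usual stateful-model pattern:

```C++
#include <openvino/openvino.hpp>

// Hypothetical helper: reset whatever state the LLAMA_CPP infer request reports,
// e.g. before starting generation for a new, unrelated prompt.
void reset_llm_state(ov::InferRequest& lm) {
    for (auto&& state : lm.query_state()) {
        state.reset();  // clears the variable state exposed by the plugin
    }
}
```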
