
Commit e8f60ce: Merge branch 'master' into java-api
2 parents: 30b8088 + c4b3ef9

41 files changed: +1539 -35 lines (large commit; not all changed files are shown below)
New file (+82 lines): GitHub Actions workflow `llama_cpp_plugin_build_and_test`

```yaml
name: llama_cpp_plugin_build_and_test

on:
  pull_request:
    paths:
      - 'modules/llama_cpp_plugin/**'

jobs:
  build_ubuntu20:
    runs-on: ubuntu-20.04-8-cores
    steps:
      - name: Setup cmake
        uses: jwlawson/actions-setup-cmake@v1.14
        with:
          cmake-version: '3.24.x'

      - name: Checkout openvino_contrib
        uses: actions/checkout@v4
        with:
          submodules: recursive
          path: openvino_contrib

      - name: Checkout openvino
        uses: actions/checkout@v4
        with:
          submodules: recursive
          repository: openvinotoolkit/openvino
          path: openvino

      - name: CMake - configure
        run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules/llama_cpp_plugin -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON -DENABLE_LLAMA_CPP_PLUGIN_REGISTRATION=ON openvino

      - name: CMake - build
        run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests llama_cpp_func_tests

      - name: Upload build artifacts
        uses: actions/upload-artifact@v4
        with:
          name: build_artifacts
          path: ${{ github.workspace }}/openvino/bin/intel64/Release/

  test_ubuntu20:
    needs: build_ubuntu20
    runs-on: ubuntu-20.04
    steps:
      - name: Download build artifacts
        uses: actions/download-artifact@v4
        with:
          name: build_artifacts
          path: ${{ github.workspace }}/binaries

      - name: Prepare test data - checkout llama.cpp repo
        uses: actions/checkout@v4
        with:
          repository: ggerganov/llama.cpp
          path: llama.cpp

      - name: Prepare test data - convert test model files
        run: |
          pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt
          huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2
          mkdir -p ${{ github.workspace }}/test_data
          python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf

      - name: Install libtbb2
        run: |
          wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz
          mkdir -p tbb
          tar xvzf oneapi-tbb-2021.2.4-lin.tgz

      - name: Run functional tests
        run: |
          chmod +x ${{ github.workspace }}/binaries/llama_cpp_func_tests
          export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib
          ${{ github.workspace }}/binaries/llama_cpp_func_tests

      - name: Run E2E tests
        run: |
          chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
          export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib
          ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
```
New file (+28 lines): `.clang-format` configuration for the module

```yaml
BasedOnStyle: Google
IndentWidth: 4
UseTab: Never
ColumnLimit: 120

Language: Cpp
Standard: Cpp11

AccessModifierOffset: -4
AlignConsecutiveMacros: true
AllowAllArgumentsOnNextLine: false
AllowAllConstructorInitializersOnNextLine: false
AllowAllParametersOfDeclarationOnNextLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Empty
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: false
BinPackArguments: false
BinPackParameters: false
CommentPragmas: '^#'
DerivePointerAlignment: false
FixNamespaceComments: true
IndentCaseLabels: false
IndentPPDirectives: AfterHash
ForEachMacros:
  - foreach
  - FOREACH_CHILD
```
New file (+36 lines): CMake project file for `LlamaCppPlugin`

```cmake
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

cmake_minimum_required(VERSION 3.13)

project(LlamaCppPlugin)

find_package(OpenVINODeveloperPackage REQUIRED)

ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" OFF)

add_subdirectory(src)

include(FetchContent)

FetchContent_Declare(
    llama_cpp
    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
    GIT_TAG b2417
)

FetchContent_MakeAvailable(llama_cpp)

if(ENABLE_TESTS)
    include(CTest)
    enable_testing()
    add_subdirectory(tests/common)
    add_subdirectory(tests/e2e)
    add_subdirectory(tests/functional)
endif()

# install

if(OpenVINODeveloperPackage_FOUND)
    ov_cpack(LlamaCppPlugin)
endif()
```

modules/llama_cpp_plugin/README.md (new file, +52 lines)

### Build instructions

This plugin should be built in the same fashion as the rest of the modules:

1. Check out the OpenVINO repository proper (https://github.com/openvinotoolkit/openvino)

2. Configure the CMake build of the OpenVINO repository, making sure to point the corresponding CMake option to the location of the `openvino_contrib` repository. The command below, executed in the `openvino` repo root, will configure the build so that modules other than the `llama_cpp_plugin` module are not built, to save build time; adjust the `-DBUILD_*` options if you need the other modules as well.

```bash
cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=<PATH_TO_YOUR_CHECKED_OUT_OPENVINO_CONTRIB>/modules/llama_cpp_plugin -DENABLE_PLUGINS_XML=ON -DENABLE_PLUGIN_REGISTRATION=ON .
```

3. Build the plugin either as part of the complete openvino build by executing:

```bash
cmake --build build --parallel
```

or separately by specifying only the `llama_cpp_plugin` target:

```bash
cmake --build build --parallel -- llama_cpp_plugin
```

4. Now you can utilize the built `libllama_cpp_plugin.so` as a regular OV plugin with the device name `"LLAMA_CPP"` to directly load GGUF files and infer them using the OV API with llama.cpp execution under the hood. Make sure that the plugin is discoverable by the OV runtime (e.g. by putting the built `libllama_cpp_plugin.so`, `libllama.so` and the autogenerated `plugins.xml` from the build location into your OV binaries location, or by setting `LD_LIBRARY_PATH` appropriately).

#### Example of LLM inference code

```C++
ov::Core core;
auto model = core.compile_model("model.gguf", "LLAMA_CPP");

auto input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
auto position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);

auto infer_request = model.create_infer_request();
infer_request.set_tensor("input_ids", input_ids);
infer_request.set_tensor("position_ids", position_ids);
infer_request.infer();

size_t vocab_size = infer_request.get_tensor("logits").get_shape().back();
float* logits = infer_request.get_tensor("logits").data<float>() + (input_ids.get_size() - 1) * vocab_size;
int64_t out_token = std::max_element(logits, logits + vocab_size) - logits;
```

The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with meanings equivalent to the corresponding arguments of the LLM model representations in the Hugging Face `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution.

Only a batch size of 1 is currently supported.
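Building on the snippet above, a token-by-token greedy generation loop might look like the following. This is a minimal illustrative sketch rather than documented plugin behaviour: it assumes the plugin keeps the llama.cpp KV cache alive between `infer()` calls (the state header below suggests this), that a follow-up step can feed a single new token together with its absolute position in `position_ids`, and the helper name `greedy_generate` is made up for the example.

```C++
// Illustrative greedy-decoding sketch, not part of the plugin API. Assumes the KV cache
// persists across infer() calls, so after the prompt pass only the newly generated token
// and its absolute position are fed on each step.
#include <openvino/openvino.hpp>

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

std::vector<int64_t> greedy_generate(ov::InferRequest& lm,
                                     std::vector<int64_t> prompt,
                                     size_t max_new_tokens) {
    // Argmax over the logits row of the last token fed in the previous infer() call.
    auto argmax_last = [&lm](size_t n_fed_tokens) {
        auto logits_tensor = lm.get_tensor("logits");
        size_t vocab_size = logits_tensor.get_shape().back();
        float* logits = logits_tensor.data<float>() + (n_fed_tokens - 1) * vocab_size;
        return static_cast<int64_t>(std::max_element(logits, logits + vocab_size) - logits);
    };

    // Prompt pass: all prompt tokens at once, positions 0..N-1 (batch size 1 only).
    ov::Tensor input_ids(ov::element::i64, {1, prompt.size()}, prompt.data());
    ov::Tensor position_ids(ov::element::i64, {1, prompt.size()});
    std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + prompt.size(), int64_t{0});
    lm.set_tensor("input_ids", input_ids);
    lm.set_tensor("position_ids", position_ids);
    lm.infer();

    std::vector<int64_t> generated;
    int64_t next_token = argmax_last(prompt.size());
    for (size_t i = 0; i < max_new_tokens; ++i) {
        generated.push_back(next_token);
        // Single-token step: the position continues from the end of everything fed so far.
        int64_t next_position = static_cast<int64_t>(prompt.size() + i);
        ov::Tensor step_ids(ov::element::i64, {1, 1}, &next_token);
        ov::Tensor step_positions(ov::element::i64, {1, 1}, &next_position);
        lm.set_tensor("input_ids", step_ids);
        lm.set_tensor("position_ids", step_positions);
        lm.infer();
        next_token = argmax_last(1);
    }
    return generated;
}
```

Sampling strategies, EOS handling and tokenization/detokenization are deliberately left out of the sketch.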
New file (+78 lines): `compiled_model.hpp`, declaring `ov::llama_cpp_plugin::LlamaCppModel`

```C++
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#ifndef LLAMA_CPP_COMPILED_MODEL_HPP
#define LLAMA_CPP_COMPILED_MODEL_HPP

#include "llama.h"
#include "openvino/runtime/icompiled_model.hpp"
#include "openvino/runtime/isync_infer_request.hpp"

namespace ov {
namespace llama_cpp_plugin {
class LlamaCppSyncInferRequest;
class LlamaCppPlugin;
class LlamaCppState;

class LlamaCppModel : public ICompiledModel {
public:
    LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr<const IPlugin>& plugin, size_t num_threads = 0);

    /**
     * @brief Export compiled model to stream
     *
     * @param model output stream
     */
    virtual void export_model(std::ostream& model) const override;

    /**
     * @brief Returns runtime model
     *
     * @return OpenVINO Model which represents runtime graph
     */
    virtual std::shared_ptr<const ov::Model> get_runtime_model() const override;

    /**
     * @brief Allows to set property
     *
     * @param properties new plugin properties
     */
    virtual void set_property(const ov::AnyMap& properties) override;

    /**
     * @brief Returns property
     *
     * @param name Property name
     *
     * @return Property value
     */
    virtual ov::Any get_property(const std::string& name) const override;

    virtual const std::vector<ov::Output<const ov::Node>>& inputs() const override;
    virtual const std::vector<ov::Output<const ov::Node>>& outputs() const override;
    virtual ~LlamaCppModel();

protected:
    /**
     * @brief Method creates infer request implementation
     *
     * @return Sync infer request
     */
    virtual std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override;

private:
    gguf_context* m_gguf_ctx = nullptr;
    std::string m_gguf_fname;

    llama_model* m_llama_model_ptr = nullptr;
    llama_context* m_llama_ctx = nullptr;
    std::shared_ptr<ov::Model> m_fake_model;

    std::vector<ov::Output<const ov::Node>> m_fake_inputs;
    std::vector<ov::Output<const ov::Node>> m_fake_outputs;

    friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest;
    friend class ov::llama_cpp_plugin::LlamaCppState;
};
}  // namespace llama_cpp_plugin
}  // namespace ov

#endif  // LLAMA_CPP_COMPILED_MODEL_HPP
```
New file (+32 lines): header declaring `ov::llama_cpp_plugin::LlamaCppSyncInferRequest`

```C++
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#ifndef LLAMA_CPP_INFER_REQUEST_HPP
#define LLAMA_CPP_INFER_REQUEST_HPP

#include "compiled_model.hpp"
#include "openvino/openvino.hpp"

namespace ov {
namespace llama_cpp_plugin {

class LlamaCppSyncInferRequest : public ISyncInferRequest {
public:
    explicit LlamaCppSyncInferRequest(const std::shared_ptr<const LlamaCppModel>& compiled_model);
    virtual ~LlamaCppSyncInferRequest(){};

    virtual void set_tensors_impl(const ov::Output<const ov::Node> port,
                                  const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;

    virtual void infer() override;
    virtual std::vector<ov::ProfilingInfo> get_profiling_info() const override;
    virtual std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override;

private:
    std::shared_ptr<const LlamaCppModel> m_compiled_model_ptr;
};

}  // namespace llama_cpp_plugin
}  // namespace ov

#endif /* LLAMA_CPP_INFER_REQUEST_HPP */
```
New file (+49 lines): header declaring `ov::llama_cpp_plugin::LlamaCppPlugin`

```C++
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#ifndef LLAMA_CPP_PLUGIN_HPP
#define LLAMA_CPP_PLUGIN_HPP

#include "openvino/runtime/iplugin.hpp"

namespace ov {
namespace llama_cpp_plugin {
class LlamaCppPlugin : public IPlugin {
public:
    LlamaCppPlugin();
    virtual std::shared_ptr<ov::ICompiledModel> compile_model(const std::shared_ptr<const ov::Model>& model,
                                                              const ov::AnyMap& properties) const override;

    virtual std::shared_ptr<ov::ICompiledModel> compile_model(
        const std::shared_ptr<const ov::Model>& model,
        const ov::AnyMap& properties,
        const ov::SoPtr<ov::IRemoteContext>& context) const override;

    virtual void set_property(const ov::AnyMap& properties) override;

    virtual ov::Any get_property(const std::string& name, const ov::AnyMap& arguments) const override;

    virtual ov::SoPtr<ov::IRemoteContext> create_context(const ov::AnyMap& remote_properties) const override;

    virtual ov::SoPtr<ov::IRemoteContext> get_default_context(const ov::AnyMap& remote_properties) const override;

    virtual std::shared_ptr<ov::ICompiledModel> import_model(std::istream& model,
                                                             const ov::AnyMap& properties) const override;

    virtual std::shared_ptr<ov::ICompiledModel> compile_model(const std::string& fname,
                                                              const ov::AnyMap& properties) const override;

    virtual std::shared_ptr<ov::ICompiledModel> import_model(std::istream& model,
                                                             const ov::SoPtr<ov::IRemoteContext>& context,
                                                             const ov::AnyMap& properties) const override;

    virtual ov::SupportedOpsMap query_model(const std::shared_ptr<const ov::Model>& model,
                                            const ov::AnyMap& properties) const override;

private:
    size_t m_num_threads = 0;
};
}  // namespace llama_cpp_plugin
}  // namespace ov

#endif  // LLAMA_CPP_PLUGIN_HPP
```
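The `LlamaCppPlugin` class above is what the OV runtime instantiates for the `LLAMA_CPP` device. If relying on the generated `plugins.xml` described in the README is inconvenient, the Core can presumably also be pointed at the built shared library explicitly. A minimal sketch, assuming the standard `ov::Core::register_plugin` API; the library path is illustrative:

```C++
// Hypothetical alternative to plugins.xml-based discovery: register the built plugin
// library with ov::Core explicitly, then use it via the "LLAMA_CPP" device name.
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // The path below is an assumption; point it at your actual build output.
    core.register_plugin("/path/to/openvino/bin/intel64/Release/libllama_cpp_plugin.so", "LLAMA_CPP");
    auto compiled_model = core.compile_model("model.gguf", "LLAMA_CPP");
    auto infer_request = compiled_model.create_infer_request();
    return 0;
}
```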
New file (+27 lines): header declaring `ov::llama_cpp_plugin::LlamaCppState`

```C++
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#ifndef LLAMA_CPP_STATE_HPP
#define LLAMA_CPP_STATE_HPP

#include "compiled_model.hpp"
#include "openvino/runtime/ivariable_state.hpp"

namespace ov {
namespace llama_cpp_plugin {
class LlamaCppState : public IVariableState {
public:
    LlamaCppState() = delete;
    LlamaCppState(const std::shared_ptr<const LlamaCppModel>& model_ptr)
        : IVariableState("llama_cpp_state"),
          m_model_ptr(model_ptr) {}

    // Resetting the state drops the llama.cpp KV cache of the underlying context.
    void reset() override {
        llama_kv_cache_clear(m_model_ptr->m_llama_ctx);
    }

private:
    std::shared_ptr<const LlamaCppModel> m_model_ptr;
};
}  // namespace llama_cpp_plugin
}  // namespace ov

#endif  // LLAMA_CPP_STATE_HPP
```
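Since the KV cache is surfaced through this `IVariableState` implementation, a client can drop it between unrelated prompts through the regular OpenVINO state API on the public infer request. A minimal sketch, assuming `ov::InferRequest::query_state()` exposes the plugin's single `llama_cpp_state` variable; the model path and device name mirror the README example:

```C++
// Hypothetical usage sketch: clearing the llama.cpp KV cache between two prompts by
// resetting the variable states exposed by the plugin's infer request.
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto compiled_model = core.compile_model("model.gguf", "LLAMA_CPP");
    auto infer_request = compiled_model.create_infer_request();

    // ... run the first prompt through `infer_request` as in the README example ...

    // Reset every variable state; for this plugin the reset is assumed to call
    // llama_kv_cache_clear() on the underlying llama_context.
    for (auto& state : infer_request.query_state()) {
        state.reset();
    }

    // ... the next prompt now starts from an empty KV cache ...
    return 0;
}
```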
