From 8cb5fd5e025dcd799f8dbe419696c7a25c4a3c1a Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 8 Mar 2024 16:52:02 +0100 Subject: [PATCH 01/27] Initial commit --- .gitmodules | 3 + modules/llama_cpp_plugin/CMakeLists.txt | 32 + modules/llama_cpp_plugin/build.sh | 19 + .../include/compiled_model.hpp | 84 ++ .../include/infer_request.hpp | 31 + modules/llama_cpp_plugin/include/plugin.hpp | 112 +++ modules/llama_cpp_plugin/src/CMakeLists.txt | 58 ++ .../llama_cpp_plugin/src/compiled_model.cpp | 732 ++++++++++++++++++ .../llama_cpp_plugin/src/infer_request.cpp | 111 +++ modules/llama_cpp_plugin/src/plugin.cpp | 152 ++++ modules/llama_cpp_plugin/tests/CMakeLists.txt | 37 + .../llama_cpp_plugin/third_party/llama.cpp | 1 + modules/llama_cpp_plugin/tools/CMakeLists.txt | 22 + .../llama_cpp_plugin/tools/cache_embedder.cpp | 53 ++ modules/llama_cpp_plugin/tools/runner.cpp | 73 ++ .../tools/tensor_comparator.cpp | 95 +++ 16 files changed, 1615 insertions(+) create mode 100644 .gitmodules create mode 100644 modules/llama_cpp_plugin/CMakeLists.txt create mode 100755 modules/llama_cpp_plugin/build.sh create mode 100644 modules/llama_cpp_plugin/include/compiled_model.hpp create mode 100644 modules/llama_cpp_plugin/include/infer_request.hpp create mode 100644 modules/llama_cpp_plugin/include/plugin.hpp create mode 100644 modules/llama_cpp_plugin/src/CMakeLists.txt create mode 100644 modules/llama_cpp_plugin/src/compiled_model.cpp create mode 100644 modules/llama_cpp_plugin/src/infer_request.cpp create mode 100644 modules/llama_cpp_plugin/src/plugin.cpp create mode 100644 modules/llama_cpp_plugin/tests/CMakeLists.txt create mode 160000 modules/llama_cpp_plugin/third_party/llama.cpp create mode 100644 modules/llama_cpp_plugin/tools/CMakeLists.txt create mode 100644 modules/llama_cpp_plugin/tools/cache_embedder.cpp create mode 100644 modules/llama_cpp_plugin/tools/runner.cpp create mode 100644 modules/llama_cpp_plugin/tools/tensor_comparator.cpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..29da379f7 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "modules/llama_cpp_plugin/third_party/llama.cpp"] + path = modules/llama_cpp_plugin/third_party/llama.cpp + url = https://github.com/vshampor/llama.cpp diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt new file mode 100644 index 000000000..f5d3284b2 --- /dev/null +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -0,0 +1,32 @@ +cmake_minimum_required(VERSION 3.13) + +project(LlamaCppPlugin) + +find_package(OpenVINODeveloperPackage REQUIRED) + +ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" OFF) + +if(CMAKE_COMPILER_IS_GNUCXX) + ov_add_compiler_flags(-Wall) +endif() + +add_subdirectory(src) +add_subdirectory(tools) + +add_subdirectory(third_party/llama.cpp) + +if(ENABLE_TESTS) + include(CTest) + enable_testing() + + if(ENABLE_FUNCTIONAL_TESTS) + add_subdirectory(tests/functional) + endif() +endif() + + +# install + +if(OpenVINODeveloperPackage_FOUND) + ov_cpack(LlamaCppPlugin) +endif() diff --git a/modules/llama_cpp_plugin/build.sh b/modules/llama_cpp_plugin/build.sh new file mode 100755 index 000000000..fa36b9e03 --- /dev/null +++ b/modules/llama_cpp_plugin/build.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e +# What we want to do is build the llama.cpp dependency for different backends and have a separate plugin for each such build type. 
+# Sadly, CMake does not reliably allow to add_subdirectory multiple times in the same build tree, let alone with different options, +# since this would lead to "duplicate targets". There doesn't seem to be a solution to this problem even still. Thus, will have to +# invoke the cmake configure and build stage separately for each llama.cpp backend type. + +BUILD_TYPE=$1 +COMMON_OPTS="-DOpenVINODeveloperPackage_DIR=/home/vshampor/work/openvino/build -DCMAKE_EXPORT_COMPILE_COMMANDS=1" + +# Regular CPU build of llama.cpp +cmake -S ./ -B ./build/cpu/ ${COMMON_OPTS} "$@" +cmake --build ./build/cpu/ -j --target llama --target llama_cpp_plugin + + +# CUDA build +cmake -S ./ -B ./build/cuda/ -DLLAMA_CUBLAS=1 -DPLUGIN_DEVICE_NAME="LLAMA_CPP_CUDA" -DPLUGIN_LIBRARY_NAME="llama_cpp_cuda_plugin" -DLLAMA_TARGET_NAME="llama_cuda" ${COMMON_OPTS} "$@" +cmake --build ./build/cuda/ -j --target llama_cuda --target llama_cpp_cuda_plugin diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp new file mode 100644 index 000000000..eb785e252 --- /dev/null +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -0,0 +1,84 @@ +#ifndef LLAMA_CPP_COMPILED_MODEL_HPP +#define LLAMA_CPP_COMPILED_MODEL_HPP + +#include "openvino/runtime/icompiled_model.hpp" +#include "openvino/runtime/isync_infer_request.hpp" +#include "llama.h" + +namespace ov { + namespace llama_cpp_plugin { + class LlamaCppSyncInferRequest; + class LlamaCppPlugin; + class LlamaCppModel: public ICompiledModel { + public: + LlamaCppModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const ov::SoPtr& context, + const std::shared_ptr& task_executor + ); + + LlamaCppModel(const std::shared_ptr& ov_model, + std::istream& input_file, + const std::shared_ptr& plugin); + + LlamaCppModel(const std::string& gguf_fname, + const std::shared_ptr& plugin); + /** + * @brief Export compiled model to stream + * + * @param model output stream + */ + virtual void export_model(std::ostream& model) const override; + + /** + * @brief Returns runtime model + * + * @return OpenVINO Model which represents runtime graph + */ + virtual std::shared_ptr get_runtime_model() const override; + + /** + * @brief Allows to set property + * + * @param properties new plugin properties + */ + virtual void set_property(const ov::AnyMap& properties) override; + + /** + * @brief Returns property + * + * @param name Property name + * + * @return Property value + * virtual std::shared_ptr create_sync_infer_request() const override; + **/ + virtual ov::Any get_property(const std::string& name) const override; + virtual const std::vector>& inputs() const override; + virtual const std::vector>& outputs() const override; + protected: + /** + * @brief Method creates infer request implementation + * + * @return Sync infer request + */ + virtual std::shared_ptr create_sync_infer_request() const override; + + private: + std::string get_current_gguf_file_path() const; + gguf_context* m_gguf_ctx = nullptr; + std::string m_converted_gguf_file_name; + + llama_model* m_llama_model_ptr = nullptr; + llama_context* m_llama_ctx = nullptr; + size_t* num_tokens_processed_ptr = nullptr; // TODO: (vshampor) find a better place for this kind of storage + std::shared_ptr m_model; + + std::vector> m_fake_inputs; + std::vector> m_fake_outputs; + + friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest; + }; + } +} // namespace ov + +#endif // LLAMA_CPP_COMPILED_MODEL_HPP diff --git 
a/modules/llama_cpp_plugin/include/infer_request.hpp b/modules/llama_cpp_plugin/include/infer_request.hpp new file mode 100644 index 000000000..b6314010b --- /dev/null +++ b/modules/llama_cpp_plugin/include/infer_request.hpp @@ -0,0 +1,31 @@ +#ifndef LLAMA_CPP_INFER_REQUEST_HPP +#define LLAMA_CPP_INFER_REQUEST_HPP + +#include "openvino/openvino.hpp" +#include "compiled_model.hpp" + +namespace ov { +namespace llama_cpp_plugin { + +class LlamaCppSyncInferRequest : public ISyncInferRequest { +public: + explicit LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model); + // explicit LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model): ov::ISyncInferRequest(compiled_model) { + // std::cout << "VSHAMPOR: infer request ctor called\n"; + // } + virtual ~LlamaCppSyncInferRequest() {}; + + virtual void set_tensors_impl(const ov::Output port, + const std::vector>& tensors) override; + + virtual void infer() override; + virtual std::vector get_profiling_info() const override; + virtual std::vector> query_state() const override; +private: + std::shared_ptr m_compiled_model_ptr; +}; + +} // namespace LlamaCppPlugin +}; // namespace ov + +#endif /* LLAMA_CPP_INFER_REQUEST_HPP */ diff --git a/modules/llama_cpp_plugin/include/plugin.hpp b/modules/llama_cpp_plugin/include/plugin.hpp new file mode 100644 index 000000000..aea32ea1f --- /dev/null +++ b/modules/llama_cpp_plugin/include/plugin.hpp @@ -0,0 +1,112 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifndef LLAMA_CPP_PLUGIN_HPP +#define LLAMA_CPP_PLUGIN_HPP + +#include "openvino/runtime/iplugin.hpp" + +namespace ov { + namespace llama_cpp_plugin { + class LlamaCppPlugin : public IPlugin { + public: + LlamaCppPlugin(); + /** + * @brief Compiles model from ov::Model object + * @param model A model object acquired from ov::Core::read_model or source construction + * @param properties A ov::AnyMap of properties relevant only for this load operation + * @return Created Compiled Model object + */ + virtual std::shared_ptr compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const override; + + + /** + * @brief Compiles model from ov::Model object, on specified remote context + * @param model A model object acquired from ov::Core::read_model or source construction + * @param properties A ov::AnyMap of properties relevant only for this load operation + * @param context A pointer to plugin context derived from RemoteContext class used to + * execute the model + * @return Created Compiled Model object + */ + virtual std::shared_ptr compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties, + const ov::SoPtr& context) const override; + + /** + * @brief Sets properties for plugin, acceptable keys can be found in openvino/runtime/properties.hpp + * @param properties ov::AnyMap of properties + */ + virtual void set_property(const ov::AnyMap& properties) override; + + /** + * @brief Gets properties related to plugin behaviour. + * + * @param name Property name. + * @param arguments Additional arguments to get a property. + * + * @return Value of a property corresponding to the property name. + */ + virtual ov::Any get_property(const std::string& name, const ov::AnyMap& arguments) const override; + + /** + * @brief Creates a remote context instance based on a map of properties + * @param remote_properties Map of device-specific shared context remote properties. 
+ * + * @return A remote context object + */ + virtual ov::SoPtr create_context(const ov::AnyMap& remote_properties) const override; + + /** + * @brief Provides a default remote context instance if supported by a plugin + * @param remote_properties Map of device-specific shared context remote properties. + * + * @return The default context. + */ + virtual ov::SoPtr get_default_context(const ov::AnyMap& remote_properties) const override; + + /** + * @brief Creates an compiled model from an previously exported model using plugin implementation + * and removes OpenVINO Runtime magic and plugin name + * @param model Reference to model output stream + * @param properties A ov::AnyMap of properties + * @return An Compiled model + */ + virtual std::shared_ptr import_model(std::istream& model, + const ov::AnyMap& properties) const override; + + + virtual std::shared_ptr compile_model(const std::string& fname, + const ov::AnyMap& properties) const override; + + /** + * @brief Creates an compiled model from an previously exported model using plugin implementation + * and removes OpenVINO Runtime magic and plugin name + * @param model Reference to model output stream + * @param context A pointer to plugin context derived from RemoteContext class used to + * execute the network + * @param properties A ov::AnyMap of properties + * @return An Compiled model + */ + virtual std::shared_ptr import_model(std::istream& model, + const ov::SoPtr& context, + const ov::AnyMap& properties) const override; + + /** + * @brief Queries a plugin about supported layers in model + * @param model Model object to query. + * @param properties Optional map of pairs: (property name, property value). + * @return An object containing a map of pairs an operation name -> a device name supporting this operation. 
+ */ + virtual ov::SupportedOpsMap query_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const override; + + std::string get_current_gguf_file_path() const; + private: + std::string m_cache_dir = "./"; + }; + } // namespace llama_cpp_plugin +} // namespace ov + +#endif // LLAMA_CPP_PLUGIN_HPP diff --git a/modules/llama_cpp_plugin/src/CMakeLists.txt b/modules/llama_cpp_plugin/src/CMakeLists.txt new file mode 100644 index 000000000..5ec2caee7 --- /dev/null +++ b/modules/llama_cpp_plugin/src/CMakeLists.txt @@ -0,0 +1,58 @@ +set( PLUGIN_LIBRARY_NAME CACHE STRING "Library name for the generated plugin" ${TARGET_NAME}) +if(NOT PLUGIN_LIBRARY_NAME) + set( PLUGIN_LIBRARY_NAME "llama_cpp_plugin" ) +endif() + +set( PLUGIN_DEVICE_NAME CACHE STRING "Device name for the resulting plugin") +if(NOT PLUGIN_DEVICE_NAME) + set( PLUGIN_DEVICE_NAME "LLAMA_CPP" ) +endif() + +set(TARGET_NAME ${PLUGIN_LIBRARY_NAME}) + +file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) +file(GLOB_RECURSE HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) + +if (NOT ENABLE_TEMPLATE_REGISTRATION) + # Skip install and registration of template component + set(skip_plugin SKIP_INSTALL SKIP_REGISTRATION) +endif() + + + +# adds a shared library with plugin +ov_add_plugin(NAME ${TARGET_NAME} + DEVICE_NAME ${PLUGIN_DEVICE_NAME} + SOURCES ${SOURCES} ${HEADERS} + ${skip_plugin} + VERSION_DEFINES_FOR plugin.cpp + ADD_CLANG_FORMAT) + +# Enable support of CC for the plugin +ov_mark_target_as_cc(${TARGET_NAME}) + +target_include_directories(${TARGET_NAME} PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}" + "${LlamaCppPlugin_SOURCE_DIR}/include") + +# link common OpenVINO Runtime libraries +target_link_libraries(${TARGET_NAME} PRIVATE + openvino::interpreter_backend + openvino::reference) + +set( LLAMA_TARGET_NAME CACHE STRING "Exact target exposed by llama.cpp to link against as the main llama.cpp library") +if(NOT LLAMA_TARGET_NAME) + set( LLAMA_TARGET_NAME "llama" ) +endif() + +# include and link llama.cpp and ggml code +target_link_libraries(${TARGET_NAME} PRIVATE ${LLAMA_TARGET_NAME}) +target_link_libraries(${TARGET_NAME} PRIVATE ggml) + + +set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) + +if (ENABLE_TEMPLATE_REGISTRATION) + # Update the plugins.xml file + ov_register_plugins(MAIN_TARGET ${TARGET_NAME}) +endif() diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp new file mode 100644 index 000000000..932c0def4 --- /dev/null +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -0,0 +1,732 @@ +#include "compiled_model.hpp" +#include "plugin.hpp" +#include "infer_request.hpp" +#include +#include +#include +#include +#include + +namespace ov { + namespace llama_cpp_plugin { + class TensorWeightMatcher { + public: + // TODO (vshampor) implement this for faster weight node matching. 
+ // Use std::list, two passes - first for full name match, second for prefix-match; remove entries from list on match + using RTInfoTensorName = std::string; + using OvNodeName = std::string; + using LlamaTensorName = std::string; + + TensorWeightMatcher(const std::shared_ptr& model, std::map tensor_names_with_shapes_to_match) { + std::multimap> intermediate_matches_map; + + const auto node_vector = model->get_ops(); + std::list> const_nodes_in_model; + for (const auto& node_ptr : node_vector) { + if (ov::is_type(node_ptr)) const_nodes_in_model.push_back(ov::as_type_ptr(node_ptr)); + } + + // full substring match pass + std::map unmatched_rt_info_names_on_first_pass = extract_matches(intermediate_matches_map, tensor_names_with_shapes_to_match, const_nodes_in_model, + [](const std::string& substring, const std::string& source) { return source.find(substring) != std::string::npos; }); + + // prefix substring match pass + std::map unmatched_rt_info_names_on_second_pass = extract_matches(intermediate_matches_map, unmatched_rt_info_names_on_first_pass, const_nodes_in_model, + [](const std::string& substring, const std::string& source) { + return source.find(get_weight_name_without_torch_postfix(substring)) != std::string::npos; }); + + for (auto it = intermediate_matches_map.begin(); it != intermediate_matches_map.end(); it = intermediate_matches_map.upper_bound(it->first)) { + // TODO: perf improvement by iterating with ++; + RTInfoTensorName rt_info_name = it->first; + if (intermediate_matches_map.count(rt_info_name) != 1) { + std::cout << "VSHAMPOR: multiple matches for weight name " << rt_info_name << " and shape " << it->second->get_shape().to_string() << ", found "; + auto range_it_pair = intermediate_matches_map.equal_range(rt_info_name); + for (auto multimatch_it = range_it_pair.first; multimatch_it != range_it_pair.second; multimatch_it++) { + auto node_ptr = multimatch_it->second; + std::cout << node_ptr->get_friendly_name() << "(shape " << node_ptr->get_shape().to_string() << "),"; + } + std::cout << "will take the first match" << std::endl; + } + const auto& match = intermediate_matches_map.find(rt_info_name)->second; + m_rtinfo_name_to_weight_node_map[rt_info_name] = match; + } + if (!unmatched_rt_info_names_on_second_pass.empty()) { + std::cout << "VSHAMPOR: did not find the weight node for " << unmatched_rt_info_names_on_second_pass.size() << " weights:" << std::endl; + } + for (const auto& unmatched_entry: unmatched_rt_info_names_on_second_pass) { + std::cout << '\t' << unmatched_entry.first << std::endl; + } + } + + std::unordered_map> get_matches() { return m_rtinfo_name_to_weight_node_map; } + + private: + std::map extract_matches(std::multimap>& output_matches_map, + const std::map& names_with_shapes_to_match, + const std::list>& search_list, + std::function name_match_predicate) { + std::map unmatched_rt_info_names; + for (const auto& pair: names_with_shapes_to_match) { + RTInfoTensorName rt_info_name = pair.first; + const ov::Shape& wanted_shape = pair.second; + bool matched = false; + for (auto it = search_list.begin(); it != search_list.end(); it++) { + auto node_ptr = *it; + const std::string& friendly_name = node_ptr->get_friendly_name(); + if (name_match_predicate(rt_info_name, friendly_name) && + node_ptr->get_shape() == wanted_shape) { + output_matches_map.insert(std::make_pair(rt_info_name, node_ptr)); + matched = true; + break; + } + } + if (!matched) unmatched_rt_info_names.insert(pair); + } + return unmatched_rt_info_names; + } + + static std::string 
get_weight_name_without_torch_postfix(std::string torch_weight_name) { + size_t idx = torch_weight_name.rfind("."); + if (idx == std::string::npos) return torch_weight_name; + return std::string(torch_weight_name, 0, idx); + } + + size_t num_exact_matches = 0; + size_t num_partial_matches = 0; + std::unordered_map> m_rtinfo_name_to_weight_node_map; + }; + + + std::vector> get_nodes_containing_name_with_shape(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { + auto ops = model->get_ops(); + std::vector> found_weight_nodes; + std::copy_if(ops.begin(), ops.end(), std::back_inserter(found_weight_nodes), + [&weight_name, &shape](const std::shared_ptr& val) { + if (!ov::is_type(val)) return false; + std::shared_ptr node_ptr = ov::as_type_ptr(val); + return val->get_friendly_name().find(weight_name) != std::string::npos && + val->get_shape() == shape; + }); + return found_weight_nodes; + } + + bool has_weight_matches(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { + std::vector> found_weight_nodes; + found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); + return !found_weight_nodes.empty(); + } + + std::string get_weight_name_without_torch_postfix(std::string torch_weight_name) { + size_t idx = torch_weight_name.rfind("."); + if (idx == std::string::npos) return torch_weight_name; + return std::string(torch_weight_name, 0, idx); + } + + bool has_partial_weight_matches(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { + std::vector> found_weight_nodes; + found_weight_nodes = get_nodes_containing_name_with_shape(model, get_weight_name_without_torch_postfix(weight_name), shape); + return !found_weight_nodes.empty(); + } + + std::shared_ptr get_weight_by_name_and_shape(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { + OPENVINO_ASSERT(has_weight_matches(model, weight_name, shape)); + std::vector> found_weight_nodes; + found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); + + if (found_weight_nodes.size() > 1) { + std::cout << "VSHAMPOR: multiple matches for weight name " << weight_name << " and shape " << shape.to_string() << ", found "; + for (const auto& node_ptr : found_weight_nodes) { + std::cout << node_ptr->get_friendly_name() << "(shape " << shape.to_string() << "),"; + } + std::cout << "will take the first match" << std::endl; + } + std::shared_ptr node_with_tensor = found_weight_nodes.front(); + OPENVINO_ASSERT(ov::is_type(node_with_tensor)); + std::shared_ptr const_node_ptr = ov::as_type_ptr(node_with_tensor); + return const_node_ptr; + } + + using TransposePermutation = std::pair; + + std::vector expand_front(const std::vector& vec, size_t val) { + OPENVINO_ASSERT(vec.size() < GGML_MAX_DIMS); + std::vector retval(GGML_MAX_DIMS, val); + std::copy(vec.rbegin(), vec.rend(), retval.rbegin()); + return retval; + } + + void write_float_plus_one(std::ofstream& out, const float* src) { + float elt = *src; + elt += 1; + out.write((const char*) &elt, sizeof(float)); + } + + void append_tensor_data_with_transpositions(const std::string& fname, const std::vector& tensor_infos, const std::vector& tensor_data_ptrs, + const std::map& transpositions, const std::set increment_by_one_tensor_names) { + // assuming contiguous data underneath each pointer from tensor_data_ptrs + OPENVINO_ASSERT(tensor_infos.size() == tensor_data_ptrs.size()); + std::ofstream out(fname, std::ios::app | 
std::ios::out); + for (size_t i = 0; i < tensor_infos.size(); i++) { + const auto& tensor_info = tensor_infos[i]; + OPENVINO_ASSERT(tensor_info.type == GGML_TYPE_F32); // TODO (vshampor): writing transposed tensor data for other data types, especially lower-bitwidth; maybe use OV inference for that + + const char* ir_tensor_data = reinterpret_cast(tensor_data_ptrs[i]); + + std::string tensor_llama_name = std::string(tensor_info.name.data); + auto it = transpositions.find(tensor_llama_name); + if (it == transpositions.end()) { + // original IR tensor should not be transposed to conform to GGUF expectations, can write as-is + if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case + size_t elt_size = sizeof(float); // FP32 only for now + OPENVINO_ASSERT(!(tensor_info.size % elt_size)); + size_t num_elts = tensor_info.size / elt_size; + for (size_t elt_idx = 0; elt_idx < num_elts; elt_idx++) { + write_float_plus_one(out, ((float*) ir_tensor_data) + elt_idx); + } + } + else { + out.write(ir_tensor_data, tensor_info.size); + } + continue; + } + + if (it != transpositions.end()) { + std::vector gguf_layout_shape; + + // the shape in .ne is inverted w.r.t original export (~= IR) weight layout + for (size_t dim_idx = 0; dim_idx < tensor_info.n_dims; dim_idx++) { + gguf_layout_shape.push_back(tensor_info.ne[GGML_MAX_DIMS - 1 - tensor_info.n_dims - dim_idx]); + } + + TransposePermutation permutation = it->second; + std::vector ir_layout_shape(gguf_layout_shape); + std::swap(ir_layout_shape[permutation.first], ir_layout_shape[permutation.second]); + + std::vector ir_layout_strides(tensor_info.n_dims, 1); + + for (size_t idx = 0; idx < tensor_info.n_dims - 1 ; idx++) { + auto previous_stride_it = ir_layout_strides.rbegin() + idx; + auto stride_it = ir_layout_strides.rbegin() + idx + 1; + auto shape_it = ir_layout_shape.rbegin() + idx; + *stride_it = *shape_it * *previous_stride_it; + } + + + std::vector permuted_strides(ir_layout_strides); + std::swap(permuted_strides[permutation.first], permuted_strides[permutation.second]); + + // expand up to GGML_MAX_DIMS + std::vector gguf_layout_shape_ex = expand_front(gguf_layout_shape, 1); + // stride for unused dims will be 0, has no effect on loop because dimension idx for that dim is always 0 + permuted_strides = expand_front(permuted_strides, 0); + + + + std::cout << "VSHAMPOR: writing tensor " << tensor_info.name.data << " with size " << tensor_info.size; + std::cout << " shape (GGUF layout) "; + for (auto dim: gguf_layout_shape) std::cout << dim << ","; + std::cout << " shape (IR layout) "; + for (auto dim : ir_layout_shape) std::cout << dim << ","; + std::cout << " stride (IR layout) "; + for (auto stride : ir_layout_strides) std::cout << stride << ","; + std::cout << " stride (IR layout, transposing) "; + for (auto stride : permuted_strides) std::cout << stride << ","; + std::cout << std::endl; + + // TODO (vshampor): rewrite the loop below using recurrent templates? 
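+                    // The four nested loops below iterate over the destination tensor in its GGUF (reversed)
+                    // dimension order and map every index back into the source IR buffer through the permuted
+                    // strides computed above, so the weight data is written out transposed one element at a time.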
+ // This relies on GGUF_MAX_DIMS == 4 and unused dims being equal to 1 + size_t current_offset = 0; + size_t element_size = sizeof(float); + size_t num_bytes_written = 0; + for (size_t dim_0 = 0; dim_0 < gguf_layout_shape_ex[0]; dim_0++) + for (size_t dim_1 = 0; dim_1 < gguf_layout_shape_ex[1]; dim_1++) + for (size_t dim_2 = 0; dim_2 < gguf_layout_shape_ex[2]; dim_2++) + for (size_t dim_3 = 0; dim_3 < gguf_layout_shape_ex[3]; dim_3++) { + current_offset = element_size * (dim_0 * permuted_strides[0] + dim_1 * permuted_strides[1] + dim_2 * permuted_strides[2] + dim_3 * permuted_strides[3]); + if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case + write_float_plus_one(out, (float*) ir_tensor_data + current_offset); + } + else { + out.write(ir_tensor_data + current_offset, element_size); + } + num_bytes_written += element_size; + } + std::cout << "VSHAMPOR: wrote " << num_bytes_written << std::endl; + OPENVINO_ASSERT(num_bytes_written == tensor_info.size); + } + } + } + + struct ValueStorageForLifetimeExtension { + std::list kv_key_string_storage; + std::list kv_value_string_storage; + std::list> str_arr_storage; + void* store_gguf_value_vector(const std::vector& vec, gguf_type g_type) { + size_t elt_size; + switch (g_type) { + case GGUF_TYPE_UINT8: elt_size = sizeof(uint8_t); break; + case GGUF_TYPE_INT8: elt_size = sizeof(int8_t); break; + case GGUF_TYPE_UINT16: elt_size = sizeof(uint16_t); break; + case GGUF_TYPE_INT16: elt_size = sizeof(int16_t); break; + case GGUF_TYPE_UINT32: elt_size = sizeof(uint32_t); break; + case GGUF_TYPE_INT32: elt_size = sizeof(int32_t); break; + case GGUF_TYPE_FLOAT32: elt_size = sizeof(float); break; + case GGUF_TYPE_UINT64: elt_size = sizeof(uint64_t); break; + case GGUF_TYPE_INT64: elt_size = sizeof(int64_t); break; + case GGUF_TYPE_FLOAT64: elt_size = sizeof(double); break; + case GGUF_TYPE_BOOL: elt_size = sizeof(bool); break; + default: + OPENVINO_THROW("Unknown array type"); + } + size_t size_in_bytes = vec.size() * elt_size; + void* mem_ptr = new char[size_in_bytes]; + for (size_t i = 0; i < vec.size(); i++) { + switch (g_type) { + case GGUF_TYPE_UINT8: ((uint8_t*) mem_ptr)[i] = vec[i].uint8; break; + case GGUF_TYPE_INT8: ((int8_t*) mem_ptr)[i] = vec[i].int8; break; + case GGUF_TYPE_UINT16: ((uint16_t*) mem_ptr)[i] = vec[i].uint16; break; + case GGUF_TYPE_INT16: ((int16_t*) mem_ptr)[i] = vec[i].int16; break; + case GGUF_TYPE_UINT32: ((uint32_t*) mem_ptr)[i] = vec[i].uint32; break; + case GGUF_TYPE_INT32: ((int32_t*) mem_ptr)[i] = vec[i].int32; break; + case GGUF_TYPE_FLOAT32: ((float*) mem_ptr)[i] = vec[i].float32; break; + case GGUF_TYPE_UINT64: ((uint64_t*) mem_ptr)[i] = vec[i].uint64; break; + case GGUF_TYPE_INT64: ((int64_t*) mem_ptr)[i] = vec[i].int64; break; + case GGUF_TYPE_FLOAT64: ((double*) mem_ptr)[i] = vec[i].float64; break; + case GGUF_TYPE_BOOL: ((bool*) mem_ptr)[i] = vec[i].bool_; break; + default: + OPENVINO_THROW("Unknown array type"); + } + } + return mem_ptr; + } + + ValueStorageForLifetimeExtension() = default; + ~ValueStorageForLifetimeExtension() { + for (void* ptr: non_str_raw_storage) { + delete[] (char*) ptr; + } + } + private: + std::list non_str_raw_storage; + }; + + bool maybe_parse_single_element(gguf_type g_type, ov::Any rtmap_value, gguf_value& dst, ValueStorageForLifetimeExtension& store) { + switch (g_type) { + case GGUF_TYPE_UINT8: dst.uint8 = rtmap_value.as(); break; + case GGUF_TYPE_INT8: dst.int8 = rtmap_value.as(); ; break; + case GGUF_TYPE_UINT16: dst.uint16 = rtmap_value.as(); break; + 
case GGUF_TYPE_INT16: dst.int16 = rtmap_value.as(); break; + case GGUF_TYPE_UINT32: dst.uint32 = rtmap_value.as(); break; + case GGUF_TYPE_INT32: dst.int32 = rtmap_value.as(); break; + case GGUF_TYPE_FLOAT32: dst.float32 = rtmap_value.as(); break; + case GGUF_TYPE_UINT64: dst.uint64 = rtmap_value.as(); break; + case GGUF_TYPE_INT64: dst.int64 = rtmap_value.as(); break; + case GGUF_TYPE_FLOAT64: dst.float64 = rtmap_value.as(); break; + case GGUF_TYPE_BOOL: dst.bool_ = rtmap_value.as(); break; + case GGUF_TYPE_STRING: { + std::string string_value = rtmap_value.as(); + store.kv_value_string_storage.push_back(string_value); + dst.str.n = string_value.length(); + dst.str.data = (char*) store.kv_value_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below + break; + } + default: + return false; // did not parse + } + return true; // parsed successfully + } + + ov::Any get_any_associated_with_gguf_type(gguf_type g_type) { + switch (g_type) { + case GGUF_TYPE_UINT8: return ov::Any(uint8_t()); + case GGUF_TYPE_INT8: return ov::Any(int8_t()); + case GGUF_TYPE_UINT16: return ov::Any(uint16_t()); + case GGUF_TYPE_INT16: return ov::Any(int16_t()); + case GGUF_TYPE_UINT32: return ov::Any(uint32_t()); + case GGUF_TYPE_INT32: return ov::Any(int32_t()); + case GGUF_TYPE_FLOAT32: return ov::Any(float()); + case GGUF_TYPE_UINT64: return ov::Any(uint64_t()); + case GGUF_TYPE_INT64: return ov::Any(int64_t()); + case GGUF_TYPE_FLOAT64: return ov::Any(double()); + case GGUF_TYPE_BOOL: return ov::Any(bool()); + case GGUF_TYPE_STRING: return ov::Any(std::string()); + default: + OPENVINO_THROW("Unknown gguf_type to turn into ov::Any"); + } + } + + + LlamaCppModel::LlamaCppModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const ov::SoPtr& context, + const std::shared_ptr& task_executor + ) : ICompiledModel(model, plugin, context, task_executor) { + m_model = model; + num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove + *num_tokens_processed_ptr = 0; + auto rt_info = model->get_rt_info(); + OPENVINO_ASSERT(rt_info.count("lcp_kv_params") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_kv_types") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_kv_array_types") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_tensor_name_map") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_tensor_shape_map") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_expected_tensor_shapes") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_transpose_permutations") != 0); + + RTMap& kv_params = model->get_rt_info("lcp_kv_params"); + RTMap& kv_types = model->get_rt_info("lcp_kv_types"); + RTMap& kv_array_types = model->get_rt_info("lcp_kv_array_types"); + RTMap& tensor_name_map = model->get_rt_info("lcp_tensor_name_map"); + RTMap& tensor_shape_map = model->get_rt_info("lcp_tensor_shape_map"); + RTMap& expected_tensor_shapes_map = model->get_rt_info("lcp_expected_tensor_shapes"); + RTMap& transpose_permutations_rtmap = model->get_rt_info("lcp_transpose_permutations"); + + size_t gguf_version = model->get_rt_info("lcp_gguf_version"); + std::cout << "VSHAMPOR: parsed gguf_version " << gguf_version << std::endl; + + // kv params + OPENVINO_ASSERT(kv_params.size() == kv_types.size()); + size_t n_kv = kv_params.size(); + std::vector kv_vector; + ValueStorageForLifetimeExtension store; + + for (const auto& kv_pair: kv_params) { + gguf_kv kv; + + const auto& key = kv_pair.first; + kv.key.n = key.length(); + store.kv_key_string_storage.push_back(key); + kv.key.data = (char*) store.kv_key_string_storage.back().c_str(); // 
TODO (vshampor) see equivalent case below + + uint32_t value_type = kv_types[key].as(); + gguf_type gguf_value_type = (gguf_type) value_type; + kv.type = gguf_value_type; + if (gguf_value_type != GGUF_TYPE_ARRAY) { + bool is_parsed = maybe_parse_single_element(kv.type, kv_pair.second, kv.value, store); + OPENVINO_ASSERT(is_parsed, "Invalid type of a GGUF kv-value"); + } + else { // array case + gguf_type element_type = (gguf_type) kv_array_types[key].as(); + kv.value.arr.type = element_type; + std::string serialized_array = kv_pair.second.as(); + std::stringstream ss{serialized_array}; + std::vector parsed_array; + while (!ss.eof()) { + gguf_value array_elt; + ov::Any ov_any = get_any_associated_with_gguf_type(element_type); + std::string token; ss >> token; + if (std::string(kv.key.data) == "tokenizer.ggml.merges") { + // tokenizer merges are pairs of tokens separated by whitespace, so need to read another to get a proper merge + // TODO (vshampor): think of another delimiting strategy in the rt_info and use that strategy here for more robust code + std::string another_token; ss >> another_token; + token += std::string(" ") + another_token; + ov_any = ov::Any::make(token); + } + else { + std::stringstream tok_ss{token}; + ov_any.read(tok_ss); + } + bool is_parsed = maybe_parse_single_element(element_type, ov_any, array_elt, store); + OPENVINO_ASSERT(is_parsed); + parsed_array.push_back(array_elt); + } + kv.value.arr.n = parsed_array.size(); + if (element_type == GGUF_TYPE_STRING) { + // string element has already been lifetime-extended during parsing + std::vector cstr_vector(parsed_array.size()); + for (size_t cstr_idx = 0; cstr_idx < parsed_array.size(); cstr_idx++) { + cstr_vector[cstr_idx] = parsed_array[cstr_idx].str.data; + } + store.str_arr_storage.push_back(cstr_vector); + kv.value.arr.data = store.str_arr_storage.back().data(); + } + else { + void* data_ptr = store.store_gguf_value_vector(parsed_array, element_type); + kv.value.arr.data = data_ptr; + } + } + kv_vector.push_back(kv); + } + + auto token_types_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { return std::string(val.key.data) == "tokenizer.ggml.token_type"; }); + if (token_types_kv_it != kv_vector.end()) { + auto tokens_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { return std::string(val.key.data) == "tokenizer.ggml.tokens"; }); + if (tokens_kv_it != kv_vector.end()) { + size_t expected_num_tokens = token_types_kv_it->value.arr.n; + size_t actual_num_tokens = tokens_kv_it->value.arr.n; + if (actual_num_tokens < expected_num_tokens) { + std::cout << "VSHAMPOR: detected wrong vocab serialization/deserialization (expected " << expected_num_tokens << " tokens, parsed " << actual_num_tokens << " from vocab), filling tokens with bogus values" << std::endl; + std::vector new_vocab; + // char** old_vocab_data_ptr = (char**) tokens_kv_it->value.arr.data; + // std::copy(old_vocab_data_ptr, old_vocab_data_ptr + actual_num_tokens, new_vocab.begin()); + // size_t extra_tokens_needed = expected_num_tokens - actual_num_tokens; + size_t extra_tokens_needed = expected_num_tokens; + for (size_t tok_idx = 0; tok_idx < extra_tokens_needed; tok_idx++) { + std::stringstream ss; + ss << "invalid_token_" << tok_idx; + std::string new_token = ss.str(); + store.kv_value_string_storage.push_back(new_token); + char* str_data_ptr = (char*) store.kv_value_string_storage.back().c_str(); + new_vocab.push_back(str_data_ptr); + } + OPENVINO_ASSERT(new_vocab.size() == 
expected_num_tokens); + store.str_arr_storage.push_back(new_vocab); + tokens_kv_it->value.arr.data = (void*) store.str_arr_storage.back().data(); + tokens_kv_it->value.arr.n = expected_num_tokens; + } + } + } + + // tensors + OPENVINO_ASSERT(tensor_name_map.size() == tensor_shape_map.size()); + size_t n_tensors_in_rtinfo = tensor_name_map.size(); + std::cout << "VSHAMPOR: got request for " << n_tensors_in_rtinfo << " tensors from rt_info\n"; + + std::vector tensor_infos; + std::vector tensor_data_ptrs; + + std::map parsed_weights_to_search_for; + for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { + const std::string& llama_name = llama_name_and_rtinfo_name.first; + const std::string& rtinfo_name = llama_name_and_rtinfo_name.second.as(); + ov::Shape expected_shape = tensor_shape_map[llama_name].as(); + parsed_weights_to_search_for[rtinfo_name] = expected_shape; + } + + TensorWeightMatcher matcher{model, parsed_weights_to_search_for}; + std::unordered_map> matches = matcher.get_matches(); + std::unordered_map> llama_name_to_constant_node_map; + for (const auto& entry : tensor_name_map) { + const auto& llama_name = entry.first; + const auto& rtinfo_name = entry.second.as(); + llama_name_to_constant_node_map[llama_name] = matches[rtinfo_name]; + } + std::cout << "VSHAMPOR: requested tensors map to " << llama_name_to_constant_node_map.size() << " tensors to search in model (shared tensors considered)\n"; + + + std::list llama_name_storage; + + size_t n_tensors = 0; + + size_t offset = 0; // each tensor_info has to have a correct offset including padding, checked for in gguf_write_to_buf + for (const auto& matched_weight_pair : llama_name_to_constant_node_map) { + // Need to store the names in the list so that the passed c_str() pointers in tensor_infos to the llama names stay valid + // until they get deepcopied in gguf/llama functions + llama_name_storage.push_back(matched_weight_pair.first); + const std::string& llama_name = llama_name_storage.back(); + + auto weight_const_node_ptr = matched_weight_pair.second; + auto weight_shape = weight_const_node_ptr->get_shape(); + + // does hf-to-gguf invert all tensor dimensions with shapes > 1? + auto expected_weight_shape = ov::Shape(expected_tensor_shapes_map[llama_name].as()); + OPENVINO_ASSERT(expected_weight_shape.size() < GGML_MAX_DIMS); + + gguf_tensor_info info; + + info.type = GGML_TYPE_F32; // TODO (vshampor): better type assignment based on actual element type of the Constant node + + info.name.n = llama_name.length(); + info.name.data = (char*) llama_name.c_str(); // TODO (vshampor): either do this via const_cast, or will have to implement own structures for + // read-only data passing to llama_load_model_from_data + info.n_dims = weight_shape.size(); + std::fill(std::begin(info.ne), std::begin(info.ne) + GGML_MAX_DIMS, (uint64_t) 1); + + // looks like GGUF expects inverse order of dimensions when compared to e.g. 
torch and actual row-major layout, see gguf.gguf_writer.GGUFWriter.add_tensor_info + // in gguf python package + std::copy(expected_weight_shape.rbegin(), expected_weight_shape.rend(), info.ne); + + void* data_ptr = (void*)(weight_const_node_ptr->get_data_ptr()); // TODO (vshampor): danger - casts `const` away + // also - the expected_weight_shape is in general different from actual ov::Tensor shape, + // in particular it may be transposed, so we actually need to set the pointers to shape-corrected + // tensor storage, which we don't do here - we are only preparing this data to get a convenient + // gguf_context object to reuse metadata (header) writing code, tensor data transpositions will be done during + // actual file write + + info.size = weight_const_node_ptr->get_byte_size(); + info.offset = offset; + + const size_t size_pad = GGML_PAD(info.size, GGUF_DEFAULT_ALIGNMENT); + offset += size_pad; + + info.data = data_ptr; + + tensor_infos.push_back(info); + tensor_data_ptrs.push_back(data_ptr); + n_tensors++; + } + + std::cout << "VSHAMPOR: found " << matches.size() << "/" << parsed_weights_to_search_for.size() << " tensors" << std::endl; + + gguf_init_params gguf_params; + gguf_params.no_alloc = false; + gguf_params.ctx = nullptr; + + m_gguf_ctx = gguf_init_from_data(n_tensors, tensor_infos.data(), n_kv, kv_vector.data(), tensor_data_ptrs.data(), gguf_params); + + std::shared_ptr llama_plugin_ptr = std::dynamic_pointer_cast(plugin); + m_converted_gguf_file_name = llama_plugin_ptr->get_current_gguf_file_path(); + + std::cout << "VSHAMPOR: output filename is " << m_converted_gguf_file_name << std::endl; + std::cout << "VSHAMPOR: writing metadata (GGUF header) " << std::endl; + gguf_write_to_file(m_gguf_ctx, m_converted_gguf_file_name.c_str(), /* only_meta = */ true); + + std::map transpose_permutations; + + for (const auto& llama_name_and_permutation : transpose_permutations_rtmap) { + std::string permutation_str = llama_name_and_permutation.second.as(); + std::stringstream ss(permutation_str); + TransposePermutation permutation; + bool is_ok = true; + is_ok &= static_cast(ss >> permutation.first); + is_ok &= static_cast(ss >> permutation.second); + OPENVINO_ASSERT(is_ok, "failed to read permutation"); + transpose_permutations[llama_name_and_permutation.first] = permutation; + } + + std::set gemma_tensor_names_to_increment; + // FIXME (vshampor): tried setting up commands for incrementing *_norm.weight values by 1 like it is done + // during llama.cpp HF-to-GGUF export, but it seems that it isn't necessary and IR stores the incremented weights already + // Is this due to constant folding? + + // for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { + // const std::string& llama_name = llama_name_and_rtinfo_name.first; + // const std::string& rtinfo_name = llama_name_and_rtinfo_name.second.as(); + // std::string gemma_norm_suffix = "norm.weight"; + // if (rtinfo_name.size() < gemma_norm_suffix.size()) continue; + // if (rtinfo_name.substr(rtinfo_name.size() - gemma_norm_suffix.size()) == gemma_norm_suffix) gemma_tensor_names_to_increment.insert(llama_name); + // } + + std::cout << "VSHAMPOR: writing tensor data (blob with transpositions) " << std::endl; + append_tensor_data_with_transpositions(m_converted_gguf_file_name, tensor_infos, tensor_data_ptrs, transpose_permutations, gemma_tensor_names_to_increment); + std::cout << "VSHAMPOR: write finished." << m_converted_gguf_file_name << std::endl; + + std::cout << "VSHAMPOR: loading llama model from written file..." 
<< std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(m_converted_gguf_file_name.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + + std::cout << "VSHAMPOR: llama model loaded successfully..." << std::endl; + } + + + LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model, std::istream& input_stream, const std::shared_ptr& plugin) : + ICompiledModel(ov_model, plugin) { + num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove + *num_tokens_processed_ptr = 0; + std::shared_ptr llama_plugin = std::dynamic_pointer_cast(plugin); + std::string current_file_path = llama_plugin->get_current_gguf_file_path(); + std::ofstream output_stream(current_file_path, std::ios::binary); + output_stream << input_stream.rdbuf(); + + + std::cout << "VSHAMPOR: loading llama model from imported and re-written file..." << std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(current_file_path.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + std::cout << "VSHAMPOR: llama model loaded successfully from cache..." << std::endl; + } + + LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin) : + ICompiledModel(nullptr, plugin) { + num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove + *num_tokens_processed_ptr = 0; + std::cout << "VSHAMPOR: loading llama model directly from GGUF... " << std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(gguf_fname.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + std::cout << "VSHAMPOR: llama model loaded successfully from GGUF..." 
<< std::endl; + + auto input_ids = std::make_shared(ov::element::Type_t::i64, ov::PartialShape({-1, -1})); + auto fake_convert = std::make_shared(input_ids->output(0), ov::element::Type_t::f32); + auto logits = std::make_shared(fake_convert->output(0)); + + ov::ParameterVector inputs{input_ids}; + + std::vector> unused_names_in_order = { { "attention_mask", ov::element::Type_t::i64 }, + { "position_ids", ov::element::Type_t::i64 }, + { "beam_idx", ov::element::Type_t::i32 } }; + for (const auto& descr : unused_names_in_order) { + auto unused_inp = std::make_shared(descr.second, ov::PartialShape({-1, -1})); + inputs.push_back(unused_inp); + } + + m_model = std::make_shared(logits, inputs, "fake_ov_model_for_io_specification"); + + m_model->inputs()[0].set_names({"input_ids"}); + for (size_t i = 0; i < unused_names_in_order.size(); i++) { + m_model->inputs()[i + 1].set_names({unused_names_in_order[i].first}); + } + + m_model->outputs()[0].set_names({"logits"}); + + for (auto input : m_model->inputs()) { + m_fake_inputs.emplace_back(input); + } + for (auto output : m_model->outputs()) { + m_fake_outputs.emplace_back(output); + } + } + + + void LlamaCppModel::export_model(std::ostream& output_stream) const { + std::cout << "VSHAMPOR: exporting model" << std::endl; + + // FIXME (vshampor): it's a shame that loading a model from cache does not have an option to + // actually keep the already loaded model from xml and not be forced to deserialize an ov::Model + // representation from cache as well. As it stands, will need to write the whole IR into the cache entry + // along with the GGUF file. + // + std::stringstream xmlFile, binFile; + ov::pass::Serialize serializer(xmlFile, binFile); + serializer.run_on_model(m_model); + + auto m_constants = binFile.str(); + auto m_model = xmlFile.str(); + + auto dataSize = static_cast(m_model.size()); + output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); + output_stream.write(m_model.c_str(), dataSize); + + dataSize = static_cast(m_constants.size()); + output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); + output_stream.write(reinterpret_cast(&m_constants[0]), dataSize); + + + std::ifstream in(m_converted_gguf_file_name, std::ios::binary); + output_stream << in.rdbuf(); + } + + std::shared_ptr LlamaCppModel::get_runtime_model() const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + + void LlamaCppModel::set_property(const ov::AnyMap& properties) { + std::cout << "VSHAMPOR: attempted to set_property (did nothing)"; + } + + ov::Any LlamaCppModel::get_property(const std::string& name) const { + if (ov::supported_properties == name) { + return decltype(ov::supported_properties)::value_type(std::vector()); + } + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + + std::shared_ptr LlamaCppModel::create_sync_infer_request() const { + return std::make_shared(std::static_pointer_cast(shared_from_this())); + } + + const std::vector>& LlamaCppModel::inputs() const { + return m_fake_inputs; + }; + const std::vector>& LlamaCppModel::outputs() const { + return m_fake_outputs; + }; + } +} // namespace ov diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp new file mode 100644 index 000000000..0993422f6 --- /dev/null +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -0,0 +1,111 @@ +#include "infer_request.hpp" +#include "openvino/runtime/make_tensor.hpp" +#include "llama.h" + +namespace ov { + namespace llama_cpp_plugin { + + void 
allocate_tensor_impl(ov::SoPtr& tensor, + const ov::element::Type& element_type, + const ov::Shape& shape) { + if (!tensor || tensor->get_element_type() != element_type) { + tensor = ov::make_tensor(element_type, shape); + } else { + tensor->set_shape(shape); + } +} + + LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model): ov::ISyncInferRequest(compiled_model) { + std::cout << "VSHAMPOR: infer request ctor called\n"; + m_compiled_model_ptr = compiled_model; + // Allocate input/output tensors + for (const auto& input : get_inputs()) { + allocate_tensor(input, [input](ov::SoPtr& tensor) { + // Can add a check to avoid double work in case of shared tensors + allocate_tensor_impl(tensor, + input.get_element_type(), + input.get_partial_shape().is_dynamic() ? ov::Shape{0} : input.get_shape()); + }); + } + for (const auto& output : get_outputs()) { + allocate_tensor(output, [output](ov::SoPtr& tensor) { + // Can add a check to avoid double work in case of shared tensors + allocate_tensor_impl(tensor, + output.get_element_type(), + output.get_partial_shape().is_dynamic() ? ov::Shape{0} : output.get_shape()); + }); + } + } + void LlamaCppSyncInferRequest::set_tensors_impl(const ov::Output port, const std::vector>& tensors) { + std::cout << "VSHAMPOR: set_tensors_impl called\n"; + } + + void llama_batch_add_reimpl( + struct llama_batch & batch, + llama_token id, + llama_pos pos, + const std::vector & seq_ids, + bool logits) { + batch.token [batch.n_tokens] = id; + batch.pos [batch.n_tokens] = pos; + batch.n_seq_id[batch.n_tokens] = seq_ids.size(); + for (size_t i = 0; i < seq_ids.size(); ++i) { + batch.seq_id[batch.n_tokens][i] = seq_ids[i]; + } + batch.logits [batch.n_tokens] = logits; + + batch.n_tokens++; + } + + void LlamaCppSyncInferRequest::infer() { + auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among all inputs without hardcode + OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); + OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); + size_t batch_size = input_ids_tensor_ptr->get_shape()[0]; + size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; + + // llama_batch actually contains one sequence + llama_batch batch = llama_batch_init(sequence_length, /* embd = */ 0, /* n_seq_max = */ 1); + const int64_t* data_ptr = input_ids_tensor_ptr->data(); + + const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */; + + for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) { + const int64_t token_id = sequence_start_ptr[tok_idx]; + llama_batch_add_reimpl(batch, token_id, *(m_compiled_model_ptr->num_tokens_processed_ptr), { 0 }, true); // the last `true` here is a marker that the logits for this token should be computed and returned + size_t* ptr = m_compiled_model_ptr->num_tokens_processed_ptr; + (*ptr)++; + } + + + llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; + int32_t sts = llama_decode(ctx, batch); + + if (sts != 0) { + OPENVINO_THROW("llama_decode failed with code ", sts); + } + + size_t n_vocab = llama_n_vocab(m_compiled_model_ptr->m_llama_model_ptr); + + ov::Tensor output_tensor{ov::element::Type_t::f32, {1, sequence_length, n_vocab}}; + float* output_tensor_data_ptr = output_tensor.data(); + + for (size_t pos = 0; pos < sequence_length; pos++) { + float* logits_from_llama = llama_get_logits_ith(ctx, pos); + std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab); + } + + auto& 
logit_output = get_outputs()[0]; + allocate_tensor(logit_output, [&output_tensor](ov::SoPtr& tensor) { allocate_tensor_impl(tensor, output_tensor.get_element_type(), output_tensor.get_shape()); + output_tensor.copy_to(ov::make_tensor(tensor)); }); + }; + std::vector LlamaCppSyncInferRequest::get_profiling_info() const { + std::cout << "VSHAMPOR: get_profiling_info() called\n"; + return std::vector{}; + }; + std::vector> LlamaCppSyncInferRequest::query_state() const { + std::cout << "VSHAMPOR: get_profiling_info() called\n"; + return std::vector>{}; + } + } +} // namespace ov diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp new file mode 100644 index 000000000..9f633426f --- /dev/null +++ b/modules/llama_cpp_plugin/src/plugin.cpp @@ -0,0 +1,152 @@ +#include "plugin.hpp" +#include "compiled_model.hpp" +#include "openvino/op/constant.hpp" +#include +#include "openvino/runtime/internal_properties.hpp" + + +namespace { +static constexpr const char* wait_executor_name = "LlamaCppWaitExecutor"; +static constexpr const char* stream_executor_name = "LlamaCppStreamsExecutor"; +static constexpr const char* template_exclusive_executor = "LlamaCppExecutor"; +} // namespace + + +namespace ov { + namespace llama_cpp_plugin { + LlamaCppPlugin::LlamaCppPlugin() : IPlugin() { + set_device_name("LLAMA_CPP"); + } + std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const { + std::cout << "VSHAMPOR: LlamaCppPlugin::compile_model" << std::endl; + + //std::string gpt2_node_name = "transformer.h.9.attn.c_proj.weight"; + //std::cout << "VSHAMPOR: sanity check - looking for node containing " << gpt2_node_name << std::endl; + //auto ops = model->get_ops(); + //auto iter = std::find_if(ops.begin(), ops.end(), [gpt2_node_name](const std::shared_ptr& val) { + // return val->get_friendly_name().find(gpt2_node_name) != std::string::npos; }); + //if (iter == ops.end()) { + // std::cout << "VSHAMPOR: did not find the node\n"; + //} else { + // std::shared_ptr node_with_tensor = *iter; + // std::cout << "VSHAMPOR: node type is " << node_with_tensor->get_type_name() << std::endl; + // std::shared_ptr const_node_ptr = ov::as_type_ptr(node_with_tensor); + // const float* data_ptr = const_node_ptr->get_data_ptr(); + // // ov::descriptor::Tensor& tensor_descr = node_with_tensor->get_output_tensor(0); + // // std::cout << "VSHAMPOR: node output tensor shape is " << tensor_descr.get_shape().to_string() << std::endl; + // // ov::TensorVector in, out; + // // node_with_tensor->evaluate(out, in); + // // std::cout << "VSHAMPOR: evaluated " << out.size() << " output tensors\n"; + // // if (!out.empty()) { + // // const ov::Tensor& tensor = out[0]; + // // const float* vals = tensor.data(); + // // std::cout << "VSHAMPOR: first elements of the weight tensor are "; + // // for (size_t i = 0; i < 10; i++) { + // // std::cout << vals[i] << " "; + // // } + // // std::cout << std::endl; + // // } + // std::cout << "VSHAMPOR: first elements of the weight tensor are "; + // for (size_t i = 0; i < 10; i++) { + // std::cout << data_ptr[i] << " "; + // } + // std::cout << std::endl; + //} + return compile_model(model, properties, {}); + } + + std::shared_ptr LlamaCppPlugin::compile_model(const std::string& fname, const ov::AnyMap& properties) const { + return std::make_shared(fname, shared_from_this()); + } + std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties, + const ov::SoPtr& context) 
const { + std::cout << "VSHAMPOR: compile_model called in C++" << std::endl; + return std::make_shared(model->clone(), shared_from_this(), context, get_executor_manager()->get_executor(template_exclusive_executor)); + } + + void LlamaCppPlugin::set_property(const ov::AnyMap& properties) { + for (const auto& map_entry : properties) { + if (map_entry.first == ov::cache_dir.name()) { + m_cache_dir = map_entry.second.as(); + } + else { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: setting property ", map_entry.first, "not implemented"); + } + } + } + + ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap& arguments) const { + if (ov::supported_properties == name) { + return decltype(ov::supported_properties)::value_type(std::vector({ov::cache_dir, ov::device::capabilities, ov::device::full_name})); + } + if (ov::device::capabilities == name) { + return decltype(ov::device::capabilities)::value_type(std::vector({ov::device::capability::EXPORT_IMPORT})); + } + if (ov::internal::supported_properties == name) { + return decltype(ov::internal::supported_properties)::value_type(std::vector({ov::internal::caching_properties})); + } + + if (ov::cache_dir == name) { + return m_cache_dir; + } + if (ov::internal::caching_properties == name) { + return std::vector{ov::device::full_name}; + } + + if (ov::device::full_name == name) { + return std::string("LLAMA_CPP"); + } + + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + + ov::SoPtr LlamaCppPlugin::create_context(const ov::AnyMap& remote_properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + ov::SoPtr LlamaCppPlugin::get_default_context(const ov::AnyMap& remote_properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + std::shared_ptr LlamaCppPlugin::import_model(std::istream& model_file_stream, + const ov::AnyMap& properties) const { + std::cout << "VSHAMPOR: importing model" << '\n'; + std::cout << "VSHAMPOR: deserializing ov::Model first" << '\n'; + // read XML content + std::string xmlString; + std::uint64_t dataSize = 0; + model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); + xmlString.resize(dataSize); + model_file_stream.read(const_cast(xmlString.c_str()), dataSize); + + // read blob content + ov::Tensor weights; + model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); + if (0 != dataSize) { + weights = ov::Tensor(ov::element::from(), ov::Shape{static_cast(dataSize)}); + model_file_stream.read(weights.data(), dataSize); + } + + auto ov_model = get_core()->read_model(xmlString, weights); + std::cout << "VSHAMPOR: ov::Model deserialized, passing the rest of the stream to LlamaCppModel ctor" << '\n'; + return std::make_shared(ov_model, model_file_stream, shared_from_this()); + } + + const std::string CURRENT_GGUF_FILE_NAME = "current.gguf"; + std::string LlamaCppPlugin::get_current_gguf_file_path() const { return m_cache_dir + "/" + CURRENT_GGUF_FILE_NAME; } + + std::shared_ptr LlamaCppPlugin::import_model(std::istream& model, + const ov::SoPtr& context, + const ov::AnyMap& properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + + ov::SupportedOpsMap LlamaCppPlugin::query_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + } + } +} // namespace ov + +static const ov::Version version = {CI_BUILD_NUMBER, "llama_cpp_plugin"}; 
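+// OV_DEFINE_PLUGIN_CREATE_FUNCTION exports the standard plugin-creation entry point so that ov::Core can
+// load this shared library as the "LLAMA_CPP" device; a client then uses it roughly as in tools/runner.cpp,
+// i.e. core.compile_model(model, "LLAMA_CPP") followed by create_infer_request().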
+OV_DEFINE_PLUGIN_CREATE_FUNCTION(ov::llama_cpp_plugin::LlamaCppPlugin, version) diff --git a/modules/llama_cpp_plugin/tests/CMakeLists.txt b/modules/llama_cpp_plugin/tests/CMakeLists.txt new file mode 100644 index 000000000..11648c2bd --- /dev/null +++ b/modules/llama_cpp_plugin/tests/CMakeLists.txt @@ -0,0 +1,37 @@ +set(TARGET_NAME llama_cpp_plugin_func_tests) + +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + ov_add_compiler_flags(/wd4305) +endif() + +ov_add_test_target( + NAME ${TARGET_NAME} + ROOT ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDENCIES + openvino_template_plugin + LINK_LIBRARIES + openvino::funcSharedTests + openvino::runtime::dev + INCLUDES + "${OpenVINOTemplatePlugin_SOURCE_DIR}/include" + "${CMAKE_CURRENT_SOURCE_DIR}/op_reference" + ADD_CLANG_FORMAT + LABELS + OV UNIT TEMPLATE +) + +find_package(OpenCV QUIET COMPONENTS core imgproc) + +if(OpenCV_FOUND AND OpenCV_VERSION VERSION_GREATER_EQUAL 3.4) + message(STATUS "Reference preprocessing: OpenCV tests are enabled") + target_compile_definitions(${TARGET_NAME} PRIVATE OPENCV_TEMPLATE_TESTS) + target_link_libraries(${TARGET_NAME} PRIVATE opencv_imgproc opencv_core) +else() + message(WARNING "Reference preprocessing: OpenCV tests are disabled, because OpenCV ver. 3.4+ is not found") +endif() + +if (ENABLE_INTEL_CPU) + set_source_files_properties( + "${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/behavior/executable_network/get_metric.cpp" + PROPERTIES COMPILE_DEFINITIONS ENABLE_INTEL_CPU=1) +endif() diff --git a/modules/llama_cpp_plugin/third_party/llama.cpp b/modules/llama_cpp_plugin/third_party/llama.cpp new file mode 160000 index 000000000..c8b02d38d --- /dev/null +++ b/modules/llama_cpp_plugin/third_party/llama.cpp @@ -0,0 +1 @@ +Subproject commit c8b02d38d98db8dab774f6f7655d7e9aede882f5 diff --git a/modules/llama_cpp_plugin/tools/CMakeLists.txt b/modules/llama_cpp_plugin/tools/CMakeLists.txt new file mode 100644 index 000000000..4a37341b8 --- /dev/null +++ b/modules/llama_cpp_plugin/tools/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required(VERSION 3.10) +set(CMAKE_CXX_STANDARD 11) + +find_package(OpenVINO REQUIRED) + + +add_executable(llama_cpp_runner + "${CMAKE_CURRENT_SOURCE_DIR}/runner.cpp" + ) +target_link_libraries(llama_cpp_runner PRIVATE openvino::runtime) + + +add_executable(tensor_comparator + "${CMAKE_CURRENT_SOURCE_DIR}/tensor_comparator.cpp" + ) +target_link_libraries(tensor_comparator PRIVATE ggml) + +add_executable(cache_embedder + "${CMAKE_CURRENT_SOURCE_DIR}/cache_embedder.cpp" + ) + +target_compile_options(cache_embedder PUBLIC "--std=c++17") diff --git a/modules/llama_cpp_plugin/tools/cache_embedder.cpp b/modules/llama_cpp_plugin/tools/cache_embedder.cpp new file mode 100644 index 000000000..bbfbf229c --- /dev/null +++ b/modules/llama_cpp_plugin/tools/cache_embedder.cpp @@ -0,0 +1,53 @@ +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) { + assert(argc == 3); + std::string cache_blob_name = argv[1]; + std::string gguf_file_name = argv[2]; + + std::uintmax_t original_file_size = std::filesystem::file_size(cache_blob_name); + std::fstream cache_io_stream(cache_blob_name, std::ios::binary | std::ios::in | std::ios::out); + + { + std::string tmp; + std::getline(cache_io_stream, tmp); // skip the blob header + std::cout << "skipped header line" << std::endl; + } + + std::uint64_t data_size = 0; + cache_io_stream.read(reinterpret_cast(&data_size), sizeof(data_size)); + std::cout << "skipping IR XML content, size " << data_size << std::endl; + 
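    // Editorial note: layout of the cache blob being patched here, as implied by
    // LlamaCppPlugin::import_model() in plugin.cpp:
    //   [cache header line]                        -- skipped via std::getline() above
    //   [uint64_t XML size][IR XML string]         -- skipped here
    //   [uint64_t weights size][IR weights blob]   -- skipped next
    //   [raw GGUF file contents]                   -- overwritten in place with the supplied .gguf file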
cache_io_stream.seekp(data_size, std::ios::cur); // skip IR xml content + + cache_io_stream.read(reinterpret_cast(&data_size), sizeof(data_size)); + std::cout << "skipping IR weight content, size " << data_size << std::endl; + cache_io_stream.seekp(data_size, std::ios::cur); // skip IR weight content + + std::streampos pos = cache_io_stream.tellp(); + char magic[4]; + for (size_t i = 0; i < 4; i++) { + cache_io_stream >> magic[i]; + } + + std::string curr_magic(magic); + std::cout << "magic at current position is " << curr_magic << std::endl; + assert(curr_magic == "GGUF"); + cache_io_stream.seekp(pos); + + std::ifstream gguf_input_stream(gguf_file_name, std::ios::binary); + cache_io_stream << gguf_input_stream.rdbuf(); + std::cout << "gguf content write successful" << std::endl; + std::uintmax_t final_size = cache_io_stream.tellp(); + cache_io_stream.close(); + if (final_size < original_file_size) { + std::cout << "cache entry is now smaller (" << final_size << " vs original " << original_file_size << "), truncating" << std::endl; + std::filesystem::resize_file(cache_blob_name, final_size); + } + + return 0; +} diff --git a/modules/llama_cpp_plugin/tools/runner.cpp b/modules/llama_cpp_plugin/tools/runner.cpp new file mode 100644 index 000000000..390301cdb --- /dev/null +++ b/modules/llama_cpp_plugin/tools/runner.cpp @@ -0,0 +1,73 @@ +#include "openvino/openvino.hpp" +#include + +int main(int argc, char* argv[]) { + ov::Core core; + core.set_property(ov::cache_dir("/tmp/my_cache_dir")); + std::string model_path = "/home/vshampor/work/optimum-intel/ov_model/openvino_model.xml"; + + std::cout << "VSHAMPOR: reading model\n"; + std::shared_ptr model = core.read_model(model_path); + + std::cout << "VSHAMPOR: compiling model\n"; + ov::CompiledModel compiled_model = core.compile_model(model, "LLAMA_CPP"); + + std::cout << "VSHAMPOR: compiled successfully\n"; + + std::cout << "VSHAMPOR: creating infer request\n"; + ov::InferRequest infer_request = compiled_model.create_infer_request(); + std::cout << "VSHAMPOR: infer request created\n"; + + // const ov::Output& input = compiled_model.input(); + // std::cout << "VSHAMPOR: got input\n"; + auto inputs = compiled_model.inputs(); + std::cout << "VSHAMPOR: model has " << inputs.size() << " inputs\n"; + for (const auto& input: inputs) { + std::cout << input.get_node()->get_friendly_name() << std::endl; + } + + for (size_t i = 0; i < inputs.size(); i++) { + const auto& curr_input = inputs[i]; + auto shape = curr_input.get_partial_shape(); + if (shape.is_dynamic()) { + std::cout << "VSHAMPOR: processing input " << i << " with a dynamic shape of " << shape.to_string() << std::endl; + ov::Rank r = shape.rank(); + if (r.get_length() == 2) { + ov::Tensor input_tensor{curr_input.get_element_type(), ov::Shape({1, 128})}; + int64_t* data_ptr = input_tensor.data(); + // fill with something + for (size_t elt_idx = 0; elt_idx < input_tensor.get_size(); elt_idx++) { + data_ptr[elt_idx] = 42; + } + infer_request.set_input_tensor(i, input_tensor); + } + else { // past_key_values + ov::Tensor input_tensor{curr_input.get_element_type(), ov::Shape({1, 12, 128, 64})}; + infer_request.set_input_tensor(i, input_tensor); + } + } + else { + std::cout << "VSHAMPOR: processing input " << i << " with a non-dynamic shape of " << shape.to_string() << std::endl; + ov::Tensor input_tensor{curr_input.get_element_type(), curr_input.get_shape()}; + infer_request.set_input_tensor(i, input_tensor); + } + } + std::cout << "VSHAMPOR: successfully set input tensor\n"; + + 
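    // Editorial note: the dummy shapes above are hard-coded -- {1, 128} for the 2D token inputs and
    // {1, 12, 128, 64} for the 4D past_key_values inputs -- which appears to assume a GPT-2-sized model
    // (12 attention heads, 64-dim heads, 128-token past length); other exported models would need
    // different dimensions here.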
infer_request.infer(); + std::cout << "VSHAMPOR: inferred successfully\n"; + + ov::Tensor output = infer_request.get_tensor("logits"); + std::cout << "VSHAMPOR: got output tensor, shape " << output.get_shape().to_string() << std::endl; + + size_t n_output_elts = 10; + std::cout << "VSHAMPOR: first " << n_output_elts << " elements are:" << std::endl; + + float* output_data_ptr = output.data(); + for (size_t elt_idx = 0; elt_idx < n_output_elts; elt_idx++) { + std::cout << output_data_ptr[elt_idx] << " "; + } + + std::cout << std::endl; + return 0; +} diff --git a/modules/llama_cpp_plugin/tools/tensor_comparator.cpp b/modules/llama_cpp_plugin/tools/tensor_comparator.cpp new file mode 100644 index 000000000..83de96215 --- /dev/null +++ b/modules/llama_cpp_plugin/tools/tensor_comparator.cpp @@ -0,0 +1,95 @@ +#include "ggml.h" +#include +#include +#include +#include +#include +#include + + + +int main(int argc, char* argv[]) { + assert(argc == 3 || argc == 4); + std::string left_name(argv[1]); + std::string right_name(argv[2]); + + gguf_init_params left_params; left_params.no_alloc = false; left_params.ctx = nullptr; + gguf_init_params right_params; left_params.no_alloc = false; right_params.ctx = nullptr; + gguf_context* left_ctx = gguf_init_from_file(left_name.c_str(), left_params); + gguf_context* right_ctx = gguf_init_from_file(right_name.c_str(), right_params); + + std::vector tensor_names; + if (argc == 4) tensor_names.push_back(std::string(argv[3])); + else { + for (size_t idx = 0; idx < left_ctx->header.n_tensors; idx++) { + gguf_tensor_info left_tensor_info = left_ctx->infos[idx]; + tensor_names.push_back(left_tensor_info.name.data); + } + } + + for (const auto& tensor_name : tensor_names) { + + + int left_tensor_idx = gguf_find_tensor(left_ctx, tensor_name.c_str()); + int right_tensor_idx = gguf_find_tensor(right_ctx, tensor_name.c_str()); + + size_t left_tensor_offset = gguf_get_tensor_offset(left_ctx, left_tensor_idx) + left_ctx->offset; + size_t right_tensor_offset = gguf_get_tensor_offset(right_ctx, right_tensor_idx) + right_ctx->offset; + + gguf_tensor_info left_tensor_info = left_ctx->infos[left_tensor_idx]; + gguf_tensor_info right_tensor_info = right_ctx->infos[right_tensor_idx]; + + std::cout << "tensor name " << tensor_name << ", byte offsets: " << left_tensor_offset << " (left), " << right_tensor_offset << " (right)" << std::endl; + std::cout << "tensor name " << tensor_name << ", shape: "; + for (size_t i = 0; i < left_tensor_info.n_dims; i++) { + std::cout << left_tensor_info.ne[i] << ","; + } + std::cout << " (left), "; + + for (size_t i = 0; i < right_tensor_info.n_dims; i++) { + std::cout << right_tensor_info.ne[i] << ","; + } + std::cout << " (right) " << std::endl; + + size_t left_tensor_size = std::accumulate(std::begin(left_tensor_info.ne), std::begin(left_tensor_info.ne) + GGML_MAX_DIMS, (size_t) sizeof(float), std::multiplies()); + size_t right_tensor_size = std::accumulate(std::begin(right_tensor_info.ne), std::begin(right_tensor_info.ne) + GGML_MAX_DIMS, (size_t) sizeof(float), std::multiplies()); + + std::cout << "tensor name " << tensor_name << ", size (calculated): " << left_tensor_size << " (left), " << right_tensor_size << " (right)" << std::endl; + + if (left_tensor_size != right_tensor_size) { + std::cout << "size mismatch (" << left_tensor_size << " left, " << right_tensor_size << "right), exiting" << std::endl; + exit(-1); + } + + size_t bytes_compared = 0; + + std::ifstream left_file(left_name, std::ios::binary); + std::ifstream right_file(right_name, 
std::ios::binary); + + left_file.seekg(left_tensor_offset); + right_file.seekg(right_tensor_offset); + + std::cout << "first 10 float values:" << std::endl; + for (size_t i = 0; i < 10; i++) { + float left_value; left_file.read((char*) &left_value, sizeof(float)); + float right_value; right_file.read((char*) &right_value, sizeof(float)); + + std::cout << left_value << " left, " << right_value << " right" << std::endl; + } + + left_file.seekg(left_tensor_offset); + right_file.seekg(right_tensor_offset); + for (size_t i = 0; i < left_tensor_size; i++) { + char left_byte; left_file.read((char*) &left_byte, sizeof(char)); + char right_byte; right_file.read((char*) &right_byte, sizeof(char)); + + if (left_byte != right_byte) { + std::cout << "byte " << bytes_compared << " mismatch (" << std::hex << +((uint8_t) left_byte) << " left, " << +((uint8_t) right_byte) << " right)" << std::endl; + std::cout << "offset left " << std::hex << left_tensor_offset + bytes_compared << ", right " << right_tensor_offset + bytes_compared << std::endl; + exit(-1); + } + bytes_compared++; + } + std::cout << "tensor contents are identical, bytes compared: " << bytes_compared << std::endl; + } +} From f55badc9d2418e98304960934ffdc72af940a009 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Mon, 11 Mar 2024 18:03:08 +0100 Subject: [PATCH 02/27] Basic test and test build --- modules/llama_cpp_plugin/CMakeLists.txt | 5 +- .../llama_cpp_plugin/src/compiled_model.cpp | 1519 +++++++++-------- .../llama_cpp_plugin/src/infer_request.cpp | 187 +- modules/llama_cpp_plugin/src/plugin.cpp | 278 +-- modules/llama_cpp_plugin/tests/CMakeLists.txt | 37 - .../llama_cpp_plugin/tests/e2e/CMakeLists.txt | 18 + .../tests/e2e/prompt_response.cpp | 63 + .../tests/e2e/set_device_name.cpp | 13 + modules/llama_cpp_plugin/tools/CMakeLists.txt | 3 +- 9 files changed, 1183 insertions(+), 940 deletions(-) delete mode 100644 modules/llama_cpp_plugin/tests/CMakeLists.txt create mode 100644 modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt create mode 100644 modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp create mode 100644 modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt index f5d3284b2..1385eea5d 100644 --- a/modules/llama_cpp_plugin/CMakeLists.txt +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -18,10 +18,7 @@ add_subdirectory(third_party/llama.cpp) if(ENABLE_TESTS) include(CTest) enable_testing() - - if(ENABLE_FUNCTIONAL_TESTS) - add_subdirectory(tests/functional) - endif() + add_subdirectory(tests/e2e) endif() diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 932c0def4..85a65d7e6 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -1,732 +1,895 @@ #include "compiled_model.hpp" -#include "plugin.hpp" -#include "infer_request.hpp" + +#include #include #include #include -#include #include -namespace ov { - namespace llama_cpp_plugin { - class TensorWeightMatcher { - public: - // TODO (vshampor) implement this for faster weight node matching. 
- // Use std::list, two passes - first for full name match, second for prefix-match; remove entries from list on match - using RTInfoTensorName = std::string; - using OvNodeName = std::string; - using LlamaTensorName = std::string; - - TensorWeightMatcher(const std::shared_ptr& model, std::map tensor_names_with_shapes_to_match) { - std::multimap> intermediate_matches_map; - - const auto node_vector = model->get_ops(); - std::list> const_nodes_in_model; - for (const auto& node_ptr : node_vector) { - if (ov::is_type(node_ptr)) const_nodes_in_model.push_back(ov::as_type_ptr(node_ptr)); - } - - // full substring match pass - std::map unmatched_rt_info_names_on_first_pass = extract_matches(intermediate_matches_map, tensor_names_with_shapes_to_match, const_nodes_in_model, - [](const std::string& substring, const std::string& source) { return source.find(substring) != std::string::npos; }); - - // prefix substring match pass - std::map unmatched_rt_info_names_on_second_pass = extract_matches(intermediate_matches_map, unmatched_rt_info_names_on_first_pass, const_nodes_in_model, - [](const std::string& substring, const std::string& source) { - return source.find(get_weight_name_without_torch_postfix(substring)) != std::string::npos; }); - - for (auto it = intermediate_matches_map.begin(); it != intermediate_matches_map.end(); it = intermediate_matches_map.upper_bound(it->first)) { - // TODO: perf improvement by iterating with ++; - RTInfoTensorName rt_info_name = it->first; - if (intermediate_matches_map.count(rt_info_name) != 1) { - std::cout << "VSHAMPOR: multiple matches for weight name " << rt_info_name << " and shape " << it->second->get_shape().to_string() << ", found "; - auto range_it_pair = intermediate_matches_map.equal_range(rt_info_name); - for (auto multimatch_it = range_it_pair.first; multimatch_it != range_it_pair.second; multimatch_it++) { - auto node_ptr = multimatch_it->second; - std::cout << node_ptr->get_friendly_name() << "(shape " << node_ptr->get_shape().to_string() << "),"; - } - std::cout << "will take the first match" << std::endl; - } - const auto& match = intermediate_matches_map.find(rt_info_name)->second; - m_rtinfo_name_to_weight_node_map[rt_info_name] = match; - } - if (!unmatched_rt_info_names_on_second_pass.empty()) { - std::cout << "VSHAMPOR: did not find the weight node for " << unmatched_rt_info_names_on_second_pass.size() << " weights:" << std::endl; - } - for (const auto& unmatched_entry: unmatched_rt_info_names_on_second_pass) { - std::cout << '\t' << unmatched_entry.first << std::endl; - } - } - - std::unordered_map> get_matches() { return m_rtinfo_name_to_weight_node_map; } - - private: - std::map extract_matches(std::multimap>& output_matches_map, - const std::map& names_with_shapes_to_match, - const std::list>& search_list, - std::function name_match_predicate) { - std::map unmatched_rt_info_names; - for (const auto& pair: names_with_shapes_to_match) { - RTInfoTensorName rt_info_name = pair.first; - const ov::Shape& wanted_shape = pair.second; - bool matched = false; - for (auto it = search_list.begin(); it != search_list.end(); it++) { - auto node_ptr = *it; - const std::string& friendly_name = node_ptr->get_friendly_name(); - if (name_match_predicate(rt_info_name, friendly_name) && - node_ptr->get_shape() == wanted_shape) { - output_matches_map.insert(std::make_pair(rt_info_name, node_ptr)); - matched = true; - break; - } - } - if (!matched) unmatched_rt_info_names.insert(pair); - } - return unmatched_rt_info_names; - } - - static std::string 
get_weight_name_without_torch_postfix(std::string torch_weight_name) { - size_t idx = torch_weight_name.rfind("."); - if (idx == std::string::npos) return torch_weight_name; - return std::string(torch_weight_name, 0, idx); - } - - size_t num_exact_matches = 0; - size_t num_partial_matches = 0; - std::unordered_map> m_rtinfo_name_to_weight_node_map; - }; - - - std::vector> get_nodes_containing_name_with_shape(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { - auto ops = model->get_ops(); - std::vector> found_weight_nodes; - std::copy_if(ops.begin(), ops.end(), std::back_inserter(found_weight_nodes), - [&weight_name, &shape](const std::shared_ptr& val) { - if (!ov::is_type(val)) return false; - std::shared_ptr node_ptr = ov::as_type_ptr(val); - return val->get_friendly_name().find(weight_name) != std::string::npos && - val->get_shape() == shape; - }); - return found_weight_nodes; - } - - bool has_weight_matches(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { - std::vector> found_weight_nodes; - found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); - return !found_weight_nodes.empty(); - } - - std::string get_weight_name_without_torch_postfix(std::string torch_weight_name) { - size_t idx = torch_weight_name.rfind("."); - if (idx == std::string::npos) return torch_weight_name; - return std::string(torch_weight_name, 0, idx); - } +#include "infer_request.hpp" +#include "plugin.hpp" - bool has_partial_weight_matches(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { - std::vector> found_weight_nodes; - found_weight_nodes = get_nodes_containing_name_with_shape(model, get_weight_name_without_torch_postfix(weight_name), shape); - return !found_weight_nodes.empty(); +namespace ov { +namespace llama_cpp_plugin { +class TensorWeightMatcher { +public: + // TODO (vshampor) implement this for faster weight node matching. 
+ // Use std::list, two passes - first for full name match, second for + // prefix-match; remove entries from list on match + using RTInfoTensorName = std::string; + using OvNodeName = std::string; + using LlamaTensorName = std::string; + + TensorWeightMatcher(const std::shared_ptr& model, + std::map tensor_names_with_shapes_to_match) { + std::multimap> intermediate_matches_map; + + const auto node_vector = model->get_ops(); + std::list> const_nodes_in_model; + for (const auto& node_ptr : node_vector) { + if (ov::is_type(node_ptr)) + const_nodes_in_model.push_back(ov::as_type_ptr(node_ptr)); } - std::shared_ptr get_weight_by_name_and_shape(const std::shared_ptr& model, const std::string& weight_name, const ov::Shape& shape) { - OPENVINO_ASSERT(has_weight_matches(model, weight_name, shape)); - std::vector> found_weight_nodes; - found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); - - if (found_weight_nodes.size() > 1) { - std::cout << "VSHAMPOR: multiple matches for weight name " << weight_name << " and shape " << shape.to_string() << ", found "; - for (const auto& node_ptr : found_weight_nodes) { - std::cout << node_ptr->get_friendly_name() << "(shape " << shape.to_string() << "),"; + // full substring match pass + std::map unmatched_rt_info_names_on_first_pass = + extract_matches(intermediate_matches_map, + tensor_names_with_shapes_to_match, + const_nodes_in_model, + [](const std::string& substring, const std::string& source) { + return source.find(substring) != std::string::npos; + }); + + // prefix substring match pass + std::map unmatched_rt_info_names_on_second_pass = extract_matches( + intermediate_matches_map, + unmatched_rt_info_names_on_first_pass, + const_nodes_in_model, + [](const std::string& substring, const std::string& source) { + return source.find(get_weight_name_without_torch_postfix(substring)) != std::string::npos; + }); + + for (auto it = intermediate_matches_map.begin(); it != intermediate_matches_map.end(); + it = intermediate_matches_map.upper_bound(it->first)) { + // TODO: perf improvement by iterating with ++; + RTInfoTensorName rt_info_name = it->first; + if (intermediate_matches_map.count(rt_info_name) != 1) { + std::cout << "VSHAMPOR: multiple matches for weight name " << rt_info_name << " and shape " + << it->second->get_shape().to_string() << ", found "; + auto range_it_pair = intermediate_matches_map.equal_range(rt_info_name); + for (auto multimatch_it = range_it_pair.first; multimatch_it != range_it_pair.second; multimatch_it++) { + auto node_ptr = multimatch_it->second; + std::cout << node_ptr->get_friendly_name() << "(shape " << node_ptr->get_shape().to_string() + << "),"; } std::cout << "will take the first match" << std::endl; } - std::shared_ptr node_with_tensor = found_weight_nodes.front(); - OPENVINO_ASSERT(ov::is_type(node_with_tensor)); - std::shared_ptr const_node_ptr = ov::as_type_ptr(node_with_tensor); - return const_node_ptr; - } - - using TransposePermutation = std::pair; - - std::vector expand_front(const std::vector& vec, size_t val) { - OPENVINO_ASSERT(vec.size() < GGML_MAX_DIMS); - std::vector retval(GGML_MAX_DIMS, val); - std::copy(vec.rbegin(), vec.rend(), retval.rbegin()); - return retval; + const auto& match = intermediate_matches_map.find(rt_info_name)->second; + m_rtinfo_name_to_weight_node_map[rt_info_name] = match; } - - void write_float_plus_one(std::ofstream& out, const float* src) { - float elt = *src; - elt += 1; - out.write((const char*) &elt, sizeof(float)); + if 
(!unmatched_rt_info_names_on_second_pass.empty()) { + std::cout << "VSHAMPOR: did not find the weight node for " << unmatched_rt_info_names_on_second_pass.size() + << " weights:" << std::endl; } - - void append_tensor_data_with_transpositions(const std::string& fname, const std::vector& tensor_infos, const std::vector& tensor_data_ptrs, - const std::map& transpositions, const std::set increment_by_one_tensor_names) { - // assuming contiguous data underneath each pointer from tensor_data_ptrs - OPENVINO_ASSERT(tensor_infos.size() == tensor_data_ptrs.size()); - std::ofstream out(fname, std::ios::app | std::ios::out); - for (size_t i = 0; i < tensor_infos.size(); i++) { - const auto& tensor_info = tensor_infos[i]; - OPENVINO_ASSERT(tensor_info.type == GGML_TYPE_F32); // TODO (vshampor): writing transposed tensor data for other data types, especially lower-bitwidth; maybe use OV inference for that - - const char* ir_tensor_data = reinterpret_cast(tensor_data_ptrs[i]); - - std::string tensor_llama_name = std::string(tensor_info.name.data); - auto it = transpositions.find(tensor_llama_name); - if (it == transpositions.end()) { - // original IR tensor should not be transposed to conform to GGUF expectations, can write as-is - if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case - size_t elt_size = sizeof(float); // FP32 only for now - OPENVINO_ASSERT(!(tensor_info.size % elt_size)); - size_t num_elts = tensor_info.size / elt_size; - for (size_t elt_idx = 0; elt_idx < num_elts; elt_idx++) { - write_float_plus_one(out, ((float*) ir_tensor_data) + elt_idx); - } - } - else { - out.write(ir_tensor_data, tensor_info.size); - } - continue; - } - - if (it != transpositions.end()) { - std::vector gguf_layout_shape; - - // the shape in .ne is inverted w.r.t original export (~= IR) weight layout - for (size_t dim_idx = 0; dim_idx < tensor_info.n_dims; dim_idx++) { - gguf_layout_shape.push_back(tensor_info.ne[GGML_MAX_DIMS - 1 - tensor_info.n_dims - dim_idx]); - } - - TransposePermutation permutation = it->second; - std::vector ir_layout_shape(gguf_layout_shape); - std::swap(ir_layout_shape[permutation.first], ir_layout_shape[permutation.second]); - - std::vector ir_layout_strides(tensor_info.n_dims, 1); - - for (size_t idx = 0; idx < tensor_info.n_dims - 1 ; idx++) { - auto previous_stride_it = ir_layout_strides.rbegin() + idx; - auto stride_it = ir_layout_strides.rbegin() + idx + 1; - auto shape_it = ir_layout_shape.rbegin() + idx; - *stride_it = *shape_it * *previous_stride_it; - } - - - std::vector permuted_strides(ir_layout_strides); - std::swap(permuted_strides[permutation.first], permuted_strides[permutation.second]); - - // expand up to GGML_MAX_DIMS - std::vector gguf_layout_shape_ex = expand_front(gguf_layout_shape, 1); - // stride for unused dims will be 0, has no effect on loop because dimension idx for that dim is always 0 - permuted_strides = expand_front(permuted_strides, 0); - - - - std::cout << "VSHAMPOR: writing tensor " << tensor_info.name.data << " with size " << tensor_info.size; - std::cout << " shape (GGUF layout) "; - for (auto dim: gguf_layout_shape) std::cout << dim << ","; - std::cout << " shape (IR layout) "; - for (auto dim : ir_layout_shape) std::cout << dim << ","; - std::cout << " stride (IR layout) "; - for (auto stride : ir_layout_strides) std::cout << stride << ","; - std::cout << " stride (IR layout, transposing) "; - for (auto stride : permuted_strides) std::cout << stride << ","; - std::cout << std::endl; - - // TODO (vshampor): rewrite the 
loop below using recurrent templates? - // This relies on GGUF_MAX_DIMS == 4 and unused dims being equal to 1 - size_t current_offset = 0; - size_t element_size = sizeof(float); - size_t num_bytes_written = 0; - for (size_t dim_0 = 0; dim_0 < gguf_layout_shape_ex[0]; dim_0++) - for (size_t dim_1 = 0; dim_1 < gguf_layout_shape_ex[1]; dim_1++) - for (size_t dim_2 = 0; dim_2 < gguf_layout_shape_ex[2]; dim_2++) - for (size_t dim_3 = 0; dim_3 < gguf_layout_shape_ex[3]; dim_3++) { - current_offset = element_size * (dim_0 * permuted_strides[0] + dim_1 * permuted_strides[1] + dim_2 * permuted_strides[2] + dim_3 * permuted_strides[3]); - if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case - write_float_plus_one(out, (float*) ir_tensor_data + current_offset); - } - else { - out.write(ir_tensor_data + current_offset, element_size); - } - num_bytes_written += element_size; - } - std::cout << "VSHAMPOR: wrote " << num_bytes_written << std::endl; - OPENVINO_ASSERT(num_bytes_written == tensor_info.size); - } - } + for (const auto& unmatched_entry : unmatched_rt_info_names_on_second_pass) { + std::cout << '\t' << unmatched_entry.first << std::endl; } + } - struct ValueStorageForLifetimeExtension { - std::list kv_key_string_storage; - std::list kv_value_string_storage; - std::list> str_arr_storage; - void* store_gguf_value_vector(const std::vector& vec, gguf_type g_type) { - size_t elt_size; - switch (g_type) { - case GGUF_TYPE_UINT8: elt_size = sizeof(uint8_t); break; - case GGUF_TYPE_INT8: elt_size = sizeof(int8_t); break; - case GGUF_TYPE_UINT16: elt_size = sizeof(uint16_t); break; - case GGUF_TYPE_INT16: elt_size = sizeof(int16_t); break; - case GGUF_TYPE_UINT32: elt_size = sizeof(uint32_t); break; - case GGUF_TYPE_INT32: elt_size = sizeof(int32_t); break; - case GGUF_TYPE_FLOAT32: elt_size = sizeof(float); break; - case GGUF_TYPE_UINT64: elt_size = sizeof(uint64_t); break; - case GGUF_TYPE_INT64: elt_size = sizeof(int64_t); break; - case GGUF_TYPE_FLOAT64: elt_size = sizeof(double); break; - case GGUF_TYPE_BOOL: elt_size = sizeof(bool); break; - default: - OPENVINO_THROW("Unknown array type"); - } - size_t size_in_bytes = vec.size() * elt_size; - void* mem_ptr = new char[size_in_bytes]; - for (size_t i = 0; i < vec.size(); i++) { - switch (g_type) { - case GGUF_TYPE_UINT8: ((uint8_t*) mem_ptr)[i] = vec[i].uint8; break; - case GGUF_TYPE_INT8: ((int8_t*) mem_ptr)[i] = vec[i].int8; break; - case GGUF_TYPE_UINT16: ((uint16_t*) mem_ptr)[i] = vec[i].uint16; break; - case GGUF_TYPE_INT16: ((int16_t*) mem_ptr)[i] = vec[i].int16; break; - case GGUF_TYPE_UINT32: ((uint32_t*) mem_ptr)[i] = vec[i].uint32; break; - case GGUF_TYPE_INT32: ((int32_t*) mem_ptr)[i] = vec[i].int32; break; - case GGUF_TYPE_FLOAT32: ((float*) mem_ptr)[i] = vec[i].float32; break; - case GGUF_TYPE_UINT64: ((uint64_t*) mem_ptr)[i] = vec[i].uint64; break; - case GGUF_TYPE_INT64: ((int64_t*) mem_ptr)[i] = vec[i].int64; break; - case GGUF_TYPE_FLOAT64: ((double*) mem_ptr)[i] = vec[i].float64; break; - case GGUF_TYPE_BOOL: ((bool*) mem_ptr)[i] = vec[i].bool_; break; - default: - OPENVINO_THROW("Unknown array type"); - } - } - return mem_ptr; - } + std::unordered_map> get_matches() { + return m_rtinfo_name_to_weight_node_map; + } - ValueStorageForLifetimeExtension() = default; - ~ValueStorageForLifetimeExtension() { - for (void* ptr: non_str_raw_storage) { - delete[] (char*) ptr; +private: + std::map extract_matches( + std::multimap>& output_matches_map, + const std::map& names_with_shapes_to_match, + const 
std::list>& search_list, + std::function name_match_predicate) { + std::map unmatched_rt_info_names; + for (const auto& pair : names_with_shapes_to_match) { + RTInfoTensorName rt_info_name = pair.first; + const ov::Shape& wanted_shape = pair.second; + bool matched = false; + for (auto it = search_list.begin(); it != search_list.end(); it++) { + auto node_ptr = *it; + const std::string& friendly_name = node_ptr->get_friendly_name(); + if (name_match_predicate(rt_info_name, friendly_name) && node_ptr->get_shape() == wanted_shape) { + output_matches_map.insert(std::make_pair(rt_info_name, node_ptr)); + matched = true; + break; } } - private: - std::list non_str_raw_storage; - }; - - bool maybe_parse_single_element(gguf_type g_type, ov::Any rtmap_value, gguf_value& dst, ValueStorageForLifetimeExtension& store) { - switch (g_type) { - case GGUF_TYPE_UINT8: dst.uint8 = rtmap_value.as(); break; - case GGUF_TYPE_INT8: dst.int8 = rtmap_value.as(); ; break; - case GGUF_TYPE_UINT16: dst.uint16 = rtmap_value.as(); break; - case GGUF_TYPE_INT16: dst.int16 = rtmap_value.as(); break; - case GGUF_TYPE_UINT32: dst.uint32 = rtmap_value.as(); break; - case GGUF_TYPE_INT32: dst.int32 = rtmap_value.as(); break; - case GGUF_TYPE_FLOAT32: dst.float32 = rtmap_value.as(); break; - case GGUF_TYPE_UINT64: dst.uint64 = rtmap_value.as(); break; - case GGUF_TYPE_INT64: dst.int64 = rtmap_value.as(); break; - case GGUF_TYPE_FLOAT64: dst.float64 = rtmap_value.as(); break; - case GGUF_TYPE_BOOL: dst.bool_ = rtmap_value.as(); break; - case GGUF_TYPE_STRING: { - std::string string_value = rtmap_value.as(); - store.kv_value_string_storage.push_back(string_value); - dst.str.n = string_value.length(); - dst.str.data = (char*) store.kv_value_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below - break; - } - default: - return false; // did not parse - } - return true; // parsed successfully + if (!matched) + unmatched_rt_info_names.insert(pair); } + return unmatched_rt_info_names; + } - ov::Any get_any_associated_with_gguf_type(gguf_type g_type) { - switch (g_type) { - case GGUF_TYPE_UINT8: return ov::Any(uint8_t()); - case GGUF_TYPE_INT8: return ov::Any(int8_t()); - case GGUF_TYPE_UINT16: return ov::Any(uint16_t()); - case GGUF_TYPE_INT16: return ov::Any(int16_t()); - case GGUF_TYPE_UINT32: return ov::Any(uint32_t()); - case GGUF_TYPE_INT32: return ov::Any(int32_t()); - case GGUF_TYPE_FLOAT32: return ov::Any(float()); - case GGUF_TYPE_UINT64: return ov::Any(uint64_t()); - case GGUF_TYPE_INT64: return ov::Any(int64_t()); - case GGUF_TYPE_FLOAT64: return ov::Any(double()); - case GGUF_TYPE_BOOL: return ov::Any(bool()); - case GGUF_TYPE_STRING: return ov::Any(std::string()); - default: - OPENVINO_THROW("Unknown gguf_type to turn into ov::Any"); - } - } - - - LlamaCppModel::LlamaCppModel(const std::shared_ptr& model, - const std::shared_ptr& plugin, - const ov::SoPtr& context, - const std::shared_ptr& task_executor - ) : ICompiledModel(model, plugin, context, task_executor) { - m_model = model; - num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove - *num_tokens_processed_ptr = 0; - auto rt_info = model->get_rt_info(); - OPENVINO_ASSERT(rt_info.count("lcp_kv_params") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_kv_types") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_kv_array_types") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_tensor_name_map") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_tensor_shape_map") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_expected_tensor_shapes") != 0); - 
OPENVINO_ASSERT(rt_info.count("lcp_transpose_permutations") != 0); - - RTMap& kv_params = model->get_rt_info("lcp_kv_params"); - RTMap& kv_types = model->get_rt_info("lcp_kv_types"); - RTMap& kv_array_types = model->get_rt_info("lcp_kv_array_types"); - RTMap& tensor_name_map = model->get_rt_info("lcp_tensor_name_map"); - RTMap& tensor_shape_map = model->get_rt_info("lcp_tensor_shape_map"); - RTMap& expected_tensor_shapes_map = model->get_rt_info("lcp_expected_tensor_shapes"); - RTMap& transpose_permutations_rtmap = model->get_rt_info("lcp_transpose_permutations"); - - size_t gguf_version = model->get_rt_info("lcp_gguf_version"); - std::cout << "VSHAMPOR: parsed gguf_version " << gguf_version << std::endl; - - // kv params - OPENVINO_ASSERT(kv_params.size() == kv_types.size()); - size_t n_kv = kv_params.size(); - std::vector kv_vector; - ValueStorageForLifetimeExtension store; - - for (const auto& kv_pair: kv_params) { - gguf_kv kv; - - const auto& key = kv_pair.first; - kv.key.n = key.length(); - store.kv_key_string_storage.push_back(key); - kv.key.data = (char*) store.kv_key_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below - - uint32_t value_type = kv_types[key].as(); - gguf_type gguf_value_type = (gguf_type) value_type; - kv.type = gguf_value_type; - if (gguf_value_type != GGUF_TYPE_ARRAY) { - bool is_parsed = maybe_parse_single_element(kv.type, kv_pair.second, kv.value, store); - OPENVINO_ASSERT(is_parsed, "Invalid type of a GGUF kv-value"); - } - else { // array case - gguf_type element_type = (gguf_type) kv_array_types[key].as(); - kv.value.arr.type = element_type; - std::string serialized_array = kv_pair.second.as(); - std::stringstream ss{serialized_array}; - std::vector parsed_array; - while (!ss.eof()) { - gguf_value array_elt; - ov::Any ov_any = get_any_associated_with_gguf_type(element_type); - std::string token; ss >> token; - if (std::string(kv.key.data) == "tokenizer.ggml.merges") { - // tokenizer merges are pairs of tokens separated by whitespace, so need to read another to get a proper merge - // TODO (vshampor): think of another delimiting strategy in the rt_info and use that strategy here for more robust code - std::string another_token; ss >> another_token; - token += std::string(" ") + another_token; - ov_any = ov::Any::make(token); - } - else { - std::stringstream tok_ss{token}; - ov_any.read(tok_ss); - } - bool is_parsed = maybe_parse_single_element(element_type, ov_any, array_elt, store); - OPENVINO_ASSERT(is_parsed); - parsed_array.push_back(array_elt); - } - kv.value.arr.n = parsed_array.size(); - if (element_type == GGUF_TYPE_STRING) { - // string element has already been lifetime-extended during parsing - std::vector cstr_vector(parsed_array.size()); - for (size_t cstr_idx = 0; cstr_idx < parsed_array.size(); cstr_idx++) { - cstr_vector[cstr_idx] = parsed_array[cstr_idx].str.data; - } - store.str_arr_storage.push_back(cstr_vector); - kv.value.arr.data = store.str_arr_storage.back().data(); - } - else { - void* data_ptr = store.store_gguf_value_vector(parsed_array, element_type); - kv.value.arr.data = data_ptr; - } - } - kv_vector.push_back(kv); - } + static std::string get_weight_name_without_torch_postfix(std::string torch_weight_name) { + size_t idx = torch_weight_name.rfind("."); + if (idx == std::string::npos) + return torch_weight_name; + return std::string(torch_weight_name, 0, idx); + } - auto token_types_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { return std::string(val.key.data) == 
"tokenizer.ggml.token_type"; }); - if (token_types_kv_it != kv_vector.end()) { - auto tokens_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { return std::string(val.key.data) == "tokenizer.ggml.tokens"; }); - if (tokens_kv_it != kv_vector.end()) { - size_t expected_num_tokens = token_types_kv_it->value.arr.n; - size_t actual_num_tokens = tokens_kv_it->value.arr.n; - if (actual_num_tokens < expected_num_tokens) { - std::cout << "VSHAMPOR: detected wrong vocab serialization/deserialization (expected " << expected_num_tokens << " tokens, parsed " << actual_num_tokens << " from vocab), filling tokens with bogus values" << std::endl; - std::vector new_vocab; - // char** old_vocab_data_ptr = (char**) tokens_kv_it->value.arr.data; - // std::copy(old_vocab_data_ptr, old_vocab_data_ptr + actual_num_tokens, new_vocab.begin()); - // size_t extra_tokens_needed = expected_num_tokens - actual_num_tokens; - size_t extra_tokens_needed = expected_num_tokens; - for (size_t tok_idx = 0; tok_idx < extra_tokens_needed; tok_idx++) { - std::stringstream ss; - ss << "invalid_token_" << tok_idx; - std::string new_token = ss.str(); - store.kv_value_string_storage.push_back(new_token); - char* str_data_ptr = (char*) store.kv_value_string_storage.back().c_str(); - new_vocab.push_back(str_data_ptr); - } - OPENVINO_ASSERT(new_vocab.size() == expected_num_tokens); - store.str_arr_storage.push_back(new_vocab); - tokens_kv_it->value.arr.data = (void*) store.str_arr_storage.back().data(); - tokens_kv_it->value.arr.n = expected_num_tokens; - } + size_t num_exact_matches = 0; + size_t num_partial_matches = 0; + std::unordered_map> m_rtinfo_name_to_weight_node_map; +}; + +std::vector> get_nodes_containing_name_with_shape(const std::shared_ptr& model, + const std::string& weight_name, + const ov::Shape& shape) { + auto ops = model->get_ops(); + std::vector> found_weight_nodes; + std::copy_if(ops.begin(), + ops.end(), + std::back_inserter(found_weight_nodes), + [&weight_name, &shape](const std::shared_ptr& val) { + if (!ov::is_type(val)) + return false; + std::shared_ptr node_ptr = ov::as_type_ptr(val); + return val->get_friendly_name().find(weight_name) != std::string::npos && + val->get_shape() == shape; + }); + return found_weight_nodes; +} + +bool has_weight_matches(const std::shared_ptr& model, + const std::string& weight_name, + const ov::Shape& shape) { + std::vector> found_weight_nodes; + found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); + return !found_weight_nodes.empty(); +} + +std::string get_weight_name_without_torch_postfix(std::string torch_weight_name) { + size_t idx = torch_weight_name.rfind("."); + if (idx == std::string::npos) + return torch_weight_name; + return std::string(torch_weight_name, 0, idx); +} + +bool has_partial_weight_matches(const std::shared_ptr& model, + const std::string& weight_name, + const ov::Shape& shape) { + std::vector> found_weight_nodes; + found_weight_nodes = + get_nodes_containing_name_with_shape(model, get_weight_name_without_torch_postfix(weight_name), shape); + return !found_weight_nodes.empty(); +} + +std::shared_ptr get_weight_by_name_and_shape(const std::shared_ptr& model, + const std::string& weight_name, + const ov::Shape& shape) { + OPENVINO_ASSERT(has_weight_matches(model, weight_name, shape)); + std::vector> found_weight_nodes; + found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); + + if (found_weight_nodes.size() > 1) { + std::cout << "VSHAMPOR: multiple matches for 
weight name " << weight_name << " and shape " << shape.to_string() + << ", found "; + for (const auto& node_ptr : found_weight_nodes) { + std::cout << node_ptr->get_friendly_name() << "(shape " << shape.to_string() << "),"; + } + std::cout << "will take the first match" << std::endl; + } + std::shared_ptr node_with_tensor = found_weight_nodes.front(); + OPENVINO_ASSERT(ov::is_type(node_with_tensor)); + std::shared_ptr const_node_ptr = ov::as_type_ptr(node_with_tensor); + return const_node_ptr; +} + +using TransposePermutation = std::pair; + +std::vector expand_front(const std::vector& vec, size_t val) { + OPENVINO_ASSERT(vec.size() < GGML_MAX_DIMS); + std::vector retval(GGML_MAX_DIMS, val); + std::copy(vec.rbegin(), vec.rend(), retval.rbegin()); + return retval; +} + +void write_float_plus_one(std::ofstream& out, const float* src) { + float elt = *src; + elt += 1; + out.write((const char*)&elt, sizeof(float)); +} + +void append_tensor_data_with_transpositions(const std::string& fname, + const std::vector& tensor_infos, + const std::vector& tensor_data_ptrs, + const std::map& transpositions, + const std::set increment_by_one_tensor_names) { + // assuming contiguous data underneath each pointer from tensor_data_ptrs + OPENVINO_ASSERT(tensor_infos.size() == tensor_data_ptrs.size()); + std::ofstream out(fname, std::ios::app | std::ios::out); + for (size_t i = 0; i < tensor_infos.size(); i++) { + const auto& tensor_info = tensor_infos[i]; + OPENVINO_ASSERT(tensor_info.type == GGML_TYPE_F32); // TODO (vshampor): writing transposed tensor data for + // other data types, especially lower-bitwidth; maybe + // use OV inference for that + + const char* ir_tensor_data = reinterpret_cast(tensor_data_ptrs[i]); + + std::string tensor_llama_name = std::string(tensor_info.name.data); + auto it = transpositions.find(tensor_llama_name); + if (it == transpositions.end()) { + // original IR tensor should not be transposed to conform to GGUF + // expectations, can write as-is + if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case + size_t elt_size = sizeof(float); // FP32 only for now + OPENVINO_ASSERT(!(tensor_info.size % elt_size)); + size_t num_elts = tensor_info.size / elt_size; + for (size_t elt_idx = 0; elt_idx < num_elts; elt_idx++) { + write_float_plus_one(out, ((float*)ir_tensor_data) + elt_idx); } + } else { + out.write(ir_tensor_data, tensor_info.size); } + continue; + } - // tensors - OPENVINO_ASSERT(tensor_name_map.size() == tensor_shape_map.size()); - size_t n_tensors_in_rtinfo = tensor_name_map.size(); - std::cout << "VSHAMPOR: got request for " << n_tensors_in_rtinfo << " tensors from rt_info\n"; - - std::vector tensor_infos; - std::vector tensor_data_ptrs; - - std::map parsed_weights_to_search_for; - for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { - const std::string& llama_name = llama_name_and_rtinfo_name.first; - const std::string& rtinfo_name = llama_name_and_rtinfo_name.second.as(); - ov::Shape expected_shape = tensor_shape_map[llama_name].as(); - parsed_weights_to_search_for[rtinfo_name] = expected_shape; - } + if (it != transpositions.end()) { + std::vector gguf_layout_shape; - TensorWeightMatcher matcher{model, parsed_weights_to_search_for}; - std::unordered_map> matches = matcher.get_matches(); - std::unordered_map> llama_name_to_constant_node_map; - for (const auto& entry : tensor_name_map) { - const auto& llama_name = entry.first; - const auto& rtinfo_name = entry.second.as(); - llama_name_to_constant_node_map[llama_name] = 
matches[rtinfo_name]; + // the shape in .ne is inverted w.r.t original export (~= IR) weight + // layout + for (size_t dim_idx = 0; dim_idx < tensor_info.n_dims; dim_idx++) { + gguf_layout_shape.push_back(tensor_info.ne[GGML_MAX_DIMS - 1 - tensor_info.n_dims - dim_idx]); } - std::cout << "VSHAMPOR: requested tensors map to " << llama_name_to_constant_node_map.size() << " tensors to search in model (shared tensors considered)\n"; - - - std::list llama_name_storage; - - size_t n_tensors = 0; - - size_t offset = 0; // each tensor_info has to have a correct offset including padding, checked for in gguf_write_to_buf - for (const auto& matched_weight_pair : llama_name_to_constant_node_map) { - // Need to store the names in the list so that the passed c_str() pointers in tensor_infos to the llama names stay valid - // until they get deepcopied in gguf/llama functions - llama_name_storage.push_back(matched_weight_pair.first); - const std::string& llama_name = llama_name_storage.back(); - - auto weight_const_node_ptr = matched_weight_pair.second; - auto weight_shape = weight_const_node_ptr->get_shape(); - - // does hf-to-gguf invert all tensor dimensions with shapes > 1? - auto expected_weight_shape = ov::Shape(expected_tensor_shapes_map[llama_name].as()); - OPENVINO_ASSERT(expected_weight_shape.size() < GGML_MAX_DIMS); - gguf_tensor_info info; + TransposePermutation permutation = it->second; + std::vector ir_layout_shape(gguf_layout_shape); + std::swap(ir_layout_shape[permutation.first], ir_layout_shape[permutation.second]); - info.type = GGML_TYPE_F32; // TODO (vshampor): better type assignment based on actual element type of the Constant node + std::vector ir_layout_strides(tensor_info.n_dims, 1); - info.name.n = llama_name.length(); - info.name.data = (char*) llama_name.c_str(); // TODO (vshampor): either do this via const_cast, or will have to implement own structures for - // read-only data passing to llama_load_model_from_data - info.n_dims = weight_shape.size(); - std::fill(std::begin(info.ne), std::begin(info.ne) + GGML_MAX_DIMS, (uint64_t) 1); - - // looks like GGUF expects inverse order of dimensions when compared to e.g. 
torch and actual row-major layout, see gguf.gguf_writer.GGUFWriter.add_tensor_info - // in gguf python package - std::copy(expected_weight_shape.rbegin(), expected_weight_shape.rend(), info.ne); - - void* data_ptr = (void*)(weight_const_node_ptr->get_data_ptr()); // TODO (vshampor): danger - casts `const` away - // also - the expected_weight_shape is in general different from actual ov::Tensor shape, - // in particular it may be transposed, so we actually need to set the pointers to shape-corrected - // tensor storage, which we don't do here - we are only preparing this data to get a convenient - // gguf_context object to reuse metadata (header) writing code, tensor data transpositions will be done during - // actual file write - - info.size = weight_const_node_ptr->get_byte_size(); - info.offset = offset; - - const size_t size_pad = GGML_PAD(info.size, GGUF_DEFAULT_ALIGNMENT); - offset += size_pad; - - info.data = data_ptr; - - tensor_infos.push_back(info); - tensor_data_ptrs.push_back(data_ptr); - n_tensors++; + for (size_t idx = 0; idx < tensor_info.n_dims - 1; idx++) { + auto previous_stride_it = ir_layout_strides.rbegin() + idx; + auto stride_it = ir_layout_strides.rbegin() + idx + 1; + auto shape_it = ir_layout_shape.rbegin() + idx; + *stride_it = *shape_it * *previous_stride_it; } - std::cout << "VSHAMPOR: found " << matches.size() << "/" << parsed_weights_to_search_for.size() << " tensors" << std::endl; - - gguf_init_params gguf_params; - gguf_params.no_alloc = false; - gguf_params.ctx = nullptr; - - m_gguf_ctx = gguf_init_from_data(n_tensors, tensor_infos.data(), n_kv, kv_vector.data(), tensor_data_ptrs.data(), gguf_params); - - std::shared_ptr llama_plugin_ptr = std::dynamic_pointer_cast(plugin); - m_converted_gguf_file_name = llama_plugin_ptr->get_current_gguf_file_path(); - - std::cout << "VSHAMPOR: output filename is " << m_converted_gguf_file_name << std::endl; - std::cout << "VSHAMPOR: writing metadata (GGUF header) " << std::endl; - gguf_write_to_file(m_gguf_ctx, m_converted_gguf_file_name.c_str(), /* only_meta = */ true); - - std::map transpose_permutations; - - for (const auto& llama_name_and_permutation : transpose_permutations_rtmap) { - std::string permutation_str = llama_name_and_permutation.second.as(); - std::stringstream ss(permutation_str); - TransposePermutation permutation; - bool is_ok = true; - is_ok &= static_cast(ss >> permutation.first); - is_ok &= static_cast(ss >> permutation.second); - OPENVINO_ASSERT(is_ok, "failed to read permutation"); - transpose_permutations[llama_name_and_permutation.first] = permutation; + std::vector permuted_strides(ir_layout_strides); + std::swap(permuted_strides[permutation.first], permuted_strides[permutation.second]); + + // expand up to GGML_MAX_DIMS + std::vector gguf_layout_shape_ex = expand_front(gguf_layout_shape, 1); + // stride for unused dims will be 0, has no effect on loop because + // dimension idx for that dim is always 0 + permuted_strides = expand_front(permuted_strides, 0); + + std::cout << "VSHAMPOR: writing tensor " << tensor_info.name.data << " with size " << tensor_info.size; + std::cout << " shape (GGUF layout) "; + for (auto dim : gguf_layout_shape) + std::cout << dim << ","; + std::cout << " shape (IR layout) "; + for (auto dim : ir_layout_shape) + std::cout << dim << ","; + std::cout << " stride (IR layout) "; + for (auto stride : ir_layout_strides) + std::cout << stride << ","; + std::cout << " stride (IR layout, transposing) "; + for (auto stride : permuted_strides) + std::cout << stride << ","; + 
std::cout << std::endl; + + // TODO (vshampor): rewrite the loop below using recurrent templates? + // This relies on GGUF_MAX_DIMS == 4 and unused dims being equal to 1 + size_t current_offset = 0; + size_t element_size = sizeof(float); + size_t num_bytes_written = 0; + for (size_t dim_0 = 0; dim_0 < gguf_layout_shape_ex[0]; dim_0++) + for (size_t dim_1 = 0; dim_1 < gguf_layout_shape_ex[1]; dim_1++) + for (size_t dim_2 = 0; dim_2 < gguf_layout_shape_ex[2]; dim_2++) + for (size_t dim_3 = 0; dim_3 < gguf_layout_shape_ex[3]; dim_3++) { + current_offset = element_size * (dim_0 * permuted_strides[0] + dim_1 * permuted_strides[1] + + dim_2 * permuted_strides[2] + dim_3 * permuted_strides[3]); + if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case + write_float_plus_one(out, (float*)ir_tensor_data + current_offset); + } else { + out.write(ir_tensor_data + current_offset, element_size); + } + num_bytes_written += element_size; + } + std::cout << "VSHAMPOR: wrote " << num_bytes_written << std::endl; + OPENVINO_ASSERT(num_bytes_written == tensor_info.size); + } + } +} + +struct ValueStorageForLifetimeExtension { + std::list kv_key_string_storage; + std::list kv_value_string_storage; + std::list> str_arr_storage; + void* store_gguf_value_vector(const std::vector& vec, gguf_type g_type) { + size_t elt_size; + switch (g_type) { + case GGUF_TYPE_UINT8: + elt_size = sizeof(uint8_t); + break; + case GGUF_TYPE_INT8: + elt_size = sizeof(int8_t); + break; + case GGUF_TYPE_UINT16: + elt_size = sizeof(uint16_t); + break; + case GGUF_TYPE_INT16: + elt_size = sizeof(int16_t); + break; + case GGUF_TYPE_UINT32: + elt_size = sizeof(uint32_t); + break; + case GGUF_TYPE_INT32: + elt_size = sizeof(int32_t); + break; + case GGUF_TYPE_FLOAT32: + elt_size = sizeof(float); + break; + case GGUF_TYPE_UINT64: + elt_size = sizeof(uint64_t); + break; + case GGUF_TYPE_INT64: + elt_size = sizeof(int64_t); + break; + case GGUF_TYPE_FLOAT64: + elt_size = sizeof(double); + break; + case GGUF_TYPE_BOOL: + elt_size = sizeof(bool); + break; + default: + OPENVINO_THROW("Unknown array type"); + } + size_t size_in_bytes = vec.size() * elt_size; + void* mem_ptr = new char[size_in_bytes]; + for (size_t i = 0; i < vec.size(); i++) { + switch (g_type) { + case GGUF_TYPE_UINT8: + ((uint8_t*)mem_ptr)[i] = vec[i].uint8; + break; + case GGUF_TYPE_INT8: + ((int8_t*)mem_ptr)[i] = vec[i].int8; + break; + case GGUF_TYPE_UINT16: + ((uint16_t*)mem_ptr)[i] = vec[i].uint16; + break; + case GGUF_TYPE_INT16: + ((int16_t*)mem_ptr)[i] = vec[i].int16; + break; + case GGUF_TYPE_UINT32: + ((uint32_t*)mem_ptr)[i] = vec[i].uint32; + break; + case GGUF_TYPE_INT32: + ((int32_t*)mem_ptr)[i] = vec[i].int32; + break; + case GGUF_TYPE_FLOAT32: + ((float*)mem_ptr)[i] = vec[i].float32; + break; + case GGUF_TYPE_UINT64: + ((uint64_t*)mem_ptr)[i] = vec[i].uint64; + break; + case GGUF_TYPE_INT64: + ((int64_t*)mem_ptr)[i] = vec[i].int64; + break; + case GGUF_TYPE_FLOAT64: + ((double*)mem_ptr)[i] = vec[i].float64; + break; + case GGUF_TYPE_BOOL: + ((bool*)mem_ptr)[i] = vec[i].bool_; + break; + default: + OPENVINO_THROW("Unknown array type"); } - - std::set gemma_tensor_names_to_increment; - // FIXME (vshampor): tried setting up commands for incrementing *_norm.weight values by 1 like it is done - // during llama.cpp HF-to-GGUF export, but it seems that it isn't necessary and IR stores the incremented weights already - // Is this due to constant folding? 
- - // for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { - // const std::string& llama_name = llama_name_and_rtinfo_name.first; - // const std::string& rtinfo_name = llama_name_and_rtinfo_name.second.as(); - // std::string gemma_norm_suffix = "norm.weight"; - // if (rtinfo_name.size() < gemma_norm_suffix.size()) continue; - // if (rtinfo_name.substr(rtinfo_name.size() - gemma_norm_suffix.size()) == gemma_norm_suffix) gemma_tensor_names_to_increment.insert(llama_name); - // } - - std::cout << "VSHAMPOR: writing tensor data (blob with transpositions) " << std::endl; - append_tensor_data_with_transpositions(m_converted_gguf_file_name, tensor_infos, tensor_data_ptrs, transpose_permutations, gemma_tensor_names_to_increment); - std::cout << "VSHAMPOR: write finished." << m_converted_gguf_file_name << std::endl; - - std::cout << "VSHAMPOR: loading llama model from written file..." << std::endl; - llama_model_params mparams = llama_model_default_params(); - mparams.n_gpu_layers = 99; - m_llama_model_ptr = llama_load_model_from_file(m_converted_gguf_file_name.c_str(), mparams); - llama_context_params cparams = llama_context_default_params(); - m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); - - std::cout << "VSHAMPOR: llama model loaded successfully..." << std::endl; } + return mem_ptr; + } - - LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model, std::istream& input_stream, const std::shared_ptr& plugin) : - ICompiledModel(ov_model, plugin) { - num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove - *num_tokens_processed_ptr = 0; - std::shared_ptr llama_plugin = std::dynamic_pointer_cast(plugin); - std::string current_file_path = llama_plugin->get_current_gguf_file_path(); - std::ofstream output_stream(current_file_path, std::ios::binary); - output_stream << input_stream.rdbuf(); - - - std::cout << "VSHAMPOR: loading llama model from imported and re-written file..." << std::endl; - llama_model_params mparams = llama_model_default_params(); - mparams.n_gpu_layers = 99; - m_llama_model_ptr = llama_load_model_from_file(current_file_path.c_str(), mparams); - llama_context_params cparams = llama_context_default_params(); - m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); - std::cout << "VSHAMPOR: llama model loaded successfully from cache..." << std::endl; + ValueStorageForLifetimeExtension() = default; + ~ValueStorageForLifetimeExtension() { + for (void* ptr : non_str_raw_storage) { + delete[](char*) ptr; } + } - LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin) : - ICompiledModel(nullptr, plugin) { - num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove - *num_tokens_processed_ptr = 0; - std::cout << "VSHAMPOR: loading llama model directly from GGUF... " << std::endl; - llama_model_params mparams = llama_model_default_params(); - mparams.n_gpu_layers = 99; - m_llama_model_ptr = llama_load_model_from_file(gguf_fname.c_str(), mparams); - llama_context_params cparams = llama_context_default_params(); - m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); - std::cout << "VSHAMPOR: llama model loaded successfully from GGUF..." 
<< std::endl; - - auto input_ids = std::make_shared(ov::element::Type_t::i64, ov::PartialShape({-1, -1})); - auto fake_convert = std::make_shared(input_ids->output(0), ov::element::Type_t::f32); - auto logits = std::make_shared(fake_convert->output(0)); - - ov::ParameterVector inputs{input_ids}; - - std::vector> unused_names_in_order = { { "attention_mask", ov::element::Type_t::i64 }, - { "position_ids", ov::element::Type_t::i64 }, - { "beam_idx", ov::element::Type_t::i32 } }; - for (const auto& descr : unused_names_in_order) { - auto unused_inp = std::make_shared(descr.second, ov::PartialShape({-1, -1})); - inputs.push_back(unused_inp); +private: + std::list non_str_raw_storage; +}; + +bool maybe_parse_single_element(gguf_type g_type, + ov::Any rtmap_value, + gguf_value& dst, + ValueStorageForLifetimeExtension& store) { + switch (g_type) { + case GGUF_TYPE_UINT8: + dst.uint8 = rtmap_value.as(); + break; + case GGUF_TYPE_INT8: + dst.int8 = rtmap_value.as(); + ; + break; + case GGUF_TYPE_UINT16: + dst.uint16 = rtmap_value.as(); + break; + case GGUF_TYPE_INT16: + dst.int16 = rtmap_value.as(); + break; + case GGUF_TYPE_UINT32: + dst.uint32 = rtmap_value.as(); + break; + case GGUF_TYPE_INT32: + dst.int32 = rtmap_value.as(); + break; + case GGUF_TYPE_FLOAT32: + dst.float32 = rtmap_value.as(); + break; + case GGUF_TYPE_UINT64: + dst.uint64 = rtmap_value.as(); + break; + case GGUF_TYPE_INT64: + dst.int64 = rtmap_value.as(); + break; + case GGUF_TYPE_FLOAT64: + dst.float64 = rtmap_value.as(); + break; + case GGUF_TYPE_BOOL: + dst.bool_ = rtmap_value.as(); + break; + case GGUF_TYPE_STRING: { + std::string string_value = rtmap_value.as(); + store.kv_value_string_storage.push_back(string_value); + dst.str.n = string_value.length(); + dst.str.data = + (char*)store.kv_value_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below + break; + } + default: + return false; // did not parse + } + return true; // parsed successfully +} + +ov::Any get_any_associated_with_gguf_type(gguf_type g_type) { + switch (g_type) { + case GGUF_TYPE_UINT8: + return ov::Any(uint8_t()); + case GGUF_TYPE_INT8: + return ov::Any(int8_t()); + case GGUF_TYPE_UINT16: + return ov::Any(uint16_t()); + case GGUF_TYPE_INT16: + return ov::Any(int16_t()); + case GGUF_TYPE_UINT32: + return ov::Any(uint32_t()); + case GGUF_TYPE_INT32: + return ov::Any(int32_t()); + case GGUF_TYPE_FLOAT32: + return ov::Any(float()); + case GGUF_TYPE_UINT64: + return ov::Any(uint64_t()); + case GGUF_TYPE_INT64: + return ov::Any(int64_t()); + case GGUF_TYPE_FLOAT64: + return ov::Any(double()); + case GGUF_TYPE_BOOL: + return ov::Any(bool()); + case GGUF_TYPE_STRING: + return ov::Any(std::string()); + default: + OPENVINO_THROW("Unknown gguf_type to turn into ov::Any"); + } +} + +LlamaCppModel::LlamaCppModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const ov::SoPtr& context, + const std::shared_ptr& task_executor) + : ICompiledModel(model, plugin, context, task_executor) { + m_model = model; + num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove + *num_tokens_processed_ptr = 0; + auto rt_info = model->get_rt_info(); + OPENVINO_ASSERT(rt_info.count("lcp_kv_params") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_kv_types") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_kv_array_types") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_tensor_name_map") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_tensor_shape_map") != 0); + OPENVINO_ASSERT(rt_info.count("lcp_expected_tensor_shapes") != 0); + 
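    // Editorial summary of the rt_info contract consumed below (semantics inferred from this constructor):
    //   lcp_kv_params              - GGUF metadata key/value pairs (array values as whitespace-joined strings)
    //   lcp_kv_types               - gguf_type code of each key's value
    //   lcp_kv_array_types         - element gguf_type for array-valued keys
    //   lcp_tensor_name_map        - llama tensor name -> IR weight (rt_info) name to search for
    //   lcp_tensor_shape_map       - llama tensor name -> shape used to match Constant nodes in the ov::Model
    //   lcp_expected_tensor_shapes - llama tensor name -> shape to declare in the generated GGUF header
    //   lcp_transpose_permutations - llama tensor name -> pair of dims to swap when dumping tensor data
    //   lcp_gguf_version           - GGUF version recorded by the exporter (currently only logged)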
OPENVINO_ASSERT(rt_info.count("lcp_transpose_permutations") != 0); + + RTMap& kv_params = model->get_rt_info("lcp_kv_params"); + RTMap& kv_types = model->get_rt_info("lcp_kv_types"); + RTMap& kv_array_types = model->get_rt_info("lcp_kv_array_types"); + RTMap& tensor_name_map = model->get_rt_info("lcp_tensor_name_map"); + RTMap& tensor_shape_map = model->get_rt_info("lcp_tensor_shape_map"); + RTMap& expected_tensor_shapes_map = model->get_rt_info("lcp_expected_tensor_shapes"); + RTMap& transpose_permutations_rtmap = model->get_rt_info("lcp_transpose_permutations"); + + size_t gguf_version = model->get_rt_info("lcp_gguf_version"); + std::cout << "VSHAMPOR: parsed gguf_version " << gguf_version << std::endl; + + // kv params + OPENVINO_ASSERT(kv_params.size() == kv_types.size()); + size_t n_kv = kv_params.size(); + std::vector kv_vector; + ValueStorageForLifetimeExtension store; + + for (const auto& kv_pair : kv_params) { + gguf_kv kv; + + const auto& key = kv_pair.first; + kv.key.n = key.length(); + store.kv_key_string_storage.push_back(key); + kv.key.data = (char*)store.kv_key_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below + + uint32_t value_type = kv_types[key].as(); + gguf_type gguf_value_type = (gguf_type)value_type; + kv.type = gguf_value_type; + if (gguf_value_type != GGUF_TYPE_ARRAY) { + bool is_parsed = maybe_parse_single_element(kv.type, kv_pair.second, kv.value, store); + OPENVINO_ASSERT(is_parsed, "Invalid type of a GGUF kv-value"); + } else { // array case + gguf_type element_type = (gguf_type)kv_array_types[key].as(); + kv.value.arr.type = element_type; + std::string serialized_array = kv_pair.second.as(); + std::stringstream ss{serialized_array}; + std::vector parsed_array; + while (!ss.eof()) { + gguf_value array_elt; + ov::Any ov_any = get_any_associated_with_gguf_type(element_type); + std::string token; + ss >> token; + if (std::string(kv.key.data) == "tokenizer.ggml.merges") { + // tokenizer merges are pairs of tokens separated by whitespace, so + // need to read another to get a proper merge + // TODO (vshampor): think of another delimiting strategy in the + // rt_info and use that strategy here for more robust code + std::string another_token; + ss >> another_token; + token += std::string(" ") + another_token; + ov_any = ov::Any::make(token); + } else { + std::stringstream tok_ss{token}; + ov_any.read(tok_ss); + } + bool is_parsed = maybe_parse_single_element(element_type, ov_any, array_elt, store); + OPENVINO_ASSERT(is_parsed); + parsed_array.push_back(array_elt); } - - m_model = std::make_shared(logits, inputs, "fake_ov_model_for_io_specification"); - - m_model->inputs()[0].set_names({"input_ids"}); - for (size_t i = 0; i < unused_names_in_order.size(); i++) { - m_model->inputs()[i + 1].set_names({unused_names_in_order[i].first}); + kv.value.arr.n = parsed_array.size(); + if (element_type == GGUF_TYPE_STRING) { + // string element has already been lifetime-extended during parsing + std::vector cstr_vector(parsed_array.size()); + for (size_t cstr_idx = 0; cstr_idx < parsed_array.size(); cstr_idx++) { + cstr_vector[cstr_idx] = parsed_array[cstr_idx].str.data; + } + store.str_arr_storage.push_back(cstr_vector); + kv.value.arr.data = store.str_arr_storage.back().data(); + } else { + void* data_ptr = store.store_gguf_value_vector(parsed_array, element_type); + kv.value.arr.data = data_ptr; } + } + kv_vector.push_back(kv); + } - m_model->outputs()[0].set_names({"logits"}); - - for (auto input : m_model->inputs()) { - 
m_fake_inputs.emplace_back(input); - } - for (auto output : m_model->outputs()) { - m_fake_outputs.emplace_back(output); + auto token_types_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { + return std::string(val.key.data) == "tokenizer.ggml.token_type"; + }); + if (token_types_kv_it != kv_vector.end()) { + auto tokens_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { + return std::string(val.key.data) == "tokenizer.ggml.tokens"; + }); + if (tokens_kv_it != kv_vector.end()) { + size_t expected_num_tokens = token_types_kv_it->value.arr.n; + size_t actual_num_tokens = tokens_kv_it->value.arr.n; + if (actual_num_tokens < expected_num_tokens) { + std::cout << "VSHAMPOR: detected wrong vocab " + "serialization/deserialization (expected " + << expected_num_tokens << " tokens, parsed " << actual_num_tokens + << " from vocab), filling tokens with bogus values" << std::endl; + std::vector new_vocab; + // char** old_vocab_data_ptr = (char**) tokens_kv_it->value.arr.data; + // std::copy(old_vocab_data_ptr, old_vocab_data_ptr + actual_num_tokens, + // new_vocab.begin()); size_t extra_tokens_needed = expected_num_tokens + // - actual_num_tokens; + size_t extra_tokens_needed = expected_num_tokens; + for (size_t tok_idx = 0; tok_idx < extra_tokens_needed; tok_idx++) { + std::stringstream ss; + ss << "invalid_token_" << tok_idx; + std::string new_token = ss.str(); + store.kv_value_string_storage.push_back(new_token); + char* str_data_ptr = (char*)store.kv_value_string_storage.back().c_str(); + new_vocab.push_back(str_data_ptr); + } + OPENVINO_ASSERT(new_vocab.size() == expected_num_tokens); + store.str_arr_storage.push_back(new_vocab); + tokens_kv_it->value.arr.data = (void*)store.str_arr_storage.back().data(); + tokens_kv_it->value.arr.n = expected_num_tokens; } } + } + // tensors + OPENVINO_ASSERT(tensor_name_map.size() == tensor_shape_map.size()); + size_t n_tensors_in_rtinfo = tensor_name_map.size(); + std::cout << "VSHAMPOR: got request for " << n_tensors_in_rtinfo << " tensors from rt_info\n"; - void LlamaCppModel::export_model(std::ostream& output_stream) const { - std::cout << "VSHAMPOR: exporting model" << std::endl; - - // FIXME (vshampor): it's a shame that loading a model from cache does not have an option to - // actually keep the already loaded model from xml and not be forced to deserialize an ov::Model - // representation from cache as well. As it stands, will need to write the whole IR into the cache entry - // along with the GGUF file. 
- // - std::stringstream xmlFile, binFile; - ov::pass::Serialize serializer(xmlFile, binFile); - serializer.run_on_model(m_model); - - auto m_constants = binFile.str(); - auto m_model = xmlFile.str(); - - auto dataSize = static_cast(m_model.size()); - output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); - output_stream.write(m_model.c_str(), dataSize); + std::vector tensor_infos; + std::vector tensor_data_ptrs; - dataSize = static_cast(m_constants.size()); - output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); - output_stream.write(reinterpret_cast(&m_constants[0]), dataSize); + std::map parsed_weights_to_search_for; + for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { + const std::string& llama_name = llama_name_and_rtinfo_name.first; + const std::string& rtinfo_name = llama_name_and_rtinfo_name.second.as(); + ov::Shape expected_shape = tensor_shape_map[llama_name].as(); + parsed_weights_to_search_for[rtinfo_name] = expected_shape; + } + TensorWeightMatcher matcher{model, parsed_weights_to_search_for}; + std::unordered_map> matches = matcher.get_matches(); + std::unordered_map> llama_name_to_constant_node_map; + for (const auto& entry : tensor_name_map) { + const auto& llama_name = entry.first; + const auto& rtinfo_name = entry.second.as(); + llama_name_to_constant_node_map[llama_name] = matches[rtinfo_name]; + } + std::cout << "VSHAMPOR: requested tensors map to " << llama_name_to_constant_node_map.size() + << " tensors to search in model (shared tensors considered)\n"; + + std::list llama_name_storage; + + size_t n_tensors = 0; + + size_t offset = 0; // each tensor_info has to have a correct offset including + // padding, checked for in gguf_write_to_buf + for (const auto& matched_weight_pair : llama_name_to_constant_node_map) { + // Need to store the names in the list so that the passed c_str() pointers + // in tensor_infos to the llama names stay valid until they get deepcopied + // in gguf/llama functions + llama_name_storage.push_back(matched_weight_pair.first); + const std::string& llama_name = llama_name_storage.back(); + + auto weight_const_node_ptr = matched_weight_pair.second; + auto weight_shape = weight_const_node_ptr->get_shape(); + + // does hf-to-gguf invert all tensor dimensions with shapes > 1? + auto expected_weight_shape = ov::Shape(expected_tensor_shapes_map[llama_name].as()); + OPENVINO_ASSERT(expected_weight_shape.size() < GGML_MAX_DIMS); + + gguf_tensor_info info; + + info.type = GGML_TYPE_F32; // TODO (vshampor): better type assignment based + // on actual element type of the Constant node + + info.name.n = llama_name.length(); + info.name.data = (char*)llama_name.c_str(); // TODO (vshampor): either do this via const_cast, or will + // have to implement own structures for read-only data + // passing to llama_load_model_from_data + info.n_dims = weight_shape.size(); + std::fill(std::begin(info.ne), std::begin(info.ne) + GGML_MAX_DIMS, (uint64_t)1); + + // looks like GGUF expects inverse order of dimensions when compared to e.g. 
+ // torch and actual row-major layout, see + // gguf.gguf_writer.GGUFWriter.add_tensor_info in gguf python package + std::copy(expected_weight_shape.rbegin(), expected_weight_shape.rend(), info.ne); + + void* data_ptr = (void*)(weight_const_node_ptr->get_data_ptr()); // TODO (vshampor): danger - casts + // `const` away also - the + // expected_weight_shape is in general + // different from actual ov::Tensor + // shape, in particular it may be + // transposed, so we actually need to set + // the pointers to shape-corrected tensor + // storage, which we don't do here - we + // are only preparing this data to get a + // convenient gguf_context object to + // reuse metadata (header) writing code, + // tensor data transpositions will be + // done during actual file write + + info.size = weight_const_node_ptr->get_byte_size(); + info.offset = offset; + + const size_t size_pad = GGML_PAD(info.size, GGUF_DEFAULT_ALIGNMENT); + offset += size_pad; + + info.data = data_ptr; + + tensor_infos.push_back(info); + tensor_data_ptrs.push_back(data_ptr); + n_tensors++; + } - std::ifstream in(m_converted_gguf_file_name, std::ios::binary); - output_stream << in.rdbuf(); - } + std::cout << "VSHAMPOR: found " << matches.size() << "/" << parsed_weights_to_search_for.size() << " tensors" + << std::endl; + + gguf_init_params gguf_params; + gguf_params.no_alloc = false; + gguf_params.ctx = nullptr; + + m_gguf_ctx = gguf_init_from_data(n_tensors, + tensor_infos.data(), + n_kv, + kv_vector.data(), + tensor_data_ptrs.data(), + gguf_params); + + std::shared_ptr llama_plugin_ptr = std::dynamic_pointer_cast(plugin); + m_converted_gguf_file_name = llama_plugin_ptr->get_current_gguf_file_path(); + + std::cout << "VSHAMPOR: output filename is " << m_converted_gguf_file_name << std::endl; + std::cout << "VSHAMPOR: writing metadata (GGUF header) " << std::endl; + gguf_write_to_file(m_gguf_ctx, + m_converted_gguf_file_name.c_str(), + /* only_meta = */ true); + + std::map transpose_permutations; + + for (const auto& llama_name_and_permutation : transpose_permutations_rtmap) { + std::string permutation_str = llama_name_and_permutation.second.as(); + std::stringstream ss(permutation_str); + TransposePermutation permutation; + bool is_ok = true; + is_ok &= static_cast(ss >> permutation.first); + is_ok &= static_cast(ss >> permutation.second); + OPENVINO_ASSERT(is_ok, "failed to read permutation"); + transpose_permutations[llama_name_and_permutation.first] = permutation; + } - std::shared_ptr LlamaCppModel::get_runtime_model() const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); - } + std::set gemma_tensor_names_to_increment; + // FIXME (vshampor): tried setting up commands for incrementing *_norm.weight + // values by 1 like it is done during llama.cpp HF-to-GGUF export, but it + // seems that it isn't necessary and IR stores the incremented weights already + // Is this due to constant folding? 
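To make the tensor_info bookkeeping above concrete: GGUF stores the dimensions in `ne` innermost-first, i.e. the IR shape reversed, and each tensor's data offset is the running total of the preceding tensor sizes rounded up to the GGUF alignment. A minimal self-contained sketch of just that bookkeeping, assuming GGML_MAX_DIMS == 4 and a 32-byte GGUF_DEFAULT_ALIGNMENT as in ggml, with FP32 tensors and purely illustrative shapes (not part of the patch):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    constexpr size_t kMaxDims = 4;  // GGML_MAX_DIMS
    constexpr size_t kAlign = 32;   // GGUF_DEFAULT_ALIGNMENT
    auto pad = [](size_t x, size_t n) { return ((x + n - 1) / n) * n; };  // same rounding as GGML_PAD

    // Hypothetical IR-layout shapes of three FP32 weight tensors.
    std::vector<std::vector<uint64_t>> ir_shapes = {{50257, 768}, {768}, {3072, 768}};

    size_t offset = 0;
    for (const auto& shape : ir_shapes) {
        uint64_t ne[kMaxDims] = {1, 1, 1, 1};
        // GGUF layout: innermost dimension first, i.e. the IR shape reversed.
        std::copy(shape.rbegin(), shape.rend(), ne);
        size_t byte_size = sizeof(float);
        for (uint64_t d : shape) byte_size *= d;
        std::printf("ne = [%llu %llu %llu %llu]  offset = %zu\n",
                    (unsigned long long)ne[0], (unsigned long long)ne[1],
                    (unsigned long long)ne[2], (unsigned long long)ne[3], offset);
        // The next tensor starts at the current size padded up to the alignment,
        // matching the GGML_PAD-based offset accumulation in the loop above.
        offset += pad(byte_size, kAlign);
    }
    return 0;
}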
+ + // for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { + // const std::string& llama_name = llama_name_and_rtinfo_name.first; + // const std::string& rtinfo_name = + // llama_name_and_rtinfo_name.second.as(); std::string + // gemma_norm_suffix = "norm.weight"; if (rtinfo_name.size() < + // gemma_norm_suffix.size()) continue; if + // (rtinfo_name.substr(rtinfo_name.size() - gemma_norm_suffix.size()) == + // gemma_norm_suffix) gemma_tensor_names_to_increment.insert(llama_name); + // } + + std::cout << "VSHAMPOR: writing tensor data (blob with transpositions) " << std::endl; + append_tensor_data_with_transpositions(m_converted_gguf_file_name, + tensor_infos, + tensor_data_ptrs, + transpose_permutations, + gemma_tensor_names_to_increment); + std::cout << "VSHAMPOR: write finished." << m_converted_gguf_file_name << std::endl; + + std::cout << "VSHAMPOR: loading llama model from written file..." << std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(m_converted_gguf_file_name.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + + std::cout << "VSHAMPOR: llama model loaded successfully..." << std::endl; +} + +LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model, + std::istream& input_stream, + const std::shared_ptr& plugin) + : ICompiledModel(ov_model, plugin) { + num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove + *num_tokens_processed_ptr = 0; + std::shared_ptr llama_plugin = std::dynamic_pointer_cast(plugin); + std::string current_file_path = llama_plugin->get_current_gguf_file_path(); + std::ofstream output_stream(current_file_path, std::ios::binary); + output_stream << input_stream.rdbuf(); + + std::cout << "VSHAMPOR: loading llama model from imported and re-written file..." << std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(current_file_path.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + std::cout << "VSHAMPOR: llama model loaded successfully from cache..." << std::endl; +} + +LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin) + : ICompiledModel(nullptr, plugin) { + num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove + *num_tokens_processed_ptr = 0; + std::cout << "VSHAMPOR: loading llama model directly from GGUF... " << std::endl; + llama_model_params mparams = llama_model_default_params(); + mparams.n_gpu_layers = 99; + m_llama_model_ptr = llama_load_model_from_file(gguf_fname.c_str(), mparams); + llama_context_params cparams = llama_context_default_params(); + m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); + std::cout << "VSHAMPOR: llama model loaded successfully from GGUF..." 
<< std::endl; + + auto input_ids = std::make_shared(ov::element::Type_t::i64, ov::PartialShape({-1, -1})); + auto fake_convert = std::make_shared(input_ids->output(0), ov::element::Type_t::f32); + auto logits = std::make_shared(fake_convert->output(0)); + + ov::ParameterVector inputs{input_ids}; + + std::vector> unused_names_in_order = { + {"attention_mask", ov::element::Type_t::i64}, + {"position_ids", ov::element::Type_t::i64}, + {"beam_idx", ov::element::Type_t::i32}}; + for (const auto& descr : unused_names_in_order) { + auto unused_inp = std::make_shared(descr.second, ov::PartialShape({-1, -1})); + inputs.push_back(unused_inp); + } - void LlamaCppModel::set_property(const ov::AnyMap& properties) { - std::cout << "VSHAMPOR: attempted to set_property (did nothing)"; - } + m_model = std::make_shared(logits, inputs, "fake_ov_model_for_io_specification"); - ov::Any LlamaCppModel::get_property(const std::string& name) const { - if (ov::supported_properties == name) { - return decltype(ov::supported_properties)::value_type(std::vector()); - } - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); - } + m_model->inputs()[0].set_names({"input_ids"}); + for (size_t i = 0; i < unused_names_in_order.size(); i++) { + m_model->inputs()[i + 1].set_names({unused_names_in_order[i].first}); + } - std::shared_ptr LlamaCppModel::create_sync_infer_request() const { - return std::make_shared(std::static_pointer_cast(shared_from_this())); - } + m_model->outputs()[0].set_names({"logits"}); - const std::vector>& LlamaCppModel::inputs() const { - return m_fake_inputs; - }; - const std::vector>& LlamaCppModel::outputs() const { - return m_fake_outputs; - }; + for (auto input : m_model->inputs()) { + m_fake_inputs.emplace_back(input); + } + for (auto output : m_model->outputs()) { + m_fake_outputs.emplace_back(output); + } +} + +void LlamaCppModel::export_model(std::ostream& output_stream) const { + std::cout << "VSHAMPOR: exporting model" << std::endl; + + // FIXME (vshampor): it's a shame that loading a model from cache does not + // have an option to actually keep the already loaded model from xml and not + // be forced to deserialize an ov::Model representation from cache as well. As + // it stands, will need to write the whole IR into the cache entry along with + // the GGUF file. 
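The cache entry that export_model below produces, and that LlamaCppPlugin::import_model later in this patch consumes, is a plain concatenation: a length-prefixed IR XML blob, a length-prefixed IR constants blob, then the raw GGUF bytes. A minimal reader sketch under that assumption; the struct and function names here are illustrative only and are not part of the plugin:

#include <cstdint>
#include <istream>
#include <iterator>
#include <string>
#include <vector>

struct LlamaCppCacheEntry {
    std::string xml;               // serialized ov::Model (IR XML)
    std::vector<char> weights;     // IR constants blob
    std::vector<char> gguf_bytes;  // converted GGUF file, appended verbatim
};

inline LlamaCppCacheEntry read_cache_entry(std::istream& in) {
    LlamaCppCacheEntry entry;
    std::uint64_t size = 0;

    in.read(reinterpret_cast<char*>(&size), sizeof(size));  // XML size prefix
    entry.xml.resize(size);
    in.read(&entry.xml[0], static_cast<std::streamsize>(size));

    in.read(reinterpret_cast<char*>(&size), sizeof(size));  // constants size prefix
    entry.weights.resize(size);
    in.read(entry.weights.data(), static_cast<std::streamsize>(size));

    // Everything after the IR is the GGUF file; the plugin writes it out to
    // <cache_dir>/current.gguf and loads it with llama_load_model_from_file.
    entry.gguf_bytes.assign(std::istreambuf_iterator<char>(in), std::istreambuf_iterator<char>());
    return entry;
}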
+ // + std::stringstream xmlFile, binFile; + ov::pass::Serialize serializer(xmlFile, binFile); + serializer.run_on_model(m_model); + + auto m_constants = binFile.str(); + auto m_model = xmlFile.str(); + + auto dataSize = static_cast(m_model.size()); + output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); + output_stream.write(m_model.c_str(), dataSize); + + dataSize = static_cast(m_constants.size()); + output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); + output_stream.write(reinterpret_cast(&m_constants[0]), dataSize); + + std::ifstream in(m_converted_gguf_file_name, std::ios::binary); + output_stream << in.rdbuf(); +} + +std::shared_ptr LlamaCppModel::get_runtime_model() const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} + +void LlamaCppModel::set_property(const ov::AnyMap& properties) { + std::cout << "VSHAMPOR: attempted to set_property (did nothing)"; +} + +ov::Any LlamaCppModel::get_property(const std::string& name) const { + if (ov::supported_properties == name) { + return decltype(ov::supported_properties)::value_type(std::vector()); } + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} + +std::shared_ptr LlamaCppModel::create_sync_infer_request() const { + return std::make_shared( + std::static_pointer_cast(shared_from_this())); +} + +const std::vector>& LlamaCppModel::inputs() const { + return m_fake_inputs; +}; +const std::vector>& LlamaCppModel::outputs() const { + return m_fake_outputs; +}; +} // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index 0993422f6..6b5e8ba1e 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -1,11 +1,12 @@ #include "infer_request.hpp" -#include "openvino/runtime/make_tensor.hpp" + #include "llama.h" +#include "openvino/runtime/make_tensor.hpp" namespace ov { - namespace llama_cpp_plugin { +namespace llama_cpp_plugin { - void allocate_tensor_impl(ov::SoPtr& tensor, +void allocate_tensor_impl(ov::SoPtr& tensor, const ov::element::Type& element_type, const ov::Shape& shape) { if (!tensor || tensor->get_element_type() != element_type) { @@ -15,97 +16,105 @@ namespace ov { } } - LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model): ov::ISyncInferRequest(compiled_model) { - std::cout << "VSHAMPOR: infer request ctor called\n"; - m_compiled_model_ptr = compiled_model; - // Allocate input/output tensors - for (const auto& input : get_inputs()) { - allocate_tensor(input, [input](ov::SoPtr& tensor) { - // Can add a check to avoid double work in case of shared tensors - allocate_tensor_impl(tensor, - input.get_element_type(), - input.get_partial_shape().is_dynamic() ? ov::Shape{0} : input.get_shape()); - }); - } - for (const auto& output : get_outputs()) { - allocate_tensor(output, [output](ov::SoPtr& tensor) { - // Can add a check to avoid double work in case of shared tensors - allocate_tensor_impl(tensor, - output.get_element_type(), - output.get_partial_shape().is_dynamic() ? 
ov::Shape{0} : output.get_shape()); - }); +LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model) + : ov::ISyncInferRequest(compiled_model) { + std::cout << "VSHAMPOR: infer request ctor called\n"; + m_compiled_model_ptr = compiled_model; + // Allocate input/output tensors + for (const auto& input : get_inputs()) { + allocate_tensor(input, [input](ov::SoPtr& tensor) { + // Can add a check to avoid double work in case of shared tensors + allocate_tensor_impl(tensor, + input.get_element_type(), + input.get_partial_shape().is_dynamic() ? ov::Shape{0} : input.get_shape()); + }); } - } - void LlamaCppSyncInferRequest::set_tensors_impl(const ov::Output port, const std::vector>& tensors) { - std::cout << "VSHAMPOR: set_tensors_impl called\n"; + for (const auto& output : get_outputs()) { + allocate_tensor(output, [output](ov::SoPtr& tensor) { + // Can add a check to avoid double work in case of shared tensors + allocate_tensor_impl(tensor, + output.get_element_type(), + output.get_partial_shape().is_dynamic() ? ov::Shape{0} : output.get_shape()); + }); } +} +void LlamaCppSyncInferRequest::set_tensors_impl(const ov::Output port, + const std::vector>& tensors) { + std::cout << "VSHAMPOR: set_tensors_impl called\n"; +} - void llama_batch_add_reimpl( - struct llama_batch & batch, - llama_token id, - llama_pos pos, - const std::vector & seq_ids, - bool logits) { - batch.token [batch.n_tokens] = id; - batch.pos [batch.n_tokens] = pos; - batch.n_seq_id[batch.n_tokens] = seq_ids.size(); - for (size_t i = 0; i < seq_ids.size(); ++i) { - batch.seq_id[batch.n_tokens][i] = seq_ids[i]; - } - batch.logits [batch.n_tokens] = logits; - - batch.n_tokens++; +void llama_batch_add_reimpl(struct llama_batch& batch, + llama_token id, + llama_pos pos, + const std::vector& seq_ids, + bool logits) { + batch.token[batch.n_tokens] = id; + batch.pos[batch.n_tokens] = pos; + batch.n_seq_id[batch.n_tokens] = seq_ids.size(); + for (size_t i = 0; i < seq_ids.size(); ++i) { + batch.seq_id[batch.n_tokens][i] = seq_ids[i]; } + batch.logits[batch.n_tokens] = logits; + + batch.n_tokens++; +} - void LlamaCppSyncInferRequest::infer() { - auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among all inputs without hardcode - OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); - OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); - size_t batch_size = input_ids_tensor_ptr->get_shape()[0]; - size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; - - // llama_batch actually contains one sequence - llama_batch batch = llama_batch_init(sequence_length, /* embd = */ 0, /* n_seq_max = */ 1); - const int64_t* data_ptr = input_ids_tensor_ptr->data(); - - const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */; - - for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) { - const int64_t token_id = sequence_start_ptr[tok_idx]; - llama_batch_add_reimpl(batch, token_id, *(m_compiled_model_ptr->num_tokens_processed_ptr), { 0 }, true); // the last `true` here is a marker that the logits for this token should be computed and returned - size_t* ptr = m_compiled_model_ptr->num_tokens_processed_ptr; - (*ptr)++; - } - - - llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; - int32_t sts = llama_decode(ctx, batch); - - if (sts != 0) { - OPENVINO_THROW("llama_decode failed with code ", sts); - } - - size_t n_vocab = llama_n_vocab(m_compiled_model_ptr->m_llama_model_ptr); - - ov::Tensor 
output_tensor{ov::element::Type_t::f32, {1, sequence_length, n_vocab}}; - float* output_tensor_data_ptr = output_tensor.data(); - - for (size_t pos = 0; pos < sequence_length; pos++) { - float* logits_from_llama = llama_get_logits_ith(ctx, pos); - std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab); - } - - auto& logit_output = get_outputs()[0]; - allocate_tensor(logit_output, [&output_tensor](ov::SoPtr& tensor) { allocate_tensor_impl(tensor, output_tensor.get_element_type(), output_tensor.get_shape()); - output_tensor.copy_to(ov::make_tensor(tensor)); }); - }; - std::vector LlamaCppSyncInferRequest::get_profiling_info() const { - std::cout << "VSHAMPOR: get_profiling_info() called\n"; - return std::vector{}; - }; - std::vector> LlamaCppSyncInferRequest::query_state() const { - std::cout << "VSHAMPOR: get_profiling_info() called\n"; - return std::vector>{}; +void LlamaCppSyncInferRequest::infer() { + auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among + // all inputs without hardcode + OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); + OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); + size_t batch_size = input_ids_tensor_ptr->get_shape()[0]; + size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; + + // llama_batch actually contains one sequence + llama_batch batch = llama_batch_init(sequence_length, /* embd = */ 0, /* n_seq_max = */ 1); + const int64_t* data_ptr = input_ids_tensor_ptr->data(); + + const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */; + + for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) { + const int64_t token_id = sequence_start_ptr[tok_idx]; + llama_batch_add_reimpl(batch, + token_id, + *(m_compiled_model_ptr->num_tokens_processed_ptr), + {0}, + true); // the last `true` here is a marker that the logits for this + // token should be computed and returned + size_t* ptr = m_compiled_model_ptr->num_tokens_processed_ptr; + (*ptr)++; } + + llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; + int32_t sts = llama_decode(ctx, batch); + + if (sts != 0) { + OPENVINO_THROW("llama_decode failed with code ", sts); } + + size_t n_vocab = llama_n_vocab(m_compiled_model_ptr->m_llama_model_ptr); + + ov::Tensor output_tensor{ov::element::Type_t::f32, {1, sequence_length, n_vocab}}; + float* output_tensor_data_ptr = output_tensor.data(); + + for (size_t pos = 0; pos < sequence_length; pos++) { + float* logits_from_llama = llama_get_logits_ith(ctx, pos); + std::copy(logits_from_llama, logits_from_llama + n_vocab, output_tensor_data_ptr + pos * n_vocab); + } + + auto& logit_output = get_outputs()[0]; + allocate_tensor(logit_output, [&output_tensor](ov::SoPtr& tensor) { + allocate_tensor_impl(tensor, output_tensor.get_element_type(), output_tensor.get_shape()); + output_tensor.copy_to(ov::make_tensor(tensor)); + }); +}; +std::vector LlamaCppSyncInferRequest::get_profiling_info() const { + std::cout << "VSHAMPOR: get_profiling_info() called\n"; + return std::vector{}; +}; +std::vector> LlamaCppSyncInferRequest::query_state() const { + std::cout << "VSHAMPOR: get_profiling_info() called\n"; + return std::vector>{}; +} +} // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp index 9f633426f..3e23c568f 100644 --- a/modules/llama_cpp_plugin/src/plugin.cpp +++ b/modules/llama_cpp_plugin/src/plugin.cpp @@ -1,151 +1,169 
@@ #include "plugin.hpp" + +#include + #include "compiled_model.hpp" #include "openvino/op/constant.hpp" -#include #include "openvino/runtime/internal_properties.hpp" - namespace { static constexpr const char* wait_executor_name = "LlamaCppWaitExecutor"; static constexpr const char* stream_executor_name = "LlamaCppStreamsExecutor"; static constexpr const char* template_exclusive_executor = "LlamaCppExecutor"; } // namespace - namespace ov { - namespace llama_cpp_plugin { - LlamaCppPlugin::LlamaCppPlugin() : IPlugin() { - set_device_name("LLAMA_CPP"); - } - std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, - const ov::AnyMap& properties) const { - std::cout << "VSHAMPOR: LlamaCppPlugin::compile_model" << std::endl; - - //std::string gpt2_node_name = "transformer.h.9.attn.c_proj.weight"; - //std::cout << "VSHAMPOR: sanity check - looking for node containing " << gpt2_node_name << std::endl; - //auto ops = model->get_ops(); - //auto iter = std::find_if(ops.begin(), ops.end(), [gpt2_node_name](const std::shared_ptr& val) { - // return val->get_friendly_name().find(gpt2_node_name) != std::string::npos; }); - //if (iter == ops.end()) { - // std::cout << "VSHAMPOR: did not find the node\n"; - //} else { - // std::shared_ptr node_with_tensor = *iter; - // std::cout << "VSHAMPOR: node type is " << node_with_tensor->get_type_name() << std::endl; - // std::shared_ptr const_node_ptr = ov::as_type_ptr(node_with_tensor); - // const float* data_ptr = const_node_ptr->get_data_ptr(); - // // ov::descriptor::Tensor& tensor_descr = node_with_tensor->get_output_tensor(0); - // // std::cout << "VSHAMPOR: node output tensor shape is " << tensor_descr.get_shape().to_string() << std::endl; - // // ov::TensorVector in, out; - // // node_with_tensor->evaluate(out, in); - // // std::cout << "VSHAMPOR: evaluated " << out.size() << " output tensors\n"; - // // if (!out.empty()) { - // // const ov::Tensor& tensor = out[0]; - // // const float* vals = tensor.data(); - // // std::cout << "VSHAMPOR: first elements of the weight tensor are "; - // // for (size_t i = 0; i < 10; i++) { - // // std::cout << vals[i] << " "; - // // } - // // std::cout << std::endl; - // // } - // std::cout << "VSHAMPOR: first elements of the weight tensor are "; - // for (size_t i = 0; i < 10; i++) { - // std::cout << data_ptr[i] << " "; - // } - // std::cout << std::endl; - //} - return compile_model(model, properties, {}); - } - - std::shared_ptr LlamaCppPlugin::compile_model(const std::string& fname, const ov::AnyMap& properties) const { - return std::make_shared(fname, shared_from_this()); - } - std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, - const ov::AnyMap& properties, - const ov::SoPtr& context) const { - std::cout << "VSHAMPOR: compile_model called in C++" << std::endl; - return std::make_shared(model->clone(), shared_from_this(), context, get_executor_manager()->get_executor(template_exclusive_executor)); - } - - void LlamaCppPlugin::set_property(const ov::AnyMap& properties) { - for (const auto& map_entry : properties) { - if (map_entry.first == ov::cache_dir.name()) { - m_cache_dir = map_entry.second.as(); - } - else { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: setting property ", map_entry.first, "not implemented"); - } - } - } - - ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap& arguments) const { - if (ov::supported_properties == name) { - return decltype(ov::supported_properties)::value_type(std::vector({ov::cache_dir, 
ov::device::capabilities, ov::device::full_name})); - } - if (ov::device::capabilities == name) { - return decltype(ov::device::capabilities)::value_type(std::vector({ov::device::capability::EXPORT_IMPORT})); - } - if (ov::internal::supported_properties == name) { - return decltype(ov::internal::supported_properties)::value_type(std::vector({ov::internal::caching_properties})); - } - - if (ov::cache_dir == name) { - return m_cache_dir; - } - if (ov::internal::caching_properties == name) { - return std::vector{ov::device::full_name}; - } - - if (ov::device::full_name == name) { - return std::string("LLAMA_CPP"); - } - - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +namespace llama_cpp_plugin { +LlamaCppPlugin::LlamaCppPlugin() : IPlugin() { + set_device_name("LLAMA_CPP"); +} +std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const { + std::cout << "VSHAMPOR: LlamaCppPlugin::compile_model" << std::endl; + + // std::string gpt2_node_name = "transformer.h.9.attn.c_proj.weight"; + // std::cout << "VSHAMPOR: sanity check - looking for node containing " << + // gpt2_node_name << std::endl; auto ops = model->get_ops(); auto iter = + // std::find_if(ops.begin(), ops.end(), [gpt2_node_name](const + // std::shared_ptr& val) { + // return val->get_friendly_name().find(gpt2_node_name) != + // std::string::npos; }); + // if (iter == ops.end()) { + // std::cout << "VSHAMPOR: did not find the node\n"; + //} else { + // std::shared_ptr node_with_tensor = *iter; + // std::cout << "VSHAMPOR: node type is " << + // node_with_tensor->get_type_name() << std::endl; + // std::shared_ptr const_node_ptr = + // ov::as_type_ptr(node_with_tensor); const float* + // data_ptr = const_node_ptr->get_data_ptr(); + // // ov::descriptor::Tensor& tensor_descr = + // node_with_tensor->get_output_tensor(0); + // // std::cout << "VSHAMPOR: node output tensor shape is " << + // tensor_descr.get_shape().to_string() << std::endl; + // // ov::TensorVector in, out; + // // node_with_tensor->evaluate(out, in); + // // std::cout << "VSHAMPOR: evaluated " << out.size() << " output + // tensors\n"; + // // if (!out.empty()) { + // // const ov::Tensor& tensor = out[0]; + // // const float* vals = tensor.data(); + // // std::cout << "VSHAMPOR: first elements of the weight tensor are + // "; + // // for (size_t i = 0; i < 10; i++) { + // // std::cout << vals[i] << " "; + // // } + // // std::cout << std::endl; + // // } + // std::cout << "VSHAMPOR: first elements of the weight tensor are "; + // for (size_t i = 0; i < 10; i++) { + // std::cout << data_ptr[i] << " "; + // } + // std::cout << std::endl; + //} + return compile_model(model, properties, {}); +} + +std::shared_ptr LlamaCppPlugin::compile_model(const std::string& fname, + const ov::AnyMap& properties) const { + return std::make_shared(fname, shared_from_this()); +} +std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, + const ov::AnyMap& properties, + const ov::SoPtr& context) const { + std::cout << "VSHAMPOR: compile_model called in C++" << std::endl; + return std::make_shared(model->clone(), + shared_from_this(), + context, + get_executor_manager()->get_executor(template_exclusive_executor)); +} + +void LlamaCppPlugin::set_property(const ov::AnyMap& properties) { + for (const auto& map_entry : properties) { + if (map_entry.first == ov::cache_dir.name()) { + m_cache_dir = map_entry.second.as(); + } else { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: setting property ", map_entry.first, 
"not implemented"); } + } +} - ov::SoPtr LlamaCppPlugin::create_context(const ov::AnyMap& remote_properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); - } - ov::SoPtr LlamaCppPlugin::get_default_context(const ov::AnyMap& remote_properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); - } - std::shared_ptr LlamaCppPlugin::import_model(std::istream& model_file_stream, - const ov::AnyMap& properties) const { - std::cout << "VSHAMPOR: importing model" << '\n'; - std::cout << "VSHAMPOR: deserializing ov::Model first" << '\n'; - // read XML content - std::string xmlString; - std::uint64_t dataSize = 0; - model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); - xmlString.resize(dataSize); - model_file_stream.read(const_cast(xmlString.c_str()), dataSize); - - // read blob content - ov::Tensor weights; - model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); - if (0 != dataSize) { - weights = ov::Tensor(ov::element::from(), ov::Shape{static_cast(dataSize)}); - model_file_stream.read(weights.data(), dataSize); - } - - auto ov_model = get_core()->read_model(xmlString, weights); - std::cout << "VSHAMPOR: ov::Model deserialized, passing the rest of the stream to LlamaCppModel ctor" << '\n'; - return std::make_shared(ov_model, model_file_stream, shared_from_this()); - } +ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap& arguments) const { + if (ov::supported_properties == name) { + return decltype(ov::supported_properties)::value_type( + std::vector({ov::cache_dir, ov::device::capabilities, ov::device::full_name})); + } + if (ov::device::capabilities == name) { + return decltype(ov::device::capabilities)::value_type( + std::vector({ov::device::capability::EXPORT_IMPORT})); + } + if (ov::internal::supported_properties == name) { + return decltype(ov::internal::supported_properties)::value_type( + std::vector({ov::internal::caching_properties})); + } - const std::string CURRENT_GGUF_FILE_NAME = "current.gguf"; - std::string LlamaCppPlugin::get_current_gguf_file_path() const { return m_cache_dir + "/" + CURRENT_GGUF_FILE_NAME; } + if (ov::cache_dir == name) { + return m_cache_dir; + } + if (ov::internal::caching_properties == name) { + return std::vector{ov::device::full_name}; + } - std::shared_ptr LlamaCppPlugin::import_model(std::istream& model, - const ov::SoPtr& context, - const ov::AnyMap& properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); - } + if (ov::device::full_name == name) { + return std::string("LLAMA_CPP"); + } - ov::SupportedOpsMap LlamaCppPlugin::query_model(const std::shared_ptr& model, - const ov::AnyMap& properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); - } + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} + +ov::SoPtr LlamaCppPlugin::create_context(const ov::AnyMap& remote_properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} +ov::SoPtr LlamaCppPlugin::get_default_context(const ov::AnyMap& remote_properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} +std::shared_ptr LlamaCppPlugin::import_model(std::istream& model_file_stream, + const ov::AnyMap& properties) const { + std::cout << "VSHAMPOR: importing model" << '\n'; + std::cout << "VSHAMPOR: deserializing ov::Model first" << '\n'; + // read XML content + std::string xmlString; + std::uint64_t dataSize = 0; + model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); 
+ xmlString.resize(dataSize); + model_file_stream.read(const_cast(xmlString.c_str()), dataSize); + + // read blob content + ov::Tensor weights; + model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); + if (0 != dataSize) { + weights = ov::Tensor(ov::element::from(), ov::Shape{static_cast(dataSize)}); + model_file_stream.read(weights.data(), dataSize); } + + auto ov_model = get_core()->read_model(xmlString, weights); + std::cout << "VSHAMPOR: ov::Model deserialized, passing the rest of the " + "stream to LlamaCppModel ctor" + << '\n'; + return std::make_shared(ov_model, model_file_stream, shared_from_this()); +} + +const std::string CURRENT_GGUF_FILE_NAME = "current.gguf"; +std::string LlamaCppPlugin::get_current_gguf_file_path() const { + return m_cache_dir + "/" + CURRENT_GGUF_FILE_NAME; +} + +std::shared_ptr LlamaCppPlugin::import_model(std::istream& model, + const ov::SoPtr& context, + const ov::AnyMap& properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} + +ov::SupportedOpsMap LlamaCppPlugin::query_model(const std::shared_ptr& model, + const ov::AnyMap& properties) const { + OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); +} +} // namespace llama_cpp_plugin } // namespace ov static const ov::Version version = {CI_BUILD_NUMBER, "llama_cpp_plugin"}; diff --git a/modules/llama_cpp_plugin/tests/CMakeLists.txt b/modules/llama_cpp_plugin/tests/CMakeLists.txt deleted file mode 100644 index 11648c2bd..000000000 --- a/modules/llama_cpp_plugin/tests/CMakeLists.txt +++ /dev/null @@ -1,37 +0,0 @@ -set(TARGET_NAME llama_cpp_plugin_func_tests) - -if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - ov_add_compiler_flags(/wd4305) -endif() - -ov_add_test_target( - NAME ${TARGET_NAME} - ROOT ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDENCIES - openvino_template_plugin - LINK_LIBRARIES - openvino::funcSharedTests - openvino::runtime::dev - INCLUDES - "${OpenVINOTemplatePlugin_SOURCE_DIR}/include" - "${CMAKE_CURRENT_SOURCE_DIR}/op_reference" - ADD_CLANG_FORMAT - LABELS - OV UNIT TEMPLATE -) - -find_package(OpenCV QUIET COMPONENTS core imgproc) - -if(OpenCV_FOUND AND OpenCV_VERSION VERSION_GREATER_EQUAL 3.4) - message(STATUS "Reference preprocessing: OpenCV tests are enabled") - target_compile_definitions(${TARGET_NAME} PRIVATE OPENCV_TEMPLATE_TESTS) - target_link_libraries(${TARGET_NAME} PRIVATE opencv_imgproc opencv_core) -else() - message(WARNING "Reference preprocessing: OpenCV tests are disabled, because OpenCV ver. 
3.4+ is not found") -endif() - -if (ENABLE_INTEL_CPU) - set_source_files_properties( - "${CMAKE_CURRENT_SOURCE_DIR}/shared_tests_instances/behavior/executable_network/get_metric.cpp" - PROPERTIES COMPILE_DEFINITIONS ENABLE_INTEL_CPU=1) -endif() diff --git a/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt new file mode 100644 index 000000000..4c16f3484 --- /dev/null +++ b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt @@ -0,0 +1,18 @@ + +set(TARGET_NAME llama_cpp_e2e_tests) + +ov_add_test_target( + NAME ${TARGET_NAME} + ROOT ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDENCIES + llama_cpp_plugin + LINK_LIBRARIES + openvino::runtime::dev + openvino::funcSharedTests + INCLUDES + "${OpenVINOTemplatePlugin_SOURCE_DIR}/include" + ADD_CLANG_FORMAT + LABELS + OV UNIT TEMPLATE +) + diff --git a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp new file mode 100644 index 000000000..f4e0369c5 --- /dev/null +++ b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp @@ -0,0 +1,63 @@ +#include +#include "openvino/openvino.hpp" +#include "common_test_utils/file_utils.hpp" + +const std::string TEST_FILES_DIR = "test_data"; + +// "Why is the Sun yellow?" +const std::vector GPT2_PROMPT_TOKEN_IDS = {5195, 318, 262, 3825, 7872, 30}; +// "The Sun is a bright red, which means it is a bright red. The Sun is a bright red because it is a bright red." +const std::vector GPT2_REFERENCE_RESPONSE_TOKEN_IDS = {198, 464, 3825, 318, 257, 6016, 2266, 11, 543, 1724, 340, 318, 257, 6016, 2266, 13, 383, 3825, 318, 257, 6016, 2266, 780, 340, 318, 257, 6016, 2266, 13, 198, 198, 464}; + +const auto SEP = ov::util::FileTraits::file_separator; + +TEST(PromptResponseTest, TestGPT2) { + const std::string plugin_name = "LLAMA_CPP"; + ov::Core core; + + const std::string model_file_name = "gpt2.gguf"; + const std::string model_file = ov::test::utils::getCurrentWorkingDir() + SEP + TEST_FILES_DIR + SEP + model_file_name; + ov::InferRequest lm = core.compile_model(model_file, plugin_name).create_infer_request(); + auto input_ids_tensor = ov::Tensor(ov::element::Type_t::i64, {1, GPT2_PROMPT_TOKEN_IDS.size()}); + std::copy(GPT2_PROMPT_TOKEN_IDS.begin(), GPT2_PROMPT_TOKEN_IDS.end(), input_ids_tensor.data()); + lm.set_tensor("input_ids", input_ids_tensor); + lm.set_tensor("attention_mask", ov::Tensor(ov::element::Type_t::i64, {1, GPT2_PROMPT_TOKEN_IDS.size()})); + ov::Tensor position_ids = lm.get_tensor("position_ids"); + position_ids.set_shape(input_ids_tensor.get_shape()); + std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + + constexpr size_t BATCH_SIZE = 1; + lm.get_tensor("beam_idx").set_shape({BATCH_SIZE}); + lm.get_tensor("beam_idx").data()[0] = 0; + + lm.infer(); + + size_t vocab_size = lm.get_tensor("logits").get_shape().back(); + float* logits = lm.get_tensor("logits").data() + (input_ids_tensor.get_size() - 1) * vocab_size; + int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; + + lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1}); + position_ids.set_shape({BATCH_SIZE, 1}); + + size_t cnt = 0; + std::vector out_token_ids; + + while (cnt < GPT2_REFERENCE_RESPONSE_TOKEN_IDS.size()) { + lm.get_tensor("input_ids").data()[0] = out_token; + lm.get_tensor("attention_mask").set_shape({BATCH_SIZE, lm.get_tensor("attention_mask").get_shape().at(1) + 1}); + std::fill_n(lm.get_tensor("attention_mask").data(), lm.get_tensor("attention_mask").get_size(), 1); + 
position_ids.data()[0] = int64_t(lm.get_tensor("attention_mask").get_size() - 2); + lm.start_async(); + lm.wait(); + logits = lm.get_tensor("logits").data(); + out_token = std::max_element(logits, logits + vocab_size) - logits; + out_token_ids.push_back(out_token); + cnt++; + } + + lm.reset_state(); + + ASSERT_EQ(out_token_ids, GPT2_REFERENCE_RESPONSE_TOKEN_IDS); +} + + diff --git a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp new file mode 100644 index 000000000..df796aacb --- /dev/null +++ b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp @@ -0,0 +1,13 @@ +#include +#include + +namespace ov { +namespace test { +void set_device_suffix(const std::string& suffix) { + if (!suffix.empty()) { + throw std::runtime_error("The suffix can't be used for TEMPLATE device!"); + } +} +} // namespace test +} // namespace ov + diff --git a/modules/llama_cpp_plugin/tools/CMakeLists.txt b/modules/llama_cpp_plugin/tools/CMakeLists.txt index 4a37341b8..5209d5ca9 100644 --- a/modules/llama_cpp_plugin/tools/CMakeLists.txt +++ b/modules/llama_cpp_plugin/tools/CMakeLists.txt @@ -18,5 +18,4 @@ target_link_libraries(tensor_comparator PRIVATE ggml) add_executable(cache_embedder "${CMAKE_CURRENT_SOURCE_DIR}/cache_embedder.cpp" ) - -target_compile_options(cache_embedder PUBLIC "--std=c++17") +set_target_properties(cache_embedder PROPERTIES CXX_STANDARD 17) From 609eed9fc0d4d23da90bf1491bf58d1ad30f3735 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Tue, 12 Mar 2024 18:30:03 +0100 Subject: [PATCH 03/27] Remove unused code --- modules/llama_cpp_plugin/CMakeLists.txt | 1 - .../include/compiled_model.hpp | 6 +- .../include/infer_request.hpp | 4 +- modules/llama_cpp_plugin/include/plugin.hpp | 65 +- .../llama_cpp_plugin/src/compiled_model.cpp | 822 +----------------- .../llama_cpp_plugin/src/infer_request.cpp | 21 +- modules/llama_cpp_plugin/src/plugin.cpp | 85 +- .../tests/e2e/set_device_name.cpp | 2 +- modules/llama_cpp_plugin/tools/CMakeLists.txt | 21 - .../llama_cpp_plugin/tools/cache_embedder.cpp | 53 -- modules/llama_cpp_plugin/tools/runner.cpp | 73 -- .../tools/tensor_comparator.cpp | 95 -- 12 files changed, 50 insertions(+), 1198 deletions(-) delete mode 100644 modules/llama_cpp_plugin/tools/CMakeLists.txt delete mode 100644 modules/llama_cpp_plugin/tools/cache_embedder.cpp delete mode 100644 modules/llama_cpp_plugin/tools/runner.cpp delete mode 100644 modules/llama_cpp_plugin/tools/tensor_comparator.cpp diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt index 1385eea5d..89c5d4e0e 100644 --- a/modules/llama_cpp_plugin/CMakeLists.txt +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -11,7 +11,6 @@ if(CMAKE_COMPILER_IS_GNUCXX) endif() add_subdirectory(src) -add_subdirectory(tools) add_subdirectory(third_party/llama.cpp) diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp index eb785e252..1ae79f12e 100644 --- a/modules/llama_cpp_plugin/include/compiled_model.hpp +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -55,6 +55,7 @@ namespace ov { virtual ov::Any get_property(const std::string& name) const override; virtual const std::vector>& inputs() const override; virtual const std::vector>& outputs() const override; + virtual ~LlamaCppModel(); protected: /** * @brief Method creates infer request implementation @@ -64,14 +65,13 @@ namespace ov { virtual std::shared_ptr create_sync_infer_request() const override; private: - 
std::string get_current_gguf_file_path() const; gguf_context* m_gguf_ctx = nullptr; - std::string m_converted_gguf_file_name; + std::string m_gguf_fname; llama_model* m_llama_model_ptr = nullptr; llama_context* m_llama_ctx = nullptr; - size_t* num_tokens_processed_ptr = nullptr; // TODO: (vshampor) find a better place for this kind of storage std::shared_ptr m_model; + size_t* num_tokens_processed_ptr = nullptr; // TODO: (vshampor) find a better place for this kind of storage std::vector> m_fake_inputs; std::vector> m_fake_outputs; diff --git a/modules/llama_cpp_plugin/include/infer_request.hpp b/modules/llama_cpp_plugin/include/infer_request.hpp index b6314010b..8954a180b 100644 --- a/modules/llama_cpp_plugin/include/infer_request.hpp +++ b/modules/llama_cpp_plugin/include/infer_request.hpp @@ -7,12 +7,10 @@ namespace ov { namespace llama_cpp_plugin { + class LlamaCppSyncInferRequest : public ISyncInferRequest { public: explicit LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model); - // explicit LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model): ov::ISyncInferRequest(compiled_model) { - // std::cout << "VSHAMPOR: infer request ctor called\n"; - // } virtual ~LlamaCppSyncInferRequest() {}; virtual void set_tensors_impl(const ov::Output port, diff --git a/modules/llama_cpp_plugin/include/plugin.hpp b/modules/llama_cpp_plugin/include/plugin.hpp index aea32ea1f..1d6fdf1e4 100644 --- a/modules/llama_cpp_plugin/include/plugin.hpp +++ b/modules/llama_cpp_plugin/include/plugin.hpp @@ -12,99 +12,36 @@ namespace ov { class LlamaCppPlugin : public IPlugin { public: LlamaCppPlugin(); - /** - * @brief Compiles model from ov::Model object - * @param model A model object acquired from ov::Core::read_model or source construction - * @param properties A ov::AnyMap of properties relevant only for this load operation - * @return Created Compiled Model object - */ virtual std::shared_ptr compile_model(const std::shared_ptr& model, const ov::AnyMap& properties) const override; - - /** - * @brief Compiles model from ov::Model object, on specified remote context - * @param model A model object acquired from ov::Core::read_model or source construction - * @param properties A ov::AnyMap of properties relevant only for this load operation - * @param context A pointer to plugin context derived from RemoteContext class used to - * execute the model - * @return Created Compiled Model object - */ virtual std::shared_ptr compile_model(const std::shared_ptr& model, const ov::AnyMap& properties, const ov::SoPtr& context) const override; - /** - * @brief Sets properties for plugin, acceptable keys can be found in openvino/runtime/properties.hpp - * @param properties ov::AnyMap of properties - */ virtual void set_property(const ov::AnyMap& properties) override; - /** - * @brief Gets properties related to plugin behaviour. - * - * @param name Property name. - * @param arguments Additional arguments to get a property. - * - * @return Value of a property corresponding to the property name. - */ virtual ov::Any get_property(const std::string& name, const ov::AnyMap& arguments) const override; - /** - * @brief Creates a remote context instance based on a map of properties - * @param remote_properties Map of device-specific shared context remote properties. 
- * - * @return A remote context object - */ virtual ov::SoPtr create_context(const ov::AnyMap& remote_properties) const override; - /** - * @brief Provides a default remote context instance if supported by a plugin - * @param remote_properties Map of device-specific shared context remote properties. - * - * @return The default context. - */ virtual ov::SoPtr get_default_context(const ov::AnyMap& remote_properties) const override; - /** - * @brief Creates an compiled model from an previously exported model using plugin implementation - * and removes OpenVINO Runtime magic and plugin name - * @param model Reference to model output stream - * @param properties A ov::AnyMap of properties - * @return An Compiled model - */ virtual std::shared_ptr import_model(std::istream& model, const ov::AnyMap& properties) const override; - virtual std::shared_ptr compile_model(const std::string& fname, const ov::AnyMap& properties) const override; - /** - * @brief Creates an compiled model from an previously exported model using plugin implementation - * and removes OpenVINO Runtime magic and plugin name - * @param model Reference to model output stream - * @param context A pointer to plugin context derived from RemoteContext class used to - * execute the network - * @param properties A ov::AnyMap of properties - * @return An Compiled model - */ virtual std::shared_ptr import_model(std::istream& model, const ov::SoPtr& context, const ov::AnyMap& properties) const override; - /** - * @brief Queries a plugin about supported layers in model - * @param model Model object to query. - * @param properties Optional map of pairs: (property name, property value). - * @return An object containing a map of pairs an operation name -> a device name supporting this operation. - */ virtual ov::SupportedOpsMap query_model(const std::shared_ptr& model, const ov::AnyMap& properties) const override; - std::string get_current_gguf_file_path() const; private: - std::string m_cache_dir = "./"; + std::string m_cache_dir = ""; }; } // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 85a65d7e6..a1498f708 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -9,801 +9,46 @@ #include "infer_request.hpp" #include "plugin.hpp" +#include + namespace ov { namespace llama_cpp_plugin { -class TensorWeightMatcher { -public: - // TODO (vshampor) implement this for faster weight node matching. 
- // Use std::list, two passes - first for full name match, second for - // prefix-match; remove entries from list on match - using RTInfoTensorName = std::string; - using OvNodeName = std::string; - using LlamaTensorName = std::string; - - TensorWeightMatcher(const std::shared_ptr& model, - std::map tensor_names_with_shapes_to_match) { - std::multimap> intermediate_matches_map; - - const auto node_vector = model->get_ops(); - std::list> const_nodes_in_model; - for (const auto& node_ptr : node_vector) { - if (ov::is_type(node_ptr)) - const_nodes_in_model.push_back(ov::as_type_ptr(node_ptr)); - } - - // full substring match pass - std::map unmatched_rt_info_names_on_first_pass = - extract_matches(intermediate_matches_map, - tensor_names_with_shapes_to_match, - const_nodes_in_model, - [](const std::string& substring, const std::string& source) { - return source.find(substring) != std::string::npos; - }); - - // prefix substring match pass - std::map unmatched_rt_info_names_on_second_pass = extract_matches( - intermediate_matches_map, - unmatched_rt_info_names_on_first_pass, - const_nodes_in_model, - [](const std::string& substring, const std::string& source) { - return source.find(get_weight_name_without_torch_postfix(substring)) != std::string::npos; - }); - - for (auto it = intermediate_matches_map.begin(); it != intermediate_matches_map.end(); - it = intermediate_matches_map.upper_bound(it->first)) { - // TODO: perf improvement by iterating with ++; - RTInfoTensorName rt_info_name = it->first; - if (intermediate_matches_map.count(rt_info_name) != 1) { - std::cout << "VSHAMPOR: multiple matches for weight name " << rt_info_name << " and shape " - << it->second->get_shape().to_string() << ", found "; - auto range_it_pair = intermediate_matches_map.equal_range(rt_info_name); - for (auto multimatch_it = range_it_pair.first; multimatch_it != range_it_pair.second; multimatch_it++) { - auto node_ptr = multimatch_it->second; - std::cout << node_ptr->get_friendly_name() << "(shape " << node_ptr->get_shape().to_string() - << "),"; - } - std::cout << "will take the first match" << std::endl; - } - const auto& match = intermediate_matches_map.find(rt_info_name)->second; - m_rtinfo_name_to_weight_node_map[rt_info_name] = match; - } - if (!unmatched_rt_info_names_on_second_pass.empty()) { - std::cout << "VSHAMPOR: did not find the weight node for " << unmatched_rt_info_names_on_second_pass.size() - << " weights:" << std::endl; - } - for (const auto& unmatched_entry : unmatched_rt_info_names_on_second_pass) { - std::cout << '\t' << unmatched_entry.first << std::endl; - } - } - - std::unordered_map> get_matches() { - return m_rtinfo_name_to_weight_node_map; - } - -private: - std::map extract_matches( - std::multimap>& output_matches_map, - const std::map& names_with_shapes_to_match, - const std::list>& search_list, - std::function name_match_predicate) { - std::map unmatched_rt_info_names; - for (const auto& pair : names_with_shapes_to_match) { - RTInfoTensorName rt_info_name = pair.first; - const ov::Shape& wanted_shape = pair.second; - bool matched = false; - for (auto it = search_list.begin(); it != search_list.end(); it++) { - auto node_ptr = *it; - const std::string& friendly_name = node_ptr->get_friendly_name(); - if (name_match_predicate(rt_info_name, friendly_name) && node_ptr->get_shape() == wanted_shape) { - output_matches_map.insert(std::make_pair(rt_info_name, node_ptr)); - matched = true; - break; - } - } - if (!matched) - unmatched_rt_info_names.insert(pair); - } - return 
unmatched_rt_info_names; - } - - static std::string get_weight_name_without_torch_postfix(std::string torch_weight_name) { - size_t idx = torch_weight_name.rfind("."); - if (idx == std::string::npos) - return torch_weight_name; - return std::string(torch_weight_name, 0, idx); - } - - size_t num_exact_matches = 0; - size_t num_partial_matches = 0; - std::unordered_map> m_rtinfo_name_to_weight_node_map; -}; - -std::vector> get_nodes_containing_name_with_shape(const std::shared_ptr& model, - const std::string& weight_name, - const ov::Shape& shape) { - auto ops = model->get_ops(); - std::vector> found_weight_nodes; - std::copy_if(ops.begin(), - ops.end(), - std::back_inserter(found_weight_nodes), - [&weight_name, &shape](const std::shared_ptr& val) { - if (!ov::is_type(val)) - return false; - std::shared_ptr node_ptr = ov::as_type_ptr(val); - return val->get_friendly_name().find(weight_name) != std::string::npos && - val->get_shape() == shape; - }); - return found_weight_nodes; -} - -bool has_weight_matches(const std::shared_ptr& model, - const std::string& weight_name, - const ov::Shape& shape) { - std::vector> found_weight_nodes; - found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); - return !found_weight_nodes.empty(); -} - -std::string get_weight_name_without_torch_postfix(std::string torch_weight_name) { - size_t idx = torch_weight_name.rfind("."); - if (idx == std::string::npos) - return torch_weight_name; - return std::string(torch_weight_name, 0, idx); -} - -bool has_partial_weight_matches(const std::shared_ptr& model, - const std::string& weight_name, - const ov::Shape& shape) { - std::vector> found_weight_nodes; - found_weight_nodes = - get_nodes_containing_name_with_shape(model, get_weight_name_without_torch_postfix(weight_name), shape); - return !found_weight_nodes.empty(); -} - -std::shared_ptr get_weight_by_name_and_shape(const std::shared_ptr& model, - const std::string& weight_name, - const ov::Shape& shape) { - OPENVINO_ASSERT(has_weight_matches(model, weight_name, shape)); - std::vector> found_weight_nodes; - found_weight_nodes = get_nodes_containing_name_with_shape(model, weight_name, shape); - - if (found_weight_nodes.size() > 1) { - std::cout << "VSHAMPOR: multiple matches for weight name " << weight_name << " and shape " << shape.to_string() - << ", found "; - for (const auto& node_ptr : found_weight_nodes) { - std::cout << node_ptr->get_friendly_name() << "(shape " << shape.to_string() << "),"; - } - std::cout << "will take the first match" << std::endl; - } - std::shared_ptr node_with_tensor = found_weight_nodes.front(); - OPENVINO_ASSERT(ov::is_type(node_with_tensor)); - std::shared_ptr const_node_ptr = ov::as_type_ptr(node_with_tensor); - return const_node_ptr; -} - -using TransposePermutation = std::pair; - -std::vector expand_front(const std::vector& vec, size_t val) { - OPENVINO_ASSERT(vec.size() < GGML_MAX_DIMS); - std::vector retval(GGML_MAX_DIMS, val); - std::copy(vec.rbegin(), vec.rend(), retval.rbegin()); - return retval; -} - -void write_float_plus_one(std::ofstream& out, const float* src) { - float elt = *src; - elt += 1; - out.write((const char*)&elt, sizeof(float)); -} -void append_tensor_data_with_transpositions(const std::string& fname, - const std::vector& tensor_infos, - const std::vector& tensor_data_ptrs, - const std::map& transpositions, - const std::set increment_by_one_tensor_names) { - // assuming contiguous data underneath each pointer from tensor_data_ptrs - OPENVINO_ASSERT(tensor_infos.size() == 
tensor_data_ptrs.size()); - std::ofstream out(fname, std::ios::app | std::ios::out); - for (size_t i = 0; i < tensor_infos.size(); i++) { - const auto& tensor_info = tensor_infos[i]; - OPENVINO_ASSERT(tensor_info.type == GGML_TYPE_F32); // TODO (vshampor): writing transposed tensor data for - // other data types, especially lower-bitwidth; maybe - // use OV inference for that - - const char* ir_tensor_data = reinterpret_cast(tensor_data_ptrs[i]); - - std::string tensor_llama_name = std::string(tensor_info.name.data); - auto it = transpositions.find(tensor_llama_name); - if (it == transpositions.end()) { - // original IR tensor should not be transposed to conform to GGUF - // expectations, can write as-is - if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case - size_t elt_size = sizeof(float); // FP32 only for now - OPENVINO_ASSERT(!(tensor_info.size % elt_size)); - size_t num_elts = tensor_info.size / elt_size; - for (size_t elt_idx = 0; elt_idx < num_elts; elt_idx++) { - write_float_plus_one(out, ((float*)ir_tensor_data) + elt_idx); - } - } else { - out.write(ir_tensor_data, tensor_info.size); - } - continue; - } - - if (it != transpositions.end()) { - std::vector gguf_layout_shape; - - // the shape in .ne is inverted w.r.t original export (~= IR) weight - // layout - for (size_t dim_idx = 0; dim_idx < tensor_info.n_dims; dim_idx++) { - gguf_layout_shape.push_back(tensor_info.ne[GGML_MAX_DIMS - 1 - tensor_info.n_dims - dim_idx]); - } - - TransposePermutation permutation = it->second; - std::vector ir_layout_shape(gguf_layout_shape); - std::swap(ir_layout_shape[permutation.first], ir_layout_shape[permutation.second]); - - std::vector ir_layout_strides(tensor_info.n_dims, 1); - - for (size_t idx = 0; idx < tensor_info.n_dims - 1; idx++) { - auto previous_stride_it = ir_layout_strides.rbegin() + idx; - auto stride_it = ir_layout_strides.rbegin() + idx + 1; - auto shape_it = ir_layout_shape.rbegin() + idx; - *stride_it = *shape_it * *previous_stride_it; - } - - std::vector permuted_strides(ir_layout_strides); - std::swap(permuted_strides[permutation.first], permuted_strides[permutation.second]); - - // expand up to GGML_MAX_DIMS - std::vector gguf_layout_shape_ex = expand_front(gguf_layout_shape, 1); - // stride for unused dims will be 0, has no effect on loop because - // dimension idx for that dim is always 0 - permuted_strides = expand_front(permuted_strides, 0); - - std::cout << "VSHAMPOR: writing tensor " << tensor_info.name.data << " with size " << tensor_info.size; - std::cout << " shape (GGUF layout) "; - for (auto dim : gguf_layout_shape) - std::cout << dim << ","; - std::cout << " shape (IR layout) "; - for (auto dim : ir_layout_shape) - std::cout << dim << ","; - std::cout << " stride (IR layout) "; - for (auto stride : ir_layout_strides) - std::cout << stride << ","; - std::cout << " stride (IR layout, transposing) "; - for (auto stride : permuted_strides) - std::cout << stride << ","; - std::cout << std::endl; - - // TODO (vshampor): rewrite the loop below using recurrent templates? 
- // This relies on GGUF_MAX_DIMS == 4 and unused dims being equal to 1 - size_t current_offset = 0; - size_t element_size = sizeof(float); - size_t num_bytes_written = 0; - for (size_t dim_0 = 0; dim_0 < gguf_layout_shape_ex[0]; dim_0++) - for (size_t dim_1 = 0; dim_1 < gguf_layout_shape_ex[1]; dim_1++) - for (size_t dim_2 = 0; dim_2 < gguf_layout_shape_ex[2]; dim_2++) - for (size_t dim_3 = 0; dim_3 < gguf_layout_shape_ex[3]; dim_3++) { - current_offset = element_size * (dim_0 * permuted_strides[0] + dim_1 * permuted_strides[1] + - dim_2 * permuted_strides[2] + dim_3 * permuted_strides[3]); - if (increment_by_one_tensor_names.count(tensor_llama_name) != 0) { // gemma case - write_float_plus_one(out, (float*)ir_tensor_data + current_offset); - } else { - out.write(ir_tensor_data + current_offset, element_size); - } - num_bytes_written += element_size; - } - std::cout << "VSHAMPOR: wrote " << num_bytes_written << std::endl; - OPENVINO_ASSERT(num_bytes_written == tensor_info.size); - } - } -} - -struct ValueStorageForLifetimeExtension { - std::list kv_key_string_storage; - std::list kv_value_string_storage; - std::list> str_arr_storage; - void* store_gguf_value_vector(const std::vector& vec, gguf_type g_type) { - size_t elt_size; - switch (g_type) { - case GGUF_TYPE_UINT8: - elt_size = sizeof(uint8_t); - break; - case GGUF_TYPE_INT8: - elt_size = sizeof(int8_t); - break; - case GGUF_TYPE_UINT16: - elt_size = sizeof(uint16_t); - break; - case GGUF_TYPE_INT16: - elt_size = sizeof(int16_t); - break; - case GGUF_TYPE_UINT32: - elt_size = sizeof(uint32_t); - break; - case GGUF_TYPE_INT32: - elt_size = sizeof(int32_t); - break; - case GGUF_TYPE_FLOAT32: - elt_size = sizeof(float); - break; - case GGUF_TYPE_UINT64: - elt_size = sizeof(uint64_t); - break; - case GGUF_TYPE_INT64: - elt_size = sizeof(int64_t); - break; - case GGUF_TYPE_FLOAT64: - elt_size = sizeof(double); - break; - case GGUF_TYPE_BOOL: - elt_size = sizeof(bool); - break; - default: - OPENVINO_THROW("Unknown array type"); - } - size_t size_in_bytes = vec.size() * elt_size; - void* mem_ptr = new char[size_in_bytes]; - for (size_t i = 0; i < vec.size(); i++) { - switch (g_type) { - case GGUF_TYPE_UINT8: - ((uint8_t*)mem_ptr)[i] = vec[i].uint8; - break; - case GGUF_TYPE_INT8: - ((int8_t*)mem_ptr)[i] = vec[i].int8; - break; - case GGUF_TYPE_UINT16: - ((uint16_t*)mem_ptr)[i] = vec[i].uint16; - break; - case GGUF_TYPE_INT16: - ((int16_t*)mem_ptr)[i] = vec[i].int16; - break; - case GGUF_TYPE_UINT32: - ((uint32_t*)mem_ptr)[i] = vec[i].uint32; - break; - case GGUF_TYPE_INT32: - ((int32_t*)mem_ptr)[i] = vec[i].int32; - break; - case GGUF_TYPE_FLOAT32: - ((float*)mem_ptr)[i] = vec[i].float32; - break; - case GGUF_TYPE_UINT64: - ((uint64_t*)mem_ptr)[i] = vec[i].uint64; - break; - case GGUF_TYPE_INT64: - ((int64_t*)mem_ptr)[i] = vec[i].int64; - break; - case GGUF_TYPE_FLOAT64: - ((double*)mem_ptr)[i] = vec[i].float64; - break; - case GGUF_TYPE_BOOL: - ((bool*)mem_ptr)[i] = vec[i].bool_; - break; - default: - OPENVINO_THROW("Unknown array type"); - } - } - return mem_ptr; - } - - ValueStorageForLifetimeExtension() = default; - ~ValueStorageForLifetimeExtension() { - for (void* ptr : non_str_raw_storage) { - delete[](char*) ptr; - } - } - -private: - std::list non_str_raw_storage; -}; - -bool maybe_parse_single_element(gguf_type g_type, - ov::Any rtmap_value, - gguf_value& dst, - ValueStorageForLifetimeExtension& store) { - switch (g_type) { - case GGUF_TYPE_UINT8: - dst.uint8 = rtmap_value.as(); - break; - case GGUF_TYPE_INT8: - dst.int8 = 
rtmap_value.as(); - ; - break; - case GGUF_TYPE_UINT16: - dst.uint16 = rtmap_value.as(); - break; - case GGUF_TYPE_INT16: - dst.int16 = rtmap_value.as(); - break; - case GGUF_TYPE_UINT32: - dst.uint32 = rtmap_value.as(); - break; - case GGUF_TYPE_INT32: - dst.int32 = rtmap_value.as(); - break; - case GGUF_TYPE_FLOAT32: - dst.float32 = rtmap_value.as(); - break; - case GGUF_TYPE_UINT64: - dst.uint64 = rtmap_value.as(); - break; - case GGUF_TYPE_INT64: - dst.int64 = rtmap_value.as(); - break; - case GGUF_TYPE_FLOAT64: - dst.float64 = rtmap_value.as(); - break; - case GGUF_TYPE_BOOL: - dst.bool_ = rtmap_value.as(); - break; - case GGUF_TYPE_STRING: { - std::string string_value = rtmap_value.as(); - store.kv_value_string_storage.push_back(string_value); - dst.str.n = string_value.length(); - dst.str.data = - (char*)store.kv_value_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below - break; - } - default: - return false; // did not parse - } - return true; // parsed successfully -} - -ov::Any get_any_associated_with_gguf_type(gguf_type g_type) { - switch (g_type) { - case GGUF_TYPE_UINT8: - return ov::Any(uint8_t()); - case GGUF_TYPE_INT8: - return ov::Any(int8_t()); - case GGUF_TYPE_UINT16: - return ov::Any(uint16_t()); - case GGUF_TYPE_INT16: - return ov::Any(int16_t()); - case GGUF_TYPE_UINT32: - return ov::Any(uint32_t()); - case GGUF_TYPE_INT32: - return ov::Any(int32_t()); - case GGUF_TYPE_FLOAT32: - return ov::Any(float()); - case GGUF_TYPE_UINT64: - return ov::Any(uint64_t()); - case GGUF_TYPE_INT64: - return ov::Any(int64_t()); - case GGUF_TYPE_FLOAT64: - return ov::Any(double()); - case GGUF_TYPE_BOOL: - return ov::Any(bool()); - case GGUF_TYPE_STRING: - return ov::Any(std::string()); - default: - OPENVINO_THROW("Unknown gguf_type to turn into ov::Any"); - } -} LlamaCppModel::LlamaCppModel(const std::shared_ptr& model, const std::shared_ptr& plugin, const ov::SoPtr& context, const std::shared_ptr& task_executor) : ICompiledModel(model, plugin, context, task_executor) { - m_model = model; - num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove - *num_tokens_processed_ptr = 0; - auto rt_info = model->get_rt_info(); - OPENVINO_ASSERT(rt_info.count("lcp_kv_params") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_kv_types") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_kv_array_types") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_tensor_name_map") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_tensor_shape_map") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_expected_tensor_shapes") != 0); - OPENVINO_ASSERT(rt_info.count("lcp_transpose_permutations") != 0); - - RTMap& kv_params = model->get_rt_info("lcp_kv_params"); - RTMap& kv_types = model->get_rt_info("lcp_kv_types"); - RTMap& kv_array_types = model->get_rt_info("lcp_kv_array_types"); - RTMap& tensor_name_map = model->get_rt_info("lcp_tensor_name_map"); - RTMap& tensor_shape_map = model->get_rt_info("lcp_tensor_shape_map"); - RTMap& expected_tensor_shapes_map = model->get_rt_info("lcp_expected_tensor_shapes"); - RTMap& transpose_permutations_rtmap = model->get_rt_info("lcp_transpose_permutations"); - - size_t gguf_version = model->get_rt_info("lcp_gguf_version"); - std::cout << "VSHAMPOR: parsed gguf_version " << gguf_version << std::endl; - - // kv params - OPENVINO_ASSERT(kv_params.size() == kv_types.size()); - size_t n_kv = kv_params.size(); - std::vector kv_vector; - ValueStorageForLifetimeExtension store; - - for (const auto& kv_pair : kv_params) { - gguf_kv kv; - - const auto& key = kv_pair.first; - 
kv.key.n = key.length(); - store.kv_key_string_storage.push_back(key); - kv.key.data = (char*)store.kv_key_string_storage.back().c_str(); // TODO (vshampor) see equivalent case below - - uint32_t value_type = kv_types[key].as(); - gguf_type gguf_value_type = (gguf_type)value_type; - kv.type = gguf_value_type; - if (gguf_value_type != GGUF_TYPE_ARRAY) { - bool is_parsed = maybe_parse_single_element(kv.type, kv_pair.second, kv.value, store); - OPENVINO_ASSERT(is_parsed, "Invalid type of a GGUF kv-value"); - } else { // array case - gguf_type element_type = (gguf_type)kv_array_types[key].as(); - kv.value.arr.type = element_type; - std::string serialized_array = kv_pair.second.as(); - std::stringstream ss{serialized_array}; - std::vector parsed_array; - while (!ss.eof()) { - gguf_value array_elt; - ov::Any ov_any = get_any_associated_with_gguf_type(element_type); - std::string token; - ss >> token; - if (std::string(kv.key.data) == "tokenizer.ggml.merges") { - // tokenizer merges are pairs of tokens separated by whitespace, so - // need to read another to get a proper merge - // TODO (vshampor): think of another delimiting strategy in the - // rt_info and use that strategy here for more robust code - std::string another_token; - ss >> another_token; - token += std::string(" ") + another_token; - ov_any = ov::Any::make(token); - } else { - std::stringstream tok_ss{token}; - ov_any.read(tok_ss); - } - bool is_parsed = maybe_parse_single_element(element_type, ov_any, array_elt, store); - OPENVINO_ASSERT(is_parsed); - parsed_array.push_back(array_elt); - } - kv.value.arr.n = parsed_array.size(); - if (element_type == GGUF_TYPE_STRING) { - // string element has already been lifetime-extended during parsing - std::vector cstr_vector(parsed_array.size()); - for (size_t cstr_idx = 0; cstr_idx < parsed_array.size(); cstr_idx++) { - cstr_vector[cstr_idx] = parsed_array[cstr_idx].str.data; - } - store.str_arr_storage.push_back(cstr_vector); - kv.value.arr.data = store.str_arr_storage.back().data(); - } else { - void* data_ptr = store.store_gguf_value_vector(parsed_array, element_type); - kv.value.arr.data = data_ptr; - } - } - kv_vector.push_back(kv); - } - - auto token_types_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { - return std::string(val.key.data) == "tokenizer.ggml.token_type"; - }); - if (token_types_kv_it != kv_vector.end()) { - auto tokens_kv_it = std::find_if(kv_vector.begin(), kv_vector.end(), [](const gguf_kv& val) { - return std::string(val.key.data) == "tokenizer.ggml.tokens"; - }); - if (tokens_kv_it != kv_vector.end()) { - size_t expected_num_tokens = token_types_kv_it->value.arr.n; - size_t actual_num_tokens = tokens_kv_it->value.arr.n; - if (actual_num_tokens < expected_num_tokens) { - std::cout << "VSHAMPOR: detected wrong vocab " - "serialization/deserialization (expected " - << expected_num_tokens << " tokens, parsed " << actual_num_tokens - << " from vocab), filling tokens with bogus values" << std::endl; - std::vector new_vocab; - // char** old_vocab_data_ptr = (char**) tokens_kv_it->value.arr.data; - // std::copy(old_vocab_data_ptr, old_vocab_data_ptr + actual_num_tokens, - // new_vocab.begin()); size_t extra_tokens_needed = expected_num_tokens - // - actual_num_tokens; - size_t extra_tokens_needed = expected_num_tokens; - for (size_t tok_idx = 0; tok_idx < extra_tokens_needed; tok_idx++) { - std::stringstream ss; - ss << "invalid_token_" << tok_idx; - std::string new_token = ss.str(); - store.kv_value_string_storage.push_back(new_token); - 
char* str_data_ptr = (char*)store.kv_value_string_storage.back().c_str(); - new_vocab.push_back(str_data_ptr); - } - OPENVINO_ASSERT(new_vocab.size() == expected_num_tokens); - store.str_arr_storage.push_back(new_vocab); - tokens_kv_it->value.arr.data = (void*)store.str_arr_storage.back().data(); - tokens_kv_it->value.arr.n = expected_num_tokens; - } - } - } - - // tensors - OPENVINO_ASSERT(tensor_name_map.size() == tensor_shape_map.size()); - size_t n_tensors_in_rtinfo = tensor_name_map.size(); - std::cout << "VSHAMPOR: got request for " << n_tensors_in_rtinfo << " tensors from rt_info\n"; - - std::vector tensor_infos; - std::vector tensor_data_ptrs; - - std::map parsed_weights_to_search_for; - for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { - const std::string& llama_name = llama_name_and_rtinfo_name.first; - const std::string& rtinfo_name = llama_name_and_rtinfo_name.second.as(); - ov::Shape expected_shape = tensor_shape_map[llama_name].as(); - parsed_weights_to_search_for[rtinfo_name] = expected_shape; - } - - TensorWeightMatcher matcher{model, parsed_weights_to_search_for}; - std::unordered_map> matches = matcher.get_matches(); - std::unordered_map> llama_name_to_constant_node_map; - for (const auto& entry : tensor_name_map) { - const auto& llama_name = entry.first; - const auto& rtinfo_name = entry.second.as(); - llama_name_to_constant_node_map[llama_name] = matches[rtinfo_name]; - } - std::cout << "VSHAMPOR: requested tensors map to " << llama_name_to_constant_node_map.size() - << " tensors to search in model (shared tensors considered)\n"; - - std::list llama_name_storage; - - size_t n_tensors = 0; - - size_t offset = 0; // each tensor_info has to have a correct offset including - // padding, checked for in gguf_write_to_buf - for (const auto& matched_weight_pair : llama_name_to_constant_node_map) { - // Need to store the names in the list so that the passed c_str() pointers - // in tensor_infos to the llama names stay valid until they get deepcopied - // in gguf/llama functions - llama_name_storage.push_back(matched_weight_pair.first); - const std::string& llama_name = llama_name_storage.back(); - - auto weight_const_node_ptr = matched_weight_pair.second; - auto weight_shape = weight_const_node_ptr->get_shape(); - - // does hf-to-gguf invert all tensor dimensions with shapes > 1? - auto expected_weight_shape = ov::Shape(expected_tensor_shapes_map[llama_name].as()); - OPENVINO_ASSERT(expected_weight_shape.size() < GGML_MAX_DIMS); - - gguf_tensor_info info; - - info.type = GGML_TYPE_F32; // TODO (vshampor): better type assignment based - // on actual element type of the Constant node - - info.name.n = llama_name.length(); - info.name.data = (char*)llama_name.c_str(); // TODO (vshampor): either do this via const_cast, or will - // have to implement own structures for read-only data - // passing to llama_load_model_from_data - info.n_dims = weight_shape.size(); - std::fill(std::begin(info.ne), std::begin(info.ne) + GGML_MAX_DIMS, (uint64_t)1); - - // looks like GGUF expects inverse order of dimensions when compared to e.g. 
- // torch and actual row-major layout, see - // gguf.gguf_writer.GGUFWriter.add_tensor_info in gguf python package - std::copy(expected_weight_shape.rbegin(), expected_weight_shape.rend(), info.ne); - - void* data_ptr = (void*)(weight_const_node_ptr->get_data_ptr()); // TODO (vshampor): danger - casts - // `const` away also - the - // expected_weight_shape is in general - // different from actual ov::Tensor - // shape, in particular it may be - // transposed, so we actually need to set - // the pointers to shape-corrected tensor - // storage, which we don't do here - we - // are only preparing this data to get a - // convenient gguf_context object to - // reuse metadata (header) writing code, - // tensor data transpositions will be - // done during actual file write - - info.size = weight_const_node_ptr->get_byte_size(); - info.offset = offset; - - const size_t size_pad = GGML_PAD(info.size, GGUF_DEFAULT_ALIGNMENT); - offset += size_pad; - - info.data = data_ptr; - - tensor_infos.push_back(info); - tensor_data_ptrs.push_back(data_ptr); - n_tensors++; - } - - std::cout << "VSHAMPOR: found " << matches.size() << "/" << parsed_weights_to_search_for.size() << " tensors" - << std::endl; - - gguf_init_params gguf_params; - gguf_params.no_alloc = false; - gguf_params.ctx = nullptr; - - m_gguf_ctx = gguf_init_from_data(n_tensors, - tensor_infos.data(), - n_kv, - kv_vector.data(), - tensor_data_ptrs.data(), - gguf_params); - - std::shared_ptr llama_plugin_ptr = std::dynamic_pointer_cast(plugin); - m_converted_gguf_file_name = llama_plugin_ptr->get_current_gguf_file_path(); - - std::cout << "VSHAMPOR: output filename is " << m_converted_gguf_file_name << std::endl; - std::cout << "VSHAMPOR: writing metadata (GGUF header) " << std::endl; - gguf_write_to_file(m_gguf_ctx, - m_converted_gguf_file_name.c_str(), - /* only_meta = */ true); - - std::map transpose_permutations; - - for (const auto& llama_name_and_permutation : transpose_permutations_rtmap) { - std::string permutation_str = llama_name_and_permutation.second.as(); - std::stringstream ss(permutation_str); - TransposePermutation permutation; - bool is_ok = true; - is_ok &= static_cast(ss >> permutation.first); - is_ok &= static_cast(ss >> permutation.second); - OPENVINO_ASSERT(is_ok, "failed to read permutation"); - transpose_permutations[llama_name_and_permutation.first] = permutation; + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); } - std::set gemma_tensor_names_to_increment; - // FIXME (vshampor): tried setting up commands for incrementing *_norm.weight - // values by 1 like it is done during llama.cpp HF-to-GGUF export, but it - // seems that it isn't necessary and IR stores the incremented weights already - // Is this due to constant folding? 
- - // for (const auto& llama_name_and_rtinfo_name : tensor_name_map) { - // const std::string& llama_name = llama_name_and_rtinfo_name.first; - // const std::string& rtinfo_name = - // llama_name_and_rtinfo_name.second.as(); std::string - // gemma_norm_suffix = "norm.weight"; if (rtinfo_name.size() < - // gemma_norm_suffix.size()) continue; if - // (rtinfo_name.substr(rtinfo_name.size() - gemma_norm_suffix.size()) == - // gemma_norm_suffix) gemma_tensor_names_to_increment.insert(llama_name); - // } - - std::cout << "VSHAMPOR: writing tensor data (blob with transpositions) " << std::endl; - append_tensor_data_with_transpositions(m_converted_gguf_file_name, - tensor_infos, - tensor_data_ptrs, - transpose_permutations, - gemma_tensor_names_to_increment); - std::cout << "VSHAMPOR: write finished." << m_converted_gguf_file_name << std::endl; - - std::cout << "VSHAMPOR: loading llama model from written file..." << std::endl; - llama_model_params mparams = llama_model_default_params(); - mparams.n_gpu_layers = 99; - m_llama_model_ptr = llama_load_model_from_file(m_converted_gguf_file_name.c_str(), mparams); - llama_context_params cparams = llama_context_default_params(); - m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); - - std::cout << "VSHAMPOR: llama model loaded successfully..." << std::endl; -} LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model, std::istream& input_stream, const std::shared_ptr& plugin) : ICompiledModel(ov_model, plugin) { - num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove - *num_tokens_processed_ptr = 0; - std::shared_ptr llama_plugin = std::dynamic_pointer_cast(plugin); - std::string current_file_path = llama_plugin->get_current_gguf_file_path(); - std::ofstream output_stream(current_file_path, std::ios::binary); - output_stream << input_stream.rdbuf(); + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); +} - std::cout << "VSHAMPOR: loading llama model from imported and re-written file..." << std::endl; - llama_model_params mparams = llama_model_default_params(); - mparams.n_gpu_layers = 99; - m_llama_model_ptr = llama_load_model_from_file(current_file_path.c_str(), mparams); - llama_context_params cparams = llama_context_default_params(); - m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); - std::cout << "VSHAMPOR: llama model loaded successfully from cache..." << std::endl; +LlamaCppModel::~LlamaCppModel() { + llama_free(m_llama_ctx); + llama_free_model(m_llama_model_ptr); + llama_backend_free(); + delete num_tokens_processed_ptr; } LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin) - : ICompiledModel(nullptr, plugin) { + : ICompiledModel(nullptr, plugin), m_gguf_fname(gguf_fname) { num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove *num_tokens_processed_ptr = 0; - std::cout << "VSHAMPOR: loading llama model directly from GGUF... " << std::endl; + OPENVINO_DEBUG << "llama_cpp_plugin: loading llama model directly from GGUF... " << std::endl; llama_model_params mparams = llama_model_default_params(); mparams.n_gpu_layers = 99; m_llama_model_ptr = llama_load_model_from_file(gguf_fname.c_str(), mparams); llama_context_params cparams = llama_context_default_params(); m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); - std::cout << "VSHAMPOR: llama model loaded successfully from GGUF..." 
<< std::endl; + OPENVINO_DEBUG << "llama_cpp_plugin: llama model loaded successfully from GGUF..." << std::endl; auto input_ids = std::make_shared(ov::element::Type_t::i64, ov::PartialShape({-1, -1})); auto fake_convert = std::make_shared(input_ids->output(0), ov::element::Type_t::f32); @@ -837,47 +82,20 @@ LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_pt } } -void LlamaCppModel::export_model(std::ostream& output_stream) const { - std::cout << "VSHAMPOR: exporting model" << std::endl; - - // FIXME (vshampor): it's a shame that loading a model from cache does not - // have an option to actually keep the already loaded model from xml and not - // be forced to deserialize an ov::Model representation from cache as well. As - // it stands, will need to write the whole IR into the cache entry along with - // the GGUF file. - // - std::stringstream xmlFile, binFile; - ov::pass::Serialize serializer(xmlFile, binFile); - serializer.run_on_model(m_model); - - auto m_constants = binFile.str(); - auto m_model = xmlFile.str(); - - auto dataSize = static_cast(m_model.size()); - output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); - output_stream.write(m_model.c_str(), dataSize); - - dataSize = static_cast(m_constants.size()); - output_stream.write(reinterpret_cast(&dataSize), sizeof(dataSize)); - output_stream.write(reinterpret_cast(&m_constants[0]), dataSize); - - std::ifstream in(m_converted_gguf_file_name, std::ios::binary); - output_stream << in.rdbuf(); -} std::shared_ptr LlamaCppModel::get_runtime_model() const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); } void LlamaCppModel::set_property(const ov::AnyMap& properties) { - std::cout << "VSHAMPOR: attempted to set_property (did nothing)"; + OPENVINO_DEBUG << "llama_cpp_plugin: attempted to set_property (did nothing)"; } ov::Any LlamaCppModel::get_property(const std::string& name) const { if (ov::supported_properties == name) { return decltype(ov::supported_properties)::value_type(std::vector()); } - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); } std::shared_ptr LlamaCppModel::create_sync_infer_request() const { @@ -891,5 +109,13 @@ const std::vector>& LlamaCppModel::inputs() const { const std::vector>& LlamaCppModel::outputs() const { return m_fake_outputs; }; + +void LlamaCppModel::export_model(std::ostream& output_stream) const { + std::ifstream in(m_gguf_fname, std::ios::binary); + output_stream << in.rdbuf(); +} + + + } // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index 6b5e8ba1e..40307c573 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -1,7 +1,9 @@ #include "infer_request.hpp" +#include #include "llama.h" #include "openvino/runtime/make_tensor.hpp" +#include "openvino/util/log.hpp" namespace ov { namespace llama_cpp_plugin { @@ -18,12 +20,10 @@ void allocate_tensor_impl(ov::SoPtr& tensor, LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model) : ov::ISyncInferRequest(compiled_model) { - std::cout << "VSHAMPOR: infer request ctor called\n"; + OPENVINO_DEBUG << "llama_cpp_plugin: infer request ctor called\n"; m_compiled_model_ptr = compiled_model; - // Allocate input/output tensors for (const auto& input : 
get_inputs()) { allocate_tensor(input, [input](ov::SoPtr& tensor) { - // Can add a check to avoid double work in case of shared tensors allocate_tensor_impl(tensor, input.get_element_type(), input.get_partial_shape().is_dynamic() ? ov::Shape{0} : input.get_shape()); @@ -31,7 +31,6 @@ LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr& tensor) { - // Can add a check to avoid double work in case of shared tensors allocate_tensor_impl(tensor, output.get_element_type(), output.get_partial_shape().is_dynamic() ? ov::Shape{0} : output.get_shape()); @@ -40,7 +39,7 @@ LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr port, const std::vector>& tensors) { - std::cout << "VSHAMPOR: set_tensors_impl called\n"; + OPENVINO_DEBUG << "llama_cpp_plugin: set_tensors_impl called\n"; } void llama_batch_add_reimpl(struct llama_batch& batch, @@ -64,7 +63,6 @@ void LlamaCppSyncInferRequest::infer() { // all inputs without hardcode OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); - size_t batch_size = input_ids_tensor_ptr->get_shape()[0]; size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; // llama_batch actually contains one sequence @@ -81,8 +79,7 @@ void LlamaCppSyncInferRequest::infer() { {0}, true); // the last `true` here is a marker that the logits for this // token should be computed and returned - size_t* ptr = m_compiled_model_ptr->num_tokens_processed_ptr; - (*ptr)++; + *(m_compiled_model_ptr->num_tokens_processed_ptr) += 1; } llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; @@ -109,12 +106,14 @@ void LlamaCppSyncInferRequest::infer() { }); }; std::vector LlamaCppSyncInferRequest::get_profiling_info() const { - std::cout << "VSHAMPOR: get_profiling_info() called\n"; + OPENVINO_DEBUG << "llama_cpp_plugin: get_profiling_info() called\n"; return std::vector{}; }; + + std::vector> LlamaCppSyncInferRequest::query_state() const { - std::cout << "VSHAMPOR: get_profiling_info() called\n"; - return std::vector>{}; + OPENVINO_DEBUG << "llama_cpp_plugin: query_state() called\n"; + return {}; } } // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp index 3e23c568f..ec456cc45 100644 --- a/modules/llama_cpp_plugin/src/plugin.cpp +++ b/modules/llama_cpp_plugin/src/plugin.cpp @@ -4,6 +4,7 @@ #include "compiled_model.hpp" #include "openvino/op/constant.hpp" +#include "openvino/util/log.hpp" #include "openvino/runtime/internal_properties.hpp" namespace { @@ -19,48 +20,7 @@ LlamaCppPlugin::LlamaCppPlugin() : IPlugin() { } std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, const ov::AnyMap& properties) const { - std::cout << "VSHAMPOR: LlamaCppPlugin::compile_model" << std::endl; - - // std::string gpt2_node_name = "transformer.h.9.attn.c_proj.weight"; - // std::cout << "VSHAMPOR: sanity check - looking for node containing " << - // gpt2_node_name << std::endl; auto ops = model->get_ops(); auto iter = - // std::find_if(ops.begin(), ops.end(), [gpt2_node_name](const - // std::shared_ptr& val) { - // return val->get_friendly_name().find(gpt2_node_name) != - // std::string::npos; }); - // if (iter == ops.end()) { - // std::cout << "VSHAMPOR: did not find the node\n"; - //} else { - // std::shared_ptr node_with_tensor = *iter; - // std::cout << "VSHAMPOR: node type is " << - // node_with_tensor->get_type_name() << std::endl; - // std::shared_ptr 
const_node_ptr = - // ov::as_type_ptr(node_with_tensor); const float* - // data_ptr = const_node_ptr->get_data_ptr(); - // // ov::descriptor::Tensor& tensor_descr = - // node_with_tensor->get_output_tensor(0); - // // std::cout << "VSHAMPOR: node output tensor shape is " << - // tensor_descr.get_shape().to_string() << std::endl; - // // ov::TensorVector in, out; - // // node_with_tensor->evaluate(out, in); - // // std::cout << "VSHAMPOR: evaluated " << out.size() << " output - // tensors\n"; - // // if (!out.empty()) { - // // const ov::Tensor& tensor = out[0]; - // // const float* vals = tensor.data(); - // // std::cout << "VSHAMPOR: first elements of the weight tensor are - // "; - // // for (size_t i = 0; i < 10; i++) { - // // std::cout << vals[i] << " "; - // // } - // // std::cout << std::endl; - // // } - // std::cout << "VSHAMPOR: first elements of the weight tensor are "; - // for (size_t i = 0; i < 10; i++) { - // std::cout << data_ptr[i] << " "; - // } - // std::cout << std::endl; - //} + OPENVINO_DEBUG << "llama_cpp_plugin: LlamaCppPlugin::compile_model" << std::endl; return compile_model(model, properties, {}); } @@ -71,7 +31,7 @@ std::shared_ptr LlamaCppPlugin::compile_model(const std::str std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model, const ov::AnyMap& properties, const ov::SoPtr& context) const { - std::cout << "VSHAMPOR: compile_model called in C++" << std::endl; + OPENVINO_DEBUG << "llama_cpp_plugin: compile_model called in C++" << std::endl; return std::make_shared(model->clone(), shared_from_this(), context, @@ -83,7 +43,7 @@ void LlamaCppPlugin::set_property(const ov::AnyMap& properties) { if (map_entry.first == ov::cache_dir.name()) { m_cache_dir = map_entry.second.as(); } else { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: setting property ", map_entry.first, "not implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: setting property ", map_entry.first, "not implemented"); } } } @@ -113,55 +73,30 @@ ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap& return std::string("LLAMA_CPP"); } - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: getting property ", name, "not implemented"); } ov::SoPtr LlamaCppPlugin::create_context(const ov::AnyMap& remote_properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); } ov::SoPtr LlamaCppPlugin::get_default_context(const ov::AnyMap& remote_properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); } std::shared_ptr LlamaCppPlugin::import_model(std::istream& model_file_stream, const ov::AnyMap& properties) const { - std::cout << "VSHAMPOR: importing model" << '\n'; - std::cout << "VSHAMPOR: deserializing ov::Model first" << '\n'; - // read XML content - std::string xmlString; - std::uint64_t dataSize = 0; - model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); - xmlString.resize(dataSize); - model_file_stream.read(const_cast(xmlString.c_str()), dataSize); - - // read blob content - ov::Tensor weights; - model_file_stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); - if (0 != dataSize) { - weights = ov::Tensor(ov::element::from(), ov::Shape{static_cast(dataSize)}); - model_file_stream.read(weights.data(), dataSize); - } - - auto ov_model = get_core()->read_model(xmlString, weights); - 
std::cout << "VSHAMPOR: ov::Model deserialized, passing the rest of the " - "stream to LlamaCppModel ctor" - << '\n'; - return std::make_shared(ov_model, model_file_stream, shared_from_this()); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: model importing not implemented"); } -const std::string CURRENT_GGUF_FILE_NAME = "current.gguf"; -std::string LlamaCppPlugin::get_current_gguf_file_path() const { - return m_cache_dir + "/" + CURRENT_GGUF_FILE_NAME; -} std::shared_ptr LlamaCppPlugin::import_model(std::istream& model, const ov::SoPtr& context, const ov::AnyMap& properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: model importing not implemented"); } ov::SupportedOpsMap LlamaCppPlugin::query_model(const std::shared_ptr& model, const ov::AnyMap& properties) const { - OPENVINO_THROW_NOT_IMPLEMENTED("VSHAMPOR: Not Implemented"); + OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: model importing not implemented"); } } // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp index df796aacb..8fb1fac80 100644 --- a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp +++ b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp @@ -5,7 +5,7 @@ namespace ov { namespace test { void set_device_suffix(const std::string& suffix) { if (!suffix.empty()) { - throw std::runtime_error("The suffix can't be used for TEMPLATE device!"); + throw std::runtime_error("The suffix can't be used for LLAMA_CPP device!"); } } } // namespace test diff --git a/modules/llama_cpp_plugin/tools/CMakeLists.txt b/modules/llama_cpp_plugin/tools/CMakeLists.txt deleted file mode 100644 index 5209d5ca9..000000000 --- a/modules/llama_cpp_plugin/tools/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -cmake_minimum_required(VERSION 3.10) -set(CMAKE_CXX_STANDARD 11) - -find_package(OpenVINO REQUIRED) - - -add_executable(llama_cpp_runner - "${CMAKE_CURRENT_SOURCE_DIR}/runner.cpp" - ) -target_link_libraries(llama_cpp_runner PRIVATE openvino::runtime) - - -add_executable(tensor_comparator - "${CMAKE_CURRENT_SOURCE_DIR}/tensor_comparator.cpp" - ) -target_link_libraries(tensor_comparator PRIVATE ggml) - -add_executable(cache_embedder - "${CMAKE_CURRENT_SOURCE_DIR}/cache_embedder.cpp" - ) -set_target_properties(cache_embedder PROPERTIES CXX_STANDARD 17) diff --git a/modules/llama_cpp_plugin/tools/cache_embedder.cpp b/modules/llama_cpp_plugin/tools/cache_embedder.cpp deleted file mode 100644 index bbfbf229c..000000000 --- a/modules/llama_cpp_plugin/tools/cache_embedder.cpp +++ /dev/null @@ -1,53 +0,0 @@ -#include -#include -#include -#include -#include -#include - -int main(int argc, char* argv[]) { - assert(argc == 3); - std::string cache_blob_name = argv[1]; - std::string gguf_file_name = argv[2]; - - std::uintmax_t original_file_size = std::filesystem::file_size(cache_blob_name); - std::fstream cache_io_stream(cache_blob_name, std::ios::binary | std::ios::in | std::ios::out); - - { - std::string tmp; - std::getline(cache_io_stream, tmp); // skip the blob header - std::cout << "skipped header line" << std::endl; - } - - std::uint64_t data_size = 0; - cache_io_stream.read(reinterpret_cast(&data_size), sizeof(data_size)); - std::cout << "skipping IR XML content, size " << data_size << std::endl; - cache_io_stream.seekp(data_size, std::ios::cur); // skip IR xml content - - cache_io_stream.read(reinterpret_cast(&data_size), 
sizeof(data_size)); - std::cout << "skipping IR weight content, size " << data_size << std::endl; - cache_io_stream.seekp(data_size, std::ios::cur); // skip IR weight content - - std::streampos pos = cache_io_stream.tellp(); - char magic[4]; - for (size_t i = 0; i < 4; i++) { - cache_io_stream >> magic[i]; - } - - std::string curr_magic(magic); - std::cout << "magic at current position is " << curr_magic << std::endl; - assert(curr_magic == "GGUF"); - cache_io_stream.seekp(pos); - - std::ifstream gguf_input_stream(gguf_file_name, std::ios::binary); - cache_io_stream << gguf_input_stream.rdbuf(); - std::cout << "gguf content write successful" << std::endl; - std::uintmax_t final_size = cache_io_stream.tellp(); - cache_io_stream.close(); - if (final_size < original_file_size) { - std::cout << "cache entry is now smaller (" << final_size << " vs original " << original_file_size << "), truncating" << std::endl; - std::filesystem::resize_file(cache_blob_name, final_size); - } - - return 0; -} diff --git a/modules/llama_cpp_plugin/tools/runner.cpp b/modules/llama_cpp_plugin/tools/runner.cpp deleted file mode 100644 index 390301cdb..000000000 --- a/modules/llama_cpp_plugin/tools/runner.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include "openvino/openvino.hpp" -#include - -int main(int argc, char* argv[]) { - ov::Core core; - core.set_property(ov::cache_dir("/tmp/my_cache_dir")); - std::string model_path = "/home/vshampor/work/optimum-intel/ov_model/openvino_model.xml"; - - std::cout << "VSHAMPOR: reading model\n"; - std::shared_ptr model = core.read_model(model_path); - - std::cout << "VSHAMPOR: compiling model\n"; - ov::CompiledModel compiled_model = core.compile_model(model, "LLAMA_CPP"); - - std::cout << "VSHAMPOR: compiled successfully\n"; - - std::cout << "VSHAMPOR: creating infer request\n"; - ov::InferRequest infer_request = compiled_model.create_infer_request(); - std::cout << "VSHAMPOR: infer request created\n"; - - // const ov::Output& input = compiled_model.input(); - // std::cout << "VSHAMPOR: got input\n"; - auto inputs = compiled_model.inputs(); - std::cout << "VSHAMPOR: model has " << inputs.size() << " inputs\n"; - for (const auto& input: inputs) { - std::cout << input.get_node()->get_friendly_name() << std::endl; - } - - for (size_t i = 0; i < inputs.size(); i++) { - const auto& curr_input = inputs[i]; - auto shape = curr_input.get_partial_shape(); - if (shape.is_dynamic()) { - std::cout << "VSHAMPOR: processing input " << i << " with a dynamic shape of " << shape.to_string() << std::endl; - ov::Rank r = shape.rank(); - if (r.get_length() == 2) { - ov::Tensor input_tensor{curr_input.get_element_type(), ov::Shape({1, 128})}; - int64_t* data_ptr = input_tensor.data(); - // fill with something - for (size_t elt_idx = 0; elt_idx < input_tensor.get_size(); elt_idx++) { - data_ptr[elt_idx] = 42; - } - infer_request.set_input_tensor(i, input_tensor); - } - else { // past_key_values - ov::Tensor input_tensor{curr_input.get_element_type(), ov::Shape({1, 12, 128, 64})}; - infer_request.set_input_tensor(i, input_tensor); - } - } - else { - std::cout << "VSHAMPOR: processing input " << i << " with a non-dynamic shape of " << shape.to_string() << std::endl; - ov::Tensor input_tensor{curr_input.get_element_type(), curr_input.get_shape()}; - infer_request.set_input_tensor(i, input_tensor); - } - } - std::cout << "VSHAMPOR: successfully set input tensor\n"; - - infer_request.infer(); - std::cout << "VSHAMPOR: inferred successfully\n"; - - ov::Tensor output = infer_request.get_tensor("logits"); - 
std::cout << "VSHAMPOR: got output tensor, shape " << output.get_shape().to_string() << std::endl; - - size_t n_output_elts = 10; - std::cout << "VSHAMPOR: first " << n_output_elts << " elements are:" << std::endl; - - float* output_data_ptr = output.data(); - for (size_t elt_idx = 0; elt_idx < n_output_elts; elt_idx++) { - std::cout << output_data_ptr[elt_idx] << " "; - } - - std::cout << std::endl; - return 0; -} diff --git a/modules/llama_cpp_plugin/tools/tensor_comparator.cpp b/modules/llama_cpp_plugin/tools/tensor_comparator.cpp deleted file mode 100644 index 83de96215..000000000 --- a/modules/llama_cpp_plugin/tools/tensor_comparator.cpp +++ /dev/null @@ -1,95 +0,0 @@ -#include "ggml.h" -#include -#include -#include -#include -#include -#include - - - -int main(int argc, char* argv[]) { - assert(argc == 3 || argc == 4); - std::string left_name(argv[1]); - std::string right_name(argv[2]); - - gguf_init_params left_params; left_params.no_alloc = false; left_params.ctx = nullptr; - gguf_init_params right_params; left_params.no_alloc = false; right_params.ctx = nullptr; - gguf_context* left_ctx = gguf_init_from_file(left_name.c_str(), left_params); - gguf_context* right_ctx = gguf_init_from_file(right_name.c_str(), right_params); - - std::vector tensor_names; - if (argc == 4) tensor_names.push_back(std::string(argv[3])); - else { - for (size_t idx = 0; idx < left_ctx->header.n_tensors; idx++) { - gguf_tensor_info left_tensor_info = left_ctx->infos[idx]; - tensor_names.push_back(left_tensor_info.name.data); - } - } - - for (const auto& tensor_name : tensor_names) { - - - int left_tensor_idx = gguf_find_tensor(left_ctx, tensor_name.c_str()); - int right_tensor_idx = gguf_find_tensor(right_ctx, tensor_name.c_str()); - - size_t left_tensor_offset = gguf_get_tensor_offset(left_ctx, left_tensor_idx) + left_ctx->offset; - size_t right_tensor_offset = gguf_get_tensor_offset(right_ctx, right_tensor_idx) + right_ctx->offset; - - gguf_tensor_info left_tensor_info = left_ctx->infos[left_tensor_idx]; - gguf_tensor_info right_tensor_info = right_ctx->infos[right_tensor_idx]; - - std::cout << "tensor name " << tensor_name << ", byte offsets: " << left_tensor_offset << " (left), " << right_tensor_offset << " (right)" << std::endl; - std::cout << "tensor name " << tensor_name << ", shape: "; - for (size_t i = 0; i < left_tensor_info.n_dims; i++) { - std::cout << left_tensor_info.ne[i] << ","; - } - std::cout << " (left), "; - - for (size_t i = 0; i < right_tensor_info.n_dims; i++) { - std::cout << right_tensor_info.ne[i] << ","; - } - std::cout << " (right) " << std::endl; - - size_t left_tensor_size = std::accumulate(std::begin(left_tensor_info.ne), std::begin(left_tensor_info.ne) + GGML_MAX_DIMS, (size_t) sizeof(float), std::multiplies()); - size_t right_tensor_size = std::accumulate(std::begin(right_tensor_info.ne), std::begin(right_tensor_info.ne) + GGML_MAX_DIMS, (size_t) sizeof(float), std::multiplies()); - - std::cout << "tensor name " << tensor_name << ", size (calculated): " << left_tensor_size << " (left), " << right_tensor_size << " (right)" << std::endl; - - if (left_tensor_size != right_tensor_size) { - std::cout << "size mismatch (" << left_tensor_size << " left, " << right_tensor_size << "right), exiting" << std::endl; - exit(-1); - } - - size_t bytes_compared = 0; - - std::ifstream left_file(left_name, std::ios::binary); - std::ifstream right_file(right_name, std::ios::binary); - - left_file.seekg(left_tensor_offset); - right_file.seekg(right_tensor_offset); - - std::cout << "first 10 
float values:" << std::endl; - for (size_t i = 0; i < 10; i++) { - float left_value; left_file.read((char*) &left_value, sizeof(float)); - float right_value; right_file.read((char*) &right_value, sizeof(float)); - - std::cout << left_value << " left, " << right_value << " right" << std::endl; - } - - left_file.seekg(left_tensor_offset); - right_file.seekg(right_tensor_offset); - for (size_t i = 0; i < left_tensor_size; i++) { - char left_byte; left_file.read((char*) &left_byte, sizeof(char)); - char right_byte; right_file.read((char*) &right_byte, sizeof(char)); - - if (left_byte != right_byte) { - std::cout << "byte " << bytes_compared << " mismatch (" << std::hex << +((uint8_t) left_byte) << " left, " << +((uint8_t) right_byte) << " right)" << std::endl; - std::cout << "offset left " << std::hex << left_tensor_offset + bytes_compared << ", right " << right_tensor_offset + bytes_compared << std::endl; - exit(-1); - } - bytes_compared++; - } - std::cout << "tensor contents are identical, bytes compared: " << bytes_compared << std::endl; - } -} From cd825d96badb104d949668bbde3dc8a97e19cf17 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Wed, 13 Mar 2024 16:21:45 +0100 Subject: [PATCH 04/27] Properly register the plugin in .xml if requested to do so --- modules/llama_cpp_plugin/CMakeLists.txt | 3 +-- modules/llama_cpp_plugin/src/CMakeLists.txt | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt index 89c5d4e0e..7e857e890 100644 --- a/modules/llama_cpp_plugin/CMakeLists.txt +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -4,7 +4,7 @@ project(LlamaCppPlugin) find_package(OpenVINODeveloperPackage REQUIRED) -ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" OFF) +ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" ON) if(CMAKE_COMPILER_IS_GNUCXX) ov_add_compiler_flags(-Wall) @@ -20,7 +20,6 @@ if(ENABLE_TESTS) add_subdirectory(tests/e2e) endif() - # install if(OpenVINODeveloperPackage_FOUND) diff --git a/modules/llama_cpp_plugin/src/CMakeLists.txt b/modules/llama_cpp_plugin/src/CMakeLists.txt index 5ec2caee7..3a3f32990 100644 --- a/modules/llama_cpp_plugin/src/CMakeLists.txt +++ b/modules/llama_cpp_plugin/src/CMakeLists.txt @@ -13,7 +13,7 @@ set(TARGET_NAME ${PLUGIN_LIBRARY_NAME}) file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) file(GLOB_RECURSE HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) -if (NOT ENABLE_TEMPLATE_REGISTRATION) +if (NOT ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION) # Skip install and registration of template component set(skip_plugin SKIP_INSTALL SKIP_REGISTRATION) endif() @@ -52,7 +52,7 @@ target_link_libraries(${TARGET_NAME} PRIVATE ggml) set_target_properties(${TARGET_NAME} PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE ${ENABLE_LTO}) -if (ENABLE_TEMPLATE_REGISTRATION) +if (ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION) # Update the plugins.xml file ov_register_plugins(MAIN_TARGET ${TARGET_NAME}) endif() From 16ebbaad713c43b3bf139bcc39b9bb55e7473a6f Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Wed, 13 Mar 2024 16:52:40 +0100 Subject: [PATCH 05/27] Add workflow for llama_cpp build and test --- .../llama_cpp_plugin_build_and_test.yml | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 .github/workflows/llama_cpp_plugin_build_and_test.yml diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml new file 
mode 100644 index 000000000..35e6f2366 --- /dev/null +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -0,0 +1,53 @@ +name: precommit + +on: + pull_request: + types: + - opened + - reopened + - synchronize + paths: + - 'modules/llama_cpp_plugin/**' + +jobs: + build_ubuntu20: + runs-on: ubuntu-20.04 + steps: + - name: Setup cmake + uses: jwlawson/actions-setup-cmake@v1.14 + with: + cmake-version: '3.24.x' + + - name: Checkout openvino_contrib + uses: actions/checkout@v3 + submodules: recursive + + - name: Checkout openvino + uses: actions/checkout@v3 + submodules: recursive + repository: https://github.com/vshampor/openvino + + - name: CMake - configure + run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON . + + - name: CMake - build + run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests + + - name: Upload build artifacts + uses: alehechka/upload-tartifact@v2 + with: + name: build_artifacts + path: ${{ github.workspace }}/bin/intel64/Release/ + + test_ubuntu20: + needs: build_ubuntu20 + runs-on: ubuntu-20.04 + steps: + - name: Download build artifacts + uses: alehechka/download-tartifact@v2 + with: + name: build_artifacts + path: binaries + + - name: Run E2E tests + run: ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From aab9d855fee981f4b19c87f76805a8cb4f1320c5 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Wed, 13 Mar 2024 17:00:06 +0100 Subject: [PATCH 06/27] Code style --- modules/llama_cpp_plugin/src/compiled_model.cpp | 17 ++++++----------- modules/llama_cpp_plugin/src/infer_request.cpp | 4 ++-- modules/llama_cpp_plugin/src/plugin.cpp | 3 +-- .../tests/e2e/prompt_response.cpp | 12 +++++++----- .../tests/e2e/set_device_name.cpp | 1 - 5 files changed, 16 insertions(+), 21 deletions(-) diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index a1498f708..17430353b 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -5,30 +5,27 @@ #include #include #include +#include #include "infer_request.hpp" #include "plugin.hpp" -#include - namespace ov { namespace llama_cpp_plugin { - LlamaCppModel::LlamaCppModel(const std::shared_ptr& model, const std::shared_ptr& plugin, const ov::SoPtr& context, const std::shared_ptr& task_executor) : ICompiledModel(model, plugin, context, task_executor) { - OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); - } - + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); +} LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model, std::istream& input_stream, const std::shared_ptr& plugin) : ICompiledModel(ov_model, plugin) { - OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); } LlamaCppModel::~LlamaCppModel() { @@ -39,7 +36,8 @@ LlamaCppModel::~LlamaCppModel() { } LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin) - : ICompiledModel(nullptr, plugin), m_gguf_fname(gguf_fname) { + : 
ICompiledModel(nullptr, plugin), + m_gguf_fname(gguf_fname) { num_tokens_processed_ptr = new size_t; // TODO (vshampor): hack, remove *num_tokens_processed_ptr = 0; OPENVINO_DEBUG << "llama_cpp_plugin: loading llama model directly from GGUF... " << std::endl; @@ -82,7 +80,6 @@ LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_pt } } - std::shared_ptr LlamaCppModel::get_runtime_model() const { OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: Not Implemented"); } @@ -115,7 +112,5 @@ void LlamaCppModel::export_model(std::ostream& output_stream) const { output_stream << in.rdbuf(); } - - } // namespace llama_cpp_plugin } // namespace ov diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index 40307c573..e41fe5a03 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -1,4 +1,5 @@ #include "infer_request.hpp" + #include #include "llama.h" @@ -79,7 +80,7 @@ void LlamaCppSyncInferRequest::infer() { {0}, true); // the last `true` here is a marker that the logits for this // token should be computed and returned - *(m_compiled_model_ptr->num_tokens_processed_ptr) += 1; + *(m_compiled_model_ptr->num_tokens_processed_ptr) += 1; } llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; @@ -110,7 +111,6 @@ std::vector LlamaCppSyncInferRequest::get_profiling_info() co return std::vector{}; }; - std::vector> LlamaCppSyncInferRequest::query_state() const { OPENVINO_DEBUG << "llama_cpp_plugin: query_state() called\n"; return {}; diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp index ec456cc45..22c90e439 100644 --- a/modules/llama_cpp_plugin/src/plugin.cpp +++ b/modules/llama_cpp_plugin/src/plugin.cpp @@ -4,8 +4,8 @@ #include "compiled_model.hpp" #include "openvino/op/constant.hpp" -#include "openvino/util/log.hpp" #include "openvino/runtime/internal_properties.hpp" +#include "openvino/util/log.hpp" namespace { static constexpr const char* wait_executor_name = "LlamaCppWaitExecutor"; @@ -87,7 +87,6 @@ std::shared_ptr LlamaCppPlugin::import_model(std::istream& m OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: model importing not implemented"); } - std::shared_ptr LlamaCppPlugin::import_model(std::istream& model, const ov::SoPtr& context, const ov::AnyMap& properties) const { diff --git a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp index f4e0369c5..60d1f8881 100644 --- a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp +++ b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp @@ -1,13 +1,16 @@ #include -#include "openvino/openvino.hpp" + #include "common_test_utils/file_utils.hpp" +#include "openvino/openvino.hpp" const std::string TEST_FILES_DIR = "test_data"; // "Why is the Sun yellow?" const std::vector GPT2_PROMPT_TOKEN_IDS = {5195, 318, 262, 3825, 7872, 30}; // "The Sun is a bright red, which means it is a bright red. The Sun is a bright red because it is a bright red." 
-const std::vector GPT2_REFERENCE_RESPONSE_TOKEN_IDS = {198, 464, 3825, 318, 257, 6016, 2266, 11, 543, 1724, 340, 318, 257, 6016, 2266, 13, 383, 3825, 318, 257, 6016, 2266, 780, 340, 318, 257, 6016, 2266, 13, 198, 198, 464}; +const std::vector GPT2_REFERENCE_RESPONSE_TOKEN_IDS = { + 198, 464, 3825, 318, 257, 6016, 2266, 11, 543, 1724, 340, 318, 257, 6016, 2266, 13, + 383, 3825, 318, 257, 6016, 2266, 780, 340, 318, 257, 6016, 2266, 13, 198, 198, 464}; const auto SEP = ov::util::FileTraits::file_separator; @@ -16,7 +19,8 @@ TEST(PromptResponseTest, TestGPT2) { ov::Core core; const std::string model_file_name = "gpt2.gguf"; - const std::string model_file = ov::test::utils::getCurrentWorkingDir() + SEP + TEST_FILES_DIR + SEP + model_file_name; + const std::string model_file = + ov::test::utils::getCurrentWorkingDir() + SEP + TEST_FILES_DIR + SEP + model_file_name; ov::InferRequest lm = core.compile_model(model_file, plugin_name).create_infer_request(); auto input_ids_tensor = ov::Tensor(ov::element::Type_t::i64, {1, GPT2_PROMPT_TOKEN_IDS.size()}); std::copy(GPT2_PROMPT_TOKEN_IDS.begin(), GPT2_PROMPT_TOKEN_IDS.end(), input_ids_tensor.data()); @@ -59,5 +63,3 @@ TEST(PromptResponseTest, TestGPT2) { ASSERT_EQ(out_token_ids, GPT2_REFERENCE_RESPONSE_TOKEN_IDS); } - - diff --git a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp index 8fb1fac80..aa06bc96f 100644 --- a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp +++ b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp @@ -10,4 +10,3 @@ void set_device_suffix(const std::string& suffix) { } } // namespace test } // namespace ov - From 106754ed13e711b335e0f6aa8e860edd8e7b5a98 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Wed, 13 Mar 2024 17:04:33 +0100 Subject: [PATCH 07/27] Adjust workflow.yml --- .../llama_cpp_plugin_build_and_test.yml | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 35e6f2366..324c06536 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -1,4 +1,4 @@ -name: precommit +name: llama_cpp_plugin_build_and_test on: pull_request: @@ -6,29 +6,33 @@ on: - opened - reopened - synchronize - paths: - - 'modules/llama_cpp_plugin/**' + paths: + - 'modules/llama_cpp_plugin/**' jobs: build_ubuntu20: runs-on: ubuntu-20.04 steps: - - name: Setup cmake - uses: jwlawson/actions-setup-cmake@v1.14 - with: - cmake-version: '3.24.x' + - name: Setup cmake + uses: jwlawson/actions-setup-cmake@v1.14 + with: + cmake-version: '3.24.x' - name: Checkout openvino_contrib uses: actions/checkout@v3 - submodules: recursive + with: + submodules: recursive + path: openvino_contrib - name: Checkout openvino uses: actions/checkout@v3 - submodules: recursive - repository: https://github.com/vshampor/openvino + with: + submodules: recursive + repository: vshampor/openvino + path: openvino - name: CMake - configure - run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON . 
+ run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON openvino - name: CMake - build run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests From 4724f0f417b920a10ab51ae147c4626dc09214fc Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 14 Mar 2024 12:18:43 +0100 Subject: [PATCH 08/27] Improve with comments --- .../llama_cpp_plugin_build_and_test.yml | 1 + modules/llama_cpp_plugin/CMakeLists.txt | 12 +++++---- modules/llama_cpp_plugin/build.sh | 19 ------------- .../include/compiled_model.hpp | 2 +- modules/llama_cpp_plugin/src/CMakeLists.txt | 5 ---- .../llama_cpp_plugin/src/compiled_model.cpp | 27 ++++++++++--------- .../llama_cpp_plugin/src/infer_request.cpp | 9 +++++-- .../llama_cpp_plugin/third_party/llama.cpp | 1 - 8 files changed, 30 insertions(+), 46 deletions(-) delete mode 100755 modules/llama_cpp_plugin/build.sh delete mode 160000 modules/llama_cpp_plugin/third_party/llama.cpp diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 324c06536..127aaf524 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -29,6 +29,7 @@ jobs: with: submodules: recursive repository: vshampor/openvino + branch: llama_cpp_mod path: openvino - name: CMake - configure diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt index 7e857e890..d909dc88e 100644 --- a/modules/llama_cpp_plugin/CMakeLists.txt +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -6,13 +6,15 @@ find_package(OpenVINODeveloperPackage REQUIRED) ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" ON) -if(CMAKE_COMPILER_IS_GNUCXX) - ov_add_compiler_flags(-Wall) -endif() - add_subdirectory(src) -add_subdirectory(third_party/llama.cpp) +FetchContent_Declare( + llama_cpp + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp + GIT_TAG b2417 +) + +FetchContent_MakeAvailable(llama_cpp) if(ENABLE_TESTS) include(CTest) diff --git a/modules/llama_cpp_plugin/build.sh b/modules/llama_cpp_plugin/build.sh deleted file mode 100755 index fa36b9e03..000000000 --- a/modules/llama_cpp_plugin/build.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -set -e -# What we want to do is build the llama.cpp dependency for different backends and have a separate plugin for each such build type. -# Sadly, CMake does not reliably allow to add_subdirectory multiple times in the same build tree, let alone with different options, -# since this would lead to "duplicate targets". There doesn't seem to be a solution to this problem even still. Thus, will have to -# invoke the cmake configure and build stage separately for each llama.cpp backend type. 
- -BUILD_TYPE=$1 -COMMON_OPTS="-DOpenVINODeveloperPackage_DIR=/home/vshampor/work/openvino/build -DCMAKE_EXPORT_COMPILE_COMMANDS=1" - -# Regular CPU build of llama.cpp -cmake -S ./ -B ./build/cpu/ ${COMMON_OPTS} "$@" -cmake --build ./build/cpu/ -j --target llama --target llama_cpp_plugin - - -# CUDA build -cmake -S ./ -B ./build/cuda/ -DLLAMA_CUBLAS=1 -DPLUGIN_DEVICE_NAME="LLAMA_CPP_CUDA" -DPLUGIN_LIBRARY_NAME="llama_cpp_cuda_plugin" -DLLAMA_TARGET_NAME="llama_cuda" ${COMMON_OPTS} "$@" -cmake --build ./build/cuda/ -j --target llama_cuda --target llama_cpp_cuda_plugin diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp index 1ae79f12e..a99d96061 100644 --- a/modules/llama_cpp_plugin/include/compiled_model.hpp +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -70,7 +70,7 @@ namespace ov { llama_model* m_llama_model_ptr = nullptr; llama_context* m_llama_ctx = nullptr; - std::shared_ptr m_model; + std::shared_ptr m_fake_model; size_t* num_tokens_processed_ptr = nullptr; // TODO: (vshampor) find a better place for this kind of storage std::vector> m_fake_inputs; diff --git a/modules/llama_cpp_plugin/src/CMakeLists.txt b/modules/llama_cpp_plugin/src/CMakeLists.txt index 3a3f32990..0ff3189c6 100644 --- a/modules/llama_cpp_plugin/src/CMakeLists.txt +++ b/modules/llama_cpp_plugin/src/CMakeLists.txt @@ -35,11 +35,6 @@ target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${LlamaCppPlugin_SOURCE_DIR}/include") -# link common OpenVINO Runtime libraries -target_link_libraries(${TARGET_NAME} PRIVATE - openvino::interpreter_backend - openvino::reference) - set( LLAMA_TARGET_NAME CACHE STRING "Exact target exposed by llama.cpp to link against as the main llama.cpp library") if(NOT LLAMA_TARGET_NAME) set( LLAMA_TARGET_NAME "llama" ) diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 17430353b..63349c854 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -54,28 +54,29 @@ LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_pt ov::ParameterVector inputs{input_ids}; - std::vector> unused_names_in_order = { - {"attention_mask", ov::element::Type_t::i64}, - {"position_ids", ov::element::Type_t::i64}, - {"beam_idx", ov::element::Type_t::i32}}; - for (const auto& descr : unused_names_in_order) { - auto unused_inp = std::make_shared(descr.second, ov::PartialShape({-1, -1})); + std::vector> additional_inputs_in_order = { + {"attention_mask", ov::element::Type_t::i64, {-1, -1}}, + {"position_ids", ov::element::Type_t::i64, {-1, -1}}, + {"beam_idx", ov::element::Type_t::i32, {-1, -1}}}; + + for (const auto& descr : additional_inputs_in_order) { + auto unused_inp = std::make_shared(std::get<1>(descr), std::get<2>(descr)); inputs.push_back(unused_inp); } - m_model = std::make_shared(logits, inputs, "fake_ov_model_for_io_specification"); + m_fake_model = std::make_shared(logits, inputs, "fake_ov_model_for_io_specification"); - m_model->inputs()[0].set_names({"input_ids"}); - for (size_t i = 0; i < unused_names_in_order.size(); i++) { - m_model->inputs()[i + 1].set_names({unused_names_in_order[i].first}); + m_fake_model->inputs()[0].set_names({"input_ids"}); + for (size_t i = 0; i < additional_inputs_in_order.size(); i++) { + m_fake_model->inputs()[i + 1].set_names({std::get<0>(additional_inputs_in_order[i])}); } - m_model->outputs()[0].set_names({"logits"}); + 
m_fake_model->outputs()[0].set_names({"logits"}); - for (auto input : m_model->inputs()) { + for (auto input : m_fake_model->inputs()) { m_fake_inputs.emplace_back(input); } - for (auto output : m_model->outputs()) { + for (auto output : m_fake_model->outputs()) { m_fake_outputs.emplace_back(output); } } diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index e41fe5a03..d745d0075 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -62,6 +62,9 @@ void llama_batch_add_reimpl(struct llama_batch& batch, void LlamaCppSyncInferRequest::infer() { auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among // all inputs without hardcode + // + auto position_ids_tensor_ptr = get_tensor(get_inputs()[2]); // TODO (vshampor) correctly identify input_ids among + // all inputs without hardcode OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; @@ -72,15 +75,17 @@ void LlamaCppSyncInferRequest::infer() { const int64_t* sequence_start_ptr = data_ptr /* + seq_idx */; + const int64_t* position_idx_ptr = position_ids_tensor_ptr->data(); + for (size_t tok_idx = 0; tok_idx < sequence_length; ++tok_idx) { const int64_t token_id = sequence_start_ptr[tok_idx]; + const int64_t position_id = position_idx_ptr[tok_idx]; llama_batch_add_reimpl(batch, token_id, - *(m_compiled_model_ptr->num_tokens_processed_ptr), + position_id, {0}, true); // the last `true` here is a marker that the logits for this // token should be computed and returned - *(m_compiled_model_ptr->num_tokens_processed_ptr) += 1; } llama_context* ctx = m_compiled_model_ptr->m_llama_ctx; diff --git a/modules/llama_cpp_plugin/third_party/llama.cpp b/modules/llama_cpp_plugin/third_party/llama.cpp deleted file mode 160000 index c8b02d38d..000000000 --- a/modules/llama_cpp_plugin/third_party/llama.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c8b02d38d98db8dab774f6f7655d7e9aede882f5 From 6fdf37626d2f07a5347cb850919d201ecf493f85 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 14 Mar 2024 12:28:47 +0100 Subject: [PATCH 09/27] Fix formatting and workflow --- .../llama_cpp_plugin_build_and_test.yml | 2 +- modules/llama_cpp_plugin/.clang-format | 28 +++++++++++++++++++ .../llama_cpp_plugin/src/compiled_model.cpp | 6 ++-- .../llama_cpp_plugin/src/infer_request.cpp | 8 +++--- .../tests/e2e/prompt_response.cpp | 3 +- 5 files changed, 39 insertions(+), 8 deletions(-) create mode 100644 modules/llama_cpp_plugin/.clang-format diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 127aaf524..829f24c67 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -29,7 +29,7 @@ jobs: with: submodules: recursive repository: vshampor/openvino - branch: llama_cpp_mod + ref: llama_cpp_mod path: openvino - name: CMake - configure diff --git a/modules/llama_cpp_plugin/.clang-format b/modules/llama_cpp_plugin/.clang-format new file mode 100644 index 000000000..ebe747b78 --- /dev/null +++ b/modules/llama_cpp_plugin/.clang-format @@ -0,0 +1,28 @@ +BasedOnStyle: Google +IndentWidth: 4 +UseTab: Never +ColumnLimit: 120 + +Language: Cpp +Standard: Cpp11 + +AccessModifierOffset: -4 
+AlignConsecutiveMacros: true +AllowAllArgumentsOnNextLine: false +AllowAllConstructorInitializersOnNextLine: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: Empty +AllowShortLoopsOnASingleLine: false +AlwaysBreakBeforeMultilineStrings: false +BinPackArguments: false +BinPackParameters: false +CommentPragmas: '^#' +DerivePointerAlignment: false +FixNamespaceComments: true +IndentCaseLabels: false +IndentPPDirectives: AfterHash +ForEachMacros: + - foreach + - FOREACH_CHILD diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 63349c854..ae4422c02 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -18,14 +18,16 @@ LlamaCppModel::LlamaCppModel(const std::shared_ptr& model, const ov::SoPtr& context, const std::shared_ptr& task_executor) : ICompiledModel(model, plugin, context, task_executor) { - OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is " + "supported for the LLAMA_CPP* plugins"); } LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model, std::istream& input_stream, const std::shared_ptr& plugin) : ICompiledModel(ov_model, plugin) { - OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is supported for the LLAMA_CPP* plugins"); + OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is " + "supported for the LLAMA_CPP* plugins"); } LlamaCppModel::~LlamaCppModel() { diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index d745d0075..9567b1922 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -60,11 +60,11 @@ void llama_batch_add_reimpl(struct llama_batch& batch, } void LlamaCppSyncInferRequest::infer() { - auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among - // all inputs without hardcode - // + auto input_ids_tensor_ptr = get_tensor(get_inputs()[0]); // TODO (vshampor) correctly identify input_ids among + // all inputs without hardcode + // auto position_ids_tensor_ptr = get_tensor(get_inputs()[2]); // TODO (vshampor) correctly identify input_ids among - // all inputs without hardcode + // all inputs without hardcode OPENVINO_ASSERT(input_ids_tensor_ptr->get_element_type() == ov::element::Type_t::i64); OPENVINO_ASSERT(input_ids_tensor_ptr->get_shape().size() == 2); size_t sequence_length = input_ids_tensor_ptr->get_shape()[1]; diff --git a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp index 60d1f8881..1101f5cb0 100644 --- a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp +++ b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp @@ -7,7 +7,8 @@ const std::string TEST_FILES_DIR = "test_data"; // "Why is the Sun yellow?" const std::vector GPT2_PROMPT_TOKEN_IDS = {5195, 318, 262, 3825, 7872, 30}; -// "The Sun is a bright red, which means it is a bright red. The Sun is a bright red because it is a bright red." +// "The Sun is a bright red, which means it is a bright red. The Sun is a bright +// red because it is a bright red." 
const std::vector GPT2_REFERENCE_RESPONSE_TOKEN_IDS = { 198, 464, 3825, 318, 257, 6016, 2266, 11, 543, 1724, 340, 318, 257, 6016, 2266, 13, 383, 3825, 318, 257, 6016, 2266, 780, 340, 318, 257, 6016, 2266, 13, 198, 198, 464}; From bd7e96817a51874905ef75f3de0a7ee6a157ea5e Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 14 Mar 2024 13:44:23 +0100 Subject: [PATCH 10/27] Add copyrights --- modules/llama_cpp_plugin/CMakeLists.txt | 3 +++ modules/llama_cpp_plugin/include/compiled_model.hpp | 3 +++ modules/llama_cpp_plugin/include/infer_request.hpp | 3 +++ modules/llama_cpp_plugin/include/plugin.hpp | 1 - modules/llama_cpp_plugin/src/CMakeLists.txt | 3 +++ modules/llama_cpp_plugin/src/compiled_model.cpp | 3 +++ modules/llama_cpp_plugin/src/infer_request.cpp | 3 +++ modules/llama_cpp_plugin/src/plugin.cpp | 3 +++ modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt | 2 ++ modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp | 3 +++ modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp | 3 +++ 11 files changed, 29 insertions(+), 1 deletion(-) diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt index d909dc88e..393f4f219 100644 --- a/modules/llama_cpp_plugin/CMakeLists.txt +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + cmake_minimum_required(VERSION 3.13) project(LlamaCppPlugin) diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp index a99d96061..9306ca437 100644 --- a/modules/llama_cpp_plugin/include/compiled_model.hpp +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #ifndef LLAMA_CPP_COMPILED_MODEL_HPP #define LLAMA_CPP_COMPILED_MODEL_HPP diff --git a/modules/llama_cpp_plugin/include/infer_request.hpp b/modules/llama_cpp_plugin/include/infer_request.hpp index 8954a180b..e8a0da65d 100644 --- a/modules/llama_cpp_plugin/include/infer_request.hpp +++ b/modules/llama_cpp_plugin/include/infer_request.hpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #ifndef LLAMA_CPP_INFER_REQUEST_HPP #define LLAMA_CPP_INFER_REQUEST_HPP diff --git a/modules/llama_cpp_plugin/include/plugin.hpp b/modules/llama_cpp_plugin/include/plugin.hpp index 1d6fdf1e4..f68ebd3d6 100644 --- a/modules/llama_cpp_plugin/include/plugin.hpp +++ b/modules/llama_cpp_plugin/include/plugin.hpp @@ -1,6 +1,5 @@ // Copyright (C) 2018-2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -// #ifndef LLAMA_CPP_PLUGIN_HPP #define LLAMA_CPP_PLUGIN_HPP diff --git a/modules/llama_cpp_plugin/src/CMakeLists.txt b/modules/llama_cpp_plugin/src/CMakeLists.txt index 0ff3189c6..d99a44795 100644 --- a/modules/llama_cpp_plugin/src/CMakeLists.txt +++ b/modules/llama_cpp_plugin/src/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + set( PLUGIN_LIBRARY_NAME CACHE STRING "Library name for the generated plugin" ${TARGET_NAME}) if(NOT PLUGIN_LIBRARY_NAME) set( PLUGIN_LIBRARY_NAME "llama_cpp_plugin" ) diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index ae4422c02..7bafa658e 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// 
SPDX-License-Identifier: Apache-2.0 + #include "compiled_model.hpp" #include diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index 9567b1922..ee2bdbc45 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #include "infer_request.hpp" #include diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp index 22c90e439..77287555b 100644 --- a/modules/llama_cpp_plugin/src/plugin.cpp +++ b/modules/llama_cpp_plugin/src/plugin.cpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #include "plugin.hpp" #include diff --git a/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt index 4c16f3484..ea96e9d3b 100644 --- a/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt +++ b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt @@ -1,3 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 set(TARGET_NAME llama_cpp_e2e_tests) diff --git a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp index 1101f5cb0..351104bf1 100644 --- a/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp +++ b/modules/llama_cpp_plugin/tests/e2e/prompt_response.cpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #include #include "common_test_utils/file_utils.hpp" diff --git a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp index aa06bc96f..7577f1673 100644 --- a/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp +++ b/modules/llama_cpp_plugin/tests/e2e/set_device_name.cpp @@ -1,3 +1,6 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + #include #include From 8b96020aa480e0838247fde3862459f9437e1a65 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 14 Mar 2024 13:46:33 +0100 Subject: [PATCH 11/27] Remove state --- modules/llama_cpp_plugin/include/compiled_model.hpp | 1 - modules/llama_cpp_plugin/src/compiled_model.cpp | 3 --- 2 files changed, 4 deletions(-) diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp index 9306ca437..292e373fd 100644 --- a/modules/llama_cpp_plugin/include/compiled_model.hpp +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -74,7 +74,6 @@ namespace ov { llama_model* m_llama_model_ptr = nullptr; llama_context* m_llama_ctx = nullptr; std::shared_ptr m_fake_model; - size_t* num_tokens_processed_ptr = nullptr; // TODO: (vshampor) find a better place for this kind of storage std::vector> m_fake_inputs; std::vector> m_fake_outputs; diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 7bafa658e..5fed08758 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -37,14 +37,11 @@ LlamaCppModel::~LlamaCppModel() { llama_free(m_llama_ctx); llama_free_model(m_llama_model_ptr); llama_backend_free(); - delete num_tokens_processed_ptr; } LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin) : ICompiledModel(nullptr, plugin), m_gguf_fname(gguf_fname) { - num_tokens_processed_ptr = new 
size_t; // TODO (vshampor): hack, remove - *num_tokens_processed_ptr = 0; OPENVINO_DEBUG << "llama_cpp_plugin: loading llama model directly from GGUF... " << std::endl; llama_model_params mparams = llama_model_default_params(); mparams.n_gpu_layers = 99; From 689492a59283e3775dc9056282294e3911c459ba Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 14 Mar 2024 14:02:32 +0100 Subject: [PATCH 12/27] Add test data preparation step to workflow --- .../workflows/llama_cpp_plugin_build_and_test.yml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 829f24c67..82d27420b 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -42,7 +42,7 @@ jobs: uses: alehechka/upload-tartifact@v2 with: name: build_artifacts - path: ${{ github.workspace }}/bin/intel64/Release/ + path: ${{ github.workspace }}/openvino/bin/intel64/Release/ test_ubuntu20: needs: build_ubuntu20 @@ -54,5 +54,15 @@ jobs: name: build_artifacts path: binaries + - name: Prepare test data + uses: actions/checkout@v3 + with: + repository: ggerganov/llama.cpp + path: llama.cpp + run: pip install llama.cpp/requirements/requirements-convert-hf-to-gguf.txt + run: huggingface-cli huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2 + run: mkdir -p ${{ github.workspace }}/test_data + run: python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf + - name: Run E2E tests run: ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From dff65c7b66df84ac88083836d8190e9f5a0c9743 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Thu, 14 Mar 2024 14:35:38 +0100 Subject: [PATCH 13/27] Fix workflow --- .../llama_cpp_plugin_build_and_test.yml | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 82d27420b..cbef0e2b7 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -38,8 +38,9 @@ jobs: - name: CMake - build run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests + - name: Upload build artifacts - uses: alehechka/upload-tartifact@v2 + uses: actions/upload-artifact@v4 with: name: build_artifacts path: ${{ github.workspace }}/openvino/bin/intel64/Release/ @@ -49,20 +50,23 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Download build artifacts - uses: alehechka/download-tartifact@v2 + uses: actions/download-artifact@v4 with: name: build_artifacts - path: binaries + path: ${{ github.workspace }}/binaries - - name: Prepare test data + - name: Prepare test data - checkout llama.cpp repo uses: actions/checkout@v3 with: repository: ggerganov/llama.cpp path: llama.cpp - run: pip install llama.cpp/requirements/requirements-convert-hf-to-gguf.txt - run: huggingface-cli huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2 - run: mkdir -p ${{ github.workspace }}/test_data - run: python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf + + - name: Prepare test data - convert test model files + run: | + pip install -r 
llama.cpp/requirements/requirements-convert-hf-to-gguf.txt + huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2 + mkdir -p ${{ github.workspace }}/test_data + python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf - name: Run E2E tests run: ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From d43fa2fd27bb223aceca1694789945660066815d Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 13:46:47 +0100 Subject: [PATCH 14/27] Allow resetting llama kv cache with .reset_state --- .../include/compiled_model.hpp | 2 ++ modules/llama_cpp_plugin/include/state.hpp | 25 +++++++++++++++++++ .../llama_cpp_plugin/src/infer_request.cpp | 4 ++- 3 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 modules/llama_cpp_plugin/include/state.hpp diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp index 292e373fd..38f3696e2 100644 --- a/modules/llama_cpp_plugin/include/compiled_model.hpp +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -12,6 +12,7 @@ namespace ov { namespace llama_cpp_plugin { class LlamaCppSyncInferRequest; class LlamaCppPlugin; + class LlamaCppState; class LlamaCppModel: public ICompiledModel { public: LlamaCppModel(const std::shared_ptr& model, @@ -79,6 +80,7 @@ namespace ov { std::vector> m_fake_outputs; friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest; + friend class ov::llama_cpp_plugin::LlamaCppState; }; } } // namespace ov diff --git a/modules/llama_cpp_plugin/include/state.hpp b/modules/llama_cpp_plugin/include/state.hpp new file mode 100644 index 000000000..18e615888 --- /dev/null +++ b/modules/llama_cpp_plugin/include/state.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#ifndef LLAMA_CPP_PLUGIN_HPP +#define LLAMA_CPP_PLUGIN_HPP + +#include "openvino/runtime/ivariable_state.hpp" +#include "compiled_model.hpp" + +namespace ov { + namespace llama_cpp_plugin { + class LlamaCppState : public IVariableState { + public: + LlamaCppState() = delete; + LlamaCppState(const std::shared_ptr& model_ptr) : m_model_ptr(model_ptr), IVariableState("llama_cpp_state") {} + void reset() override { + std::cout << "VSHAMPOR: resetting state" << std::endl; + llama_kv_cache_clear(m_model_ptr->m_llama_ctx); + } + private: + const std::shared_ptr& m_model_ptr; + }; + } +} +#endif // LLAMA_CPP_STATE_HPP diff --git a/modules/llama_cpp_plugin/src/infer_request.cpp b/modules/llama_cpp_plugin/src/infer_request.cpp index ee2bdbc45..5efd868d8 100644 --- a/modules/llama_cpp_plugin/src/infer_request.cpp +++ b/modules/llama_cpp_plugin/src/infer_request.cpp @@ -3,11 +3,13 @@ #include "infer_request.hpp" +#include #include #include "llama.h" #include "openvino/runtime/make_tensor.hpp" #include "openvino/util/log.hpp" +#include "state.hpp" namespace ov { namespace llama_cpp_plugin { @@ -121,7 +123,7 @@ std::vector LlamaCppSyncInferRequest::get_profiling_info() co std::vector> LlamaCppSyncInferRequest::query_state() const { OPENVINO_DEBUG << "llama_cpp_plugin: query_state() called\n"; - return {}; + return {std::static_pointer_cast(std::make_shared(m_compiled_model_ptr))}; } } // namespace llama_cpp_plugin } // namespace ov From cf25e3e757ddafb9b0e70a5b66f8938bee215d40 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 14:38:46 +0100 Subject: [PATCH 15/27] Set executable 
mode on test binary --- .github/workflows/llama_cpp_plugin_build_and_test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index cbef0e2b7..98b543df9 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -69,4 +69,6 @@ jobs: python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf - name: Run E2E tests - run: ${{ github.workspace }}/binaries/llama_cpp_e2e_tests + run: | + chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests + ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From 795273c928153d9c8daa112894acbb9584631815 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 15:18:42 +0100 Subject: [PATCH 16/27] Align thread setting with llama's main executable --- modules/llama_cpp_plugin/src/compiled_model.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 5fed08758..0525956ad 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include "infer_request.hpp" #include "plugin.hpp" @@ -47,6 +48,8 @@ LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_pt mparams.n_gpu_layers = 99; m_llama_model_ptr = llama_load_model_from_file(gguf_fname.c_str(), mparams); llama_context_params cparams = llama_context_default_params(); + cparams.n_threads = + std::thread::hardware_concurrency(); // TODO (vshampor): reuse equivalent setting defined by OV API m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); OPENVINO_DEBUG << "llama_cpp_plugin: llama model loaded successfully from GGUF..." 
<< std::endl; From 9990329d0ce1272f31c114ec3769b944332ed9a5 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 15:23:26 +0100 Subject: [PATCH 17/27] Set library path in workflow --- .github/workflows/llama_cpp_plugin_build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 98b543df9..8fd31a97a 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -71,4 +71,5 @@ jobs: - name: Run E2E tests run: | chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests + export LD_LIBRARY_PATH=${{ github.workspace }}/binaries ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From d2504407c7af736763902834da67f23f3e91284e Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 16:06:32 +0100 Subject: [PATCH 18/27] Add step to install libtbb --- .github/workflows/llama_cpp_plugin_build_and_test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 8fd31a97a..db99f7109 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -68,6 +68,8 @@ jobs: mkdir -p ${{ github.workspace }}/test_data python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf + - name: Install TBB + run: sudo apt install -y libtbb2 - name: Run E2E tests run: | chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From 6f938ce9f7239827eb62077b1dfb760f25de4285 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 16:21:24 +0100 Subject: [PATCH 19/27] Take n_ctx from model --- modules/llama_cpp_plugin/src/compiled_model.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp index 0525956ad..5af82100d 100644 --- a/modules/llama_cpp_plugin/src/compiled_model.cpp +++ b/modules/llama_cpp_plugin/src/compiled_model.cpp @@ -50,6 +50,7 @@ LlamaCppModel::LlamaCppModel(const std::string& gguf_fname, const std::shared_pt llama_context_params cparams = llama_context_default_params(); cparams.n_threads = std::thread::hardware_concurrency(); // TODO (vshampor): reuse equivalent setting defined by OV API + cparams.n_ctx = 0; // this means that the actual n_ctx will be taken equal to the model's train-time value m_llama_ctx = llama_new_context_with_model(m_llama_model_ptr, cparams); OPENVINO_DEBUG << "llama_cpp_plugin: llama model loaded successfully from GGUF..." 
<< std::endl; From 520bf774c6132d8e8eaade62206d8a8f70511625 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 16:32:09 +0100 Subject: [PATCH 20/27] Remove debug print --- modules/llama_cpp_plugin/include/state.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/llama_cpp_plugin/include/state.hpp b/modules/llama_cpp_plugin/include/state.hpp index 18e615888..efd612bc3 100644 --- a/modules/llama_cpp_plugin/include/state.hpp +++ b/modules/llama_cpp_plugin/include/state.hpp @@ -14,7 +14,6 @@ namespace ov { LlamaCppState() = delete; LlamaCppState(const std::shared_ptr& model_ptr) : m_model_ptr(model_ptr), IVariableState("llama_cpp_state") {} void reset() override { - std::cout << "VSHAMPOR: resetting state" << std::endl; llama_kv_cache_clear(m_model_ptr->m_llama_ctx); } private: From 628375874a2d5e1580cf0f2622da56e23501fb54 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 16:56:26 +0100 Subject: [PATCH 21/27] Add README.md --- modules/llama_cpp_plugin/README.md | 52 ++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 modules/llama_cpp_plugin/README.md diff --git a/modules/llama_cpp_plugin/README.md b/modules/llama_cpp_plugin/README.md new file mode 100644 index 000000000..bd0ce6dd8 --- /dev/null +++ b/modules/llama_cpp_plugin/README.md @@ -0,0 +1,52 @@ +### Build instructions + +This plugin should be built in the same fashion as the rest of the modules: + +1. Check out the OpenVINO repository proper (https://github.com/openvinotoolkit/openvino) +2. Configure the CMake build of the OpenVINO repository, making sure to point the corresponding CMake option to the location of the `openvino_contrib` repository. The command below, executed in the `openvino` repo root, will configure the build so that the modules other `llama_cpp_plugin` module will not be built to save build time - adjust the `-DBUILD_*` options if you need the other modules as well. + +```bash +cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_PLUGINS_XML=ON . +``` + +3. Build the plugin either as part of the complete openvino build by executing: + +```bash +cmake --build build -j`nproc` +``` + +or separately by specifying only the `llama_cpp_plugin` target: + +```bash +cmake --build build -j`nproc` -- llama_cpp_plugin +``` + +4. Now you can utilize the built `libllama_cpp_plugin.so` as a regular OV plugin with the device name `"LLAMA_CPP"` to directly load GGUF files and infer them using OV API with llama.cpp execution under the hood. Make sure that the plugin is discoverable by the OV runtime (e.g. by putting the built `libllama_cpp_plugin.so`, `libllama.so` and the autogenerated `plugins.xml` from the built location to your OV binaries location, or by setting `LD_LIBRARY_PATH` appropriately). 
+ +#### Example of LLM inference code + +```C++ + +ov::Core core; +auto model = core.compile_model("model.gguf", "LLAMA_CPP") +auto input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128}); +auto position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128}); +std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); + +auto infer_request == model.create_infer_request(); +infer_request.set_tensor("input_ids", input_ids); +infer_request.set_tensor("position_ids", position_ids); +infer_request.infer(); + +size_t vocab_size = lm.get_tensor("logits").get_shape().back(); +float* logits = lm.get_tensor("logits").data() + (input_ids_tensor.get_size() - 1) * vocab_size; +int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; +``` + +The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with equivalent meaning to the corresponding arguments in the LLM model representations in the huggingface `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution. + +Only batch size of 1 is currently supported. + + + + From 43e17410d96c2b57474a462f56b21ec8a176968e Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 15 Mar 2024 20:23:48 +0100 Subject: [PATCH 22/27] Install correct libtbb --- .github/workflows/llama_cpp_plugin_build_and_test.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index db99f7109..6b01806d1 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -68,10 +68,14 @@ jobs: mkdir -p ${{ github.workspace }}/test_data python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf - - name: Install TBB - run: sudo apt install -y libtbb2 + - name: Install libtbb2 + run: | + wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz + mkdir -p tbb + tar xvzf oneapi-tbb-2021.2.4-lin.tgz + - name: Run E2E tests run: | chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests - export LD_LIBRARY_PATH=${{ github.workspace }}/binaries + export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib ${{ github.workspace }}/binaries/llama_cpp_e2e_tests From 1c6c51e8fa09af455344baeeb1528f8b6ae6fa56 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Mon, 18 Mar 2024 23:32:47 +0100 Subject: [PATCH 23/27] Use OV from master --- .github/workflows/llama_cpp_plugin_build_and_test.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index 6b01806d1..d73757ad5 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -28,8 +28,7 @@ jobs: uses: actions/checkout@v3 with: submodules: recursive - repository: vshampor/openvino - ref: llama_cpp_mod + repository: openvinotoolkit/openvino path: openvino - name: CMake - configure From 53fe441740e80c631aac1951d8ddd1eae0633b59 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Tue, 19 Mar 2024 15:00:29 +0100 Subject: [PATCH 24/27] Remove gitmodules --- .gitmodules | 3 --- 1 file changed, 3 deletions(-) delete mode 
100644 .gitmodules diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 29da379f7..000000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "modules/llama_cpp_plugin/third_party/llama.cpp"] - path = modules/llama_cpp_plugin/third_party/llama.cpp - url = https://github.com/vshampor/llama.cpp From d5447c9683426a7f7d3309473e57a52600afc014 Mon Sep 17 00:00:00 2001 From: Vasily Shamporov Date: Fri, 22 Mar 2024 14:19:37 +0100 Subject: [PATCH 25/27] Apply comments --- .../llama_cpp_plugin_build_and_test.yml | 70 +++++----- modules/llama_cpp_plugin/CMakeLists.txt | 10 +- modules/llama_cpp_plugin/README.md | 10 +- .../include/compiled_model.hpp | 124 ++++++++---------- .../include/infer_request.hpp | 8 +- modules/llama_cpp_plugin/include/plugin.hpp | 54 ++++---- modules/llama_cpp_plugin/include/state.hpp | 31 +++-- modules/llama_cpp_plugin/src/CMakeLists.txt | 13 +- .../llama_cpp_plugin/src/compiled_model.cpp | 17 --- modules/llama_cpp_plugin/src/plugin.cpp | 30 ++--- .../llama_cpp_plugin/tests/e2e/CMakeLists.txt | 26 ++-- 11 files changed, 177 insertions(+), 216 deletions(-) diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml index d73757ad5..4f9ecb4d9 100644 --- a/.github/workflows/llama_cpp_plugin_build_and_test.yml +++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml @@ -1,38 +1,38 @@ name: llama_cpp_plugin_build_and_test on: - pull_request: - types: - - opened + pull_request: + types: + - opened - reopened - synchronize paths: - - 'modules/llama_cpp_plugin/**' + - 'modules/llama_cpp_plugin/**' jobs: - build_ubuntu20: - runs-on: ubuntu-20.04 + build_ubuntu20: + runs-on: ubuntu-20.04 steps: - - name: Setup cmake - uses: jwlawson/actions-setup-cmake@v1.14 - with: - cmake-version: '3.24.x' + - name: Setup cmake + uses: jwlawson/actions-setup-cmake@v1.14 + with: + cmake-version: '3.24.x' - name: Checkout openvino_contrib uses: actions/checkout@v3 with: - submodules: recursive - path: openvino_contrib + submodules: recursive + path: openvino_contrib - name: Checkout openvino uses: actions/checkout@v3 with: - submodules: recursive - repository: openvinotoolkit/openvino - path: openvino + submodules: recursive + repository: openvinotoolkit/openvino + path: openvino - name: CMake - configure - run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON openvino + run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules/llama_cpp_plugin -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON -DENABLE_PLUGIN_REGISTRATION=ON openvino - name: CMake - build run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests @@ -41,40 +41,40 @@ jobs: - name: Upload build artifacts uses: actions/upload-artifact@v4 with: - name: build_artifacts - path: ${{ github.workspace }}/openvino/bin/intel64/Release/ + name: build_artifacts + path: ${{ github.workspace }}/openvino/bin/intel64/Release/ - test_ubuntu20: +test_ubuntu20: needs: build_ubuntu20 runs-on: ubuntu-20.04 steps: - - name: Download build artifacts - uses: actions/download-artifact@v4 - with: - name: build_artifacts + - name: Download build artifacts + uses: actions/download-artifact@v4 + with: + name: 
build_artifacts path: ${{ github.workspace }}/binaries - name: Prepare test data - checkout llama.cpp repo uses: actions/checkout@v3 with: - repository: ggerganov/llama.cpp - path: llama.cpp + repository: ggerganov/llama.cpp + path: llama.cpp - name: Prepare test data - convert test model files run: | - pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt - huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2 - mkdir -p ${{ github.workspace }}/test_data - python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf + pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt + huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2 + mkdir -p ${{ github.workspace }}/test_data + python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf - name: Install libtbb2 run: | - wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz - mkdir -p tbb - tar xvzf oneapi-tbb-2021.2.4-lin.tgz + wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz + mkdir -p tbb + tar xvzf oneapi-tbb-2021.2.4-lin.tgz - name: Run E2E tests run: | - chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests - export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib - ${{ github.workspace }}/binaries/llama_cpp_e2e_tests + chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests + export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib + ${{ github.workspace }}/binaries/llama_cpp_e2e_tests diff --git a/modules/llama_cpp_plugin/CMakeLists.txt b/modules/llama_cpp_plugin/CMakeLists.txt index 393f4f219..8c9939eab 100644 --- a/modules/llama_cpp_plugin/CMakeLists.txt +++ b/modules/llama_cpp_plugin/CMakeLists.txt @@ -7,15 +7,15 @@ project(LlamaCppPlugin) find_package(OpenVINODeveloperPackage REQUIRED) -ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" ON) +ov_option(ENABLE_LLAMA_CPP_PLUGIN_REGISTRATION "Enables registration of LLAMA_CPP plugin" OFF) add_subdirectory(src) FetchContent_Declare( - llama_cpp - GIT_REPOSITORY https://github.com/ggerganov/llama.cpp - GIT_TAG b2417 -) + llama_cpp + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp + GIT_TAG b2417 + ) FetchContent_MakeAvailable(llama_cpp) diff --git a/modules/llama_cpp_plugin/README.md b/modules/llama_cpp_plugin/README.md index bd0ce6dd8..df20db7d3 100644 --- a/modules/llama_cpp_plugin/README.md +++ b/modules/llama_cpp_plugin/README.md @@ -6,19 +6,19 @@ This plugin should be built in the same fashion as the rest of the modules: 2. Configure the CMake build of the OpenVINO repository, making sure to point the corresponding CMake option to the location of the `openvino_contrib` repository. The command below, executed in the `openvino` repo root, will configure the build so that the modules other `llama_cpp_plugin` module will not be built to save build time - adjust the `-DBUILD_*` options if you need the other modules as well. ```bash -cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=/modules -DBUILD_java_api=OFF -DBUILD_nvidia_plugin=OFF -DBUILD_custom_operations=OFF -DBUILD_openvino_code=OFF -DBUILD_token_merging=OFF -DENABLE_PLUGINS_XML=ON . 
+cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=/modules/llama_cpp_plugin -DENABLE_PLUGINS_XML=ON -DENABLE_PLUGIN_REGISTRATION=ON . ``` 3. Build the plugin either as part of the complete openvino build by executing: ```bash -cmake --build build -j`nproc` +cmake --build build --parallel ``` or separately by specifying only the `llama_cpp_plugin` target: ```bash -cmake --build build -j`nproc` -- llama_cpp_plugin +cmake --build build --parallel -- llama_cpp_plugin ``` 4. Now you can utilize the built `libllama_cpp_plugin.so` as a regular OV plugin with the device name `"LLAMA_CPP"` to directly load GGUF files and infer them using OV API with llama.cpp execution under the hood. Make sure that the plugin is discoverable by the OV runtime (e.g. by putting the built `libllama_cpp_plugin.so`, `libllama.so` and the autogenerated `plugins.xml` from the built location to your OV binaries location, or by setting `LD_LIBRARY_PATH` appropriately). @@ -28,7 +28,7 @@ cmake --build build -j`nproc` -- llama_cpp_plugin ```C++ ov::Core core; -auto model = core.compile_model("model.gguf", "LLAMA_CPP") +auto model = core.compile_model("model.gguf", "LLAMA_CPP") auto input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128}); auto position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128}); std::iota(position_ids.data(), position_ids.data() + position_ids.get_size(), 0); @@ -43,7 +43,7 @@ float* logits = lm.get_tensor("logits").data() + (input_ids_tensor.get_si int64_t out_token = std::max_element(logits, logits + vocab_size) - logits; ``` -The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with equivalent meaning to the corresponding arguments in the LLM model representations in the huggingface `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution. +The models obtained by the `.compile_model` call with the `LLAMA_CPP` plugin expose two inputs (`input_ids` and `position_ids`) and a single output (`logits`) with equivalent meaning to the corresponding arguments in the LLM model representations in the huggingface `transformers` repository. The `attention_mask` and `beam_idx` inputs may be set as well, but will have no effect on the execution. Only batch size of 1 is currently supported. 
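To make the input/output contract described above more concrete, here is a minimal greedy-decoding sketch built only on the interface the README documents (`input_ids`, `position_ids`, `logits`, batch size 1) and on the variable state introduced earlier in this series, whose `reset()` clears the llama.cpp KV cache. The prompt length of 128, the 32-token generation budget, the placeholder token ids and the `model.gguf` file name are illustrative assumptions, not values mandated by the plugin:

```C++
// A sketch only: the token ids below are placeholders for real tokenizer output,
// and the prompt length (128) / generation budget (32) are arbitrary choices.
#include <openvino/openvino.hpp>

#include <algorithm>
#include <numeric>

int main() {
    ov::Core core;
    ov::CompiledModel model = core.compile_model("model.gguf", "LLAMA_CPP");
    ov::InferRequest infer_request = model.create_infer_request();

    // Full-prompt pass: positions 0..127 for a 128-token prompt.
    auto input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
    auto position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 128});
    std::fill_n(input_ids.data<int64_t>(), input_ids.get_size(), 0);  // placeholder token ids
    std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), 0);
    infer_request.set_tensor("input_ids", input_ids);
    infer_request.set_tensor("position_ids", position_ids);
    infer_request.infer();

    // Greedy pick of the next token from the logits of the last prompt position.
    auto logits_tensor = infer_request.get_tensor("logits");
    const size_t vocab_size = logits_tensor.get_shape().back();
    const float* last_row = logits_tensor.data<float>() + (input_ids.get_size() - 1) * vocab_size;
    int64_t next_token = std::max_element(last_row, last_row + vocab_size) - last_row;

    // Per-token generation: one {1, 1} input per step, the position id keeps advancing,
    // and the plugin-side KV cache retains the already processed context.
    for (int64_t position = 128; position < 128 + 32; ++position) {
        auto step_input_ids = ov::Tensor(ov::element::Type_t::i64, {1, 1});
        auto step_position_ids = ov::Tensor(ov::element::Type_t::i64, {1, 1});
        step_input_ids.data<int64_t>()[0] = next_token;
        step_position_ids.data<int64_t>()[0] = position;
        infer_request.set_tensor("input_ids", step_input_ids);
        infer_request.set_tensor("position_ids", step_position_ids);
        infer_request.infer();

        logits_tensor = infer_request.get_tensor("logits");
        const float* logits = logits_tensor.data<float>();
        next_token = std::max_element(logits, logits + vocab_size) - logits;
    }

    // Before starting an unrelated prompt, clear the llama.cpp KV cache.
    for (auto&& state : infer_request.query_state()) {
        state.reset();
    }
    return 0;
}
```

Each step feeds a single `{1, 1}` `input_ids`/`position_ids` pair while the KV cache inside the plugin keeps the past context; calling `reset()` on the queried state corresponds to the `llama_kv_cache_clear` call made by `LlamaCppState::reset()`.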
diff --git a/modules/llama_cpp_plugin/include/compiled_model.hpp b/modules/llama_cpp_plugin/include/compiled_model.hpp index 38f3696e2..4dce17819 100644 --- a/modules/llama_cpp_plugin/include/compiled_model.hpp +++ b/modules/llama_cpp_plugin/include/compiled_model.hpp @@ -4,85 +4,75 @@ #ifndef LLAMA_CPP_COMPILED_MODEL_HPP #define LLAMA_CPP_COMPILED_MODEL_HPP +#include "llama.h" #include "openvino/runtime/icompiled_model.hpp" #include "openvino/runtime/isync_infer_request.hpp" -#include "llama.h" namespace ov { - namespace llama_cpp_plugin { - class LlamaCppSyncInferRequest; - class LlamaCppPlugin; - class LlamaCppState; - class LlamaCppModel: public ICompiledModel { - public: - LlamaCppModel(const std::shared_ptr& model, - const std::shared_ptr& plugin, - const ov::SoPtr& context, - const std::shared_ptr& task_executor - ); - - LlamaCppModel(const std::shared_ptr& ov_model, - std::istream& input_file, - const std::shared_ptr& plugin); +namespace llama_cpp_plugin { +class LlamaCppSyncInferRequest; +class LlamaCppPlugin; +class LlamaCppState; +class LlamaCppModel : public ICompiledModel { +public: + LlamaCppModel(const std::string& gguf_fname, const std::shared_ptr& plugin); + /** + * @brief Export compiled model to stream + * + * @param model output stream + */ + virtual void export_model(std::ostream& model) const override; - LlamaCppModel(const std::string& gguf_fname, - const std::shared_ptr& plugin); - /** - * @brief Export compiled model to stream - * - * @param model output stream - */ - virtual void export_model(std::ostream& model) const override; + /** + * @brief Returns runtime model + * + * @return OpenVINO Model which represents runtime graph + */ + virtual std::shared_ptr get_runtime_model() const override; - /** - * @brief Returns runtime model - * - * @return OpenVINO Model which represents runtime graph - */ - virtual std::shared_ptr get_runtime_model() const override; + /** + * @brief Allows to set property + * + * @param properties new plugin properties + */ + virtual void set_property(const ov::AnyMap& properties) override; - /** - * @brief Allows to set property - * - * @param properties new plugin properties - */ - virtual void set_property(const ov::AnyMap& properties) override; + /** + * @brief Returns property + * + * @param name Property name + * + * @return Property value + * virtual std::shared_ptr create_sync_infer_request() const override; + **/ + virtual ov::Any get_property(const std::string& name) const override; + virtual const std::vector>& inputs() const override; + virtual const std::vector>& outputs() const override; + virtual ~LlamaCppModel(); - /** - * @brief Returns property - * - * @param name Property name - * - * @return Property value - * virtual std::shared_ptr create_sync_infer_request() const override; - **/ - virtual ov::Any get_property(const std::string& name) const override; - virtual const std::vector>& inputs() const override; - virtual const std::vector>& outputs() const override; - virtual ~LlamaCppModel(); - protected: - /** - * @brief Method creates infer request implementation - * - * @return Sync infer request - */ - virtual std::shared_ptr create_sync_infer_request() const override; +protected: + /** + * @brief Method creates infer request implementation + * + * @return Sync infer request + */ + virtual std::shared_ptr create_sync_infer_request() const override; - private: - gguf_context* m_gguf_ctx = nullptr; - std::string m_gguf_fname; +private: + gguf_context* m_gguf_ctx = nullptr; + std::string m_gguf_fname; - llama_model* 
m_llama_model_ptr = nullptr;
- llama_context* m_llama_ctx = nullptr;
- std::shared_ptr m_fake_model;
+ llama_model* m_llama_model_ptr = nullptr;
+ llama_context* m_llama_ctx = nullptr;
+ std::shared_ptr m_fake_model;
- std::vector> m_fake_inputs;
- std::vector> m_fake_outputs;
+ std::vector> m_fake_inputs;
+ std::vector> m_fake_outputs;
- friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest;
- friend class ov::llama_cpp_plugin::LlamaCppState;
- };
- }
+ friend class ov::llama_cpp_plugin::LlamaCppSyncInferRequest;
+ friend class ov::llama_cpp_plugin::LlamaCppState;
+};
+} // namespace llama_cpp_plugin
} // namespace ov
#endif // LLAMA_CPP_COMPILED_MODEL_HPP
diff --git a/modules/llama_cpp_plugin/include/infer_request.hpp b/modules/llama_cpp_plugin/include/infer_request.hpp
index e8a0da65d..8f298ab57 100644
--- a/modules/llama_cpp_plugin/include/infer_request.hpp
+++ b/modules/llama_cpp_plugin/include/infer_request.hpp
@@ -4,17 +4,16 @@
#ifndef LLAMA_CPP_INFER_REQUEST_HPP
#define LLAMA_CPP_INFER_REQUEST_HPP
-#include "openvino/openvino.hpp"
#include "compiled_model.hpp"
+#include "openvino/openvino.hpp"
namespace ov {
namespace llama_cpp_plugin {
-
class LlamaCppSyncInferRequest : public ISyncInferRequest {
public:
explicit LlamaCppSyncInferRequest(const std::shared_ptr& compiled_model);
- virtual ~LlamaCppSyncInferRequest() {};
+ virtual ~LlamaCppSyncInferRequest(){};
virtual void set_tensors_impl(const ov::Output port,
const std::vector>& tensors) override;
@@ -22,11 +21,12 @@ class LlamaCppSyncInferRequest : public ISyncInferRequest {
virtual void infer() override;
virtual std::vector get_profiling_info() const override;
virtual std::vector> query_state() const override;
+
private:
std::shared_ptr m_compiled_model_ptr;
};
-} // namespace LlamaCppPlugin
+} // namespace llama_cpp_plugin
}; // namespace ov
#endif /* LLAMA_CPP_INFER_REQUEST_HPP */
diff --git a/modules/llama_cpp_plugin/include/plugin.hpp b/modules/llama_cpp_plugin/include/plugin.hpp
index f68ebd3d6..1bcb6abbd 100644
--- a/modules/llama_cpp_plugin/include/plugin.hpp
+++ b/modules/llama_cpp_plugin/include/plugin.hpp
@@ -7,42 +7,40 @@
#include "openvino/runtime/iplugin.hpp"
namespace ov {
- namespace llama_cpp_plugin {
- class LlamaCppPlugin : public IPlugin {
- public:
- LlamaCppPlugin();
- virtual std::shared_ptr compile_model(const std::shared_ptr& model,
- const ov::AnyMap& properties) const override;
+namespace llama_cpp_plugin {
+class LlamaCppPlugin : public IPlugin {
+public:
+ LlamaCppPlugin();
+ virtual std::shared_ptr compile_model(const std::shared_ptr& model,
+ const ov::AnyMap& properties) const override;
- virtual std::shared_ptr compile_model(const std::shared_ptr& model,
- const ov::AnyMap& properties,
- const ov::SoPtr& context) const override;
+ virtual std::shared_ptr compile_model(
+ const std::shared_ptr& model,
+ const ov::AnyMap& properties,
+ const ov::SoPtr& context) const override;
- virtual void set_property(const ov::AnyMap& properties) override;
+ virtual void set_property(const ov::AnyMap& properties) override;
- virtual ov::Any get_property(const std::string& name, const ov::AnyMap& arguments) const override;
+ virtual ov::Any get_property(const std::string& name, const ov::AnyMap& arguments) const override;
- virtual ov::SoPtr create_context(const ov::AnyMap& remote_properties) const override;
+ virtual ov::SoPtr create_context(const ov::AnyMap& remote_properties) const override;
- virtual ov::SoPtr get_default_context(const ov::AnyMap& remote_properties) const override;
+ virtual ov::SoPtr get_default_context(const ov::AnyMap& remote_properties) const override;
- virtual std::shared_ptr import_model(std::istream& model,
- const ov::AnyMap& properties) const override;
+ virtual std::shared_ptr import_model(std::istream& model,
+ const ov::AnyMap& properties) const override;
- virtual std::shared_ptr compile_model(const std::string& fname,
- const ov::AnyMap& properties) const override;
+ virtual std::shared_ptr compile_model(const std::string& fname,
+ const ov::AnyMap& properties) const override;
- virtual std::shared_ptr import_model(std::istream& model,
- const ov::SoPtr& context,
- const ov::AnyMap& properties) const override;
+ virtual std::shared_ptr import_model(std::istream& model,
+ const ov::SoPtr& context,
+ const ov::AnyMap& properties) const override;
- virtual ov::SupportedOpsMap query_model(const std::shared_ptr& model,
- const ov::AnyMap& properties) const override;
-
- private:
- std::string m_cache_dir = "";
- };
- } // namespace llama_cpp_plugin
+ virtual ov::SupportedOpsMap query_model(const std::shared_ptr& model,
+ const ov::AnyMap& properties) const override;
+};
+} // namespace llama_cpp_plugin
} // namespace ov
-#endif // LLAMA_CPP_PLUGIN_HPP
+#endif // LLAMA_CPP_PLUGIN_HPP
diff --git a/modules/llama_cpp_plugin/include/state.hpp b/modules/llama_cpp_plugin/include/state.hpp
index efd612bc3..229970894 100644
--- a/modules/llama_cpp_plugin/include/state.hpp
+++ b/modules/llama_cpp_plugin/include/state.hpp
@@ -4,21 +4,24 @@
#ifndef LLAMA_CPP_PLUGIN_HPP
#define LLAMA_CPP_PLUGIN_HPP
-#include "openvino/runtime/ivariable_state.hpp"
#include "compiled_model.hpp"
+#include "openvino/runtime/ivariable_state.hpp"
namespace ov {
- namespace llama_cpp_plugin {
- class LlamaCppState : public IVariableState {
- public:
- LlamaCppState() = delete;
- LlamaCppState(const std::shared_ptr& model_ptr) : m_model_ptr(model_ptr), IVariableState("llama_cpp_state") {}
- void reset() override {
- llama_kv_cache_clear(m_model_ptr->m_llama_ctx);
- }
- private:
- const std::shared_ptr& m_model_ptr;
- };
+namespace llama_cpp_plugin {
+class LlamaCppState : public IVariableState {
+public:
+ LlamaCppState() = delete;
+ LlamaCppState(const std::shared_ptr& model_ptr)
+ : m_model_ptr(model_ptr),
+ IVariableState("llama_cpp_state") {}
+ void reset() override {
+ llama_kv_cache_clear(m_model_ptr->m_llama_ctx);
}
-}
-#endif // LLAMA_CPP_STATE_HPP
+
+private:
+ const std::shared_ptr& m_model_ptr;
+};
+} // namespace llama_cpp_plugin
+} // namespace ov
+#endif // LLAMA_CPP_STATE_HPP
diff --git a/modules/llama_cpp_plugin/src/CMakeLists.txt b/modules/llama_cpp_plugin/src/CMakeLists.txt
index d99a44795..258df852f 100644
--- a/modules/llama_cpp_plugin/src/CMakeLists.txt
+++ b/modules/llama_cpp_plugin/src/CMakeLists.txt
@@ -25,14 +25,11 @@ endif()
# adds a shared library with plugin
ov_add_plugin(NAME ${TARGET_NAME}
- DEVICE_NAME ${PLUGIN_DEVICE_NAME}
- SOURCES ${SOURCES} ${HEADERS}
- ${skip_plugin}
- VERSION_DEFINES_FOR plugin.cpp
- ADD_CLANG_FORMAT)
-
-# Enable support of CC for the plugin
-ov_mark_target_as_cc(${TARGET_NAME})
+ DEVICE_NAME ${PLUGIN_DEVICE_NAME}
+ SOURCES ${SOURCES} ${HEADERS}
+ ${skip_plugin}
+ VERSION_DEFINES_FOR plugin.cpp
+ ADD_CLANG_FORMAT)
target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}"
diff --git a/modules/llama_cpp_plugin/src/compiled_model.cpp b/modules/llama_cpp_plugin/src/compiled_model.cpp
index 5af82100d..adf9e17cf 100644
--- a/modules/llama_cpp_plugin/src/compiled_model.cpp
+++ b/modules/llama_cpp_plugin/src/compiled_model.cpp
@@ -17,23 +17,6 @@ namespace ov {
namespace llama_cpp_plugin {
-LlamaCppModel::LlamaCppModel(const std::shared_ptr& model,
- const std::shared_ptr& plugin,
- const ov::SoPtr& context,
- const std::shared_ptr& task_executor)
- : ICompiledModel(model, plugin, context, task_executor) {
- OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is "
- "supported for the LLAMA_CPP* plugins");
-}
-
-LlamaCppModel::LlamaCppModel(const std::shared_ptr& ov_model,
- std::istream& input_stream,
- const std::shared_ptr& plugin)
- : ICompiledModel(ov_model, plugin) {
- OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is "
- "supported for the LLAMA_CPP* plugins");
-}
-
LlamaCppModel::~LlamaCppModel() {
llama_free(m_llama_ctx);
llama_free_model(m_llama_model_ptr);
diff --git a/modules/llama_cpp_plugin/src/plugin.cpp b/modules/llama_cpp_plugin/src/plugin.cpp
index 77287555b..52536130c 100644
--- a/modules/llama_cpp_plugin/src/plugin.cpp
+++ b/modules/llama_cpp_plugin/src/plugin.cpp
@@ -23,38 +23,31 @@ LlamaCppPlugin::LlamaCppPlugin() : IPlugin() {
}
std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model,
const ov::AnyMap& properties) const {
- OPENVINO_DEBUG << "llama_cpp_plugin: LlamaCppPlugin::compile_model" << std::endl;
- return compile_model(model, properties, {});
+ OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is "
+ "supported for the LLAMA_CPP* plugins");
}
-std::shared_ptr LlamaCppPlugin::compile_model(const std::string& fname,
- const ov::AnyMap& properties) const {
- return std::make_shared(fname, shared_from_this());
-}
std::shared_ptr LlamaCppPlugin::compile_model(const std::shared_ptr& model,
const ov::AnyMap& properties,
const ov::SoPtr& context) const {
- OPENVINO_DEBUG << "llama_cpp_plugin: compile_model called in C++" << std::endl;
- return std::make_shared(model->clone(),
- shared_from_this(),
- context,
- get_executor_manager()->get_executor(template_exclusive_executor));
+ OPENVINO_THROW_NOT_IMPLEMENTED("Currently only direct GGUF file loading is "
+ "supported for the LLAMA_CPP* plugins");
+}
+std::shared_ptr LlamaCppPlugin::compile_model(const std::string& fname,
+ const ov::AnyMap& properties) const {
+ return std::make_shared(fname, shared_from_this());
}
void LlamaCppPlugin::set_property(const ov::AnyMap& properties) {
for (const auto& map_entry : properties) {
- if (map_entry.first == ov::cache_dir.name()) {
- m_cache_dir = map_entry.second.as();
- } else {
- OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: setting property ", map_entry.first, "not implemented");
- }
+ OPENVINO_THROW_NOT_IMPLEMENTED("llama_cpp_plugin: setting property ", map_entry.first, "not implemented");
}
}
ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap& arguments) const {
if (ov::supported_properties == name) {
return decltype(ov::supported_properties)::value_type(
- std::vector({ov::cache_dir, ov::device::capabilities, ov::device::full_name}));
+ std::vector({ov::device::capabilities, ov::device::full_name}));
}
if (ov::device::capabilities == name) {
return decltype(ov::device::capabilities)::value_type(
@@ -65,9 +58,6 @@ ov::Any LlamaCppPlugin::get_property(const std::string& name, const ov::AnyMap&
std::vector({ov::internal::caching_properties}));
}
- if (ov::cache_dir == name) {
- return m_cache_dir;
- }
if (ov::internal::caching_properties == name) {
return std::vector{ov::device::full_name};
}
diff --git a/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt
index ea96e9d3b..096ad46ad 100644
--- a/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt
+++ b/modules/llama_cpp_plugin/tests/e2e/CMakeLists.txt
@@ -4,17 +4,17 @@ set(TARGET_NAME llama_cpp_e2e_tests)
ov_add_test_target(
- NAME ${TARGET_NAME}
- ROOT ${CMAKE_CURRENT_SOURCE_DIR}
- DEPENDENCIES
- llama_cpp_plugin
- LINK_LIBRARIES
- openvino::runtime::dev
- openvino::funcSharedTests
- INCLUDES
- "${OpenVINOTemplatePlugin_SOURCE_DIR}/include"
- ADD_CLANG_FORMAT
- LABELS
- OV UNIT TEMPLATE
-)
+ NAME ${TARGET_NAME}
+ ROOT ${CMAKE_CURRENT_SOURCE_DIR}
+ DEPENDENCIES
+ llama_cpp_plugin
+ LINK_LIBRARIES
+ openvino::runtime::dev
+ openvino::funcSharedTests
+ INCLUDES
+ "${LlamaCppPlugin_SOURCE_DIR}/include"
+ ADD_CLANG_FORMAT
+ LABELS
+ OV UNIT TEMPLATE
+ )

From de225a507574882956dfa48f3edf9fdcbd0088cf Mon Sep 17 00:00:00 2001
From: Vasily Shamporov
Date: Fri, 22 Mar 2024 20:29:53 +0100
Subject: [PATCH 26/27] Fix workflow indents

---
 .../llama_cpp_plugin_build_and_test.yml | 70 +++++++++----------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml
index 4f9ecb4d9..50ec92dc1 100644
--- a/.github/workflows/llama_cpp_plugin_build_and_test.yml
+++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml
@@ -1,38 +1,38 @@
name: llama_cpp_plugin_build_and_test
on:
- pull_request:
- types:
- - opened
+ pull_request:
+ types:
+ - opened
- reopened
- synchronize
paths:
- - 'modules/llama_cpp_plugin/**'
+ - 'modules/llama_cpp_plugin/**'
jobs:
- build_ubuntu20:
- runs-on: ubuntu-20.04
+ build_ubuntu20:
+ runs-on: ubuntu-20.04
steps:
- - name: Setup cmake
- uses: jwlawson/actions-setup-cmake@v1.14
- with:
- cmake-version: '3.24.x'
+ - name: Setup cmake
+ uses: jwlawson/actions-setup-cmake@v1.14
+ with:
+ cmake-version: '3.24.x'
- name: Checkout openvino_contrib
uses: actions/checkout@v3
with:
- submodules: recursive
- path: openvino_contrib
+ submodules: recursive
+ path: openvino_contrib
- name: Checkout openvino
uses: actions/checkout@v3
with:
- submodules: recursive
- repository: openvinotoolkit/openvino
- path: openvino
+ submodules: recursive
+ repository: openvinotoolkit/openvino
+ path: openvino
- name: CMake - configure
- run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules/llama_cpp_plugin -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON -DENABLE_PLUGIN_REGISTRATION=ON openvino
+ run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DOPENVINO_EXTRA_MODULES=${{ github.workspace }}/openvino_contrib/modules/llama_cpp_plugin -DENABLE_TESTS=ON -DENABLE_FUNCTIONAL_TESTS=ON -DENABLE_PLUGINS_XML=ON -DENABLE_LLAMA_CPP_PLUGIN_REGISTRATION=ON openvino
- name: CMake - build
run: cmake --build build -j`nproc` -- llama_cpp_plugin llama_cpp_e2e_tests
@@ -41,40 +41,40 @@ jobs:
- name: Upload build artifacts
uses: actions/upload-artifact@v4
with:
- name: build_artifacts
- path: ${{ github.workspace }}/openvino/bin/intel64/Release/
+ name: build_artifacts
+ path: ${{ github.workspace }}/openvino/bin/intel64/Release/
-test_ubuntu20:
+ test_ubuntu20:
needs: build_ubuntu20
runs-on: ubuntu-20.04
steps:
- - name: Download build artifacts
- uses: actions/download-artifact@v4
- with:
- name: build_artifacts
+ - name: Download build artifacts
+ uses: actions/download-artifact@v4
+ with:
+ name: build_artifacts
path: ${{ github.workspace }}/binaries
- name: Prepare test data - checkout llama.cpp repo
uses: actions/checkout@v3
with:
- repository: ggerganov/llama.cpp
- path: llama.cpp
+ repository: ggerganov/llama.cpp
+ path: llama.cpp
- name: Prepare test data - convert test model files
run: |
- pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt
- huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2
- mkdir -p ${{ github.workspace }}/test_data
- python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf
+ pip install -r llama.cpp/requirements/requirements-convert-hf-to-gguf.txt
+ huggingface-cli download gpt2 model.safetensors tokenizer.json tokenizer_config.json vocab.json config.json merges.txt --local-dir hf_gpt2
+ mkdir -p ${{ github.workspace }}/test_data
+ python3 llama.cpp/convert-hf-to-gguf.py hf_gpt2 --outtype f32 --outfile ${{ github.workspace }}/test_data/gpt2.gguf
- name: Install libtbb2
run: |
- wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz
- mkdir -p tbb
- tar xvzf oneapi-tbb-2021.2.4-lin.tgz
+ wget https://storage.openvinotoolkit.org/dependencies/thirdparty/linux/oneapi-tbb-2021.2.4-lin.tgz
+ mkdir -p tbb
+ tar xvzf oneapi-tbb-2021.2.4-lin.tgz
- name: Run E2E tests
run: |
- chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
- export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib
- ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
+ chmod +x ${{ github.workspace }}/binaries/llama_cpp_e2e_tests
+ export LD_LIBRARY_PATH=${{ github.workspace }}/binaries:${{ github.workspace }}/tbb/lib
+ ${{ github.workspace }}/binaries/llama_cpp_e2e_tests

From aef994892ef3c37b427225818d5188f3a9503825 Mon Sep 17 00:00:00 2001
From: Vasily Shamporov
Date: Mon, 25 Mar 2024 17:27:25 +0100
Subject: [PATCH 27/27] Improve workflow

---
 .../workflows/llama_cpp_plugin_build_and_test.yml | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/llama_cpp_plugin_build_and_test.yml b/.github/workflows/llama_cpp_plugin_build_and_test.yml
index 50ec92dc1..4d0af3bdf 100644
--- a/.github/workflows/llama_cpp_plugin_build_and_test.yml
+++ b/.github/workflows/llama_cpp_plugin_build_and_test.yml
@@ -2,16 +2,12 @@ name: llama_cpp_plugin_build_and_test
on:
pull_request:
- types:
- - opened
- - reopened
- - synchronize
paths:
- 'modules/llama_cpp_plugin/**'
jobs:
build_ubuntu20:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-20.04-8-cores
steps:
- name: Setup cmake
uses: jwlawson/actions-setup-cmake@v1.14
@@ -19,13 +15,13 @@ jobs:
cmake-version: '3.24.x'
- name: Checkout openvino_contrib
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
submodules: recursive
path: openvino_contrib
- name: Checkout openvino
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
submodules: recursive
repository: openvinotoolkit/openvino
path: openvino
@@ -55,7 +51,7 @@ jobs:
path: ${{ github.workspace }}/binaries
- name: Prepare test data - checkout llama.cpp repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
repository: ggerganov/llama.cpp
path: llama.cpp
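
For reference, a minimal usage sketch of the GGUF-only loading path that the series above converges on. This snippet is not part of the patches; the device name "LLAMA_CPP" and the model path are assumptions (the CUDA build registers a separate "LLAMA_CPP_CUDA" device), and it only lists the compiled model's ports rather than running generation.

// Minimal sketch, assuming the plugin is registered under the device name
// "LLAMA_CPP" and that /path/to/gpt2.gguf is a GGUF file such as the one
// produced by the e2e workflow above.
#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;

    // The std::string overload of compile_model() is the only supported entry
    // point after these patches; compiling an ov::Model throws NOT_IMPLEMENTED.
    ov::CompiledModel compiled = core.compile_model("/path/to/gpt2.gguf", "LLAMA_CPP");

    // Inspect the input ports the compiled model exposes instead of
    // hard-coding tensor names.
    for (const auto& input : compiled.inputs()) {
        std::cout << "input shape: " << input.get_partial_shape() << std::endl;
    }

    ov::InferRequest request = compiled.create_infer_request();
    return 0;
}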