diff --git a/src/cpp/include/openvino/genai/llm_pipeline.hpp b/src/cpp/include/openvino/genai/llm_pipeline.hpp
index 00dda872f9..009f08138b 100644
--- a/src/cpp/include/openvino/genai/llm_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/llm_pipeline.hpp
@@ -274,6 +274,7 @@ class OPENVINO_GENAI_EXPORTS LLMPipeline {
      * Turns off keeping KV cache between generate calls.
      */
     void finish_chat();
+
 private:
     std::unique_ptr m_pimpl;
 };
diff --git a/src/cpp/include/openvino/genai/lora_adapter.hpp b/src/cpp/include/openvino/genai/lora_adapter.hpp
index 754553fc43..b7bfcf01d1 100644
--- a/src/cpp/include/openvino/genai/lora_adapter.hpp
+++ b/src/cpp/include/openvino/genai/lora_adapter.hpp
@@ -37,6 +37,7 @@ class OPENVINO_GENAI_EXPORTS Adapter {
     Adapter(const std::shared_ptr& pimpl);
 public:
     explicit Adapter(const std::filesystem::path& path);
+    explicit Adapter(const ov::Tensor& safetensor);
     Adapter() = default;
 
     operator bool() const {
diff --git a/src/cpp/src/icontinuous_batching.hpp b/src/cpp/src/icontinuous_batching.hpp
index a1700c9c31..229bd2ae8c 100644
--- a/src/cpp/src/icontinuous_batching.hpp
+++ b/src/cpp/src/icontinuous_batching.hpp
@@ -134,4 +134,4 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline {
      */
     void finish_chat();
 };
-}
\ No newline at end of file
+}
diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp
index 2af32642cc..ecd2f1b258 100644
--- a/src/cpp/src/llm_pipeline_static.cpp
+++ b/src/cpp/src/llm_pipeline_static.cpp
@@ -346,7 +346,6 @@ void StatefulLLMPipeline::finish_chat() {
     m_history.clear();
 };
 
-
 std::unique_ptr LLMPipelineFactory::create(const std::filesystem::path& models_path,
                                            const ov::AnyMap& config) {
diff --git a/src/cpp/src/lora_adapter.cpp b/src/cpp/src/lora_adapter.cpp
index db0f6fd0ca..1e4bfb711f 100644
--- a/src/cpp/src/lora_adapter.cpp
+++ b/src/cpp/src/lora_adapter.cpp
@@ -60,8 +60,6 @@ using namespace ov::op;
 using namespace ov::genai::utils;
 
 // FIXME: Use ov::AlignedBuffer instead of std::vector. ov::AlignedBuffer is not available in public OV API
-using Buffer = std::vector;
-using BufferPtr = std::shared_ptr;
 
 using ConstantVector = std::vector>;
@@ -69,25 +67,6 @@ using ConstantVector = std::vector>;
 using LoRANode = LoRAParts>;
 using LoRAPartsParser = LoRAParts(const std::string& name)>>;
 
-
-// Read binary file to memory.
-BufferPtr read_file_helper(const std::filesystem::path& filename) {
-    std::ifstream file(filename, std::ios::binary | std::ios::ate);
-    OPENVINO_ASSERT(file.is_open(), "Cannot open file with LoRA weights: ", filename);
-
-    size_t filesize = file.tellg();
-    auto buffer = std::make_shared(filesize);
-    file.seekg(0, std::ios::beg);
-    // TODO: Use mmapped AlignedBuffer as ov::Core::read_model can do, necessary functionality is not available in public OV API.
-    // LoRA files do not usually have huge size in comparison to the base models, but it can vary depending on adapter,
-    // and using mmap will help to optimize memory consumption and could be critical
-    // when the application at the edge of available memory that is not really uncommon for applications dealing with LLMs.
-    file.read(&(*buffer)[0], filesize);
-
-    return buffer;
-}
-
-
 // Converts Safetensors element type to OV element type. Only part of the types are supported.
 ov::element::Type safetensors_to_ov_element_type (int dtype) {
     switch(dtype) {
@@ -102,10 +81,8 @@ ov::element::Type safetensors_to_ov_element_type (int dtype) {
     }
 }
 
-
 using ConstantMap = std::map>;
 
-
 // Safetensor file parser that deallocates temporary buffers automatically.
 // Drop-in replacement for the third party safetensors_File struct.
 struct AutoSafetensor: public safetensors_File {
@@ -115,19 +92,14 @@ struct AutoSafetensor: public safetensors_File {
     }
 };
 
-
-// Reads a file with a given filename expecting Safetensors file format.
-// The data is read to a solid memory block and the function returns a map of OV Constants allocated on top of that block.
 // The key in the map is a tensor name and the Constant uses a region of memory from the memory block.
 // Each Constant holds a shared pointer to the block in the runtime info.
 // The memory block will be deallocated when the last Constant is destroyed.
-ConstantMap read_safetensors(const std::filesystem::path& filename) {
-    auto buffer = read_file_helper(filename);
+ConstantMap safetensor_to_constant_map(const ov::Tensor& safetensor) {
     AutoSafetensor safe_tensors_file{};
-    OPENVINO_ASSERT(
-        safetensors_file_init(&(*buffer)[0], buffer->size(), &safe_tensors_file) == nullptr,
-        "Cannot parse ", filename, " as a Safetensors file format. Safetensors file format is supported only"
+    OPENVINO_ASSERT(safetensors_file_init(safetensor.data(), safetensor.get_byte_size(), &safe_tensors_file) == nullptr,
+        "Cannot parse safetensor as a Safetensors file format. Safetensors file format is supported only"
     );
 
     ConstantMap tensors;
@@ -137,20 +109,22 @@ ConstantMap read_safetensors(const std::filesystem::path& filename) {
         ov::Shape shape(tensor.shape, tensor.shape + tensor.n_dimensions);
         void* ptr = tensor.ptr;   // FIXME: needs a non-constant pointer because Tensor doesn't accept a constant pointer
 
-        OPENVINO_ASSERT(
-            ov::shape_size(shape) <= tensor.end_offset_bytes - tensor.begin_offset_bytes,
-            "Tensor shape ", ov::shape_size(shape), " for tensor \"", name, "\" from Safetensors file \"", filename, "\" doesn't match the expected tensor size ",
-            tensor.end_offset_bytes - tensor.begin_offset_bytes);
-
         auto type = safetensors_to_ov_element_type(tensor.dtype);
         auto constant = std::make_shared(type, shape, ptr, nullptr);   // wraps existing memory, no ownership
-        constant->get_rt_info()["__safetensors_buffer_holder"] = buffer;    // to automatically deallocate underlying memory buffer when last constant that holds it is destroyed
+        constant->get_rt_info()["__safetensors_buffer_holder"] = safetensor;    // to automatically deallocate underlying memory buffer when last constant that holds it is destroyed
         tensors[name] = constant;
     }
     return tensors;
 }
 
+// Reads a file with a given filename expecting Safetensors file format.
+// The file data is mmapped to the tensor.
+ConstantMap read_safetensors(const std::filesystem::path& filename) {
+    auto safetensor = ov::read_tensor_data(filename);
+
+    return safetensor_to_constant_map(safetensor);
+}
 
 // Default LoRA tensor name patterns observed in the existing LoRA adapters, captures the prefix that should correspond to a layer name in the base model
 LoRAPartsParser default_lora_patterns () {
@@ -847,6 +821,9 @@ class SafetensorsAdapterImpl : public AdapterImpl {
     SafetensorsAdapterImpl(const std::filesystem::path& path) :
         tensors(group_lora_tensors(read_safetensors(path), default_lora_patterns())) {}
 
+    SafetensorsAdapterImpl(const ov::Tensor& safetensor)
+        : tensors(group_lora_tensors(safetensor_to_constant_map(safetensor), default_lora_patterns())) {}
+
     const LoRATensors& get_tensors() const override {
         return tensors;
     }
@@ -923,6 +900,10 @@ Adapter::Adapter(const std::filesystem::path& path) :
 }
 
+Adapter::Adapter(const ov::Tensor& safetensor) :
+    m_pimpl(std::make_shared(safetensor)) {
+}
+
 bool operator== (const Adapter& a, const Adapter& b) {
     return a.m_pimpl->eq(b.m_pimpl.get());
 }
@@ -1382,7 +1363,6 @@ void AdapterController::apply(ov::InferRequest request, const std::optionalhas_state_name(name);
 }
diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp
index 0535931d81..1393cecc13 100644
--- a/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp
+++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp
@@ -49,4 +49,4 @@ class ContinuousBatchingPipeline::PromptLookupImpl : public ContinuousBatchingPi
     SpeculativeDecodingMetrics get_metrics();
 };
-}
\ No newline at end of file
+}
diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
index 4023519287..56626dae48 100644
--- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
+++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp
@@ -71,4 +71,4 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat
     SpeculativeDecodingMetrics get_speculative_decoding_metrics();
 };
-}
\ No newline at end of file
+}
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 0ed1f12caa..42528cf03e 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -21,6 +21,12 @@ class Adapter:
         Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier.
         path (os.PathLike): Path to adapter file in safetensors format.
         """
+    @typing.overload
+    def __init__(self, safetensor: openvino._pyopenvino.Tensor) -> None:
+        """
+        Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier.
+        safetensor (ov.Tensor): Pre-read LoRA Adapter safetensor.
+        """
 class AdapterConfig:
     """
     Adapter config that defines a combination of LoRA adapters with blending parameters.
diff --git a/src/python/py_lora_adapter.cpp b/src/python/py_lora_adapter.cpp
index 54ea6cf0b9..cc795bd1cc 100644
--- a/src/python/py_lora_adapter.cpp
+++ b/src/python/py_lora_adapter.cpp
@@ -25,6 +25,16 @@ void init_lora_adapter(py::module_& m) {
     Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier.
     path (os.PathLike): Path to adapter file in safetensors format.
)") + .def(py::init([]( + const ov::Tensor& safetensor + ) { + return ov::genai::Adapter(safetensor); + }), + py::arg("safetensor"), "ov::Tensor with pre-read LoRA Adapter safetensor", + R"( + Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier. + safetensor (ov.Tensor): Pre-read LoRA Adapter safetensor. + )") .def( "__bool__", [](ov::genai::Adapter& self