@@ -28,7 +28,7 @@ void allocate_tensor_impl(ov::SoPtr<ov::ITensor>& tensor,
 LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr<const LlamaCppModel>& compiled_model,
                                                    size_t num_threads)
     : ov::ISyncInferRequest(compiled_model) {
-    OPENVINO_DEBUG << "llama_cpp_plugin: infer request ctor called\n";
+    OPENVINO_DEBUG("llama_cpp_plugin: infer request ctor called\n");
     llama_context_params cparams = llama_context_default_params();
     cparams.n_threads = num_threads ? num_threads : std::thread::hardware_concurrency();
     cparams.n_ctx = 0;  // this means that the actual n_ctx will be taken equal to the model's train-time value
@@ -51,7 +51,7 @@ LlamaCppSyncInferRequest::LlamaCppSyncInferRequest(const std::shared_ptr<const L
 }
 void LlamaCppSyncInferRequest::set_tensors_impl(const ov::Output<const ov::Node> port,
                                                 const std::vector<ov::SoPtr<ov::ITensor>>& tensors) {
-    OPENVINO_DEBUG << "llama_cpp_plugin: set_tensors_impl called\n";
+    OPENVINO_DEBUG("llama_cpp_plugin: set_tensors_impl called\n");
 }

 void llama_batch_add_reimpl(struct llama_batch& batch,
@@ -131,12 +131,12 @@ void LlamaCppSyncInferRequest::infer() {
     llama_batch_free(batch);
 };
 std::vector<ov::ProfilingInfo> LlamaCppSyncInferRequest::get_profiling_info() const {
-    OPENVINO_DEBUG << "llama_cpp_plugin: get_profiling_info() called\n";
+    OPENVINO_DEBUG("llama_cpp_plugin: get_profiling_info() called\n");
     return std::vector<ov::ProfilingInfo>{};
 };

 std::vector<ov::SoPtr<ov::IVariableState>> LlamaCppSyncInferRequest::query_state() const {
-    OPENVINO_DEBUG << "llama_cpp_plugin: query_state() called\n";
+    OPENVINO_DEBUG("llama_cpp_plugin: query_state() called\n");
     return {std::static_pointer_cast<ov::IVariableState>(std::make_shared<LlamaCppState>(m_llama_ctx))};
 }
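The hunks above all make the same mechanical change: the stream-style `OPENVINO_DEBUG << "..."` logging is replaced with the function-style `OPENVINO_DEBUG("...")` call. As a minimal, self-contained sketch of the new call shape, the snippet below uses a hypothetical stand-in definition for the macro so it compiles on its own; the real macro comes from OpenVINO's logging utilities, not from this definition.

```cpp
#include <iostream>

// Hypothetical stand-in for OpenVINO's OPENVINO_DEBUG macro, included only so
// this snippet is self-contained; the real macro is provided by OpenVINO's
// logging utilities and is called in the same function-style form.
#define OPENVINO_DEBUG(msg) (std::cerr << (msg))

int main() {
    // Function-style call, matching the "+" lines in the diff above.
    OPENVINO_DEBUG("llama_cpp_plugin: infer request ctor called\n");
    return 0;
}
```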