
Commit 6b2a339

nkogteva and ilyachur authored
[NVIDIA] API 2.0 (#656)
* [NVIDIA] Plugin API 2.0 Update README.md
* Import model with extensions
* Small updates
* Adds runtime info into the runtime graph
* Support config per device
* Fix bug with AUTO streams
* Rework extension handling
* Win fixes
* Remove explicit extension adding
* Add workaround for f16 inference precision. Enable inference precision in CoreConfiguration in tests
* Fix license note
* Small property update
* Disable RemoveDuplicatedResultsTransformation + small fixes
* Fix type
* Fix after rebase
* Refactor convert related handling
* Apply review comments. Replace some legacy includes
* Update RemoveRedundantConvertTransformation
* Fix transformer, add test for f16 convertion
* Small update
* Update modules/nvidia_plugin/src/cuda_compiled_model.cpp (Co-authored-by: Ilya Churaev <ilyachur@gmail.com>)
* Update modules/nvidia_plugin/src/cuda_compiled_model.cpp (Co-authored-by: Ilya Churaev <ilyachur@gmail.com>)
* Review comments. Remove some redundant headers

Co-authored-by: Ilya Churaev <ilyachur@gmail.com>
1 parent 2aa2b0d commit 6b2a339

File tree

230 files changed, +2128 −2915 lines changed


modules/nvidia_plugin/README.md

+16 −33

@@ -34,8 +34,8 @@ sudo apt-get install clang-8 clang++8
 
 2. Install suitable **NVIDIA driver** from [NVIDIA download drivers](http://www.nvidia.com/Download/index.aspx?lang=en-us)
 3. Install **CUDA 11.8** from [How to install CUDA](https://docs.nvidia.com/cuda/cuda-quick-start-guide/index.html)
-
-   Do not forget to add `<path_to_cuda>/bin/` in **PATH** variable for example `export PATH="<path_to_cuda>/bin:$PATH"`
+
+   Do not forget to add `<path_to_cuda>/bin/` in **PATH** variable for example `export PATH="<path_to_cuda>/bin:$PATH"`
 
 4. Install **cuDNN 8.6.0** from [How to install cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html)
 5. Install **cuTENSOR 1.6.1** from [How to install cuTENSOR](https://docs.nvidia.com/cuda/cutensor/getting_started.html#installation-and-compilation)

@@ -164,12 +164,21 @@ docker commit openvino/cudaplugin-2022.3 <name of new image>
 ```
 
 ## Supported Configuration Parameters
-The plugin supports the configuration parameters listed below. All parameters must be set before calling `ov::Core::compile_model()` in order to take effect. When specifying key values as raw strings (that is, when using Python API), omit the `KEY_` prefix.
+The plugin supports the configuration parameters listed below:
+* `ov::hint::performance_mode`
+* `ov::hint::execution_mode`
+* `ov::hint::inference_precision`
+* `ov::num_streams`
+* `ov::enable_profiling`
+
+Please refer to OpenVINO documentation for details.
 
-Parameter name | Parameter values | Default | Description
-------------- | ------------- | ------------- | -------------
-`NVIDIA_THROUGHPUT_STREAMS` | `NVIDIA_THROUGHPUT_AUTO`, or non negative integer values | 1 | Specifies number of CPU "execution" streams for the throughput mode. Upper bound for the number of inference requests that can be executed simultaneously.
-`NVIDIA_OPERATION_BENCHMARK` | `NVIDIA_YES`, `NVIDIA_NO` | `NVIDIA_NO` | Specifies if operation level benchmark should be run for increasing performance of network
+### Plugin specific parameters
+* `ov::nvidia_gpu::operation_benchmark` - specifies if operation level benchmark should be run for increasing performance of network (`false` by default)
+
+All parameters must be set before calling `ov::Core::compile_model()` in order to take effect.
+
+## Compile options
 
 During compilation of the openvino_nvidia_gpu_plugin, user could specify the following options:
 1) `-DCUDA_KERNEL_PRINT_LOG=ON` enables print logs from kernels (WARNING, be careful with this options, could print to many logs)

@@ -182,32 +191,6 @@ nvidia-smi --query-gpu=compute_cap --format=csv
 ## Supported Layers and Limitations
 The plugin supports IRv10 and higher. The list of supported layers and its limitations are defined in [cuda_opset.md](docs/cuda_opset.md).
 
-## Supported Model Formats
-* FP32 – Supported
-* FP16 – Supported and preferred
-* U8 - Not supported
-* U16 - Not supported
-* I8 - Not supported
-* I16 - Not supported
-
-## Supported Input Precision
-* FP32 - Supported
-* FP16 - Supported
-* U8 - Not supported
-* U16 - Not supported
-* I8 - Not supported
-* I16 - Not supported
-
-## Supported Output Precision
-* FP32 – Supported
-* FP16 - Not supported
-
-## Supported Input Layout
-* NCDHW – Not supported
-* NCHW - Supported
-* NHWC - Supported
-* NC - Supported
-
 ## License
 OpenVINO™ NVIDIA GPU plugin is licensed under [Apache License Version 2.0](LICENSE).
 By contributing to the project, you agree to the license and copyright terms therein
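As context for the property list in the updated README above, here is a minimal usage sketch (not part of the diff) of how these properties are passed with the OpenVINO 2.0 API; the model path and the chosen values are illustrative only:

```cpp
// Hypothetical usage sketch for the NVIDIA plugin with the OpenVINO 2.0 API.
// Property names come from the README above; the model path and values are examples only.
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // illustrative path

    // All properties must be passed at compile_model() time to take effect.
    // The plugin-specific ov::nvidia_gpu::operation_benchmark flag from the README
    // could be passed the same way.
    auto compiled = core.compile_model(model,
                                       "NVIDIA",
                                       ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT),
                                       ov::hint::inference_precision(ov::element::f16),
                                       ov::num_streams(2),
                                       ov::enable_profiling(true));

    auto request = compiled.create_infer_request();
    request.infer();
    return 0;
}
```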

modules/nvidia_plugin/src/CMakeLists.txt

+3 −3

@@ -14,7 +14,7 @@ file(GLOB_RECURSE
 CONFIGURE_DEPENDS
 ${SOURCE_MASKS}
 )
-list(REMOVE_ITEM SOURCES cuda_create_plugin.cpp)
+list(REMOVE_ITEM SOURCES cuda_create_plugin.cpp cuda_create_extensions.cpp)
 list(FILTER SOURCES EXCLUDE REGEX "^ops/examples/.*$")
 file(GLOB_RECURSE
 HEADERS

@@ -25,12 +25,12 @@ file(GLOB_RECURSE
 set_source_files_properties(*.cu *.cuh PROPERTIES LANGUAGE CUDA)
 
 add_library(${OBJ_NAME} STATIC ${SOURCES})
-target_compile_definitions(${OBJ_NAME} PRIVATE IMPLEMENT_INFERENCE_ENGINE_PLUGIN)
+target_compile_definitions(${OBJ_NAME} PRIVATE IMPLEMENT_INFERENCE_ENGINE_PLUGIN IMPLEMENT_OPENVINO_EXTENSION_API)
 
 # Adds a shared library with plugin
 ie_add_plugin(NAME ${TARGET_NAME}
 DEVICE_NAME "NVIDIA"
-SOURCES ${HEADERS} cuda_create_plugin.cpp
+SOURCES ${HEADERS} cuda_create_plugin.cpp cuda_create_extensions.cpp
 SKIP_INSTALL # ATTENTION: uncomment to install component
 VERSION_DEFINES_FOR cuda_create_plugin.cpp)
 

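The new cuda_create_extensions.cpp translation unit referenced above is not shown in this commit view. For orientation only, an OpenVINO extension entry point built with IMPLEMENT_OPENVINO_EXTENSION_API typically looks like the hypothetical sketch below; the registered extension here is a placeholder, not the plugin's actual set:

```cpp
// Hypothetical sketch of an OpenVINO extension entry point; NOT the actual
// contents of the cuda_create_extensions.cpp added by this commit.
#include <vector>

#include "openvino/core/extension.hpp"
#include "openvino/core/op_extension.hpp"
#include "openvino/op/relu.hpp"  // placeholder op just to make the example self-contained

// OPENVINO_CREATE_EXTENSIONS defines the symbol the core loads from the shared
// library when IMPLEMENT_OPENVINO_EXTENSION_API is defined for the target.
OPENVINO_CREATE_EXTENSIONS(std::vector<ov::Extension::Ptr>({
    std::make_shared<ov::OpExtension<ov::op::v0::Relu>>(),  // placeholder entry
}));
```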
modules/nvidia_plugin/src/cancellation_token.hpp

+4 −14

@@ -26,23 +26,13 @@ class CancellationToken {
     /**
      * Set token status as cancelled
      */
-    void Cancel() { is_cancelled_.store(true, std::memory_order_release); }
-
-    /**
-     * Throws exception THROW_IE_EXCEPTION_WITH_STATUS(INFER_CANCELLED) if detected cancel status
-     */
-    void Check() {
-        if (is_cancelled_.load(std::memory_order_acquire)) {
-            is_cancelled_.store(false, std::memory_order_release);
-            if (cancel_callback_) {
-                cancel_callback_();
-            }
-            throwInferCancelled();
-        }
+    void cancel() {
+        if (cancel_callback_) {
+            cancel_callback_();
+        };
     }
 
 private:
-    std::atomic<bool> is_cancelled_{false};
     std::function<void()> cancel_callback_;
 };
 

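For orientation (not part of the commit), a minimal standalone sketch of the reworked callback-only token: cancel() now simply fires the registered callback, and detecting and raising the "infer cancelled" condition is left to the request pipeline. Names here are illustrative:

```cpp
// Standalone illustration of a callback-only cancellation token, mirroring the
// reworked class above; names and wiring are illustrative, not the plugin's.
#include <functional>
#include <iostream>

class SimpleCancellationToken {
public:
    explicit SimpleCancellationToken(std::function<void()> cancel_callback = nullptr)
        : cancel_callback_{std::move(cancel_callback)} {}

    // Fires the callback; no atomic "cancelled" flag is kept any more.
    void cancel() {
        if (cancel_callback_) {
            cancel_callback_();
        }
    }

private:
    std::function<void()> cancel_callback_;
};

int main() {
    SimpleCancellationToken token{[] { std::cout << "pipeline notified of cancellation\n"; }};
    token.cancel();
}
```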
modules/nvidia_plugin/src/cuda/blas.hpp

+1 −1

@@ -39,7 +39,7 @@ inline std::string cublasGetErrorString(cublasStatus_t status) {
 inline void throwIfError(
     cublasStatus_t err,
     const std::experimental::source_location& location = std::experimental::source_location::current()) {
-    if (err != CUBLAS_STATUS_SUCCESS) ov::nvidia_gpu::throwIEException(cublasGetErrorString(err), location);
+    if (err != CUBLAS_STATUS_SUCCESS) ov::nvidia_gpu::throw_ov_exception(cublasGetErrorString(err), location);
 }
 
 inline void logIfError(

modules/nvidia_plugin/src/cuda/constant_factory.hpp

+1 −1

@@ -154,7 +154,7 @@ inline const constants::AnyNumeric& NumericConst(cudaDataType_t computeType) {
             return C<std::uint32_t>::value;
         }
         default:
-            ov::nvidia_gpu::throwIEException(
+            ov::nvidia_gpu::throw_ov_exception(
                 fmt::format("The ngraph element type {} is not supported by "
                             "the cuda library",
                             computeType));

modules/nvidia_plugin/src/cuda/cuda_type_traits.hpp

+2 −1

@@ -4,7 +4,8 @@
 
 #pragma once
 
-#include <ngraph/type/element_type_traits.hpp>
+#include "openvino/core/type/element_type.hpp"
+
 #ifdef __CUDACC__
 #include <cuda/float16.hpp>
 #endif

modules/nvidia_plugin/src/cuda/descriptor_utils.cpp

+1 −1

@@ -9,7 +9,7 @@
 
 namespace CUDA {
 
-DnnTensorDescriptor makeDnnTensorDescr(const ngraph::element::Type& type, const ngraph::Shape& shape) {
+DnnTensorDescriptor makeDnnTensorDescr(const ov::element::Type& type, const ov::Shape& shape) {
     OPENVINO_ASSERT(!shape.empty());
     OPENVINO_ASSERT(shape.size() <= CUDNN_DIM_MAX);
     std::vector<int> dims;

modules/nvidia_plugin/src/cuda/descriptor_utils.hpp

+3 −1

@@ -3,7 +3,9 @@
 //
 
 #include <cuda/dnn.hpp>
-#include <ngraph/node.hpp>
+
+#include "openvino/core/node.hpp"
+#include "openvino/core/type/element_type.hpp"
 
 namespace CUDA {
 

modules/nvidia_plugin/src/cuda/dnn.hpp

+1 −2

@@ -7,7 +7,6 @@
 #include <cudnn.h>
 
 #include <functional>
-#include <ngraph/type/element_type.hpp>
 #include <optional>
 
 #include "constant_factory.hpp"

@@ -39,7 +38,7 @@ inline std::string cudnnGetErrorString(cudnnConvolutionFwdAlgo_t algo) {
 inline void throwIfError(
     cudnnStatus_t err,
     const std::experimental::source_location& location = std::experimental::source_location::current()) {
-    if (err != CUDNN_STATUS_SUCCESS) ov::nvidia_gpu::throwIEException(cudnnGetErrorString(err), location);
+    if (err != CUDNN_STATUS_SUCCESS) ov::nvidia_gpu::throw_ov_exception(cudnnGetErrorString(err), location);
 }
 
 inline void logIfError(

modules/nvidia_plugin/src/cuda/dnn_be.hpp

+1 −1

@@ -517,7 +517,7 @@ class DnnBEEngineConfigDescriptor : public DnnBackendDescriptor {
 
     DnnBEEngine getEngine() const {
         auto engines = getBEDescAttributeValues<CUDNN_ATTR_ENGINECFG_ENGINE, DnnBEEngine>();
-        if (engines.size() != 1) ov::nvidia_gpu::throwIEException("Unexpected number of cuDNN Backend engines");
+        if (engines.size() != 1) ov::nvidia_gpu::throw_ov_exception("Unexpected number of cuDNN Backend engines");
         return std::move(*engines[0]);
     }

modules/nvidia_plugin/src/cuda/graph.cpp

+2 −4

@@ -3,7 +3,7 @@
 //
 
 #include "graph.hpp"
-#include <ie_common.h>
+#include "openvino/core/except.hpp"
 #include <fmt/format.h>
 
 namespace CUDA {

@@ -27,7 +27,6 @@ cudaGraph_t Graph::createNativeWithFlags(unsigned int flags) {
     return g;
 }
 
-// clang-format off
 GraphExec::GraphExec(const Graph &g)
 #if !defined(NDEBUG) || defined(_DEBUG)
 try

@@ -43,10 +42,9 @@ Handle(cudaGraphInstantiate, cudaGraphExecDestroy, g.get(), static_cast<cudaGrap
 }
 #if !defined(NDEBUG) || defined(_DEBUG)
 catch (std::exception &e) {
-    throw InferenceEngine::GeneralError { fmt::format("{}: {}", e.what(), errorMsg_) };
+    OPENVINO_THROW(e.what(), ": ", errorMsg_);
 }
 #endif
-// clang-format on
 
 cudaGraphExecUpdateResult GraphExec::update(const Graph &g) {
     cudaGraphExecUpdateResult res;

modules/nvidia_plugin/src/cuda/runtime.hpp

+2 −2

@@ -16,7 +16,7 @@
 inline void throwIfError(
     cudaError_t err,
     const std::experimental::source_location& location = std::experimental::source_location::current()) {
-    if (err != cudaSuccess) ov::nvidia_gpu::throwIEException(cudaGetErrorString(err), location);
+    if (err != cudaSuccess) ov::nvidia_gpu::throw_ov_exception(cudaGetErrorString(err), location);
 }
 
 inline void logIfError(

@@ -116,7 +116,7 @@ inline int residentGrids(const cudaDeviceProp& p) {
     return defaultResidentGrids;
 }
 
-inline int maxConcurrentStreams(CUDA::Device d) {
+inline int max_concurrent_streams(CUDA::Device d) {
     auto p = d.props();
     int r = p.asyncEngineCount;
     if (!p.concurrentKernels) return r + 1;

modules/nvidia_plugin/src/cuda/tensor.hpp

+1 −1

@@ -11,7 +11,7 @@
 inline void throwIfError(
     cutensorStatus_t err,
     const std::experimental::source_location& location = std::experimental::source_location::current()) {
-    if (err != CUTENSOR_STATUS_SUCCESS) ov::nvidia_gpu::throwIEException(cutensorGetErrorString(err), location);
+    if (err != CUTENSOR_STATUS_SUCCESS) ov::nvidia_gpu::throw_ov_exception(cutensorGetErrorString(err), location);
 }
 
 inline void logIfError(
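Across blas.hpp, dnn.hpp, runtime.hpp, and tensor.hpp the commit only renames the throwing helper from throwIEException to throw_ov_exception. As a rough, self-contained analogue of that pattern (not the plugin's actual helper, which also carries a source_location argument), reduced here to the CUDA runtime case with a plain OPENVINO_THROW:

```cpp
// Simplified analogue of the throwIfError() helpers touched by this commit:
// check a CUDA status code and raise an OpenVINO exception instead of a legacy
// InferenceEngine one. A sketch only, not the plugin's actual helper.
#include <cuda_runtime.h>

#include "openvino/core/except.hpp"

inline void throw_if_cuda_error(cudaError_t err) {
    if (err != cudaSuccess) {
        OPENVINO_THROW("CUDA error: ", cudaGetErrorString(err));
    }
}

int main() {
    void* ptr = nullptr;
    throw_if_cuda_error(cudaMalloc(&ptr, 1024));  // throws ov::Exception on failure
    throw_if_cuda_error(cudaFree(ptr));
    return 0;
}
```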
modules/nvidia_plugin/src/cuda_async_infer_request.cpp

@@ -1,62 +1,63 @@
-// Copyright (C) 2018-2021 Intel Corporation
+// Copyright (C) 2018-2023 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
-
 #include "cuda_async_infer_request.hpp"
-
-#include <threading/ie_cpu_streams_executor.hpp>
-
-#include "cuda_executable_network.hpp"
 #include "cuda_itt.hpp"
 #include "cuda_thread_pool.hpp"
 
 namespace ov {
 namespace nvidia_gpu {
 
-CudaAsyncInferRequest::CudaAsyncInferRequest(const CudaInferRequest::Ptr& inferRequest,
-                                             const InferenceEngine::ITaskExecutor::Ptr& cpuTaskExecutor,
-                                             const InferenceEngine::ITaskExecutor::Ptr& waitExecutor,
-                                             const InferenceEngine::ITaskExecutor::Ptr& callbackExecutor)
-    : AsyncInferRequestThreadSafeDefault(inferRequest, cpuTaskExecutor, callbackExecutor), _inferRequest(inferRequest) {
+CudaAsyncInferRequest::CudaAsyncInferRequest(const CudaInferRequest::Ptr& request,
+                                             const std::shared_ptr<ov::threading::ITaskExecutor>& task_executor,
+                                             const std::shared_ptr<ov::threading::ITaskExecutor>& wait_executor,
+                                             const std::shared_ptr<ov::threading::ITaskExecutor>& callback_executor)
+    : ov::IAsyncInferRequest(request, task_executor, callback_executor),
+      request_(request) {
     // In current implementation we have CPU only tasks and no needs in 2 executors
     // So, by default single stage pipeline is created.
    // This stage executes InferRequest::Infer() using cpuTaskExecutor.
    // But if remote asynchronous device is used the pipeline can by splitted tasks that are executed by cpuTaskExecutor
    // and waiting tasks. Waiting tasks can lock execution thread so they use separate threads from other executor.
     constexpr const auto remoteDevice = true;
 
-    auto cudaThreadPool = std::dynamic_pointer_cast<CudaThreadPool>(waitExecutor);
+    auto cuda_thread_pool = std::dynamic_pointer_cast<CudaThreadPool>(wait_executor);
     if (remoteDevice) {
-        _pipeline = {{cpuTaskExecutor,
+        m_pipeline = {{task_executor,
                       [this] {
-                          OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, "CudaAsyncInferRequest::Preprocessing");
-                          _inferRequest->inferPreprocess();
+                          OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, "CudaAsyncInferRequest::infer_preprocess");
+                          request_->infer_preprocess();
                       }},
-                     {waitExecutor,
-                      [this, cudaThreadPool] {
-                          auto& threadContext = cudaThreadPool->GetThreadContext();
+                     {wait_executor,
+                      [this, cuda_thread_pool] {
+                          auto& threadContext = cuda_thread_pool->get_thread_context();
                           {
-                              OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, "CudaAsyncInferRequest::StartPipeline");
-                              _inferRequest->startPipeline(threadContext);
+                              OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, "CudaAsyncInferRequest::start_pipeline");
+                              request_->start_pipeline(threadContext);
                           }
                           {
-                              OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, "CudaAsyncInferRequest::WaitPipeline");
-                              _inferRequest->waitPipeline(threadContext);
+                              OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, "CudaAsyncInferRequest::wait_pipeline");
+                              request_->wait_pipeline(threadContext);
                           }
                       }},
-                     {cpuTaskExecutor, [this] {
-                          OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, "CudaAsyncInferRequest::Postprocessing");
-                          _inferRequest->inferPostprocess();
+                     {task_executor, [this] {
+                          OV_ITT_SCOPED_TASK(itt::domains::nvidia_gpu, "CudaAsyncInferRequest::infer_postprocess");
+                          request_->infer_postprocess();
                       }}};
     }
 }
 
-void CudaAsyncInferRequest::Cancel() {
-    InferenceEngine::AsyncInferRequestThreadSafeDefault::Cancel();
-    _inferRequest->Cancel();
+CudaAsyncInferRequest::~CudaAsyncInferRequest() {
+    ov::IAsyncInferRequest::stop_and_wait();
 }
 
-void CudaAsyncInferRequest::Infer_ThreadUnsafe() { StartAsync_ThreadUnsafe(); }
+void CudaAsyncInferRequest::cancel() {
+    ov::IAsyncInferRequest::cancel();
+    request_->cancel();
+}
 
+void CudaAsyncInferRequest::infer_thread_unsafe() {
+    start_async_thread_unsafe();
+}
 } // namespace nvidia_gpu
 } // namespace ov

modules/nvidia_plugin/src/cuda_async_infer_request.hpp

+13 −17

@@ -1,34 +1,30 @@
-// Copyright (C) 2018-2021 Intel Corporation
+// Copyright (C) 2018-2023 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
 #pragma once
 
-#include <cpp_interfaces/impl/ie_infer_async_request_thread_safe_default.hpp>
+#include "openvino/runtime/iasync_infer_request.hpp"
+#include "openvino/runtime/iinfer_request.hpp"
 
 #include "cuda_infer_request.hpp"
 
 namespace ov {
 namespace nvidia_gpu {
 
-class CudaAsyncInferRequest : public InferenceEngine::AsyncInferRequestThreadSafeDefault {
+class CudaAsyncInferRequest : public ov::IAsyncInferRequest {
 public:
-    CudaAsyncInferRequest(const CudaInferRequest::Ptr& inferRequest,
-                          const InferenceEngine::ITaskExecutor::Ptr& taskExecutor,
-                          const InferenceEngine::ITaskExecutor::Ptr& waitExecutor,
-                          const InferenceEngine::ITaskExecutor::Ptr& callbackExecutor);
-
-    /**
-     * Cancel AsyncInferRequest
-     */
-    void Cancel() override;
-    /**
-     * Overrides default behaviour and run request asynchronous
-     */
-    void Infer_ThreadUnsafe() override;
+    CudaAsyncInferRequest(const CudaInferRequest::Ptr& request,
+                          const std::shared_ptr<ov::threading::ITaskExecutor>& task_executor,
+                          const std::shared_ptr<ov::threading::ITaskExecutor>& wait_executor,
+                          const std::shared_ptr<ov::threading::ITaskExecutor>& callback_executor);
+
+    ~CudaAsyncInferRequest();
+    void cancel() override;
+    void infer_thread_unsafe() override;
 
 private:
-    CudaInferRequest::Ptr _inferRequest;
+    CudaInferRequest::Ptr request_;
 };
 
 } // namespace nvidia_gpu

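To tie the two async-request diffs above together, a hedged usage sketch (not from the commit) of how an application drives this pipeline through the public 2.0 API; the device name "NVIDIA" comes from the CMake diff, the rest is standard OpenVINO runtime API, and the model path is illustrative:

```cpp
// Minimal sketch of asynchronous inference against the NVIDIA plugin through the
// public OpenVINO 2.0 API; internally this exercises the CudaAsyncInferRequest
// pipeline (infer_preprocess -> start_pipeline/wait_pipeline -> infer_postprocess).
#include <exception>
#include <iostream>

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto compiled = core.compile_model("model.xml", "NVIDIA");  // illustrative path
    ov::InferRequest request = compiled.create_infer_request();

    // The callback runs when the async pipeline finishes (or fails).
    request.set_callback([](std::exception_ptr error) {
        if (error) {
            try { std::rethrow_exception(error); }
            catch (const std::exception& e) { std::cerr << "inference failed: " << e.what() << "\n"; }
            return;
        }
        std::cout << "inference finished\n";
    });

    request.start_async();
    request.wait();  // request.cancel() would route to CudaAsyncInferRequest::cancel()
    return 0;
}
```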