Modified Deconvolution op #104

Open · wants to merge 13 commits into base: poc/arm_backup
636 changes: 381 additions & 255 deletions src/plugins/intel_cpu/src/nodes/deconv.cpp

Large diffs are not rendered by default.

26 changes: 7 additions & 19 deletions src/plugins/intel_cpu/src/nodes/deconv.h
@@ -11,6 +11,8 @@
#include <vector>
#include "common/dnnl_executor.h"

#include "executors/deconv_list.hpp"

namespace ov {
namespace intel_cpu {
namespace node {
@@ -20,6 +22,7 @@ class Deconvolution : public Node {
Deconvolution(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CPtr context);

void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
void createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
const std::vector<MemoryDescPtr>& outputDesc) override;
void createPrimitive() override;
@@ -43,7 +46,7 @@
bool canFuse(const NodePtr& node) const override;

const VectorDims& getWeightDims() const { return getInputShapeAtPort(1).getStaticDims(); }
const std::vector<ptrdiff_t>& getStride() const { return stride; }
const std::vector<ptrdiff_t>& getStride() const { return deconvAttrs.stride; }

void prepareParams() override;
void execute(dnnl::stream strm) override;
@@ -62,6 +65,7 @@
private:
using executorPtr = std::shared_ptr<DnnlExecutor>;
executorPtr execPtr = nullptr;
std::shared_ptr<DeconvExecutor> execPtrDeconv = nullptr;

class DeconvExecutorDefault : public DnnlExecutor {
public:
@@ -81,23 +85,8 @@
const dnnl::engine& engine);
};

bool withGroups = false;
bool isDW = false;
bool isInt8 = false;
bool autoPad = false;
bool externOutShape = false;
size_t groupNum = 1;
size_t IC;
size_t OC;
std::vector<ptrdiff_t> kernel;
std::vector<ptrdiff_t> stride;
std::vector<ptrdiff_t> dilation;
ov::CoordinateDiff paddingL;
ov::CoordinateDiff paddingR;
ov::CoordinateDiff outputPadding;
std::vector<int32_t> lastOutputSpatialDims;
VectorDims int8WeightDims;
VectorDims biasesDims;
bool useACL = false;
DeconvAttrs deconvAttrs;

Shape inShape;

@@ -112,7 +101,6 @@ class Deconvolution : public Node {
void initPaddingR(const Shape &inShape, const Shape &outShape);
std::vector<int32_t> readOutputSpatialDims() const;
std::pair<VectorDims, VectorDims> makeDummyInOutShape();
bool withBiases = false;
size_t biasPort;

std::string errorPrefix;
127 changes: 127 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp
@@ -0,0 +1,127 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_deconv.hpp"
#include "acl_utils.hpp"
#include "ie_parallel.hpp"

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

//FIXME: add context
AclDeconvExecutor::AclDeconvExecutor() : DeconvExecutor() {}

bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) {
this->deconvAttrs = deconvAttrs;
auto srcDims = srcDescs[0]->getShape().getStaticDims();
auto weiDims = srcDescs[1]->getShape().getStaticDims();
// swap the input and output channel dimensions to align with ACL
// the weights tensor shape is changed because ACL expects a [W, H, I, O] tensor while OV uses [I, O, H, W]
std::swap(weiDims[0], weiDims[1]);
auto dstDims = dstDescs[0]->getShape().getStaticDims();

VectorDims biasDims;
TensorInfo biasTensorInfo;
if (deconvAttrs.withBiases) {
biasDims = srcDescs[2]->getShape().getStaticDims();
// bias precision in OV is I32, but ACL requires the bias precision to match the input precision
biasTensorInfo = TensorInfo(shapeCast(biasDims), 1,
precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[2]));
}

TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1,
precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
TensorInfo weiTensorInfo = TensorInfo(shapeCast(weiDims), 1,
precisionToAclDataType(srcDescs[1]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[1]));
TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1,
precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));

unsigned int pad_l = deconvAttrs.paddingL.at(1);
unsigned int pad_r = deconvAttrs.paddingR.at(1);
unsigned int pad_t = deconvAttrs.paddingL.at(0);
unsigned int pad_b = deconvAttrs.paddingR.at(0);
unsigned int stride_x = deconvAttrs.stride.at(1);
unsigned int stride_y = deconvAttrs.stride.at(0);
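// OV follows the oneDNN convention of storing dilations as (d - 1), while ACL expects the actual dilation d, hence the +1 below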
unsigned int dilation_x = deconvAttrs.dilation.at(1) + 1;
unsigned int dilation_y = deconvAttrs.dilation.at(0) + 1;

arm_compute::PadStrideInfo deconv_info(stride_x, stride_y, pad_l, pad_r, pad_t, pad_b, arm_compute::DimensionRoundingType::FLOOR);
arm_compute::Size2D dilation(dilation_x, dilation_y);

arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo,
&weiTensorInfo,
deconvAttrs.withBiases ? &biasTensorInfo : nullptr,
&dstTensorInfo,
deconv_info);
if (!status) {
DEBUG_LOG("NEDeconvolutionLayer validation failed: ", status.error_description());
return false;
}

srcTensor.allocator()->init(srcTensorInfo);
weiTensor.allocator()->init(weiTensorInfo);
dstTensor.allocator()->init(dstTensorInfo);
if (deconvAttrs.withBiases)
biasTensor.allocator()->init(biasTensorInfo);

deconv = std::make_unique<arm_compute::NEDeconvolutionLayer>();
deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiases ? &biasTensor : nullptr, &dstTensor, deconv_info);

return true;
}

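// Swaps the two outermost dimensions of a dense 4D tensor: element [d0][d1][d2][d3]
// is written to [d1][d0][d2][d3]. This turns OV [I, O, H, W] weights into [O, I, H, W],
// matching the weiDims swap done in init() before shapeCast reverses the order for ACL.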
static void transpose_to_1023(const MemoryCPtr& srcMemPtr, std::vector<float>& dst_data) {
const auto src_data = reinterpret_cast<float*>(srcMemPtr->GetPtr());

const int DIM0 = srcMemPtr->getStaticDims()[0];
const int DIM1 = srcMemPtr->getStaticDims()[1];
const int DIM2 = srcMemPtr->getStaticDims()[2];
const int DIM3 = srcMemPtr->getStaticDims()[3];

parallel_for3d(DIM0, DIM1, DIM2, [&](const int dim0, const int dim1, const int dim2) {
for (int dim3 = 0; dim3 < DIM3; ++dim3) {
const int src_off = dim0 * DIM1 * DIM2 * DIM3 +
dim1 * DIM2 * DIM3 +
dim2 * DIM3 +
dim3;
const int dst_off = dim1 * DIM0 * DIM2 * DIM3 +
dim0 * DIM2 * DIM3 +
dim2 * DIM3 +
dim3;

dst_data[dst_off] = src_data[src_off];
}
});
}

void AclDeconvExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) {
// the weights tensor shape is changed because ACL expects a [W, H, I, O] tensor while OV uses [I, O, H, W]; the repack below assumes FP32 weights
std::vector<float> weiBuffer(src[1]->getStaticDims()[0] *
src[1]->getStaticDims()[1] *
src[1]->getStaticDims()[2] *
src[1]->getStaticDims()[3]);
transpose_to_1023(src[1], weiBuffer);

srcTensor.allocator()->import_memory(src[0]->GetPtr());
dstTensor.allocator()->import_memory(dst[0]->GetPtr());
weiTensor.allocator()->import_memory(weiBuffer.data());
if (deconvAttrs.withBiases)
biasTensor.allocator()->import_memory(src[2]->GetPtr());

deconv->run();

srcTensor.allocator()->free();
dstTensor.allocator()->free();
weiTensor.allocator()->free();
if (deconvAttrs.withBiases)
biasTensor.allocator()->free();
}

} // namespace intel_cpu
} // namespace ov
102 changes: 102 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp
@@ -0,0 +1,102 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "nodes/executors/deconv.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

class AclDeconvExecutor : public DeconvExecutor {
public:
AclDeconvExecutor();

bool init(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) override;
void exec(const std::vector<MemoryCPtr>& src,
const std::vector<MemoryPtr>& dst,
const void *post_ops_data_) override;

impl_desc_type getImplType() const override {
return implType;
}

private:
DeconvAttrs deconvAttrs;
impl_desc_type implType = impl_desc_type::acl;

arm_compute::Tensor srcTensor;
arm_compute::Tensor weiTensor;
arm_compute::Tensor biasTensor;
arm_compute::Tensor dstTensor;
std::unique_ptr<arm_compute::NEDeconvolutionLayer> deconv = nullptr;
};

class AclDeconvExecutorBuilder : public DeconvExecutorBuilder {
public:
bool isSupported(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const override {
// reject unless all tensors are FP32 or all tensors are FP16 (ACL requires a uniform precision)
if (!(srcDescs[0]->getPrecision() == InferenceEngine::Precision::FP32 &&
srcDescs[1]->getPrecision() == InferenceEngine::Precision::FP32 &&
dstDescs[0]->getPrecision() == InferenceEngine::Precision::FP32) &&
!(srcDescs[0]->getPrecision() == InferenceEngine::Precision::FP16 &&
srcDescs[1]->getPrecision() == InferenceEngine::Precision::FP16 &&
dstDescs[0]->getPrecision() == InferenceEngine::Precision::FP16)) {
DEBUG_LOG("AclDeconvExecutor does not support precisions:",
" src[0]=", srcDescs[0]->getPrecision(),
" src[1]=", srcDescs[1]->getPrecision(),
" dst[0]=", dstDescs[0]->getPrecision());
return false;
}
if (deconvAttrs.withBiases &&
srcDescs[2]->getPrecision() != srcDescs[0]->getPrecision()) {
DEBUG_LOG("AclDeconvExecutor does not support precisions:",
" src[2]=", srcDescs[2]->getPrecision());
return false;
}

if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
srcDescs[1]->hasLayoutType(LayoutType::ncsp) &&
dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
!(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
DEBUG_LOG("AclDeconvExecutor does not support layouts:",
" src[0]=", srcDescs[0]->serializeFormat(),
" src[1]=", srcDescs[1]->serializeFormat(),
" dst=", dstDescs[0]->serializeFormat());
return false;
}
if (deconvAttrs.withBiases &&
!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
srcDescs[1]->hasLayoutType(LayoutType::ncsp) &&
srcDescs[2]->hasLayoutType(LayoutType::ncsp) &&
dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
!(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
srcDescs[2]->hasLayoutType(LayoutType::nspc) &&
dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
DEBUG_LOG("AclDeconvExecutor does not support layouts:",
" src[0]=", srcDescs[0]->serializeFormat(),
" src[1]=", srcDescs[1]->serializeFormat(),
" src[2]=", srcDescs[2]->serializeFormat(),
" dst=", dstDescs[0]->serializeFormat());
return false;
}
return true;
}

DeconvExecutorPtr makeExecutor() const override {
return std::make_shared<AclDeconvExecutor>();
}
};

} // namespace intel_cpu
} // namespace ov
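
For context, a minimal usage sketch of the builder/executor pair defined above (hypothetical: the attrs, descriptors, memory vectors, and primitive attr are assumed to be prepared by the calling node; this snippet is not part of the diff):

// Hypothetical caller-side code; only the type and method names introduced in this PR are real.
AclDeconvExecutorBuilder builder;
if (builder.isSupported(deconvAttrs, srcDescs, dstDescs)) {
    DeconvExecutorPtr executor = builder.makeExecutor();
    if (executor->init(deconvAttrs, srcDescs, dstDescs, attr)) {
        executor->exec(srcMemory, dstMemory, nullptr);  // no post-ops data needed here
    }
}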
32 changes: 29 additions & 3 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
@@ -3,9 +3,36 @@
//
#pragma once

#include "ie_precision.hpp"
#include "memory_desc/cpu_memory_desc.h"
#include "arm_compute/core/Types.h"

namespace ov {
namespace intel_cpu {

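/**
* @brief Convert an OV dims vector to a ComputeLibrary TensorShape
* ACL stores dimensions in reverse order, so OV dims {N, C, H, W} become the ACL shape (W, H, C, N);
* an empty dims vector is mapped to a 1D shape of size 1
* @param dims dimensions to convert
* @return ComputeLibrary TensorShape
*/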
inline arm_compute::TensorShape shapeCast(const VectorDims& dims) {
arm_compute::TensorShape tensorShape;
for (std::size_t i = 0; i < dims.size(); ++i) {
tensorShape.set(dims.size() - i - 1, dims[i], false);
}
if (tensorShape.num_dimensions() == 0) {
tensorShape.set(0, 1, false);
tensorShape.set_num_dimensions(1);
}
return tensorShape;
}

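/**
* @brief Map an OV axis index to the corresponding ACL axis index, inverting the
* dimension order in the same way as shapeCast above
* @param axis OV axis index
* @param shapeSize rank of the shape the axis refers to
* @return ACL axis index
*/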
inline std::size_t axisCast(const std::size_t axis, const std::size_t shapeSize) {
return shapeSize - axis - 1;
}

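/**
* @brief Multiply the first size elements of vec
* @param vec vector of dimensions
* @param size number of leading elements to include in the product
* @return product of the leading dimensions
*/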
inline Dim vectorProduct(const VectorDims& vec, size_t size) {
Dim prod = 1;
for (size_t i = 0; i < size; ++i)
prod *= vec[i];
return prod;
}

/**
* @brief Return ComputeLibrary DataType that corresponds to the given precision
* @param precision precision to be converted
@@ -27,7 +54,6 @@ inline arm_compute::DataType precisionToAclDataType(InferenceEngine::Precision p
default: return arm_compute::DataType::UNKNOWN;
}
}

/**
* @brief Return ComputeLibrary DataLayout that corresponds to the MemoryDesc layout
* @param desc MemoryDesc from which the layout is retrieved
@@ -36,8 +62,8 @@ inline arm_compute::DataType precisionToAclDataType(InferenceEngine::Precision p
inline arm_compute::DataLayout getAclDataLayoutByMemoryDesc(MemoryDescCPtr desc) {
if (desc->hasLayoutType(LayoutType::ncsp)) {
if (desc->getShape().getRank() == 4) return arm_compute::DataLayout::NCHW;
if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NCDHW;
} else if(desc->hasLayoutType(LayoutType::nspc)) {
if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NCDHW;
} else if (desc->hasLayoutType(LayoutType::nspc)) {
if (desc->getShape().getRank() == 4) return arm_compute::DataLayout::NHWC;
if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NDHWC;
}
16 changes: 16 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/deconv.cpp
@@ -0,0 +1,16 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "deconv.hpp"

namespace ov {
namespace intel_cpu {

using namespace InferenceEngine;

DeconvExecutor::DeconvExecutor() {}


} // namespace intel_cpu
} // namespace ov