Modified Deconvolution op #104

Open · wants to merge 13 commits into base: poc/arm_backup
636 changes: 381 additions & 255 deletions src/plugins/intel_cpu/src/nodes/deconv.cpp

Large diffs are not rendered by default.

26 changes: 7 additions & 19 deletions src/plugins/intel_cpu/src/nodes/deconv.h
@@ -11,6 +11,8 @@
#include <vector>
#include "common/dnnl_executor.h"

#include "executors/deconv_list.hpp"

namespace ov {
namespace intel_cpu {
namespace node {
@@ -20,6 +22,7 @@ class Deconvolution : public Node {
Deconvolution(const std::shared_ptr<ngraph::Node>& op, const GraphContext::CPtr context);

void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
void createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
const std::vector<MemoryDescPtr>& outputDesc) override;
void createPrimitive() override;
@@ -43,7 +46,7 @@
bool canFuse(const NodePtr& node) const override;

const VectorDims& getWeightDims() const { return getInputShapeAtPort(1).getStaticDims(); }
const std::vector<ptrdiff_t>& getStride() const { return stride; }
const std::vector<ptrdiff_t>& getStride() const { return deconvAttrs.stride; }

void prepareParams() override;
void execute(dnnl::stream strm) override;
@@ -62,6 +65,7 @@
private:
using executorPtr = std::shared_ptr<DnnlExecutor>;
executorPtr execPtr = nullptr;
std::shared_ptr<DeconvExecutor> execPtrDeconv = nullptr;

class DeconvExecutorDefault : public DnnlExecutor {
public:
@@ -81,23 +85,8 @@
const dnnl::engine& engine);
};

bool withGroups = false;
bool isDW = false;
bool isInt8 = false;
bool autoPad = false;
bool externOutShape = false;
size_t groupNum = 1;
size_t IC;
size_t OC;
std::vector<ptrdiff_t> kernel;
std::vector<ptrdiff_t> stride;
std::vector<ptrdiff_t> dilation;
ov::CoordinateDiff paddingL;
ov::CoordinateDiff paddingR;
ov::CoordinateDiff outputPadding;
std::vector<int32_t> lastOutputSpatialDims;
VectorDims int8WeightDims;
VectorDims biasesDims;
bool useACL = false;
DeconvAttrs deconvAttrs;

Shape inShape;

@@ -112,7 +101,6 @@ class Deconvolution : public Node {
void initPaddingR(const Shape &inShape, const Shape &outShape);
std::vector<int32_t> readOutputSpatialDims() const;
std::pair<VectorDims, VectorDims> makeDummyInOutShape();
bool withBiases = false;
size_t biasPort;

std::string errorPrefix;
127 changes: 127 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.cpp
@@ -0,0 +1,127 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_deconv.hpp"
#include "acl_utils.hpp"
#include "ie_parallel.hpp"

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

//FIXME: add context
AclDeconvExecutor::AclDeconvExecutor() : DeconvExecutor() {}

bool AclDeconvExecutor::init(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) {
this->deconvAttrs = deconvAttrs;
auto srcDims = srcDescs[0]->getShape().getStaticDims();
auto weiDims = srcDescs[1]->getShape().getStaticDims();
// swap the input and output channel dimensions to align with ACL
// the weights tensor shape is changed because ACL expects a [W, H, I, O] tensor while OV uses [I, O, H, W]
std::swap(weiDims[0], weiDims[1]);
auto dstDims = dstDescs[0]->getShape().getStaticDims();

VectorDims biasDims;
TensorInfo biasTensorInfo;
if (deconvAttrs.withBiases) {
biasDims = srcDescs[2]->getShape().getStaticDims();
// bias precision in OV is I32, but ACL requires the bias precision to match the input precision
biasTensorInfo = TensorInfo(shapeCast(biasDims), 1,
precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[2]));
}

TensorInfo srcTensorInfo = TensorInfo(shapeCast(srcDims), 1,
precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
TensorInfo weiTensorInfo = TensorInfo(shapeCast(weiDims), 1,
precisionToAclDataType(srcDescs[1]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[1]));
TensorInfo dstTensorInfo = TensorInfo(shapeCast(dstDims), 1,
precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));

unsigned int pad_l = deconvAttrs.paddingL.at(1);
unsigned int pad_r = deconvAttrs.paddingR.at(1);
unsigned int pad_t = deconvAttrs.paddingL.at(0);
unsigned int pad_b = deconvAttrs.paddingR.at(0);
unsigned int stride_x = deconvAttrs.stride.at(1);
unsigned int stride_y = deconvAttrs.stride.at(0);
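// OV follows the oneDNN convention of storing dilations as (d - 1), while ACL expects the actual dilation d, hence the +1 below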
unsigned int dilation_x = deconvAttrs.dilation.at(1) + 1;
unsigned int dilation_y = deconvAttrs.dilation.at(0) + 1;

arm_compute::PadStrideInfo deconv_info(stride_x, stride_y, pad_l, pad_r, pad_t, pad_b, arm_compute::DimensionRoundingType::FLOOR);
arm_compute::Size2D dilation(dilation_x, dilation_y);

arm_compute::Status status = arm_compute::NEDeconvolutionLayer::validate(&srcTensorInfo,
&weiTensorInfo,
deconvAttrs.withBiases ? &biasTensorInfo : nullptr,
&dstTensorInfo,
deconv_info);
if (!status) {
DEBUG_LOG("NEDeconvolutionLayer validation failed: ", status.error_description());
return false;
}

srcTensor.allocator()->init(srcTensorInfo);
weiTensor.allocator()->init(weiTensorInfo);
dstTensor.allocator()->init(dstTensorInfo);
if (deconvAttrs.withBiases)
biasTensor.allocator()->init(biasTensorInfo);

deconv = std::make_unique<arm_compute::NEDeconvolutionLayer>();
deconv->configure(&srcTensor, &weiTensor, deconvAttrs.withBiases ? &biasTensor : nullptr, &dstTensor, deconv_info);

return true;
}

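// Swaps the two outermost dimensions of a dense 4D tensor: element [d0][d1][d2][d3]
// is written to [d1][d0][d2][d3]. This turns OV [I, O, H, W] weights into [O, I, H, W],
// matching the weiDims swap done in init() before shapeCast reverses the order for ACL.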
static void transpose_to_1023(const MemoryCPtr& srcMemPtr, std::vector<float>& dst_data) {
const auto src_data = reinterpret_cast<float*>(srcMemPtr->GetPtr());

const int DIM0 = srcMemPtr->getStaticDims()[0];
const int DIM1 = srcMemPtr->getStaticDims()[1];
const int DIM2 = srcMemPtr->getStaticDims()[2];
const int DIM3 = srcMemPtr->getStaticDims()[3];

parallel_for3d(DIM0, DIM1, DIM2, [&](const int dim0, const int dim1, const int dim2) {
for (int dim3 = 0; dim3 < DIM3; ++dim3) {
const int src_off = dim0 * DIM1 * DIM2 * DIM3 +
dim1 * DIM2 * DIM3 +
dim2 * DIM3 +
dim3;
const int dst_off = dim1 * DIM0 * DIM2 * DIM3 +
dim0 * DIM2 * DIM3 +
dim2 * DIM3 +
dim3;

dst_data[dst_off] = src_data[src_off];
}
});
}

void AclDeconvExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void *post_ops_data_) {
// the weights tensor shape is changed because ACL expects a [W, H, I, O] tensor while OV uses [I, O, H, W]; the repack below assumes FP32 weights
std::vector<float> weiBuffer(src[1]->getStaticDims()[0] *
src[1]->getStaticDims()[1] *
src[1]->getStaticDims()[2] *
src[1]->getStaticDims()[3]);
transpose_to_1023(src[1], weiBuffer);

srcTensor.allocator()->import_memory(src[0]->GetPtr());
dstTensor.allocator()->import_memory(dst[0]->GetPtr());
weiTensor.allocator()->import_memory(weiBuffer.data());
if (deconvAttrs.withBiases)
biasTensor.allocator()->import_memory(src[2]->GetPtr());

deconv->run();

srcTensor.allocator()->free();
dstTensor.allocator()->free();
weiTensor.allocator()->free();
if (deconvAttrs.withBiases)
biasTensor.allocator()->free();
}

} // namespace intel_cpu
} // namespace ov
102 changes: 102 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_deconv.hpp
@@ -0,0 +1,102 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "nodes/executors/deconv.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

class AclDeconvExecutor : public DeconvExecutor {
public:
AclDeconvExecutor();

bool init(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs,
const dnnl::primitive_attr &attr) override;
void exec(const std::vector<MemoryCPtr>& src,
const std::vector<MemoryPtr>& dst,
const void *post_ops_data_) override;

impl_desc_type getImplType() const override {
return implType;
}

private:
DeconvAttrs deconvAttrs;
impl_desc_type implType = impl_desc_type::acl;

arm_compute::Tensor srcTensor;
arm_compute::Tensor weiTensor;
arm_compute::Tensor biasTensor;
arm_compute::Tensor dstTensor;
std::unique_ptr<arm_compute::NEDeconvolutionLayer> deconv = nullptr;
};

class AclDeconvExecutorBuilder : public DeconvExecutorBuilder {
public:
bool isSupported(const DeconvAttrs& deconvAttrs,
const std::vector<MemoryDescPtr>& srcDescs,
const std::vector<MemoryDescPtr>& dstDescs) const override {
// reject unless all tensors are FP32 or all tensors are FP16 (ACL requires a uniform precision)
if (!(srcDescs[0]->getPrecision() == InferenceEngine::Precision::FP32 &&
srcDescs[1]->getPrecision() == InferenceEngine::Precision::FP32 &&
dstDescs[0]->getPrecision() == InferenceEngine::Precision::FP32) &&
!(srcDescs[0]->getPrecision() == InferenceEngine::Precision::FP16 &&
srcDescs[1]->getPrecision() == InferenceEngine::Precision::FP16 &&
dstDescs[0]->getPrecision() == InferenceEngine::Precision::FP16)) {
DEBUG_LOG("AclDeconvExecutor does not support precisions:",
" src[0]=", srcDescs[0]->getPrecision(),
" src[1]=", srcDescs[1]->getPrecision(),
" dst[0]=", dstDescs[0]->getPrecision());
return false;
}
if (deconvAttrs.withBiases &&
srcDescs[2]->getPrecision() != srcDescs[0]->getPrecision()) {
DEBUG_LOG("AclDeconvExecutor does not support precisions:",
" src[2]=", srcDescs[2]->getPrecision());
return false;
}

if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
srcDescs[1]->hasLayoutType(LayoutType::ncsp) &&
dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
!(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
DEBUG_LOG("AclDeconvExecutor does not support layouts:",
" src[0]=", srcDescs[0]->serializeFormat(),
" src[1]=", srcDescs[1]->serializeFormat(),
" dst=", dstDescs[0]->serializeFormat());
return false;
}
if (deconvAttrs.withBiases &&
!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
srcDescs[1]->hasLayoutType(LayoutType::ncsp) &&
srcDescs[2]->hasLayoutType(LayoutType::ncsp) &&
dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
!(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
srcDescs[1]->hasLayoutType(LayoutType::nspc) &&
srcDescs[2]->hasLayoutType(LayoutType::nspc) &&
dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
DEBUG_LOG("AclDeconvExecutor does not support layouts:",
" src[0]=", srcDescs[0]->serializeFormat(),
" src[1]=", srcDescs[1]->serializeFormat(),
" src[2]=", srcDescs[2]->serializeFormat(),
" dst=", dstDescs[0]->serializeFormat());
return false;
}
return true;
}

DeconvExecutorPtr makeExecutor() const override {
return std::make_shared<AclDeconvExecutor>();
}
};

} // namespace intel_cpu
} // namespace ov
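
For context, a minimal usage sketch of the builder/executor pair defined above (hypothetical: the attrs, descriptors, memory vectors, and primitive attr are assumed to be prepared by the calling node; this snippet is not part of the diff):

// Hypothetical caller-side code; only the type and method names introduced in this PR are real.
AclDeconvExecutorBuilder builder;
if (builder.isSupported(deconvAttrs, srcDescs, dstDescs)) {
    DeconvExecutorPtr executor = builder.makeExecutor();
    if (executor->init(deconvAttrs, srcDescs, dstDescs, attr)) {
        executor->exec(srcMemory, dstMemory, nullptr);  // no post-ops data needed here
    }
}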
32 changes: 29 additions & 3 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_utils.hpp
@@ -3,9 +3,36 @@
//
#pragma once

#include "ie_precision.hpp"
#include "memory_desc/cpu_memory_desc.h"
#include "arm_compute/core/Types.h"

namespace ov {
namespace intel_cpu {

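/**
* @brief Convert an OV dims vector to a ComputeLibrary TensorShape
* ACL stores dimensions in reverse order, so OV dims {N, C, H, W} become the ACL shape (W, H, C, N);
* an empty dims vector is mapped to a 1D shape of size 1
* @param dims dimensions to convert
* @return ComputeLibrary TensorShape
*/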
inline arm_compute::TensorShape shapeCast(const VectorDims& dims) {
arm_compute::TensorShape tensorShape;
for (std::size_t i = 0; i < dims.size(); ++i) {
tensorShape.set(dims.size() - i - 1, dims[i], false);
}
if (tensorShape.num_dimensions() == 0) {
tensorShape.set(0, 1, false);
tensorShape.set_num_dimensions(1);
}
return tensorShape;
}

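/**
* @brief Map an OV axis index to the corresponding ACL axis index, inverting the
* dimension order in the same way as shapeCast above
* @param axis OV axis index
* @param shapeSize rank of the shape the axis refers to
* @return ACL axis index
*/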
inline std::size_t axisCast(const std::size_t axis, const std::size_t shapeSize) {
return shapeSize - axis - 1;
}

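/**
* @brief Multiply the first size elements of vec
* @param vec vector of dimensions
* @param size number of leading elements to include in the product
* @return product of the leading dimensions
*/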
inline Dim vectorProduct(const VectorDims& vec, size_t size) {
Dim prod = 1;
for (size_t i = 0; i < size; ++i)
prod *= vec[i];
return prod;
}

/**
* @brief Return ComputeLibrary DataType that corresponds to the given precision
* @param precision precision to be converted
@@ -27,7 +54,6 @@ inline arm_compute::DataType precisionToAclDataType(InferenceEngine::Precision p
default: return arm_compute::DataType::UNKNOWN;
}
}

/**
* @brief Return ComputeLibrary DataLayout that corresponds to the MemoryDesc layout
* @param desc MemoryDesc from which the layout is retrieved
@@ -36,8 +62,8 @@ inline arm_compute::DataType precisionToAclDataType(InferenceEngine::Precision p
inline arm_compute::DataLayout getAclDataLayoutByMemoryDesc(MemoryDescCPtr desc) {
if (desc->hasLayoutType(LayoutType::ncsp)) {
if (desc->getShape().getRank() == 4) return arm_compute::DataLayout::NCHW;
if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NCDHW;
} else if(desc->hasLayoutType(LayoutType::nspc)) {
if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NCDHW;
} else if (desc->hasLayoutType(LayoutType::nspc)) {
if (desc->getShape().getRank() == 4) return arm_compute::DataLayout::NHWC;
if (desc->getShape().getRank() == 5) return arm_compute::DataLayout::NDHWC;
}
16 changes: 16 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/deconv.cpp
@@ -0,0 +1,16 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "deconv.hpp"

namespace ov {
namespace intel_cpu {

using namespace InferenceEngine;

DeconvExecutor::DeconvExecutor() {}


} // namespace intel_cpu
} // namespace ov