mvafin
diff --git a/‎.gitmodules
+3 b/‎.gitmodules
+3
diff --git a/‎licensing/runtime-third-party-programs.txt
+26 b/‎licensing/runtime-third-party-programs.txt
+26
diff --git a/‎src/bindings/python/tests/test_onnx/test_zoo_models.py
+1 b/‎src/bindings/python/tests/test_onnx/test_zoo_models.py
+1
diff --git a/‎src/plugins/intel_cpu/CMakeLists.txt
+15-1 b/‎src/plugins/intel_cpu/CMakeLists.txt
+15-1
diff --git a/‎src/plugins/intel_cpu/src/graph_optimizer.cpp
+3 b/‎src/plugins/intel_cpu/src/graph_optimizer.cpp
+3
diff --git a/‎src/plugins/intel_cpu/src/mlas/sgemm.cpp
+94 b/‎src/plugins/intel_cpu/src/mlas/sgemm.cpp
+94
diff --git a/‎src/plugins/intel_cpu/src/mlas/sgemm.hpp
+109 b/‎src/plugins/intel_cpu/src/mlas/sgemm.hpp
+109
diff --git a/‎src/plugins/intel_cpu/src/mlas/thread_pool.cpp
+33 b/‎src/plugins/intel_cpu/src/mlas/thread_pool.cpp
+33
diff --git a/‎src/plugins/intel_cpu/src/mlas/thread_pool.hpp
+25 b/‎src/plugins/intel_cpu/src/mlas/thread_pool.hpp
+25
diff --git a/‎src/plugins/intel_cpu/src/node.cpp
+1 b/‎src/plugins/intel_cpu/src/node.cpp
+1
@@ -72,3 +72,6 @@
 [submodule "ARMComputeLibrary"]
 	path = src/plugins/intel_cpu/thirdparty/ComputeLibrary
 	url = https://github.com/ARM-software/ComputeLibrary.git
+[submodule "src/plugins/intel_cpu/thirdparty/mlas"]
+	path = src/plugins/intel_cpu/thirdparty/mlas
+	url = https://github.com/openvinotoolkit/mlas.git
@@ -1399,3 +1399,29 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 
+-------------------------------------------------------------
+
+21 MLAS (https://github.com/microsoft/onnxruntime)
+
+MIT License
+
+Copyright (c) Microsoft Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
@@ -115,6 +115,7 @@ def tinyyolov3_post_processing(outputs : Sequence[Any]) -> Sequence[Any]:
     "GPT2": {"atol": 5e-06, "rtol": 0.01},
     "GPT-2-LM-HEAD": {"atol": 4e-06},
     "test_retinanet_resnet101": {"atol": 1.3e-06},
+    "resnet34-v1-7" : {"atol": 1e-5}
 }
 
 def tolerance_map_key_in_model_path(path):
 
@@ -20,6 +20,8 @@ elseif(OV_COMPILER_IS_CLANG)
     ie_add_compiler_flags(-Wno-delete-non-abstract-non-virtual-dtor)
 endif()
 
+# enable MLAS for x86 / x86_64 CPUs only
+ie_dependent_option(ENABLE_MLAS_FOR_CPU "MLAS GEMM for OpenVINO CPU Plugin" ON "X86 OR X86_64" OFF)
 add_subdirectory(thirdparty)
 
 if(WIN32)
@@ -64,6 +66,10 @@ if(NOT (AARCH64 OR ARM))
     list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/cpu_opset/arm/*)
 endif()
 
+if (NOT ENABLE_MLAS_FOR_CPU)
+    list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/mlas/*)
+endif()
+
 file(GLOB_RECURSE FILES_TO_REMOVE ${EXCLUDE_PATHS})
 list(REMOVE_ITEM SOURCES ${FILES_TO_REMOVE})
 list(REMOVE_ITEM HEADERS ${FILES_TO_REMOVE})
@@ -94,8 +100,12 @@ target_link_libraries(${TARGET_NAME} PRIVATE dnnl
 
 target_compile_definitions(${TARGET_NAME} PRIVATE IMPLEMENT_INFERENCE_EXTENSION_API)
 target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
+if (ENABLE_MLAS_FOR_CPU)
+    target_link_libraries(${TARGET_NAME} PRIVATE mlas)
+    target_include_directories(${TARGET_NAME} SYSTEM PRIVATE $<TARGET_PROPERTY:mlas,INCLUDE_DIRECTORIES>)
+    add_definitions(-DOV_CPU_WITH_MLAS)
+endif()
 target_include_directories(${TARGET_NAME} SYSTEM PRIVATE $<TARGET_PROPERTY:dnnl,INCLUDE_DIRECTORIES>)
-
 # Cross compiled function
 # TODO: The same for proposal, proposalONNX, topk
 cross_compiled_file(${TARGET_NAME}
@@ -133,6 +143,10 @@ if(BUILD_SHARED_LIBS)
             $<TARGET_PROPERTY:openvino::conditional_compilation,INTERFACE_INCLUDE_DIRECTORIES>)
 
     target_include_directories(${TARGET_NAME}_obj SYSTEM PUBLIC $<TARGET_PROPERTY:dnnl,INCLUDE_DIRECTORIES>)
+    
+    if(ENABLE_MLAS_FOR_CPU)
+        target_include_directories(${TARGET_NAME}_obj SYSTEM PUBLIC $<TARGET_PROPERTY:mlas,INCLUDE_DIRECTORIES>)
+    endif()
 
     set_ie_threading_interface_for(${TARGET_NAME}_obj)
 
 
@@ -705,6 +705,9 @@ void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) {
         if (parent->getType() == Type::Convert && parent->isConstant() && parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected
                 && parent->getOriginalInputPrecisionAtPort(0) == Precision::FP16
                 && one_of(parent->getOriginalOutputPrecisionAtPort(0), Precision::FP32, Precision::BF16)) {
+            auto childNode = parent->getChildEdgeAt(0)->getChild();
+            // set correct weight precision
+            childNode->setOriginalInputPrecisionAtPort(1, parent->getOriginalInputPrecisionAtPort(0));
             graph.DropNode(parent);
         }
     }
 
@@ -0,0 +1,94 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+#include "sgemm.hpp"
+
+#include <string>
+#include <vector>
+
+#include "mlas.h"
+#include "onednn/dnnl.h"
+#include "openvino/core/parallel.hpp"
+#include "thread_pool.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+size_t mlas_sgemm_pack_get_size(const int64_t N, const int64_t K) {  // size of the packed-B buffer for an N x K problem
+    return MlasGemmPackBSize(N, K);  // thin forwarder: the value is whatever MLAS reports for (N, K)
+}
+
+void mlas_sgemm_pack(const char* transb,  // only the first character is read
+                     const int64_t N,
+                     const int64_t K,
+                     const int64_t ldb,
+                     const float* src,
+                     float* dst) {
+    MlasGemmPackB(*transb == 'T' ? CblasTrans : CblasNoTrans, N, K, src, ldb, dst);  // exactly 'T' => transpose; anything else => no transpose
+}
+
+void mlas_sgemm(const char* transa,
+                const char* transb,
+                const int64_t M,
+                const int64_t N,
+                const int64_t K,
+                const float alpha,
+                const float* A,
+                const int64_t lda,
+                const float* B,
+                const int64_t ldb,
+                const float beta,
+                float* C,
+                const int64_t ldc,
+                size_t thread_num) {
+    // Computes C = alpha * op(A) * op(B) + beta * C as a single-entry MLAS GEMM batch; B is flagged as not pre-packed.
+    MLAS_SGEMM_DATA_PARAMS sgemmParam;  // NOTE(review): bias is never assigned here - presumably the struct defaults it; confirm
+    sgemmParam.BIsPacked = false;
+    sgemmParam.A = A;
+    sgemmParam.lda = lda;
+    sgemmParam.B = B;
+    sgemmParam.ldb = ldb;
+    sgemmParam.C = C;
+    sgemmParam.ldc = ldc;
+    sgemmParam.alpha = alpha;
+    sgemmParam.beta = beta;
+    auto _transa = *transa == 'N' ? CblasNoTrans : CblasTrans;  // exactly 'N' => no transpose; any other first character => transpose
+    auto _transb = *transb == 'N' ? CblasNoTrans : CblasTrans;  // same mapping as for A
+    ov::cpu::OVMlasThreadPool threadPool(0 == thread_num ? parallel_get_num_threads() : thread_num);  // 0 falls back to parallel_get_num_threads()
+    MlasGemmBatch(_transa, _transb, M, N, K, &sgemmParam, 1, &threadPool);  // batch of exactly one GEMM
+}
+
+void mlas_sgemm_compute(const char* transa,
+                        const char* transb,
+                        const int64_t M,
+                        const int64_t N,
+                        const int64_t K,
+                        const float alpha,
+                        const float* A,
+                        const int64_t lda,
+                        const float* B,
+                        const int64_t ldb,
+                        const float beta,
+                        float* C,
+                        const int64_t ldc,
+                        const float* bias,
+                        size_t thread_num) {
+    // Computes C = alpha * op(A) * op(B) + beta * C as a single-entry MLAS GEMM batch; B is flagged as pre-packed and bias is forwarded.
+    ov::cpu::OVMlasThreadPool threadPool(0 == thread_num ? parallel_get_num_threads() : thread_num);  // 0 falls back to parallel_get_num_threads()
+    MLAS_SGEMM_DATA_PARAMS sgemmParam;
+    sgemmParam.BIsPacked = true;  // presumably B must be the buffer filled by mlas_sgemm_pack - confirm with callers
+    sgemmParam.A = A;
+    sgemmParam.lda = lda;
+    sgemmParam.B = B;
+    sgemmParam.ldb = ldb;
+    sgemmParam.C = C;
+    sgemmParam.ldc = ldc;
+    sgemmParam.alpha = alpha;
+    sgemmParam.beta = beta;
+    sgemmParam.bias = bias;  // may be nullptr (the declared default)
+    auto _transa = *transa == 'N' ? CblasNoTrans : CblasTrans;  // exactly 'N' => no transpose; any other first character => transpose
+    auto _transb = *transb == 'N' ? CblasNoTrans : CblasTrans;  // same mapping as for A
+    MlasGemmBatch(_transa, _transb, M, N, K, &sgemmParam, 1, &threadPool);  // batch of exactly one GEMM
+}
+}  // namespace intel_cpu
+}  // namespace ov
@@ -0,0 +1,109 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace ov {
+namespace intel_cpu {
+/**
+ * @brief  Computes the length in bytes of the packed matrix B buffer (SGEMM).
+ *
+ * @param N       Supplies the number of columns of matrix B.
+ * @param K       Supplies the number of rows of matrix B.
+ * @return        Size in bytes of the packing buffer, as reported by MLAS.
+ */
+size_t mlas_sgemm_pack_get_size(const int64_t N, const int64_t K);
+
+/**
+ * @brief  Packs the contents of matrix B into a pre-packed buffer.
+ *
+ * @param transb  "T" for transposed B; any other value is treated as non-transposed (only the first character is read).
+ * @param N       Supplies the number of columns of matrix B and matrix C.
+ * @param K       Supplies the number of columns of matrix A and the number
+                  of rows of matrix B.
+ * @param ldb     Supplies the first dimension of matrix B.
+ * @param src     Supplies the address of matrix B.
+ * @param dst     Supplies the pointer to the pre-packed B buffer that is written.
+ */
+void mlas_sgemm_pack(const char* transb,
+                     const int64_t N,
+                     const int64_t K,
+                     const int64_t ldb,
+                     const float* src,
+                     float* dst);
+
+/**
+ * @brief  SGEMM with a plain (not pre-packed) B matrix: C = alpha * op(A) * op(B) + beta * C.
+ *
+ * @param transa       "N" for non-transposed A; any other value is treated as transposed (only the first character is read).
+ * @param transb       "N" for non-transposed B; any other value is treated as transposed (only the first character is read).
+ * @param M            Supplies the number of rows of matrix A and matrix C.
+ * @param N            Supplies the number of columns of matrix B and matrix C.
+ * @param K            Supplies the number of columns of matrix A and the number
+                       of rows of matrix B.
+ * @param alpha        Supplies the scalar alpha multiplier (see SGEMM definition).
+ * @param A            Supplies the address of matrix A.
+ * @param lda          Supplies the first dimension of matrix A.
+ * @param B            Supplies the address of matrix B.
+ * @param ldb          Supplies the first dimension of matrix B.
+ * @param beta         Supplies the scalar beta multiplier (see SGEMM definition).
+ * @param C            Supplies the address of matrix C.
+ * @param ldc          Supplies the first dimension of matrix C.
+ * @param thread_num   0 to use parallel_get_num_threads() threads, otherwise the number of threads to use.
+ */
+void mlas_sgemm(const char* transa,
+                const char* transb,
+                const int64_t M,
+                const int64_t N,
+                const int64_t K,
+                const float alpha,
+                const float* A,
+                const int64_t lda,
+                const float* B,
+                const int64_t ldb,
+                const float beta,
+                float* C,
+                const int64_t ldc,
+                size_t thread_num = 0);
+
+/**
+ * @brief SGEMM with a pre-packed B matrix: C = alpha * op(A) * op(B) + beta * C.
+ *
+ * @param transa       "N" for non-transposed A; any other value is treated as transposed (only the first character is read).
+ * @param transb       "N" for non-transposed B; any other value is treated as transposed (only the first character is read).
+ * @param M            Supplies the number of rows of matrix A and matrix C.
+ * @param N            Supplies the number of columns of matrix B and matrix C.
+ * @param K            Supplies the number of columns of matrix A and the number
+                       of rows of matrix B.
+ * @param alpha        Supplies the scalar alpha multiplier (see SGEMM definition).
+ * @param A            Supplies the address of matrix A.
+ * @param lda          Supplies the first dimension of matrix A.
+ * @param B            Supplies the address of the pre-packed matrix B (presumably the buffer filled by mlas_sgemm_pack).
+ * @param ldb          Supplies the first dimension of matrix B.
+ * @param beta         Supplies the scalar beta multiplier (see SGEMM definition).
+ * @param C            Supplies the address of matrix C.
+ * @param ldc          Supplies the first dimension of matrix C.
+ * @param bias         Supplies the address of the per-channel bias; defaults to nullptr.
+ * @param thread_num   0 to use parallel_get_num_threads() threads, otherwise the number of threads to use.
+ */
+void mlas_sgemm_compute(const char* transa,
+                        const char* transb,
+                        const int64_t M,
+                        const int64_t N,
+                        const int64_t K,
+                        const float alpha,
+                        const float* A,
+                        const int64_t lda,
+                        const float* B,
+                        const int64_t ldb,
+                        const float beta,
+                        float* C,
+                        const int64_t ldc,
+                        const float* bias = nullptr,
+                        size_t thread_num = 0);
+}  // namespace intel_cpu
+}  // namespace ov
@@ -0,0 +1,33 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "thread_pool.hpp"
+
+#include "onednn/dnnl.h"
+#include "openvino/core/parallel.hpp"
+
+// Defines the function that MLAS only forward-declares; it forwards both arguments to dnnl::utils::get_cache_size.
+size_t getCacheSizeMlas(int level, bool perCore) {
+    return dnnl::utils::get_cache_size(level, perCore);
+}
+
+namespace ov {
+namespace cpu {
+
+size_t OVMlasThreadPool::DegreeOfParallelism() {
+    // report the thread count this pool object was constructed with
+    return threadNum;
+}
+
+void OVMlasThreadPool::TrySimpleParallelFor(const std::ptrdiff_t total, const std::function<void(std::ptrdiff_t)>& fn) {
+    ov::parallel_nt(threadNum, [&](const size_t ithr, const size_t nthr) {  // body runs once per worker (ithr of nthr)
+        std::ptrdiff_t start = 0, end = 0;
+        ov::splitter(total, nthr, ithr, start, end);  // fills in this worker's [start, end) slice of the total iterations
+        for (std::ptrdiff_t i = start; i < end; i++) {
+            fn(i);
+        }
+    });
+}
+}  // namespace cpu
+}  // namespace ov
@@ -0,0 +1,25 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include "mlas.h"
+
+namespace ov {
+namespace cpu {
+class OVMlasThreadPool : public IMlasThreadPool {  // IMlasThreadPool implementation that the SGEMM wrappers pass to MLAS
+public:
+    OVMlasThreadPool() = delete;  // a thread count must always be supplied
+    explicit OVMlasThreadPool(size_t threadNum) : threadNum(threadNum) {}
+    size_t DegreeOfParallelism() override;
+    void TrySimpleParallelFor(const std::ptrdiff_t total, const std::function<void(std::ptrdiff_t)>& fn) override;
+
+    // the actual threads used for sgemm
+    size_t threadNum = 0;
+};
+}  // namespace cpu
+}  // namespace ov
@@ -471,6 +471,7 @@ std::string Node::getPrimitiveDescriptorType() const {
     SEARCH_TYPE(avx);
     SEARCH_TYPE(sse42);
     SEARCH_TYPE(blas);
+    SEARCH_TYPE(mlas);
     SEARCH_TYPE(any);
     SEARCH_TYPE(uni);
Original file line number	Diff line number	Diff line change
`@@ -115,6 +115,7 @@ def tinyyolov3_post_processing(outputs : Sequence[Any]) -> Sequence[Any]:`
`115`	`115`	`"GPT2": {"atol": 5e-06, "rtol": 0.01},`
`116`	`116`	`"GPT-2-LM-HEAD": {"atol": 4e-06},`
`117`	`117`	`"test_retinanet_resnet101": {"atol": 1.3e-06},`
	`118`	`+ "resnet34-v1-7" : {"atol": 1e-5}`
`118`	`119`	`}`
`119`	`120`
`120`	`121`	`def tolerance_map_key_in_model_path(path):`
Original file line number	Diff line number	Diff line change
`@@ -705,6 +705,9 @@ void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) {`
`705`	`705`	`if (parent->getType() == Type::Convert && parent->isConstant() && parent->getChildEdgeAt(0)->getChild()->getType() == Type::FullyConnected`
`706`	`706`	`&& parent->getOriginalInputPrecisionAtPort(0) == Precision::FP16`
`707`	`707`	`&& one_of(parent->getOriginalOutputPrecisionAtPort(0), Precision::FP32, Precision::BF16)) {`
	`708`	`+ auto childNode = parent->getChildEdgeAt(0)->getChild();`
	`709`	`+ // set correct weight precision`
	`710`	`+ childNode->setOriginalInputPrecisionAtPort(1, parent->getOriginalInputPrecisionAtPort(0));`
`708`	`711`	`graph.DropNode(parent);`
`709`	`712`	`}`
`710`	`713`	`}`