Skip to content

Commit 569d3f2

Browse files
author
dmitrygo
committed
[CPU] Enabled float (fp32/fp16/bf16) to nf4 precision conversion
1 parent c27f796 commit 569d3f2

File tree

5 files changed

+100
-22
lines changed

5 files changed

+100
-22
lines changed

src/plugins/intel_cpu/src/node.cpp

+2-20
Original file line numberDiff line numberDiff line change
@@ -1588,24 +1588,6 @@ ov::element::Type Node::getRuntimePrecision() const {
15881588
}
15891589

15901590
Node* Node::NodesFactory::create(const std::shared_ptr<ov::Node>& op, const GraphContext::CPtr& context) {
1591-
// getExceptionDescWithoutStatus removes redundant information from the exception message. For instance, the
1592-
// NotImplemented exception is generated in the form: full_path_to_src_file:line_number [ NOT_IMPLEMENTED ] reason.
1593-
// An example for gather node:
1594-
// /path-to-openVino-root/src/plugins/intel_cpu/nodes/gather.cpp:42 [ NOT_IMPLEMENTED ] Only opset7 Gather operation
1595-
// is supported The most important part of the message is the reason, so the lambda trims everything up to "]" Note
1596-
// that the op type and its friendly name will also be provided if we fail to create the node.
1597-
auto getExceptionDescWithoutStatus = [](const ov::Exception& ex) {
1598-
std::string desc = ex.what();
1599-
size_t pos = desc.find(']');
1600-
if (pos != std::string::npos) {
1601-
if (desc.size() == pos + 1) {
1602-
desc.erase(0, pos + 1);
1603-
} else {
1604-
desc.erase(0, pos + 2);
1605-
}
1606-
}
1607-
return desc;
1608-
};
16091591
Node* newNode = nullptr;
16101592
std::string errorMessage;
16111593
if (newNode == nullptr) {
@@ -1616,7 +1598,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr<ov::Node>& op, const Grap
16161598
}
16171599
} catch (const ov::Exception& ex) {
16181600
if (dynamic_cast<const ov::NotImplemented*>(&ex) != nullptr) {
1619-
errorMessage += getExceptionDescWithoutStatus(ex);
1601+
errorMessage += ex.what();
16201602
} else {
16211603
throw;
16221604
}
@@ -1631,7 +1613,7 @@ Node* Node::NodesFactory::create(const std::shared_ptr<ov::Node>& op, const Grap
16311613
}
16321614
} catch (const ov::Exception& ex) {
16331615
if (dynamic_cast<const ov::NotImplemented*>(&ex) != nullptr) {
1634-
const auto currErrorMess = getExceptionDescWithoutStatus(ex);
1616+
const std::string currErrorMess = ex.what();
16351617
if (!currErrorMess.empty()) {
16361618
errorMessage += errorMessage.empty() ? currErrorMess : "\n" + currErrorMess;
16371619
}

src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp

+49
Original file line numberDiff line numberDiff line change
@@ -893,6 +893,48 @@ struct ConvertFrom4BitPrecision<std::tuple<src_t, dst_t>> {
893893
}
894894
};
895895

896+
#define INTEL_CPU_CVT_TO_4BIT_LIST \
897+
INTEL_CPU_CVT(f32, nf4), INTEL_CPU_CVT(f16, nf4), INTEL_CPU_CVT(bf16, nf4)
898+
899+
struct ConvertTo4BitContext {
900+
ov::element::Type_t outType;
901+
const void* srcPtr;
902+
void* dstPtr;
903+
size_t size;
904+
bool converted;
905+
};
906+
907+
template <typename T>
908+
struct ConvertTo4BitPrecision;
909+
910+
template <typename src_t, typename dst_t>
911+
struct ConvertTo4BitPrecision<std::tuple<src_t, dst_t>> {
912+
void operator()(ConvertTo4BitContext& ctx) {
913+
auto insert_half_byte = [](uint8_t dst, uint8_t val, bool high_half) -> uint8_t {
914+
uint8_t shift = high_half ? 4 : 0;
915+
return dst | (uint8_t) (val << shift);
916+
};
917+
918+
auto src = static_cast<const src_t*>(ctx.srcPtr);
919+
auto dst = static_cast<uint8_t*>(ctx.dstPtr);
920+
// each byte must be fully processed within same thread
921+
auto work_amount = div_up(ctx.size, 2);
922+
if (ctx.outType == ov::element::nf4) {
923+
parallel_for(work_amount, [&](size_t ib) {
924+
for (int i = 0; i < 2; i++) {
925+
int idx = ib * 2 + i;
926+
uint8_t val = idx % 2 == 0 ? 0 : dst[idx / 2];
927+
val = insert_half_byte(val, ConvertNF4::quantize(static_cast<float>(src[idx])), idx % 2);
928+
dst[idx / 2] = val;
929+
}
930+
});
931+
} else {
932+
OPENVINO_THROW("cpu_convert doesn't support output data type: ", ctx.outType, ". Not implemented.");
933+
}
934+
ctx.converted = true;
935+
}
936+
};
937+
896938
#define INTEL_CPU_CVT_FROM_BYTE_FP_LIST \
897939
INTEL_CPU_CVT(f8e8m0, f32), INTEL_CPU_CVT(f8e8m0, bf16), INTEL_CPU_CVT(f8e8m0, f16)
898940

@@ -1017,6 +1059,12 @@ void cpu_convert(const void* srcPtr,
10171059
if (!ctx.converted) {
10181060
OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc);
10191061
}
1062+
} else if (dstPrc.bitwidth() == 4u) {
1063+
ConvertTo4BitContext ctx{dstPrc, srcPtr, dstPtr, size, false};
1064+
OV_SWITCH(intel_cpu, ConvertTo4BitPrecision, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_TO_4BIT_LIST);
1065+
if (!ctx.converted) {
1066+
OPENVINO_THROW("cpu_convert can't convert from: ", srcPrc, " precision to: ", dstPrc);
1067+
}
10201068
} else if (srcPrc == ov::element::f8e8m0) {
10211069
ConvertFromByteFPContext ctx{srcPrc, srcPtr, dstPtr, size, false};
10221070
OV_SWITCH(intel_cpu,
@@ -1063,6 +1111,7 @@ bool is_supported_convert(ov::element::Type srcPrc, ov::element::Type dstPrc) {
10631111
OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_BIN_LIST);
10641112
OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_4BIT_LIST);
10651113
OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_FROM_BYTE_FP_LIST);
1114+
OV_SWITCH(intel_cpu, isSupported, ctx, std::tie(srcPrc, dstPrc), INTEL_CPU_CVT_TO_4BIT_LIST);
10661115
return ctx.isSupported;
10671116
}
10681117

src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.cpp

+39-2
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,16 @@ void ConvertCPULayerTest::generate_inputs(const std::vector<ov::Shape>& targetIn
151151
const auto& funcInputs = function->inputs();
152152
for (size_t i = 0; i < funcInputs.size(); ++i) {
153153
const auto& funcInput = funcInputs[i];
154-
ov::Tensor tensor =
155-
ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i]);
154+
ov::Tensor tensor;
155+
if (outPrc == ov::element::nf4) {
156+
tensor = ov::test::utils::create_and_fill_tensor_real_distribution(funcInput.get_element_type(),
157+
targetInputStaticShapes[i],
158+
-1.f,
159+
1.f,
160+
1);
161+
} else {
162+
tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i]);
163+
}
156164
if (special_value != ov::test::SpecialValue::none) {
157165
if (inPrc == ov::element::f32) {
158166
modify_value<float>(tensor, special_value);
@@ -176,6 +184,35 @@ void ConvertCPULayerTest::validate_out_prc() const {
176184
FAIL() << "ConvertCPULayerTest supports only non boolean output prc";
177185
}
178186

187+
void ConvertCPULayerTest::validate() {
188+
if (outPrc == ov::element::nf4) {
189+
// Use custom bit-exact validation, because common tests infra doesn't support 4bits tensors comparision
190+
auto div_up = [&](auto a, auto b) {
191+
assert(b);
192+
return (a + b - 1) / b;
193+
};
194+
195+
auto actualOutputs = get_plugin_outputs();
196+
auto expectedOutputs = calculate_refs();
197+
ASSERT_EQ(expectedOutputs.size(), actualOutputs.size());
198+
ASSERT_EQ(expectedOutputs.size(), 1);
199+
ASSERT_EQ(expectedOutputs[0].get_shape(), actualOutputs[0].get_shape());
200+
201+
auto expected_data = reinterpret_cast<const uint8_t*>(expectedOutputs[0].data());
202+
auto actual_data = reinterpret_cast<const uint8_t*>(actualOutputs[0].data());
203+
size_t shape_size_cnt = div_up(shape_size(expectedOutputs[0].get_shape()), 2);
204+
for (size_t i = 0; i < shape_size_cnt; ++i) {
205+
uint8_t expected_value = expected_data[i];
206+
uint8_t actual_value = actual_data[i];
207+
ASSERT_EQ(expected_value, actual_value);
208+
}
209+
210+
return;
211+
}
212+
213+
SubgraphBaseTest::validate();
214+
}
215+
179216
void ConvertToBooleanCPULayerTest::validate_out_prc() const {
180217
if (outPrc != ov::element::boolean)
181218
FAIL() << "ConvertToBooleanCPULayerTest supports only boolean output prc";

src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/conversion.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class ConvertCPULayerTest : public testing::WithParamInterface<convertLayerTestP
2929
protected:
3030
void SetUp() override;
3131
void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override;
32+
void validate() override;
3233
virtual void validate_out_prc() const;
3334

3435
ov::element::Type inPrc, outPrc;

src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/conversion.cpp

+9
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,15 @@ const std::vector<ov::element::Type> float_precisions = {
6464
ov::element::bf16,
6565
};
6666

67+
// Coverage for the float (fp32/fp16/bf16) -> nf4 conversion on the CPU plugin
// reference ("ref") implementation over dynamic 4D shapes; nf4 outputs are
// validated bit-exactly by ConvertCPULayerTest::validate.
INSTANTIATE_TEST_SUITE_P(smoke_ConvertCPULayerTest_float_to_nf4, ConvertCPULayerTest,
                        ::testing::Combine(
                                ::testing::ValuesIn(inShapes_4D_dynamic()),
                                ::testing::ValuesIn(float_precisions),
                                ::testing::Values(ov::element::nf4),
                                ::testing::Values(ov::test::SpecialValue::none),
                                ::testing::Values(CPUSpecificParams({nchw}, {nchw}, {}, {"ref"}))),
                        ConvertCPULayerTest::getTestCaseName);
75+
6776
const std::vector<ov::element::Type> f8_precisions = {
6877
ov::element::f8e4m3,
6978
ov::element::f8e5m2,

0 commit comments

Comments (0)